In [21]:
import torch
import tqdm
import numpy as np
import pandas as pd
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import CosineAnnealingLR
# from sklearn.metrics import precision_recall_fscore_support

from src.dataset.dataset import TrainDataset, TestDataset

## Data description

The Sentinel-2 data was acquired through the Sentinel2 satellite program and pre-processed by [Ecodatacube](https://stac.ecodatacube.eu/) to produce raster files scaled to the entire European continent and projected into a unique CRS. 
Each TIFF file corresponds to a unique observation location (via "surveyId"). To load the patch for a selected observation, take the "surveyId" from any occurrence CSV and build the file path following this rule --> '…/CD/AB/XXXXABCD.tiff'. For example, the image location for the surveyId 3018575 is "./75/85/3018575.tiff". The same rule applies to any "surveyId" with fewer than four digits; for example, the image for "surveyId" 1 is "./1/1.tiff".
The data can simply be loaded using the following method:

```python
def construct_patch_path(output_path, survey_id):
    """Return the patch location '<output_path>/CD/AB/XXXXABCD.tiff' for a survey id.

    CD is the last two characters of the id and AB the two characters before
    them; ids shorter than four characters simply yield shorter (or empty)
    directory parts.
    """
    sid = str(survey_id)
    last_two, preceding_two = sid[-2:], sid[-4:-2]
    return os.path.join(output_path, last_two, preceding_two, f"{sid}.tiff")
```

**For more information about data processing, normalization, and visualization, please refer to the following notebook**: [Kaggle Notebook](https://www.kaggle.com/code/picekl/sentinel-2-data-processing-and-normalization).

**References:**
- *Traceability (lineage): The dataset was produced entirely by mosaicking and seasonally aggregating imagery from the Sentinel-2 Level-2A product (https://sentinels.copernicus.eu/web/sentinel/user-guides/sentinel-2-msi/product-types/level-2a)*
- *Ecodatacube.eu: Analysis-ready open environmental data cube for Europe (https://doi.org/10.21203/rs.3.rs-2277090/v3)*

## Prepare custom dataset loader

We have to slightly update the Dataset to provide the relevant data in the appropriate format.

### Load metadata and prepare data loaders

In [22]:
# Settings shared by the train and test DataLoaders
batch_size = 128
num_workers = 8
loader_kwargs = {"batch_size": batch_size, "num_workers": num_workers}

# Convert each patch to a tensor, then normalize all four channels with mean 0.5 / std 0.5
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.5,) * 4, std=(0.5,) * 4),
    ]
)

# Test split: metadata CSV -> dataset -> loader (order preserved, no shuffling)
test_data_path = "data/SatellitePatches/PA-test/"
test_metadata_path = "data/GLC25_PA_metadata_test.csv"
test_metadata = pd.read_csv(test_metadata_path)
test_dataset = TestDataset(test_data_path, test_metadata, transform=transform)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_kwargs)

print(f"Test dataset size: {len(test_dataset)}")

# Training split: same pipeline, but batches are shuffled
train_data_path = "data/SatellitePatches/PA-train"
train_metadata_path = "data/GLC25_PA_metadata_train.csv"
train_metadata = pd.read_csv(train_metadata_path)
train_dataset = TrainDataset(train_data_path, train_metadata, transform=transform)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)

Test dataset size: 14784


Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x111511e40>
Traceback (most recent call last):
  File "/Users/lukas/Offline/DHBW/semester2/conda/project/.venv/lib/python3.13/site-packages/torch/utils/data/dataloader.py", line 1663, in __del__
    self._shutdown_workers()
  File "/Users/lukas/Offline/DHBW/semester2/conda/project/.venv/lib/python3.13/site-packages/torch/utils/data/dataloader.py", line 1621, in _shutdown_workers
    if self._persistent_workers or self._workers_status[worker_id]:
AttributeError: '_MultiProcessingDataLoaderIter' object has no attribute '_workers_status'


In [23]:
# Dump the metadata table and the box -> survey-id mapping for a quick visual check
print(train_dataset.metadata, train_dataset.box_survey_dict, sep="\n")

             lon        lat  year  geoUncertaintyInM  areaInM2         region  \
0       3.099038  43.134956  2021                5.0     100.0  MEDITERRANEAN   
1       9.884560  56.912140  2017               10.0      79.0    CONTINENTAL   
2       8.256020  55.637050  2019               10.0      79.0       ATLANTIC   
3      -0.402590  43.505630  2018                1.0       NaN       ATLANTIC   
4      -0.517360  45.806430  2017                1.0       NaN       ATLANTIC   
...          ...        ...   ...                ...       ...            ...   
88982  10.327990  57.305850  2018               10.0      79.0    CONTINENTAL   
88983  15.003900  55.090170  2017               10.0     707.0    CONTINENTAL   
88984   8.935060  55.461500  2018               10.0      79.0       ATLANTIC   
88985  17.252948  53.901434  2021                5.0      25.0    CONTINENTAL   
88986   9.913980  56.827580  2018               10.0     707.0    CONTINENTAL   

       country  speciesId  

## Modify pretrained ResNet-18 model

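To fully use all of the R, G, B and NIR channels, the input layer of the standard ResNet has to be modified to accept four channels instead of three. Note that the cells below instantiate the project's `ResNet50` wrapper (from `src.model.ResNets`), not a ResNet-18.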
That is all :)

In [None]:
from src.helpers import select_device

# Let the project helper choose the compute device and report the choice
device = select_device()
print(f"Using device: {device}")

# Training hyperparameters
num_classes = 11255  # Number of all unique classes within the PO and PA data.
num_epochs = 25
learning_rate = 1e-4
positive_weigh_factor = 1.0  # scales the positive term of the BCE loss in the training loop

DEVICE = MPS


In [25]:
# Per-box lists of survey ids, as exposed by the training dataset
boxes = train_dataset.box_survey_dict

# Keep only the boxes whose list holds more than one entry
filtered_rows = boxes[boxes.apply(len) > 1]

print(filtered_rows)

# Fraction of boxes with more than one entry
print(len(filtered_rows) / len(boxes))

box
1002_1513                   [326649, 577029, 2664152, 3171360]
1003_1496                                   [1793166, 2045314]
1003_2361                          [1269918, 2488420, 2859120]
1004_1497                  [477274, 1555880, 3055778, 3197328]
1004_1498                           [688327, 2350009, 3809498]
                                   ...                        
981_1517                                    [1397125, 2005491]
985_1502                                    [2871206, 3086648]
992_1584     [377065, 1675495, 1932649, 1998214, 2034085, 2...
996_2347                    [205525, 853001, 2503701, 3687012]
996_2348                           [1297938, 1845074, 2359428]
Name: surveyId, Length: 13646, dtype: object
0.4126024249387718


In [26]:
from src.model.ResNets import ResNet50


# Build the network and move its parameters to the selected device.
model = ResNet50()
model.to(device)

# AdamW over all model parameters, starting from the configured learning rate.
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
# Cosine annealing over the whole run. T_max is tied to num_epochs (the scheduler is stepped
# once per epoch in the training loop) rather than repeating the literal 25, so changing
# num_epochs keeps the schedule length in sync with the training length.
scheduler = CosineAnnealingLR(optimizer, T_max=num_epochs)

In [27]:
# Print the module tree to inspect the architecture (e.g. the input channels of the first conv layer).
print(model)

ResNet50(
  (model): ResNet(
    (conv1): Conv2d(4, 64, kernel_size=(7, 7), stride=(2, 2))
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (act1): ReLU(inplace=True)
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (layer1): Sequential(
      (0): Bottleneck(
        (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act1): ReLU(inplace=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (drop_block): Identity()
        (act2): ReLU(inplace=True)
        (aa): Identity()
        (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_

In [28]:
def set_seed(seed):
    """Seed the random number generators used in this notebook.

    Seeds Python's built-in ``random`` module, NumPy's global RNG and PyTorch.
    When CUDA is available, all CUDA devices are seeded as well and cuDNN is
    switched to deterministic mode with benchmark autotuning disabled.

    Args:
        seed: Integer seed applied to every generator.
    """
    # Imported locally because the notebook's import cell does not import `random`.
    import random

    # Python's built-in RNG (the original comment announced this, but it was never seeded)
    random.seed(seed)
    # PyTorch RNG
    torch.manual_seed(seed)
    # NumPy's global RNG
    np.random.seed(seed)
    # CUDA-specific seeding and determinism flags, only when a CUDA device is present
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
        # Prefer reproducible cuDNN kernels over autotuned ones
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

set_seed(77)

## Training Loop

Nothing special, just a standard PyTorch training loop.

In [29]:
print(f"Training for {num_epochs} epochs started.")

for epoch in range(num_epochs):
    model.train()
    for batch_idx, (data, targets, _) in enumerate(train_loader):
        # Move the batch onto the training device
        data, targets = data.to(device), targets.to(device)

        optimizer.zero_grad()
        outputs = model(data)

        # Element-wise positive weight: targets scaled by positive_weigh_factor
        # (presumably targets are multi-hot 0/1, so positives get the factor — confirm in TrainDataset)
        pos_weight = targets * positive_weigh_factor
        loss = torch.nn.functional.binary_cross_entropy_with_logits(
            outputs, targets, pos_weight=pos_weight
        )

        loss.backward()
        optimizer.step()

        # Periodic progress report
        if batch_idx % 348 == 0:
            print(f"Epoch {epoch+1}/{num_epochs}, Batch {batch_idx}/{len(train_loader)}, Loss: {loss.item()}")

    # One scheduler step per epoch, then show its state
    scheduler.step()
    print("Scheduler:", scheduler.state_dict())

# Switch to eval mode and write the model weights to disk
model.eval()
torch.save(model.state_dict(), "resnet50-untrained.pth")

Training for 25 epochs started.


KeyboardInterrupt: 

## Test Loop

Again, nothing special, just a standard inference.

In [None]:
from src.helpers import test_loop

# Run inference over the test loader with the project helper. The two returned values are used
# by the submission cell as the survey ids and the per-survey rows of predicted indices.
# NOTE(review): test_loop's exact return types are not visible here — confirm in src/helpers.
surveys, top_k_indices = test_loop(model, test_loader, device)

## Save prediction file! 🎉🥳🙌🤗

In [None]:
# Join each row of top_k_indices into a single space-separated string
data_concatenated = [" ".join(str(idx) for idx in row) for row in top_k_indices]

# Two-column table (surveyId, predictions) written without the index column
submission = pd.DataFrame({"surveyId": surveys, "predictions": data_concatenated})
submission.to_csv("submission.csv", index=False)