## Initialization

In [1]:
!pip install validators matplotlib




Import packages and download pretrained model

In [1]:
import gc
# del variables
gc.collect()

import torch
import torch.nn as nn
import torch.optim as optim 
torch.cuda.empty_cache()

import torchvision
from torch.utils.data import DataLoader, Dataset
import torch.utils.data as utils
from torchvision import transforms    
from torch.optim import lr_scheduler as lrs

import numpy as np
import pandas as pd
import os
from pathlib import Path
import matplotlib.image as mpimg
import matplotlib.pyplot as plt



# For evaluation and submission
from GLC.metrics import top_30_error_rate, top_k_error_rate_from_sets, predict_top_30_set
from GLC.submission import generate_submission_file
from sklearn.metrics import accuracy_score


# For data loading and visualization
from GLC.data_loading.common import load_patch
from GLC.plotting import visualize_observation_patch
from GLC.data_loading.environmental_raster import PatchExtractor

# For time monitoring
from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f'Using {device} for inference')


Using cuda for inference


In [3]:
model = torch.hub.load('pytorch/vision:v0.10.0', 'resnet18', pretrained=True)

model

Using cache found in C:\Users\chiro/.cache\torch\hub\pytorch_vision_v0.10.0


ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

Turn the model into a classifier with the 17k+ classes

In [3]:
model = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_efficientnet_b0', pretrained=True)
utils = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_convnets_processing_utils')

model.eval().to(device)

model.stem = nn.Conv2d(6, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)


model.classifier.fc = nn.Sequential(
    nn.Linear(in_features=1280, out_features=4096),
    nn.ReLU(),
    nn.Dropout(p=0.5),
    nn.Linear(in_features=4096, out_features=17037)
)

# Unfreeze model weights
for param in model.parameters():
    param.requires_grad = True

model = model.cuda()

optimizer = optim.Adam(model.parameters(), lr=0.1)  #they take 0.01
loss_func = nn.CrossEntropyLoss()
scheduler = lrs.StepLR(optimizer, step_size=2, gamma=0.1)

model

Using cache found in C:\Users\chiro/.cache\torch\hub\NVIDIA_DeepLearningExamples_torchhub
Using cache found in C:\Users\chiro/.cache\torch\hub\NVIDIA_DeepLearningExamples_torchhub


EfficientNet(
  (stem): Conv2d(6, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
  (layers): Sequential(
    (0): Sequential(
      (block0): MBConvBlock(
        (depsep): Sequential(
          (conv): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
          (bn): BatchNorm2d(32, eps=0.001, momentum=0.010000000000000009, affine=True, track_running_stats=True)
          (act): SiLU(inplace=True)
        )
        (se): SequentialSqueezeAndExcitation(
          (squeeze): Linear(in_features=32, out_features=8, bias=True)
          (expand): Linear(in_features=8, out_features=32, bias=True)
          (activation): SiLU(inplace=True)
          (sigmoid): Sigmoid()
          (mul_a_quantizer): Identity()
          (mul_b_quantizer): Identity()
        )
        (proj): Sequential(
          (conv): Conv2d(32, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn): BatchNorm2d(16, eps=0.001, momentum=0.0100000000000000

Define data paths

In [4]:
# Change this path to adapt to where you downloaded the data
DATA_PATH = Path("./geolifeclef-2022-lifeclef-2022-fgvc9")


################
# DATA LOADING #
################
print("Observations loading")

# Load train set of observations from France and USA and merge
df_obs_fr = pd.read_csv(DATA_PATH / "observations" / "observations_fr_train.csv", sep=";", index_col="observation_id")
df_obs_us = pd.read_csv(DATA_PATH / "observations" / "observations_us_train.csv", sep=";", index_col="observation_id")
df_obs = pd.concat((df_obs_fr, df_obs_us))

# Extract training and validation subsets as np arrays
obs_id_train = df_obs.index[df_obs["subset"] == "train"].values
obs_id_val = df_obs.index[df_obs["subset"] == "val"].values

# Separate values to predict
# y_train = df_obs.loc[obs_id_train]["species_id"].values
# y_val = df_obs.loc[obs_id_val]["species_id"].values

df_train =  df_obs.loc[obs_id_train].reset_index(drop=False)
df_val =  df_obs.loc[obs_id_val].reset_index(drop=False)
display(df_train.head(2))

# Same with test set of observations
df_obs_fr_test = pd.read_csv(DATA_PATH / "observations" / "observations_fr_test.csv", sep=";", index_col="observation_id")
df_obs_us_test = pd.read_csv(DATA_PATH / "observations" / "observations_us_test.csv", sep=";", index_col="observation_id")
df_obs_test = pd.concat((df_obs_fr_test, df_obs_us_test))

# Extract observaions as np array
obs_id_test = df_obs_test.index.values
df_test =  df_obs_test.loc[obs_id_test].reset_index(drop=False)

# Load landcover metadata to use the patches
df_landcover_labels = pd.read_csv(DATA_PATH / "metadata" / "landcover_original_labels.csv", sep=";")
df_suggested_landcover_alignment = pd.read_csv(DATA_PATH / "metadata" / "landcover_suggested_alignment.csv", sep=";")
landcover_mapping = df_suggested_landcover_alignment["suggested_landcover_code"].values

display(df_landcover_labels.head(2))
display(df_suggested_landcover_alignment.head(2))

Observations loading


Unnamed: 0,observation_id,latitude,longitude,species_id,subset
0,10561949,45.705116,1.424622,241,train
1,10131188,45.146973,6.416794,101,train


Unnamed: 0,landcover_code,original_landcover_code,landcover_label
0,0,0,Missing Data
1,1,11,Annual Summer Crops


Unnamed: 0,landcover_code,suggested_landcover_code,suggested_landcover_label
0,0,0,Missing Data
1,1,11,Cultivated Crops


In [11]:
df_obs.species_id.value_counts(sort=False).sort_index()

0        980
1        248
2        436
3        520
4        844
        ... 
17032      3
17033      3
17034      3
17035      3
17036      3
Name: species_id, Length: 17037, dtype: int64

In [84]:
aug_th = 50
df_train_small = pd.DataFrame()

labels, counts = np.unique(df_train.species_id, return_counts=True)

small_species = labels[counts<aug_th]

for species in tqdm(small_species, total=len(small_species)):
    species_count = counts[species]

    species_subset = df_train[df_train.species_id == species]
    
    for i in range(aug_th - species_count):
        species_row = species_subset.sample()
        df_train_small = pd.concat([df_train_small, species_row], ignore_index=True)

df_train_small.shape

100%|█████████▉| 12533/12539 [21:49<00:00,  9.57it/s]


IndexError: index 17031 is out of bounds for axis 0 with size 17031

In [86]:
df_train = pd.concat([df_train, df_train_small], ignore_index=True).reset_index()

import pickle as pkl
with open("./df_train_augmented.pkl", "wb") as f:
    pkl.dump(df_train, f)

In [87]:
df_train.shape

(2022425, 6)

Define the data generator

In [5]:
class ImageData(Dataset):
    def __init__(self, df, data_path, load, landcover_mapping):
        super().__init__()
        self.df = df
        self.data_path = data_path
        self.load = load
        self.landcover_mapping = landcover_mapping
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        obs_id = self.df.observation_id.loc[idx]
        species = self.df.species_id.loc[idx]
        patch = self.load(obs_id, self.data_path, landcover_mapping=self.landcover_mapping)
        patch = [patch[0],
                 np.reshape(patch[1], (patch[1].shape[0], patch[1].shape[1],1)),
                  np.reshape(patch[2], (patch[2].shape[0], patch[2].shape[1],1)),
                   np.reshape(patch[3], (patch[3].shape[0], patch[3].shape[1],1))]
        patch = torch.FloatTensor(np.concatenate(patch, axis=2))
        patch = torch.movedim(patch, 2, 0)
        return patch, torch.FloatTensor([species])


class ImageDataTest(Dataset):
    def __init__(self, df, data_path, load, landcover_mapping):
        super().__init__()
        self.df = df
        self.data_path = data_path
        self.load = load
        self.landcover_mapping = landcover_mapping
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        obs_id = self.df.observation_id.loc[idx]
        # species = self.df.species_id.loc[idx]
        patch = self.load(obs_id, self.data_path, landcover_mapping=self.landcover_mapping)
        patch = [patch[0],
                 np.reshape(patch[1], (patch[1].shape[0], patch[1].shape[1],1)),
                  np.reshape(patch[2], (patch[2].shape[0], patch[2].shape[1],1)),
                   np.reshape(patch[3], (patch[3].shape[0], patch[3].shape[1],1))]
        patch = torch.FloatTensor(np.concatenate(patch, axis=2))
        patch = torch.movedim(patch, 2, 0)
        return patch #, torch.FloatTensor([species])



Training

In [6]:
%%time
# User params
epochs = 5
batch_size = 16
batch_size_val = 8
num_workers=0
plt_steps=1000
early_stop = 30

# Data loader call
# data_transf = transforms.Compose([transforms.ToPILImage(), transforms.ToTensor()])
train_data = ImageData(df = df_train, data_path = DATA_PATH, load = load_patch, landcover_mapping=landcover_mapping)
train_loader = DataLoader(dataset = train_data, batch_size = batch_size, num_workers=num_workers)

val_data = ImageData(df = df_val, data_path = DATA_PATH, load = load_patch, landcover_mapping=landcover_mapping)
val_loader = DataLoader(dataset = val_data, shuffle=False, batch_size=batch_size_val)
# y_val = df_obs.loc[obs_id_val]["species_id"].values


torch.cuda.empty_cache()


def training():

    # Train model
    loss_log_train = []
    acc_log_train = []
    loss_log_val = []
    acc_log_val = []
    list_top_30_train = []
    list_top_30_val = []

    stop = 0
    total = len(df_train)//batch_size
    total_val = len(df_val)//batch_size_val
    m = nn.Softmax(dim=1)

    for epoch in range(epochs):    
        # Put the model in training mode (dropout layers unfreezed)
        model.train()    

        with tqdm(enumerate(train_loader), total=total) as tepoch:
            for ii, (data, target) in tepoch:
                tepoch.set_description(f"Epoch {epoch} training")

                target = target.squeeze().long()
                data, target = data.cuda(), target.cuda()
                
                optimizer.zero_grad()
                output = model(data)                

                # Compute loss
                loss = loss_func(m(output), target)
                loss.backward()

                # Update weights
                optimizer.step()  
                scheduler.step()

                if ii % plt_steps == 0:
                    loss_log_train.append(loss.item())       
                    
                    # Show accuracy while training every x batches
                    pred = torch.argmax(m(output), dim=1)
                    acc = accuracy_score(target.cpu(), pred.cpu())
                    acc_log_train.append(acc.item())       

                    top_30_error_train = top_30_error_rate(target.cpu().detach().numpy(), m(output).cpu().detach().numpy())
                    list_top_30_train.append(top_30_error_train)


                    tepoch.set_postfix(loss=loss.item(), accuracy=acc, top_30=top_30_error_train)

                # stop+=1
                # if stop > early_stop:
                #     stop = 0
                #     break

        # Save model before validation in case of memory overflow
        export_model = "./models/model_epoch_"+str(epoch)+".pth"
        torch.save(model, export_model)


        # Put the model in evaluation mode
        model.eval()

        with tqdm(enumerate(val_loader), total=total_val) as tepoch_val:
            for ii, (data, target) in tepoch_val:
                tepoch_val.set_description(f"Epoch {epoch} validation")

                target = target.squeeze().long()
                data, target = data.cuda(), target.cuda()
                
                output = model(data)                

                # Compute validation loss and accuracy
                if ii % plt_steps == 0:
                    loss_val = loss_func(m(output), target)
                    loss_log_val.append(loss_val.item())       
                    
                    # Show accuracy while training every x batches
                    pred = torch.argmax(m(output), dim=1)
                    acc_val = accuracy_score(target.cpu(), pred.cpu())
                    acc_log_val.append(acc_val.item())       

                    # top_30_val = predict_top_30_set(pred.cpu().detach().numpy())
                    top_30_error_val = top_30_error_rate(target.cpu().detach().numpy(), m(output).cpu().detach().numpy())
                    list_top_30_val.append(top_30_error_val)


                    tepoch_val.set_postfix(val_loss=loss_val.item(), val_accuracy=acc_val, val_top_30=top_30_error_val)

                # stop+=1
                # if stop > early_stop:
                #     stop = 0
                #     break

        print('Epoch: {} - Loss (train): {:.6f}'.format(epoch, loss.item()), " - Accuracy (train): {:.6f}".format(acc), " - Top 30 error (train): {:.6f}".format(top_30_error_train),
                             ' - Loss (val): {:.6f}'.format(loss_val.item()), " - Accuracy (val): {:.6f}".format(acc_val),  " - Top 30 error (val): {:.6f}".format(top_30_error_val))


        # Visualization monitoring
        export_loss_png = "./models/loss_epochs_"+str(epochs)+"_batch_size_"+str(batch_size)+".png"
        x = range(len(loss_log_train))
        
        fig = plt.figure(figsize=(14,8))
        plt.subplot(131)
        plt.plot(x, loss_log_train, label="train_loss")
        plt.plot(x, loss_log_val, label="val_loss")
        plt.legend()
        
        plt.subplot(132)
        plt.plot(x, acc_log_train, label="train_acc")
        plt.plot(x, acc_log_val, label="val_acc")
        plt.legend()
        
        plt.subplot(133)
        plt.plot(x, list_top_30_train, label="train_top_30_error")
        plt.plot(x, list_top_30_val, label="val_top_30_error")
        plt.legend()
        
        plt.suptitle("Training metrics")
        
        fig.patch.set_facecolor('white')
        fig.savefig(export_loss_png, transparent=False)


if __name__ == '__main__':
    training()

Epoch 0 training:   0%|          | 0/99212 [00:00<?, ?it/s]

Load weights and evaluate model

In [6]:
SUBMISSION_PATH = "./submissions/"

batch_size_test = 32
total_test = len(df_test)//batch_size_test
m = nn.Softmax(dim=1)
stop = 0
early_stop = 30

test_data = ImageDataTest(df = df_test, data_path = DATA_PATH, load = load_patch, landcover_mapping=landcover_mapping)
test_loader = DataLoader(dataset = test_data, shuffle=False, batch_size=batch_size_test)
preds = np.zeros([0,30])

# Load model
model = torch.load("./models/model_epoch_0_date_11_05.pth")

# Put the model in evaluation mode
model.eval()


with tqdm(enumerate(test_loader), total=total_test) as tepoch_test:
    for ii, data in tepoch_test:
        tepoch_test.set_description("Test prediction progress: ")
        
        # Pass data to cuda and make a prediction
        data = data.cuda()
        output = model(data)                
        pred = m(output)

        # Convert the prediction to a numpy array
        pred = pred.cpu().detach().numpy()
        pred = predict_top_30_set(pred)
        preds = np.concatenate([preds, pred], axis=0)

        # #Early stopping
        # if stop > early_stop:
        #     stop = 0
        #     break
        # else:
        #     stop+=1

    preds = np.array(preds).astype(np.int32)



# Generate the submission file
generate_submission_file(SUBMISSION_PATH + "efficient_net_1_epochs.csv", df_obs_test.index[:preds.shape[0]], preds)


Test prediction progress: : : 1139it [04:39,  4.08it/s]                        
