- Notebook modified from https://www.kaggle.com/code/markwijkhuizen/planttraits2024-eda-training-pub.
- Training only, EDA part not included.
- Image model only, tabular data not used.

## Import Libraries 

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import imageio.v3 as imageio
import albumentations as A

from albumentations.pytorch import ToTensorV2
from torch.utils.data import Dataset, DataLoader
from torch import nn
from tqdm.notebook import tqdm
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, KFold, cross_val_score

import torch
import timm
import glob
import torchmetrics
import time
import psutil
import os
import time
import pickle

tqdm.pandas()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [2]:
class Config():
    IMAGE_SIZE = 384
#     BACKBONE = 'swin_large_patch4_window12_384.ms_in22k_ft_in1k'
    BACKBONE = 'tf_efficientnet_b0'
    TARGET_COLUMNS = ['X4_mean', 'X11_mean', 'X18_mean', 'X50_mean', 'X26_mean', 'X3112_mean']
    
    OUTLIERS = {'X4_mean': (0, 1), 
                'X11_mean': (0.0001, 0.998), 
                'X18_mean': (0.0001, 0.99), 
                'X50_mean': (0.0001, 0.999), 
                'X26_mean': (0.0001, 0.985), 
                'X3112_mean': (0.0001, 0.99)}
    
    N_TARGETS = len(TARGET_COLUMNS)
    BATCH_SIZE = 32
    LR_MAX = 1e-4
    WEIGHT_DECAY = 0.01
    N_EPOCHS = 8
    TRAIN_MODEL = True
    IS_INTERACTIVE = os.environ['KAGGLE_KERNEL_RUN_TYPE'] == 'Interactive'
    
#     MODEL_PATH = '/kaggle/input/plainttraits2024-swintransformer/model.pth'
    MODEL_PATH = '/kaggle/input/planttraits2024-swintransformer-tabular/model.pth'
        
CONFIG = Config()

## Load Data

In [3]:
def plot_hist(df):
    num_columns = len(df.columns)
    num_rows = (num_columns + 1) // 2
    
    fig, axes = plt.subplots(num_rows, 2, figsize=(14, 8))
    fig.subplots_adjust(hspace=0.5, wspace=0.3)
    
    for i, column in enumerate(df.columns):
        row = i // 2
        col = i % 2
        ax = axes[row, col]
        ax.hist(df[column], bins=30, color='skyblue', edgecolor='black')
        ax.set_title(f'Histogram of {column}')
        ax.set_xlabel(column)
        ax.set_ylabel('Frequency')
        ax.grid(True)
    
    # Remove any unused subplots
    for i in range(num_columns, num_rows * 2):
        fig.delaxes(axes.flatten()[i])
    
    plt.show()

In [4]:
%%time
read_images = False

if not read_images:
    train = pd.read_pickle('/kaggle/input/plainttraits2024-swintransformer/train.pkl')
    test = pd.read_pickle('/kaggle/input/plainttraits2024-swintransformer/test.pkl')
else: 
    train = pd.read_csv('/kaggle/input/planttraits2024/train.csv')
    train['file_path'] = train['id'].apply(lambda s: f'/kaggle/input/planttraits2024/train_images/{s}.jpeg')
    train['jpeg_bytes'] = train['file_path'].progress_apply(lambda fp: open(fp, 'rb').read())
    train.to_pickle('train.pkl')

    test = pd.read_csv('/kaggle/input/planttraits2024/test.csv')
    test['file_path'] = test['id'].apply(lambda s: f'/kaggle/input/planttraits2024/test_images/{s}.jpeg')
    test['jpeg_bytes'] = test['file_path'].progress_apply(lambda fp: open(fp, 'rb').read())
    test.to_pickle('test.pkl')

for col, value in CONFIG.OUTLIERS.items():
    lower_quantile = train[col].quantile(value[0])
    upper_quantile = train[col].quantile(value[1])  
    train = train[(train[col] >= lower_quantile) & (train[col] <= upper_quantile)]    

train = train[train['X4_mean'] > 0]
    
sd_columns = [col for col in train.columns if col.endswith('_sd')]
train = train.drop(columns=sd_columns)
    
CONFIG.N_TRAIN_SAMPLES = len(train)
CONFIG.N_STEPS_PER_EPOCH = (CONFIG.N_TRAIN_SAMPLES // CONFIG.BATCH_SIZE)
CONFIG.N_STEPS = CONFIG.N_STEPS_PER_EPOCH * CONFIG.N_EPOCHS + 1    
CONFIG.TABULAR_COLUMNS = train.filter(regex='^(WORLDCLIM_BIO|SOIL|MODIS_2000|VOD)').columns
    
if CONFIG.TRAIN_MODEL:
    print('N_TRAIN_SAMPLES:', len(train), 'N_TEST_SAMPLES:', len(test))
else:
    print('N_TEST_SAMPLES:', len(test))

N_TRAIN_SAMPLES: 53333 N_TEST_SAMPLES: 6545
CPU times: user 1.96 s, sys: 3.05 s, total: 5.01 s
Wall time: 36.1 s


In [5]:
# if CONFIG.TRAIN_MODEL:
LOG_FEATURES = ['X4_mean', 'X11_mean', 'X18_mean', 'X50_mean', 'X26_mean', 'X3112_mean']

y_df = np.zeros_like(train[CONFIG.TARGET_COLUMNS], dtype=np.float32)
for target_idx, target in enumerate(CONFIG.TARGET_COLUMNS):
    v = train[target].values
    if target in LOG_FEATURES:
        v = np.log10(v)
    y_df[:, target_idx] = v

SCALER = StandardScaler()
y_df = SCALER.fit_transform(y_df)

In [6]:
# Splitting the data into training and validation sets
train_df, val_df, y_train, y_val = train_test_split(train, y_df, test_size=0.2, random_state=42)

In [7]:
SCALER_tabular = StandardScaler()
tabular_df_train = SCALER_tabular.fit_transform(train_df[CONFIG.TABULAR_COLUMNS])
tabular_df_val = SCALER_tabular.transform(val_df[CONFIG.TABULAR_COLUMNS])
tabular_df_test = SCALER_tabular.fit_transform(test[CONFIG.TABULAR_COLUMNS])

In [8]:
MEAN = np.array([0.485, 0.456, 0.406])
STD = np.array([0.229, 0.224, 0.225])

TRAIN_TRANSFORMS = A.Compose([
        A.HorizontalFlip(p=0.5),
        A.ShiftScaleRotate(shift_limit=0.0625, scale_limit=0.2, rotate_limit=45, p=0.4),
        A.RandomSizedCrop(
            [int(0.85*CONFIG.IMAGE_SIZE), CONFIG.IMAGE_SIZE],
            CONFIG.IMAGE_SIZE, CONFIG.IMAGE_SIZE, w2h_ratio=1.0, p=0.75),
        A.Resize(CONFIG.IMAGE_SIZE, CONFIG.IMAGE_SIZE),
        A.RandomBrightnessContrast(brightness_limit=0.1, contrast_limit=0.1, p=0.25),
        A.HueSaturationValue(p=0.3),
        A.ImageCompression(quality_lower=85, quality_upper=100, p=0.25),
        A.ToFloat(),
        A.Normalize(mean=MEAN, std=STD, max_pixel_value=1),
        ToTensorV2(),
    ])

TEST_TRANSFORMS = A.Compose([
        A.Resize(CONFIG.IMAGE_SIZE, CONFIG.IMAGE_SIZE),
        A.ToFloat(),
        A.Normalize(mean=MEAN, std=STD, max_pixel_value=1),
        ToTensorV2(),
    ])

class Dataset(Dataset):
    def __init__(self, X_jpeg_bytes, X_tabular, y, transforms=None):
        self.X_jpeg_bytes = X_jpeg_bytes
        self.X_tabular = X_tabular
        self.y = y
        self.transforms = transforms

    def __len__(self):
        return len(self.X_jpeg_bytes)

    def __getitem__(self, index):
        X_sample = self.transforms(
            image=imageio.imread(self.X_jpeg_bytes[index]),
        )
        X_sample['tabular'] = self.X_tabular[index].astype('float32')
        y_sample = self.y[index]
        
        return X_sample, y_sample

if CONFIG.TRAIN_MODEL:
    # Creating datasets for training and validation
    train_dataset = Dataset(
        train_df['jpeg_bytes'].values,
        tabular_df_train,
        y_train,
        TRAIN_TRANSFORMS,
    )

    val_dataset = Dataset(
        val_df['jpeg_bytes'].values,
        tabular_df_val,
        y_val,
        TEST_TRANSFORMS,
    )

    # Creating dataloaders for training and validation
    train_dataloader = DataLoader(
        train_dataset,
        batch_size=CONFIG.BATCH_SIZE,
        shuffle=True,
        drop_last=True,
        num_workers=psutil.cpu_count(),
    )

    val_dataloader = DataLoader(
        val_dataset,
        batch_size=CONFIG.BATCH_SIZE,
        shuffle=False,  # No need to shuffle validation data
        num_workers=psutil.cpu_count(),
    )

test_dataset = Dataset(
    test['jpeg_bytes'].values,
    tabular_df_test,
    test['id'].values,
    TEST_TRANSFORMS,
)

In [9]:
class Model(nn.Module):
    def __init__(self):
        super().__init__()
        self.backbone = timm.create_model(
                CONFIG.BACKBONE,
                pretrained=True,
                num_classes=0,
        )
        
        # EfficientNet = 1280, SwinTrans = 1536, Tabular = 163
        self.custom_layers = nn.Sequential(
            nn.Linear(1280+163, 512),
            nn.ReLU(), 
            nn.Linear(512, 6)  
        )
        
    def forward(self, inputs):
        image = inputs['image']
        tabular = inputs['tabular']

        x = self.backbone(image)
        x = torch.cat((tabular, x), dim=1)
        x = self.custom_layers(x)
        
        return x


model = Model()
model = model.to(device)
# print(model.backbone.head)

model.safetensors:   0%|          | 0.00/21.4M [00:00<?, ?B/s]

In [10]:
def get_lr_scheduler(optimizer):
    return torch.optim.lr_scheduler.OneCycleLR(
        optimizer=optimizer,
        max_lr=CONFIG.LR_MAX,
        total_steps=CONFIG.N_STEPS,
        pct_start=0.1,
        anneal_strategy='cos',
        div_factor=1e1,
        final_div_factor=1e1,
    )

class AverageMeter(object):
    def __init__(self):
        self.reset()

    def reset(self):
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val):
        self.sum += val.sum()
        self.count += val.numel()
        self.avg = self.sum / self.count

if CONFIG.TRAIN_MODEL:
    MAE = torchmetrics.regression.MeanAbsoluteError().to(device)
    R2 = torchmetrics.regression.R2Score(num_outputs=CONFIG.N_TARGETS, multioutput='uniform_average').to(device)
    LOSS = AverageMeter()

    Y_MEAN = torch.tensor(y_df).mean(dim=0).to(device)
    EPS = torch.tensor([1e-6]).to(device)

    def r2_loss(y_pred, y_true):
        ss_res = torch.sum((y_true - y_pred)**2, dim=0)
        ss_total = torch.sum((y_true - Y_MEAN)**2, dim=0)
        ss_total = torch.maximum(ss_total, EPS)
        r2 = torch.mean(ss_res / ss_total)
        return r2

    LOSS_FN = r2_loss

    optimizer = torch.optim.AdamW(
        params=model.parameters(),
        lr=CONFIG.LR_MAX,
        weight_decay=CONFIG.WEIGHT_DECAY,
    )

    LR_SCHEDULER = get_lr_scheduler(optimizer)

In [11]:
if CONFIG.TRAIN_MODEL:
    print("Start Training:")
    
    best = float('inf')
    for epoch in range(CONFIG.N_EPOCHS):
        MAE.reset()
        R2.reset()
        LOSS.reset()
        model.train()

        for step, (X_batch, y_true) in enumerate(train_dataloader):
            X_batch['image'] = X_batch['image'].to(device)
            X_batch['tabular'] = X_batch['tabular'].to(device)
            y_true = y_true.to(device)
            t_start = time.perf_counter_ns()
            y_pred = model(X_batch)
            loss = LOSS_FN(y_pred, y_true)
            LOSS.update(loss)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            LR_SCHEDULER.step()
            MAE.update(y_pred, y_true)
            R2.update(y_pred, y_true)

            if not CONFIG.IS_INTERACTIVE and (step+1) == len(train_dataloader):
                print(
                    f'EPOCH {epoch+1:02d}, {step+1:04d}/{len(train_dataloader)} | ' + 
                    f'loss: {LOSS.avg:.4f}, mae: {MAE.compute().item():.4f}, r2: {R2.compute().item():.4f}, ' +
                    f'step: {(time.perf_counter_ns()-t_start)*1e-9:.3f}s, lr: {LR_SCHEDULER.get_last_lr()[0]:.2e}',
                )
            elif CONFIG.IS_INTERACTIVE:
                print(
                    f'\rEPOCH {epoch+1:02d}, {step+1:04d}/{len(train_dataloader)} | ' + 
                    f'loss: {LOSS.avg:.4f}, mae: {MAE.compute().item():.4f}, r2: {R2.compute().item():.4f}, ' +
                    f'step: {(time.perf_counter_ns()-t_start)*1e-9:.3f}s, lr: {LR_SCHEDULER.get_last_lr()[0]:.2e}',
                    end='\n' if (step + 1) == CONFIG.N_STEPS_PER_EPOCH else '', flush=True,
                )
        
        if CONFIG.IS_INTERACTIVE:
            print()
        
        MAE.reset()
        R2.reset()
        LOSS.reset()
        model.eval()
        
        for step, (X_batch, y_true) in enumerate(val_dataloader):
            X_batch['image'] = X_batch['image'].to(device)
            X_batch['tabular'] = X_batch['tabular'].to(device)
            y_true = y_true.to(device)
            t_start = time.perf_counter_ns()
            with torch.no_grad():
                y_pred = model(X_batch)
                loss = LOSS_FN(y_pred, y_true)
                
            LOSS.update(loss)
            MAE.update(y_pred, y_true)
            R2.update(y_pred, y_true)
            if not CONFIG.IS_INTERACTIVE and (step+1) == len(val_dataloader):
                print(
                    f'EPOCH VAL, {epoch+1:02d}, {step+1:04d}/{len(val_dataloader)} | ' + 
                    f'loss: {LOSS.avg:.4f}, mae: {MAE.compute().item():.4f}, r2: {R2.compute().item():.4f}, ' +
                    f'step: {(time.perf_counter_ns()-t_start)*1e-9:.3f}s, lr: {LR_SCHEDULER.get_last_lr()[0]:.2e}',
                )
            elif CONFIG.IS_INTERACTIVE:
                print(
                    f'\rEPOCH VAL {epoch+1:02d}, {step+1:04d}/{len(val_dataloader)} | ' + 
                    f'loss: {LOSS.avg:.4f}, mae: {MAE.compute().item():.4f}, r2: {R2.compute().item():.4f}, ' +
                    f'step: {(time.perf_counter_ns()-t_start)*1e-9:.3f}s, lr: {LR_SCHEDULER.get_last_lr()[0]:.2e}',
                    end='\n' if (step + 1) == CONFIG.N_STEPS_PER_EPOCH else '', flush=True,
                )
        print()
        if LOSS.avg < best:
            best = LOSS.avg
            torch.save(model, 'model.pth')
            
            
else:
    model = torch.load(CONFIG.MODEL_PATH)
    model.to(device)

Start Training:
EPOCH 01, 1333/1333 | loss: 0.7615, mae: 0.6712, r2: 0.2432, step: 0.745s, lr: 1.00e-04
EPOCH VAL, 01, 0334/334 | loss: 0.6478, mae: 0.6074, r2: 0.3568, step: 0.087s, lr: 1.00e-04

EPOCH 02, 1333/1333 | loss: 0.6372, mae: 0.6053, r2: 0.3682, step: 0.288s, lr: 9.70e-05
EPOCH VAL, 02, 0334/334 | loss: 0.6132, mae: 0.5886, r2: 0.3935, step: 0.027s, lr: 9.70e-05

EPOCH 03, 1333/1333 | loss: 0.5952, mae: 0.5816, r2: 0.4115, step: 0.288s, lr: 8.84e-05
EPOCH VAL, 03, 0334/334 | loss: 0.5938, mae: 0.5783, r2: 0.4130, step: 0.027s, lr: 8.84e-05

EPOCH 04, 1333/1333 | loss: 0.5607, mae: 0.5630, r2: 0.4461, step: 0.288s, lr: 7.52e-05
EPOCH VAL, 04, 0334/334 | loss: 0.5864, mae: 0.5720, r2: 0.4217, step: 0.027s, lr: 7.52e-05

EPOCH 05, 1333/1333 | loss: 0.5312, mae: 0.5466, r2: 0.4761, step: 0.288s, lr: 5.91e-05
EPOCH VAL, 05, 0334/334 | loss: 0.5744, mae: 0.5662, r2: 0.4345, step: 0.027s, lr: 5.91e-05

EPOCH 06, 1333/1333 | loss: 0.5062, mae: 0.5328, r2: 0.5011, step: 0.288s, lr: 

In [12]:
model = torch.load('/kaggle/working/model.pth')
model.to(device);

In [13]:
SUBMISSION_ROWS = []
model.eval()

for X_sample_test, test_id in tqdm(test_dataset):
    X_sample_test['image'] = torch.Tensor(X_sample_test['image']).unsqueeze(0).to(device)
    X_sample_test['tabular'] = torch.Tensor(X_sample_test['tabular']).unsqueeze(0).to(device)
    with torch.no_grad():
        y_pred = model(X_sample_test).detach().cpu().numpy()
    
    y_pred = SCALER.inverse_transform(y_pred).squeeze()
    row = {'id': test_id}
    
    for k, v in zip(CONFIG.TARGET_COLUMNS, y_pred):
        if k in LOG_FEATURES:
            row[k.replace('_mean', '')] = 10 ** v
        else:
            row[k.replace('_mean', '')] = v

    SUBMISSION_ROWS.append(row)
    
submission_df = pd.DataFrame(SUBMISSION_ROWS)
submission_df.to_csv('submission.csv', index=False)
print("Submit!")

  0%|          | 0/6545 [00:00<?, ?it/s]

Submit!
