- Notebook modified from https://www.kaggle.com/code/markwijkhuizen/planttraits2024-eda-training-pub.
- Training only, EDA part not included.
- Image model only, tabular data not used.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import imageio.v3 as imageio
import albumentations as A

from albumentations.pytorch import ToTensorV2
from torch.utils.data import Dataset, DataLoader
from torch import nn
from tqdm.notebook import tqdm
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, KFold, cross_val_score

import xgboost as xgb

import torch
import timm
import glob
import torchmetrics
import time
import psutil
import os
import time
import pickle

tqdm.pandas()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [2]:
class Config():
    IMAGE_SIZE = 384
  #  BACKBONE = 'swin_large_patch4_window12_384.ms_in22k_ft_in1k'
    BACKBONE = 'tf_efficientnet_b0'
    TARGET_COLUMNS = ['X4_mean', 'X11_mean', 'X18_mean', 'X50_mean', 'X26_mean', 'X3112_mean', 'X4_sd', 'X11_sd', 'X18_sd', 'X50_sd', 'X26_sd', 'X3112_sd']
    N_TARGETS = len(TARGET_COLUMNS)
    BATCH_SIZE = 46
    LR_MAX = 1e-4
    WEIGHT_DECAY = 0.01
    N_EPOCHS = 6
    TRAIN_MODEL = True
    IS_INTERACTIVE = os.environ['KAGGLE_KERNEL_RUN_TYPE'] == 'Interactive'
    
    MODEL_PATH = '/kaggle/input/plainttraits2024-swintransformer/model.pth'
        
CONFIG = Config()

In [3]:
%%time
read_images = False

if not read_images:
    train = pd.read_pickle('/kaggle/input/plainttraits2024-swintransformer/train.pkl')
    test = pd.read_pickle('/kaggle/input/plainttraits2024-swintransformer/test.pkl')
else: 
    # if CONFIG.TRAIN_MODEL:
    train = pd.read_csv('/kaggle/input/planttraits2024/train.csv')
    train['file_path'] = train['id'].apply(lambda s: f'/kaggle/input/planttraits2024/train_images/{s}.jpeg')
    train['jpeg_bytes'] = train['file_path'].progress_apply(lambda fp: open(fp, 'rb').read())
    train.to_pickle('train.pkl')

    test = pd.read_csv('/kaggle/input/planttraits2024/test.csv')
    test['file_path'] = test['id'].apply(lambda s: f'/kaggle/input/planttraits2024/test_images/{s}.jpeg')
    test['jpeg_bytes'] = test['file_path'].progress_apply(lambda fp: open(fp, 'rb').read())
    test.to_pickle('test.pkl')

for column in CONFIG.TARGET_COLUMNS[:6]:
    lower_quantile = train[column].quantile(0.005)
    upper_quantile = train[column].quantile(0.985)  
    train = train[(train[column] >= lower_quantile) & (train[column] <= upper_quantile)]    
    
CONFIG.N_TRAIN_SAMPLES = len(train)
CONFIG.N_STEPS_PER_EPOCH = (CONFIG.N_TRAIN_SAMPLES // CONFIG.BATCH_SIZE)
CONFIG.N_STEPS = CONFIG.N_STEPS_PER_EPOCH * CONFIG.N_EPOCHS + 1    
    
if CONFIG.TRAIN_MODEL:
    print('N_TRAIN_SAMPLES:', len(train), 'N_TEST_SAMPLES:', len(test))
else:
    print('N_TEST_SAMPLES:', len(test))

N_TRAIN_SAMPLES: 49168 N_TEST_SAMPLES: 6545
CPU times: user 1.64 s, sys: 2.85 s, total: 4.5 s
Wall time: 26.5 s


In [4]:
# if CONFIG.TRAIN_MODEL:
LOG_FEATURES = ['X4_mean', 'X11_mean', 'X18_mean', 'X50_mean', 'X26_mean', 'X3112_mean']

#fill nan with the mean of the column
from scipy.stats import truncnorm

def fill_missing_with_truncnorm(series):
    mean = series.mean()
    std = series.std()
    lower_bound = mean - 2 * std  # Lower bound of the truncated normal distribution
    upper_bound = mean + 2 * std  # Upper bound of the truncated normal distribution
    size = series.isnull().sum()  # Number of missing values to fill
    # Generate random numbers from the truncated normal distribution
    random_values = truncnorm.rvs((lower_bound - mean) / std, (upper_bound - mean) / std, loc=mean, scale=std, size=size)
    # Replace NaN values with the generated random values
    series.loc[series.isnull()] = random_values
    return series

# Apply the function to fill missing values in each column
train[CONFIG.TARGET_COLUMNS[6:]] = train[CONFIG.TARGET_COLUMNS[6:]].apply(fill_missing_with_truncnorm, axis=0)
# temp_mean = train[CONFIG.TARGET_COLUMNS[6:]].mean(skipna=True)
# train[CONFIG.TARGET_COLUMNS[6:]] = train[CONFIG.TARGET_COLUMNS[6:]].fillna(temp_mean, inplace=False)

y_df = np.zeros_like(train[CONFIG.TARGET_COLUMNS], dtype=np.float32)
for target_idx, target in enumerate(CONFIG.TARGET_COLUMNS):
    v = train[target].values
    if target in LOG_FEATURES:
        v = np.log10(v)
    y_df[:, target_idx] = v

SCALER = StandardScaler()
y_df = SCALER.fit_transform(y_df)

In [5]:
MEAN = np.array([0.485, 0.456, 0.406])
STD = np.array([0.229, 0.224, 0.225])

TRAIN_TRANSFORMS = A.Compose([
        A.HorizontalFlip(p=0.5),
        A.RandomSizedCrop(
            [int(0.85*CONFIG.IMAGE_SIZE), CONFIG.IMAGE_SIZE],
            CONFIG.IMAGE_SIZE, CONFIG.IMAGE_SIZE, w2h_ratio=1.0, p=0.75),
        A.Resize(CONFIG.IMAGE_SIZE, CONFIG.IMAGE_SIZE),
        A.RandomBrightnessContrast(brightness_limit=0.1, contrast_limit=0.1, p=0.25),
        A.ImageCompression(quality_lower=85, quality_upper=100, p=0.25),
        A.ToFloat(),
        A.Normalize(mean=MEAN, std=STD, max_pixel_value=1),
        ToTensorV2(),
    ])

TEST_TRANSFORMS = A.Compose([
        A.Resize(CONFIG.IMAGE_SIZE, CONFIG.IMAGE_SIZE),
        A.ToFloat(),
        A.Normalize(mean=MEAN, std=STD, max_pixel_value=1),
        ToTensorV2(),
    ])

class Dataset(Dataset):
    def __init__(self, X_jpeg_bytes, y, transforms=None):
        self.X_jpeg_bytes = X_jpeg_bytes
        self.y = y
        self.transforms = transforms

    def __len__(self):
        return len(self.X_jpeg_bytes)

    def __getitem__(self, index):
        X_sample = self.transforms(
            image=imageio.imread(self.X_jpeg_bytes[index]),
        )['image']
        y_sample = self.y[index]
        
        return X_sample, y_sample

if CONFIG.TRAIN_MODEL:
    # Splitting the data into training and validation sets
    train_df, val_df, y_train, y_val = train_test_split(train, y_df, test_size=0.2, random_state=42)

    # Creating datasets for training and validation
    train_dataset = Dataset(
        train_df['jpeg_bytes'].values,
        y_train,
        TRAIN_TRANSFORMS,
    )

    val_dataset = Dataset(
        val_df['jpeg_bytes'].values,
        y_val,
        TEST_TRANSFORMS,
    )

    # Creating dataloaders for training and validation
    train_dataloader = DataLoader(
        train_dataset,
        batch_size=CONFIG.BATCH_SIZE,
        shuffle=True,
        drop_last=True,
        num_workers=psutil.cpu_count(),
    )

    val_dataloader = DataLoader(
        val_dataset,
        batch_size=CONFIG.BATCH_SIZE,
        shuffle=False,  # No need to shuffle validation data
        num_workers=psutil.cpu_count(),
    )


test_dataset = Dataset(
    test['jpeg_bytes'].values,
    test['id'].values,
    TEST_TRANSFORMS,
)

In [6]:
class Model(nn.Module):
    def __init__(self):
        super().__init__()
        self.backbone = timm.create_model(
                CONFIG.BACKBONE,
                num_classes=CONFIG.N_TARGETS,
                pretrained=True)
        
    def forward(self, inputs):
        return self.backbone(inputs)

model = Model()
model = model.to(device)
# print(model.backbone.head)

model.safetensors:   0%|          | 0.00/21.4M [00:00<?, ?B/s]

In [7]:
create_features = False

if create_features:
    feature_model = torch.load(CONFIG.MODEL_PATH)
    feature_model.backbone.head.fc = nn.Identity()
    feature_model.to(device);

#     dataset = Dataset(
#         train['jpeg_bytes'].values,
#         y_df,
#         TEST_TRANSFORMS,
#     )
    dataset = test_dataset

    # Create a DataLoader for the dataset
    dataloader = DataLoader(dataset, batch_size=CONFIG.BATCH_SIZE, shuffle=False)

    # Define the shape of the features array
    num_samples = len(dataset)
    num_features = 1536

    # Initialize an empty array filled with zeros to store features
    features_array = np.zeros((num_samples, num_features), dtype=np.float32)

    # Set the model to evaluation mode
    model.eval()

    # Initialize the index counter
    index = 0

    # Iterate over the data loader batches
    for batch in dataloader:
        if index % 10 == 0:
            print(index, end=', ')

        # Extract inputs and labels from the batch
        inputs, _ = batch
        inputs = inputs.to(device)

        # Forward pass through the model to extract features
        with torch.no_grad():
            features = feature_model(inputs)

        # Flatten the features
        features = features.view(features.size(0), -1)

        # Calculate the end index for the current batch
        end_index = index + inputs.size(0)

        # Assign features to the pre-allocated array
        features_array[index:end_index] = features.cpu().numpy()

        # Update the index counter
        index = end_index

    # Save the features as a NumPy array
    np.save('features_test.npy', features_array)
    


In [8]:
# def count_parameters(model):
#     return sum(p.numel() for p in model.parameters() if p.requires_grad)

# # Count parameters
# num_parameters = count_parameters(model)
# print("Number of parameters in the model:", num_parameters)

In [9]:
def get_lr_scheduler(optimizer):
    return torch.optim.lr_scheduler.OneCycleLR(
        optimizer=optimizer,
        max_lr=CONFIG.LR_MAX,
        total_steps=CONFIG.N_STEPS,
        pct_start=0.1,
        anneal_strategy='cos',
        div_factor=1e1,
        final_div_factor=1e1,
    )

class AverageMeter(object):
    def __init__(self):
        self.reset()

    def reset(self):
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val):
        self.sum += val.sum()
        self.count += val.numel()
        self.avg = self.sum / self.count

if CONFIG.TRAIN_MODEL:
    MAE = torchmetrics.regression.MeanAbsoluteError().to(device)
    R2 = torchmetrics.regression.R2Score(num_outputs=CONFIG.N_TARGETS, multioutput='uniform_average').to(device)
    LOSS = AverageMeter()

    Y_MEAN = torch.tensor(y_train).mean(dim=0).to(device)
    EPS = torch.tensor([1e-6]).to(device)

    def r2_loss(y_pred, y_true):
        ss_res = torch.sum((y_true - y_pred)**2, dim=0)
        ss_total = torch.sum((y_true - Y_MEAN)**2, dim=0)
        ss_total = torch.maximum(ss_total, EPS)
        r2 = torch.mean(ss_res / ss_total)
        return r2

    LOSS_FN = r2_loss

    optimizer = torch.optim.AdamW(
        params=model.parameters(),
        lr=CONFIG.LR_MAX,
        weight_decay=CONFIG.WEIGHT_DECAY,
    )

    LR_SCHEDULER = get_lr_scheduler(optimizer)

In [10]:
if CONFIG.TRAIN_MODEL:
    print("Start Training:")
    for epoch in range(CONFIG.N_EPOCHS):
        MAE.reset()
        R2.reset()
        LOSS.reset()
        model.train()

        for step, (X_batch, y_true) in enumerate(train_dataloader):
            X_batch = X_batch.to(device)
            y_true = y_true.to(device)
            t_start = time.perf_counter_ns()
            y_pred = model(X_batch)
            loss = LOSS_FN(y_pred, y_true)
            LOSS.update(loss)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            LR_SCHEDULER.step()
            MAE.update(y_pred, y_true)
            R2.update(y_pred, y_true)

            if not CONFIG.IS_INTERACTIVE and (step+1) == len(train_dataloader):
                print(
                    f'EPOCH {epoch+1:02d}, {step+1:04d}/{len(train_dataloader)} | ' + 
                    f'loss: {LOSS.avg:.4f}, mae: {MAE.compute().item():.4f}, r2: {R2.compute().item():.4f}, ' +
                    f'step: {(time.perf_counter_ns()-t_start)*1e-9:.3f}s, lr: {LR_SCHEDULER.get_last_lr()[0]:.2e}',
                )
            elif CONFIG.IS_INTERACTIVE:
                print(
                    f'\rEPOCH {epoch+1:02d}, {step+1:04d}/{len(train_dataloader)} | ' + 
                    f'loss: {LOSS.avg:.4f}, mae: {MAE.compute().item():.4f}, r2: {R2.compute().item():.4f}, ' +
                    f'step: {(time.perf_counter_ns()-t_start)*1e-9:.3f}s, lr: {LR_SCHEDULER.get_last_lr()[0]:.2e}',
                    end='\n' if (step + 1) == CONFIG.N_STEPS_PER_EPOCH else '', flush=True,
                )
            
        MAE.reset()
        R2.reset()
        LOSS.reset()
        model.eval()
        
        print()
        
        for step, (X_batch, y_true) in enumerate(val_dataloader):
            X_batch = X_batch.to(device)
            y_true = y_true.to(device)
            t_start = time.perf_counter_ns()
            with torch.no_grad():
                y_pred = model(X_batch)
                loss = LOSS_FN(y_pred, y_true)
                
            LOSS.update(loss)
            MAE.update(y_pred, y_true)
            R2.update(y_pred, y_true)
            if not CONFIG.IS_INTERACTIVE and (step+1) == len(val_dataloader):
                print(
                    f'EPOCH VAL, {epoch+1:02d}, {step+1:04d}/{len(val_dataloader)} | ' + 
                    f'loss: {LOSS.avg:.4f}, mae: {MAE.compute().item():.4f}, r2: {R2.compute().item():.4f}, ' +
                    f'step: {(time.perf_counter_ns()-t_start)*1e-9:.3f}s, lr: {LR_SCHEDULER.get_last_lr()[0]:.2e}',
                )
            elif CONFIG.IS_INTERACTIVE:
                print(
                    f'\rEPOCH {epoch+1:02d}, {step+1:04d}/{len(val_dataloader)} | ' + 
                    f'loss: {LOSS.avg:.4f}, mae: {MAE.compute().item():.4f}, r2: {R2.compute().item():.4f}, ' +
                    f'step: {(time.perf_counter_ns()-t_start)*1e-9:.3f}s, lr: {LR_SCHEDULER.get_last_lr()[0]:.2e}',
                    end='\n' if (step + 1) == CONFIG.N_STEPS_PER_EPOCH else '', flush=True,
                )
        print()

    torch.save(model, 'model.pth')
else:
    model = torch.load(CONFIG.MODEL_PATH)
    model.to(device)

Start Training:
EPOCH 01, 0855/855 | loss: 1.3073, mae: 0.7828, r2: -0.2148, step: 0.843s, lr: 9.97e-05

EPOCH VAL, 01, 0214/214 | loss: 0.9250, mae: 0.6537, r2: 0.0891, step: 0.084s, lr: 9.97e-05

EPOCH 02, 0855/855 | loss: 0.8958, mae: 0.6393, r2: 0.1125, step: 0.405s, lr: 9.18e-05

EPOCH VAL, 02, 0214/214 | loss: 0.8660, mae: 0.6221, r2: 0.1410, step: 0.080s, lr: 9.18e-05

EPOCH 03, 0855/855 | loss: 0.8543, mae: 0.6193, r2: 0.1509, step: 0.405s, lr: 7.52e-05

EPOCH VAL, 03, 0214/214 | loss: 0.8427, mae: 0.6160, r2: 0.1637, step: 0.080s, lr: 7.52e-05

EPOCH 04, 0855/855 | loss: 0.8330, mae: 0.6083, r2: 0.1719, step: 0.405s, lr: 5.33e-05

EPOCH VAL, 04, 0214/214 | loss: 0.8308, mae: 0.6062, r2: 0.1734, step: 0.080s, lr: 5.33e-05

EPOCH 05, 0855/855 | loss: 0.8152, mae: 0.6003, r2: 0.1887, step: 0.405s, lr: 3.08e-05

EPOCH VAL, 05, 0214/214 | loss: 0.8248, mae: 0.6010, r2: 0.1807, step: 0.080s, lr: 3.08e-05

EPOCH 06, 0855/855 | loss: 0.8031, mae: 0.5947, r2: 0.2003, step: 0.405s, lr: 

In [11]:
# train_features = np.load('/kaggle/input/planttraits2024-image-features/features_train.npy')
# test_features = np.load('/kaggle/input/planttraits2024-image-features/features_test.npy')

# features_df = pd.DataFrame(train_features)
# features_df.columns = [f'feature_{i}' for i in range(train_features.shape[1])]
# features_df.head()

# train_df = pd.concat([train.reset_index(drop=True), features_df.reset_index(drop=True)], axis=1)

# features_df = pd.DataFrame(test_features)
# features_df.columns = [f'feature_{i}' for i in range(test_features.shape[1])]
# features_df.head()

# test_df = pd.concat([test.reset_index(drop=True), features_df.reset_index(drop=True)], axis=1)

In [12]:
# do_cv = True

# remove_cols = [ 
#  'X4_sd',
#  'X11_sd',
#  'X18_sd',
#  'X26_sd',
#  'X50_sd',
#  'X3112_sd',
#  'file_path',
#  'jpeg_bytes'
# ]

# X_full = train_df.drop(columns=CONFIG.TARGET_COLUMNS + remove_cols)
# Y_full = train_df[CONFIG.TARGET_COLUMNS]

# models = {}

# for column in Y_full.columns:

#     model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=150, learning_rate=0.1, max_depth=10)

#     if do_cv:
#         print(f"\nDoing cross-validation scoring for {column}...")
#         scores = cross_val_score(model, X_full, Y_full[column],
#                                  cv=KFold(n_splits=3, shuffle=True, random_state=42),
#                                  scoring='r2')        
#         print(f"R^2 score for {column}: {np.mean(scores)}")
    
#     #train model with all data
#     print(f"Training model for {column}...")
#     model.fit(X_full, Y_full[column])
#     models[column] = model
    
# with open('all_models.pkl', 'wb') as file:
#     pickle.dump(models, file)

In [13]:
# with open('all_models.pkl', 'rb') as file:
#     models = pickle.load(file)

In [14]:
# mean_values = Y_full.mean()
# submission = pd.DataFrame({'id': test_df['id']})
# submission[Y_full.columns] = mean_values

# #rename from _mean
# submission.columns = submission.columns.str.replace('_mean', '')
# submission

In [15]:
# remove_cols = [ 
#  'file_path',
#  'jpeg_bytes'
# ]

# X_test = test_df.drop(columns=remove_cols)

# submission['X4'] = models['X4_mean'].predict(X_test)
# submission['X11'] = models['X11_mean'].predict(X_test)
# submission['X18'] = models['X18_mean'].predict(X_test)
# submission['X50'] = models['X50_mean'].predict(X_test)
# submission['X26'] = models['X26_mean'].predict(X_test)
# submission['X3112'] = models['X3112_mean'].predict(X_test)

# submission

In [16]:
# submission.to_csv('submission.csv', index=False)

In [17]:
SUBMISSION_ROWS = []
model.eval()

for X_sample_test, test_id in tqdm(test_dataset):
    with torch.no_grad():
        y_pred = model(X_sample_test.unsqueeze(0).to(device)).detach().cpu().numpy()
    
    y_pred = SCALER.inverse_transform(y_pred).squeeze()
    row = {'id': test_id}
    
    for k, v in zip(CONFIG.TARGET_COLUMNS, y_pred[:6]):
        if k in LOG_FEATURES:
            row[k.replace('_mean', '')] = 10 ** v
        else:
            row[k.replace('_mean', '')] = v

    SUBMISSION_ROWS.append(row)
    
submission_df = pd.DataFrame(SUBMISSION_ROWS)
submission_df.to_csv('submission.csv', index=False)
print("Submit!")

  0%|          | 0/6545 [00:00<?, ?it/s]

Submit!


In [18]:
submission_df


Unnamed: 0,id,X4,X11,X18,X50,X26,X3112
0,201238668,0.567912,10.537067,0.935950,1.764433,2.530322,179.446643
1,202310319,0.376271,18.070832,0.323159,1.202091,0.591087,563.261426
2,202604412,0.573178,11.760076,0.813296,1.735611,4.540753,271.220300
3,201353439,0.456050,21.773234,0.184608,1.062815,0.203505,449.425292
4,195351745,0.468225,14.780636,0.145836,1.275627,0.411594,212.546681
...,...,...,...,...,...,...,...
6540,195548469,0.514686,11.436031,0.292879,1.618907,0.900842,154.331179
6541,199261251,0.597640,13.786541,9.237129,1.286534,15.466591,3437.066122
6542,203031744,0.457860,18.465233,0.902051,1.404124,5.867988,1968.572011
6543,197736382,0.397470,19.909937,0.525531,1.017316,0.470583,372.962193
