# Swin Transformer and Support Vector Regression Ensemble
After training the Swin Transformer, we ensemble it with a Support Vector Regression (SVR) model to see whether this improves performance. For each of the 5 Swin Transformer folds we train one SVR on that fold's Swin output embeddings concatenated with the photo metadata. The final prediction is a weighted average of the Swin Transformer predictions and the predictions of their respective SVRs, averaged over the 5 folds. The ensemble performed marginally better than the Swin Transformer alone.

# Imports

In [1]:
# Add tez and timm libraries to Kaggle
import sys
sys.path.append("../input/tez-lib/")
sys.path.append("../input/timmmaster/")

# Basics
import numpy as np
import pandas as pd

# Image manipulation
import cv2

# Main neural network library
import torch.nn as nn
import torch

# Easier PyTorch development
import tez
from tez.callbacks import EarlyStopping

# Image augmentations
import albumentations

# Swin Transformer backbone
import timm

# Calculate MSE
from sklearn.metrics import mean_squared_error

# Support Vector Regression
import cuml
from cuml.svm import SVR

# Model serialization
import pickle

# General

In [2]:
# Fold to train: rows whose `kfold` equals this value are held out for validation,
# and this fold's Swin checkpoint / SVR pickle are the ones used in SVR training
FOLD = 0

# Input image size: images are resized to IMAGE_SIZE x IMAGE_SIZE pixels
# (384 matches the "swin_large_patch4_window12_384" backbone name)
IMAGE_SIZE = 384

# Weighted average weight for Swin Transformer; the SVR gets (1 - SWIN_WEIGHT)
SWIN_WEIGHT = 0.5

In [3]:
# To denormalize predictions
def sigmoid(x):
    """Numerically stable logistic function ``1 / (1 + exp(-x))``.

    Maps raw model logits to (0, 1). The naive formula overflows in
    ``np.exp(-x)`` for large negative ``x`` (emitting a RuntimeWarning); this
    version only ever exponentiates non-positive numbers, so it cannot overflow.

    Args:
        x: Scalar or array-like of logits.

    Returns:
        The element-wise sigmoid with the same shape as ``x``. Floating-point
        inputs keep their dtype; other inputs are computed in float64. A scalar
        input yields a NumPy scalar.
    """
    x = np.asarray(x)
    if not np.issubdtype(x.dtype, np.floating):
        x = x.astype(np.float64)

    # exp(-|x|) always lies in (0, 1], whatever the sign or magnitude of x
    z = np.exp(-np.abs(x))

    # x >= 0: 1 / (1 + e^-x);  x < 0: e^x / (1 + e^x) -- algebraically identical
    out = np.where(x >= 0, 1 / (1 + z), z / (1 + z))

    # Indexing with () unwraps a 0-d result to a scalar and leaves arrays untouched
    return out[()]

# Dataset

In [4]:
class Dataset:
    """Map-style dataset yielding an image, its photo metadata and its target.

    Args:
        image_paths: Sequence of image file paths, one per sample.
        dense_features: 2-D array indexed as ``[sample, feature]`` holding the
            photo metadata.
        targets: Per-sample target scores; they are divided by 100 on access.
        augmentations: Callable invoked as ``augmentations(image=image)`` and
            returning a dict with an ``"image"`` entry, or ``None`` to skip.
    """

    def __init__(self, image_paths, dense_features, targets, augmentations):
        self.image_paths = image_paths
        self.dense_features = dense_features
        self.targets = targets
        self.augmentations = augmentations

    def __len__(self):
        """Return the number of samples (one per image path)."""
        return len(self.image_paths)

    def __getitem__(self, item):
        """Load and preprocess sample ``item``.

        Returns:
            dict with float tensors ``"image"`` (channel, height, width),
            ``"features"`` (metadata row) and ``"targets"`` (target / 100).

        Raises:
            FileNotFoundError: If the image cannot be read from disk.
        """
        path = self.image_paths[item]

        # cv2.imread signals failure by returning None instead of raising;
        # fail here with the offending path rather than inside cvtColor
        image = cv2.imread(path)
        if image is None:
            raise FileNotFoundError(f"Could not read image: {path}")

        # Convert BGR to RGB
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        # Augmentations
        if self.augmentations is not None:
            augmented = self.augmentations(image=image)
            image = augmented["image"]

        # OpenCV is (height, width, channel)
        # PyTorch wants (channel, height, width)
        image = np.transpose(image, (2, 0, 1)).astype(np.float32)

        # Photo metadata
        features = self.dense_features[item, :]

        # Important: divide by 100 to normalize targets for BCE loss
        targets = self.targets[item] / 100.0

        return {
            "image": torch.tensor(image, dtype=torch.float),
            "features": torch.tensor(features, dtype=torch.float),
            "targets": torch.tensor(targets, dtype=torch.float),
        }

In [5]:
# Read the community-provided CSV, which carries a `kfold` column for the 5-fold split
df = pd.read_csv("../input/same-old-creating-folds/train_5folds.csv")

# Fold FOLD is held out for validation; every other fold is used for training
df_train = df.loc[df["kfold"] != FOLD].reset_index(drop=True)
df_valid = df.loc[df["kfold"] == FOLD].reset_index(drop=True)

# One JPEG per row, named after the row's Id
train_image_paths = [
    f"../input/petfinder-pawpularity-score/train/{image_id}.jpg"
    for image_id in df_train["Id"].values
]
valid_image_paths = [
    f"../input/petfinder-pawpularity-score/train/{image_id}.jpg"
    for image_id in df_valid["Id"].values
]

# Photo metadata columns fed to the model as dense features
photo_metadata = [
    "Subject Focus",
    "Eyes",
    "Face",
    "Near",
    "Action",
    "Accessory",
    "Group",
    "Collage",
    "Human",
    "Occlusion",
    "Info",
    "Blur",
]
train_dense_features = df_train[photo_metadata].values
valid_dense_features = df_valid[photo_metadata].values

# Pawpularity scores are the regression targets
train_targets = df_train["Pawpularity"].values
valid_targets = df_valid["Pawpularity"].values

# Training pipeline: resize, three colour augmentations (each applied with
# probability 0.5), then normalization with the mean/std given below
train_aug = albumentations.Compose(
    [
        albumentations.Resize(IMAGE_SIZE, IMAGE_SIZE, p=1),
        albumentations.HueSaturationValue(
            hue_shift_limit=0.2,
            sat_shift_limit=0.2,
            val_shift_limit=0.2,
            p=0.5,
        ),
        albumentations.RandomBrightnessContrast(
            brightness_limit=(-0.1, 0.1),
            contrast_limit=(-0.1, 0.1),
            p=0.5,
        ),
        albumentations.RGBShift(
            r_shift_limit=15,
            g_shift_limit=15,
            b_shift_limit=15,
            p=0.5,
        ),
        albumentations.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
            max_pixel_value=255.0,
            p=1.0,
        ),
    ],
    p=1.0,
)

# Validation pipeline: resize and normalize only, so evaluation is deterministic
valid_aug = albumentations.Compose(
    [
        albumentations.Resize(IMAGE_SIZE, IMAGE_SIZE, p=1),
        albumentations.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
            max_pixel_value=255.0,
            p=1.0,
        ),
    ],
    p=1.0,
)

# Wrap everything in datasets (arguments: paths, metadata, targets, augmentations)
train_dataset = Dataset(train_image_paths, train_dense_features, train_targets, train_aug)
valid_dataset = Dataset(valid_image_paths, valid_dense_features, valid_targets, valid_aug)

# Swin Model

In [6]:
class Swin(tez.Model):
    """Swin Transformer regressor that fuses image embeddings with photo metadata.

    The timm backbone's head is replaced by a 128-unit linear layer. Its output
    is concatenated with the 12 photo-metadata flags (128 + 12 = 140 inputs),
    passed through a 64-unit ReLU layer and reduced to a single logit. The
    target is normalized to [0, 1], so the logit is trained with BCE-with-logits.

    Args:
        pretrained: Whether timm should initialize the backbone from its
            pretrained weights. Defaults to ``True`` (the original behavior);
            pass ``False`` when a checkpoint is loaded right afterwards to skip
            the download.
    """

    def __init__(self, pretrained=True):
        super().__init__()

        # Swin Transformer backbone
        self.model = timm.create_model("swin_large_patch4_window12_384", pretrained=pretrained, in_chans=3)

        # Classification head: project backbone features to a 128-d embedding
        self.model.head = nn.Linear(self.model.head.in_features, 128)
        self.dropout = nn.Dropout(0.1)

        # 140 = 128 embedding dims + 12 photo-metadata features
        self.dense1 = nn.Linear(140, 64)

        # Important: add ReLU activation function for dense head layer to introduce non-linearity
        self.relu1 = nn.ReLU()

        # Single output without sigmoid, BCE loss will apply sigmoid automatically
        self.dense2 = nn.Linear(64, 1)

        # Step the schedule after each epoch
        self.step_scheduler_after = "epoch"

    def monitor_metrics(self, outputs, targets):
        """Return ``{"rmse": ...}`` computed on the original 0-100 score scale.

        Args:
            outputs: Raw logits from ``forward``.
            targets: Targets normalized to [0, 1].
        """
        # Important: apply sigmoid function to output predictions and multiply by 100 to denormalize
        outputs = sigmoid(outputs.cpu().detach().numpy()) * 100

        # Important: multiply targets by 100 to denormalize
        targets = targets.cpu().detach().numpy() * 100

        # Calculate RMSE. The `squared=False` shortcut of mean_squared_error is
        # deprecated/removed in recent scikit-learn, so take the root explicitly.
        rmse = np.sqrt(mean_squared_error(targets, outputs))
        return {"rmse": rmse}

    def fetch_scheduler(self):
        """Build the cosine-annealing-with-warm-restarts LR scheduler."""
        scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
            self.optimizer, T_0=10, T_mult=1, eta_min=1e-6, last_epoch=-1
        )
        return scheduler

    def fetch_optimizer(self):
        """Build the Adam optimizer over all model parameters (lr=1e-4)."""
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-4)
        return optimizer

    def forward(self, image, features, targets=None):
        """Run the model on a batch.

        Args:
            image: Batch of images for the backbone.
            features: Batch of photo-metadata rows, concatenated to the
                embedding along dim 1.
            targets: Optional normalized targets; when given, loss and metrics
                are computed.

        Returns:
            ``(logits, loss, metrics)``. Without targets, ``loss`` is ``0`` and
            ``metrics`` is ``{}``.
        """
        # Forward pass through model
        x = self.model(image)
        x = self.dropout(x)
        x = torch.cat([x, features], dim=1)
        x = self.dense1(x)

        # Important: pass through ReLU activation
        x = self.relu1(x)

        # Output node
        x = self.dense2(x)

        if targets is not None:
            # Important: use binary cross entropy loss instead of mean squared error loss
            loss = nn.BCEWithLogitsLoss()(x, targets.view(-1, 1))

            metrics = self.monitor_metrics(x, targets)
            return x, loss, metrics
        return x, 0, {}

# Swin Inference

In [7]:
# Create test dataset
df_test = pd.read_csv("../input/petfinder-pawpularity-score/test.csv")
test_image_paths = [f"../input/petfinder-pawpularity-score/test/{x}.jpg" for x in df_test["Id"].values]
test_dense_features = df_test[photo_metadata].values

# The test set has no labels; dummy targets keep the Dataset interface happy
test_targets = np.ones(len(test_image_paths))

# Test-time pipeline: resize and normalize only (no random augmentations)
test_aug = albumentations.Compose(
    [
        albumentations.Resize(IMAGE_SIZE, IMAGE_SIZE, p=1),
        albumentations.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
            max_pixel_value=255.0,
            p=1.0,
        ),
    ],
    p=1.0,
)
test_dataset = Dataset(
    image_paths=test_image_paths,
    dense_features=test_dense_features,
    targets=test_targets,
    augmentations=test_aug,
)

# Get predictions for all 5 folds (one list of per-image scores per fold)
all_swin_preds = []
for fold in range(5):
    # Load Swin for current fold
    cur_swin = Swin()
    cur_swin.load(f"../input/bce-20-epochs/model_f{fold}.bin", device="cuda", weights_only=True)

    # Predict test set
    test_preds = cur_swin.predict(test_dataset, batch_size=16, n_jobs=-1)

    cur_swin_preds = []
    for preds in test_preds:
        # Denormalize (logit -> 0-100 score), flatten, and add to current fold list
        cur_swin_preds.extend((sigmoid(preds) * 100).ravel().tolist())

    # Add current fold list to all folds list
    all_swin_preds.append(cur_swin_preds)

    # Release this fold's model before the next one is built so that two
    # large models never have to share GPU memory
    del cur_swin
    torch.cuda.empty_cache()

# Final prediction is the average of all folds
df_test["Pawpularity"] = np.mean(np.column_stack(all_swin_preds), axis=1)

# Explicit copy: later cells assign into this frame, and assigning into a
# column slice can trigger pandas' chained-assignment warning
df_test = df_test[["Id", "Pawpularity"]].copy()
df_test.to_csv("submission.csv", index=False)

Downloading: "https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window12_384_22kto1k.pth" to /root/.cache/torch/hub/checkpoints/swin_large_patch4_window12_384_22kto1k.pth
100%|██████████| 1/1 [00:00<00:00,  1.24it/s, stage=test]
100%|██████████| 1/1 [00:00<00:00,  1.63it/s, stage=test]
100%|██████████| 1/1 [00:00<00:00,  1.70it/s, stage=test]
100%|██████████| 1/1 [00:00<00:00,  1.32it/s, stage=test]
100%|██████████| 1/1 [00:00<00:00,  1.60it/s, stage=test]


In [8]:
# Preview the first rows of the Swin-only submission
df_test.head()

Unnamed: 0,Id,Pawpularity
0,4128bae22183829d2b5fea10effdb0c3,40.683147
1,43a2262d7738e3d420d453815151079e,41.5589
2,4e429cead1848a298432a0acad014c9d,40.380956
3,80bc3ccafcc51b66303c2c263aa38486,40.51514
4,8f49844c382931444e68dffbe20228f4,40.567381


# Swin SVR Model

In [9]:
class SwinSvr(tez.Model):
    """Inference-only Swin model that also exposes the inputs for the SVR.

    The layers mirror the trained Swin regressor (same attribute names and
    sizes), with the backbone created without pretrained weights. ``forward``
    returns, per sample, the output logit followed by the 128-d backbone
    embedding and the photo-metadata features.
    """

    def __init__(self):
        super().__init__()

        # Backbone whose head is swapped for a 128-unit linear projection
        backbone = timm.create_model("swin_large_patch4_window12_384", pretrained=False, in_chans=3)
        backbone.head = nn.Linear(backbone.head.in_features, 128)
        self.model = backbone

        # Regression head: dropout -> Linear(140, 64) -> ReLU -> Linear(64, 1)
        self.dropout = nn.Dropout(0.1)
        self.dense1 = nn.Linear(140, 64)
        self.relu1 = nn.ReLU()
        self.dense2 = nn.Linear(64, 1)

    def forward(self, image, features, targets=None):
        """Return ``(output, 0, {})`` where ``output`` is ``[logit | embeddings | features]``.

        ``targets`` is accepted for interface compatibility but never used.
        """
        embeddings = self.model(image)

        # Same head as the regressor: dropout is applied to the embedding
        # before it is joined with the metadata
        head_input = torch.cat([self.dropout(embeddings), features], dim=1)
        logit = self.dense2(self.relu1(self.dense1(head_input)))

        # Important: include embeddings and features (the un-dropped embedding)
        combined = torch.cat([logit, embeddings, features], dim=1)
        return combined, 0, {}

# SVR Training

In [10]:
# Use Swin to get embeddings
swin_svr = SwinSvr()
swin_svr.load(f"../input/bce-20-epochs/model_f{FOLD}.bin", device="cuda", weights_only=True)

# NOTE(review): train_dataset applies the random train-time augmentations, so
# these embeddings differ from run to run -- confirm this is intended
swin_train_preds = swin_svr.predict(train_dataset, batch_size=16, n_jobs=-1)

# Column 0 is the Swin logit; columns 1: are the 140 SVR inputs
# (128 embedding dims + 12 metadata features). Batches are joined in a single
# concatenate; the leading empty float64 block keeps the result float64 and
# well-defined even when there are no batches.
train_embeddings = np.concatenate(
    [np.empty((0, 140))] + [preds[:, 1:] for preds in swin_train_preds],
    axis=0,
)

# Train SVR on embeddings
svr = SVR(C=20.0)
svr.fit(train_embeddings, train_targets)

# Context manager guarantees the pickle is flushed and the handle closed
with open(f"SVR_fold_{FOLD}.pkl", "wb") as svr_file:
    pickle.dump(svr, svr_file)

# Validation RMSE
swin_valid_preds = swin_svr.predict(valid_dataset, batch_size=16, n_jobs=-1)
valid_embedding_batches = []
denormalized_swin_valid_preds = []
for preds in swin_valid_preds:
    valid_embedding_batches.append(preds[:, 1:])

    # Logit -> 0-100 Pawpularity score
    denormalized_swin_valid_preds.extend((sigmoid(preds[:, 0]) * 100).ravel().tolist())
valid_embeddings = np.concatenate([np.empty((0, 140))] + valid_embedding_batches, axis=0)

svr_valid_preds = svr.predict(valid_embeddings)
ensemble_preds = SWIN_WEIGHT * np.array(denormalized_swin_valid_preds) + (1 - SWIN_WEIGHT) * svr_valid_preds

# Root of the MSE, taken explicitly because the `squared=False` shortcut is
# deprecated/removed in recent scikit-learn; the value reported is an RMSE
valid_rmse = np.sqrt(mean_squared_error(valid_targets, ensemble_preds))
print("Validation RMSE:", valid_rmse)

100%|██████████| 496/496 [05:46<00:00,  1.43it/s, stage=test]
100%|██████████| 124/124 [01:27<00:00,  1.42it/s, stage=test]

Validation MSE: 18.170138826483203





# Ensemble Inference

In [11]:
# Per-fold SVR predictions on the test set
all_svr_preds = []
for fold in range(5):
    # Use Swin to get embeddings
    swin = SwinSvr()
    swin.load(f"../input/bce-20-epochs/model_f{fold}.bin", device="cuda", weights_only=True)
    swin_test_preds = swin.predict(test_dataset, batch_size=16, n_jobs=-1)

    # Drop column 0 (the Swin logit); the remaining 140 columns are the SVR
    # inputs. One concatenate over all batches; the empty float64 block keeps
    # the dtype and the empty-input case identical to growing from (0, 140).
    embeddings = np.concatenate(
        [np.empty((0, 140))] + [preds[:, 1:] for preds in swin_test_preds],
        axis=0,
    )

    # Load SVR and predict using embeddings.
    # NOTE: unpickling executes arbitrary code -- only load trusted files.
    with open(f"../input/svr-bce-20-epochs-5-folds/SVR_fold_{fold}.pkl", "rb") as svr_file:
        svr = pickle.load(svr_file)
    svr_preds = svr.predict(embeddings)
    all_svr_preds.append(svr_preds)

# Average over folds. The per-fold lists are left untouched (new names for the
# means) so that this cell can be re-run without corrupting its own inputs.
swin_mean_preds = np.mean(np.column_stack(all_swin_preds), axis=1)
svr_mean_preds = np.mean(np.column_stack(all_svr_preds), axis=1)

# Final prediction is the weighted average of Swin Transformer and SVR predictions.
# Build a fresh frame instead of assigning into a slice of the previous one.
df_test = pd.DataFrame(
    {
        "Id": df_test["Id"].values,
        "Pawpularity": SWIN_WEIGHT * swin_mean_preds + (1 - SWIN_WEIGHT) * svr_mean_preds,
    }
)
df_test.to_csv("submission.csv", index=False)

100%|██████████| 1/1 [00:00<00:00,  1.54it/s, stage=test]
100%|██████████| 1/1 [00:00<00:00,  1.61it/s, stage=test]
100%|██████████| 1/1 [00:00<00:00,  1.65it/s, stage=test]
100%|██████████| 1/1 [00:00<00:00,  1.63it/s, stage=test]
100%|██████████| 1/1 [00:00<00:00,  1.48it/s, stage=test]


In [12]:
# Preview the first rows of the final Swin + SVR ensemble submission
df_test.head()

Unnamed: 0,Id,Pawpularity
0,4128bae22183829d2b5fea10effdb0c3,41.071823
1,43a2262d7738e3d420d453815151079e,42.346951
2,4e429cead1848a298432a0acad014c9d,40.599312
3,80bc3ccafcc51b66303c2c263aa38486,40.497409
4,8f49844c382931444e68dffbe20228f4,40.870295
