In [1]:
import os
import random
import gc

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torchvision
import torch.nn as nn
import torchvision.models as models
import torch.nn.functional as F

from torchvision.io import read_image
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader

from sklearn.preprocessing import StandardScaler
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, accuracy_score, classification_report

from tqdm.notebook import tqdm, trange

# Set WANDB_CONSOLE to 'off' for this process (original note: assertion error workaround).
# NOTE(review): no wandb usage is visible in this notebook -- confirm the workaround is still needed.
os.environ['WANDB_CONSOLE'] = 'off' # assertion error workaround

In [2]:
# --- Paths ---------------------------------------------------------------
INPUT_FOLDER = '../input/'
DATA_PATH = INPUT_FOLDER + 'ails3-data'
OUTPUT_FOLDER = './'
SUBMISSION_FILE = 'submission.csv'

# --- Experiment settings ---------------------------------------------------
SEED = 2022
NUM_CLASSES = 9
# True: run the pretrained backbones and save the features as .npy files.
# False: load previously saved features instead.
EXTRACT_FEATURES = True

# Seed every RNG the notebook draws from so that the random augmentations,
# the MLP weight initialisation and the shuffled mini-batches are repeatable
# on a fresh "Restart & Run All". (train_test_split already receives SEED
# explicitly through random_state.)
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

## Exploration

### Targets

In [3]:
# Training labels: read y_train.csv from the data folder and display the frame.
labels_csv = os.path.join(DATA_PATH, 'y_train.csv')
train_df = pd.read_csv(labels_csv)
train_df

In [4]:
# Number of training samples per cell line (shows how balanced the classes are).
cell_line_counts = train_df['cell_line'].value_counts()
cell_line_counts

In [5]:
# Integer-encode the cell lines in sorted order so the mapping is deterministic.
cell_lines = sorted(train_df['cell_line'].unique())
target2code = dict(zip(cell_lines, range(len(cell_lines))))  # cell line -> class index
code2target = dict(enumerate(cell_lines))                    # class index -> cell line

# Numeric training target used by the classifiers further down.
train_df['target'] = train_df['cell_line'].map(target2code)

### Images

In [6]:
def get_ids(subset='train'):
    """Return the sorted, de-duplicated sample ids found in one image folder.

    The id of a file is the integer formed by the first five characters of its
    name; several files may share an id, hence the de-duplication.

    Args:
        subset: Folder suffix; the directory listed is
            ``DATA_PATH/images_<subset>/images_<subset>``.

    Returns:
        A list of unique integer ids in ascending order.
    """
    image_dir = os.path.join(DATA_PATH, 'images_' + subset, 'images_' + subset)
    unique_ids = {int(name[:5]) for name in os.listdir(image_dir)}
    return sorted(unique_ids)

In [7]:
def read_sample(file_id, subset='train', normalize=False):
    """Load the blue, red and yellow images of one sample and concatenate them.

    Fix: the yellow image used to be read from the ``_red.png`` file, so the
    red image appeared twice and the yellow image was never loaded. Features
    cached from the old version therefore need to be re-extracted.

    Args:
        file_id: Sample id (int or str); it is zero-padded to five characters
            to form the file-name prefix.
        subset: Folder suffix; images are read from
            ``DATA_PATH/images_<subset>/images_<subset>``.
        normalize: If True, every image is divided by 255 before
            concatenation; otherwise the tensors are used exactly as
            ``read_image`` returns them.

    Returns:
        ``torch.cat`` of the (blue, red, yellow) tensors, i.e. joined along
        dimension 0. Presumably each PNG is single-channel, giving a
        3-channel image -- TODO confirm against the data.
    """
    path = os.path.join(DATA_PATH, 'images_' + subset, 'images_' + subset)
    prefix = str(file_id).zfill(5)
    channels = [
        read_image(os.path.join(path, f'{prefix}_{colour}.png'))
        for colour in ('blue', 'red', 'yellow')
    ]
    if normalize:
        channels = [channel / 255 for channel in channels]
    return torch.cat(channels)

In [8]:
# Ids of every sample present on disk, per subset (ascending order).
train_ids = get_ids('train')
test_ids = get_ids('test')

# Divide pixel values by 255 while loading; later cells reuse this flag.
normalize = True

# One stacked tensor per subset, in the same order as the id lists above.
train_images = torch.stack(
    [read_sample(sample_id, 'train', normalize) for sample_id in tqdm(train_ids)]
)
test_images = torch.stack(
    [read_sample(sample_id, 'test', normalize) for sample_id in tqdm(test_ids)]
)

print(f'Train images ... {train_images.shape}')
print(f'Test images ... {test_images.shape}')

## Train-Validation Split

In [10]:
# Stratified hold-out split of the labelled samples.
# Caution: `train_df` is rebound to the training part, so running this cell a
# second time would split the already reduced frame again.
val_size = 0.2
split_options = dict(
    test_size=val_size,
    random_state=SEED,
    shuffle=True,
    stratify=train_df['cell_line'],
)
train_df, val_df = train_test_split(train_df, **split_options)

In [11]:
# Reload the images in the row order of each split so that image i lines up
# with target i below.
train_images = torch.stack(
    [read_sample(sample_id, 'train', normalize) for sample_id in tqdm(train_df['file_id'])]
)
val_images = torch.stack(
    [read_sample(sample_id, 'train', normalize) for sample_id in tqdm(val_df['file_id'])]
)

# Integer class labels as NumPy arrays, in the same row order as the images.
train_targets = train_df['target'].to_numpy()
val_targets = val_df['target'].to_numpy()

In [12]:
# Sanity check: tensor shapes of the train / validation / test image stacks.
print(train_images.shape, val_images.shape, test_images.shape, sep='\n')

## Feature Extraction

### Dataset & Preprocessing

In [13]:
class ImageDataset(Dataset):
    """Label-free dataset over an indexable collection of images.

    Args:
        images: Any object supporting ``len()`` and integer indexing.
        transform: Optional callable applied to each image on access; a falsy
            value (the default ``None``) returns the stored image untouched.
    """

    def __init__(self, images, transform=None):
        self.images = images
        self.transform = transform

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        sample = self.images[idx]
        if not self.transform:
            return sample
        # The transform runs on every access, so random transforms produce a
        # fresh variant each time the same index is requested.
        return self.transform(sample)

### Model

In [14]:
class BaseVGG(nn.Module):
    """Feature extractor: a given feature module followed by adaptive average pooling.

    Args:
        features: Module applied first to the input batch.
        pool: Output size handed to ``nn.AdaptiveAvgPool2d``.
    """

    def __init__(self, features, pool=1):
        super().__init__()
        self.features = features
        self.pool = nn.AdaptiveAvgPool2d(pool)

    def forward(self, x):
        """Return the pooled feature maps flattened to ``(batch, -1)``; no gradients are tracked."""
        with torch.no_grad():
            pooled = self.pool(self.features(x))
            return pooled.flatten(start_dim=1)

In [15]:
class Identity(nn.Module):
    """Pass-through module: ``forward`` returns its input unchanged.

    Used in this notebook to replace a model's classification head so that the
    model outputs the features that would have been fed to that head.
    """

    def forward(self, x):
        return x

### Extract and Save Features

In [16]:
@torch.no_grad()
def get_features(net_list, dataloader):
    """Run every network on every batch and return the concatenated features.

    Relies on the module-level ``device``. All networks are moved to that
    device and switched to eval mode before inference.

    Args:
        net_list: Networks whose outputs are 2-D ``(batch, features)`` tensors.
        dataloader: Iterable yielding image batches.

    Returns:
        NumPy array in which the outputs of all networks are joined along
        axis 1 and all batches are joined along axis 0, in loader order.
    """
    for net in net_list:
        net.to(device)
        net.eval()

    chunks = []
    for img in tqdm(dataloader):
        img = img.to(device)
        per_net = [net(img) for net in net_list]
        # Move each finished batch back to the CPU before collecting it.
        chunks.append(torch.cat(per_net, dim=1).cpu())
    return torch.cat(chunks, dim=0).numpy()

In [None]:
# Number of passes over the augmented loaders; the targets are tiled by the
# same factor further down.
n_train_sets = 6

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

In [None]:
# Per-channel statistics for input normalisation; these are the ImageNet values
# documented for torchvision's pretrained models.
channel_mean = [0.485, 0.456, 0.406]
channel_std = [0.229, 0.224, 0.225]

# Deterministic preprocessing applied to every image.
transformations = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.Normalize(mean=channel_mean, std=channel_std),
])

# Random augmentations; each is applied with probability 0.5.
random_rotation = transforms.RandomRotation(degrees=(-45, 45))
augmentations = transforms.Compose([
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomVerticalFlip(p=0.5),
    transforms.RandomApply([random_rotation], p=0.5),
])

In [None]:
# Preprocessing followed by random augmentation, for the sets used in training.
augmented = transforms.Compose([transformations, augmentations])
labelled_images = torch.cat((train_images, val_images), dim=0)

train_dataset = ImageDataset(train_images, augmented)
val_dataset = ImageDataset(val_images, transformations)
full_dataset = ImageDataset(labelled_images, augmented)
test_dataset = ImageDataset(test_images, transformations)

# shuffle=False everywhere: the extracted features must stay in the same order
# as the target arrays they are paired with later.
loader_options = dict(batch_size=16, shuffle=False)
train_loader = DataLoader(train_dataset, **loader_options)
val_loader = DataLoader(val_dataset, **loader_options)
full_loader = DataLoader(full_dataset, **loader_options)
test_loader = DataLoader(test_dataset, **loader_options)

In [None]:
if EXTRACT_FEATURES:
    # Go through `torchvision.models` explicitly: the bare name `models` is
    # rebound to a list of MLPs further down in this notebook, so using it
    # here makes the cell fail with AttributeError when it is re-run after
    # the training cells.
    zoo = torchvision.models

    # Pretrained backbones with their classification heads removed.
    vgg19 = BaseVGG(features=zoo.vgg19(pretrained=True).features, pool=1)
    resnet18 = zoo.resnet18(pretrained=True)
    resnet18.fc = nn.Flatten()
    efficientnetb0 = BaseVGG(features=zoo.efficientnet_b0(pretrained=True).features, pool=1)
    vit16 = zoo.vit_b_16(pretrained=True)
    vit16.heads = Identity()

    net_list = [vgg19, resnet18, efficientnetb0, vit16]

    # The train/full loaders apply random augmentations, so every pass yields
    # a different feature set; the passes are stacked along axis 0.
    train_sets = [get_features(net_list, train_loader) for _ in range(n_train_sets)]
    full_sets = [get_features(net_list, full_loader) for _ in range(n_train_sets)]

    X_train = np.concatenate(train_sets, axis=0)
    X_val = get_features(net_list, val_loader)
    X_full = np.concatenate(full_sets, axis=0)
    X_test = get_features(net_list, test_loader)

    # Save the features so later runs can set EXTRACT_FEATURES = False.
    np.save(OUTPUT_FOLDER + 'train.npy', X_train)
    np.save(OUTPUT_FOLDER + 'val.npy', X_val)
    np.save(OUTPUT_FOLDER + 'full.npy', X_full)
    np.save(OUTPUT_FOLDER + 'test.npy', X_test)

In [None]:
if not EXTRACT_FEATURES:
    # Load previously extracted features instead of recomputing them.
    # Fix: full.npy used to be read from 'npy_data' (underscore) while the
    # other three files come from 'npy-data'; all four now share one folder.
    npy_dir = os.path.join(INPUT_FOLDER, 'npy-data')
    X_train = np.load(os.path.join(npy_dir, 'train.npy'))
    X_val = np.load(os.path.join(npy_dir, 'val.npy'))
    X_full = np.load(os.path.join(npy_dir, 'full.npy'))
    X_test = np.load(os.path.join(npy_dir, 'test.npy'))

In [None]:
# The augmented feature sets were extracted `n_train_sets` times and stacked,
# so their targets are repeated the same number of times, in the same order.
y_val = val_targets
y_train = np.tile(train_targets, n_train_sets)

labelled_targets = np.concatenate((train_targets, val_targets))
y_full = np.tile(labelled_targets, n_train_sets)

## Training

In [None]:
def fit_predict(models, train_loader, X_train, X_val):
    """Train each model in turn and collect its predictions on two feature sets.

    Hyperparameters and the loss are read from module-level names at call
    time: ``lr``, ``weight_decay``, ``step_size``, ``epochs`` and ``criterion``.

    Args:
        models: Iterable of modules exposing ``predict`` and ``predict_proba``.
        train_loader: Iterable of ``(features, labels)`` mini-batches.
        X_train: Features passed to ``predict`` / ``predict_proba`` after training.
        X_val: Second feature set, handled the same way.

    Returns:
        Four lists with one entry per model: class predictions for
        ``X_train``, class predictions for ``X_val``, probabilities for
        ``X_train`` and probabilities for ``X_val``.
    """
    train_preds, val_preds = [], []
    train_probas, val_probas = [], []

    for model in models:
        optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size)

        model.train()
        for epoch in range(1, epochs + 1):
            batch_losses = []
            for features, labels in train_loader:
                loss = criterion(model(features), labels)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                batch_losses.append(loss.item())
            scheduler.step()
            # Report the mean training loss every fifth epoch.
            if epoch % 5 == 0:
                print(epoch, np.mean(batch_losses))

        # Switch to eval mode before predicting (disables dropout).
        model.eval()
        train_preds.append(model.predict(X_train))
        val_preds.append(model.predict(X_val))
        train_probas.append(model.predict_proba(X_train))
        val_probas.append(model.predict_proba(X_val))

    return train_preds, val_preds, train_probas, val_probas

In [None]:
class BaseMLP(nn.Module):
    """Wrap a module and add scikit-learn style ``predict`` / ``predict_proba``.

    The prediction helpers do not change the train/eval mode themselves; call
    ``.eval()`` first if the wrapped module behaves differently in training.

    Args:
        sequential: Module mapping a feature batch to class logits.
    """

    def __init__(self, sequential):
        super().__init__()
        self.sequential = sequential

    def forward(self, x):
        return self.sequential(x)

    def predict(self, x):
        """Return the arg-max class index per row of ``x`` as a NumPy array."""
        with torch.no_grad():
            scores = self.forward(torch.tensor(x))
            return scores.argmax(dim=1).numpy()

    def predict_proba(self, x):
        """Return the soft-max class probabilities per row of ``x`` as a NumPy array."""
        with torch.no_grad():
            scores = self.forward(torch.tensor(x))
            return F.softmax(scores, dim=1).numpy()

In [None]:
# Three MLP heads on the 3072-dimensional features, with one, two and three
# hidden layers. Every hidden layer is Linear -> ReLU -> Dropout(0.1) and the
# last layer maps to NUM_CLASSES logits.
hidden_layouts = [(128,), (512, 512), (256, 256, 256)]

sequentials = []
for layout in hidden_layouts:
    layers, in_dim = [], 3072
    for width in layout:
        layers += [nn.Linear(in_dim, width), nn.ReLU(), nn.Dropout(0.1)]
        in_dim = width
    layers.append(nn.Linear(in_dim, NUM_CLASSES))
    sequentials.append(nn.Sequential(*layers))

sequential1, sequential2, sequential3 = sequentials
models = [BaseMLP(s) for s in sequentials]

In [None]:
# Standardise the features: statistics are fitted on the training features
# only and then applied unchanged to the validation features.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

In [None]:
# Hyperparameters; fit_predict reads epochs, lr, weight_decay and step_size
# from these module-level names.
batch_size = 32
epochs = 30
lr = 0.001
weight_decay = 0.00001
step_size = 10

# Dedicated names for the feature-level dataset and loader. Reusing
# `train_dataset` / `train_loader` here would overwrite the image loader that
# the feature-extraction cell depends on and break re-running that cell.
mlp_train_dataset = torch.utils.data.TensorDataset(torch.tensor(X_train_scaled), torch.tensor(y_train))
mlp_train_loader = DataLoader(mlp_train_dataset, batch_size=batch_size, shuffle=True)

# Weight the loss by 'balanced' class weights computed from the training targets.
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
criterion = nn.CrossEntropyLoss(weight=torch.tensor(class_weights, dtype=torch.float))

y_pred_train_list, y_pred_val_list, y_proba_train_list, y_proba_val_list = fit_predict(
    models, mlp_train_loader, X_train_scaled, X_val_scaled
)

## Evaluation

In [None]:
# Soft-voting ensemble: average the class probabilities over the models, then
# pick the most probable class per sample.
mean_proba_train = np.stack(y_proba_train_list).mean(axis=0)
mean_proba_val = np.stack(y_proba_val_list).mean(axis=0)

ensemble_pred_train = mean_proba_train.argmax(axis=1)
ensemble_pred_val = mean_proba_val.argmax(axis=1)

In [None]:
# Balanced accuracy (train, then validation) for each model, ensemble last.
prediction_pairs = list(zip(y_pred_train_list, y_pred_val_list))
prediction_pairs.append((ensemble_pred_train, ensemble_pred_val))

for train_pred, val_pred in prediction_pairs:
    print(balanced_accuracy_score(y_train, train_pred))
    print(balanced_accuracy_score(y_val, val_pred))

In [None]:
# Plain accuracy (train, then validation) for each model, ensemble last.
prediction_pairs = list(zip(y_pred_train_list, y_pred_val_list))
prediction_pairs.append((ensemble_pred_train, ensemble_pred_val))

for train_pred, val_pred in prediction_pairs:
    print(accuracy_score(y_train, train_pred))
    print(accuracy_score(y_val, val_pred))

In [None]:
# Confusion matrices (train, then validation) for each model, ensemble last.
prediction_pairs = list(zip(y_pred_train_list, y_pred_val_list))
prediction_pairs.append((ensemble_pred_train, ensemble_pred_val))

for train_pred, val_pred in prediction_pairs:
    print(confusion_matrix(y_train, train_pred))
    print(confusion_matrix(y_val, val_pred))

In [None]:
# Per-class reports (train, then validation) for each model, ensemble last.
prediction_pairs = list(zip(y_pred_train_list, y_pred_val_list))
prediction_pairs.append((ensemble_pred_train, ensemble_pred_val))

for train_pred, val_pred in prediction_pairs:
    print(classification_report(y_train, train_pred))
    print(classification_report(y_val, val_pred))

## Final Fit

In [None]:
# Fresh, untrained copies of the three MLP heads for the final fit on all
# labelled data. Hidden layers are Linear -> ReLU -> Dropout(0.1); the last
# layer maps to NUM_CLASSES logits.
final_layouts = [(128,), (512, 512), (256, 256, 256)]

sequentials = []
for layout in final_layouts:
    layers, in_dim = [], 3072
    for width in layout:
        layers.extend([nn.Linear(in_dim, width), nn.ReLU(), nn.Dropout(0.1)])
        in_dim = width
    layers.append(nn.Linear(in_dim, NUM_CLASSES))
    sequentials.append(nn.Sequential(*layers))

sequential1, sequential2, sequential3 = sequentials
models = [BaseMLP(s) for s in sequentials]

In [None]:
# Refit the scaler on all labelled features and apply it unchanged to the
# test features.
scaler = StandardScaler().fit(X_full)
X_full_scaled = scaler.transform(X_full)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Hyperparameters; fit_predict reads epochs, lr, weight_decay and step_size
# from these module-level names.
batch_size = 32
epochs = 30
lr = 0.001
weight_decay = 0.00001
step_size = 10

# Dedicated names for the feature-level dataset and loader. Reusing
# `full_dataset` / `full_loader` here would overwrite the image loader that
# the feature-extraction cell depends on and break re-running that cell.
mlp_full_dataset = torch.utils.data.TensorDataset(torch.tensor(X_full_scaled), torch.tensor(y_full))
mlp_full_loader = DataLoader(mlp_full_dataset, batch_size=batch_size, shuffle=True)

# Weight the loss by 'balanced' class weights computed from all labelled targets.
class_weights = compute_class_weight('balanced', classes=np.unique(y_full), y=y_full)
criterion = nn.CrossEntropyLoss(weight=torch.tensor(class_weights, dtype=torch.float))

y_pred_full_list, y_pred_test_list, y_proba_full_list, y_proba_test_list = fit_predict(
    models, mlp_full_loader, X_full_scaled, X_test_scaled
)

## Submission

In [None]:
# Average the test probabilities over the models, take the most probable
# class, then translate the class indices back to cell-line labels.
mean_proba_test = np.mean(y_proba_test_list, axis=0)
decode_labels = np.vectorize(code2target.get)
preds = decode_labels(mean_proba_test.argmax(axis=1))

In [None]:
# One row per test sample: `test_ids` and `preds` are paired by position.
submission_columns = {'file_id': test_ids, 'cell_line': preds}
submission = pd.DataFrame(submission_columns)

submission_path = OUTPUT_FOLDER + SUBMISSION_FILE
submission.to_csv(submission_path, index=False)