## Load libraries

In [1]:
!pip install -r requirements.txt

[31mfloyd-cli 0.11.17 has requirement click<7,>=6.7, but you'll have click 7.0 which is incompatible.[0m
[33mYou are using pip version 10.0.1, however version 19.2.3 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [3]:
import sys
import os
import numpy as np
import pandas as pd

from PIL import Image

import torch
import torch.nn as nn
import torch.utils.data as D
from torch.optim.lr_scheduler import ExponentialLR
import torch.nn.functional as F
from torch.autograd import Variable

from torchvision import transforms
from torchvision import models

from ignite.engine import Events
from scripts.ignite import create_supervised_evaluator, create_supervised_trainer
from ignite.metrics import Loss, Accuracy
from ignite.contrib.handlers.tqdm_logger import ProgressBar
from ignite.handlers import  EarlyStopping, ModelCheckpoint
from ignite.contrib.handlers import LinearCyclicalScheduler, CosineAnnealingScheduler

import random

from tqdm import tqdm_notebook

from sklearn.model_selection import train_test_split

from scripts.evaluate import eval_model
from scripts.plates_leak import apply_plates_leak

import warnings
warnings.filterwarnings('ignore')

In [4]:
# list the contents of the data root used below as `path_data` / `img_dir`
!ls /storage/rxrxai

pixel_stats.csv		       test.csv		  train.zip
pixel_stats.csv.zip	       test.zip		  train_controls.csv
pixel_stats_agg.csv	       test_controls.csv  training_aug.csv
recursion_dataset_license.pdf  train		  validation.csv
sample_submission.csv	       train.csv
test			       train.csv.zip


## Define dataset and model

In [5]:
# --- reproducibility -------------------------------------------------------
torch.manual_seed(0)

# --- data locations (images and CSV metadata live under the same root) -----
img_dir = path_data = '/storage/rxrxai'
# per-(cell, channel) pixel statistics, used by the dataset for normalisation
stats_df = pd.read_csv(f'{path_data}/pixel_stats_agg.csv')

# --- model / runtime ---------------------------------------------------------
model_name = 'densenet121'   # only used to name checkpoint files
device = 'cuda'
classes = 1108

# --- optimisation --------------------------------------------------------------
batch_size = 4
init_lr = 3e-4               # start value of the learning-rate schedule
end_lr = 1e-7                # end value of the learning-rate schedule

In [5]:
class ImagesDS(D.Dataset):
    """Dataset yielding both imaging sites of a well as multi-channel tensors.

    Every record of `df` must provide `experiment`, `well` and `plate`; in
    'train' mode it must also provide `sirna`, otherwise `id_code`.

    Args:
        df: DataFrame with one row per well.
        cell: key matched against the `cell` column of the module-level
            `stats_df` to look up per-channel mean/std.
        img_dir: root folder; images are read from
            `<img_dir>/<mode>/<experiment>/Plate<plate>/<well>_s<site>_w<channel>.png`.
        mode: sub-folder of `img_dir` to read from. 'train' additionally makes
            `__getitem__` return the integer `sirna` label instead of `id_code`.
        validation: when True the two sites are never swapped.
        channels: channel numbers to load, in stacking order.
    """

    def __init__(self, df, cell, img_dir=img_dir, mode='train', validation=False, channels=[1,2,3,4,5,6]):
        self.records = df.to_records(index=False)
        self.mode = mode
        self.img_dir = img_dir
        self.len = df.shape[0]
        self.validation = validation
        # copy so that instances never share (or mutate) the default list object
        self.channels = list(channels)
        self.cell = cell

    def _get_img_path(self, index, channel, site):
        """Build the path of the image for record `index`, `channel` and `site`."""
        record = self.records[index]
        experiment, well, plate = record.experiment, record.well, record.plate
        return '/'.join([self.img_dir, self.mode, experiment, f'Plate{plate}', f'{well}_s{site}_w{channel}.png'])

    @staticmethod
    def _load_img_as_tensor(file_name, cell, channel):
        """Load one channel image and return it as a normalised tensor.

        The mean/std used for normalisation are the first `stats_df` row whose
        `cell` equals `cell` and whose `channel` equals `float(channel)`.
        """
        with Image.open(file_name) as img:
            std_mean = stats_df[(stats_df['cell'] == cell) & (stats_df['channel'] == float(channel))][['std', 'mean']]

            mean = std_mean.iloc[0]['mean']
            std = std_mean.iloc[0]['std']

            # NOTE(review): ToTensor rescales 8-bit images to [0, 1]; confirm that
            # the statistics in stats_df were computed on that same scale.
            to_normalized_tensor = transforms.Compose([
                transforms.ToTensor(),
                transforms.Normalize(mean=[mean], std=[std])
            ])

            # The pipeline has to be *applied* to the image; returning the Compose
            # object itself would make torch.cat in __getitem__ fail.
            return to_normalized_tensor(img)

    def __getitem__(self, index):
        """Return `(site1, site2, target)` for record `index`.

        Each site is the concatenation of its channel tensors along dim 0.
        Outside validation the two sites are swapped with probability 0.5.
        `target` is `int(sirna)` in 'train' mode and `id_code` otherwise.
        """
        img1 = torch.cat([self._load_img_as_tensor(self._get_img_path(index, ch, 1), self.cell, ch) for ch in self.channels])
        img2 = torch.cat([self._load_img_as_tensor(self._get_img_path(index, ch, 2), self.cell, ch) for ch in self.channels])

        # random site swap acts as augmentation; disabled when validation=True
        if random.random() > 0.5 and not self.validation:
            img1, img2 = img2, img1

        if self.mode == 'train':
            return img1, img2, int(self.records[index].sirna)
        else:
            return img1, img2, self.records[index].id_code

    def __len__(self):
        """Number of records (wells) in the dataset."""
        return self.len

In [6]:
# dataframes for training, cross-validation, and testing
def _read_with_category(csv_name):
    """Read `csv_name` from `path_data` and add a `category` column.

    `category` is the part of `experiment` that precedes the first '-'.
    """
    def _prefix(experiment):
        return experiment.split('-')[0]

    frame = pd.read_csv(path_data + csv_name)
    frame['category'] = frame['experiment'].apply(_prefix)
    return frame


df = _read_with_category('/training_aug.csv')
df_val = _read_with_category('/validation.csv')
df_test = _read_with_category('/test.csv')

In [7]:
class DenseNetTwoInputs(nn.Module):
    """DenseNet-121 backbone shared by two 6-channel inputs, with one linear head.

    Both inputs go through the same backbone; their feature vectors are
    concatenated and classified by a single fully connected layer.

    Args:
        classes: number of output classes (default 1108, the previous
            hard-coded value).
        pretrained: forwarded to `models.densenet121`. Pass False to skip
            fetching the ImageNet weights, e.g. when a checkpoint is loaded
            right after construction.
    """

    def __init__(self, classes=1108, pretrained=True):
        super().__init__()
        self.classes = classes

        model = models.densenet121(pretrained=pretrained)
        num_ftrs = model.classifier.in_features
        # drop the stock classifier so the backbone returns its feature vector
        model.classifier = nn.Identity()

        # let's make our model work with 6 channels:
        # every input channel starts from the mean of the original RGB kernels
        in_channels = 6
        trained_kernel = model.features.conv0.weight
        new_conv = nn.Conv2d(in_channels, 64, kernel_size=7, stride=2, padding=3, bias=False)
        with torch.no_grad():
            mean_kernel = trained_kernel.mean(dim=1, keepdim=True)
            # copy_ broadcasts the single averaged channel over all input channels
            new_conv.weight.copy_(mean_kernel)
        model.features.conv0 = new_conv

        self.densenet = model
        # the head sees the features of both inputs side by side
        self.fc = nn.Linear(num_ftrs * 2, self.classes)

    def forward(self, x1, x2):
        """Return class logits for the pair `(x1, x2)`.

        The batch size is taken from `x1`; both backbone outputs are flattened
        to `(batch, -1)` before concatenation along dim 1.
        """
        batch = x1.size(0)
        x1_out = self.densenet(x1).view(batch, -1)
        x2_out = self.densenet(x2).view(batch, -1)

        out = torch.cat((x1_out, x2_out), 1)
        return self.fc(out)

In [8]:
cells = df['category'].unique()

# Epoch budget per cell type. Keyed by name rather than by position in `cells`,
# because the order returned by unique() follows the row order of the CSV.
epochs_per_cell = {
    'HEPG2': 25,
    'HUVEC': 15,
    'RPE': 25,
    'U2OS': 40,
}

# fail early (before any training starts) if a cell type has no budget
_cells_without_budget = [c for c in cells if c not in epochs_per_cell]
assert not _cells_without_budget, f'no epoch budget configured for: {_cells_without_budget}'

print(epochs_per_cell)

{'HEPG2': 25, 'HUVEC': 15, 'RPE': 25, 'U2OS': 40}


In [None]:
# utilities to save best epoch
def get_saved_model_path(epoch, cell=''):
    """Return the checkpoint path under /artifacts for `epoch`.

    The file name is built from the module-level `model_name`, the optional
    `cell` suffix and the epoch number.
    """
    template = '/artifacts/Model_{}{}_{}.pth'
    return template.format(model_name, cell, epoch)

# module-level "best so far" trackers: accuracy, epoch number, checkpoint file
best_acc, best_epoch, best_epoch_file = 0., 1, ''

In [9]:
# create a local `models` folder (-p: no error if it already exists)
# NOTE(review): no cell visible in this notebook writes to `models`; checkpoints go to /artifacts
!mkdir -p models

In [10]:
# output folder for checkpoints and submission files (-p: no error if it already exists)
!mkdir -p /artifacts

In [9]:
# Fine-tune one model per cell type, keep only the checkpoint of its best
# validation epoch, then evaluate on the test rows of that cell type.
# Test predictions of every cell type are accumulated in `all_preds`.
all_preds = []

for cell in cells:
    # rows of the current cell type only
    cat_train_df = df[df['category'] == cell].copy()
    cat_test_df = df_test[df_test['category'] == cell].copy()
    cat_val_df = df_val[df_val['category'] == cell].copy()

    print('\n' + '=' * 40)
    print("CURRENT CATEGORY:", cell)
    print('-' * 40)
    
    # pytorch training dataset & loader
    cat_train_ds = ImagesDS(cat_train_df, cell, mode='train', validation=False)
    cat_train_loader = D.DataLoader(cat_train_ds, batch_size=batch_size, shuffle=True, num_workers=8)

    # pytorch cross-validation dataset & loader
    # mode='train': validation images are read from the train image folder and
    # come with sirna labels; validation=True disables the random site swap.
    # NOTE(review): shuffle=True is kept as-is; evaluation only aggregates metrics,
    # so shuffling looks unnecessary here — confirm before changing.
    cat_val_ds = ImagesDS(cat_val_df, cell, mode='train', validation=True)
    cat_val_loader = D.DataLoader(cat_val_ds, batch_size=batch_size, shuffle=True, num_workers=8)

    # model
    # every cell type starts from the same checkpoint file
    model = DenseNetTwoInputs()
    model.load_state_dict(torch.load('/storage/rxrxmodels/Model_densenet121_57.pth'))
    model.train()
    
    # loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=init_lr)
    
    # multi-gpus training
    # NOTE(review): from here on `model` may be the DataParallel wrapper, and the
    # state_dict saved below is taken from that wrapper — confirm the evaluation
    # helper loads checkpoints into a model wrapped the same way.
    if torch.cuda.device_count() > 1:
        print(f"Using {torch.cuda.device_count()} GPUs to train on {cell}")
        model = nn.DataParallel(model)    
    
    # metrics computed by the validation evaluator
    metrics = {
        'loss': Loss(criterion),
        'accuracy': Accuracy(),
    }

    # trainer / evaluator factories come from the project's scripts.ignite module
    # (presumably variants that accept the two-image batches of ImagesDS — TODO confirm)
    trainer = create_supervised_trainer(model, optimizer, criterion, device=device)
    val_evaluator = create_supervised_evaluator(model, metrics=metrics, device=device)
    
    # LR Scheduler
    # anneals 'lr' between init_lr and end_lr; the last argument (number of
    # batches per epoch) is passed as the scheduler's cycle size
    scheduler = CosineAnnealingScheduler(optimizer, 'lr', init_lr, end_lr, len(cat_train_loader))
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    # print lr
    def print_lr(engine):
        """Print the current learning rate every 100 iterations of the first epoch."""
        epoch = engine.state.epoch
        iteration = engine.state.iteration
    
        if epoch < 2 and iteration % 100 == 0:
            print(f'Iteration {iteration} | LR {optimizer.param_groups[0]["lr"]}')
    trainer.add_event_handler(Events.ITERATION_COMPLETED, print_lr)

        
    # save best epoch only
    def save_best_epoch_only(engine):
        """Validate after each epoch and keep a checkpoint only for the best accuracy.

        The module-level trackers `best_acc`, `best_epoch` and `best_epoch_file`
        are reset when `engine.state.epoch == 1`, i.e. at the start of each
        cell type's run.
        """
        epoch = engine.state.epoch

        global best_acc
        global best_epoch
        global best_epoch_file
        best_acc = 0. if epoch == 1 else best_acc
        best_epoch = 1 if epoch == 1 else best_epoch
        best_epoch_file = '' if epoch == 1 else best_epoch_file

        # local `metrics` (evaluation results) shadows the outer metrics dict
        metrics = val_evaluator.run(cat_val_loader).metrics
        print("Validation Results - Epoch: {} | Average Loss: {:.4f} | Accuracy: {:.4f} "
              .format(engine.state.epoch, metrics['loss'], metrics['accuracy']))

        if metrics['accuracy'] > best_acc:
            # remove the checkpoint of the previous best epoch before writing the new one
            prev_best_epoch_file = get_saved_model_path(best_epoch, cell)
            if os.path.exists(prev_best_epoch_file):
                os.remove(prev_best_epoch_file)

            best_acc = metrics['accuracy']
            best_epoch = epoch
            best_epoch_file = get_saved_model_path(best_epoch, cell)
            print(f'\nEpoch: {best_epoch} - New best accuracy! Accuracy: {best_acc}\n\n\n')
            torch.save(model.state_dict(), best_epoch_file)
    trainer.add_event_handler(Events.EPOCH_COMPLETED, save_best_epoch_only)
                  
    print('Training started\n')
    trainer.run(cat_train_loader, max_epochs=epochs_per_cell[cell])
     
    # test-time dataset: mode='test' reads from the test image folder and yields id_code
    cat_test_ds = ImagesDS(cat_test_df, cell, mode='test', validation=True)
    cat_test_loader = D.DataLoader(cat_test_ds, batch_size=1, shuffle=False, num_workers=8)
    # NOTE(review): `eval_model_per_cell` is neither imported nor defined in the
    # cells visible here (only `eval_model` is imported from scripts.evaluate);
    # on a fresh kernel this line raises NameError — confirm the intended helper.
    cell_preds, _ = eval_model_per_cell(model, cat_test_loader, best_epoch_file, path_data, cat_test_df.copy().drop(['category'], axis=1), sub_file=f'/artifacts/submission_{cell}.csv')
    all_preds += cell_preds

In [None]:
# aggregate submission files: one CSV per cell type -> a single submission.csv
submissions = pd.concat(
    [pd.read_csv(f'/artifacts/submission_{cell_name}.csv') for cell_name in cells]
)
submissions.to_csv('/artifacts/submission.csv', columns=['id_code', 'sirna'], index=False)

In [None]:
# post-process the accumulated per-cell test predictions with the project's
# scripts.plates_leak helper (its behaviour is not visible in this notebook)
apply_plates_leak(all_preds)