# Env

In [None]:
!nvidia-smi

In [2]:
%%capture
#!pip install matplotlib==3.5.0
!pip install xmltodict==0.12.0
!pip install datumaro==0.1.10.1 
!pip install yacs==0.1.8 pyyaml==5.4.1 colorlog==4.7.2 pytorch_lightning==1.5.4
!pip install timm==0.4.12

In [3]:
%%capture
!pip install transformers==4.17.0

In [4]:
! pip freeze | grep torch

pytorch-lightning==1.5.4
torch @ https://download.pytorch.org/whl/cu111/torch-1.10.0%2Bcu111-cp37-cp37m-linux_x86_64.whl
torchaudio @ https://download.pytorch.org/whl/cu111/torchaudio-0.10.0%2Bcu111-cp37-cp37m-linux_x86_64.whl
torchmetrics==0.7.2
torchsummary==1.5.1
torchtext==0.11.0
torchvision @ https://download.pytorch.org/whl/cu111/torchvision-0.11.1%2Bcu111-cp37-cp37m-linux_x86_64.whl


# Imports and utils

In [5]:
import json 
import requests
import os
import random
import json
import io
import math
import copy
import yaml
from abc import ABC,ABCMeta, abstractmethod
from functools import wraps
from yacs.config import CfgNode as CN
from datetime import datetime
from logging import getLogger, INFO, FileHandler,  Formatter,  StreamHandler

import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('dark_background')

import time
from tqdm.notebook import trange, tqdm
from functools import reduce
from sklearn.model_selection import StratifiedKFold

import torch
from torch import nn
import colorlog
import timm
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.callbacks import LearningRateMonitor,ModelCheckpoint
from pytorch_lightning.callbacks import Callback
import torchmetrics
import sklearn
import pathlib

In [6]:
!mkdir -p logs/tensorboard models configs data results

In [7]:
# Relative directories used throughout the notebook (created by the
# `mkdir -p` cell above).
LOGS_PATH = "logs"
TENSORBOARD_PATH = os.path.join(LOGS_PATH , "tensorboard")
MODELS_PATH = "models"
CONFIG_PATH = "configs"
DATA_PATH = "data"
RESULTS_PATH = "results"

In [8]:
# Mirror the directory constants into the process environment
# (presumably so that `!` shell cells can read them -- confirm).
os.environ.update({
    "LOGS_PATH": LOGS_PATH,
    "TENSORBOARD_PATH": TENSORBOARD_PATH,
    "MODELS_PATH": MODELS_PATH,
    "CONFIG_PATH": CONFIG_PATH,
    "DATA_PATH": DATA_PATH,
    "RESULTS_PATH": RESULTS_PATH,
})

In [9]:
# Global experiment configuration (yacs CfgNode). Functions decorated with
# `inject_config` receive this exact object as their `config` argument.
_C = CN()

# Empty placeholder node; nothing in this notebook reads it.
_C.preprocess = CN()

_C.model = CN()
# Learning rate handed to the Adam optimizer in `train`.
_C.model.base_lr = 5*1e-4
# NOTE(review): `min_lr` is not read anywhere in the visible notebook.
_C.model.min_lr = 1*1e-5
# Selects the architecture built by `Net` ("starter" -> SimpleSeqModel).
_C.model.name = "starter"

# DataLoader batch sizes (train / validation+test) and the epoch cap.
_C.model.train_bs = 512
_C.model.test_bs = 512
_C.model.epochs = 50

# StepLR settings: `step` is passed as step_size, `factor` as gamma.
_C.model.scheduler = CN()
_C.model.scheduler.step = 30
_C.model.scheduler.factor = 0.8

# Forwarded to the EarlyStopping callback in `train`.
_C.model.early_stopping = CN()
_C.model.early_stopping.min_delta = 0
_C.model.early_stopping.patience = 8


# Number of stratified folds created by `ValidationStrategy`.
_C.n_folds = 10
# NOTE(review): `num_classes` is not read anywhere in the visible notebook.
_C.num_classes = 1

# Seed consumed by `seed_all`.
_C.seed = 42
_C.device = "cuda" if torch.cuda.is_available() else "cpu"
# Used as the TensorBoard run name and exported as the EXP_ID env variable.
_C.experiment_id = "exp_v6_regression_10folds"
os.environ["EXP_ID"] = _C.experiment_id

def get_cfg_defaults():
    """Return the module-level yacs config node ``_C``.

    The shared object itself is returned, not a clone: any mutation made
    through the returned node is visible to every other user of ``_C``,
    including functions wrapped with ``inject_config``.
    """
    # NOTE: the usual yacs "local variable" pattern would return
    # ``_C.clone()``; this notebook hands out the shared instance instead.
    return _C

def dump_cfg(config=None, path="experiment.yaml"):
    """Save a yacs CfgNode object in a yaml file in path.

    Args:
        config: Object exposing ``dump()`` that returns the text to write.
            When omitted, the global defaults from ``get_cfg_defaults()`` are
            used (resolved at call time rather than at definition time).
        path: Destination file; it is created or overwritten.
    """
    if config is None:
        config = get_cfg_defaults()
    # The context manager guarantees the handle is closed even if dump() raises.
    with open(path, 'w') as stream:
        stream.write(config.dump())

def inject_config(funct):
    """Decorator that passes the global config as the ``config`` keyword.

    The wrapped function is called with ``config=_C`` unless the caller
    already supplied a ``config`` keyword, in which case the caller's value
    is used (previously this raised a duplicate-keyword ``TypeError``).
    """
    @wraps(funct)
    def function_wrapper(*args, **kwargs):
        # Only touch the global when the caller did not provide an override.
        if "config" not in kwargs:
            kwargs["config"] = _C
        return funct(*args, **kwargs)
    return function_wrapper

def dump_dict(config,path="config.yaml"):
    """Serialize ``config`` with ``yaml.dump`` into the file at ``path``.

    The file is created or overwritten, and is closed even when
    serialization fails.
    """
    with open(path, 'w') as stream:
        yaml.dump(config, stream)

c=get_cfg_defaults()

In [10]:
@inject_config
def seed_all(config):
    """Seed Lightning, Python, NumPy and torch RNGs from ``config["seed"]``.

    When CUDA is available the CUDA generators are seeded as well and cuDNN
    is switched to deterministic, non-benchmarking mode.
    """
    seed = config["seed"]
    pl.utilities.seed.seed_everything(seed=seed, workers=True)
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Python, NumPy and torch CPU generators, in that order.
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)

    if not torch.cuda.is_available():
        return

    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    

In [11]:
class RMSE():
  """Callable metric returning the root-mean-squared error."""

  @inject_config
  def __init__(self , config : CN):
    # The injected config is stored but not consulted by __call__.
    self.config = config

  def __call__(self , preds , target):
    """Return the RMSE between ``preds`` and ``target``.

    Delegates to torchmetrics' functional MSE with ``squared=False``.
    """
    rmse = torchmetrics.functional.mean_squared_error(
        preds , target , squared=False
    )
    return rmse

In [12]:
class Logger():
  """Lazily builds and caches a file-backed ``logging`` logger.

  ``name`` is the logger name passed to ``logging.getLogger`` and ``path`` is
  the log file name, resolved relative to ``LOGS_PATH``.
  """

  # Class-level default; each instance caches its own logger on first use.
  logger=None

  def __init__(self,name,path):
    self.path=path
    self.name=name

  def get_logger(self,):
    """Return the cached logger, creating it on the first call."""
    if self.logger is None:
      self.logger=self.new_logger()
    return self.logger

  @inject_config
  def new_logger(self,config : CN):
    """(Re)configure the named logger with a single file handler.

    Handlers already attached to the logger are detached AND closed, so
    repeated calls for the same name (one per training fold, for instance)
    do not leak open file descriptors.
    """
    log_file=os.path.join(LOGS_PATH,self.path)
    # FileHandler does not create missing parent directories itself.
    os.makedirs(os.path.dirname(log_file) or ".", exist_ok=True)

    logger = getLogger(self.name)
    logger.setLevel(INFO)

    for stale_handler in list(logger.handlers):
      logger.removeHandler(stale_handler)
      stale_handler.close()

    # Console output is intentionally not attached; to enable it, add a
    # colorlog.StreamHandler with a colorlog.ColoredFormatter here.
    file_handler = FileHandler(filename=log_file)
    file_handler.setFormatter(Formatter('%(asctime)s  - %(levelname)s - %(message)s'))
    logger.addHandler(file_handler)
    return logger

# Module-level logger named "main", writing to <LOGS_PATH>/main.log.
main_logger=Logger(path="main.log",name="main")
logger=main_logger.get_logger()

In [13]:
class LoggerCallback(Callback):
    """Lightning callback that writes per-epoch loss/metric values to a logger.

    Values are read from ``trainer.callback_metrics``. A key that was never
    logged (for example when the module has no metric configured) is
    reported as ``n/a`` instead of raising ``KeyError``.
    """

    def __init__(self,logger):
        self.logger=logger

    @staticmethod
    def _metric_value(trainer, key):
        """Return the logged value for ``key`` as a NumPy value, or ``"n/a"``."""
        value = trainer.callback_metrics.get(key)
        if value is None:
            return "n/a"
        return value.detach().cpu().numpy()

    def on_train_epoch_end(self,trainer, pl_module, ):
        loss = self._metric_value(trainer, "train_loss")
        metric = self._metric_value(trainer, "train_metric")
        self.logger.info(f'Epoch = {pl_module.current_epoch} Train loss : {loss} Train metric : {metric}')

    def on_validation_epoch_end(self,trainer, pl_module):
        loss = self._metric_value(trainer, "val_loss")
        metric = self._metric_value(trainer, "val_metric")
        self.logger.info(f'Epoch = {pl_module.current_epoch} Val loss : {loss} Val metric : {metric}')

# Data collection

In [14]:
# Train / test tables are fetched over HTTP on every run.
df = pd.read_csv("https://storage.googleapis.com/umojahack2022/train.csv")
test_df = pd.read_csv("https://storage.googleapis.com/umojahack2022/test.csv")
# Discretize the continuous target: int(Signal * 10), truncated toward zero.
# This bin is the stratification label used by ValidationStrategy.
df["signal_interval"] = df["Signal"].apply(lambda x : int(x*10))
targets = df["signal_interval"].unique()

In [15]:
# Two-way lookup between each distinct signal bin and a dense class index,
# numbered in the order the bins appear in `targets`.
target_mapper = {bin_value: class_idx for class_idx, bin_value in enumerate(targets)}
inverse_target_mapper = dict(enumerate(targets))

In [16]:
# Conventional aliases for the bin list and its two lookup tables.
# They refer to the same objects, not copies.
classes = targets
class_to_idx = target_mapper
idx_to_class = inverse_target_mapper

In [17]:
def get_seq_column_map(train, test, col):
    """Map every distinct element of the sequences in ``col`` to an index.

    Each entry of ``train[col]`` and ``test[col]`` is iterated element by
    element (character by character for strings). Indices follow the sorted
    order produced by ``np.unique``.
    """
    symbols = [
        symbol
        for frame in (train, test)
        for seq in frame[col]
        for symbol in seq
    ]
    vocabulary = np.unique(symbols)
    return {symbol: index for index, symbol in enumerate(vocabulary)}

def get_column_map(train, test, col):
    """Map every distinct value of ``col`` (train and test combined) to an index.

    Indices follow first-appearance order across ``train[col]`` followed by
    ``test[col]``, as returned by ``Series.unique``.
    """
    unique_values = pd.concat([train[col], test[col]]).unique().tolist()
    return {value: index for index, value in enumerate(unique_values)}

# Vocabularies are built over train AND test so that every symbol seen at
# prediction time has an embedding index.
amino_acid_map = get_seq_column_map(df, test_df, "Toxin_Kmer")
print("unique amino acid map",len(amino_acid_map))

antivenom_map = get_column_map(df, test_df, "Antivenom")
print("unique Antivenom map", len(antivenom_map))

unique amino acid map 20
unique Antivenom map 8


# Validation Strategy

In [18]:
class ValidationStrategy:
  """Assigns each row of a frame to one of ``config.n_folds`` stratified folds.

  Calling the instance returns a copy of the frame with an integer ``folds``
  column; the result is computed once and cached.
  """

  @inject_config
  def __init__(self , df , config):
    self.df = df
    self.config = config
    self.result_df = None

  def kfold_split(self , df ):
    """Return a copy of ``df`` with a ``folds`` column (0 .. n_folds-1).

    Rows are stratified on the ``signal_interval`` column. The input frame
    is left untouched.
    """
    seed_all()
    # Copy first so the caller's frame never gains a "folds" column.
    df = df.copy()
    df["folds"] = -1
    label = "signal_interval"
    kf = sklearn.model_selection.StratifiedKFold(
        n_splits=self.config.n_folds,
        shuffle=True,
        random_state=self.config.seed,  # same seed the rest of the run uses
    )
    # kf.split yields POSITIONAL indices, so write through iloc; .loc would
    # hit the wrong rows whenever the index is not 0..n-1.
    fold_col = df.columns.get_loc("folds")
    for fold, (_, val_index) in enumerate(kf.split(df,df[label])):
      df.iloc[val_index, fold_col] = fold
    return df

  def __call__(self , ):
    """Return the fold-annotated frame, computing it on first use."""
    if self.result_df is None:
      self.result_df = self.kfold_split(self.df.copy())
    return self.result_df

In [19]:
seed_all()
split = ValidationStrategy(df)
# Rebinds `df` to a copy that carries the `folds` column.
df = split()

Global seed set to 42
Global seed set to 42


# Torch utils

In [20]:
class AntivenomChallengeDataSet(torch.utils.data.Dataset):
    """Row-wise dataset over a frame of toxin k-mers.

    Each item is a dict of tensors with keys ``K_mer``, ``antivenom``,
    ``position_start`` and ``position_end``. When ``is_train`` is true the
    item is an ``(inputs, label)`` pair: a float tensor of shape ``(1,)`` in
    regression mode, or the class index looked up in the global
    ``target_mapper`` when ``classification`` is true.
    """

    def __init__(
        self,
        amino_acid_map,
        antivenom_map,
        data,
        is_train,
        label_name=None,
        classification = False
      ):
        self.amino_acid_map = amino_acid_map
        self.antivenom_map = antivenom_map
        self.data = data
        self.is_train = is_train
        self.label_name = label_name
        self.classification = classification

    def __len__(self):
        return len(self.data)

    def __getitem__(self,idx):
        record = self.data.iloc[idx]
        encoded_kmer = [self.amino_acid_map[residue] for residue in record["Toxin_Kmer"]]

        inputs = {
            "K_mer": torch.as_tensor(encoded_kmer),
            "antivenom": torch.as_tensor(self.antivenom_map[record["Antivenom"]]),
            "position_start": torch.as_tensor(record["Kmer_Position_start"]),
            "position_end": torch.as_tensor(record["Kmer_Position_end"]),
        }

        # Inference items carry no label.
        if not self.is_train:
            return inputs

        raw_label = record[self.label_name]
        if self.classification:
            label = torch.as_tensor(target_mapper[raw_label])
        else:
            label = torch.as_tensor([raw_label]).float()
        return inputs, label

# Modeling

In [21]:
class PLModel(pl.LightningModule):
    """Base LightningModule with pluggable optimizer, scheduler, loss and metric.

    Subclasses implement ``forward``. The four collaborators can be passed to
    the constructor or attached later through the ``set_*`` methods.

    Logged names: ``train_loss`` / ``val_loss`` per step, ``train_metric`` /
    ``val_metric`` per epoch. Epoch aggregates are also written as
    ``Loss/Train``, ``Loss/Valid``, ``Metric/Train`` and ``Metric/Valid``
    through ``self.logger[0].experiment.add_scalar``. This assumes the Trainer
    was given a list of loggers whose first entry exposes that
    TensorBoard-style API.
    """

    def __init__(self, optimizer=None , loss=None,scheduler=None , metric=None ):
        super().__init__()
        self.optimizer=optimizer
        self.scheduler=scheduler
        self.loss=loss
        self.metric=metric

    def _shared_step(self, batch, log_name, **log_kwargs):
        """Forward one ``(inputs, target)`` batch and log its loss as ``log_name``."""
        x, y = batch[0], batch[1]
        y_hat = self(x)
        # Detached copies are kept so the epoch-end hook can compute the metric.
        result = {"y_hat": y_hat.detach(), "y": y.detach()}
        if self.loss:
            loss = self.loss(y_hat, y)
            self.log(log_name, loss, prog_bar=True, logger=False, **log_kwargs)
            result["loss"] = loss
        return result

    def _shared_epoch_end(self, outputs, prefix, tag):
        """Aggregate step outputs: mean loss and metric over the whole epoch.

        ``prefix`` names the logged metric (``<prefix>_metric``); ``tag`` is
        the suffix of the scalar names written to the experiment writer.
        """
        writer = self.logger[0].experiment

        # A loss is only present when one was configured for the step.
        if outputs and all("loss" in step_out for step_out in outputs):
            avg_loss = torch.stack([step_out["loss"] for step_out in outputs]).mean()
            writer.add_scalar(f"Loss/{tag}", avg_loss, self.current_epoch)

        if self.metric:
            y_hat = torch.cat([step_out["y_hat"] for step_out in outputs])
            y = torch.cat([step_out["y"] for step_out in outputs])
            metric = self.metric(y_hat, y)
            self.log(f"{prefix}_metric", metric, prog_bar=True, logger=False)
            writer.add_scalar(f"Metric/{tag}", metric, self.current_epoch)

    def training_step(self, batch, batch_idx):
        return self._shared_step(batch, 'train_loss', on_epoch=True, on_step=True)

    def validation_step(self, batch, batch_idx):
        return self._shared_step(batch, 'val_loss')

    def test_step(self, batch, batch_idx):
        """Return predictions for one batch as a NumPy array under ``"pred"``.

        Accepts both ``(inputs, target)`` batches and label-free batches that
        consist of the inputs dict only.
        """
        x = batch[0] if isinstance(batch, (list, tuple)) else batch
        y_hat = self(x)
        return {"pred" : y_hat.detach().cpu().numpy() }

    def training_epoch_end(self, outputs):
        self._shared_epoch_end(outputs, prefix="train", tag="Train")

    def validation_epoch_end(self, outputs):
        self._shared_epoch_end(outputs, prefix="val", tag="Valid")

    def configure_optimizers(self):
        """Return the optimizer and, when one is set, its per-epoch scheduler."""
        if self.scheduler is None:
            return self.optimizer
        schedulers = [
        {
          'scheduler': self.scheduler,
          'monitor': 'val_metric',
          'interval': 'epoch',
          'frequency': 1,
          'strict': True,
        }]
        return [self.optimizer], schedulers

    def set_optimizer(self,optimizer):
        self.optimizer = optimizer

    def set_scheduler(self,scheduler):
        self.scheduler = scheduler

    def set_loss(self,loss):
        self.loss = loss

    def set_metric(self,metric):
        self.metric = metric

In [22]:
class SimpleSeqModel(PLModel):
    """Embedding + stacked-LSTM regressor over a k-mer and its side features.

    The k-mer token sequence is embedded and run through a bidirectional LSTM
    followed by a unidirectional LSTM. The final hidden state of the second
    LSTM is concatenated with a dense projection of the antivenom and
    start/end position embeddings, then passed through two ReLU layers and a
    single-unit linear output.

    Layer attribute names and construction order are kept stable so that
    existing checkpoints load and seeded initialization is unchanged.
    """

    def __init__(
        self,
        K_mer_emb_size=128,
        K_mer_nunique=len(amino_acid_map),
        antivenom_emb_size=64,
        antivenom_unique=len(antivenom_map),
        max_Position_start=596,
        max_Position_end=611,
        Position_start_emb_size=32,
        Position_end_emb_size=32,
        optimizer=None , loss=None,
        scheduler=None , metric=None,
    ):
        super().__init__(optimizer=optimizer,loss=loss,scheduler=scheduler,metric=metric)
        self.K_mer_emb_size = K_mer_emb_size
        self.K_mer_nunique = K_mer_nunique
        self.antivenom_emb_size = antivenom_emb_size
        self.antivenom_unique = antivenom_unique

        self.Kmer_emb_layer = nn.Embedding(
            num_embeddings=self.K_mer_nunique,
            embedding_dim=self.K_mer_emb_size,
        )
        self.Antivenom_emb = nn.Embedding(
            num_embeddings=self.antivenom_unique,
            embedding_dim=self.antivenom_emb_size,
        )

        # Position values index these tables directly, so every position must
        # be < max_Position_* (assumed to hold for the data -- confirm).
        self.Position_start_emb = nn.Embedding(
            num_embeddings=max_Position_start,
            embedding_dim=Position_start_emb_size,
        )
        self.Position_end_emb = nn.Embedding(
            num_embeddings=max_Position_end,
            embedding_dim=Position_end_emb_size,
        )

        self.Features = nn.Linear(
            in_features=self.antivenom_emb_size + Position_start_emb_size + Position_end_emb_size,
            out_features=128,
        )

        self.Lstm_layer_1 = nn.LSTM(
            input_size=self.K_mer_emb_size,
            hidden_size=256,
            num_layers=1,
            bidirectional=True,
            batch_first=True,
        )
        # Input is 512 wide: both directions of Lstm_layer_1 (2 * 256).
        self.Lstm_layer_2 = nn.LSTM(
            input_size=512,
            hidden_size=256,
            num_layers=1,
            bidirectional=False,
            batch_first=True,
        )

        self.Linear_1 = nn.Linear(
            in_features=self.Lstm_layer_2.hidden_size + self.Features.out_features,
            out_features=512,
        )
        self.relu_1 = nn.ReLU()
        self.Linear_2 = nn.Linear(
            in_features=self.Linear_1.out_features, out_features=256,
        )
        self.relu_2 = nn.ReLU()
        self.Output = nn.Linear(
            in_features=self.Linear_2.out_features, out_features=1,
        )

    def forward(self, inputs):
        """Return one prediction per sample, shape ``(batch, 1)``.

        ``inputs`` is a dict with keys ``K_mer``, ``antivenom``,
        ``position_start`` and ``position_end`` holding index tensors.
        """
        kmer_emb = self.Kmer_emb_layer(inputs["K_mer"])
        antivenom_emb = self.Antivenom_emb(inputs["antivenom"])
        position_start_emb = self.Position_start_emb(inputs["position_start"])
        position_end_emb = self.Position_end_emb(inputs["position_end"])

        emb_features = torch.cat((antivenom_emb, position_start_emb , position_end_emb), dim=1)
        features = self.Features(emb_features)

        lstm_1_seq, _ = self.Lstm_layer_1(kmer_emb)
        _, (lstm_2_h, _) = self.Lstm_layer_2(lstm_1_seq)

        # h_n is (num_layers * num_directions, batch, hidden) == (1, B, 256).
        # Drop ONLY the leading axis: a bare squeeze() would also remove the
        # batch axis when B == 1 and break the concatenation below.
        lstm_h = lstm_2_h.squeeze(0)

        emb = torch.cat((lstm_h, features), dim=1)
        linear_1 = self.relu_1(self.Linear_1(emb))
        linear_2 = self.relu_2(self.Linear_2(linear_1))
        return self.Output(linear_2)
        
        

In [23]:
class Net:
  """Factory building the model selected by ``config.model.name``.

  Positional and keyword arguments are forwarded to the model constructor.
  """

  @inject_config
  def __init__(self,*args,config : CN,**kwargs):
    self.name = config.model.name
    self.config = config
    if self.name=="starter":
      self.model=SimpleSeqModel(*args,**kwargs)
    else:
      # Fail here rather than with an AttributeError in get_model().
      raise ValueError(f"Unknown model name: {self.name!r}")

  def get_model(self,path=None):
    """Return the model, optionally loading weights from ``path`` first.

    The file is read with ``torch.load`` and must be a mapping holding the
    weights under ``'model'`` (the original convention) or ``'state_dict'``
    (the key Lightning checkpoints use).
    """
    if path is not None:
      checkpoint = torch.load(path,map_location=self.config.device)
      state_key = "model" if "model" in checkpoint else "state_dict"
      self.model.load_state_dict(checkpoint[state_key])
    return self.model

In [24]:
def _make_labeled_loader(frame, batch_size, shuffle):
    """Wrap ``frame`` in a regression dataset (label ``Signal``) and a DataLoader."""
    dataset = AntivenomChallengeDataSet(
        amino_acid_map=amino_acid_map,
        antivenom_map=antivenom_map,
        data=frame,
        is_train=True,
        label_name="Signal",
        classification = False,
    )
    return torch.utils.data.DataLoader(
        dataset, batch_size=batch_size, shuffle=shuffle, num_workers=1, pin_memory=True
    )


@inject_config
def train(df : pd.DataFrame , fold , config : CN):
    """Train one model with ``fold`` held out for validation.

    Args:
        df: Frame with a ``folds`` column; rows whose fold equals ``fold``
            form the validation set, all others the training set.
        fold: Index of the held-out fold.
        config: Injected global configuration.

    Returns:
        ``(trainer, model)`` after fitting. The best checkpoint (lowest
        ``val_metric``) is written to ``MODELS_PATH`` as ``best_model_<fold>``.
    """
    seed_all()
    model_name=f"best_model_{fold}"
    df_train = df[df["folds"] != fold ].reset_index(drop=True)
    df_valid = df[df["folds"] == fold ].reset_index(drop=True)
    print("-------------",df_train.shape,"---------------",df_valid.shape,"-------------")

    train_loader = _make_labeled_loader(df_train, config.model["train_bs"], shuffle=True)
    valid_loader = _make_labeled_loader(df_valid, config.model["test_bs"], shuffle=False)

    model=Net().get_model(path=None)
    model.train()

    loss = torch.nn.HuberLoss()
    optimizer = torch.optim.Adam(model.parameters() , lr=config.model["base_lr"])
    scheduler = torch.optim.lr_scheduler.StepLR(
        optimizer,
        step_size=config.model.scheduler.step,
        gamma=config.model.scheduler.factor,
        last_epoch=-1,
    )

    model.set_metric(RMSE())
    model.set_loss(loss)
    model.set_optimizer(optimizer)
    model.set_scheduler(scheduler)

    early_stop_callback = EarlyStopping(
      monitor='val_metric',
      min_delta=config.model.early_stopping.min_delta,
      patience=config.model.early_stopping.patience,
      verbose=True,
      mode='min',
      check_on_train_epoch_end=False
    )

    # NOTE(review): if a checkpoint with this name already exists from an
    # earlier run, Lightning is expected to save under a versioned name
    # instead, and `predict` would then load the stale file -- confirm and
    # clear MODELS_PATH between runs if so.
    checkpoint_callback=ModelCheckpoint(
        dirpath=MODELS_PATH,
        filename=model_name,
        monitor="val_metric",
        save_top_k=1,
        mode="min",
        verbose=True,
    )
    lr_callback=LearningRateMonitor(logging_interval="epoch", log_momentum=True)

    t_logger = TensorBoardLogger(TENSORBOARD_PATH,
                                 name=config.experiment_id)
    logger_callback = LoggerCallback(Logger(path="main.log",name="main").get_logger())

    eng = pl.Trainer(
        # Use every visible GPU, but fall back to CPU instead of failing
        # when none is available.
        gpus=-1 if torch.cuda.is_available() else None,
        max_epochs=config.model["epochs"],
        # Must stay a list: PLModel indexes self.logger[0].
        logger=[t_logger],
        # Kept from the original; trainer.tune() is never called in this function.
        auto_lr_find=True,
        log_every_n_steps=1,
        callbacks=[early_stop_callback,checkpoint_callback,lr_callback,logger_callback],
    )

    eng.fit(model,train_loader,valid_loader)
    return eng,model

In [25]:
# Derive the fold ids from the config so they cannot drift from the number
# of folds the validation split actually produced.
folds = range(get_cfg_defaults().n_folds)

In [None]:
# Train one model per fold; only the trainer of the LAST fold stays bound
# to `eng` after the loop.
for fold in folds:
  eng,_=train(df , fold)

# Prediction

In [None]:
@inject_config
def predict(submission , fold , config : CN):
    """Run the checkpoint of ``fold`` over ``submission`` and return predictions.

    The checkpoint ``best_model_<fold>.ckpt`` is read from ``MODELS_PATH``.
    Returns the concatenated model outputs as a NumPy array, in row order.
    """
    seed_all()
    checkpoint_path = os.path.join(MODELS_PATH, f"best_model_{fold}.ckpt")
    print("---------------",submission.shape,"-------------")

    test_loader = torch.utils.data.DataLoader(
        AntivenomChallengeDataSet(
            amino_acid_map=amino_acid_map,
            antivenom_map=antivenom_map,
            data=submission,
            is_train=False,
        ),
        batch_size=config.model["test_bs"],
        shuffle=False,
        num_workers=1,
    )

    # The freshly built model is only the entry point to load_from_checkpoint,
    # which returns a new instance with the saved weights.
    model = Net().get_model().load_from_checkpoint(checkpoint_path)
    model.to(config.device)
    model.eval()

    batch_outputs = []
    with torch.no_grad():
        for batch in tqdm(test_loader, total=len(test_loader)):
            on_device = {name: tensor.to(config.device) for name, tensor in batch.items()}
            batch_outputs.append(model(on_device).cpu())

    return torch.cat(batch_outputs).numpy()

In [None]:
def predict_df(df : pd.DataFrame , fold):
  """Return a copy of ``df`` with the predictions of ``fold`` in ``Signal``.

  The prediction array is flattened to exactly one value per row before
  assignment, so the column is always 1-D and a size mismatch raises
  immediately.
  """
  df = df.copy()
  pred = predict(df , fold)
  print("pred shape : ",pred.shape)
  df["Signal"] = np.asarray(pred).reshape(len(df))
  return df

In [None]:
def save_submission(df , fold):
  """Predict with the model of ``fold`` and write ``sub_<fold>.csv``.

  Only the ``ID`` and ``Signal`` columns are written (to ``RESULTS_PATH``);
  the same two-column frame is returned.
  """
  out_path = os.path.join(RESULTS_PATH , f"sub_{fold}.csv")
  sub = predict_df(df , fold)[["ID" , "Signal"]]
  sub.to_csv(out_path , index = False)
  return sub

In [None]:
# Write one submission file per fold; `sub` keeps only the last fold's frame.
for fold in folds:
  sub = save_submission(test_df , fold)

In [None]:
def blend_subs(subs_list):
  """Average the ``Signal`` column of several submission CSVs.

  Args:
      subs_list: Paths of CSV files that share the same row order and each
          contain a ``Signal`` column.

  Returns:
      The first submission's frame with ``Signal`` replaced by the row-wise
      mean over all files.

  Raises:
      ValueError: If ``subs_list`` is empty.
  """
  paths = list(subs_list)
  if not paths:
    raise ValueError("blend_subs needs at least one submission file")

  # Each file is read exactly once; the first frame doubles as the result.
  frames = [pd.read_csv(path) for path in paths]
  blend = np.mean([frame["Signal"] for frame in frames], axis=0)
  result_df = frames[0]
  result_df["Signal"] = blend
  return result_df

In [None]:
# One submission file per fold, in fold order. (The former leading `path = ...`
# line was dead code that relied on `fold` leaking out of an earlier loop.)
paths = [os.path.join(RESULTS_PATH , f"sub_{fold}.csv") for fold in folds]
mean_sub = blend_subs(paths)

In [None]:
# NOTE(review): the file is named "exp_v7" while the config's experiment_id
# is "exp_v6_regression_10folds" -- confirm which label is intended.
path = os.path.join(RESULTS_PATH , f"exp_v7.csv")
mean_sub.to_csv(path , index = False)

In [None]:
mean_sub