In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import log_loss

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from moa.model import DenseNet, Model, DenseBlock

from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

%load_ext autoreload
%autoreload 2

# Directory (relative to the notebook) that holds the input CSV files.
# Kept as a plain string because later cells build paths with `data_dir + name`.
data_dir = './DATA/lish-moa/'

# One seed shared by NumPy's global RNG and PyTorch.
SEED = 123
np.random.seed(SEED)
# Trailing `;` keeps the returned torch Generator repr out of the cell output.
torch.manual_seed(SEED);

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


<torch._C.Generator at 0x1a1c8ffed0>

## Load data set

In [3]:
# Read the feature table and the scored-target table; both use `sig_id`
# as their row index so they can be aligned by label later.
features_csv = data_dir + 'train_features.csv'
targets_csv = data_dir + 'train_targets_scored.csv'

X = pd.read_csv(features_csv, index_col='sig_id')
y = pd.read_csv(targets_csv, index_col='sig_id')

## Preprocess

In [5]:
# Binary-encode the two categorical columns (cp_type, cp_dose) as 0./1.
# A new frame is built and rebound to X instead of calling
# `X[col].replace(..., inplace=True)`: that chained in-place form operates on
# a column selection and, under pandas Copy-on-Write, leaves X unchanged.
# Replacing already-encoded values is a no-op, so re-running this cell is safe.
# The explicit float cast makes any unexpected category fail loudly here.
X = X.replace({
    'cp_type': {'trt_cp': 1., 'ctl_vehicle': 0.},
    'cp_dose': {'D1': 1., 'D2': 0.},
}).astype({'cp_type': float, 'cp_dose': float})

# Split into training set and test set.
# The shuffle uses its own generator seeded with SEED, so the split does not
# depend on how much of NumPy's global random stream earlier cells (or earlier
# runs of this cell) have consumed.
ids = X.index.values.copy()
np.random.RandomState(SEED).shuffle(ids)

train_perc, test_perc = 0.9, 0.1
n_train = round(len(ids) * train_perc)  # single cut point shared by both slices
train_id = ids[:n_train]
test_id = ids[n_train:]
assert len(train_id) + len(test_id) == len(ids)

X_train = X.loc[train_id]
X_test = X.loc[test_id]

y_train = y.loc[train_id]
y_test = y.loc[test_id]

# Normalize the data: the scaler is fitted on the training rows only and the
# same fitted transform is then applied to the test rows.
scaler = StandardScaler()
X_train_norm = pd.DataFrame(
    scaler.fit_transform(X_train),
    index=X_train.index,
    columns=X_train.columns,
)
X_test_norm = pd.DataFrame(
    scaler.transform(X_test),
    index=X_test.index,
    columns=X_test.columns,
)

## Model

In [6]:
# Re-seed PyTorch and NumPy right before the network is built and fitted.
SEED = 123
torch.manual_seed(SEED)
np.random.seed(SEED)

# Keyword arguments forwarded to DenseNet(...): layer sizes are taken from the
# number of feature columns and the number of target columns.
n_features = X_train_norm.shape[1]
n_targets = y_train.shape[1]
params_net = dict(
    input_size=n_features,
    hidden_size=[1024, 512, 1024],
    output_size=n_targets,
    dropout=0.01,
)

# Keyword arguments forwarded to Model.fit(...).
params_fit = dict(
    X=X_train_norm,
    y=y_train,
    epoch=20,
    lr=1e-4,
    batch_size=128,
    L1=1e-6,
    L2=1e-6,
    pos_weight=1,
    patience=5,
    verbose=True,
)

net = DenseNet(**params_net)
model = Model(net)
model.fit(**params_fit)

Epoch [1, 40] : train loss 0.7370374202728271
Epoch [1, 80] : train loss 0.720777153968811
Epoch [1, 120] : train loss 0.6966890096664429
Validation loss decreased (inf --> 0.669574).  Saving model ...
Epoch [2, 40] : train loss 0.6039992570877075
Epoch [2, 80] : train loss 0.5151388645172119
Epoch [2, 120] : train loss 0.4230225086212158
Validation loss decreased (0.669574 --> 0.367610).  Saving model ...
Epoch [3, 40] : train loss 0.2922835648059845
Epoch [3, 80] : train loss 0.23362737894058228
Epoch [3, 120] : train loss 0.19158142805099487
Validation loss decreased (0.367610 --> 0.169029).  Saving model ...
Epoch [4, 40] : train loss 0.14352665841579437
Epoch [4, 80] : train loss 0.12439484894275665
Epoch [4, 120] : train loss 0.11366760730743408
Validation loss decreased (0.169029 --> 0.104262).  Saving model ...
Epoch [5, 40] : train loss 0.09532388299703598
Epoch [5, 80] : train loss 0.08794067054986954
Epoch [5, 120] : train loss 0.08441205322742462
Validation loss decreased (

In [7]:
# Call predict_proba on the normalized training rows first, then on the
# normalized test rows, and bind the two results in that order.
y_train_pred, y_test_pred = (
    model.predict_proba(features)
    for features in (X_train_norm, X_test_norm)
)

In [8]:
# Training-set log loss with labels and predictions both flattened to 1-D,
# i.e. computed over every (row, target) entry at once.
log_loss(
    y_true=y_train.to_numpy().ravel(),
    y_pred=y_train_pred.ravel(),
)

0.009937683484609957

In [9]:
# Test-set log loss with labels and predictions both flattened to 1-D,
# i.e. computed over every (row, target) entry at once.
log_loss(
    y_true=y_test.values.ravel(),
    y_pred=y_test_pred.ravel(),
)

0.017457496457342073