In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import log_loss
from sklearn.model_selection import KFold
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from moa.model import DenseNet, Model, DenseBlock

from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

%load_ext autoreload
%autoreload 2

# Seed shared by the NumPy and PyTorch global random number generators.
SEED = 123

# Folder holding the competition CSV files. The trailing slash matters because
# file paths are built from it by plain string concatenation.
data_dir = './DATA/lish-moa/'

# Seed NumPy first, then PyTorch; manual_seed stays last so the cell keeps
# displaying the torch Generator it returns.
np.random.seed(SEED)
torch.manual_seed(SEED)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


<torch._C.Generator at 0x1a24a0ec90>

## Load data set

In [5]:
# Read the feature table (X) and the scored-target table (y); both files are
# indexed by their `sig_id` column.
X, y = (
    pd.read_csv(data_dir + csv_name, index_col='sig_id')
    for csv_name in ('train_features.csv', 'train_targets_scored.csv')
)

## Preprocess

In [6]:
# Binary (0/1) encoding for cp_type and cp_dose.
# The result is assigned back instead of calling `.replace(..., inplace=True)`
# on `X[col]`: the in-place call acts on a temporary Series, which pandas flags
# as chained assignment and which does not update `X` under Copy-on-Write.
# `replace` leaves already-encoded values untouched, so re-running this cell
# is harmless; `astype(float)` guarantees a numeric dtype for the scaler.
X['cp_type'] = X['cp_type'].replace({'trt_cp': 1., 'ctl_vehicle': 0.}).astype(float)
X['cp_dose'] = X['cp_dose'].replace({'D1': 1., 'D2': 0.}).astype(float)

# Hold-out split. The "Model" section below reads X_train_norm, X_test_norm,
# y_train and y_test, so they have to exist when the notebook is run top to
# bottom on a fresh kernel.
TRAIN_FRACTION = 0.85  # the remaining 15% of the rows form the test set

# A dedicated RandomState keeps the split identical every time this cell runs,
# regardless of how much of the global NumPy stream was consumed beforehand.
ids = np.random.RandomState(SEED).permutation(X.index.values)
n_train = round(len(ids) * TRAIN_FRACTION)
train_id, test_id = ids[:n_train], ids[n_train:]

X_train, X_test = X.loc[train_id], X.loc[test_id]
y_train, y_test = y.loc[train_id], y.loc[test_id]

# Standardise the features with statistics fitted on the training rows only,
# then wrap the arrays back into DataFrames that keep the original labels.
scaler = StandardScaler()
X_train_norm = pd.DataFrame(scaler.fit_transform(X_train),
                            columns=X_train.columns, index=X_train.index)
X_test_norm = pd.DataFrame(scaler.transform(X_test),
                           columns=X_test.columns, index=X_test.index)

## Model

In [11]:
# Re-seed both generators so that running this cell on its own starts from
# the same random state every time.
SEED = 123
torch.manual_seed(SEED)
np.random.seed(SEED)

# Constructor arguments for DenseNet: layer sizes are derived from the data
# (one input per feature column, one output per target column).
params_net = dict(
    input_size=X_train_norm.shape[1],
    hidden_size=[1024, 512, 1024],
    output_size=y_train.shape[1],
    dropout=0.01,
)

# Arguments forwarded to Model.fit.
# NOTE(review): L1/L2/pos_weight/patience are interpreted inside moa.model,
# which is not visible here -- confirm their exact meaning there.
params_fit = dict(
    X=X_train_norm,
    y=y_train,
    epoch=20,
    lr=5e-4,
    batch_size=128,
    L1=1e-6,
    L2=1e-6,
    pos_weight=1,
    patience=3,
    verbose=True,
)

net = DenseNet(**params_net)
model = Model(net)
model.fit(**params_fit)

Epoch [1, 40] : train loss 0.6575137376785278
Epoch [1, 80] : train loss 0.3032241463661194
Epoch [1, 120] : train loss 0.11416324228048325
Validation loss decreased (inf --> 0.091764).  Saving model ...
Epoch [2, 40] : train loss 0.07053825259208679
Epoch [2, 80] : train loss 0.060192350298166275
Epoch [2, 120] : train loss 0.053510840982198715
Validation loss decreased (0.091764 --> 0.053594).  Saving model ...
Epoch [3, 40] : train loss 0.0505717396736145
Epoch [3, 80] : train loss 0.0464518740773201
Epoch [3, 120] : train loss 0.04203011095523834
Validation loss decreased (0.053594 --> 0.043874).  Saving model ...
Epoch [4, 40] : train loss 0.042121246457099915
Epoch [4, 80] : train loss 0.04004057124257088
Epoch [4, 120] : train loss 0.03692217543721199
Validation loss decreased (0.043874 --> 0.039242).  Saving model ...
Epoch [5, 40] : train loss 0.038430675864219666
Epoch [5, 80] : train loss 0.03667626902461052
Epoch [5, 120] : train loss 0.034974079579114914
Validation loss de

In [7]:
# Run predict_proba on the training rows first, then on the held-out rows.
y_train_pred, y_test_pred = (
    model.predict_proba(features) for features in (X_train_norm, X_test_norm)
)

In [8]:
# Log loss over every (sample, target) pair of the training split, obtained by
# flattening labels and predictions into 1-D vectors.
log_loss(y_train.values.reshape(-1), y_train_pred.reshape(-1))

0.009937683484609957

In [9]:
# Log loss over every (sample, target) pair of the held-out split, obtained by
# flattening labels and predictions into 1-D vectors.
log_loss(y_test.values.ravel(), y_test_pred.ravel())

0.017457496457342073

In [None]:
# Repeated K-fold cross-validation: for every seed, train a fresh network on
# each fold and record the flattened log loss on the train and test parts.
n_SEED = 1  # number of seeds to repeat the cross-validation with
n_fold = 5  # folds per seed
# random_state is left unset, so the fold shuffling draws from NumPy's global
# generator, which is re-seeded at the top of every seed iteration below.
kfold = KFold(n_splits=n_fold, shuffle=True)
train_loss = []
test_loss = []
# A lower-case loop variable keeps the notebook-wide SEED constant intact, and
# enumerate supplies the 1-based counters shown in the progress message
# (previously derived by hand as SEED-99 and n+1).
for seed_no, seed in enumerate(range(100, 100 + n_SEED), start=1):
    np.random.seed(seed)
    torch.manual_seed(seed)
    for fold_no, (train_idx, test_idx) in enumerate(kfold.split(X), start=1):
        X_train = X.iloc[train_idx].values
        X_test = X.iloc[test_idx].values
        y_train = y.iloc[train_idx].values
        y_test = y.iloc[test_idx].values
        # normalize the data, fitting the scaler on the training fold only
        scaler = StandardScaler()
        X_train_norm = scaler.fit_transform(X_train)
        X_test_norm = scaler.transform(X_test)

        params_net = {'input_size': X_train_norm.shape[1],
                     'hidden_size': [2048, 2048],  # 128, 4096
                     'output_size': y_train.shape[1],
                     'dropout': [0.05, 0.3, 0.2]}  # one entry more than hidden_size

        params_fit = {'X': X_train_norm,
                     'y': y_train,
                     'epoch': 100,
                     'lr': 1e-4,  # 1e-4 ~ 1e-3
                     'batch_size': 64,  # 64, 128, 256, 512
                     'L1': 1e-6,
                     'L2': 1e-5,
                     'pos_weight': 1,
                     'patience': 5,
                     'verbose': False}
        net = DenseNet(**params_net)
        model = Model(net)
        model.fit(**params_fit)
        y_train_pred = model.predict_proba(X_train_norm)
        y_test_pred = model.predict_proba(X_test_norm)
        train_loss.append(log_loss(y_train.ravel(), y_train_pred.ravel()))
        test_loss.append(log_loss(y_test.ravel(), y_test_pred.ravel()))
        print(f"SEED {seed_no} out of {n_SEED}, KFOLD {fold_no} out of {n_fold}")

# Mean loss over all seeds and folds
print('Training loss : ', np.mean(train_loss))
print('Test loss : ', np.mean(test_loss))

EarlyStopping counter: 1 out of 5


In [11]:
# Small 2x3 integer tensor used by the scratch cells below
a = torch.tensor(((3, 4, 5), (1, 2, 3)))

In [12]:
# Move the tensor to host memory and view it as a NumPy array
a.to('cpu').numpy()

array([[3, 4, 5],
       [1, 2, 3]])

In [13]:
# Full shape of the tensor as a torch.Size
a.shape

torch.Size([2, 3])

In [15]:
# Length of the first dimension (number of rows)
a.size(0)

2