In [30]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import log_loss, precision_score, accuracy_score, recall_score, f1_score
from vaccine.model import DenseNet, Model
import torch 
import warnings
warnings.filterwarnings('ignore')
%load_ext autoreload
%autoreload 2

data_dir = './DATA/'
SEED = 123

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Load the data set

In [14]:
df_bcell = pd.read_csv(data_dir+'input_bcell.csv')
df_sars = pd.read_csv(data_dir+'input_sars.csv')
df = pd.concat([df_bcell, df_sars], ignore_index=True)
df.head()

Unnamed: 0,parent_protein_id,protein_seq,start_position,end_position,peptide_seq,chou_fasman,emini,kolaskar_tongaonkar,parker,isoelectric_point,aromaticity,hydrophobicity,stability,target
0,A2T3T0,MDVLYSLSKTLKDARDKIVEGTLYSNVSDLIQQFNQMIITMNGNEF...,161,165,SASFT,1.016,0.703,1.018,2.22,5.810364,0.103275,-0.143829,40.2733,1
1,F0V2I4,MTIHKVAINGFGRIGRLLFRNLLSSQGVQVVAVNDVVDIKVLTHLL...,251,255,LCLKI,0.77,0.179,1.199,-3.86,6.210876,0.065476,-0.036905,24.998512,1
2,O75508,MVATCLQVVGFVTSFVGWIGVIVTTSTNDWVVTCGYTIPTCRKLDE...,145,149,AHRET,0.852,3.427,0.96,4.28,8.223938,0.091787,0.879227,27.863333,1
3,O84462,MTNSISGYQPTVTTSTSSTTSASGASGSLGASSVSTTANATVTQTA...,152,156,SNYDD,1.41,2.548,0.936,6.32,4.237976,0.044776,-0.521393,30.765373,1
4,P00918,MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...,85,89,DGTYR,1.214,1.908,0.937,4.64,6.867493,0.103846,-0.578846,21.684615,1


## Preprocessing

In [15]:
# feature columns
feature_cols = [col for col in df.columns if col not in ['parent_protein_id', 'protein_seq', 'peptide_seq', 'target']]
print(feature_cols)

# split df into training set and test set
X_train, X_test, y_train, y_test = train_test_split(df[feature_cols], df.target, test_size=0.2, random_state=SEED)
y_train = y_train.to_frame()
y_test = y_test.to_frame()

# normalization
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), index=X_train.index, columns=feature_cols)
X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=feature_cols)

['start_position', 'end_position', 'chou_fasman', 'emini', 'kolaskar_tongaonkar', 'parker', 'isoelectric_point', 'aromaticity', 'hydrophobicity', 'stability']


In [17]:
type(y_train)
y_train.shape

(11925, 1)

## Modeling

In [32]:
torch.manual_seed(SEED)
nfold = 7
skf = StratifiedKFold(n_splits = nfold, shuffle=True, random_state = SEED)

train_loss, val_loss, test_loss = [], [],[]
train_accu, val_accu, test_accu = [], [], []
train_prec, val_prec, test_prec = [], [], []
train_recall, val_recall, test_recall = [], [], []
train_f1, val_f1, test_f1 = [], [], []
for train_index, val_index in skf.split(X_train, y_train):
    train, val = X_train.iloc[train_index], X_train.iloc[val_index]
    train_targets, val_targets = y_train.iloc[train_index], y_train.iloc[val_index]
    
    params_net = {'input_size': train.shape[1],
                  'hidden_size':[512, 256],
                  'output_size':1,
                  'dropout':[0, 0.2, 0.2]}
    params_fit = {'X_train':train, 'y_train':train_targets,
                 'X_val':val, 'y_val': val_targets,
                 'epoch': 40,
                  'batch_size':128,
                 'lr': 1e-3,
                 'weight_decay':2e-5}
    net = DenseNet(**params_net)
    model = Model(net)
    model.fit(**params_fit)
    
    train_pred_proba = model.predict_proba(train)
    val_pred_proba = model.predict_proba(val)
    test_pred_proba = model.predict_proba(X_test)
    # neg log loss
    train_loss.append(log_loss(train_targets, train_pred_proba))
    val_loss.append(log_loss(val_targets, val_pred_proba))
    test_loss.append(log_loss(y_test, test_pred_proba))
    
    train_pred = model.predict(train)
    val_pred = model.predict(val)
    test_pred = model.predict(X_test)
    # accuracy
    train_accu.append(accuracy_score(train_targets, train_pred))
    val_accu.append(accuracy_score(val_targets, val_pred))
    test_accu.append(accuracy_score(y_test, test_pred))
    # precision
    train_prec.append(precision_score(train_targets, train_pred))
    val_prec.append(precision_score(val_targets, val_pred))
    test_prec.append(precision_score(y_test, test_pred))
    # recall
    train_recall.append(recall_score(train_targets, train_pred))
    val_recall.append(recall_score(val_targets, val_pred))
    test_recall.append(recall_score(y_test, test_pred))
    # f1 score
    train_f1.append(f1_score(train_targets, train_pred))
    val_f1.append(f1_score(val_targets, val_pred))
    test_f1.append(f1_score(y_test, test_pred))
    

print('Training loss : ', np.average(np.array(train_loss)))
print('Validation loss : ', np.average(np.array(val_loss)))
print('Test loss : ', np.average(np.array(test_loss)))

train_accu_avg = np.average(np.array(train_accu))
train_prec_avg = np.average(np.array(train_prec))
train_recall_avg = np.average(np.array(train_recall))
train_f1_avg = np.average(np.array(train_f1))
print('Train accuracy {:5f}, precision {:5f}, recall {:5f}, f1 {:5f} '.format(train_accu_avg, train_prec_avg, train_recall_avg, train_f1_avg))

val_accu_avg = np.average(np.array(val_accu))
val_prec_avg = np.average(np.array(val_prec))
val_recall_avg = np.average(np.array(val_recall))
val_f1_avg = np.average(np.array(val_f1))
print('Valid accuracy {:5f}, precision {:5f}, recall {:5f}, f1 {:5f} '.format(val_accu_avg, val_prec_avg, val_recall_avg, val_f1_avg))

test_accu_avg = np.average(np.array(test_accu))
test_prec_avg = np.average(np.array(test_prec))
test_recall_avg = np.average(np.array(test_recall))
test_f1_avg = np.average(np.array(test_f1))
print('Test  accuracy {:5f}, precision {:5f}, recall {:5f}, f1 {:5f} '.format(test_accu_avg, test_prec_avg, test_recall_avg, test_f1_avg))

Training loss :  0.34193550539367834
Validation loss :  0.43192105301030237
Test loss :  0.44936967781324955
Train accuracy 0.847603, precision 0.809408, recall 0.571245, f1 0.669641 
Valid accuracy 0.805032, precision 0.706707, recall 0.478604, f1 0.570592 
Test  accuracy 0.799751, precision 0.710162, recall 0.453606, f1 0.553425 
