In [16]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

%load_ext autoreload
%autoreload 2

# Reproducibility: a single seed, applied to NumPy's global random generator.
SEED = 123
np.random.seed(SEED)

# Path of the input CSV. Despite the `_dir` suffix this is a file path, not a
# directory; the name is kept because later cells read it.
data_dir = './DATA/input_bcell.csv'

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# Read the input table from the configured CSV path, then preview the
# first five rows.
df = pd.read_csv(filepath_or_buffer=data_dir)
df.head(n=5)

Unnamed: 0,parent_protein_id,protein_seq,start_position,end_position,peptide_seq,chou_fasman,emini,kolaskar_tongaonkar,parker,isoelectric_point,aromaticity,hydrophobicity,stability,target
0,A2T3T0,MDVLYSLSKTLKDARDKIVEGTLYSNVSDLIQQFNQMIITMNGNEF...,161,165,SASFT,1.016,0.703,1.018,2.22,5.810364,0.103275,-0.143829,40.2733,1
1,F0V2I4,MTIHKVAINGFGRIGRLLFRNLLSSQGVQVVAVNDVVDIKVLTHLL...,251,255,LCLKI,0.77,0.179,1.199,-3.86,6.210876,0.065476,-0.036905,24.998512,1
2,O75508,MVATCLQVVGFVTSFVGWIGVIVTTSTNDWVVTCGYTIPTCRKLDE...,145,149,AHRET,0.852,3.427,0.96,4.28,8.223938,0.091787,0.879227,27.863333,1
3,O84462,MTNSISGYQPTVTTSTSSTTSASGASGSLGASSVSTTANATVTQTA...,152,156,SNYDD,1.41,2.548,0.936,6.32,4.237976,0.044776,-0.521393,30.765373,1
4,P00918,MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...,85,89,DGTYR,1.214,1.908,0.937,4.64,6.867493,0.103846,-0.578846,21.684615,1


In [4]:
# (number of rows, number of columns) of the loaded table
df.shape

(14387, 14)

In [5]:
# Frequency of each (end_position - start_position) span, most common first.
# NOTE(review): in the preview above, start 161 / end 165 goes with the
# 5-letter peptide "SASFT", so this difference appears to be one less than
# the peptide length (positions look inclusive) — confirm before treating it
# as a length.
position_span = df['end_position'].sub(df['start_position'])
position_span.value_counts()

14    4611
9     3747
7     2149
11    1185
8     1132
12     465
6      318
13     276
5      265
10     165
4       74
dtype: int64

In [9]:
# Feature columns: every column of `df` except the identifier / sequence
# columns and the label, kept in their original column order.
non_feature_cols = ('parent_protein_id', 'protein_seq', 'peptide_seq', 'target')
feature_cols = [name for name in df.columns if name not in non_feature_cols]
feature_cols

['start_position',
 'end_position',
 'chou_fasman',
 'emini',
 'kolaskar_tongaonkar',
 'parker',
 'isoelectric_point',
 'aromaticity',
 'hydrophobicity',
 'stability']

In [10]:
# Count missing entries per column across the features and the target;
# a column of zeros means nothing has to be imputed or dropped.
checked_cols = [*feature_cols, 'target']
df[checked_cols].isna().sum()

start_position         0
end_position           0
chou_fasman            0
emini                  0
kolaskar_tongaonkar    0
parker                 0
isoelectric_point      0
aromaticity            0
hydrophobicity         0
stability              0
target                 0
dtype: int64

In [12]:
# Split df into a training set (80%) and a held-out test set (20%).
# The shuffle is driven by the notebook-wide SEED constant rather than a
# repeated literal, so changing SEED in the config cell changes the split too.
X_train, X_test, y_train, y_test = train_test_split(
    df[feature_cols],
    df.target,
    test_size=0.2,
    random_state=SEED,
)

In [17]:
# normalization
# Standardize the feature columns with StandardScaler. The scaler is fitted on
# the training rows only and then applied unchanged to the test rows.
#
# The raw values are looked up in `df` through each split's index instead of
# being read back from X_train / X_test: this cell overwrites those two names,
# so reading them on a re-run would fit `scaler` on data that is already
# standardized and its learned statistics would no longer describe the raw
# features.
train_raw = df.loc[X_train.index, feature_cols]
test_raw = df.loc[X_test.index, feature_cols]

scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(train_raw), index=train_raw.index, columns=feature_cols)
X_test = pd.DataFrame(scaler.transform(test_raw), index=test_raw.index, columns=feature_cols)