In [1]:
import pandas as pd
import numpy as np

# for visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# for Pre-processing 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder

# for Dimensionality reduction
from sklearn.decomposition import PCA

#for Models development 
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, BaggingClassifier, AdaBoostClassifier, VotingClassifier

#for validation
from sklearn.model_selection import cross_val_score,KFold


#for model Evaluation 
from sklearn.metrics import accuracy_score, f1_score,confusion_matrix,classification_report,matthews_corrcoef

import warnings
# Silence every Python warning for the rest of the session. This keeps cell
# output short, but it also hides deprecation and convergence warnings.
warnings.filterwarnings(action='ignore')

In [2]:
# Read 'Ov_fullprots.csv' from the working directory into `data` and display it.
data = pd.read_csv(filepath_or_buffer='Ov_fullprots.csv')
data

Unnamed: 0,Info_organism_id,Info_protein_id,Info_pos,Info_AA,Info_epitope_id,Info_host_id,Info_nPos,Info_nNeg,Info_type,Info_window,...,feat_BLOSUM4,feat_BLOSUM5,feat_BLOSUM6,feat_BLOSUM7,feat_BLOSUM8,feat_BLOSUM9,feat_BLOSUM10,feat_MSWHIM1,feat_MSWHIM2,feat_MSWHIM3
0,6282,CAA31690.1,1,R,,,0,0,,RTTTMKIL,...,-0.226250,0.228750,0.303750,0.001250,-0.626250,-0.101250,-0.080000,-0.602500,0.681250,-0.225000
1,6282,CAA31690.1,2,T,,,0,0,,RTTTMKILF,...,-0.132222,0.142222,0.302222,-0.001111,-0.545556,-0.042222,-0.214444,-0.451111,0.700000,-0.237778
2,6282,CAA31690.1,3,T,,,0,0,,RTTTMKILFC,...,-0.038000,0.311000,0.167000,-0.075000,-0.453000,-0.050000,-0.203000,-0.472000,0.656000,-0.241000
3,6282,CAA31690.1,4,T,,,0,0,,RTTTMKILFCL,...,-0.050000,0.285455,0.182727,-0.058182,-0.398182,-0.085455,-0.186364,-0.496364,0.661818,-0.233636
4,6282,CAA31690.1,5,M,,,0,0,,RTTTMKILFCLL,...,-0.060000,0.264167,0.195833,-0.044167,-0.352500,-0.115000,-0.172500,-0.516667,0.666667,-0.227500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69466,6282,AAB96970.1,648,T,,,0,0,,EEKLIVSTKKQS,...,-0.257500,0.137500,0.191667,-0.176667,-0.156667,0.286667,-0.056667,-0.465000,0.405833,-0.205000
69467,6282,AAB96970.1,649,K,,,0,0,,EKLIVSTKKQS,...,-0.248182,0.175455,0.216364,-0.100909,-0.203636,0.242727,-0.089091,-0.529091,0.478182,-0.220000
69468,6282,AAB96970.1,650,K,,,0,0,,KLIVSTKKQS,...,-0.237000,0.221000,0.246000,-0.010000,-0.260000,0.190000,-0.128000,-0.606000,0.565000,-0.238000
69469,6282,AAB96970.1,651,Q,,,0,0,,LIVSTKKQS,...,-0.168889,0.154444,0.262222,-0.034444,-0.303333,0.191111,-0.047778,-0.616667,0.618889,-0.331111


In [3]:
# Count the missing values in each column of the raw data, largest counts first.

data.isna().sum().sort_values(ascending=False)

Class               28528
Info_epitope_id     28528
Info_host_id        28528
Info_type           28528
feat_KF6                0
                    ...  
feat_VHSE8              0
feat_ProtFP1            0
feat_ProtFP2            0
feat_ProtFP3            0
Info_organism_id        0
Length: 94, dtype: int64

# Feature engineering on series-based data with tsfresh

tsfresh can be installed from conda-forge with `conda install -c conda-forge tsfresh`, or from PyPI with pip:

!pip install tsfresh

In [3]:
# Work on an independent (deep) copy so that `data` itself is left untouched.
df_tfresh = data.copy(deep=True)
df_tfresh

Unnamed: 0,Info_organism_id,Info_protein_id,Info_pos,Info_AA,Info_epitope_id,Info_host_id,Info_nPos,Info_nNeg,Info_type,Info_window,...,feat_BLOSUM4,feat_BLOSUM5,feat_BLOSUM6,feat_BLOSUM7,feat_BLOSUM8,feat_BLOSUM9,feat_BLOSUM10,feat_MSWHIM1,feat_MSWHIM2,feat_MSWHIM3
0,6282,CAA31690.1,1,R,,,0,0,,RTTTMKIL,...,-0.226250,0.228750,0.303750,0.001250,-0.626250,-0.101250,-0.080000,-0.602500,0.681250,-0.225000
1,6282,CAA31690.1,2,T,,,0,0,,RTTTMKILF,...,-0.132222,0.142222,0.302222,-0.001111,-0.545556,-0.042222,-0.214444,-0.451111,0.700000,-0.237778
2,6282,CAA31690.1,3,T,,,0,0,,RTTTMKILFC,...,-0.038000,0.311000,0.167000,-0.075000,-0.453000,-0.050000,-0.203000,-0.472000,0.656000,-0.241000
3,6282,CAA31690.1,4,T,,,0,0,,RTTTMKILFCL,...,-0.050000,0.285455,0.182727,-0.058182,-0.398182,-0.085455,-0.186364,-0.496364,0.661818,-0.233636
4,6282,CAA31690.1,5,M,,,0,0,,RTTTMKILFCLL,...,-0.060000,0.264167,0.195833,-0.044167,-0.352500,-0.115000,-0.172500,-0.516667,0.666667,-0.227500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69466,6282,AAB96970.1,648,T,,,0,0,,EEKLIVSTKKQS,...,-0.257500,0.137500,0.191667,-0.176667,-0.156667,0.286667,-0.056667,-0.465000,0.405833,-0.205000
69467,6282,AAB96970.1,649,K,,,0,0,,EKLIVSTKKQS,...,-0.248182,0.175455,0.216364,-0.100909,-0.203636,0.242727,-0.089091,-0.529091,0.478182,-0.220000
69468,6282,AAB96970.1,650,K,,,0,0,,KLIVSTKKQS,...,-0.237000,0.221000,0.246000,-0.010000,-0.260000,0.190000,-0.128000,-0.606000,0.565000,-0.238000
69469,6282,AAB96970.1,651,Q,,,0,0,,LIVSTKKQS,...,-0.168889,0.154444,0.262222,-0.034444,-0.303333,0.191111,-0.047778,-0.616667,0.618889,-0.331111


# Removing the irrelevant features from the dataset

In [4]:
# Remove all unwanted `Info_*` columns in a single call instead of one
# reassignment per column.
df_tfresh = df_tfresh.drop(
    columns=[
        "Info_organism_id",
        "Info_epitope_id",
        "Info_host_id",
        "Info_nPos",
        "Info_nNeg",
        "Info_type",
        "Info_window",
        # trying out
        "Info_protein_id",
        "Info_AA",
        "Info_split",
    ]
)


In [5]:
# Display the frame after the column drops above to check which columns remain.
df_tfresh

Unnamed: 0,Info_pos,Class,feat_seq_entropy,feat_C_atoms,feat_H_atoms,feat_N_atoms,feat_O_atoms,feat_S_atoms,feat_molecular_weight,feat_Perc_Tiny,...,feat_BLOSUM4,feat_BLOSUM5,feat_BLOSUM6,feat_BLOSUM7,feat_BLOSUM8,feat_BLOSUM9,feat_BLOSUM10,feat_MSWHIM1,feat_MSWHIM2,feat_MSWHIM3
0,1,,2.405639,41,92,12,19,1,1089.30,0.375000,...,-0.226250,0.228750,0.303750,0.001250,-0.626250,-0.101250,-0.080000,-0.602500,0.681250,-0.225000
1,2,,2.641604,50,103,13,21,1,1254.49,0.333333,...,-0.132222,0.142222,0.302222,-0.001111,-0.545556,-0.042222,-0.214444,-0.451111,0.700000,-0.237778
2,3,,2.846439,53,110,14,23,2,1375.65,0.400000,...,-0.038000,0.311000,0.167000,-0.075000,-0.453000,-0.050000,-0.203000,-0.472000,0.656000,-0.241000
3,4,,2.845351,59,123,15,25,2,1506.82,0.363636,...,-0.050000,0.285455,0.182727,-0.058182,-0.398182,-0.085455,-0.186364,-0.496364,0.661818,-0.233636
4,5,,2.792481,65,136,16,27,2,1637.99,0.333333,...,-0.060000,0.264167,0.195833,-0.044167,-0.352500,-0.115000,-0.172500,-0.516667,0.666667,-0.227500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69466,648,,2.855389,60,130,16,32,0,1587.77,0.250000,...,-0.257500,0.137500,0.191667,-0.176667,-0.156667,0.286667,-0.056667,-0.465000,0.405833,-0.205000
69467,649,,2.845351,55,121,15,28,0,1440.64,0.272727,...,-0.248182,0.175455,0.216364,-0.100909,-0.203636,0.242727,-0.089091,-0.529091,0.478182,-0.220000
69468,650,,2.646439,50,112,14,24,0,1293.51,0.300000,...,-0.237000,0.221000,0.246000,-0.010000,-0.260000,0.190000,-0.128000,-0.606000,0.565000,-0.238000
69469,651,,2.725481,44,98,12,22,0,1147.32,0.333333,...,-0.168889,0.154444,0.262222,-0.034444,-0.303333,0.191111,-0.047778,-0.616667,0.618889,-0.331111


In [7]:
# # Encode the string columns as integer labels so that the data can be passed to the scaler
# le = LabelEncoder()
# df_tfresh['Info_protein_id']=le.fit_transform(df_tfresh['Info_protein_id'])
# df_tfresh['Info_AA']=le.fit_transform(df_tfresh['Info_AA'])
# df_tfresh['Info_split']=le.fit_transform(df_tfresh['Info_split'])

In [6]:
# Separate the target column (`Class`) from the predictor columns.
# NOTE(review): the null counts shown in this notebook report 28528 missing
# `Class` values; those rows are still present in both x and y here -- confirm
# they are handled before any estimator is fitted.
y = df_tfresh.loc[:, 'Class']
x = df_tfresh.drop(columns=['Class'])

In [11]:
# y = np.array(y).reshape(1, -1)

In [12]:
# from sklearn.impute import KNNImputer
# imputer = KNNImputer(n_neighbors=2)
# imputer.fit_transform(x)
# imputer.fit_transform(y)

array([[1., 1., 1., ..., 1., 1., 1.]])

In [7]:
# Count the missing values in each remaining column, largest counts first.

df_tfresh.isna().sum().sort_values(ascending=False)

Class           28528
feat_MSWHIM3        0
feat_Z1             0
feat_KF3            0
feat_KF4            0
                ...  
feat_ProtFP3        0
feat_ProtFP4        0
feat_ProtFP5        0
feat_ProtFP6        0
Info_pos            0
Length: 84, dtype: int64

In [8]:
# Standardise the columns of x with StandardScaler's default settings; the
# scaled array is stored in x_TsFresh.
scaler = StandardScaler()
scaler.fit(x)
x_TsFresh = scaler.transform(x)

In [None]:
from tsfresh.feature_extraction import ComprehensiveFCParameters, extract_features

# Use tsfresh's comprehensive set of feature calculators.
settings = ComprehensiveFCParameters()

# Keep the extracted matrix: the following cells (impute, select_features,
# train_test_split) all read it under the name `features`. Previously the
# return value was discarded, so those cells failed with a NameError.
#
# NOTE(review): tsfresh treats `column_id` as the identifier of one series, so
# every row sharing the same `Info_pos` is grouped into a single series and
# the result has one row per distinct `Info_pos` value. Confirm that this is
# the intended grouping and that the result lines up with `y`.
features = extract_features(x, default_fc_parameters=settings, column_id="Info_pos")
features

Feature Extraction:   0%|          | 0/15 [00:00<?, ?it/s]

In [None]:
# The extracted features can contain NaN or infinite values. tsfresh's `impute` replaces them in place, column by column.

from tsfresh.utilities.dataframe_functions import impute
impute(features)

In [None]:
# Keep only the extracted features that tsfresh judges relevant to the target y.

from tsfresh import select_features

filtered_features = select_features(X=features, y=y)
filtered_features

In [None]:
# Compare a model trained on all extracted features with one trained on the
# filtered features. Both use the same train/test partition.
# `train_test_split` is already imported in the first cell, so it is not
# re-imported here.

# A fixed random_state makes the 60/40 split reproducible between runs;
# without it every run scores the models on a different partition.
X_feature_train, X_feature_test, y_train, y_test = train_test_split(
    features, y, test_size=.4, random_state=42
)

# Restrict the same rows to the columns kept by the feature selection.
selected_columns = filtered_features.columns
X_filtered_train = X_feature_train[selected_columns]
X_filtered_test = X_feature_test[selected_columns]

In [None]:
# Fit a decision tree on the data with all extracted features and report its
# performance on the held-out test rows.
# `DecisionTreeClassifier` and `classification_report` are already imported in
# the first cell, so they are not re-imported here.

# random_state is fixed because the tree permutes the features at every split;
# unseeded, the reported scores can differ from run to run.
classifier_feature = DecisionTreeClassifier(random_state=42)
classifier_feature.fit(X_feature_train, y_train)
print(classification_report(y_test, classifier_feature.predict(X_feature_test)))

In [None]:
# Fit the same kind of model on the filtered features only, so that its report
# can be compared with the report of the model trained on all features.

# random_state is fixed so that differences in the report come from the
# feature set rather than from the tree's internal randomness.
classifier_filtered = DecisionTreeClassifier(random_state=42)
classifier_filtered.fit(X_filtered_train, y_train)
print(classification_report(y_test, classifier_filtered.predict(X_filtered_test)))

Here we can see that the same type of model performs better when it is trained on the filtered features.