## Classification of DD proteins using TwinSVM and dimer features

In [19]:
import pandas as pd
import numpy as np
from sklearn import preprocessing

# Read the dimer feature table; the displayed preview shows a Fold column,
# a Protein name column and feature columns F1..F400.
dataset_csv = './dataset/dd_separated_dimer_dataset.csv'
dd_dataset = pd.read_csv(dataset_csv)
dd_dataset.head(5)

Unnamed: 0,Fold,Protein name,F1,F2,F3,F4,F5,F6,F7,F8,...,F391,F392,F393,F394,F395,F396,F397,F398,F399,F400
0,Globm-like(alpha),2LHB,790.757707,1399.278353,1160.903046,1304.446194,1148.844103,1226.536499,1366.709337,1065.2709,...,2047.411499,1932.648648,2066.643207,1873.608234,2128.992637,1479.646794,1791.508381,1855.162705,2256.788191,2113.393006
1,Globm-like(alpha),3SDHA,1858.069057,2583.822269,2225.255813,2541.743965,2282.452582,2314.731649,2645.835706,2127.373864,...,3049.226743,2721.848606,3233.938243,3030.218287,3116.204002,2971.653669,3084.973696,2660.943795,3197.60394,3105.250496
2,Globm-like(alpha),1FLP,7.155008,230.267735,162.175731,161.391631,158.688111,219.089868,183.23262,253.579903,...,456.046522,442.254578,488.554104,476.69263,503.492418,407.816526,454.116052,441.108208,511.866271,512.277052
3,Globm-like(alpha),2HBG,366.744441,404.68226,359.57748,419.468181,374.871212,469.008454,321.988257,310.432062,...,780.581804,815.827219,816.06181,789.845319,831.392548,714.711919,823.400724,678.371956,831.392548,832.072321
4,Globm-like(alpha),2MGE,2742.19943,3507.138537,3380.188979,2845.115945,3185.902917,3051.498448,3128.557751,3184.984844,...,4284.268445,4235.922465,4173.931347,4061.108912,4149.400413,4094.231864,4094.404959,3879.324727,4143.189527,4180.038029


### Converting class labels to unique integers

In [20]:
# Encode every fold name as an integer code. `labels` holds the unique fold
# names, so labels[code] recovers the original name.
y_true, labels = pd.factorize(dd_dataset['Fold'])

# Keep the cell idempotent: DataFrame.insert raises ValueError when the column
# already exists, which happens whenever this cell is re-run on a live kernel.
if 'class labels' in dd_dataset.columns:
    dd_dataset['class labels'] = y_true
else:
    # Position 1 places the codes right after the Fold column.
    dd_dataset.insert(1, 'class labels', y_true)
dd_dataset.head()

Unnamed: 0,Fold,class labels,Protein name,F1,F2,F3,F4,F5,F6,F7,...,F391,F392,F393,F394,F395,F396,F397,F398,F399,F400
0,Globm-like(alpha),0,2LHB,790.757707,1399.278353,1160.903046,1304.446194,1148.844103,1226.536499,1366.709337,...,2047.411499,1932.648648,2066.643207,1873.608234,2128.992637,1479.646794,1791.508381,1855.162705,2256.788191,2113.393006
1,Globm-like(alpha),0,3SDHA,1858.069057,2583.822269,2225.255813,2541.743965,2282.452582,2314.731649,2645.835706,...,3049.226743,2721.848606,3233.938243,3030.218287,3116.204002,2971.653669,3084.973696,2660.943795,3197.60394,3105.250496
2,Globm-like(alpha),0,1FLP,7.155008,230.267735,162.175731,161.391631,158.688111,219.089868,183.23262,...,456.046522,442.254578,488.554104,476.69263,503.492418,407.816526,454.116052,441.108208,511.866271,512.277052
3,Globm-like(alpha),0,2HBG,366.744441,404.68226,359.57748,419.468181,374.871212,469.008454,321.988257,...,780.581804,815.827219,816.06181,789.845319,831.392548,714.711919,823.400724,678.371956,831.392548,832.072321
4,Globm-like(alpha),0,2MGE,2742.19943,3507.138537,3380.188979,2845.115945,3185.902917,3051.498448,3128.557751,...,4284.268445,4235.922465,4173.931347,4061.108912,4149.400413,4094.231864,4094.404959,3879.324727,4143.189527,4180.038029


### Creating a training set from the data frame

In [30]:
# Feature columns are named F1 .. F400
feature_columns = ['F%d' % idx for idx in range(1, 401)]

# Rescale each feature with MinMaxScaler; the scaled matrix is used for training.
# NOTE(review): the scaler is fitted on all samples before cross-validation,
# so held-out folds influence the scaling - consider a Pipeline inside the search.
min_max_scaler = preprocessing.MinMaxScaler()
train_scaled = min_max_scaler.fit_transform(dd_dataset[feature_columns].values)
train_data = train_scaled

n_samples, n_features = train_data.shape
n_classes = np.unique(y_true).size
print("Number of samples: %d, Number of features: %d\n Number of classes: %d"
      % (n_samples, n_features, n_classes))

Number of samples: 311, Number of features: 400
 Number of classes: 27


### Save dataset

In [37]:
# Disabled export: uncomment to write the 'class labels' column plus F1..F400
# to ./dataset/dd_dimer_num.csv (index=False omits the row index).
#dd_dataset[['class labels'] + ['F%d' % i for i in range(1,401)]].to_csv('./dataset/dd_dimer_num.csv',
 #                                                                       index=False)

### TwinSVM classifier

In [31]:
# tsvm folder should be added to path for import
import sys
sys.path.insert(0, '/home/mir/mir-projects/bio-protein-recog/src/tsvm')

from tsvm import twinsvm

kernel = 'linear'

tsvm_model = twinsvm.OVO_TSVM(kernel=kernel)

### Define range of parameters for grid search

In [32]:
from sklearn.model_selection import GridSearchCV

# 17 candidate values, 2^-8 .. 2^8, for each of C_1 and C_2
c_range = {name: [2.0 ** exp for exp in range(-8, 9)] for name in ('C_1', 'C_2')}

# gamma (2^-10 .. 2^4) is searched only when the RBF kernel is selected
if kernel == 'RBF':
    gamma_range = {'gamma': [2.0 ** exp for exp in range(-10, 5)]}
else:
    gamma_range = {}

# Merge both grids, C parameters first
param_range = dict(c_range)
param_range.update(gamma_range)

# Arguments for grid search
cv_fold = 10    # number of cross-validation folds
n_workers = 8   # value handed to n_jobs (parallel workers)

result = GridSearchCV(
    estimator=tsvm_model,
    param_grid=param_range,
    cv=cv_fold,
    n_jobs=n_workers,
    refit=True,
    verbose=1,
)

Start grid search!

In [33]:
# Run the exhaustive search on the scaled features and integer fold codes.
# With the linear kernel the grid has 17 x 17 = 289 candidates, evaluated over
# 10 folds (2890 fits); refit=True was set when `result` was built.
# NOTE(review): the recorded output ends in KeyboardInterrupt, so the saved
# notebook contains no completed search result.
result.fit(train_data, y_true)

Fitting 10 folds for each of 289 candidates, totalling 2890 fits




KeyboardInterrupt: 