## Training SVM classifier on DD_bigram dataset

The first step is to read dataset from CSV file

In [1]:
import pandas as pd
from sklearn import preprocessing


#dd_dataset = pd.read_csv('./dataset/dd_pssm_dataset_improved.csv')
# Add separated dimers feature extraction

dd_dataset = pd.read_csv('./dataset/pca_spd.csv')

In [2]:
dd_dataset.head()

Unnamed: 0,class labels,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F71,F72,F73,F74,F75,F76,F77,F78,F79,F80
0,0,-1.23123,-0.12887,0.023117,-0.234682,-0.002786,-0.011627,0.035891,0.011365,0.003888,...,0.00332,-0.000109,0.007113,0.003471,-0.005336,0.001139,0.01299,-0.014149,0.001149,0.015473
1,0,0.344408,-0.106565,0.020193,-0.086229,0.117225,0.04443,0.084057,-0.073645,0.034724,...,0.007709,-0.001961,0.005301,-0.003684,-0.003467,-0.002431,-0.010845,0.008244,0.014376,-0.002507
2,0,-3.029132,-0.026505,0.047796,0.012177,0.043901,0.036138,0.037599,0.004695,-0.000661,...,0.003564,-0.001329,-0.003671,-0.004046,-0.003318,0.001048,0.002861,-0.000234,0.001196,-0.001146
3,0,-2.607506,-0.092531,0.066263,0.024992,0.050555,0.026905,0.02954,0.004926,-0.01545,...,-0.003472,0.001659,9e-06,-0.000555,-0.002641,-0.000271,0.003342,0.004203,0.001513,-0.005227
4,0,1.578113,-0.224183,-0.207093,0.013658,0.015117,0.085344,0.062421,0.093711,-0.054058,...,-0.014802,0.007215,0.005183,-0.000568,-0.00357,-0.000858,-0.000448,0.011435,0.008016,0.000831


In [6]:
train_data = dd_dataset[['F%d' % i for i in range(1,81)]].values

# Normalize dataset
#min_max_scaler = preprocessing.MinMaxScaler()
#train_scaled = min_max_scaler.fit_transform(train_data)
#train_data = train_scaled

labels = dd_dataset['class labels'].values

print("Number of samples: %d, Number of features: %d" % (train_data.shape[0], train_data.shape[1]))

Number of samples: 311, Number of features: 80


In [7]:
# Show normlaized data

#train = pd.DataFrame(train_data)

#train.head()


## SVM classifier

In [8]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

# Hyperparameters
kernel = 'rbf'

# An instance of SVM classifier
svm_cl = SVC(kernel=kernel)

Define range of parameters for Grid Search

In [9]:
# Penalty parameter
c_range = {'C': [float(2**i) for i in range(-14, 14)]}
# Gamma parameter for RBF kernel
gamma_range = {'gamma': [float(2**i) for i in range(-14, 14)]} if kernel == 'rbf' else {}

param_range = {**c_range, **gamma_range}

# Arguments for grid search
cv_fold = 10
n_workers = -1 # Number of CPU threads

result = GridSearchCV(svm_cl, param_range, cv=cv_fold, n_jobs=n_workers, refit=True,
                      verbose=1)

Start grid search!

In [10]:
result.fit(train_data, labels)

Fitting 10 folds for each of 784 candidates, totalling 7840 fits


[Parallel(n_jobs=-1)]: Done 106 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done 1306 tasks      | elapsed:   12.5s
[Parallel(n_jobs=-1)]: Done 3306 tasks      | elapsed:   31.4s
[Parallel(n_jobs=-1)]: Done 6106 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 7840 out of 7840 | elapsed:  1.4min finished


GridSearchCV(cv=10, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'C': [6.103515625e-05, 0.0001220703125, 0.000244140625, 0.00048828125, 0.0009765625, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0, 2048.0, 4096.0, 8192.0], 'gamma': [6.103515625e-05, 0.0...25, 0.5, 1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0, 2048.0, 4096.0, 8192.0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

Best results

In [11]:
print("Best accuracy: %.2f" % (result.best_score_ * 100))

Best accuracy: 32.48


Confusion matrix

In [12]:
from misc import plt_confusion_matrix
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
import numpy as np

y_pred = result.best_estimator_.predict(train_data)
cm = confusion_matrix(labels, y_pred)

print("Accuracy: ", (accuracy_score(labels, y_pred) * 100))

plt.figure(figsize=(20, 10))
plt_confusion_matrix(cm, np.unique(labels))


Accuracy:  85.85209003215434


NameError: name 'plt' is not defined

Classification report

In [None]:
print(classification_report(labels, y_pred))

# XG Boost

In [None]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

# Create Dmatrix for more performance 
data_matrix = xgb.DMatrix(data=train_data,label=labels)

# Split data to train and test
X_train, X_test, y_train, y_test = train_test_split(train_data, labels, test_size=0.2, random_state=123)

In [None]:
# Instantitate an XGBosst Classifier
xg_clf = xgb.XGBClassifier(objective ='binary:logistic', colsample_bytree = 0.1, learning_rate = 0.1,
                max_depth = 10, alpha = 10, n_estimators = 10)


In [None]:
# Fit the classifier to the training set
xg_clf.fit(X_train,y_train)

# Predit test data
preds = xg_clf.predict(X_test)

# Accuracy
accuracy = accuracy_score(y_test, preds)

In [None]:
print("Best accuracy: %.2f" % (accuracy*100))
