## Training SVM classifier on DD_bigram dataset

The first step is to read the dataset from a CSV file.

In [None]:
import pandas as pd
from sklearn import preprocessing


# Alternative input kept for reference: './dataset/dd_pssm_dataset_improved.csv'
# Active input: the separated-dimers feature extraction.
dataset_path = './dataset/dd_separated_dimer_dataset.csv'

dd_dataset = pd.read_csv(dataset_path)

In [None]:
# Preview the first rows of the freshly loaded frame.
dd_dataset.head()

Convert class labels to unique integers

In [None]:
# Encode every distinct value of the 'Fold' column as an integer code and store
# the codes in a new 'class labels' column at position 1.
# DataFrame.insert raises ValueError when the column already exists, so the
# guard keeps this cell safe to re-run without reloading the dataset.
if 'class labels' not in dd_dataset.columns:
    dd_dataset.insert(1, 'class labels', pd.factorize(dd_dataset.Fold)[0])

In [None]:
# Preview the frame again to check the 'class labels' column at position 1.
dd_dataset.head()

Create a training set from pandas data frame

In [None]:
# Feature columns are named F1 ... F400.
feature_columns = ['F{}'.format(index) for index in range(1, 401)]
train_data = dd_dataset[feature_columns].values

# Normalize dataset: fit a min-max scaler on the feature matrix and keep the
# scaled copy under both names used by this notebook.
min_max_scaler = preprocessing.MinMaxScaler()
train_scaled = min_max_scaler.fit_transform(train_data)
train_data = train_scaled

# Integer class codes, one per sample.
labels = dd_dataset['class labels'].values

n_samples, n_features = train_data.shape[0], train_data.shape[1]
print("Number of samples: %d, Number of features: %d" % (n_samples, n_features))

In [None]:
# Show normalized data: wrap the scaled array in a DataFrame for rich display
train = pd.DataFrame(data=train_data)

# First five rows (the default for head)
train.head(5)


## Data analysis

In [None]:
# Frequency of folds: bar chart of how many rows carry each 'Fold' value
import matplotlib.pyplot as plt

fold_counts = dd_dataset['Fold'].value_counts()

fig, ax = plt.subplots()
freq_plot = fold_counts.plot(kind='bar', ax=ax)
freq_plot.set_ylabel('Number of proteins')
plt.show()

## SVM classifier

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

# Hyperparameters
# Kernel type handed to SVC; the grid-definition cell also reads this value to
# decide whether 'gamma' is searched.
kernel = 'rbf'

# An instance of SVM classifier
# All other SVC arguments keep their defaults; C (and gamma for 'rbf') are
# supplied later by the grid search.
svm_cl = SVC(kernel=kernel)

Define range of parameters for Grid Search

In [None]:
# Candidate values 2^-14 ... 2^13 (28 values), used for both hyperparameters
power_of_two_grid = [2.0 ** exponent for exponent in range(-14, 14)]

# Penalty parameter
c_range = {'C': list(power_of_two_grid)}
# Gamma parameter for RBF kernel (only searched when the RBF kernel is selected)
if kernel == 'rbf':
    gamma_range = {'gamma': list(power_of_two_grid)}
else:
    gamma_range = {}

# Merge both ranges into the single grid handed to GridSearchCV
param_range = dict(c_range)
param_range.update(gamma_range)

# Arguments for grid search
cv_fold = 5      # number of cross-validation folds
n_workers = -1   # n_jobs value; -1 means use all processors

result = GridSearchCV(
    estimator=svm_cl,
    param_grid=param_range,
    cv=cv_fold,
    n_jobs=n_workers,
    refit=True,
    verbose=1,
)

Start grid search!

In [None]:
# Run the exhaustive search over param_range with cross-validation; because
# refit=True, the best parameter combination is afterwards refit on all of
# train_data and exposed as result.best_estimator_.
result.fit(train_data, labels)

Best results

In [None]:
# Mean cross-validated score of the best parameter combination, as a percentage
best_cv_accuracy = result.best_score_ * 100
print("Best accuracy: %.2f" % best_cv_accuracy)

Confusion matrix

In [None]:
from misc import plt_confusion_matrix
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import cross_val_predict
import numpy as np

# Evaluate on out-of-fold predictions rather than predicting the very samples
# best_estimator_ was refit on: predicting the training data reports
# resubstitution accuracy, which overstates how well the model generalizes.
# cross_val_predict clones the tuned estimator and, for each fold, predicts
# samples that were held out of that fold's fit. The grid search's own cv and
# n_jobs settings are reused.
# NOTE: train_data was min-max scaled using all samples before this step, so a
# small optimistic bias from the scaler can remain.
y_pred = cross_val_predict(result.best_estimator_, train_data, labels,
                           cv=result.cv, n_jobs=result.n_jobs)
cm = confusion_matrix(labels, y_pred)

print("Accuracy: ", (accuracy_score(labels, y_pred) * 100))

# Rows/columns of cm follow the sorted integer class codes
plt.figure(figsize=(20, 10))
plt_confusion_matrix(cm, np.unique(labels))


Classification report

In [None]:
# Text summary of per-class metrics comparing the true labels with y_pred.
print(classification_report(labels, y_pred))

## XGBoost classifier

In [None]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

# Wrap the full feature matrix and labels in XGBoost's DMatrix container.
# NOTE(review): data_matrix is not referenced by the cells visible here -
# confirm whether it is still needed.
data_matrix = xgb.DMatrix(data=train_data, label=labels)

# Split data to train and test: hold out 20% of the samples, with a fixed seed
# so the split is reproducible
test_fraction = 0.2
split_seed = 123
X_train, X_test, y_train, y_test = train_test_split(
    train_data,
    labels,
    test_size=test_fraction,
    random_state=split_seed,
)

In [None]:
# Instantiate an XGBoost classifier from an explicit parameter dictionary.
# NOTE(review): 'binary:logistic' is passed although the labels come from
# factorizing 'Fold'; confirm how XGBClassifier treats this objective when
# there are more than two classes.
xgb_params = {
    'objective': 'binary:logistic',
    'colsample_bytree': 0.1,
    'learning_rate': 0.1,
    'max_depth': 10,
    'alpha': 10,
    'n_estimators': 10,
}
xg_clf = xgb.XGBClassifier(**xgb_params)


In [None]:
# Train on the training split
xg_clf.fit(X_train, y_train)

# Predict the held-out test split
preds = xg_clf.predict(X_test)

# Fraction of test samples whose predicted label matches the true label
accuracy = accuracy_score(y_true=y_test, y_pred=preds)

In [None]:
# `accuracy` is the hold-out score of the single XGBoost configuration fitted
# in the previous cell. No search was run, so it is labelled as a test accuracy
# rather than a "best" one.
print("XGBoost test accuracy: %.2f" % (accuracy*100))
