Support Vector Machines (SVM)

In [7]:
import pandas as pd 
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split,cross_validate
from sklearn import preprocessing
from sklearn.model_selection import RepeatedStratifiedKFold,cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, precision_score, recall_score, auc,roc_curve,accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
from tabulate import tabulate
from sklearn.preprocessing import LabelEncoder
import numpy as np 
import warnings
warnings.filterwarnings('ignore')

Importing Data

 **Dataset 1**

Dataset Manipulation

In [8]:

scaler = StandardScaler()
df_dataset_1 = pd.read_csv("project3_dataset1.txt",sep="\t", header=None)
X,y = df_dataset_1.iloc[:,:-1].to_numpy(), df_dataset_1.iloc[:,-1].to_numpy()
X = scaler.fit_transform(X)
X_train, X_test,y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)
print("The shape of the Training features is {} ".format(X_train.shape))
print("The shape of the Training labels is {} ".format(y_train.shape))
print("The shape of the Testing features is {} ".format(X_test.shape))
print("The shape of the Testing labels is {} ".format(y_test.shape))


The shape of the Training features is (455, 30) 
The shape of the Training labels is (455,) 
The shape of the Testing features is (114, 30) 
The shape of the Testing labels is (114,) 


Using default Hyperparameters 

In [10]:
#No Hyperparameter tuning
def train_default(X_train,y_train,X_test,y_test):
  clf_default = SVC()
  clf_default.fit(X_train,y_train)
  score = clf_default.score(X_test,y_test)
  y_hat = clf.predict(X_test)
  print("The accuracy of the default hyperparameters on the test set is {:.2f}%".format(score))
  print("the hyperparamters are : {}".format(clf_default.get_params()))
  accuracy_score(y_test, y_hat)
  clf_report = pd.DataFrame(classification_report(y_test, y_hat, output_dict=True))
  print(tabulate(clf_report, headers='keys', tablefmt='psql'))

train_default(X_train,y_train,X_test,y_test)

The accuracy of the default hyperparameters on the test set is 0.97%
the hyperparamters are : {'C': 1.0, 'break_ties': False, 'cache_size': 200, 'class_weight': None, 'coef0': 0.0, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 'scale', 'kernel': 'rbf', 'max_iter': -1, 'probability': False, 'random_state': None, 'shrinking': True, 'tol': 0.001, 'verbose': False}
+-----------+-----------+-----------+------------+-------------+----------------+
|           |         0 |         1 |   accuracy |   macro avg |   weighted avg |
|-----------+-----------+-----------+------------+-------------+----------------|
| precision |  0.971014 |  0.977778 |   0.973684 |    0.974396 |       0.973744 |
| recall    |  0.985294 |  0.956522 |   0.973684 |    0.970908 |       0.973684 |
| f1-score  |  0.978102 |  0.967033 |   0.973684 |    0.972568 |       0.973636 |
| support   | 68        | 46        |   0.973684 |  114        |     114        |
+-----------+-----------+-----------+------------+--

Hyperparamter tuning using GridSearchCV

In [12]:

def search_parameter_space(grid):
  clf = SVC()
  cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
  grid_search = GridSearchCV(estimator=clf, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
  grid_result = grid_search.fit(X_train, y_train)
  score = grid_result.score(X_test,y_test) * 100
  print("The accuracy of the tuned hyperparameters on the test set without cross validation is {:.2f}%".format(score))
  y_hat = grid_result.predict(X_test)
  accuracy_score(y_test, y_hat)
  clf_report = pd.DataFrame(classification_report(y_test, y_hat, output_dict=True))
  print(tabulate(clf_report, headers='keys', tablefmt='psql'))
  print("\t\t\t\t Cross Validation")
  print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
  means = grid_result.cv_results_['mean_test_score']
  stds = grid_result.cv_results_['std_test_score']
  params = grid_result.cv_results_['params']
  for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))
  print("Cross Validation Results for the Best Fit :-")
  cv_results = cross_validate(grid_search, X_train,y_train, cv=cv, scoring=('accuracy', 'precision', 'recall', 'f1','roc_auc'), return_train_score=True)
  print('\n\nCross-Validation accuracy: %.3f +/- %.3f' %(np.mean(cv_results.get('test_accuracy')), np.std(cv_results.get('test_accuracy'))))
  print('\nList of possible accuracy:', cv_results.get('test_accuracy'))
  print('\nMaximum Accuracy That can be obtained from this model is:', max(cv_results.get('test_accuracy'))*100, '%')
  print('\nMinimum Accuracy:', min(cv_results.get('test_accuracy'))*100, '%')
  print('\nOverall Accuracy:', (np.mean(cv_results.get('test_accuracy'))*100, '%'))
  print('\nStandard Deviation is:', np.std(cv_results.get('test_accuracy')))
  #print(classification_report(y_test, y_hat))
c_values = [0.1, 1, 10, 100, 1000,10000]
gamma = [1, 0.1, 0.01, 0.001, 0.0001]
kernel = ['linear', 'poly', 'rbf', 'sigmoid']
param_grid = dict(C=c_values,gamma=gamma,kernel=kernel)
search_parameter_space(param_grid)

The accuracy of the tuned hyperparameters on the test set without cross validation is 97.37%
+-----------+-----------+-----------+------------+-------------+----------------+
|           |         0 |         1 |   accuracy |   macro avg |   weighted avg |
|-----------+-----------+-----------+------------+-------------+----------------|
| precision |  0.971014 |  0.977778 |   0.973684 |    0.974396 |       0.973744 |
| recall    |  0.985294 |  0.956522 |   0.973684 |    0.970908 |       0.973684 |
| f1-score  |  0.978102 |  0.967033 |   0.973684 |    0.972568 |       0.973636 |
| support   | 68        | 46        |   0.973684 |  114        |     114        |
+-----------+-----------+-----------+------------+-------------+----------------+
				 Cross Validation
Best: 0.977279 using {'C': 10, 'gamma': 0.01, 'kernel': 'rbf'}
0.972866 (0.025178) with: {'C': 0.1, 'gamma': 1, 'kernel': 'linear'}
0.949436 (0.032698) with: {'C': 0.1, 'gamma': 1, 'kernel': 'poly'}
0.635217 (0.007898) with: {'C'

**Dataset-2**

Import dataset

In [14]:

def preprocess(df_dataset_2):
  X,y = df_dataset_2.iloc[:,:-1], df_dataset_2.iloc[:,-1]
  le = LabelEncoder()
  scaler = StandardScaler()
  X_without_categorical = X.iloc[:,X.columns!=4]
  X_with_categorical = le.fit_transform(X.iloc[:,X.columns==4]).reshape(-1,1)
  X = np.append(X_without_categorical,X_with_categorical,1)
  X = scaler.fit_transform(X)
  return X,y
df_dataset_2 = pd.read_csv("project3_dataset2.txt",sep="\t", header=None)
X,y = preprocess(df_dataset_2)
X_train, X_test,y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)

Using Default Hyperparameters

In [16]:
def train_default(X_train,y_train,X_test,y_test):
  clf_default = SVC()
  clf_default.fit(X_train,y_train)
  y_hat = clf_default.predict(X_test)
  score = clf_default.score(X_test,y_test) * 100
  print("The accuracy of the default hyperparameters on the test set is {:.2f}%".format(score))
  print("the hyperparamters are : {}".format(clf_default.get_params()))
  accuracy_score(y_test, y_hat)
  clf_report = pd.DataFrame(classification_report(y_test, y_hat, output_dict=True))
  print(tabulate(clf_report, headers='keys', tablefmt='psql'))

train_default(X_train,y_train,X_test,y_test)

The accuracy of the default hyperparameters on the test set is 70.97%
the hyperparamters are : {'C': 1.0, 'break_ties': False, 'cache_size': 200, 'class_weight': None, 'coef0': 0.0, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 'scale', 'kernel': 'rbf', 'max_iter': -1, 'probability': False, 'random_state': None, 'shrinking': True, 'tol': 0.001, 'verbose': False}
+-----------+-----------+-----------+------------+-------------+----------------+
|           |         0 |         1 |   accuracy |   macro avg |   weighted avg |
|-----------+-----------+-----------+------------+-------------+----------------|
| precision |  0.80303  |  0.481481 |   0.709677 |    0.642256 |       0.713135 |
| recall    |  0.791045 |  0.5      |   0.709677 |    0.645522 |       0.709677 |
| f1-score  |  0.796992 |  0.490566 |   0.709677 |    0.643779 |       0.711325 |
| support   | 67        | 26        |   0.709677 |   93        |      93        |
+-----------+-----------+-----------+------------+-

Hyperparameter tuning using GridSearchCV

In [None]:

def search_parameter_space(grid):
  clf = SVC()
  cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
  grid_search = GridSearchCV(estimator=clf, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
  grid_result = grid_search.fit(X_train, y_train)
  score = grid_result.score(X_test,y_test) * 100
  print("The accuracy of the tuned hyperparameters on the test set without cross validation is {:.2f}%".format(score))
  y_hat = grid_result.predict(X_test)
  accuracy_score(y_test, y_hat)
  clf_report = pd.DataFrame(classification_report(y_test, y_hat, output_dict=True))
  print(tabulate(clf_report, headers='keys', tablefmt='psql'))
  print("\t\t\t\t Cross Validation")
  print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
  means = grid_result.cv_results_['mean_test_score']
  stds = grid_result.cv_results_['std_test_score']
  params = grid_result.cv_results_['params']
  for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))
  print("Cross Validation Results for the Best Fit :-")
  cv_results = cross_validate(grid_search, X_train,y_train, cv=cv, scoring=('accuracy', 'precision', 'recall', 'f1','roc_auc'), return_train_score=True)
  print('\n\nCross-Validation accuracy: %.3f +/- %.3f' %(np.mean(cv_results.get('test_accuracy')), np.std(cv_results.get('test_accuracy'))))
  print('\nList of possible accuracy:', cv_results.get('test_accuracy'))
  print('\nMaximum Accuracy That can be obtained from this model is:', max(cv_results.get('test_accuracy'))*100, '%')
  print('\nMinimum Accuracy:', min(cv_results.get('test_accuracy'))*100, '%')
  print('\nOverall Accuracy:', (np.mean(cv_results.get('test_accuracy'))*100, '%'))
  print('\nStandard Deviation is:', np.std(cv_results.get('test_accuracy')))
  #print(classification_report(y_test, y_hat))
c_values = [0.1, 1, 10, 100, 1000,10000]
gamma = [1, 0.1, 0.01, 0.001, 0.0001]
kernel = ['linear', 'poly', 'rbf', 'sigmoid']
param_grid = dict(C=c_values,gamma=gamma,kernel=kernel)
search_parameter_space(param_grid)