Implementation of Logistic Regression and Hyperparameter Tuning

In [70]:
import pandas as pd 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split,cross_validate
from sklearn import preprocessing
from sklearn.model_selection import RepeatedStratifiedKFold,cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, precision_score, recall_score, auc,roc_curve,accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
from tabulate import tabulate
from sklearn.preprocessing import LabelEncoder

import numpy as np 
import warnings
warnings.filterwarnings('ignore')

Importing Dataset

In [None]:
df_dataset_1 = pd.read_csv("project3_dataset1.txt",sep="\t", header=None)
df_dataset_2 = pd.read_csv("project3_dataset2.txt",sep="\t", header=None)

 **Dataset 1**

Dataset Manipulation

In [None]:
X,y = df_dataset_1.iloc[:,:-1], df_dataset_1.iloc[:,-1]
X_train, X_test,y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)
print("The shape of the Training features is {} ".format(X_train.shape))
print("The shape of the Training labels is {} ".format(y_train.shape))
print("The shape of the Testing features is {} ".format(X_test.shape))
print("The shape of the Testing labels is {} ".format(y_test.shape))


The shape of the Training features is (455, 30) 
The shape of the Training labels is (455,) 
The shape of the Testing features is (114, 30) 
The shape of the Testing labels is (114,) 


Using default Hyperparameters with no feature scaling

In [None]:
def train_default(X_train,y_train,X_test,y_test):
  clf_default = LogisticRegression()
  clf_default.fit(X_train,y_train)
  y_hat = clf_default.predict(X_test)
  score = clf_default.score(X_test,y_test) * 100
  print("The accuracy of the default hyperparameters on the test set is {:.2f}%".format(score))
  print("the hyperparamters are : {}".format(clf_default.get_params()))
  accuracy_score(y_test, y_hat)
  clf_report = pd.DataFrame(classification_report(y_test, y_hat, output_dict=True))
  print(tabulate(clf_report, headers='keys', tablefmt='psql'))

train_default(X_train,y_train,X_test,y_test)

The accuracy of the default hyperparameters on the test set is 97.37%
the hyperparamters are : {'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}
+-----------+-----------+-----------+------------+-------------+----------------+
|           |         0 |         1 |   accuracy |   macro avg |   weighted avg |
|-----------+-----------+-----------+------------+-------------+----------------|
| precision |  0.957746 |  1        |   0.973684 |    0.978873 |       0.974796 |
| recall    |  1        |  0.934783 |   0.973684 |    0.967391 |       0.973684 |
| f1-score  |  0.978417 |  0.966292 |   0.973684 |    0.972355 |       0.973525 |
| support   | 68        | 46        |   0.973684 |  114        |     114        |
+-----------+-----------+-----------+------------+----

Using Default Hyperparameters with feature scaling

In [None]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)
train_default(X_train,y_train,X_test,y_test)

The accuracy of the default hyperparameters on the test set is 99.12%
the hyperparamters are : {'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}
+-----------+-----------+-----------+------------+-------------+----------------+
|           |         0 |         1 |   accuracy |   macro avg |   weighted avg |
|-----------+-----------+-----------+------------+-------------+----------------|
| precision |  1        |  0.978723 |   0.991228 |    0.989362 |       0.991415 |
| recall    |  0.985294 |  1        |   0.991228 |    0.992647 |       0.991228 |
| f1-score  |  0.992593 |  0.989247 |   0.991228 |    0.99092  |       0.991243 |
| support   | 68        | 46        |   0.991228 |  114        |     114        |
+-----------+-----------+-----------+------------+----

Hyperparamter tuning using GridSearchCV

In [None]:
def search_parameter_space(grid):
  clf = LogisticRegression()
  cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
  grid_search = GridSearchCV(estimator=clf, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
  grid_result = grid_search.fit(X_train, y_train)
  score = grid_result.score(X_test,y_test) * 100
  print("The accuracy of the tuned hyperparameters on the test set without cross validation is {:.2f}%".format(score))
  y_hat = grid_result.predict(X_test)
  accuracy_score(y_test, y_hat)
  clf_report = pd.DataFrame(classification_report(y_test, y_hat, output_dict=True))
  print(tabulate(clf_report, headers='keys', tablefmt='psql'))
  print("\t\t\t\t Cross Validation")
  print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
  means = grid_result.cv_results_['mean_test_score']
  stds = grid_result.cv_results_['std_test_score']
  params = grid_result.cv_results_['params']
  for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))
  print("Cross Validation Results for the Best Fit :-")
  cv_results = cross_validate(grid_search, X_train,y_train, cv=cv, scoring=('accuracy', 'precision', 'recall', 'f1','roc_auc'), return_train_score=True)
  print('\n\nCross-Validation accuracy: %.3f +/- %.3f' %(np.mean(cv_results.get('test_accuracy')), np.std(cv_results.get('test_accuracy'))))
  print('\nList of possible accuracy:', cv_results.get('test_accuracy'))
  print('\nMaximum Accuracy That can be obtained from this model is:', max(cv_results.get('test_accuracy'))*100, '%')
  print('\nMinimum Accuracy:', min(cv_results.get('test_accuracy'))*100, '%')
  print('\nOverall Accuracy:', (np.mean(cv_results.get('test_accuracy'))*100, '%'))
  print('\nStandard Deviation is:', np.std(cv_results.get('test_accuracy')))
  #print(classification_report(y_test, y_hat))
solvers = ['lbfgs','liblinear']
penalty = ['l2']
c_values = [0.01,1,1.2,1.3,10,100,1000,10000]
grid = dict(solver=solvers,penalty=penalty,C=c_values)  
search_parameter_space(grid)

The accuracy of the tuned hyperparameters on the test set without cross validation is 99.12%
+-----------+-----------+-----------+------------+-------------+----------------+
|           |         0 |         1 |   accuracy |   macro avg |   weighted avg |
|-----------+-----------+-----------+------------+-------------+----------------|
| precision |  1        |  0.978723 |   0.991228 |    0.989362 |       0.991415 |
| recall    |  0.985294 |  1        |   0.991228 |    0.992647 |       0.991228 |
| f1-score  |  0.992593 |  0.989247 |   0.991228 |    0.99092  |       0.991243 |
| support   | 68        | 46        |   0.991228 |  114        |     114        |
+-----------+-----------+-----------+------------+-------------+----------------+
				 Cross Validation
Best: 0.978019 using {'C': 1, 'penalty': 'l2', 'solver': 'liblinear'}
0.953060 (0.033640) with: {'C': 0.01, 'penalty': 'l2', 'solver': 'lbfgs'}
0.969936 (0.024482) with: {'C': 0.01, 'penalty': 'l2', 'solver': 'liblinear'}
0.97655

**Dataset-2**

Preprocessing Data 

In [76]:

def preprocess(df_dataset_2):
  X,y = df_dataset_2.iloc[:,:-1], df_dataset_2.iloc[:,-1]
  le = LabelEncoder()
  scaler = StandardScaler()
  X_without_categorical = X.iloc[:,X.columns!=4]
  X_with_categorical = le.fit_transform(X.iloc[:,X.columns==4]).reshape(-1,1)
  X = np.append(X_without_categorical,X_with_categorical,1)
  X = scaler.fit_transform(X)
  return X,y

X,y = preprocess(df_dataset_2)
X_train, X_test,y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)

Using Default Hyperparameters

In [77]:
def train_default(X_train,y_train,X_test,y_test):
  clf_default = LogisticRegression()
  clf_default.fit(X_train,y_train)
  y_hat = clf_default.predict(X_test)
  score = clf_default.score(X_test,y_test) * 100
  print("The accuracy of the default hyperparameters on the test set is {:.2f}%".format(score))
  print("the hyperparamters are : {}".format(clf_default.get_params()))
  accuracy_score(y_test, y_hat)
  clf_report = pd.DataFrame(classification_report(y_test, y_hat, output_dict=True))
  print(tabulate(clf_report, headers='keys', tablefmt='psql'))

train_default(X_train,y_train,X_test,y_test)

The accuracy of the default hyperparameters on the test set is 72.04%
the hyperparamters are : {'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}
+-----------+-----------+-----------+------------+-------------+----------------+
|           |         0 |         1 |   accuracy |   macro avg |   weighted avg |
|-----------+-----------+-----------+------------+-------------+----------------|
| precision |  0.815385 |  0.5      |    0.72043 |    0.657692 |       0.727213 |
| recall    |  0.791045 |  0.538462 |    0.72043 |    0.664753 |       0.72043  |
| f1-score  |  0.80303  |  0.518519 |    0.72043 |    0.660774 |       0.723489 |
| support   | 67        | 26        |    0.72043 |   93        |      93        |
+-----------+-----------+-----------+------------+----

In [78]:
def search_parameter_space(grid):
  clf = LogisticRegression()
  cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
  grid_search = GridSearchCV(estimator=clf, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
  grid_result = grid_search.fit(X_train, y_train)
  score = grid_result.score(X_test,y_test) * 100
  print("The accuracy of the tuned hyperparameters on the test set without cross validation is {:.2f}%".format(score))
  y_hat = grid_result.predict(X_test)
  accuracy_score(y_test, y_hat)
  clf_report = pd.DataFrame(classification_report(y_test, y_hat, output_dict=True))
  print(tabulate(clf_report, headers='keys', tablefmt='psql'))
  print("\t\t\t\t Cross Validation")
  print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
  means = grid_result.cv_results_['mean_test_score']
  stds = grid_result.cv_results_['std_test_score']
  params = grid_result.cv_results_['params']
  for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))
  print("\n\nCross Validation Results for the Best Fit :-")
  cv_results = cross_validate(grid_search, X_train,y_train, cv=cv, scoring=('accuracy', 'precision', 'recall', 'f1','roc_auc'), return_train_score=True)
  print('\n\nCross-Validation accuracy: %.3f +/- %.3f' %(np.mean(cv_results.get('test_accuracy')), np.std(cv_results.get('test_accuracy'))))
  print('\nList of possible accuracy:', cv_results.get('test_accuracy'))
  print('\nMaximum Accuracy That can be obtained from this model is:', max(cv_results.get('test_accuracy'))*100, '%')
  print('\nMinimum Accuracy:', min(cv_results.get('test_accuracy'))*100, '%')
  print('\nOverall Accuracy:', (np.mean(cv_results.get('test_accuracy'))*100, '%'))
  print('\nStandard Deviation is:', np.std(cv_results.get('test_accuracy')))
  #print(classification_report(y_test, y_hat))
solvers = ['liblinear','lbfgs']
penalty = ['l2']
c_values = np.linspace(0.1,10,50)
grid = dict(solver=solvers,penalty=penalty,C=c_values)  
search_parameter_space(grid)

The accuracy of the tuned hyperparameters on the test set without cross validation is 72.04%
+-----------+-----------+-----------+------------+-------------+----------------+
|           |         0 |         1 |   accuracy |   macro avg |   weighted avg |
|-----------+-----------+-----------+------------+-------------+----------------|
| precision |  0.815385 |  0.5      |    0.72043 |    0.657692 |       0.727213 |
| recall    |  0.791045 |  0.538462 |    0.72043 |    0.664753 |       0.72043  |
| f1-score  |  0.80303  |  0.518519 |    0.72043 |    0.660774 |       0.723489 |
| support   | 67        | 26        |    0.72043 |   93        |      93        |
+-----------+-----------+-----------+------------+-------------+----------------+
				 Cross Validation
Best: 0.726476 using {'C': 0.9081632653061225, 'penalty': 'l2', 'solver': 'liblinear'}
0.721922 (0.067534) with: {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}
0.725551 (0.064888) with: {'C': 0.3020408163265306, 'penalty': 'l