# Template for evaluating several ML classification models

## Import basic libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Read the raw table from the CSV file in the working directory.
dataset = pd.read_csv('Data_for_UCI_named.csv')

# Target: the last column of the table.
y = dataset.iloc[:, -1].values
# Features: every column except the last two, so the second-to-last column
# is used neither as a feature nor as the target.
X = dataset.iloc[:, :-2].values

In [None]:
# Show the dimensions and only the first rows of the feature matrix rather
# than printing the whole array; the shape is the more useful sanity check.
print(X.shape)
print(X[:5])

## Data Preprocessing

In [13]:
### Impute missing data

In [3]:
from sklearn.impute import SimpleImputer

# Count the missing entries in each column of the raw table.
print(dataset.isna().sum())

# Replace NaN entries in the feature matrix with the mean of their column.
# NOTE(review): the imputer is fitted on the full feature matrix, i.e. before
# the train/test split performed later in this notebook — confirm this is
# acceptable if the data ever contains missing values.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X = imputer.fit_transform(X)

tau1     0
tau2     0
tau3     0
tau4     0
p1       0
p2       0
p3       0
p4       0
g1       0
g2       0
g3       0
g4       0
stab     0
stabf    0
dtype: int64


### Encode the labels

In [4]:
from sklearn.preprocessing import LabelEncoder

# Convert the class labels in y to integer codes. The fitted encoder stays
# available as `le`, so the codes can be mapped back to the original labels
# with `le.inverse_transform`.
le = LabelEncoder()
le.fit(y)
y = le.transform(y)
print(y)

[1 0 1 ... 0 1 1]


### Splitting the dataset into training and test data

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20 % of the samples as the test set. The fixed random_state makes
# the shuffle, and therefore the split, reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

### Feature scaling

In [6]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply that
# same fitted transformation to both the training and the test split.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

# Comparing the ML classifiers

## Import the classifier classes

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV

## Model and parameter definitions

In [11]:
# Three parallel lists describe the models under comparison: entry i of
# `names`, `classifiers` and `parameters` belongs to the same model.
# When adding, removing or (un)commenting a model, change all three lists.

# Display names used in the printed result tables.
names = [
    "Logistic Regression",
    "K-nearest Neighbors",
    "SVM",
    #"Linear SVM",
    #"RBF SVM",
    "Naive Bayes",
    "Decision Tree",
    "Random Forest",
]

# Estimator instances at their standard settings; random_state is fixed
# where the estimator accepts it so that repeated runs give the same result.
classifiers = [
    LogisticRegression(random_state=0),
    KNeighborsClassifier(metric = 'minkowski', p = 2),
    SVC(random_state=0),
    #SVC(kernel = 'linear', random_state = 0),
    #SVC(kernel = 'rbf', random_state = 0),
    GaussianNB(),
    DecisionTreeClassifier(random_state = 0),
    RandomForestClassifier(random_state = 0)
]

# Hyperparameter grids for the grid search: one list of grid dictionaries
# per classifier. An empty dictionary means "evaluate the estimator as is".
parameters = [
    # Logistic Regression
    [
        {'C': [0.25, 0.5, 0.75, 1, 10]}
    ],
    # K-nearest Neighbors
    [
        {'n_neighbors': [2, 3, 4, 5, 10, 20, 100], 'weights': ['uniform', 'distance']}
    ],
    # SVM
    # NOTE(review): only the linear kernel is searched, so the default kernel
    # of SVC (rbf) used in the standard-settings comparison is not among the
    # grid candidates. Enable the grids below to include other kernels.
    [
        {'C': [0.25, 0.5, 0.75, 1, 10], 'kernel': ['linear']},
        #{'C': [0.25, 0.5, 0.75, 1, 10], 'kernel': ['poly'], 'degree': [2, 3, 4, 5]},
        #{'C': [0.25, 0.5, 0.75, 1, 10], 'kernel': ['rbf'], 'gamma': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]}
    ],
    # Naive Bayes: nothing to tune
    [
        {}
    ],
    # Decision Tree
    [
        {'criterion': ['gini', 'entropy', 'log_loss'], 'max_depth': [2, 3, 4, 5, 8, 12, 20]}
    ],
    # Random Forest
    [
        {'n_estimators': [5, 10, 20, 100, 1000], 'max_depth': [2, 3, 4, 5, 8, 12, 20, 100], 'criterion': ['gini', 'entropy', 'log_loss']}
    ]
]

# The loops further down iterate over these lists with zip(), which silently
# stops at the shortest one; fail loudly here if they ever get out of step.
assert len(names) == len(classifiers) == len(parameters), \
    "names, classifiers and parameters must have the same number of entries"

## Model Comparison at standard settings

In [10]:
# Baseline comparison: fit every classifier at its standard settings on the
# training split and report the accuracy on the held-out test split.

# Table header. The column labels are padded as string expressions; an empty
# replacement field such as {:20} is not valid inside an f-string.
print(f"{'Classifier':20} {'Accuracy':>8}")
for name, classifier in zip(names, classifiers):
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    # Accuracy as a percentage, right-aligned under the "Accuracy" header.
    print(f"{name:20} {accuracy_score(y_test, y_pred) * 100:7.2f}%")

Classifier: Logistic Regression ,  accuracy: 80.40%
Classifier: K-nearest Neighbors ,  accuracy: 86.50%
Classifier: SVM                 ,  accuracy: 95.85%
Classifier: Naive Bayes         ,  accuracy: 82.90%
Classifier: Decision Tree       ,  accuracy: 85.50%
Classifier: Random Forest       ,  accuracy: 92.45%


## Hyperparameter tuning using Grid Search including k-fold Cross Validation

In [12]:
# Hyperparameter tuning: run a cross-validated grid search for every
# classifier on the training split and report the best mean CV accuracy
# together with the parameter combination that achieved it.

# Table header. The column labels are padded as string expressions; an empty
# replacement field such as {:20} is not valid inside an f-string.
print(f"{'Classifier':20} {'Best CV accuracy':>16}  Parameter combination")
for name, classifier, params in zip(names, classifiers, parameters):
    grid_search = GridSearchCV(
        estimator=classifier,
        param_grid=params,
        scoring='accuracy',
        cv=10,       # number of cross-validation folds
        n_jobs=-1,   # number of CPU cores used; -1 means use all cores
    )
    grid_search.fit(X_train, y_train)
    # best_score_ is a fraction; print it as a percentage under the header.
    print(f"{name:20} {grid_search.best_score_ * 100:15.2f}%  {grid_search.best_params_}")

Classifier: Logistic Regression , Best accuracy:  81.74 at parameters: {'C': 0.25}
Classifier: K-nearest Neighbors , Best accuracy:  87.19 at parameters: {'n_neighbors': 20, 'weights': 'uniform'}
Classifier: SVM                 , Best accuracy:  81.74 at parameters: {'C': 0.25, 'kernel': 'linear'}
Classifier: Naive Bayes         , Best accuracy:  83.25 at parameters: {}
Classifier: Decision Tree       , Best accuracy:  85.15 at parameters: {'criterion': 'entropy', 'max_depth': 12}
Classifier: Random Forest       , Best accuracy:  92.28 at parameters: {'criterion': 'gini', 'max_depth': 20, 'n_estimators': 1000}
