In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.feature_selection import SelectKBest,SelectFromModel, RFE, f_classif
from imblearn.over_sampling import SMOTE
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
import time


In [2]:
# Load the dataset, then discard rows containing missing values and rows that
# exactly duplicate an earlier row. Chained so the cell is idempotent on re-run.
dt = (
    pd.read_csv('actual_dt.csv')
    .dropna()
    .drop_duplicates()
)

In [3]:
# The label is the 'class' column; the feature matrix is every remaining
# column except 'id'.
y = dt['class']
X = dt.drop(columns=['class', 'id'])

In [4]:
# Preview the first rows of the feature matrix as a sanity check.
X.head()

Unnamed: 0,gender,PPE,DFA,RPDE,numPulses,numPeriodsPulses,meanPeriodPulses,stdDevPeriodPulses,locPctJitter,locAbsJitter,...,tqwt_kurtosisValue_dec_27,tqwt_kurtosisValue_dec_28,tqwt_kurtosisValue_dec_29,tqwt_kurtosisValue_dec_30,tqwt_kurtosisValue_dec_31,tqwt_kurtosisValue_dec_32,tqwt_kurtosisValue_dec_33,tqwt_kurtosisValue_dec_34,tqwt_kurtosisValue_dec_35,tqwt_kurtosisValue_dec_36
0,1,0.85247,0.71826,0.57227,240,239,0.008064,8.7e-05,0.00218,1.8e-05,...,1.5466,1.562,2.6445,3.8686,4.2105,5.1221,4.4625,2.6202,3.0004,18.9405
1,1,0.76686,0.69481,0.53966,234,233,0.008258,7.3e-05,0.00195,1.6e-05,...,1.553,1.5589,3.6107,23.5155,14.1962,11.0261,9.5082,6.5245,6.3431,45.178
2,1,0.85083,0.67604,0.58982,232,231,0.00834,6e-05,0.00176,1.5e-05,...,1.5399,1.5643,2.3308,9.4959,10.7458,11.0177,4.8066,2.9199,3.1495,4.7666
3,0,0.41121,0.79672,0.59257,178,177,0.010858,0.000183,0.00419,4.6e-05,...,6.9761,3.7805,3.5664,5.2558,14.0403,4.2235,4.6857,4.846,6.265,4.0603
4,0,0.3279,0.79782,0.53028,236,235,0.008162,0.002669,0.00535,4.4e-05,...,7.8832,6.1727,5.8416,6.0805,5.7621,7.7817,11.6891,8.2103,5.0559,6.1164


In [5]:
# Feature Selection
# Keep the 519 features that score highest under the univariate f_classif test.
# NOTE(review): the selector is fit on every row of X, including rows that the
# later train_test_split call places in the test set, so the hold-out scores
# are optimistic. Fitting the selector on the training split only would avoid
# this leakage.
X_linear = SelectKBest(score_func=f_classif, k=519).fit_transform(X, y)

In [6]:
# Oversampling
# Balance the classes by generating synthetic minority-class samples.
# NOTE(review): resampling runs before the train/test split, so synthetic
# samples derived from training rows can land in the test set (and vice
# versa). Resampling only the training split would keep the test set clean.
smote = SMOTE(random_state=42)
(X_linear_smote, y_linear_smote) = smote.fit_resample(X=X_linear, y=y)


In [7]:
# Feature Scaling
# Standardise each selected feature to zero mean and unit variance.
# NOTE(review): the scaler statistics are computed on the full resampled
# matrix, i.e. before the train/test split, so test rows influence the
# scaling applied to the training rows.
scaler = StandardScaler()
scaler.fit(X_linear_smote)
X_linear_scaled = scaler.transform(X_linear_smote)

In [8]:
# Split data into train and test sets
# 80/20 split with a fixed seed. The inputs have already been feature-selected,
# oversampled and scaled by the preceding cells.
(
    X_train_linear,
    X_test_linear,
    y_train_linear,
    y_test_linear,
) = train_test_split(
    X_linear_scaled,
    y_linear_smote,
    test_size=0.2,
    random_state=42,
)


In [9]:
# Model evaluation helper: fit on the training split, score on the test split.
def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training data and report hold-out metrics.

    The model is (re)fitted in place on ``X_train``/``y_train``, used to
    predict ``X_test``, and the confusion matrix, accuracy, precision and
    recall against ``y_test`` are printed together with the elapsed time.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.
    X_train, y_train : training features and labels passed to ``fit``.
    X_test, y_test : held-out features and labels used for scoring.

    Returns
    -------
    dict
        Keys ``'confusion_matrix'``, ``'accuracy'``, ``'precision'``,
        ``'recall'`` and ``'execution_time'`` (seconds spent in ``fit`` plus
        ``predict``). Returned in addition to printing so results can be
        collected programmatically instead of being copied by hand.
    """
    # perf_counter is monotonic and high-resolution, so the interval cannot be
    # distorted by system clock adjustments the way time.time() can.
    start_time = time.perf_counter()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    execution_time = time.perf_counter() - start_time

    conf_matrix = confusion_matrix(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)

    print(f"Confusion Matrix:\n{conf_matrix}")
    print(f"Accuracy: {accuracy:.2f}")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"Execution Time: {execution_time:.2f} seconds")

    return {
        'confusion_matrix': conf_matrix,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'execution_time': execution_time,
    }

In [10]:
def cross_validate_model(model, X, y, cv=5):
    """Print the per-fold accuracies of ``model`` and their mean and spread.

    Runs ``cross_val_score`` with accuracy scoring over ``cv`` folds of
    ``X``/``y``. Nothing is returned; the scores are only printed.
    """
    fold_scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy')
    mean_accuracy = fold_scores.mean()
    spread = fold_scores.std()
    print(f"Cross-Validation Scores: {fold_scores}")
    print(f"Mean Accuracy: {mean_accuracy:.2f} (+/- {spread:.2f})")


In [11]:
# Naive Bayes
# No hyperparameters are tuned here: fit once on the training split, score on
# the test split, then run 5-fold CV over the full resampled, scaled dataset.
gnb_linear = GaussianNB()
evaluate_model(
    model=gnb_linear,
    X_train=X_train_linear,
    X_test=X_test_linear,
    y_train=y_train_linear,
    y_test=y_test_linear,
)
cross_validate_model(model=gnb_linear, X=X_linear_scaled, y=y_linear_smote)

Confusion Matrix:
[[64 43]
 [23 96]]
Accuracy: 0.71
Precision: 0.69
Recall: 0.81
Execution Time: 0.01 seconds
Cross-Validation Scores: [0.71238938 0.77333333 0.82222222 0.74222222 0.72888889]
Mean Accuracy: 0.76 (+/- 0.04)


***Naive Bayes:***

Test set accuracy: 0.71
Mean cross-validation accuracy: 0.76
Standard deviation: 0.04
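The mean cross-validation accuracy (0.76) is slightly higher than the test set accuracy (0.71), and the standard deviation across folds is moderate (0.04). The two estimates differ by roughly one standard deviation, so there is no sign that the Naive Bayes model is overfitting; its accuracy is simply low on both.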

In [12]:
# Multi-layer Perceptron
# random_state fixes the weight initialisation and batch shuffling so that the
# grid search winner and the reported scores are repeatable across kernel
# restarts, matching the seeded SMOTE, split and Random Forest steps.
mlp_linear = MLPClassifier(random_state=42)
mlp_params = {'hidden_layer_sizes': [(50,), (100,), (200,)], 'max_iter': [500, 1000]}
# Tune on the training split only, using 5-fold CV accuracy.
mlp_cv_linear = GridSearchCV(mlp_linear, mlp_params, cv=5, scoring='accuracy')
mlp_cv_linear.fit(X_train_linear, y_train_linear)
# evaluate_model refits the winning configuration on the training split and
# scores it on the test split; cross_validate_model then runs 5-fold CV over
# the full resampled, scaled dataset.
evaluate_model(mlp_cv_linear.best_estimator_, X_train_linear, X_test_linear, y_train_linear, y_test_linear)
cross_validate_model(mlp_cv_linear.best_estimator_, X_linear_scaled, y_linear_smote)


Confusion Matrix:
[[104   3]
 [  3 116]]
Accuracy: 0.97
Precision: 0.97
Recall: 0.97
Execution Time: 2.41 seconds
Cross-Validation Scores: [0.91150442 0.97333333 0.94222222 0.94666667 0.90222222]
Mean Accuracy: 0.94 (+/- 0.03)


***Multi-layer Perceptron:***

Test set accuracy: 0.97
Mean cross-validation accuracy: 0.94
Standard deviation: 0.03
The mean cross-validation accuracy is slightly lower than the test set accuracy, and the standard deviation is low. This suggests that the Multi-layer Perceptron model may be slightly overfitting, but not to a concerning degree.

In [13]:
# Random Forest
# Grid-search tree count and depth with 5-fold CV accuracy on the training
# split, then score the winning configuration on the test split and with
# 5-fold CV over the full resampled, scaled dataset.
rf_linear = RandomForestClassifier(random_state=42)
rf_params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
}
rf_cv_linear = GridSearchCV(
    estimator=rf_linear,
    param_grid=rf_params,
    scoring='accuracy',
    cv=5,
)
rf_cv_linear.fit(X_train_linear, y_train_linear)
evaluate_model(
    model=rf_cv_linear.best_estimator_,
    X_train=X_train_linear,
    X_test=X_test_linear,
    y_train=y_train_linear,
    y_test=y_test_linear,
)
cross_validate_model(model=rf_cv_linear.best_estimator_, X=X_linear_scaled, y=y_linear_smote)

Confusion Matrix:
[[104   3]
 [  8 111]]
Accuracy: 0.95
Precision: 0.97
Recall: 0.93
Execution Time: 2.08 seconds
Cross-Validation Scores: [0.85840708 0.92888889 0.97333333 0.93777778 0.88444444]
Mean Accuracy: 0.92 (+/- 0.04)


***Random Forest:***

Test set accuracy: 0.95
Mean cross-validation accuracy: 0.92
Standard deviation: 0.04
The mean cross-validation accuracy is lower than the test set accuracy, and the standard deviation is moderate. This may indicate that the Random Forest model is overfitting to some extent, but it's not a severe case.

In [14]:
# SVM Linear Kernel
# Grid-search the regularisation strength C with 5-fold CV accuracy on the
# training split, then score the winning configuration on the test split and
# with 5-fold CV over the full resampled, scaled dataset.
svm_linear = SVC(kernel='linear')
svm_params = {
    'C': [0.1, 1, 10],
}
svm_cv_linear = GridSearchCV(
    estimator=svm_linear,
    param_grid=svm_params,
    scoring='accuracy',
    cv=5,
)
svm_cv_linear.fit(X_train_linear, y_train_linear)
evaluate_model(
    model=svm_cv_linear.best_estimator_,
    X_train=X_train_linear,
    X_test=X_test_linear,
    y_train=y_train_linear,
    y_test=y_test_linear,
)
cross_validate_model(model=svm_cv_linear.best_estimator_, X=X_linear_scaled, y=y_linear_smote)

Confusion Matrix:
[[102   5]
 [ 11 108]]
Accuracy: 0.93
Precision: 0.96
Recall: 0.91
Execution Time: 0.05 seconds
Cross-Validation Scores: [0.83185841 0.92444444 0.91111111 0.92444444 0.91555556]
Mean Accuracy: 0.90 (+/- 0.04)


***SVM Linear Kernel:***

Test set accuracy: 0.93
Mean cross-validation accuracy: 0.90
Standard deviation: 0.04
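The mean cross-validation accuracy is lower than the test set accuracy, and the standard deviation is moderate. The 0.03 gap is within one standard deviation of the fold scores, and the mean is pulled down by a single weak fold (0.83), so this is at most weak evidence that the SVM Linear Kernel model is overfitting to the training data. Comparing training accuracy with these held-out scores would be needed before concluding that regularization techniques or simpler models are required.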

In [15]:
# SVM RBF Kernel
# Grid-search C and the kernel width setting with 5-fold CV accuracy on the
# training split, then score the winning configuration on the test split and
# with 5-fold CV over the full resampled, scaled dataset.
svm_rbf_linear = SVC(kernel='rbf')
svm_rbf_params = {
    'C': [0.1, 1, 10],
    'gamma': ['scale', 'auto'],
}
svm_rbf_cv_linear = GridSearchCV(
    estimator=svm_rbf_linear,
    param_grid=svm_rbf_params,
    scoring='accuracy',
    cv=5,
)
svm_rbf_cv_linear.fit(X_train_linear, y_train_linear)
evaluate_model(
    model=svm_rbf_cv_linear.best_estimator_,
    X_train=X_train_linear,
    X_test=X_test_linear,
    y_train=y_train_linear,
    y_test=y_test_linear,
)
cross_validate_model(model=svm_rbf_cv_linear.best_estimator_, X=X_linear_scaled, y=y_linear_smote)

Confusion Matrix:
[[104   3]
 [  2 117]]
Accuracy: 0.98
Precision: 0.97
Recall: 0.98
Execution Time: 0.06 seconds
Cross-Validation Scores: [0.9159292  0.96444444 0.95555556 0.96888889 0.91111111]
Mean Accuracy: 0.94 (+/- 0.02)


***SVM RBF Kernel:***

Test set accuracy: 0.98
Mean cross-validation accuracy: 0.94
Standard deviation: 0.02
The mean cross-validation accuracy is lower than the test set accuracy, but the standard deviation is low. This indicates that the SVM RBF Kernel model may be overfitting to some extent, but it's not a major concern.