In [1]:
#importing libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from scipy.stats import randint, uniform
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score,roc_auc_score, roc_curve, classification_report,confusion_matrix
import pickle

In [2]:
#loading PCA dataset from saved data folder
df = pd.read_pickle('saved_data/df_ready.pkl')

In [3]:
#splitting the data
x = df.drop(columns='num')
y = df['num']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=24)

In [4]:
#model and parameter grids
models = {
    'Logistic_Regression': LogisticRegression(),
    'Decision_Tree': DecisionTreeClassifier(),
    'Random_Forest': RandomForestClassifier(),
    'SVM': SVC(probability=True)
}
param_grids = {
    'Logistic_Regression': {
        'C': [0.01, 0.1, 1, 10],
        'penalty': ['l2'],
        'solver': ['lbfgs']
    },
    'Decision_Tree': {
        'max_depth': [5, 10, 15, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    },
    'Random_Forest': {
        'n_estimators': [50, 100, 150],
        'max_depth': [10, 20, 30],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2],
        'bootstrap': [True, False]
    },
    'SVM': {
        'C': [0.1, 1, 10],
        'kernel': ['rbf', 'linear'],
        'gamma': ['scale', 'auto']
    }
}

param_dists = {
    'Logistic_Regression': {
        'C': np.logspace(-3, 2, 20),
        'penalty': ['l2'],
        'solver': ['lbfgs']
    },
    'Decision_Tree': {
        'max_depth': randint(3, 30),
        'min_samples_split': randint(2, 10),
        'min_samples_leaf': randint(1, 5)
    },
    'Random_Forest': {
        'n_estimators': randint(50, 200),
        'max_depth': randint(10, 40),
        'min_samples_split': randint(2, 10),
        'min_samples_leaf': randint(1, 4),
        'bootstrap': [True, False]
    },
    'SVM': {
        'C': np.logspace(-2, 2, 10),
        'kernel': ['rbf', 'linear'],
        'gamma': ['scale', 'auto']
    }
}

In [5]:
#parameter tuning
overall_best_model = None
overall_best_score = 0
best_models = {}

for name, model in models.items():
    grid_search = GridSearchCV(model, param_grids[name], scoring={'AUC': 'roc_auc', 'Acc': 'accuracy'}, refit='Acc')
    grid_search.fit(x_train, y_train)
    grid_pred = grid_search.predict(x_test)
    grid_acc = accuracy_score(y_test, grid_pred)
    rand_search = RandomizedSearchCV(model, param_dists[name], scoring={'AUC': 'roc_auc', 'Acc': 'accuracy'}, refit='Acc', random_state=24)
    rand_search.fit(x_train, y_train)
    rand_pred = rand_search.predict(x_test)
    rand_acc = accuracy_score(y_test, rand_pred)
    if grid_acc >= rand_acc:
        best_model = grid_search.best_estimator_
        best_score = grid_acc
        best_search_type = "GridSearch"
        best_params = grid_search.best_params_
    else:
        best_model = rand_search.best_estimator_
        best_score = rand_acc
        best_search_type = "RandomSearch"
        best_params = rand_search.best_params_
    best_models[name.replace('_', ' ')] = best_model
    print(f" Best {name.replace('_', ' ')}\n Tuning: {best_search_type}\n Accuracy: {best_score:.4f}")
    if best_score > overall_best_score:
        overall_best_model = best_model
        overall_best_score = best_score

 Best Logistic Regression
 Tuning: GridSearch
 Accuracy: 0.8033
 Best Decision Tree
 Tuning: GridSearch
 Accuracy: 0.7705
 Best Random Forest
 Tuning: RandomSearch
 Accuracy: 0.8361
 Best SVM
 Tuning: RandomSearch
 Accuracy: 0.8197


In [6]:
#saving evaluation as a file to compare with optimized models later
text = ""
for name, model in best_models.items():
    y_pred = model.predict(x_test)
    y_proba = model.predict_proba(x_test)[:, 1]
    auc = roc_auc_score(y_test, y_proba)
    metrics = classification_report(y_test, y_pred, target_names=["class 0", "class 1"])
    text += f"{name}:\n"
    text += metrics
    text += f"AUC Score: {auc:.4f}\n\n"
with open("results/after_tuning_evaluation.txt", "w") as f:
    f.write(text)

In [12]:
#pickleing best model (which is Random Forest)
with open(f"saved_models/final_model.pkl", 'wb') as f:
    pickle.dump(overall_best_model, f)