<a href="https://colab.research.google.com/github/LivaIg/Diabetes-classification/blob/main/diabetes_classification_shallow_learners.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Diabetes Prediction Challange with shallow learners**
Task is to build a machine learning classifier that predicts whether a patient is diagnosed with diabetes or not, based on a set of health indicators.

The dataset includes 21 features describing lifestyle, demographic, and health-related factors (e.g., BMI, smoking status, physical activity, age, blood pressure, cholesterol levels, etc.).

In [None]:
#loading datasets
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns

# Import datasets, classifiers and performance metrics
from sklearn import datasets, svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix ,accuracy_score,mean_squared_error, r2_score, mean_absolute_error

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier

from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, MinMaxScaler, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif, RFE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, RocCurveDisplay
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import (RandomForestRegressor, GradientBoostingRegressor, GradientBoostingClassifier,
                              VotingClassifier, VotingRegressor, StackingClassifier,
                              StackingRegressor)
from sklearn.svm import SVR



# Loading the datasets

In [None]:
X_test = pd.read_csv('X_test.csv')
y_train = pd.read_csv('y_train.csv')
X_train = pd.read_csv('X_train.csv')
X_test.head()
X_train.head()
y_train.head()

Unnamed: 0,ID,Diabetes
0,18165,No
1,38450,Yes
2,42473,Yes
3,13478,No
4,69265,Yes


# Prepare the data and perform one-hot encoding

In [None]:
y_train = y_train.drop(columns=["ID"])
y_train = y_train["Diabetes"].map({'Yes': 1, 'No': 0})

In [None]:
X_train = X_train.drop(columns=['ID'], errors='ignore')
X_test = X_test.drop(columns=['ID'], errors='ignore')

In [None]:
enc = OneHotEncoder(handle_unknown='ignore')
oneHotEncodedColumn = enc.fit_transform(X_train["Gender"].to_numpy().reshape(-1, 1))
print(oneHotEncodedColumn[:25].toarray())

[[0. 1.]
 [0. 1.]
 [0. 1.]
 [1. 0.]
 [1. 0.]
 [0. 1.]
 [1. 0.]
 [0. 1.]
 [1. 0.]
 [0. 1.]
 [1. 0.]
 [1. 0.]
 [0. 1.]
 [1. 0.]
 [0. 1.]
 [1. 0.]
 [0. 1.]
 [1. 0.]
 [0. 1.]
 [0. 1.]
 [1. 0.]
 [1. 0.]
 [1. 0.]
 [1. 0.]
 [0. 1.]]


In [None]:

# Get feature names from encoder
gender_encoded_df = pd.DataFrame(
    oneHotEncodedColumn.toarray(),
    columns=enc.get_feature_names_out(["Gender"]),
    index=X_train.index  # Keep original index for alignment
)

# Drop original Gender column and concatenate encoded columns
X = X_train.drop("Gender", axis=1)
X = pd.concat([X, gender_encoded_df], axis=1)


# Split to test-train data sets



In [None]:
X_train_split, X_val, y_train_split, y_val = train_test_split(X, y_train, test_size=0.2, random_state=42)


# Evaluating the best performance for benchmark

In [None]:


models = {
    "Logistic Regression": LogisticRegression(random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "SVC_rbf": SVC(kernel= 'rbf',random_state=42),
    "SVC_poly": SVC(kernel= 'poly',random_state=42)
}

for name, model in models.items():
    model.fit(X_train_split, y_train_split)
    y_val_pred = model.predict(X_val)
    acc = accuracy_score(y_val, y_val_pred)
    print(f"{name} Accuracy: {acc:.4f}")


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression Accuracy: 0.7379
Decision Tree Accuracy: 0.6541
Random Forest Accuracy: 0.7401
Gradient Boosting Accuracy: 0.7506
SVC_rbf Accuracy: 0.7472
SVC_poly Accuracy: 0.7457


# Performing parameter tuning on base models

In [None]:
#parameter tuning

param_grid = {
    'C': [0.01, 0.1, 1, 10],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'saga']
}

model=LogisticRegression(random_state=42)
grid = GridSearchCV(model, param_grid, refit = True, verbose = 3,n_jobs=1)

# fitting the model for grid search
grid.fit(X_train_split, y_train_split)

# print best parameter after tuning
print(grid.best_params_)
grid_predictions = grid.predict(X_val)

# print classification report
print(classification_report(y_val, grid_predictions))

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV 1/5] END C=0.01, penalty=l1, solver=liblinear;, score=0.752 total time=   0.8s
[CV 2/5] END C=0.01, penalty=l1, solver=liblinear;, score=0.745 total time=   1.1s
[CV 3/5] END C=0.01, penalty=l1, solver=liblinear;, score=0.745 total time=   1.1s
[CV 4/5] END C=0.01, penalty=l1, solver=liblinear;, score=0.750 total time=   1.3s
[CV 5/5] END C=0.01, penalty=l1, solver=liblinear;, score=0.751 total time=   1.1s




[CV 1/5] END ...C=0.01, penalty=l1, solver=saga;, score=0.752 total time=   2.1s




[CV 2/5] END ...C=0.01, penalty=l1, solver=saga;, score=0.745 total time=   2.1s




[CV 3/5] END ...C=0.01, penalty=l1, solver=saga;, score=0.746 total time=   2.1s




[CV 4/5] END ...C=0.01, penalty=l1, solver=saga;, score=0.750 total time=   2.1s




[CV 5/5] END ...C=0.01, penalty=l1, solver=saga;, score=0.751 total time=   2.9s
[CV 1/5] END C=0.01, penalty=l2, solver=liblinear;, score=0.750 total time=   0.3s
[CV 2/5] END C=0.01, penalty=l2, solver=liblinear;, score=0.744 total time=   0.3s
[CV 3/5] END C=0.01, penalty=l2, solver=liblinear;, score=0.741 total time=   0.3s
[CV 4/5] END C=0.01, penalty=l2, solver=liblinear;, score=0.748 total time=   0.3s
[CV 5/5] END C=0.01, penalty=l2, solver=liblinear;, score=0.749 total time=   0.3s




[CV 1/5] END ...C=0.01, penalty=l2, solver=saga;, score=0.753 total time=   1.9s




[CV 2/5] END ...C=0.01, penalty=l2, solver=saga;, score=0.746 total time=   1.8s




[CV 3/5] END ...C=0.01, penalty=l2, solver=saga;, score=0.747 total time=   1.8s




[CV 4/5] END ...C=0.01, penalty=l2, solver=saga;, score=0.751 total time=   1.8s




[CV 5/5] END ...C=0.01, penalty=l2, solver=saga;, score=0.750 total time=   1.8s
[CV 1/5] END C=0.1, penalty=l1, solver=liblinear;, score=0.759 total time=   6.9s
[CV 2/5] END C=0.1, penalty=l1, solver=liblinear;, score=0.751 total time=   6.5s
[CV 3/5] END C=0.1, penalty=l1, solver=liblinear;, score=0.750 total time=   6.9s
[CV 4/5] END C=0.1, penalty=l1, solver=liblinear;, score=0.752 total time=   4.3s
[CV 5/5] END C=0.1, penalty=l1, solver=liblinear;, score=0.751 total time=   6.5s




[CV 1/5] END ....C=0.1, penalty=l1, solver=saga;, score=0.755 total time=   4.3s




[CV 2/5] END ....C=0.1, penalty=l1, solver=saga;, score=0.748 total time=   2.6s




[CV 3/5] END ....C=0.1, penalty=l1, solver=saga;, score=0.748 total time=   2.2s




[CV 4/5] END ....C=0.1, penalty=l1, solver=saga;, score=0.752 total time=   2.2s




[CV 5/5] END ....C=0.1, penalty=l1, solver=saga;, score=0.752 total time=   2.4s
[CV 1/5] END C=0.1, penalty=l2, solver=liblinear;, score=0.757 total time=   0.4s
[CV 2/5] END C=0.1, penalty=l2, solver=liblinear;, score=0.750 total time=   0.4s
[CV 3/5] END C=0.1, penalty=l2, solver=liblinear;, score=0.749 total time=   0.4s
[CV 4/5] END C=0.1, penalty=l2, solver=liblinear;, score=0.752 total time=   0.4s
[CV 5/5] END C=0.1, penalty=l2, solver=liblinear;, score=0.751 total time=   0.8s




[CV 1/5] END ....C=0.1, penalty=l2, solver=saga;, score=0.754 total time=   3.3s




[CV 2/5] END ....C=0.1, penalty=l2, solver=saga;, score=0.748 total time=   1.8s




[CV 3/5] END ....C=0.1, penalty=l2, solver=saga;, score=0.748 total time=   1.8s




[CV 4/5] END ....C=0.1, penalty=l2, solver=saga;, score=0.752 total time=   1.8s




[CV 5/5] END ....C=0.1, penalty=l2, solver=saga;, score=0.751 total time=   1.7s
[CV 1/5] END .C=1, penalty=l1, solver=liblinear;, score=0.760 total time=   1.0s
[CV 2/5] END .C=1, penalty=l1, solver=liblinear;, score=0.752 total time=   1.4s
[CV 3/5] END .C=1, penalty=l1, solver=liblinear;, score=0.749 total time=   1.9s
[CV 4/5] END .C=1, penalty=l1, solver=liblinear;, score=0.752 total time=   2.0s
[CV 5/5] END .C=1, penalty=l1, solver=liblinear;, score=0.750 total time=   0.9s




[CV 1/5] END ......C=1, penalty=l1, solver=saga;, score=0.754 total time=   2.2s




[CV 2/5] END ......C=1, penalty=l1, solver=saga;, score=0.749 total time=   2.2s




[CV 3/5] END ......C=1, penalty=l1, solver=saga;, score=0.748 total time=   2.1s




[CV 4/5] END ......C=1, penalty=l1, solver=saga;, score=0.752 total time=   2.6s




[CV 5/5] END ......C=1, penalty=l1, solver=saga;, score=0.751 total time=   4.5s
[CV 1/5] END .C=1, penalty=l2, solver=liblinear;, score=0.760 total time=   0.4s
[CV 2/5] END .C=1, penalty=l2, solver=liblinear;, score=0.752 total time=   0.4s
[CV 3/5] END .C=1, penalty=l2, solver=liblinear;, score=0.749 total time=   0.4s
[CV 4/5] END .C=1, penalty=l2, solver=liblinear;, score=0.752 total time=   0.7s
[CV 5/5] END .C=1, penalty=l2, solver=liblinear;, score=0.751 total time=   0.5s




[CV 1/5] END ......C=1, penalty=l2, solver=saga;, score=0.754 total time=   2.0s




[CV 2/5] END ......C=1, penalty=l2, solver=saga;, score=0.749 total time=   1.8s




[CV 3/5] END ......C=1, penalty=l2, solver=saga;, score=0.748 total time=   1.8s




[CV 4/5] END ......C=1, penalty=l2, solver=saga;, score=0.752 total time=   3.1s




[CV 5/5] END ......C=1, penalty=l2, solver=saga;, score=0.751 total time=   3.2s
[CV 1/5] END C=10, penalty=l1, solver=liblinear;, score=0.760 total time=   1.3s
[CV 2/5] END C=10, penalty=l1, solver=liblinear;, score=0.752 total time=   1.9s
[CV 3/5] END C=10, penalty=l1, solver=liblinear;, score=0.749 total time=   1.8s
[CV 4/5] END C=10, penalty=l1, solver=liblinear;, score=0.752 total time=   1.3s
[CV 5/5] END C=10, penalty=l1, solver=liblinear;, score=0.750 total time=   0.9s




[CV 1/5] END .....C=10, penalty=l1, solver=saga;, score=0.754 total time=   3.6s




[CV 2/5] END .....C=10, penalty=l1, solver=saga;, score=0.749 total time=   2.7s




[CV 3/5] END .....C=10, penalty=l1, solver=saga;, score=0.748 total time=   2.6s




[CV 4/5] END .....C=10, penalty=l1, solver=saga;, score=0.752 total time=   2.3s




[CV 5/5] END .....C=10, penalty=l1, solver=saga;, score=0.751 total time=   2.6s
[CV 1/5] END C=10, penalty=l2, solver=liblinear;, score=0.760 total time=   0.4s
[CV 2/5] END C=10, penalty=l2, solver=liblinear;, score=0.752 total time=   0.3s
[CV 3/5] END C=10, penalty=l2, solver=liblinear;, score=0.749 total time=   0.4s
[CV 4/5] END C=10, penalty=l2, solver=liblinear;, score=0.752 total time=   0.4s
[CV 5/5] END C=10, penalty=l2, solver=liblinear;, score=0.751 total time=   0.5s




[CV 1/5] END .....C=10, penalty=l2, solver=saga;, score=0.754 total time=   4.5s




[CV 2/5] END .....C=10, penalty=l2, solver=saga;, score=0.749 total time=   2.0s




[CV 3/5] END .....C=10, penalty=l2, solver=saga;, score=0.748 total time=   2.4s




[CV 4/5] END .....C=10, penalty=l2, solver=saga;, score=0.752 total time=   2.4s




[CV 5/5] END .....C=10, penalty=l2, solver=saga;, score=0.751 total time=   3.0s
{'C': 0.1, 'penalty': 'l1', 'solver': 'liblinear'}
              precision    recall  f1-score   support

           0       0.75      0.73      0.74      5296
           1       0.74      0.76      0.75      5308

    accuracy                           0.74     10604
   macro avg       0.75      0.74      0.74     10604
weighted avg       0.75      0.74      0.74     10604



In [None]:
param_grid = {
    'C': [0.1, 1, 10],
    'gamma': ['scale', 'auto']
}


model=SVC(random_state=42,kernel='rbf')
grid = GridSearchCV(model, param_grid, refit = True, verbose = 3,n_jobs=1)

# fitting the model for grid search
grid.fit(X_train_split, y_train_split)

# print best parameter after tuning
print(grid.best_params_)
grid_predictions = grid.predict(X_val)

# print classification report
print(classification_report(y_val, grid_predictions))

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV 1/5] END ................C=0.1, gamma=scale;, score=0.750 total time= 1.2min
[CV 2/5] END ................C=0.1, gamma=scale;, score=0.729 total time= 1.1min
[CV 3/5] END ................C=0.1, gamma=scale;, score=0.741 total time= 1.2min
[CV 4/5] END ................C=0.1, gamma=scale;, score=0.746 total time= 1.2min
[CV 5/5] END ................C=0.1, gamma=scale;, score=0.745 total time= 1.1min
[CV 1/5] END .................C=0.1, gamma=auto;, score=0.745 total time= 1.2min
[CV 2/5] END .................C=0.1, gamma=auto;, score=0.721 total time= 1.2min
[CV 3/5] END .................C=0.1, gamma=auto;, score=0.737 total time= 1.2min
[CV 4/5] END .................C=0.1, gamma=auto;, score=0.739 total time= 1.2min
[CV 5/5] END .................C=0.1, gamma=auto;, score=0.737 total time= 1.3min
[CV 1/5] END ..................C=1, gamma=scale;, score=0.760 total time= 1.0min
[CV 2/5] END ..................C=1, gamma=scale;,

In [None]:
param_grid = {
    'C': [0.1, 1, 10],
    'gamma': ['scale', 'auto'],
    'degree': [2, 3, 4]  # only for 'poly' kernel
}


model=SVC(random_state=42,kernel='poly')
grid = RandomizedSearchCV(model, param_grid, refit = True, verbose = 3,n_jobs=1)

# fitting the model for grid search
grid.fit(X_train_split, y_train_split)

# print best parameter after tuning
print(grid.best_params_)
grid_predictions = grid.predict(X_val)

# print classification report
print(classification_report(y_val, grid_predictions))

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END .......C=0.1, degree=2, gamma=auto;, score=0.763 total time= 2.4min
[CV 2/5] END .......C=0.1, degree=2, gamma=auto;, score=0.750 total time= 2.4min
[CV 3/5] END .......C=0.1, degree=2, gamma=auto;, score=0.754 total time= 2.4min
[CV 4/5] END .......C=0.1, degree=2, gamma=auto;, score=0.754 total time= 2.5min
[CV 5/5] END .......C=0.1, degree=2, gamma=auto;, score=0.759 total time= 2.4min
[CV 1/5] END .......C=10, degree=4, gamma=scale;, score=0.757 total time= 1.4min
[CV 2/5] END .......C=10, degree=4, gamma=scale;, score=0.748 total time= 1.5min
[CV 3/5] END .......C=10, degree=4, gamma=scale;, score=0.752 total time= 1.4min
[CV 4/5] END .......C=10, degree=4, gamma=scale;, score=0.756 total time= 1.4min
[CV 5/5] END .......C=10, degree=4, gamma=scale;, score=0.754 total time= 1.4min
[CV 1/5] END .......C=10, degree=3, gamma=scale;, score=0.760 total time=  58.4s
[CV 2/5] END .......C=10, degree=3, gamma=scale;

In [None]:
param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 10],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'subsample': [0.8, 1.0],
    'max_features': ['auto', 'sqrt']
}

model=GradientBoostingClassifier(random_state=42)
grid = GridSearchCV(model, param_grid, refit = True, verbose = 3,n_jobs=1)

# fitting the model for grid search
grid.fit(X_train_split, y_train_split)

# print best parameter after tuning
print(grid.best_params_)
grid_predictions = grid.predict(X_val)

# print classification report
print(classification_report(y_val, grid_predictions))

Fitting 5 folds for each of 288 candidates, totalling 1440 fits
[CV 1/5] END learning_rate=0.01, max_depth=3, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=100, subsample=0.8;, score=nan total time=   0.0s
[CV 2/5] END learning_rate=0.01, max_depth=3, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=100, subsample=0.8;, score=nan total time=   0.0s
[CV 3/5] END learning_rate=0.01, max_depth=3, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=100, subsample=0.8;, score=nan total time=   0.0s
[CV 4/5] END learning_rate=0.01, max_depth=3, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=100, subsample=0.8;, score=nan total time=   0.0s
[CV 5/5] END learning_rate=0.01, max_depth=3, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=100, subsample=0.8;, score=nan total time=   0.0s
[CV 1/5] END learning_rate=0.01, max_depth=3, max_features=auto, min_samples_leaf=1, min_sa

720 fits failed out of a total of 1440.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
720 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.12/dist-packages/sklearn/base.py", line 1382, in wrapper
    estimator._validate_params()
  File "/usr/local/lib/python3.12/dist-packages/sklearn/base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.12/dist-packages/sklearn/utils/_param_validation.py", line 98, in validate_parameter_constraints
    raise InvalidParameterError(
s

{'learning_rate': 0.1, 'max_depth': 5, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100, 'subsample': 0.8}
              precision    recall  f1-score   support

           0       0.77      0.72      0.74      5296
           1       0.74      0.79      0.76      5308

    accuracy                           0.75     10604
   macro avg       0.75      0.75      0.75     10604
weighted avg       0.75      0.75      0.75     10604



In [None]:
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2'],
    'bootstrap': [True, False]
}


model=RandomForestClassifier(random_state=42)
grid = GridSearchCV(model, param_grid, refit = True, verbose = 3,n_jobs=1)

# fitting the model for grid search
grid.fit(X_train_split, y_train_split)

# print best parameter after tuning
print(grid.best_params_)
grid_predictions = grid.predict(X_val)

# print classification report
print(classification_report(y_val, grid_predictions))

Fitting 5 folds for each of 648 candidates, totalling 3240 fits
[CV 1/5] END bootstrap=True, max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=nan total time=   0.0s
[CV 2/5] END bootstrap=True, max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=nan total time=   0.0s
[CV 3/5] END bootstrap=True, max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=nan total time=   0.0s
[CV 4/5] END bootstrap=True, max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=nan total time=   0.0s
[CV 5/5] END bootstrap=True, max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=nan total time=   0.0s
[CV 1/5] END bootstrap=True, max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=200;, score=nan total time=   0.0s
[CV 2/5] END boots

1080 fits failed out of a total of 3240.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1080 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.12/dist-packages/sklearn/base.py", line 1382, in wrapper
    estimator._validate_params()
  File "/usr/local/lib/python3.12/dist-packages/sklearn/base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.12/dist-packages/sklearn/utils/_param_validation.py", line 98, in validate_parameter_constraints
    raise InvalidParameterError(

{'bootstrap': False, 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 300}
              precision    recall  f1-score   support

           0       0.77      0.71      0.74      5296
           1       0.73      0.79      0.76      5308

    accuracy                           0.75     10604
   macro avg       0.75      0.75      0.75     10604
weighted avg       0.75      0.75      0.75     10604



In [None]:
param_grid = {
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None],
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'class_weight': [None, 'balanced']
}


model=DecisionTreeClassifier(random_state=42)
grid = GridSearchCV(model, param_grid, refit = True, verbose = 3,n_jobs=1)

# fitting the model for grid search
grid.fit(X_train_split, y_train_split)

# print best parameter after tuning
print(grid.best_params_)
grid_predictions = grid.predict(X_val)

# print classification report
print(classification_report(y_val, grid_predictions))

Fitting 5 folds for each of 864 candidates, totalling 4320 fits
[CV 1/5] END class_weight=None, criterion=gini, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, splitter=best;, score=0.670 total time=   0.1s
[CV 2/5] END class_weight=None, criterion=gini, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, splitter=best;, score=0.658 total time=   0.1s
[CV 3/5] END class_weight=None, criterion=gini, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, splitter=best;, score=0.669 total time=   0.1s
[CV 4/5] END class_weight=None, criterion=gini, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, splitter=best;, score=0.658 total time=   0.1s
[CV 5/5] END class_weight=None, criterion=gini, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, splitter=best;, score=0.656 total time=   0.1s
[CV 1/5] END class_weight=None, criterion=gini, max_depth=None, max_features=sqrt

# Testing best parameter combinations with base models for best performance

In [None]:
models = {
    "Logistic Regression": LogisticRegression(random_state=42,C=0.1,penalty='l1',solver='liblinear'),
    "Decision Tree": DecisionTreeClassifier(random_state=42,class_weight= 'balanced', criterion= 'entropy', max_depth =10, max_features= None, min_samples_leaf= 1, min_samples_split= 2, splitter= 'random'),
    "Random Forest": RandomForestClassifier(random_state=42,bootstrap=False,max_depth=10,max_features='sqrt',n_estimators=300),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42,learning_rate=0.1,max_depth=5,max_features='sqrt',min_samples_leaf=1,min_samples_split=5,n_estimators=100,subsample=0.8),
    "SVC_rbf": SVC(kernel= 'rbf',random_state=42,C=10,gamma='scale')
}

for name, model in models.items():
    model.fit(X_train_split, y_train_split)
    y_val_pred = model.predict(X_val)
    acc = accuracy_score(y_val, y_val_pred)
    print(f"{name} Accuracy: {acc:.4f}")


Logistic Regression Accuracy: 0.7448
Decision Tree Accuracy: 0.7381
Random Forest Accuracy: 0.7513
Gradient Boosting Accuracy: 0.7525
SVC_rbf Accuracy: 0.7507
