In [9]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
from sklearn.pipeline import Pipeline

In [10]:
data = pd.read_csv('D:\\ml\\Demo Datasets\\Lesson 9\\pima-indians-diabetes.csv')

In [11]:
columns_to_replace = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
for column in columns_to_replace:
    median_value = data[column].median()
    data[column] = data[column].replace(0, median_value)

In [12]:
X = data.drop('Outcome', axis=1)
y = data['Outcome']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

In [13]:
def evaluate_model(model, param_grid):
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=1)
    grid_search.fit(X_train_resampled, y_train_resampled)
    best_model = grid_search.best_estimator_
    predictions = best_model.predict(X_test_scaled)
    accuracy = accuracy_score(y_test, predictions)
    print(f"{best_model.__class__.__name__} Accuracy: {accuracy}")
    print(classification_report(y_test, predictions))
    return best_model

# Logistic Regression
lr_param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
best_lr_model = evaluate_model(LogisticRegression(random_state=42, solver='liblinear'), lr_param_grid)

# Random Forest Classifier
rf_param_grid = {'n_estimators': [50, 100, 200, 300], 'max_depth': [None, 3, 5, 7]}
best_rf_model = evaluate_model(RandomForestClassifier(random_state=42), rf_param_grid)

# AdaBoost Classifier
ada_param_grid = {'n_estimators': [50, 100, 200, 300], 'learning_rate': [0.01, 0.1, 1, 2]}
best_ada_model = evaluate_model(AdaBoostClassifier(random_state=42, algorithm='SAMME'), ada_param_grid)

# XGBoost Classifier
xgb_param_grid = {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 0.2], 'max_depth': [3, 5, 7, 9], 'subsample': [0.6, 0.8, 1]}
best_xgb_model = evaluate_model(XGBClassifier(random_state=42, eval_metric='logloss'), xgb_param_grid)

# Support Vector Machine
svm_param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100], 'gamma': ['scale', 'auto', 0.001, 0.01]}
best_svm_model = evaluate_model(SVC(random_state=42), svm_param_grid)

Fitting 5 folds for each of 7 candidates, totalling 35 fits
LogisticRegression Accuracy: 0.6883116883116883
              precision    recall  f1-score   support

           0       0.80      0.69      0.74        99
           1       0.55      0.69      0.61        55

    accuracy                           0.69       154
   macro avg       0.68      0.69      0.68       154
weighted avg       0.71      0.69      0.69       154

Fitting 5 folds for each of 16 candidates, totalling 80 fits
RandomForestClassifier Accuracy: 0.7597402597402597
              precision    recall  f1-score   support

           0       0.87      0.74      0.80        99
           1       0.63      0.80      0.70        55

    accuracy                           0.76       154
   macro avg       0.75      0.77      0.75       154
weighted avg       0.78      0.76      0.76       154

Fitting 5 folds for each of 16 candidates, totalling 80 fits




AdaBoostClassifier Accuracy: 0.7337662337662337
              precision    recall  f1-score   support

           0       0.85      0.72      0.78        99
           1       0.60      0.76      0.67        55

    accuracy                           0.73       154
   macro avg       0.72      0.74      0.72       154
weighted avg       0.76      0.73      0.74       154

Fitting 5 folds for each of 108 candidates, totalling 540 fits
XGBClassifier Accuracy: 0.7207792207792207
              precision    recall  f1-score   support

           0       0.84      0.70      0.76        99
           1       0.58      0.76      0.66        55

    accuracy                           0.72       154
   macro avg       0.71      0.73      0.71       154
weighted avg       0.75      0.72      0.73       154

Fitting 5 folds for each of 24 candidates, totalling 120 fits
SVC Accuracy: 0.6753246753246753
              precision    recall  f1-score   support

           0       0.77      0.71      0.7

In [14]:
voting_clf = VotingClassifier(estimators=[
    ('lr', best_lr_model), 
    ('rf', best_rf_model), 
    ('ada', best_ada_model), 
    ('xgb', best_xgb_model), 
    ('svm', best_svm_model)
], voting='hard')

# Evaluate the voting classifier
voting_param_grid = {}
best_voting_model = evaluate_model(voting_clf, voting_param_grid)

Fitting 5 folds for each of 1 candidates, totalling 5 fits




VotingClassifier Accuracy: 0.7337662337662337
              precision    recall  f1-score   support

           0       0.85      0.72      0.78        99
           1       0.60      0.76      0.67        55

    accuracy                           0.73       154
   macro avg       0.72      0.74      0.72       154
weighted avg       0.76      0.73      0.74       154

