# Explore here

## 1. Data loading

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Load the dataset from the provided URL
url = "https://raw.githubusercontent.com/4GeeksAcademy/decision-tree-project-tutorial/main/diabetes.csv"
df = pd.read_csv(url)

# Check for missing values
print(df.isnull().sum())

# Impute missing values with the median (for numerical columns)
imputer = SimpleImputer(strategy='median')
df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

# Split into features (X) and target (y)
X = df_imputed.drop(columns=['Outcome'])
y = df_imputed['Outcome']

# Scale features for algorithms like XGBoost that benefit from scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into train and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64


###  2: Model Selection & Hyperparameter Tuning 

In [3]:
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Define hyperparameter grid for XGBoost
param_grid = {
    'n_estimators': [50, 100, 200],       # Number of boosting rounds
    'learning_rate': [0.01, 0.1, 0.2],    # Learning rate
    'max_depth': [3, 5, 7],                # Maximum depth of trees
    'min_child_weight': [1, 3, 5],         # Minimum sum of instance weight in a child
    'subsample': [0.6, 0.8, 1.0],          # Fraction of samples used for each tree
    'colsample_bytree': [0.6, 0.8, 1.0]    # Fraction of features used for each tree
}

# Initialize the XGBoost model
xgb_model = xgb.XGBClassifier(random_state=42, eval_metric='logloss')

# Initialize GridSearchCV with the XGBoost model and hyperparameter grid
grid_search = GridSearchCV(estimator=xgb_model, 
                           param_grid=param_grid, 
                           cv=5,              # 5-fold cross-validation
                           n_jobs=-1,         # Use all available CPU cores
                           verbose=2)         # Print progress

# Fit the GridSearchCV to the training data
grid_search.fit(X_train, y_train)

# Print the best hyperparameters and best cross-validation score
print(f"Best Hyperparameters: {grid_search.best_params_}")
print(f"Best Cross-Validation Accuracy: {grid_search.best_score_:.4f}")

# Evaluate the model with the best hyperparameters on the test set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Evaluate the performance on the test set
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy:.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Fitting 5 folds for each of 729 candidates, totalling 3645 fits
[CV] END colsample_bytree=0.6, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=50, subsample=0.6; total time=   0.0s
[CV] END colsample_bytree=0.6, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=50, subsample=0.6; total time=   0.1s
[CV] END colsample_bytree=0.6, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=50, subsample=0.6; total time=   0.0s
[CV] END colsample_bytree=0.6, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=50, subsample=0.6; total time=   0.0s
[CV] END colsample_bytree=0.6, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=50, subsample=0.8; total time=   0.0s
[CV] END colsample_bytree=0.6, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=50, subsample=0.6; total time=   0.0s
[CV] END colsample_bytree=0.6, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=50, subsample=0.8; total time

### 3: Model Comparison 

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# Initialize models
models = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(random_state=42),
    'SVM': SVC(random_state=42)
}

# Train and evaluate each model
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name} Test Accuracy: {accuracy:.4f}")
    print(f"{name} Classification Report:")
    print(classification_report(y_test, y_pred))
    print(f"{name} Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("-" * 50)

Logistic Regression Test Accuracy: 0.7532
Logistic Regression Classification Report:
              precision    recall  f1-score   support

         0.0       0.81      0.80      0.81        99
         1.0       0.65      0.67      0.66        55

    accuracy                           0.75       154
   macro avg       0.73      0.74      0.73       154
weighted avg       0.76      0.75      0.75       154

Logistic Regression Confusion Matrix:
[[79 20]
 [18 37]]
--------------------------------------------------
Random Forest Test Accuracy: 0.7273
Random Forest Classification Report:
              precision    recall  f1-score   support

         0.0       0.79      0.79      0.79        99
         1.0       0.62      0.62      0.62        55

    accuracy                           0.73       154
   macro avg       0.70      0.70      0.70       154
weighted avg       0.73      0.73      0.73       154

Random Forest Confusion Matrix:
[[78 21]
 [21 34]]
-----------------------------

### 4: Voting Classifier

In [5]:
from sklearn.ensemble import VotingClassifier

# Combine the models into a Voting Classifier (hard voting)
ensemble_model = VotingClassifier(estimators=[
    ('logreg', LogisticRegression()),
    ('rf', RandomForestClassifier(random_state=42)),
    ('svm', SVC(random_state=42))
], voting='hard')

# Fit and evaluate the ensemble model
ensemble_model.fit(X_train, y_train)
y_pred_ensemble = ensemble_model.predict(X_test)

# Evaluate the ensemble model
ensemble_accuracy = accuracy_score(y_test, y_pred_ensemble)
print(f"Ensemble Model Test Accuracy: {ensemble_accuracy:.4f}")
print("Ensemble Classification Report:")
print(classification_report(y_test, y_pred_ensemble))
print("Ensemble Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_ensemble))

Ensemble Model Test Accuracy: 0.7597
Ensemble Classification Report:
              precision    recall  f1-score   support

         0.0       0.81      0.82      0.81        99
         1.0       0.67      0.65      0.66        55

    accuracy                           0.76       154
   macro avg       0.74      0.74      0.74       154
weighted avg       0.76      0.76      0.76       154

Ensemble Confusion Matrix:
[[81 18]
 [19 36]]
