In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import joblib
import warnings
warnings.filterwarnings('ignore')

# Scikit-learn imports
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# XGBoost
from xgboost import XGBClassifier

# Metrics
from sklearn.metrics import (
    accuracy_score,
    roc_auc_score,
    precision_score,
    recall_score,
    f1_score,
    matthews_corrcoef,
    confusion_matrix,
    classification_report
)
print('All dependencies imported successfully')

All dependencies imported successfully


In [2]:

os.makedirs('model', exist_ok=True)
os.makedirs('data', exist_ok=True)

Reading the data (change the path to your exact file path if this fails)

In [3]:
df=pd.read_csv('C:/Users/MHA213/Documents/BiTS Wilp/ML/ml-classification-model-comparison/diabetes_data_upload.csv')
features_count = df.shape[1] - 1
instances_count = df.shape[0]
print(features_count,instances_count)

16 520


In [4]:
df.describe(include="all")

Unnamed: 0,Age,Gender,Polyuria,Polydipsia,sudden weight loss,weakness,Polyphagia,Genital thrush,visual blurring,Itching,Irritability,delayed healing,partial paresis,muscle stiffness,Alopecia,Obesity,class
count,520.0,520,520,520,520,520,520,520,520,520,520,520,520,520,520,520,520
unique,,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
top,,Male,No,No,No,Yes,No,No,No,No,No,No,No,No,No,No,Positive
freq,,328,262,287,303,305,283,404,287,267,394,281,296,325,341,432,320
mean,48.028846,,,,,,,,,,,,,,,,
std,12.151466,,,,,,,,,,,,,,,,
min,16.0,,,,,,,,,,,,,,,,
25%,39.0,,,,,,,,,,,,,,,,
50%,47.5,,,,,,,,,,,,,,,,
75%,57.0,,,,,,,,,,,,,,,,


In [5]:
df.head()
df.columns
df.dtypes

Age                    int64
Gender                object
Polyuria              object
Polydipsia            object
sudden weight loss    object
weakness              object
Polyphagia            object
Genital thrush        object
visual blurring       object
Itching               object
Irritability          object
delayed healing       object
partial paresis       object
muscle stiffness      object
Alopecia              object
Obesity               object
class                 object
dtype: object

In [6]:
missing=df.isnull().sum()
total_missing=missing.sum()
print(total_missing)

0


Data Preprocessing

In [7]:
#standardizing the column names
df.columns = map(str.lower, df.columns)
df.columns = df.columns.str.strip()
df.columns = df.columns.str.replace(" ", "_")

In [8]:
# Map yes/no values to 1 and 0
one_values = ["Male", "Positive", "Yes"]
zero_values = ["Female", "Negative", "No"]

for column in df.columns:
    df[column] = df[column].replace(to_replace=one_values, value=1)
    df[column] = df[column].replace(to_replace=zero_values, value=0)

In [9]:
df = df.rename({"class": "status"}, axis = "columns")
df.head()

Unnamed: 0,age,gender,polyuria,polydipsia,sudden_weight_loss,weakness,polyphagia,genital_thrush,visual_blurring,itching,irritability,delayed_healing,partial_paresis,muscle_stiffness,alopecia,obesity,status
0,40,1,0,1,0,1,0,0,0,1,0,1,0,1,1,1,1
1,58,1,0,0,0,1,0,0,1,0,0,0,1,0,1,0,1
2,41,1,1,0,0,1,1,0,0,1,0,1,0,1,1,0,1
3,45,1,0,0,1,1,1,1,0,1,0,1,0,0,0,0,1
4,60,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1


Model Building

In [10]:
# feat_corr = df.corr()["status"].to_frame()
# feat_corr

In [11]:
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest

X = df[df.columns.difference(["status"])]
y = df["status"]

feat_chi = SelectKBest(score_func=chi2, k=12)
fit = feat_chi.fit(X, y)
feat_chi = pd.concat([pd.DataFrame(X.columns), pd.DataFrame(fit.scores_)], axis=1)
feat_chi.columns = ["column", "score"]
feat_chi = feat_chi.sort_values(by="score", ascending=False).reset_index(drop=False)
feat_chi = feat_chi[0:12]["column"].to_numpy()
print(feat_chi)

['polydipsia' 'polyuria' 'sudden_weight_loss' 'partial_paresis' 'gender'
 'irritability' 'polyphagia' 'alopecia' 'age' 'visual_blurring' 'weakness'
 'genital_thrush']


In [12]:
# #selecting top 12 features
# feat_corr["status"] = abs(feat_corr["status"])
# feat_corr = feat_corr.sort_values(by="status", ascending=False).reset_index(drop=False)
# feat_corr = feat_corr[1:13]["index"].to_numpy()
# feat_corr

In [13]:
# from sklearn.model_selection import train_test_split
#train test split 80-20
X = df[feat_chi]
y = df["status"]

(X_train, X_test, y_train, y_test) = train_test_split(X, y, test_size = 0.2, random_state=42)
# print(x_train)
# print(x_test)

In [14]:
# from sklearn.preprocessing import StandardScaler

scl = StandardScaler()
X_train = scl.fit_transform(X_train)
X_test = scl.transform(X_test)

# x_train_scaled = pd.DataFrame(x_train, columns=x.columns, index=x_train.index)
# x_test_scaled = pd.DataFrame(x_test, columns=x.columns, index=x_test.index)


In [15]:
# Save scaler (CRITICAL for Streamlit!)
joblib.dump(scl, 'model/scaler.pkl')
print(" Scaler saved: model/scaler.pkl")

# Save feature names
feature_names = list(X.columns)
joblib.dump(feature_names, 'model/feature_names.pkl')
print(" Feature names saved: model/feature_names.pkl")

# Save test data sample for demo
test_sample = pd.DataFrame(X_test[:100], columns=feature_names)
test_sample['target'] = pd.DataFrame(y_test[:100].values)
test_sample.to_csv('data/test_data.csv', index=False)
print(" Test sample saved: data/test_data.csv")

 Scaler saved: model/scaler.pkl
 Feature names saved: model/feature_names.pkl
 Test sample saved: data/test_data.csv


In [16]:

results = {
    'Model': [],
    'Accuracy': [],
    'AUC': [],
    'Precision': [],
    'Recall': [],
    'F1': [],
    'MCC': []
}

trained_models = {}
model_predictions = {}
model_confusion_matrices = {}
model_classification_reports = {}

print(" Results storage initialized")

print("\n Ready for model training!")


models_to_train = {
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(random_state=42, max_depth=5, min_samples_split=10),
    'K-Nearest Neighbors': KNeighborsClassifier(n_neighbors=5),
    'Naive Bayes': GaussianNB(),
    'Random Forest': RandomForestClassifier(
        n_estimators=100,
        random_state=42,
        max_depth=5,          # Reduced to prevent overfitting
        min_samples_split=10, # Added regularization
        min_samples_leaf=5    # Added regularization
    ),
    'XGBoost': XGBClassifier(
        n_estimators=100,
        random_state=42,
        max_depth=3,          # Reduced to prevent overfitting
        learning_rate=0.1,    # Added regularization
        eval_metric='logloss',
        use_label_encoder=False
    )
}

def train_and_evaluate_model(model_name, model, X_train, X_test, y_train, y_test):
    """Train a model and calculate all 6 evaluation metrics + confusion matrix + classification report."""


    print(f" Training: {model_name}")


    # Train the model
    model.fit(X_train, y_train)
    print(f" Model trained successfully")

    # Make predictions
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]

    # Calculate all 6 metrics
    metrics = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'AUC': roc_auc_score(y_test, y_pred_proba),
        'Precision': precision_score(y_test, y_pred, zero_division=0),
        'Recall': recall_score(y_test, y_pred, zero_division=0),
        'F1': f1_score(y_test, y_pred, zero_division=0),
        'MCC': matthews_corrcoef(y_test, y_pred)
    }

    # Display metrics
    print(f"\n Evaluation Metrics:")
    print(f"   Accuracy:  {metrics['Accuracy']:.4f}")
    print(f"   AUC:       {metrics['AUC']:.4f}")
    print(f"   Precision: {metrics['Precision']:.4f}")
    print(f"   Recall:    {metrics['Recall']:.4f}")
    print(f"   F1 Score:  {metrics['F1']:.4f}")
    print(f"   MCC:       {metrics['MCC']:.4f}")

    # Confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    print(f"\n Confusion Matrix:")
    print(f"   TN: {cm[0,0]:3d}  |  FP: {cm[0,1]:3d}")
    print(f"   FN: {cm[1,0]:3d}  |  TP: {cm[1,1]:3d}")

    # Classification report
    report = classification_report(y_test, y_pred, target_names=['Not Diabetic', 'Diabetic'])
    print(f"\n Classification Report:")
    print(report)

    print(f" {model_name} completed!")

    return {
        'model': model,
        'metrics': metrics,
        'predictions': y_pred,
        'probabilities': y_pred_proba,
        'confusion_matrix': cm,
        'classification_report': report
    }

 Results storage initialized

 Ready for model training!


In [17]:
model_results = {}

for idx, (model_name, model) in enumerate(models_to_train.items(), 1):
    print(f"\n{'#'*70}")
    print(f"MODEL {idx}/6: {model_name.upper()}")
    print(f"{'#'*70}")

    # Train and evaluate
    result = train_and_evaluate_model(
        model_name=model_name,
        model=model,
        X_train=X_train,
        X_test=X_test,
        y_train=y_train,
        y_test=y_test
    )

    # Store results
    model_results[model_name] = result

    # Store in results dictionary for comparison table
    results['Model'].append(model_name)
    results['Accuracy'].append(round(result['metrics']['Accuracy'], 4))
    results['AUC'].append(round(result['metrics']['AUC'], 4))
    results['Precision'].append(round(result['metrics']['Precision'], 4))
    results['Recall'].append(round(result['metrics']['Recall'], 4))
    results['F1'].append(round(result['metrics']['F1'], 4))
    results['MCC'].append(round(result['metrics']['MCC'], 4))

    # Store model, predictions, confusion matrix, and classification report
    trained_models[model_name] = result['model']
    model_predictions[model_name] = result['predictions']
    model_confusion_matrices[model_name] = result['confusion_matrix']
    model_classification_reports[model_name] = result['classification_report']


results_df = pd.DataFrame(results)
# print("\n" + results_df.to_string(index=False))

# Save comparison table
results_df.to_csv('model/models_comparison.csv', index=False)



######################################################################
MODEL 1/6: LOGISTIC REGRESSION
######################################################################
 Training: Logistic Regression
 Model trained successfully

 Evaluation Metrics:
   Accuracy:  0.9038
   AUC:       0.9629
   Precision: 0.9296
   Recall:    0.9296
   F1 Score:  0.9296
   MCC:       0.7781

 Confusion Matrix:
   TN:  28  |  FP:   5
   FN:   5  |  TP:  66

 Classification Report:
              precision    recall  f1-score   support

Not Diabetic       0.85      0.85      0.85        33
    Diabetic       0.93      0.93      0.93        71

    accuracy                           0.90       104
   macro avg       0.89      0.89      0.89       104
weighted avg       0.90      0.90      0.90       104

 Logistic Regression completed!

######################################################################
MODEL 2/6: DECISION TREE
######################################################################
 

In [18]:

# IDENTIFY BEST PERFORMING MODELS

print(" BEST PERFORMING MODELS BY METRIC")

for metric in ['Accuracy', 'AUC', 'Precision', 'Recall', 'F1', 'MCC']:
    best_idx = results_df[metric].idxmax()
    best_model = results_df.loc[best_idx, 'Model']
    best_score = results_df.loc[best_idx, metric]
    print(f"{metric:12} : {best_model:25} ({best_score:.4f})")

# Find overall best model (by average)

print(" BEST MODEL OVERALL (by average performance)")

results_df['Average'] = results_df[['Accuracy', 'AUC', 'Precision', 'Recall', 'F1', 'MCC']].mean(axis=1)
best_overall_idx = results_df['Average'].idxmax()
best_overall_model = results_df.loc[best_overall_idx, 'Model']
best_overall_score = results_df.loc[best_overall_idx, 'Average']

print(f"Best Model: {best_overall_model}")
print(f"Average Score: {best_overall_score:.4f}")



# 1. Save all 6 models (REQUIRED)
for model_name, model_obj in trained_models.items():
    filename = model_name.lower().replace(' ', '_').replace('-', '_') + '.pkl'
    filepath = os.path.join('model', filename)
    joblib.dump(model_obj, filepath)
    print(f" Saved: {filepath}")

# 2. Save scaler (CRITICAL - needed to scale uploaded data in Streamlit)
joblib.dump(scl, 'model/scaler.pkl')
print(" Saved: model/scaler.pkl")

# 3. Save feature names (HELPFUL - for validation)
feature_names = list(X.columns)
joblib.dump(feature_names, 'model/feature_names.pkl')
print(" Saved: model/feature_names.pkl")

# 4. Save comparison table (REQUIRED - for README.md)
results_df.to_csv('model/model_comparison.csv', index=False)
print(" Saved: model/model_comparison.csv")

# 5. Save sample test data (OPTIONAL - for demo)
ttest_sample = pd.DataFrame(X_test[:100], columns=feature_names)
test_sample['target'] = y_test[:100]
test_sample.to_csv('data/test_sample.csv', index=False)

print("\n All required files saved!")

 BEST PERFORMING MODELS BY METRIC
Accuracy     : XGBoost                   (0.9808)
AUC          : XGBoost                   (0.9970)
Precision    : XGBoost                   (1.0000)
Recall       : XGBoost                   (0.9718)
F1           : XGBoost                   (0.9857)
MCC          : XGBoost                   (0.9572)
 BEST MODEL OVERALL (by average performance)
Best Model: XGBoost
Average Score: 0.9821
 Saved: model\logistic_regression.pkl
 Saved: model\decision_tree.pkl
 Saved: model\k_nearest_neighbors.pkl
 Saved: model\naive_bayes.pkl
 Saved: model\random_forest.pkl
 Saved: model\xgboost.pkl
 Saved: model/scaler.pkl
 Saved: model/feature_names.pkl
 Saved: model/model_comparison.csv

 All required files saved!


In [19]:
# Convert results DataFrame to the format needed for Streamlit
metrics_for_streamlit = {}

for index, row in results_df.iterrows():
    model_name = row['Model']
    metrics_for_streamlit[model_name] = {
        "Accuracy": float(row['Accuracy']),
        "AUC": float(row['AUC']),
        "Precision": float(row['Precision']),
        "Recall": float(row['Recall']),
        "F1": float(row['F1']),
        "MCC": float(row['MCC'])
    }

# Save to JSON file
import json
with open('model/metrics.json', 'w') as f:
    json.dump(metrics_for_streamlit, f, indent=4)

print(" Saved: model/metrics.json")


 Saved: model/metrics.json


In [20]:
print("\n" + "-"*50)
print(" SAVING CONFUSION MATRICES & CLASSIFICATION REPORTS")


from sklearn.metrics import confusion_matrix, classification_report

# Initialize storage dictionaries
confusion_matrices = {}
classification_reports_dict = {}

# For each model, compute and save confusion matrix + classification report
for model_name in trained_models.keys():
    print(f"\nProcessing: {model_name}")

    # Get predictions for this model (already computed earlier)
    y_pred = model_predictions[model_name]

    # 1. CONFUSION MATRIX
    cm = confusion_matrix(y_test, y_pred)
    confusion_matrices[model_name] = cm.tolist()  # Convert numpy array to list for JSON

    print(f"  Confusion Matrix computed")

    # 2. CLASSIFICATION REPORT
    cr = classification_report(y_test, y_pred, output_dict=True)
    classification_reports_dict[model_name] = cr

    print(f"  Classification Report computed")


# JSON FILES

import json

# Save confusion matrices
with open('model/confusion_matrices.json', 'w') as f:
    json.dump(confusion_matrices, f, indent=4)
print("\n Saved: model/confusion_matrices.json")

# Save classification reports
with open('model/classification_reports.json', 'w') as f:
    json.dump(classification_reports_dict, f, indent=4)
print(" Saved: model/classification_reports.json")

# SAMPLE OUTPUT (VERIFICATION)


# print("\n" + "-"*50)
print(" SAMPLE: Confusion Matrix for Logistic Regression")
# print(confusion_matrices["Logistic Regression"])

# print("\n" + "-"*50)
print(" SAMPLE: Classification Report for Logistic Regression")

import pandas as pd
cr_df = pd.DataFrame(classification_reports_dict["Logistic Regression"]).transpose()
# print(cr_df)



--------------------------------------------------
 SAVING CONFUSION MATRICES & CLASSIFICATION REPORTS

Processing: Logistic Regression
  Confusion Matrix computed
  Classification Report computed

Processing: Decision Tree
  Confusion Matrix computed
  Classification Report computed

Processing: K-Nearest Neighbors
  Confusion Matrix computed
  Classification Report computed

Processing: Naive Bayes
  Confusion Matrix computed
  Classification Report computed

Processing: Random Forest
  Confusion Matrix computed
  Classification Report computed

Processing: XGBoost
  Confusion Matrix computed
  Classification Report computed

 Saved: model/confusion_matrices.json
 Saved: model/classification_reports.json
 SAMPLE: Confusion Matrix for Logistic Regression
 SAMPLE: Classification Report for Logistic Regression


In [21]:
print(" Populating TEST SPLIT DATA")

# Create test data with features + target
X_test = pd.DataFrame(X_test, columns=feature_names)
test_data_for_streamlit = X_test
test_data_for_streamlit['target'] = y_test.values

# Save to CSV
test_data_for_streamlit.to_csv('data/test_data.csv', index=False)
print(f" Saved: data/test_data.csv ({len(test_data_for_streamlit)} rows)")

 Populating TEST SPLIT DATA
 Saved: data/test_data.csv (104 rows)


**END**