<a href="https://colab.research.google.com/github/MonikSense/GLAUCOMA/blob/main/run1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# ==========================================
# Glaucoma Detection using Ensemble ML
# Dataset: glaucoma.csv (MonikSense GitHub)
# ==========================================

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Baseline models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Ensemble models
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier

# Evaluation metrics
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    confusion_matrix,
    classification_report
)

# ==========================================
# 1️⃣ LOAD DATASET
# ==========================================

# Source CSV lives in the MonikSense/GLAUCOMA GitHub repository.
url = "https://raw.githubusercontent.com/MonikSense/GLAUCOMA/refs/heads/main/glaucoma.csv"
df = pd.read_csv(url)

# Sanity check before any column is removed: dimensions, then the first rows.
print("Dataset shape:", df.shape)
print(df.head())

# The image filename carries no signal for the classifiers, so discard it.
df = df.drop("Filename", axis=1)

# ==========================================
# 2️⃣ FEATURES & TARGET
# ==========================================

# Target column first, then everything that remains becomes the feature frame.
y = df["Glaucoma"]
X = df.drop(columns=["Glaucoma"])

# Column groups consumed by the preprocessing step further down.
# NOTE(review): `Set` may be a dataset-partition label rather than a clinical
# predictor — confirm it is intended to be a model input.
categorical_features = ["Eye", "Set"]
numeric_features = ["ExpCDR"]

# ==========================================
# 3️⃣ DATA PREPROCESSING PIPELINE
# ==========================================

# Hold out 20% of the rows for testing; stratifying on y keeps the class
# ratio the same in both partitions, and the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Column-wise preprocessing: standardise the numeric columns and one-hot
# encode the categorical ones. Categories not seen during fit are ignored at
# transform time instead of raising.
preprocessor = ColumnTransformer(
    [
        ("num", StandardScaler(), numeric_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ]
)

# ==========================================
# 4️⃣ BASELINE MODELS
# ==========================================

def evaluate_model(name, model, detailed=False):
    """Fit ``model`` behind the shared preprocessor and print test-set metrics.

    A new ``Pipeline`` (``preprocessor`` -> ``model``) is fitted on
    ``X_train``/``y_train`` and scored on ``X_test``/``y_test``. The
    preprocessor and the four data splits are read from the notebook's
    global scope.

    Parameters
    ----------
    name : str
        Label shown in the ``--- name ---`` banner.
    model : estimator
        Unfitted classifier providing ``fit``, ``predict`` and
        ``predict_proba``.
    detailed : bool, default False
        If True, additionally print the confusion matrix, the
        classification report and a 50-character ``=`` separator.
        If False, print one blank line after the metrics instead.

    Returns
    -------
    tuple
        ``(pipeline, y_pred, y_prob)``: the fitted pipeline, its predicted
        labels for ``X_test``, and column 1 of ``predict_proba`` (the
        positive class when labels are 0/1).
    """
    pipeline = Pipeline([
        ("preprocess", preprocessor),
        ("model", model)
    ])

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    y_prob = pipeline.predict_proba(X_test)[:, 1]

    print(f"--- {name} ---")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Precision:", precision_score(y_test, y_pred))
    print("Recall (Sensitivity):", recall_score(y_test, y_pred))
    print("F1 Score:", f1_score(y_test, y_pred))
    print("ROC-AUC:", roc_auc_score(y_test, y_prob))

    if detailed:
        print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
        print("\nClassification Report:\n", classification_report(y_test, y_pred))
        print("="*50)
    else:
        print()

    return pipeline, y_pred, y_prob


baseline_models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    # probability=True calibrates probabilities with an internal, randomly
    # shuffled cross-validation; seeding it keeps the SVM ROC-AUC repeatable
    # across runs, consistent with the other seeded estimators.
    "SVM": SVC(probability=True, random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42)
}

print("\n===== BASELINE MODELS PERFORMANCE =====\n")

for name, model in baseline_models.items():
    # Rebind the loop-level names so code further down that reads
    # `pipeline`, `y_pred` or `y_prob` still sees the last fitted model.
    pipeline, y_pred, y_prob = evaluate_model(name, model)

# ==========================================
# 5️⃣ ENSEMBLE MODELS
# ==========================================

rf = RandomForestClassifier(n_estimators=200, random_state=42)
gb = GradientBoostingClassifier(random_state=42)

# Soft voting averages the predicted class probabilities of rf and gb.
voting_clf = VotingClassifier(
    estimators=[
        ("rf", rf),
        ("gb", gb)
    ],
    voting="soft"
)

ensemble_models = {
    "Random Forest": rf,
    "Gradient Boosting": gb,
    "Voting Ensemble": voting_clf
}

print("\n===== ENSEMBLE MODELS PERFORMANCE =====\n")

for name, model in ensemble_models.items():
    # Ensembles get the extended report (confusion matrix + per-class table).
    pipeline, y_pred, y_prob = evaluate_model(name, model, detailed=True)


Dataset shape: (650, 5)
  Filename  ExpCDR Eye Set  Glaucoma
0  001.jpg  0.7097  OD   A         0
1  002.jpg  0.6953  OS   A         0
2  003.jpg  0.9629  OS   A         0
3  004.jpg  0.7246  OD   A         0
4  005.jpg  0.6138  OS   A         0

===== BASELINE MODELS PERFORMANCE =====

--- Logistic Regression ---
Accuracy: 0.7846153846153846
Precision: 0.6666666666666666
Recall (Sensitivity): 0.35294117647058826
F1 Score: 0.46153846153846156
ROC-AUC: 0.7889093137254901

--- SVM ---
Accuracy: 0.7769230769230769
Precision: 0.6470588235294118
Recall (Sensitivity): 0.3235294117647059
F1 Score: 0.43137254901960786
ROC-AUC: 0.7405024509803921

--- Decision Tree ---
Accuracy: 0.6461538461538462
Precision: 0.35714285714285715
Recall (Sensitivity): 0.4411764705882353
F1 Score: 0.39473684210526316
ROC-AUC: 0.5799632352941176


===== ENSEMBLE MODELS PERFORMANCE =====

--- Random Forest ---
Accuracy: 0.6692307692307692
Precision: 0.37142857142857144
Recall (Sensitivity): 0.38235294117647056
F1 Sc