<a href="https://colab.research.google.com/github/MVijayKrishna/neolens_bame/blob/master/Jaundice.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import files
uploaded = files.upload()

Saving full_jaundice_metadata.csv to full_jaundice_metadata.csv


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE

# === 1. Load and Prepare Data ===
df = pd.read_csv("full_jaundice_metadata.csv")

# Clean text
for col in df.columns:
    df[col] = df[col].astype(str).str.strip().str.lower().str.replace('â€”', '-', regex=False)

# Drop unused
df = df.drop(columns=["Image_ID", "File_Path", "Notes"])

# Encode target
df['Condition'] = df['Condition'].map({'normal': 0, 'jaundiced': 1})

# Label encode all features
X = df.drop(columns=['Condition']).copy()
y = df['Condition'].copy()

In [3]:
y.value_counts()

Unnamed: 0_level_0,count
Condition,Unnamed: 1_level_1
1,110
0,110


In [4]:
label_encoders = {}
for col in X.columns:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    label_encoders[col] = le  # save encoder for future use

# === 2. KFold + SMOTE + XGBoost ===
kf = KFold(n_splits=5, shuffle=True, random_state=42)

fold = 1
all_y_true = []
all_y_pred = []

print("📊 XGBoost with Label Encoding and KFold + SMOTE:\n")

for train_idx, test_idx in kf.split(X):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # Scale (optional but helps sometimes)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Apply SMOTE on training only
    sm = SMOTE(random_state=42)
    X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

    # Train XGBoost
    model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
    model.fit(X_train_res, y_train_res)

    # Predict
    y_pred = model.predict(X_test)
    all_y_true.extend(y_test)
    all_y_pred.extend(y_pred)

    print(f"Fold {fold} Report:")
    print(classification_report(y_test, y_pred))
    fold += 1

# === 3. Overall Report ===
print("📈 Final Classification Report (All Folds Combined):")
print(classification_report(all_y_true, all_y_pred))

📊 XGBoost with Label Encoding and KFold + SMOTE:

Fold 1 Report:
              precision    recall  f1-score   support

           0       0.74      0.52      0.61        27
           1       0.48      0.71      0.57        17

    accuracy                           0.59        44
   macro avg       0.61      0.61      0.59        44
weighted avg       0.64      0.59      0.59        44

Fold 2 Report:
              precision    recall  f1-score   support

           0       0.38      0.50      0.43        18
           1       0.55      0.42      0.48        26

    accuracy                           0.45        44
   macro avg       0.46      0.46      0.45        44
weighted avg       0.48      0.45      0.46        44



Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Fold 3 Report:
              precision    recall  f1-score   support

           0       0.37      0.65      0.47        17
           1       0.57      0.30      0.39        27

    accuracy                           0.43        44
   macro avg       0.47      0.47      0.43        44
weighted avg       0.49      0.43      0.42        44

Fold 4 Report:
              precision    recall  f1-score   support

           0       0.52      0.50      0.51        22
           1       0.52      0.55      0.53        22

    accuracy                           0.52        44
   macro avg       0.52      0.52      0.52        44
weighted avg       0.52      0.52      0.52        44

Fold 5 Report:
              precision    recall  f1-score   support

           0       0.48      0.46      0.47        26
           1       0.26      0.28      0.27        18

    accuracy                           0.39        44
   macro avg       0.37      0.37      0.37        44
weighted avg       0.39      

Parameters: { "use_label_encoder" } are not used.



In [5]:
import joblib

# Save model
joblib.dump(model, "xgboost_jaundice_model.pkl")

# Save scaler and label encoders too
joblib.dump(scaler, "scaler.pkl")
joblib.dump(label_encoders, "label_encoders.pkl")


['label_encoders.pkl']

In [6]:
from google.colab import files

files.download("xgboost_jaundice_model.pkl")
files.download("scaler.pkl")
files.download("label_encoders.pkl")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>