In [9]:
# ============================================================
# Anemia Classification Model Training (Render Compatible)
# ============================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import (classification_report, confusion_matrix,
                             RocCurveDisplay, roc_auc_score, accuracy_score,
                             f1_score, precision_score, recall_score)
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.inspection import permutation_importance
from scipy.stats import friedmanchisquare
import shap
import joblib
import os
import warnings
warnings.filterwarnings('ignore')

# ============================================================
# Data Loading and Cleaning
# ============================================================

def load_and_clean_data(filepath):
    """Read the CBC CSV, normalise its headers and keep only fully numeric rows.

    Steps, in order:
      1. Read ``filepath`` with ``pd.read_csv`` and discard the first data
         row (index 0).
      2. Rewrite every column name: trim it, turn spaces into ``_`` and
         ``/`` into ``_per_``, then collapse any remaining whitespace to ``_``.
      3. Coerce the measurement columns to numbers (unparseable cells become
         NaN) and drop each row that has a NaN in any of them.

    Returns the cleaned DataFrame with a fresh 0..n-1 index.
    """
    required_numeric = ['RBC', 'PCV', 'MCV', 'MCH', 'MCHC', 'RDW',
                        'TLC', 'PLT__per_mm3', 'HGB', 'Age', 'Sex']

    def _normalise(header):
        # e.g. "PLT /mm3" -> "PLT__per_mm3"
        renamed = header.strip().replace(" ", "_").replace("/", "_per_")
        return "_".join(renamed.split())

    raw = pd.read_csv(filepath)
    frame = raw.drop(index=0).reset_index(drop=True)
    frame.columns = [_normalise(header) for header in frame.columns]

    frame[required_numeric] = frame[required_numeric].apply(
        pd.to_numeric, errors='coerce'
    )
    frame = frame.dropna(subset=required_numeric)
    return frame.reset_index(drop=True)

# Load the cleaned CBC dataset and show a quick structural summary.
df = load_and_clean_data("data/cleaned_cbc_data_no_duplicates.csv")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
df.info()
print(df.head())

# ============================================================
# Anemia Classification Function
# ============================================================

def classify_anemia(MCV, MCHC, HGB, RDW, sex):
    """Assign a rule-based anemia label from CBC indices.

    Decision order:
      1. ``HGB`` at or above the anemia cutoff -> ``'No_Anemia'``.
      2. ``MCV < 80``  -> ``'ACD_Severe'`` / ``'ACD_Moderate'``.
      3. ``MCV > 100`` -> ``'Severe_aplastic_anemia'`` /
         ``'Moderate_aplastic_anemia'``.
      4. ``80 <= MCV <= 100``:
           * ``MCHC >= 32`` -> ``'Normocytic_anemia_Unknown'``
           * ``MCHC < 32``  -> thalassemia when ``RDW < 14.16``, otherwise
             iron-deficiency anemia, prefixed with the severity.

    Severity is "Severe" when ``HGB`` is below the severe cutoff and
    "Moderate" otherwise.

    Args:
        MCV, MCHC, HGB, RDW: numeric CBC measurements.
        sex: ``0`` selects the (10, 12) HGB cutoffs, any other value selects
            (11, 13). NOTE(review): presumably 0 = female, 1 = male --
            confirm against the dataset encoding.

    Returns:
        One of the label strings above, or ``'Unknown_anemia_type'`` when a
        measurement needed to reach a decision is missing (NaN/None).
        Previously that label was unreachable and a missing value fell
        through the comparisons into an arbitrary category.
    """
    severe_cutoff, anemia_cutoff = (10, 12) if sex == 0 else (11, 13)

    # Comparisons against NaN are always False, so check explicitly before
    # each measurement is used rather than letting it pick a branch.
    if pd.isna(HGB):
        return "Unknown_anemia_type"
    if HGB >= anemia_cutoff:
        return 'No_Anemia'
    is_severe = HGB < severe_cutoff

    if pd.isna(MCV):
        return "Unknown_anemia_type"
    if MCV < 80:  # microcytic
        return "ACD_Severe" if is_severe else "ACD_Moderate"
    if MCV > 100:  # macrocytic
        return "Severe_aplastic_anemia" if is_severe else "Moderate_aplastic_anemia"

    # Normocytic from here on (80 <= MCV <= 100).
    if pd.isna(MCHC):
        return "Unknown_anemia_type"
    if MCHC >= 32:
        return "Normocytic_anemia_Unknown"

    if pd.isna(RDW):
        return "Unknown_anemia_type"
    if RDW < 14.16:
        return "Severe_thalassemia" if is_severe else "Moderate_thalassemia"
    if is_severe:
        return "Severe_iron_deficiency_anemia"
    return "Moderate_iron_deficiency_anemia"

def _anemia_label_for_row(row):
    """Classify one DataFrame row from its MCV, MCHC, HGB, RDW and Sex cells."""
    return classify_anemia(
        row['MCV'], row['MCHC'], row['HGB'], row['RDW'], row['Sex']
    )


# Derive the target label for every record, row by row.
df['Anemia_Type'] = df.apply(_anemia_label_for_row, axis=1)

label_counts = df['Anemia_Type'].value_counts()
print("\nAnemia Type Distribution:")
print(label_counts)

# Drop the row identifier and the measurements that are not kept as model
# inputs; errors='ignore' skips any of these columns that are absent.
unused_columns = ['S._No.', 'TLC', 'MCH', 'PLT__per_mm3']
df = df.drop(columns=unused_columns, errors='ignore')

# ============================================================
# Feature Engineering and Encoding
# ============================================================

# Median-impute any numeric (int / unsigned / float) column that still has
# missing values. The result is assigned back to the frame on purpose:
# `df[col].fillna(..., inplace=True)` mutates an intermediate Series and,
# under pandas copy-on-write, leaves `df` itself unchanged.
for col in df.columns:
    if df[col].dtype.kind in 'iuf' and df[col].isnull().any():
        df[col] = df[col].fillna(df[col].median())

# Encode the string labels as consecutive integers; `le` is kept so the
# class names can be recovered from predictions later.
le = LabelEncoder()
y_encoded = le.fit_transform(df['Anemia_Type'])

# Every remaining column except the label is used as a feature.
X = df.drop('Anemia_Type', axis=1)
y = y_encoded

# 70/30 hold-out split, stratified so both parts keep the label proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# ============================================================
# Model Configurations
# ============================================================

def _scaled_pipeline(classifier):
    """Wrap ``classifier`` in a Pipeline that standardises the features first."""
    return Pipeline([('scaler', StandardScaler()), ('clf', classifier)])


# Candidate estimators keyed by display name. Each entry holds the unfitted
# estimator under 'model' and a single-point hyper-parameter grid under
# 'params'.
# NOTE(review): the 'params' grids are not read by the training loop visible
# in this file, so the estimators run with their constructor defaults.
# NOTE(review): LogisticRegression's `multi_class` argument is deprecated in
# recent scikit-learn releases -- confirm against the pinned version.
models = {
    "XGBoost": dict(
        model=XGBClassifier(objective='multi:softmax', random_state=42,
                            eval_metric='mlogloss'),
        params=dict(n_estimators=[200], max_depth=[5], learning_rate=[0.1]),
    ),
    "Random_Forest": dict(
        model=RandomForestClassifier(class_weight='balanced', random_state=42),
        params=dict(n_estimators=[200], max_depth=[None]),
    ),
    "Logistic_Regression": dict(
        model=_scaled_pipeline(
            LogisticRegression(class_weight='balanced',
                               max_iter=1000,
                               multi_class='ovr',
                               random_state=42)
        ),
        params={'clf__C': [1]},
    ),
    "LightGBM": dict(
        model=LGBMClassifier(objective='multiclass', random_state=42,
                             class_weight='balanced'),
        params=dict(n_estimators=[200], learning_rate=[0.1]),
    ),
    "Gradient_Boosting": dict(
        model=GradientBoostingClassifier(random_state=42),
        params=dict(n_estimators=[200], learning_rate=[0.1]),
    ),
    "SVM": dict(
        model=_scaled_pipeline(SVC(probability=True, random_state=42)),
        params={'clf__C': [1], 'clf__gamma': ['scale']},
    ),
}

# ============================================================
# Model Training & Evaluation
# ============================================================

# Fit every configured estimator on the training split and record its
# hold-out accuracy and weighted F1 alongside the fitted object.
results = {}
for name, config in models.items():
    print(f"\n{'='*40}\nTraining {name}\n{'='*40}")

    model = config['model']
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    test_accuracy = accuracy_score(y_test, y_pred)
    test_weighted_f1 = f1_score(y_test, y_pred, average='weighted')
    results[name] = dict(model=model, accuracy=test_accuracy, f1=test_weighted_f1)

    print(f"{name}: Accuracy={results[name]['accuracy']:.4f}, F1={results[name]['f1']:.4f}")

# ============================================================
# Save Best Model for Deployment (Render-compatible)
# ============================================================

# Pick the estimator with the highest hold-out accuracy (the first one wins
# a tie, following the insertion order of `results`).
best_model_name, best_entry = max(
    results.items(), key=lambda item: item[1]['accuracy']
)
best_model = best_entry['model']

# Bundled with the model so the consumer of the file gets these
# (low, high) bounds per measurement together with the estimator.
reference_ranges = dict(
    HGB=(12.0, 16.0),
    RBC=(4.0, 5.5),
    PCV=(37.0, 47.0),
    MCV=(80.0, 100.0),
    MCHC=(32.0, 36.0),
    RDW=(11.5, 14.5),
)

# Feature order and class names are stored so predictions can be mapped
# back to column names and label strings after loading.
metadata = dict(
    features=X.columns.tolist(),
    class_names=le.classes_.tolist(),
    reference_ranges=reference_ranges,
    version="1.0",
)

os.makedirs("models", exist_ok=True)
artifact = {"model": best_model, "metadata": metadata}
joblib.dump(artifact, "models/Anemia_classifier_model.pkl")

print(f"\n✅ Best Model: {best_model_name}")
print("✅ Model saved successfully at models/Anemia_classifier_model.pkl")


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 789 entries, 0 to 788
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   S._No.        789 non-null    float64
 1   Age           789 non-null    int64  
 2   Sex           789 non-null    float64
 3   RBC           789 non-null    float64
 4   PCV           789 non-null    float64
 5   MCV           789 non-null    float64
 6   MCH           789 non-null    float64
 7   MCHC          789 non-null    float64
 8   RDW           789 non-null    float64
 9   TLC           789 non-null    float64
 10  PLT__per_mm3  789 non-null    float64
 11  HGB           789 non-null    float64
dtypes: float64(11), int64(1)
memory usage: 74.1 KB
None
   S._No.  Age  Sex   RBC   PCV   MCV   MCH  MCHC   RDW    TLC  PLT__per_mm3  \
0     2.0   41  0.0  4.78  44.5  93.1  28.9  31.0  13.0   7.02         419.0   
1     3.0   40  1.0  4.65  41.6  89.5  28.8  32.2  13.0   8.09         325.0