# Start

In [None]:
# ==========================
# CELL 1: LIBRARY IMPORTS
# ==========================

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# import numpy as np  # Remove this redundant import
# np.bool = bool  # Create alias for bool -> Remove or comment out this line
import seaborn as sns

# Sklearn modules
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

# ML models
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
# from catboost import CatBoostClassifier

# Deep Learning modules
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Bidirectional, GRU, Conv1D, MaxPooling1D, Flatten, Dropout

# For explainability (SHAP)
#import shap

In [None]:
# ==========================
# CELL 2: LOAD DATA
# ==========================
# The dataset is an Excel workbook, so it is read with read_excel (not read_csv).
# The path is relative to the notebook's working directory.
DATA_PATH = 'Anemia Dataset.xlsx'
df = pd.read_excel(DATA_PATH)

# Columns the later cells rely on. Failing here gives a clearer message than a
# KeyError several cells further down.
EXPECTED_COLUMNS = ['Gender', 'Age', 'Hb', 'RBC', 'PCV', 'MCV', 'MCH', 'MCHC', 'Decision_Class']
missing_columns = [name for name in EXPECTED_COLUMNS if name not in df.columns]
if missing_columns:
    raise KeyError(f"{DATA_PATH} is missing expected columns: {missing_columns}")

print("Data Sample:")
display(df.head())

print("\nData Info:")
# DataFrame.info() prints its own report and returns None, so it must not be
# wrapped in print() (that would emit a stray "None" line).
df.info()

print("\nMissing Values:")
print(df.isnull().sum())


In [None]:
# ==========================
# CELL 3: EXPLORATORY DATA ANALYSIS
# ==========================
# Histogram of every numeric column in the frame.
df.hist(figsize=(12, 8))
plt.tight_layout()
plt.show()

# Class balance of the target.
print("Class Distribution:")
print(df['Decision_Class'].value_counts())

# One boxplot per numeric feature to eyeball outliers.
numeric_cols = ['Age', 'Hb', 'RBC', 'PCV', 'MCV', 'MCH', 'MCHC']
fig, axes = plt.subplots(2, 4, figsize=(16, 8))
axes = axes.flatten()
for ax, col in zip(axes, numeric_cols):
    sns.boxplot(x=df[col], ax=ax)
    ax.set_title(col)

# The 2x4 grid has more panels than there are features; hide the unused ones
# so no empty axes are rendered.
for spare_ax in axes[len(numeric_cols):]:
    spare_ax.set_visible(False)

plt.tight_layout()
plt.show()


In [None]:
# ==========================
# CELL 4: MISSING VALUE & OUTLIER HANDLING
# ==========================

# 1) Encode 'Gender' as 1 (male) / 0 (female).
#    Labels are trimmed and lower-cased first so values such as 'M' or ' f '
#    do not silently become NaN. The step is skipped when the column is
#    already numeric, so re-running this cell does not wipe the encoding.
#    Genuinely missing Gender values are left as NaN here.
gender_codes = {'m': 1, 'male': 1, 'f': 0, 'female': 0}
if not pd.api.types.is_numeric_dtype(df['Gender']):
    gender_labels = df['Gender'].map(
        lambda value: value.strip().lower() if isinstance(value, str) else value
    )
    gender_encoded = gender_labels.map(gender_codes)
    unknown_labels = gender_labels[gender_encoded.isna() & gender_labels.notna()].unique()
    if len(unknown_labels) > 0:
        raise ValueError(f"Unrecognised Gender values: {list(unknown_labels)}")
    df['Gender'] = gender_encoded

# 2) Fill missing numeric values with the column mean.
#    NOTE(review): the imputer (and the clipping bounds below) are fitted on the
#    full dataset, i.e. before the train/test split in a later cell, so test
#    rows influence these statistics. Consider fitting them on the training
#    split only.
imputer = SimpleImputer(strategy='mean')
numeric_cols = ['Age', 'Hb', 'RBC', 'PCV', 'MCV', 'MCH', 'MCHC']

df[numeric_cols] = imputer.fit_transform(df[numeric_cols])

# 3) Outlier handling: clip (rather than drop) values outside the
#    [Q1 - 1.5*IQR, Q3 + 1.5*IQR] fences of each numeric column.
for col in numeric_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    df[col] = df[col].clip(lower=lower_bound, upper=upper_bound)

print("Data after missing value & outlier handling:")
display(df.head())


In [None]:
# ==========================
# CELL 5: FEATURE/TARGET SPLIT
# ==========================
# The target is pulled out as a plain array; every remaining column is a feature.
y = df['Decision_Class'].values  # 0 or 1
X = df.drop(columns=['Decision_Class'])

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")


In [None]:
# ==========================
# CELL 6: TRAIN-TEST SPLIT
# ==========================
# 80/20 hold-out, stratified on the target so both parts keep the same class
# ratio; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

print(f"Train set: {X_train.shape} {y_train.shape}")
print(f"Test set: {X_test.shape} {y_test.shape}")


In [None]:
# ==========================
# CELL 7: SMOTE FOR IMBALANCE
# ==========================
# Oversample the training split only; the test split is left untouched.
smote = SMOTE(random_state=42)
X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)

resampled_class_counts = pd.Series(y_train_sm).value_counts()
print(f"After SMOTE, train shape: {X_train_sm.shape} {y_train_sm.shape}")
print(f"Class Distribution in y_train_sm: {resampled_class_counts}")


In [None]:
# ==========================
# CELL 8: SCALING
# ==========================
# Learn the scaling statistics from the (resampled) training data, then apply
# the same transformation to both the training and the test features.
scaler = StandardScaler()  # or MinMaxScaler
scaler.fit(X_train_sm)
X_train_sm_scaled = scaler.transform(X_train_sm)
X_test_scaled = scaler.transform(X_test)


In [None]:
# ==========================
# CELL 9: CLASSICAL ML MODELS
# ==========================
models = {
    "SVM": SVC(probability=True, random_state=42),
    "DecisionTree": DecisionTreeClassifier(random_state=42),
    "LogisticRegression": LogisticRegression(random_state=42, max_iter=1000),
    "RandomForest": RandomForestClassifier(random_state=42),
    "XGBoost": XGBClassifier(random_state=42),
    # "CatBoost": CatBoostClassifier(verbose=0, random_state=42)
}

results = {}

# One splitter shared by every model so they are all scored on the same folds.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for model_name, model in models.items():
    # Cross-validate on the ORIGINAL training split with SMOTE and scaling
    # inside the pipeline, in the same order as the preprocessing cells.
    # Running CV on the already-resampled matrix would let synthetic samples
    # built from validation-fold rows (and scaler statistics) leak into
    # training and inflate the score. The imblearn pipeline applies the
    # sampler only while fitting, never when scoring the held-out fold.
    # cross_val_score clones the pipeline, so `model` itself stays unfitted.
    cv_pipeline = ImbPipeline(steps=[
        ("smote", SMOTE(random_state=42)),
        ("scaler", StandardScaler()),
        ("model", model),
    ])
    cv_scores = cross_val_score(cv_pipeline, X_train, y_train, cv=cv, scoring='accuracy')

    # Train on the full (resampled + scaled) training set
    model.fit(X_train_sm_scaled, y_train_sm)

    # Predict on test set
    y_pred = model.predict(X_test_scaled)

    # Evaluate
    acc = accuracy_score(y_test, y_pred)
    cls_report = classification_report(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)

    # Print results
    print("="*50)
    print(f"Model: {model_name}")
    print(f"CV Accuracy (mean): {cv_scores.mean():.4f} | CV Std: {cv_scores.std():.4f}")
    print(f"Test Accuracy: {acc:.4f}")
    print("\nClassification Report:\n", cls_report)
    print("\nConfusion Matrix:\n", cm)
    print("="*50)

    # Keep the numbers so later cells can compare models
    results[model_name] = {
        "cv_mean_acc": cv_scores.mean(),
        "test_acc": acc,
        "classification_report": cls_report,
        "confusion_matrix": cm
    }


In [None]:
# ==========================
# CELL 10: HYPERPARAMETER TUNING EXAMPLE (RANDOM FOREST)
# ==========================
# Exhaustive search over a small random-forest grid, scored by accuracy with
# 5-fold CV on the resampled + scaled training matrix.
# NOTE(review): because the folds are cut from data that was already
# oversampled, the reported CV score is likely optimistic.
param_grid = dict(
    n_estimators=[50, 100],
    max_depth=[None, 5, 10],
    min_samples_split=[2, 5],
)

grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train_sm_scaled, y_train_sm)

print(f"Best Params: {grid_search.best_params_}")
print(f"Best CV Score: {grid_search.best_score_}")

# The search refits the winning configuration on all training data.
best_rf = grid_search.best_estimator_

# Score the tuned forest on the untouched test split.
y_pred_rf = best_rf.predict(X_test_scaled)
acc_rf = accuracy_score(y_test, y_pred_rf)
cls_report_rf = classification_report(y_test, y_pred_rf)
cm_rf = confusion_matrix(y_test, y_pred_rf)

print(f"\nTest Accuracy with Best RF: {acc_rf:.4f}")
print(f"\nClassification Report:\n {cls_report_rf}")
print(f"\nConfusion Matrix:\n {cm_rf}")


In [None]:
# ==========================
# CELL 11: RESHAPE DATA FOR DEEP LEARNING
# ==========================
# The recurrent/convolutional layers expect 3-D input, so a length-1
# "timesteps" axis is inserted: (samples, features) -> (samples, 1, features).
X_train_dl = np.asarray(X_train_sm_scaled)[:, np.newaxis, :]
X_test_dl = np.asarray(X_test_scaled)[:, np.newaxis, :]

print(f"New DL shape, X_train: {X_train_dl.shape}")  # (num_samples, 1, num_features)
print(f"New DL shape, X_test: {X_test_dl.shape}")


In [None]:
# ==========================
# CELL 12: DEFINE DL MODELS
# ==========================

def build_lstm_model(input_shape):
    """Build and compile a single-layer LSTM binary classifier.

    Args:
        input_shape: Shape of one sample without the batch axis; used as the
            shape of the model's input.

    Returns:
        A compiled ``Sequential`` model: LSTM(64, relu) -> Dense(1, sigmoid),
        trained with Adam on binary cross-entropy and reporting accuracy.
    """
    model = Sequential()
    # Declare the input explicitly instead of passing the deprecated
    # `input_shape=` keyword to the first layer.
    model.add(tf.keras.Input(shape=input_shape))
    model.add(LSTM(64, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

def build_bilstm_model(input_shape):
    """Build and compile a bidirectional-LSTM binary classifier.

    Args:
        input_shape: Shape of one sample without the batch axis; used as the
            shape of the model's input.

    Returns:
        A compiled ``Sequential`` model: Bidirectional(LSTM(64, relu)) ->
        Dense(1, sigmoid), trained with Adam on binary cross-entropy and
        reporting accuracy.
    """
    model = Sequential()
    # Declare the input explicitly instead of passing the deprecated
    # `input_shape=` keyword to the wrapper layer.
    model.add(tf.keras.Input(shape=input_shape))
    model.add(Bidirectional(LSTM(64, activation='relu')))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

def build_gru_model(input_shape):
    """Build and compile a single-layer GRU binary classifier.

    Args:
        input_shape: Shape of one sample without the batch axis; used as the
            shape of the model's input.

    Returns:
        A compiled ``Sequential`` model: GRU(64, relu) -> Dense(1, sigmoid),
        trained with Adam on binary cross-entropy and reporting accuracy.
    """
    model = Sequential()
    # Declare the input explicitly instead of passing the deprecated
    # `input_shape=` keyword to the first layer.
    model.add(tf.keras.Input(shape=input_shape))
    model.add(GRU(64, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

def build_cnn_lstm_model(input_shape):
    """Build and compile a Conv1D + LSTM binary classifier.

    A real convolution needs more than one timestep; with kernel_size=1 and
    pool_size=1 the convolution and pooling stages act on each timestep
    independently, so this is a demonstration architecture only.

    Args:
        input_shape: Shape of one sample without the batch axis; used as the
            shape of the model's input.

    Returns:
        A compiled ``Sequential`` model: Conv1D(32, k=1, relu) ->
        MaxPooling1D(1) -> LSTM(64, relu) -> Dense(1, sigmoid), trained with
        Adam on binary cross-entropy and reporting accuracy.
    """
    model = Sequential()
    # Declare the input explicitly instead of passing the deprecated
    # `input_shape=` keyword to the first layer.
    model.add(tf.keras.Input(shape=input_shape))
    model.add(Conv1D(filters=32, kernel_size=1, activation='relu'))
    model.add(MaxPooling1D(pool_size=1))
    model.add(LSTM(64, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model


In [None]:
# ==========================
# CELL 13: TRAIN & EVALUATE DL MODELS
# ==========================
# Seed Python, NumPy and TensorFlow before the networks are built so weight
# initialisation and batch shuffling are repeatable, matching the fixed
# random_state used for the classical models.
tf.keras.utils.set_random_seed(42)

dl_models = {
    "LSTM": build_lstm_model(X_train_dl.shape[1:]),
    "BiLSTM": build_bilstm_model(X_train_dl.shape[1:]),
    "GRU": build_gru_model(X_train_dl.shape[1:]),
    "CNN_LSTM": build_cnn_lstm_model(X_train_dl.shape[1:])
}

epochs = 10
batch_size = 32

# Keras' `validation_split` carves the validation set from the LAST rows of the
# arrays, before any shuffling. SMOTE appends its synthetic rows after the
# original ones, so that slice would consist largely of synthetic
# minority-class samples. A shuffled, stratified hold-out gives a
# representative validation set instead.
X_fit_dl, X_val_dl, y_fit_dl, y_val_dl = train_test_split(
    X_train_dl, y_train_sm, test_size=0.2, random_state=42, stratify=y_train_sm
)

dl_results = {}

for name, dl_model in dl_models.items():
    print(f"\nTraining {name} model...")
    history = dl_model.fit(
        X_fit_dl, y_fit_dl,
        validation_data=(X_val_dl, y_val_dl),
        epochs=epochs,
        batch_size=batch_size,
        verbose=0  # set to 1 or 2 if you want to see the training progress
    )

    # Predict on test set; the sigmoid output has shape (n, 1), so flatten it
    # to match the 1-D label vector.
    y_pred_prob = dl_model.predict(X_test_dl)
    y_pred = (y_pred_prob > 0.5).astype(int).ravel()

    # Evaluate
    acc = accuracy_score(y_test, y_pred)
    cls_report = classification_report(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)

    print("="*50)
    print(f"Deep Learning Model: {name}")
    print(f"Test Accuracy: {acc:.4f}")
    print("\nClassification Report:\n", cls_report)
    print("\nConfusion Matrix:\n", cm)
    print("="*50)

    # Keep the numbers and the training curves, mirroring `results` for the
    # classical models.
    dl_results[name] = {
        "test_acc": acc,
        "classification_report": cls_report,
        "confusion_matrix": cm,
        "history": history.history
    }


In [None]:
# ==========================
# CELL 14: SHAP EXPLAINABILITY
# ==========================
# shap is imported here because its import in the first cell is commented out;
# without this line every `shap.` call below raises NameError.
import shap

# Example with XGBoost
best_xgb = XGBClassifier(random_state=42)
best_xgb.fit(X_train_sm_scaled, y_train_sm)

# The scaled training matrix is passed to the explainer as background data.
explainer = shap.Explainer(best_xgb, X_train_sm_scaled)
shap_values = explainer(X_test_scaled)

# Summary plot; the scaled arrays carry no column labels, so the original
# feature names are supplied as a plain list.
shap.summary_plot(shap_values, X_test_scaled, feature_names=list(X.columns))


In [None]:
# Compare the classical models on the test set.
# The metrics are computed from the fitted estimators in `models` rather than
# typed in by hand, so the charts always reflect the current run.
# Precision / recall / F1 are the support-weighted averages over the classes.
model_names = []
accuracy = []
f1_score = []
recall = []
precision = []

for fitted_name, fitted_model in models.items():
    test_pred = fitted_model.predict(X_test_scaled)
    report = classification_report(y_test, test_pred, output_dict=True, zero_division=0)
    weighted_avg = report['weighted avg']

    model_names.append(fitted_name)
    accuracy.append(accuracy_score(y_test, test_pred))
    f1_score.append(weighted_avg['f1-score'])
    recall.append(weighted_avg['recall'])
    precision.append(weighted_avg['precision'])

# Build the results table
results_df = pd.DataFrame({
    'Model': model_names,
    'Accuracy': accuracy,
    'F1 Score': f1_score,
    'Recall': recall,
    'Precision': precision
})

# Metrics to chart
metrics = ['Accuracy', 'F1 Score', 'Recall', 'Precision']

# One bar chart per metric
for metric in metrics:
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.bar(results_df['Model'], results_df[metric])
    ax.set_title(f'Model {metric} Comparison')
    ax.set_xlabel('Model')
    ax.set_ylabel(metric)
    ax.set_ylim(0, 1)  # all metrics live on a 0-1 scale
    plt.show()

In [20]:
import joblib

# Persist the XGBoost classifier held in the `models` dict so it can be
# reloaded later without retraining.
model_path = "best_xgboost_model.pkl"
joblib.dump(models["XGBoost"], model_path)
print(f"Model saved as {model_path}")


Model saved as best_xgboost_model.pkl
