<a href="https://colab.research.google.com/github/Gajeshgif/Gajesh/blob/main/XAI_for_Emergency_Admission_Predi.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Run this cell first: it imports the Kaggle data sources the notebook needs.
# After the data has been imported the cell may be deleted.
# Caveat: this notebook environment is not Kaggle's Python environment, so
# libraries used further down may be missing here.
import kagglehub

asjad99_mimiciii_path = kagglehub.dataset_download(
    'asjad99/mimiciii',
)

print('Data source import complete.')

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# The read-only "../input/" directory holds the input data files.
# Running this cell (click run or press Shift+Enter) prints the full path of
# every file found under /kaggle/input.

import os

for folder, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

# Up to 20GB written to the current directory (/kaggle/working/) is preserved
# as output when a version is created with "Save & Run All".
# Temporary files can go to /kaggle/temp/, but they are not saved outside of
# the current session.

In [None]:
import pandas as pd

# Read the three MIMIC-III demo tables inspected in this cell
admissions = pd.read_csv(
    "/kaggle/input/mimiciii/mimic-iii-clinical-database-demo-1.4/ADMISSIONS.csv"
)
patients = pd.read_csv(
    "/kaggle/input/mimiciii/mimic-iii-clinical-database-demo-1.4/PATIENTS.csv"
)
diagnoses = pd.read_csv(
    "/kaggle/input/mimiciii/mimic-iii-clinical-database-demo-1.4/DIAGNOSES_ICD.csv"
)

# Print each table's column names, one line per table
for table_name, table in (
    ("ADMISSIONS", admissions),
    ("PATIENTS", patients),
    ("DIAGNOSES", diagnoses),
):
    print(f"{table_name} columns:", table.columns.tolist())


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the MIMIC-III demo tables used by the exploratory analysis
admissions = pd.read_csv("/kaggle/input/mimiciii/mimic-iii-clinical-database-demo-1.4/ADMISSIONS.csv")
patients = pd.read_csv("/kaggle/input/mimiciii/mimic-iii-clinical-database-demo-1.4/PATIENTS.csv")
diagnoses = pd.read_csv("/kaggle/input/mimiciii/mimic-iii-clinical-database-demo-1.4/DIAGNOSES_ICD.csv")

# Lower-case the column names so the rest of the cell uses a single spelling
admissions.columns = admissions.columns.str.lower()
patients.columns = patients.columns.str.lower()
diagnoses.columns = diagnoses.columns.str.lower()

# One row per admission, with the patient's demographic columns attached
df = pd.merge(admissions, patients, on='subject_id', how='inner')

# Parse the timestamps; values that cannot be parsed become NaT
df['admittime'] = pd.to_datetime(df['admittime'], errors='coerce')
df['dob'] = pd.to_datetime(df['dob'], errors='coerce')

# Drop rows with missing/invalid dates
df = df.dropna(subset=['admittime', 'dob'])

# Keep a plausible DOB range so the age subtraction below cannot overflow.
# The upper bound is 2200 (same as the modelling cell): MIMIC-III shifts all
# dates into the future, so a bound of 2025 would discard most admissions.
# NOTE(review): the lower bound presumably removes the patients whose DOB was
# shifted far into the past for de-identification -- confirm that excluding
# them from the EDA is intended.
dob_year = df['dob'].dt.year
# .copy() makes the filtered frame independent, so the column assignments
# below do not write into a slice of the merged frame.
df = df[dob_year.between(1900, 2200)].copy()

# Age at admission in years, limited to the range [0, 120]
age_years = (df['admittime'] - df['dob']).dt.total_seconds() / (365.25 * 24 * 3600)
df['admit_age'] = age_years.clip(lower=0, upper=120)

# Emergency admission label: 1 for 'EMERGENCY', 0 for every other type
df['is_emergency'] = (df['admission_type'] == 'EMERGENCY').astype(int)

# EDA plots

# Admission counts per type, bars ordered by frequency
type_order = df['admission_type'].value_counts().index
fig, ax = plt.subplots(figsize=(8, 4))
sns.countplot(data=df, x='admission_type', order=type_order, ax=ax)
ax.set_title("Admission Types")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Age density per admission type; each curve is normalised on its own
fig, ax = plt.subplots(figsize=(8, 5))
sns.kdeplot(
    data=df, x='admit_age', hue='admission_type',
    common_norm=False, fill=True, ax=ax,
)
ax.set_title("Age Distribution by Admission Type")
ax.set_xlabel("Age at Admission")
plt.show()

# Number of admissions per gender
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=df, x='gender', ax=ax)
ax.set_title("Gender Distribution")
plt.show()

# Mean of the 0/1 emergency flag per gender, i.e. the emergency proportion
fig, ax = plt.subplots(figsize=(6, 4))
sns.barplot(data=df, x='gender', y='is_emergency', ax=ax)
ax.set_title("Emergency Admission Rate by Gender")
ax.set_ylabel("Proportion Emergency")
plt.show()

# Count diagnosis rows per admission and attach the count to every admission;
# admissions with no diagnosis rows end up with a count of 0.
diag_counts = (
    diagnoses.groupby('hadm_id')
    .size()
    .rename('num_diag')
    .reset_index()
)
df = df.merge(diag_counts, on='hadm_id', how='left')
df['num_diag'] = df['num_diag'].fillna(0)

# Distribution of the diagnosis count
fig, ax = plt.subplots(figsize=(7, 4))
sns.histplot(df['num_diag'], bins=30, kde=True, ax=ax)
ax.set_title("Number of Diagnoses per Admission")
ax.set_xlabel("Number of Diagnoses")
plt.show()

# Diagnosis count split by the emergency flag
fig, ax = plt.subplots(figsize=(7, 5))
sns.boxplot(data=df, x='is_emergency', y='num_diag', ax=ax)
ax.set_title("Diagnoses Count by Emergency Admission Status")
ax.set_xlabel("Is Emergency")
ax.set_ylabel("Number of Diagnoses")
plt.show()

# Numeric gender code (M -> 1, F -> 0) so gender can enter the correlation
gender_codes = {'M': 1, 'F': 0}
df['gender_enc'] = df['gender'].map(gender_codes)
corr_features = df[['admit_age', 'num_diag', 'is_emergency', 'gender_enc']]

# Annotated heatmap of the pairwise correlations between those columns
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(corr_features.corr(), annot=True, cmap='coolwarm', ax=ax)
ax.set_title("Feature Correlation")
plt.tight_layout()
plt.show()


In [None]:
# ======================
# 1. Import Libraries
# ======================
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix, f1_score
from sklearn.utils.class_weight import compute_class_weight
import lightgbm as lgb
import shap
import warnings
# Suppress every warning from here on.
# NOTE(review): this also hides deprecation / FutureWarning messages raised by
# the libraries used below -- consider narrowing the filter.
warnings.filterwarnings('ignore')
# Initialise SHAP's JavaScript support for notebook output
# (presumably required by its interactive plots -- confirm).
shap.initjs()

# ======================
# 2. Load Datasets
# ======================
# Read the four MIMIC-III demo tables used by the modelling pipeline
admissions = pd.read_csv(
    "/kaggle/input/mimiciii/mimic-iii-clinical-database-demo-1.4/ADMISSIONS.csv"
)
patients = pd.read_csv(
    "/kaggle/input/mimiciii/mimic-iii-clinical-database-demo-1.4/PATIENTS.csv"
)
diagnoses = pd.read_csv(
    "/kaggle/input/mimiciii/mimic-iii-clinical-database-demo-1.4/DIAGNOSES_ICD.csv"
)
labs = pd.read_csv(
    "/kaggle/input/mimiciii/mimic-iii-clinical-database-demo-1.4/LABEVENTS.csv"
)

# Lower-case the column names of every table so later code uses one spelling
for table in (admissions, patients, diagnoses, labs):
    table.columns = table.columns.str.lower()

# ======================
# 3. Merge & Feature Engineering
# ======================
# One row per admission, with the patient's demographic columns attached
df = pd.merge(admissions, patients, on='subject_id', how='inner')

# Convert date columns; values that cannot be parsed become NaT
for date_col in ('admittime', 'dischtime', 'dob'):
    df[date_col] = pd.to_datetime(df[date_col], errors='coerce')

# Drop invalid dates
df = df.dropna(subset=['admittime', 'dob'])

# Keep plausible DOB range. .copy() makes the filtered frame independent so
# the column assignments below do not write into a slice of the merged frame.
df = df[(df['dob'].dt.year >= 1900) & (df['dob'].dt.year <= 2200)].copy()

# Emergency admission label: 1 for 'EMERGENCY', 0 for every other type
df['emergency_label'] = (df['admission_type'] == 'EMERGENCY').astype(int)

# Age at admission in years, limited to the range [0, 120]
age_years = (df['admittime'] - df['dob']).dt.total_seconds() / (365.25 * 24 * 3600)
df['admit_age'] = age_years.clip(lower=0, upper=120)

# Length of stay (in days); negative durations are raised to 0
stay_days = (df['dischtime'] - df['admittime']).dt.total_seconds() / (3600 * 24)
df['length_of_stay'] = stay_days.clip(lower=0)

# Admission month and season: months 12, 1, 2 -> 1; 3-5 -> 2; 6-8 -> 3; 9-11 -> 4
df['admit_month'] = df['admittime'].dt.month
df['season'] = df['admit_month'] % 12 // 3 + 1

# Time of day: is_night is 1 when the admission hour is before 07 or after 20
df['hour'] = df['admittime'].dt.hour
df['is_night'] = ((df['hour'] < 7) | (df['hour'] > 20)).astype(int)

# Diagnosis count per admission; admissions without diagnosis rows get 0.
# The result is assigned back instead of using fillna(inplace=True) on a
# column selection, which is chained assignment and does not update df under
# pandas Copy-on-Write.
diag_count = diagnoses.groupby('hadm_id').size().reset_index(name='num_diag')
df = pd.merge(df, diag_count, on='hadm_id', how='left')
df['num_diag'] = df['num_diag'].fillna(0)

# Lab event count per admission; admissions without lab rows get 0
lab_count = labs.groupby('hadm_id').size().reset_index(name='num_labs')
df = pd.merge(df, lab_count, on='hadm_id', how='left')
df['num_labs'] = df['num_labs'].fillna(0)

# NOTE(review): length_of_stay, num_diag and num_labs are computed from the
# discharge time and from records attached to the admission; confirm they are
# available at prediction time, otherwise they leak outcome information.

# Encode gender as integer codes
df['gender'] = LabelEncoder().fit_transform(df['gender'])

# ======================
# 4. Prepare Data
# ======================
# Columns used as model inputs
features = [
    'gender', 'admit_age', 'num_diag', 'num_labs',
    'length_of_stay', 'season', 'is_night'
]

X = df[features]
y = df['emergency_label']

# Train/test split: 25% held out, stratified so both parts keep the label ratio
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.25, random_state=42
)

# Compute balanced class weights from the training labels.
# Each weight is paired with its class via zip rather than by using the class
# label as an array index, which is only valid when the labels are exactly
# 0..k-1 (e.g. it raises IndexError if a class is absent).
classes = np.unique(y_train)
class_weights = compute_class_weight('balanced', classes=classes, y=y_train)
class_weights_dict = dict(zip(classes, class_weights))

# ======================
# 5. Train Optimized LightGBM Model
# ======================
# Candidate values sampled by the randomized search
params = {
    'num_leaves': [31, 50, 70],
    'max_depth': [-1, 10, 20],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 200, 500],
    'subsample': [0.7, 0.9, 1.0],
    'colsample_bytree': [0.7, 0.9, 1.0],
}

# subsample_freq=1 turns row subsampling on; with LightGBM's default of 0 the
# 'subsample' values searched above would be ignored.
lgb_est = lgb.LGBMClassifier(
    random_state=42, class_weight=class_weights_dict, subsample_freq=1
)
# random_state fixes which parameter combinations are sampled, so the search
# (and therefore the selected model) is reproducible between runs.
search = RandomizedSearchCV(
    lgb_est, param_distributions=params, n_iter=30,
    cv=3, scoring='roc_auc', verbose=1, n_jobs=-1,
    random_state=42
)
search.fit(X_train, y_train)
# Best configuration (by cross-validated ROC AUC) from the search
model = search.best_estimator_

# ======================
# 6. Predict & Evaluate
# ======================
# Positive-class probabilities (second column) and hard labels on the test set
y_prob = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)

# Compute every test-set metric first, then print them together
report_text = classification_report(y_test, y_pred)
conf_mat = confusion_matrix(y_test, y_pred)
test_f1 = f1_score(y_test, y_pred)
test_auc = roc_auc_score(y_test, y_prob)

print("Classification Report:\n", report_text)
print(f"Confusion Matrix:\n{conf_mat}")
print(f"F1 Score: {test_f1:.4f}")
print(f"AUC Score: {test_auc:.4f}")

# ======================
# 7. Feature Importance
# ======================
# Gain-based importance chart; the title is set on the axes returned by
# LightGBM's plotting helper.
importance_ax = lgb.plot_importance(model, importance_type='gain', figsize=(10, 6))
importance_ax.set_title("LightGBM Feature Importance (Gain)")
plt.tight_layout()
plt.show()

# ======================
# 8. SHAP Explainability
# ======================
# Build a SHAP explainer for the tuned model. X_train is passed as the second
# argument (presumably used by shap as background/masker data -- confirm
# against the installed shap version).
explainer = shap.Explainer(model, X_train)
# Calling the explainer on the held-out features produces the SHAP values
# consumed by the two plots below.
shap_values = explainer(X_test)

# SHAP summary
shap.summary_plot(shap_values, X_test)

# SHAP waterfall plot for first test case
shap.plots.waterfall(shap_values[0])

# ======================
# 9. Correlation Heatmap
# ======================
# Pairwise correlations between the model input columns
feature_corr = X.corr()

fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(feature_corr, annot=True, cmap='coolwarm', ax=ax)
ax.set_title("Feature Correlation Heatmap")
plt.tight_layout()
plt.show()


In [None]:
from sklearn.metrics import f1_score

# Candidate decision thresholds: 0.10, 0.15, ..., 0.90
thresholds = np.arange(0.1, 0.91, 0.05)

# F1 on the test set when probabilities >= cutoff are labelled positive
f1_scores = [
    f1_score(y_test, (y_prob >= cutoff).astype(int))
    for cutoff in thresholds
]

# Line chart of F1 against the threshold
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(thresholds, f1_scores, marker='o', linestyle='-', color='darkgreen')
ax.set_title('F1 Score vs Classification Threshold')
ax.set_xlabel('Threshold')
ax.set_ylabel('F1 Score')
ax.set_xticks(thresholds)
ax.grid(True)
plt.tight_layout()
plt.show()


In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score
import pandas as pd

# Candidate decision thresholds: 0.10, 0.15, ..., 0.90
thresholds = np.arange(0.1, 0.91, 0.05)


def _metrics_at(cutoff):
    """Return the rounded precision/recall/F1 table row for one cutoff."""
    predicted = (y_prob >= cutoff).astype(int)
    return {
        'Threshold': round(cutoff, 2),
        'Precision': round(precision_score(y_test, predicted), 4),
        'Recall': round(recall_score(y_test, predicted), 4),
        'F1-Score': round(f1_score(y_test, predicted), 4),
    }


# One row of metrics per threshold, then the same rows as a DataFrame
metrics_list = [_metrics_at(cutoff) for cutoff in thresholds]
metrics_df = pd.DataFrame(metrics_list)

# Print the table without the index column
print("📊 Precision, Recall, F1-Score at Various Thresholds:")
print(metrics_df.to_string(index=False))


In [None]:
from sklearn.metrics import roc_auc_score, roc_curve
import matplotlib.pyplot as plt

# AUC and ROC curve points from the test-set probabilities
auc_score = roc_auc_score(y_test, y_prob)
fpr, tpr, thresholds = roc_curve(y_test, y_prob)

# ROC curve with the diagonal drawn as a dashed reference line
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, label=f"AUC = {auc_score:.4f}", color='darkorange', linewidth=2)
ax.plot([0, 1], [0, 1], linestyle='--', color='gray')
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curve (Receiver Operating Characteristic)")
ax.legend(loc="lower right")
ax.grid(True)
plt.tight_layout()
plt.show()


In [None]:
# Print the ROC AUC value (auc_score, set in the ROC cell) to four decimals
print(f"✅ ROC AUC Score: {auc_score:.4f}")


In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import seaborn as sns
import matplotlib.pyplot as plt

# Tick labels for the two classes, then the confusion matrix of the test
# labels against the model's hard predictions
labels = ['Non-Emergency', 'Emergency']
cm = confusion_matrix(y_test, y_pred)

# Annotated heatmap with integer cell counts
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(
    cm, annot=True, fmt='d', cmap='Blues',
    xticklabels=labels, yticklabels=labels, ax=ax,
)
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
ax.set_title('Confusion Matrix')
plt.tight_layout()
plt.show()


In [None]:
# Print the raw confusion-matrix array (cm, computed in the previous cell)
print("Confusion Matrix (Raw Values):")
print(cm)
