<a href="https://colab.research.google.com/github/Gajeshgif/Gajesh/blob/main/XAI_for_Emergency_Admission_Predi.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub
# Download the dataset identified by the handle 'asjad99/mimiciii' and keep the
# return value (presumably a local path, going by the variable name).
# NOTE(review): the cells visible below never reference this variable; they read
# from hard-coded "/kaggle/input/mimiciii/..." paths instead. Confirm that
# directory exists in the runtime this notebook is executed in.
asjad99_mimiciii_path = kagglehub.dataset_download('asjad99/mimiciii')

# Simple progress marker so the cell output shows the download step finished.
print('Data source import complete.')

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Lazily build the full path of every file found anywhere beneath the input
# directory, then print them one per line in traversal order.
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd

# Read the three demo tables; the paths are consumed in the order
# admissions, patients, diagnoses.
admissions, patients, diagnoses = map(
    pd.read_csv,
    (
        "/kaggle/input/mimiciii/mimic-iii-clinical-database-demo-1.4/ADMISSIONS.csv",
        "/kaggle/input/mimiciii/mimic-iii-clinical-database-demo-1.4/PATIENTS.csv",
        "/kaggle/input/mimiciii/mimic-iii-clinical-database-demo-1.4/DIAGNOSES_ICD.csv",
    ),
)

# Report the raw (as-loaded) column names of each table.
for heading, table in (
    ("ADMISSIONS columns:", admissions),
    ("PATIENTS columns:", patients),
    ("DIAGNOSES columns:", diagnoses),
):
    print(heading, list(table.columns))


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load datasets
admissions = pd.read_csv("/kaggle/input/mimiciii/mimic-iii-clinical-database-demo-1.4/ADMISSIONS.csv")
patients = pd.read_csv("/kaggle/input/mimiciii/mimic-iii-clinical-database-demo-1.4/PATIENTS.csv")
diagnoses = pd.read_csv("/kaggle/input/mimiciii/mimic-iii-clinical-database-demo-1.4/DIAGNOSES_ICD.csv")

# Ensure lowercase columns so the joins and lookups below can use lowercase keys
admissions.columns = admissions.columns.str.lower()
patients.columns = patients.columns.str.lower()
diagnoses.columns = diagnoses.columns.str.lower()

# Merge: one row per admission, enriched with that patient's record
df = pd.merge(admissions, patients, on='subject_id', how='inner')

# Convert to datetime; unparseable values become NaT and are dropped next
df['admittime'] = pd.to_datetime(df['admittime'], errors='coerce')
df['dob'] = pd.to_datetime(df['dob'], errors='coerce')

# Drop rows with missing/invalid dates
df = df.dropna(subset=['admittime', 'dob'])

# Keep DOB years in [1900, 2200]. The lower bound removes date outliers so the
# subtraction below cannot overflow. The upper bound matches the modelling cell
# of this notebook, which allows synthetic dates up to year 2200; the previous
# cap of 2025 discarded every admission whose synthetic DOB falls after 2025.
# .copy() gives an independent frame so the column assignments below do not
# operate on a slice of the unfiltered data.
df = df[(df['dob'].dt.year >= 1900) & (df['dob'].dt.year <= 2200)].copy()

# Calculate admit age safely (in years), bounded to a plausible human range
df['admit_age'] = (df['admittime'] - df['dob']).dt.total_seconds() / (365.25 * 24 * 3600)
df['admit_age'] = df['admit_age'].clip(lower=0, upper=120)

# Emergency admission label: 1 when admission_type is exactly 'EMERGENCY', else 0
# (vectorized comparison; missing admission types compare False and map to 0)
df['is_emergency'] = (df['admission_type'] == 'EMERGENCY').astype(int)

# EDA plots

# 1) Admission-type frequencies, bars ordered from most to least common.
type_order = df['admission_type'].value_counts().index
fig, ax = plt.subplots(figsize=(8, 4))
sns.countplot(data=df, x='admission_type', order=type_order, ax=ax)
ax.set_title("Admission Types")
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()

# 2) Age density per admission type; each curve is normalised on its own.
fig, ax = plt.subplots(figsize=(8, 5))
sns.kdeplot(
    data=df,
    x='admit_age',
    hue='admission_type',
    common_norm=False,
    fill=True,
    ax=ax,
)
ax.set_title("Age Distribution by Admission Type")
ax.set_xlabel("Age at Admission")
plt.show()

# 3) Row counts per gender.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=df, x='gender', ax=ax)
ax.set_title("Gender Distribution")
plt.show()

# 4) Mean of the 0/1 emergency flag per gender, i.e. the emergency proportion.
fig, ax = plt.subplots(figsize=(6, 4))
sns.barplot(data=df, x='gender', y='is_emergency', ax=ax)
ax.set_title("Emergency Admission Rate by Gender")
ax.set_ylabel("Proportion Emergency")
plt.show()

# Count diagnosis rows per admission and attach the count to each admission;
# admissions without any diagnosis row get a count of 0.
diag_counts = (
    diagnoses
    .groupby('hadm_id')
    .size()
    .reset_index(name='num_diag')
)
df = pd.merge(df, diag_counts, on='hadm_id', how='left')
df['num_diag'] = df['num_diag'].fillna(0)

# Distribution of the per-admission diagnosis count.
fig, ax = plt.subplots(figsize=(7, 4))
sns.histplot(df['num_diag'], bins=30, kde=True, ax=ax)
ax.set_title("Number of Diagnoses per Admission")
ax.set_xlabel("Number of Diagnoses")
plt.show()

# Diagnosis count split by the emergency flag.
fig, ax = plt.subplots(figsize=(7, 5))
sns.boxplot(data=df, x='is_emergency', y='num_diag', ax=ax)
ax.set_title("Diagnoses Count by Emergency Admission Status")
ax.set_xlabel("Is Emergency")
ax.set_ylabel("Number of Diagnoses")
plt.show()

# Numeric gender code (M -> 1, F -> 0) so gender can enter the correlation matrix.
gender_codes = {'M': 1, 'F': 0}
df['gender_enc'] = df['gender'].map(gender_codes)
corr_features = df[['admit_age', 'num_diag', 'is_emergency', 'gender_enc']]

# Pairwise correlations between the numeric features and the label.
corr_matrix = corr_features.corr()
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title("Feature Correlation")
fig.tight_layout()
plt.show()


In [None]:
# ======================
# 1. Import Libraries
# ======================
# Data handling and plotting, scikit-learn utilities for splitting/encoding/
# scoring, LightGBM for the classifier, and SHAP for the explanations below.
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, roc_auc_score
import lightgbm as lgb
import shap
import warnings
# Suppress every warning from here on.
# NOTE(review): this is a blanket filter, so library deprecation/future
# warnings raised by later statements in this cell are hidden as well —
# consider narrowing it to specific categories.
warnings.filterwarnings('ignore')
# Initialise SHAP's JavaScript support (presumably needed for its interactive
# notebook visualisations — confirm against the SHAP docs).
shap.initjs()

# ======================
# 2. Load Datasets
# ======================
# Paths are consumed in the order: admissions, patients, diagnoses, labs.
# The lab-events table is read in full (no row limit).
admissions, patients, diagnoses, labs = map(
    pd.read_csv,
    (
        "/kaggle/input/mimiciii/mimic-iii-clinical-database-demo-1.4/ADMISSIONS.csv",
        "/kaggle/input/mimiciii/mimic-iii-clinical-database-demo-1.4/PATIENTS.csv",
        "/kaggle/input/mimiciii/mimic-iii-clinical-database-demo-1.4/DIAGNOSES_ICD.csv",
        "/kaggle/input/mimiciii/mimic-iii-clinical-database-demo-1.4/LABEVENTS.csv",
    ),
)

# Standardize column names: lowercase every header on all four tables.
for table in (admissions, patients, diagnoses, labs):
    table.columns = table.columns.str.lower()

# ======================
# 3. Merge & Feature Engineering
# ======================
# One row per admission, enriched with that patient's record.
df = pd.merge(admissions, patients, on='subject_id', how='inner')

# Convert datetime columns; unparseable values become NaT
df['admittime'] = pd.to_datetime(df['admittime'], errors='coerce')
df['dob'] = pd.to_datetime(df['dob'], errors='coerce')

# Drop rows with invalid dates
df = df.dropna(subset=['admittime', 'dob'])

# Relaxed DOB filtering to allow synthetic dates up to year 2200.
# .copy() gives an independent frame so the column assignments below do not
# operate on a slice of the unfiltered data.
df = df[(df['dob'].dt.year >= 1900) & (df['dob'].dt.year <= 2200)].copy()

# Check if enough rows remain
if df.shape[0] < 10:
    raise ValueError(f"Too few rows after filtering. Only {df.shape[0]} rows.")

# Target variable: emergency admission = 1 else 0
# (vectorized comparison; missing admission types compare False and map to 0)
df['emergency_label'] = (df['admission_type'] == 'EMERGENCY').astype(int)

# Calculate age at admission (years), bounded to a plausible human range
df['admit_age'] = (df['admittime'] - df['dob']).dt.total_seconds() / (365.25 * 24 * 3600)
df['admit_age'] = df['admit_age'].clip(lower=0, upper=120)

# NOTE(review): the two count features below are derived from records attached
# to the admission itself — confirm they are available at prediction time.

# Diagnosis count per hospital admission (0 when the admission has none).
# The fill is assigned back rather than done with
# `df['num_diag'].fillna(0, inplace=True)`: that form fills an intermediate
# Series, which pandas warns about and which leaves `df` untouched when
# Copy-on-Write is enabled — so NaNs would reach the model unnoticed.
diag_count = diagnoses.groupby('hadm_id').size().reset_index(name='num_diag')
df = pd.merge(df, diag_count, on='hadm_id', how='left')
df['num_diag'] = df['num_diag'].fillna(0)

# Lab events count per hospital admission (0 when the admission has none)
lab_count = labs.groupby('hadm_id').size().reset_index(name='num_labs')
df = pd.merge(df, lab_count, on='hadm_id', how='left')
df['num_labs'] = df['num_labs'].fillna(0)

# Encode gender as numeric (integer codes assigned by LabelEncoder)
df['gender'] = LabelEncoder().fit_transform(df['gender'])

# ======================
# 4. Prepare Data
# ======================
# Model inputs are the four engineered columns; the target is the 0/1 label.
features = ['gender', 'admit_age', 'num_diag', 'num_labs']
X, y = df[features], df['emergency_label']

# A split is impossible with fewer than two samples, so fail early and clearly.
if X.shape[0] < 2:
    raise ValueError(f"Not enough samples to train/test split: only {X.shape[0]} rows.")

# Hold out 25% for testing, stratified on the label, with a fixed seed.
split_options = dict(stratify=y, test_size=0.25, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# ======================
# 5. Train LightGBM Model
# ======================
# Default hyper-parameters apart from the fixed seed.
model = lgb.LGBMClassifier(random_state=42)
model.fit(X_train, y_train)

# Hard class predictions plus the predicted probability of the positive class
# (column 1 of predict_proba) for the held-out set.
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

# Per-class precision/recall/F1 first, then ROC AUC from the probabilities.
print("Classification Report:\n", classification_report(y_test, y_pred))
print(f"AUC Score: {roc_auc_score(y_test, y_prob):.4f}")

# ======================
# 6. Feature Importance Plot
# ======================
# Gain-based importance as reported by the fitted LightGBM model.
importance_ax = lgb.plot_importance(model, importance_type='gain', figsize=(10, 6))
importance_ax.set_title("LightGBM Feature Importance (Gain)")
plt.tight_layout()
plt.show()

# ======================
# 7. SHAP XAI Visualization
# ======================
# The explainer is built with the training features as its second argument and
# is then evaluated on the held-out rows.
explainer = shap.Explainer(model, X_train)
shap_values = explainer(X_test)

# Summary plot across all test rows.
shap.summary_plot(shap_values, X_test)

# Waterfall plot for the first test instance.
first_explanation = shap_values[0]
shap.plots.waterfall(first_explanation)

# ======================
# 8. Correlation Heatmap
# ======================
# Pairwise correlations between the model's input features.
feature_corr = X.corr()
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(feature_corr, annot=True, cmap='coolwarm', ax=ax)
ax.set_title("Feature Correlation Heatmap")
fig.tight_layout()
plt.show()
