In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import (classification_report, confusion_matrix, 
                             roc_auc_score, roc_curve, accuracy_score, 
                             precision_recall_curve, auc, f1_score, 
                             precision_score, recall_score, average_precision_score,
                             brier_score_loss, log_loss)
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.impute import SimpleImputer
from sklearn.calibration import calibration_curve, CalibratedClassifierCV
from imblearn.over_sampling import SMOTE
from collections import Counter
import scipy.stats as stats
import warnings
warnings.filterwarnings('ignore')

# --------------------------
# 1. Data Loading & Initial Prep
# --------------------------
print("Loading data and initial preprocessing...")
df = pd.read_csv('mimic3c.csv')

# Create target variable
df['expired'] = df['ExpiredHospital'].astype(int)

# Select key features based on Kaggle notebook
selected_features = [
    'age', 'gender', 'LOSdays', 'NumChartEvents', 'NumNotes', 'NumProcs',
    'NumLabs', 'NumMicroLabs', 'NumRx', 'admit_type', 'admit_location',
    'insurance', 'marital_status', 'ethnicity', 'HeartRate_Min', 'HeartRate_Max',
    'HeartRate_Mean', 'SysBP_Min', 'SysBP_Max', 'SysBP_Mean', 'DiasBP_Min',
    'DiasBP_Max', 'DiasBP_Mean', 'MeanBP_Min', 'MeanBP_Max', 'MeanBP_Mean',
    'RespRate_Min', 'RespRate_Max', 'RespRate_Mean', 'TempC_Min', 'TempC_Max',
    'TempC_Mean', 'SpO2_Min', 'SpO2_Max', 'SpO2_Mean', 'Glucose_Min',
    'Glucose_Max', 'Glucose_Mean', 'expired'
]

df = df[selected_features]

print(f"Initial target distribution:\n{df['expired'].value_counts()}")
print(f"Class ratio: {df['expired'].mean():.4f}")

# --------------------------
# 2. Data Cleaning & Preprocessing
# --------------------------
print("\nPerforming data cleaning...")

# Handle missing values - simpler imputation
numeric_cols = df.select_dtypes(include=np.number).columns.tolist()
categorical_cols = ['gender', 'admit_type', 'admit_location', 'insurance', 'marital_status', 'ethnicity']

# Impute numerical features with median
for col in numeric_cols:
    df[col].fillna(df[col].median(), inplace=True)
    
# Impute categorical features with mode
for col in categorical_cols:
    df[col].fillna(df[col].mode()[0], inplace=True)

# Encode categorical variables
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Feature transformations
skewed_features = ['NumChartEvents', 'NumNotes', 'NumProcs', 'NumLabs', 'NumMicroLabs', 'NumRx']
for feature in skewed_features:
    df[feature] = np.log1p(df[feature])

# --------------------------
# 3. Feature Selection
# --------------------------
X = df.drop(columns=['expired'])
y = df['expired']

# Select top 30 features using ANOVA F-value
selector = SelectKBest(f_classif, k=30)
X_selected = selector.fit_transform(X, y)
selected_mask = selector.get_support()
selected_features = X.columns[selected_mask]

print("\nSelected features:")
print(selected_features.tolist())

X = pd.DataFrame(X_selected, columns=selected_features)

# --------------------------
# 4. Train-Test Split & Balancing
# --------------------------
# Split data before any balancing
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

print("\nClass distribution before balancing:")
print(f"Training: {Counter(y_train)}")
print(f"Testing: {Counter(y_test)}")

# Apply SMOTE only to training data
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

print("\nClass distribution after SMOTE:")
print(f"Training: {Counter(y_train_res)}")

# Scale features
scaler = StandardScaler()
X_train_res = scaler.fit_transform(X_train_res)
X_test = scaler.transform(X_test)

# --------------------------
# 5. Model Training & Evaluation
# --------------------------
models = {
    "Logistic Regression": LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42),
    "Random Forest": RandomForestClassifier(class_weight='balanced', random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42)
}

# Evaluate models
results = []
plt.figure(figsize=(10, 8))
plt.plot([0, 1], [0, 1], 'k--', label='Random')

for name, model in models.items():
    print(f"\n--- Training {name} ---")
    model.fit(X_train_res, y_train_res)
    
    if hasattr(model, "predict_proba"):
        y_proba = model.predict_proba(X_test)[:, 1]
    else:
        y_proba = model.decision_function(X_test)
    
    y_pred = model.predict(X_test)
    
    # Calculate metrics
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_proba)
    pr_auc = average_precision_score(y_test, y_proba)
    logloss = log_loss(y_test, y_proba)
    brier = brier_score_loss(y_test, y_proba)
    
    results.append({
        "Model": name,
        "Accuracy": acc,
        "F1": f1,
        "Precision": precision,
        "Recall": recall,
        "ROC AUC": roc_auc,
        "PR AUC": pr_auc,
        "Log Loss": logloss,
        "Brier Score": brier
    })
    
    # ROC Curve
    fpr, tpr, _ = roc_curve(y_test, y_proba)
    plt.plot(fpr, tpr, label=f'{name} (AUC = {roc_auc:.3f})')
    
    # Precision-Recall Curve
    precision_curve, recall_curve, _ = precision_recall_curve(y_test, y_proba)
    plt.figure()
    plt.plot(recall_curve, precision_curve, label=name)
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title(f'Precision-Recall Curve: {name}')
    plt.legend()
    plt.tight_layout()
    plt.savefig(f'pr_curve_{name.lower().replace(" ", "_")}.png', dpi=300)
    plt.close()
    
    # Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    plt.figure()
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
                xticklabels=['Survived', 'Died'], 
                yticklabels=['Survived', 'Died'])
    plt.title(f'Confusion Matrix: {name}')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.tight_layout()
    plt.savefig(f'confusion_matrix_{name.lower().replace(" ", "_")}.png', dpi=300)
    plt.close()
    
    print(f"\n{name} Performance:")
    print(classification_report(y_test, y_pred))
    print(f"ROC AUC: {roc_auc:.4f}, PR AUC: {pr_auc:.4f}")
    print(f"Log Loss: {logloss:.4f}, Brier Score: {brier:.4f}")

# ROC Curve for all models
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves: Model Comparison')
plt.legend(loc='lower right')
plt.grid(True)
plt.tight_layout()
plt.savefig('roc_curves_comparison.png', dpi=300)
plt.show()

# Results comparison
results_df = pd.DataFrame(results)
print("\nModel Comparison:")
print(results_df)

# --------------------------
# 6. Hyperparameter Tuning (Logistic Regression)
# --------------------------
print("\nTuning Logistic Regression...")
param_grid = {
    'C': np.logspace(-3, 3, 7),
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear']
}

lr = LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)
lr_tuner = GridSearchCV(
    lr, param_grid, scoring='roc_auc', 
    cv=StratifiedKFold(5, shuffle=True, random_state=42),
    n_jobs=-1, verbose=1
)
lr_tuner.fit(X_train_res, y_train_res)

best_lr = lr_tuner.best_estimator_
print(f"Best parameters: {lr_tuner.best_params_}")
print(f"Best ROC AUC: {lr_tuner.best_score_:.4f}")

# Evaluate best model
y_proba_lr = best_lr.predict_proba(X_test)[:, 1]
y_pred_lr = best_lr.predict(X_test)

print("\nTuned Logistic Regression Performance:")
print(classification_report(y_test, y_pred_lr))
print(f"ROC AUC: {roc_auc_score(y_test, y_proba_lr):.4f}")
print(f"PR AUC: {average_precision_score(y_test, y_proba_lr):.4f}")

# --------------------------
# 7. Model Calibration
# --------------------------
print("\nCalibrating model...")
calibrated_lr = CalibratedClassifierCV(best_lr, method='isotonic', cv=5)
calibrated_lr.fit(X_train_res, y_train_res)

y_proba_cal = calibrated_lr.predict_proba(X_test)[:, 1]

# Reliability curve
fraction_of_positives, mean_predicted_value = calibration_curve(
    y_test, y_proba_cal, n_bins=10
)

plt.figure(figsize=(8, 6))
plt.plot(mean_predicted_value, fraction_of_positives, "s-", label="Calibrated LR")
plt.plot([0, 1], [0, 1], "k:", label="Perfectly calibrated")
plt.xlabel("Mean predicted value")
plt.ylabel("Fraction of positives")
plt.title("Calibration Curve")
plt.legend()
plt.tight_layout()
plt.savefig('calibration_curve.png', dpi=300)
plt.show()

# Compare calibration scores
print(f"Brier score (uncalibrated): {brier_score_loss(y_test, y_proba_lr):.4f}")
print(f"Brier score (calibrated): {brier_score_loss(y_test, y_proba_cal):.4f}")

# --------------------------
# 8. Feature Importance
# --------------------------
# Logistic Regression coefficients
coefs = pd.Series(best_lr.coef_[0], index=selected_features)
sorted_coefs = coefs.abs().sort_values(ascending=False)[:20]

plt.figure(figsize=(12, 8))
sns.barplot(x=sorted_coefs.values, y=sorted_coefs.index, palette='viridis')
plt.title('Top 20 Predictive Features (Logistic Regression)', fontsize=14)
plt.xlabel('Absolute Coefficient Value')
plt.tight_layout()
plt.savefig('feature_importance.png', dpi=300)
plt.show()

print("\nTop predictive features:")
print(sorted_coefs)

# --------------------------
# 9. Final Evaluation Report
# --------------------------
from sklearn.metrics import precision_recall_fscore_support

def evaluate_model(y_true, y_pred, y_proba, model_name):
    """Generate comprehensive evaluation report"""
    # Classification metrics
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average='binary'
    )
    roc_auc = roc_auc_score(y_true, y_proba)
    pr_auc = average_precision_score(y_true, y_proba)
    accuracy = accuracy_score(y_true, y_pred)
    
    # Calibration metrics
    logloss = log_loss(y_true, y_proba)
    brier = brier_score_loss(y_true, y_proba)
    
    # Confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    tn, fp, fn, tp = cm.ravel()
    
    # Sensitivity and specificity
    sensitivity = tp / (tp + fn)
    specificity = tn / (tn + fp)
    
    return {
        'Model': model_name,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1': f1,
        'ROC AUC': roc_auc,
        'PR AUC': pr_auc,
        'Sensitivity': sensitivity,
        'Specificity': specificity,
        'Log Loss': logloss,
        'Brier Score': brier,
        'TP': tp,
        'FP': fp,
        'TN': tn,
        'FN': fn
    }

# Evaluate both models
final_results = [
    evaluate_model(y_test, y_pred_lr, y_proba_lr, "Logistic Regression (Uncalibrated)"),
    evaluate_model(y_test, calibrated_lr.predict(X_test), y_proba_cal, "Logistic Regression (Calibrated)")
]

# Create final report
final_report = pd.DataFrame(final_results)
print("\nFinal Model Evaluation:")
print(final_report)

# Save results to CSV
final_report.to_csv('model_evaluation_results.csv', index=False)
print("\nAnalysis complete! All results saved.")

Loading data and initial preprocessing...


KeyError: "['HeartRate_Min', 'HeartRate_Max', 'HeartRate_Mean', 'SysBP_Min', 'SysBP_Max', 'SysBP_Mean', 'DiasBP_Min', 'DiasBP_Max', 'DiasBP_Mean', 'MeanBP_Min', 'MeanBP_Max', 'MeanBP_Mean', 'RespRate_Min', 'RespRate_Max', 'RespRate_Mean', 'TempC_Min', 'TempC_Max', 'TempC_Mean', 'SpO2_Min', 'SpO2_Max', 'SpO2_Mean', 'Glucose_Min', 'Glucose_Max', 'Glucose_Mean'] not in index"

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, confusion_matrix,
    ConfusionMatrixDisplay, roc_curve
)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier



# === Load Dataset ===
df = pd.read_csv('mimic3c.csv')

# === Define Target and Potential Features ===
target_col = 'expired'  # Adjust if your target column has a different name
potential_features = [
    'age', 'gender', 'LOSdays', 'NumChartEvents', 'NumNotes', 'NumProcs',
    'NumLabs', 'NumMicroLabs', 'NumRx', 'admit_type', 'admit_location',
    'HeartRate_Min', 'HeartRate_Max', 'HeartRate_Mean', 'SysBP_Min',
    'SysBP_Max', 'SysBP_Mean', 'DiasBP_Min', 'DiasBP_Max', 'DiasBP_Mean',
    'MeanBP_Min', 'MeanBP_Max', 'MeanBP_Mean', 'RespRate_Min',
    'RespRate_Max', 'RespRate_Mean', 'TempC_Min', 'TempC_Max', 'TempC_Mean',
    'SpO2_Min', 'SpO2_Max', 'SpO2_Mean', 'Glucose_Min', 'Glucose_Max',
    'Glucose_Mean'
]

# === Keep Only Available Features ===
available_features = [col for col in potential_features if col in df.columns]
missing_features = list(set(potential_features) - set(available_features))
print(f"Using {len(available_features)} features.")
if missing_features:
    print("Missing columns (skipped):", missing_features)

# Ensure target exists
if target_col not in df.columns:
    raise ValueError(f"Target column '{target_col}' not found in dataset.")

# === Subset the Data ===
df = df[available_features + [target_col]]
X = df.drop(columns=[target_col])
y = df[target_col]

# === Impute and Scale ===
imputer = SimpleImputer(strategy='mean')
X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# === Split ===
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# === Define Classifiers ===
models = {
    'LogisticRegression': LogisticRegression(max_iter=1000),
    'RandomForest': RandomForestClassifier(n_estimators=100, random_state=42),
    'DecisionTree': DecisionTreeClassifier(random_state=42),
    'GaussianNB': GaussianNB(),
    'KNN': KNeighborsClassifier(),
    'SVM': SVC(probability=True),
    'SGD': SGDClassifier(random_state=42)
}

# === Train and Evaluate ===
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    probas = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else model.decision_function(X_test)
    
    results[name] = {
        'accuracy': accuracy_score(y_test, preds),
        'precision': precision_score(y_test, preds),
        'recall': recall_score(y_test, preds),
        'f1': f1_score(y_test, preds),
        'roc_auc': roc_auc_score(y_test, probas)
    }

# === Show Performance ===
res_df = pd.DataFrame(results).T.sort_values('roc_auc', ascending=False)
print("\nModel Performance:")
print(res_df)

# === Best Model: Random Forest Feature Importance ===
rf = models['RandomForest']
importances = rf.feature_importances_
feat_names = X.columns
sorted_idx = np.argsort(importances)[-15:]

plt.figure(figsize=(10, 6))
plt.barh(feat_names[sorted_idx], importances[sorted_idx])
plt.title("Top 15 Important Features (Random Forest)")
plt.tight_layout()
plt.show()

# === ROC Curve for Random Forest ===
y_prob = rf.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_prob)
plt.plot(fpr, tpr, label=f'ROC AUC = {roc_auc_score(y_test, y_prob):.3f}')
plt.plot([0, 1], [0, 1], '--', color='gray')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve - Random Forest')
plt.legend()
plt.show()

# === Confusion Matrix ===
ConfusionMatrixDisplay.from_estimator(rf, X_test, y_test)
plt.title("Confusion Matrix - Random Forest")
plt.show()

# === Deep Neural Network ===
nn = Sequential([
    Dense(256, activation='relu', input_dim=X_train.shape[1]),
    Dropout(0.5),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])
nn.compile(optimizer=Adam(1e-4), loss='binary_crossentropy', metrics=['accuracy'])

history = nn.fit(
    X_train, y_train, validation_split=0.2,
    epochs=30, batch_size=128, verbose=0
)

# Plot training history
plt.plot(history.history['loss'], label='train loss')
plt.plot(history.history['val_loss'], label='val loss')
plt.title("NN Training Loss")
plt.legend()
plt.show()

# Evaluate NN
y_pred_nn = (nn.predict(X_test).flatten() > 0.5).astype(int)
y_prob_nn = nn.predict(X_test).flatten()

print("\nNeural Network Metrics:")
print("Accuracy:", accuracy_score(y_test, y_pred_nn))
print("Precision:", precision_score(y_test, y_pred_nn))
print("Recall:", recall_score(y_test, y_pred_nn))
print("F1 Score:", f1_score(y_test, y_pred_nn))
print("ROC AUC:", roc_auc_score(y_test, y_prob_nn))


Using 11 features.
Missing columns (skipped): ['Glucose_Mean', 'RespRate_Min', 'HeartRate_Min', 'SysBP_Mean', 'Glucose_Min', 'RespRate_Mean', 'SpO2_Max', 'SpO2_Mean', 'TempC_Min', 'HeartRate_Mean', 'DiasBP_Max', 'TempC_Max', 'HeartRate_Max', 'DiasBP_Min', 'DiasBP_Mean', 'SysBP_Max', 'MeanBP_Min', 'MeanBP_Mean', 'SysBP_Min', 'RespRate_Max', 'SpO2_Min', 'Glucose_Max', 'MeanBP_Max', 'TempC_Mean']


ValueError: Target column 'expired' not found in dataset.

In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, confusion_matrix, roc_curve
)
from sklearn.ensemble import RandomForestClassifier


# === Load Data ===
df = pd.read_csv("mimic3c.csv")
print(df.columns.tolist())


# === Step 1: Define working features ===
available_features = [
    'age', 'gender', 'LOSdays', 'NumChartEvents', 'NumNotes', 
    'NumProcs', 'NumLabs', 'NumMicroLabs', 'NumRx', 
    'admit_type', 'admit_location'
]

# === Step 2: Infer the actual target column ===
possible_targets = [col for col in df.columns if 'expire' in col or 'mortality' in col or 'death' in col]
if not possible_targets:
    raise ValueError("Could not find a suitable target column (e.g., expired, death, mortality).")
target_col = possible_targets[0]
print(f"Using target column: {target_col}")

# === Step 3: Prepare Data ===
df = df[available_features + [target_col]]
X = df.drop(columns=[target_col])
y = df[target_col]

# Impute & scale
X = pd.DataFrame(SimpleImputer(strategy='mean').fit_transform(X), columns=X.columns)
X = pd.DataFrame(StandardScaler().fit_transform(X), columns=X.columns)

# Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# === Step 4: Train Random Forest ===
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
y_prob = rf.predict_proba(X_test)[:, 1]

# Metrics
print("Random Forest Metrics:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_prob))

# Plot Feature Importance
importances = rf.feature_importances_
sorted_idx = np.argsort(importances)
plt.barh(X.columns[sorted_idx], importances[sorted_idx])
plt.title("Feature Importance")
plt.tight_layout()
plt.show()

# === Step 5: Deep Neural Network ===
nn = Sequential([
    Dense(128, activation='relu', input_dim=X.shape[1]),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])
nn.compile(optimizer=Adam(1e-4), loss='binary_crossentropy', metrics=['accuracy'])
history = nn.fit(X_train, y_train, validation_split=0.2, epochs=30, batch_size=64, verbose=0)

# NN Evaluation
y_pred_nn = (nn.predict(X_test).flatten() > 0.5).astype(int)
y_prob_nn = nn.predict(X_test).flatten()
print("\nNeural Network Metrics:")
print("Accuracy:", accuracy_score(y_test, y_pred_nn))
print("Precision:", precision_score(y_test, y_pred_nn))
print("Recall:", recall_score(y_test, y_pred_nn))
print("F1 Score:", f1_score(y_test, y_pred_nn))
print("ROC AUC:", roc_auc_score(y_test, y_prob_nn))

# NN Loss Curve
plt.plot(history.history['loss'], label='Train Loss')
plt.plot(history.history['val_loss'], label='Val Loss')
plt.title("Neural Network Training Loss")
plt.legend()
plt.tight_layout()
plt.show()


['hadm_id', 'gender', 'age', 'LOSdays', 'admit_type', 'admit_location', 'AdmitDiagnosis', 'insurance', 'religion', 'marital_status', 'ethnicity', 'NumCallouts', 'NumDiagnosis', 'NumProcs', 'AdmitProcedure', 'NumCPTevents', 'NumInput', 'NumLabs', 'NumMicroLabs', 'NumNotes', 'NumOutput', 'NumRx', 'NumProcEvents', 'NumTransfers', 'NumChartEvents', 'ExpiredHospital', 'TotalNumInteract', 'LOSgroupNum']


ValueError: Could not find a suitable target column (e.g., expired, death, mortality).

In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, roc_curve
)
from sklearn.ensemble import RandomForestClassifier


# === Load Data ===
df = pd.read_csv("mimic3c.csv")

# === Define available features ===
available_features = [
    'age', 'gender', 'LOSdays', 'NumChartEvents', 'NumNotes',
    'NumProcs', 'NumLabs', 'NumMicroLabs', 'NumRx',
    'admit_type', 'admit_location'
]

# === Define target column ===
target_col = 'ExpiredHospital'

# === Validate column exists ===
if target_col not in df.columns:
    raise ValueError(f"Target column '{target_col}' not found.")

# === Prepare features and target ===
df = df[available_features + [target_col]]
X = df.drop(columns=[target_col])
y = df[target_col]

# Impute and scale
X = pd.DataFrame(SimpleImputer(strategy='mean').fit_transform(X), columns=X.columns)
X = pd.DataFrame(StandardScaler().fit_transform(X), columns=X.columns)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# === Random Forest ===
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
y_prob = rf.predict_proba(X_test)[:, 1]

print("🎯 Random Forest Metrics:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_prob))

# Feature importance
importances = rf.feature_importances_
sorted_idx = np.argsort(importances)
plt.barh(X.columns[sorted_idx], importances[sorted_idx])
plt.title("Feature Importance (Random Forest)")
plt.tight_layout()
plt.show()

# ROC curve
fpr, tpr, _ = roc_curve(y_test, y_prob)
plt.plot(fpr, tpr, label=f'ROC AUC = {roc_auc_score(y_test, y_prob):.3f}')
plt.plot([0, 1], [0, 1], '--', color='gray')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.tight_layout()
plt.show()

# === Neural Network ===
nn = Sequential([
    Dense(128, activation='relu', input_dim=X.shape[1]),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])
nn.compile(optimizer=Adam(1e-4), loss='binary_crossentropy', metrics=['accuracy'])

history = nn.fit(X_train, y_train, validation_split=0.2, epochs=30, batch_size=64, verbose=0)

# NN evaluation
y_pred_nn = (nn.predict(X_test).flatten() > 0.5).astype(int)
y_prob_nn = nn.predict(X_test).flatten()

print("\n🤖 Neural Network Metrics:")
print("Accuracy:", accuracy_score(y_test, y_pred_nn))
print("Precision:", precision_score(y_test, y_pred_nn))
print("Recall:", recall_score(y_test, y_pred_nn))
print("F1 Score:", f1_score(y_test, y_pred_nn))
print("ROC AUC:", roc_auc_score(y_test, y_prob_nn))

# NN loss plot
plt.plot(history.history['loss'], label='Train Loss')
plt.plot(history.history['val_loss'], label='Val Loss')
plt.title("Neural Network Training Loss")
plt.legend()
plt.tight_layout()
plt.show()


ValueError: Cannot use mean strategy with non-numeric data:
could not convert string to float: 'F'