#NOTEBOOK 5/6: ML MODEL

##1. SETTING THE CATALOG/SCHEMA AND IMPORTING ML LIBRARIES

In [0]:
# Point the session at the healthcare_analytics catalog and its ml_features
# schema, then print the active catalog/schema as a sanity check.
for context_statement in (
    "USE CATALOG healthcare_analytics",
    "USE SCHEMA ml_features",
):
    spark.sql(context_statement)

active_context = spark.sql("SELECT current_catalog(), current_schema()")
active_context.show()

In [0]:
# Importing libraries and functions
import mlflow
import mlflow.spark
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier, GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc, ConfusionMatrixDisplay
from pyspark.sql.functions import *

##2. SELECTING FEATURES

In [0]:
# Columns handed to the feature assembler unchanged (treated as numeric inputs).
numeric_features = [
    "time_in_hospital", "num_lab_procedures", "num_procedures",
    "num_medications", "number_outpatient", "number_emergency",
    "number_inpatient", "number_diagnoses", "total_procedures",
    "is_emergency",
]

# Columns that are string-indexed and one-hot encoded before assembly.
categorical_features = [
    "age_group", "admission_type_id",
    "medical_specialty", "num_medications_category",
]

# Report how many columns fall in each group.
for group_label, group_columns in (
    ("Numeric", numeric_features),
    ("Categorical", categorical_features),
):
    print(f"{group_label} features: {len(group_columns)}")


##3. PREPARE DATA FOR ML

In [0]:
# Load the silver-layer events table and report how many rows it holds.
# NOTE: count() is a Spark action, so this triggers a scan of the table.
silver_table_name = "healthcare_analytics.silver.silver_events"
df = spark.table(silver_table_name)

total_records = df.count()
print(f"Total records: {total_records}")

In [0]:
# Build the modelling frame: identifiers, every model input column, and the
# target column, which is exposed under the name "label".
#
# ML vocabulary used from here on:
#   features -> input variables (X)
#   label    -> output variable (y) the model learns to predict
id_columns = ["encounter_id", "patient_nbr"]
target_column = "readmitted_30days"
selected_columns = id_columns + numeric_features + categorical_features + [target_column]

ml_df = df.select(selected_columns).withColumnRenamed(target_column, "label")

# Preview at most 1000 rows
display(ml_df.limit(1000))

##4. CREATE FEATURE PIPELINE

In [0]:

# Feature pipeline: index and one-hot encode every categorical column, then
# assemble the numeric and encoded columns into a single "features" vector.

# String indexing for categorical features.
# The loop variable is deliberately not named `col`, so it cannot be confused
# with pyspark.sql.functions.col (star-imported at the top of the notebook and
# used in later cells).
indexers = [
    StringIndexer(inputCol=feature, outputCol=f"{feature}_index", handleInvalid="keep")
    for feature in categorical_features
]

# One-hot encoding: turns each category index into a binary indicator vector.
# NOTE(review): the index values are whatever StringIndexer assigns (by default
# this is presumably label frequency - confirm against the Spark docs), so they
# carry NO clinical ordering such as Emergency < Urgent < Elective.
encoders = [
    OneHotEncoder(inputCol=f"{feature}_index", outputCol=f"{feature}_encoded")
    for feature in categorical_features
]

# Combine all features: numeric columns first, then the encoded vectors.
# Later cells rely on this order when slicing feature importances.
encoded_cols = [f"{feature}_encoded" for feature in categorical_features]
all_features = numeric_features + encoded_cols

# handleInvalid="skip" makes the assembler drop rows it considers invalid
# instead of failing; the number of dropped rows is reported below so the
# loss is never silent.
assembler = VectorAssembler(
    inputCols=all_features,
    outputCol="features",
    handleInvalid="skip"
)

# One Pipeline instead of manually running indexing, encoding and assembling
feature_pipeline = Pipeline(stages=indexers + encoders + [assembler])

# Transform data
print("Creating features...")
feature_model = feature_pipeline.fit(ml_df)        # fit = learn category vocabularies
ml_transformed = feature_model.transform(ml_df)    # transform = apply them

# Select final columns
ml_features_df = ml_transformed.select("encounter_id", "patient_nbr", "features", "label")

input_records = ml_df.count()
feature_records = ml_features_df.count()
print(f"Features created: {feature_records} records")

# Surface any rows lost to handleInvalid="skip": those encounters would
# otherwise vanish from training and from every downstream risk list.
dropped_records = input_records - feature_records
if dropped_records > 0:
    print(
        f"WARNING: {dropped_records} of {input_records} records were dropped "
        f"while assembling features (VectorAssembler handleInvalid='skip')"
    )

In [0]:
# Persist the assembled feature vectors as a Delta table, replacing any
# existing contents of the target table.
ml_features_table = "healthcare_analytics.ml_features.readmission_features"

features_writer = ml_features_df.write.format("delta").mode("overwrite")
features_writer.saveAsTable(ml_features_table)

print("Features saved!")

##5. TRAIN-TEST DATA SPLIT

In [0]:
# Split roughly 80/20 into training and test sets, assigning each patient
# (patient_nbr) entirely to one side.
#
# Why a hash split instead of randomSplit on rows:
#   * ml_features_df is not cached, so it is recomputed for every action;
#     a row-level random split of a recomputed frame is not guaranteed to
#     be reproducible.
#   * A row-level split lets encounters of the same patient_nbr land in both
#     train and test (presumably one patient can have several encounters -
#     verify against the silver table), which inflates test metrics.
# Hashing patient_nbr with a fixed salt is a pure function of the row, so the
# assignment is identical on every recomputation.
# NOTE: the 80/20 ratio now applies to hash buckets of patients, so the record
# counts printed below are only approximately 80/20.
from pyspark.sql import functions as F

SPLIT_SEED = 42      # salt mixed into the hash; change it to get a different split
TRAIN_PERCENT = 80   # share of hash buckets (out of 100) assigned to training

# pmod(...) maps the salted 64-bit hash to a bucket in [0, 100)
in_train = F.expr(
    f"pmod(xxhash64(patient_nbr, {SPLIT_SEED}), 100) < {TRAIN_PERCENT}"
)

train_df = ml_features_df.filter(in_train)
test_df = ml_features_df.filter(~in_train)

print(f"Training: {train_df.count()} records")
print(f"Test: {test_df.count()} records")

##6. TRAINING RANDOM FOREST AND GRADIENT BOOSTED TREES MODELS

In [0]:
print("Training Random Forest...")

# Train and evaluate the Random Forest inside its own MLflow run so the
# metrics logged at the end are attached to "RandomForest_Model".
with mlflow.start_run(run_name="RandomForest_Model"):

    # Estimator configuration
    rf = RandomForestClassifier(
        labelCol="label",
        featuresCol="features",
        seed=42,
        numTrees=100,
        maxDepth=10,
    )

    # Fit on the training split, then score the test split
    rf_model = rf.fit(train_df)
    rf_predictions = rf_model.transform(test_df)

    # Evaluators are created here and reused by the GBT cell further down
    evaluator_auc = BinaryClassificationEvaluator(metricName="areaUnderROC", labelCol="label")
    evaluator_acc = MulticlassClassificationEvaluator(metricName="accuracy", labelCol="label")

    auc_rf = evaluator_auc.evaluate(rf_predictions)
    acc_rf = evaluator_acc.evaluate(rf_predictions)

    print(f"Random Forest - AUC: {auc_rf:.4f}, Accuracy: {acc_rf:.4f}")

    # Record both metrics on the active run
    for metric_name, metric_value in (("auc", auc_rf), ("accuracy", acc_rf)):
        mlflow.log_metric(metric_name, metric_value)

In [0]:
# Gradient Boosted Trees model, trained and evaluated inside its own MLflow run.
# Relies on evaluator_auc / evaluator_acc defined in the Random Forest cell.
print("Training Gradient Boosted Trees...")

with mlflow.start_run(run_name="GBT_Model"):

    # Estimator configuration
    gbt = GBTClassifier(
        labelCol="label",
        featuresCol="features",
        seed=42,
        maxIter=50,
        maxDepth=8,
    )

    # Fit on the training split, then score the test split
    gbt_model = gbt.fit(train_df)
    gbt_predictions = gbt_model.transform(test_df)

    # Same two metrics as for the Random Forest
    auc_gbt = evaluator_auc.evaluate(gbt_predictions)
    acc_gbt = evaluator_acc.evaluate(gbt_predictions)

    print(f"GBT - AUC: {auc_gbt:.4f}, Accuracy: {acc_gbt:.4f}")

    # Record both metrics on the active run
    for metric_name, metric_value in (("auc", auc_gbt), ("accuracy", acc_gbt)):
        mlflow.log_metric(metric_name, metric_value)

##7. COMPARING AND CHOOSING THE BEST MODEL

In [0]:
# Side-by-side comparison of the two trained models on test-set metrics
divider = "=" * 60
print(divider)
print("MODEL COMPARISON")
print(divider)
print(f"Random Forest  - AUC: {auc_rf:.4f}, Accuracy: {acc_rf:.4f}")
print(f"GBT            - AUC: {auc_gbt:.4f}, Accuracy: {acc_gbt:.4f}")
print(divider)

# Best AUC: Random Forest's value unless it fails the >= comparison with GBT
best_auc = auc_gbt if not (auc_rf >= auc_gbt) else auc_rf

# GBT is named best only when its AUC is strictly higher; a tie goes to Random Forest
best_model_label = "Gradient Boosted Trees" if auc_gbt > auc_rf else "Random Forest"
print(f"Best Model: {best_model_label}")

##8. SHOWING SAMPLE PREDICTIONS

In [0]:
# Preview 20 test-set predictions from the GBT model (actual vs predicted label).
# NOTE: GBT is used here unconditionally, whichever model the comparison named best.
preview_columns = [
    "encounter_id",
    "patient_nbr",
    col("label").alias("actual"),
    col("prediction").alias("predicted"),
]
sample_predictions = gbt_predictions.select(*preview_columns).limit(20)

display(sample_predictions)

##9. CALCULATING CONFUSION MATRIX

In [0]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the GBT test-set predictions.
# Only the two needed columns are collected to the driver.
predictions_pd = gbt_predictions.select("label", "prediction").toPandas()

# labels=[0, 1] pins the matrix to 2x2 (rows = actual, columns = predicted).
# Without it the matrix is sized from the classes that happen to occur, so a
# test split where only one class appears would make the cm[1, ...] lookups
# below raise IndexError.
cm = confusion_matrix(
    predictions_pd['label'],
    predictions_pd['prediction'],
    labels=[0, 1],
)

print("Confusion Matrix:")
print(f"                  Predicted: 0    Predicted: 1")
print(f"Actual: 0         {cm[0,0]:<15} {cm[0,1]:<15}")
print(f"Actual: 1         {cm[1,0]:<15} {cm[1,1]:<15}")

##10. TRAINING SUMMARY

In [0]:

# Closing banner: number of assembled input columns, models trained, best AUC
divider = "=" * 60
summary_lines = [
    divider,
    "ML MODEL TRAINING COMPLETE",
    divider,
    f"Features created: {len(all_features)}",
    "Models trained: 2 (Random Forest, GBT)",
    f"Best AUC: {best_auc:.4f}",
    "MLflow experiments logged",
    divider,
]
for summary_line in summary_lines:
    print(summary_line)

##11. GENERATE ROC CURVE

In [0]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
import numpy as np
import mlflow


def _draw_roc_figure(fpr, tpr, roc_auc):
    """Draw the GBT ROC curve on a new 10x8 figure and return that figure.

    The figure is left open and current; the caller decides whether to show
    it or log and close it.
    """
    roc_fig = plt.figure(figsize=(10, 8))
    plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.3f})')
    # Diagonal reference line for a classifier with no skill
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Random Classifier')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate', fontsize=14)
    plt.ylabel('True Positive Rate', fontsize=14)
    plt.title('ROC Curve - Hospital Readmission Prediction (GBT Model)', fontsize=16, fontweight='bold')
    plt.legend(loc="lower right", fontsize=12)
    plt.grid(alpha=0.3)
    return roc_fig


# Collect labels and class probabilities of the GBT test predictions
gbt_pred_pd = gbt_predictions.select("label", "probability").toPandas()

# Element 1 of each probability vector is the score for class 1 (readmitted)
y_true = gbt_pred_pd['label']
y_scores = np.array([float(class_probs[1]) for class_probs in gbt_pred_pd['probability']])

# ROC points and the area under them
fpr, tpr, thresholds = roc_curve(y_true, y_scores)
roc_auc = auc(fpr, tpr)

# Display in notebook (Databricks will render it)
_draw_roc_figure(fpr, tpr, roc_auc)
plt.show()

# COMMAND ----------

# Log a freshly drawn copy of the same figure to its own MLflow run
with mlflow.start_run(run_name="GBT_Model_Visualizations"):
    fig = _draw_roc_figure(fpr, tpr, roc_auc)

    mlflow.log_figure(fig, "roc_curve.png")
    plt.close()

print("ROC Curve logged to MLflow!")

##12. GENERATE CONFUSION MATRIX HEATMAP

In [0]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import numpy as np


def _plot_confusion_matrix(cm, cm_percent, title):
    """Draw the confusion-matrix heatmap and return (fig, ax).

    cm holds the raw counts shown in each cell; cm_percent holds the
    row-wise percentages written in red underneath each count. The same
    routine is used for the displayed and the logged figure so the two
    cannot drift apart.
    """
    fig, ax = plt.subplots(figsize=(10, 8))

    disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                                  display_labels=['Not Readmitted', 'Readmitted'])
    disp.plot(cmap='Blues', ax=ax, colorbar=True, values_format='d')

    # Percentage annotation, nudged below the count (x = column, y = row)
    for (row, column), percent in np.ndenumerate(cm_percent):
        ax.text(column, row + 0.15, f'({percent:.1f}%)',
                ha='center', va='center', color='red', fontsize=11)

    ax.set_title(title, fontsize=16, fontweight='bold', pad=20)
    ax.set_xlabel('Predicted Label', fontsize=14, fontweight='bold')
    ax.set_ylabel('True Label', fontsize=14, fontweight='bold')
    plt.tight_layout()
    return fig, ax


# Get predictions
predictions_pd = gbt_predictions.select("label", "prediction").toPandas()

# labels=[0, 1] pins the matrix to 2x2 so it always matches the two display
# labels, even if one class is absent from the test predictions.
cm = confusion_matrix(predictions_pd['label'], predictions_pd['prediction'], labels=[0, 1])

# Row-wise percentages; a row with no samples yields 0.0 instead of NaN
row_totals = cm.sum(axis=1, keepdims=True)
cm_percent = np.divide(
    cm.astype('float'),
    row_totals,
    out=np.zeros(cm.shape, dtype=float),
    where=row_totals != 0,
) * 100

# Display
fig, ax = _plot_confusion_matrix(
    cm, cm_percent, 'Confusion Matrix - GBT Model\nHospital Readmission Prediction'
)
plt.show()

# COMMAND ----------

# Log to MLflow (nested=True only has an effect when another run is active)
with mlflow.start_run(run_name="GBT_Model_Visualizations", nested=True):
    fig, ax = _plot_confusion_matrix(cm, cm_percent, 'Confusion Matrix - GBT Model')

    mlflow.log_figure(fig, "confusion_matrix.png")
    plt.close(fig)

print("Confusion Matrix logged to MLflow!")

##14. FEATURE IMPORTANCE VISUALIZATION

In [0]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np


def _draw_importance_figure(importance_df, title):
    """Draw a horizontal bar chart of feature importances and return (fig, bars).

    importance_df must already be sorted by 'Importance' descending: the first
    three bars are the ones recoloured as the top features.
    """
    importance_fig = plt.figure(figsize=(12, 8))
    importance_bars = plt.barh(importance_df['Feature'], importance_df['Importance'], color='steelblue')

    # Highlight the first three bars (or all of them when there are fewer)
    for top_bar in importance_bars[:3]:
        top_bar.set_color('#e74c3c')

    plt.xlabel('Importance Score', fontsize=12, fontweight='bold')
    plt.ylabel('Features', fontsize=12, fontweight='bold')
    plt.title(title, fontsize=14, fontweight='bold')
    plt.grid(axis='x', alpha=0.3)

    # Value label at the end of each bar
    for position, importance in enumerate(importance_df['Importance']):
        plt.text(importance, position, f' {importance:.4f}', va='center', fontweight='bold')

    plt.tight_layout()
    return importance_fig, importance_bars


# Importance of every assembled input, as a NumPy array
feature_importance = gbt_model.featureImportances.toArray()

# Only the numeric features are plotted. They were assembled first, so their
# importances are the leading entries of the array.
feature_names = numeric_features.copy()
numeric_count = len(numeric_features)

importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': feature_importance[:numeric_count],
}).sort_values(by='Importance', ascending=False)

# Display
fig, bars = _draw_importance_figure(
    importance_df, 'Top Features Predicting Hospital Readmission (GBT Model)'
)
plt.show()

# COMMAND ----------

# Log a freshly drawn copy (shorter title) to MLflow
with mlflow.start_run(run_name="Feature_Importance", nested=True):
    fig, bars = _draw_importance_figure(
        importance_df, 'Top Features Predicting Hospital Readmission'
    )
    mlflow.log_figure(fig, "feature_importance.png")
    plt.close()

print("Feature Importance logged to MLflow!")

##15. FUTURE READMISSION RISK PREDICTION SYSTEM

In [0]:
# Create Prediction Function for New Patients

from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType, StringType

# Helper that pulls the class-1 (readmission) probability out of a vector
def get_readmission_probability(probability_vector):
    """Return element 1 of ``probability_vector`` as a Python float.

    Element 1 is the probability of class 1, i.e. that the patient is
    readmitted. Any indexable sequence with at least two entries works.
    """
    readmitted_probability = probability_vector[1]
    return float(readmitted_probability)

from pyspark.ml.functions import vector_to_array

# Register as UDF.
# Kept so existing references to probability_udf keep working; the scoring
# below no longer uses it (see vector_to_array).
probability_udf = udf(get_readmission_probability, DoubleType())


# Generate risk scores for ALL encounters in the feature frame
#
# NOTE: ml_features_df is the frame the training split was taken from, so most
# of these rows were seen by the model during training. Statistics computed on
# these scores are therefore optimistic compared with the test-set metrics.
all_predictions = gbt_model.transform(ml_features_df)

# Extract risk scores.
# vector_to_array converts the probability vector to an array column, and
# element [1] is the class-1 (readmitted) probability. This replaces the
# Python UDF, which had to ship every vector to a Python worker.
risk_scores_df = all_predictions.select(
    "encounter_id",
    "patient_nbr",
    col("label").alias("actual_readmitted"),
    col("prediction").alias("predicted_readmitted"),
    vector_to_array(col("probability"))[1].alias("readmission_risk_score")
)

# Add risk category based on probability (first matching threshold wins)
risk_scores_df = risk_scores_df.withColumn(
    "risk_category",
    when(col("readmission_risk_score") >= 0.7, "VERY HIGH RISK")
    .when(col("readmission_risk_score") >= 0.5, "HIGH RISK")
    .when(col("readmission_risk_score") >= 0.3, "MEDIUM RISK")
    .otherwise("LOW RISK")
)

# Show the 20 highest-scoring rows
print("Sample Risk Scores:")
display(risk_scores_df.orderBy(col("readmission_risk_score").desc()).limit(20))


# Identify the top 100 patients at risk for their NEXT admission:
# rows whose recorded label is 0 (not readmitted) but whose predicted
# readmission probability is at least 0.5, highest score first.
not_readmitted = col("actual_readmitted") == 0
predicted_high_risk = col("readmission_risk_score") >= 0.5

high_risk_future = (
    risk_scores_df
    .filter(not_readmitted & predicted_high_risk)
    .orderBy(col("readmission_risk_score").desc())
)

future_risk_count = high_risk_future.count()
print(f"Patients at risk for FUTURE readmission: {future_risk_count}")
print("\nTop 100 patients needing preventive intervention:")
display(high_risk_future.limit(100))



# Join with patient details for actionable insights:
# enrich each high-risk row with context columns from the silver table.
silver_df = spark.table("healthcare_analytics.silver.silver_events")

join_keys = ["encounter_id", "patient_nbr"]
context_columns = [
    "age_group",
    "num_medications",
    "number_diagnoses",
    "time_in_hospital",
    "medical_specialty",
    "is_emergency",
    "num_medications_category",
]
patient_context = silver_df.select(join_keys + context_columns)

high_risk_with_context = high_risk_future.join(patient_context, on=join_keys, how="inner")

# Recommended follow-up, keyed on the risk score (first matching threshold wins)
risk_score = col("readmission_risk_score")
recommended_action = (
    when(risk_score >= 0.7, "URGENT: Schedule follow-up within 48 hours")
    .when(risk_score >= 0.6, "Schedule follow-up within 7 days")
    .when(risk_score >= 0.5, "Monitor closely, call within 14 days")
    .otherwise("Standard discharge protocol")
)
actionable_df = high_risk_with_context.withColumn("recommended_action", recommended_action)

# Show the 50 highest-risk rows with the columns relevant for follow-up
report_columns = [
    "patient_nbr",
    "readmission_risk_score",
    "risk_category",
    "age_group",
    "num_medications",
    "number_diagnoses",
    "medical_specialty",
    "recommended_action",
]
print("Actionable Patient List with Recommendations:")
display(
    actionable_df
    .select(report_columns)
    .orderBy(risk_score.desc())
    .limit(50)
)


# Save to the gold layer for the care coordination team:
# the actionable high-risk list replaces any existing contents of the table.
future_risk_table = "healthcare_analytics.gold.future_readmission_risk"

gold_writer = actionable_df.write.format("delta").mode("overwrite")
gold_writer.saveAsTable(future_risk_table)

print(f"Future readmission risk table created: {future_risk_table}")

# NOTE: count() is a separate action on actionable_df, not a read of the saved table
saved_records = actionable_df.count()
print(f"   Records: {saved_records}")


In [0]:
# Risk score distribution analysis: per risk category, the number of scored
# rows, the mean risk score and how many rows were actually readmitted.
# NOTE: count / avg / sum below are the pyspark.sql.functions versions brought
# in by the star import at the top of the notebook, not the Python builtins.
readmitted_flag = when(col("actual_readmitted") == 1, 1).otherwise(0)

risk_distribution = (
    risk_scores_df
    .groupBy("risk_category")
    .agg(
        count("*").alias("patient_count"),
        avg("readmission_risk_score").alias("avg_risk_score"),
        sum(readmitted_flag).alias("actual_readmissions"),
    )
    .orderBy(col("avg_risk_score").desc())
)

print("Risk Category Distribution:")
display(risk_distribution)


# Visualize risk distribution: two bar charts side by side
import matplotlib.pyplot as plt

risk_dist_pd = risk_distribution.toPandas()

# Bar colours, applied by bar position (rows are ordered by average risk score, descending)
colors = ['#d73027', '#fc8d59', '#fee090', '#91bfdb']


def _draw_count_panel(ax, column, ylabel, title):
    """Draw one bar panel of ``risk_dist_pd[column]`` per risk category on ``ax``."""
    values = risk_dist_pd[column]
    ax.bar(risk_dist_pd['risk_category'], values, color=colors)
    ax.set_ylabel(ylabel, fontsize=12, fontweight='bold')
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.set_xlabel('Risk Category', fontsize=12, fontweight='bold')
    ax.grid(axis='y', alpha=0.3)

    # Value label on top of each bar
    for position, value in enumerate(values):
        ax.text(position, value, f'{value:,}', ha='center', va='bottom', fontweight='bold')


def _draw_risk_distribution(validation_title, figure_title):
    """Build the two-panel risk figure and return it.

    Left panel: patient count per category. Right panel: actual readmissions
    per category, titled ``validation_title``. ``figure_title`` is the
    overall figure heading.
    """
    risk_fig, (count_ax, readmit_ax) = plt.subplots(1, 2, figsize=(16, 6))
    _draw_count_panel(count_ax, 'patient_count', 'Number of Patients',
                      'Patient Distribution by Risk Category')
    _draw_count_panel(readmit_ax, 'actual_readmissions', 'Actual Readmissions',
                      validation_title)
    risk_fig.suptitle(figure_title, fontsize=16, fontweight='bold', y=1.02)
    return risk_fig


# Displayed version
fig = _draw_risk_distribution(
    'Model Validation: Actual Readmissions by Risk Category',
    'Readmission Risk Assessment - Future Prediction System',
)
plt.tight_layout()
plt.show()


# Log to MLflow: same charts, shorter titles, no tight_layout
with mlflow.start_run(run_name="Risk_Distribution_Analysis"):
    fig = _draw_risk_distribution(
        'Actual Readmissions by Risk Category',
        'Risk Assessment Analysis',
    )

    mlflow.log_figure(fig, "risk_distribution.png")
    plt.close()

print("Risk distribution logged to MLflow!")


# Summary Statistics
from pyspark.sql import functions as F


def _percent_of_total(part, total):
    """Return part/total as a percentage, or 0.0 when total is 0."""
    return part / total * 100 if total else 0.0


print("=" * 70)
print("FUTURE READMISSION RISK PREDICTION SYSTEM - SUMMARY")
print("=" * 70)

# One aggregation job instead of three separate count() actions, each of
# which would recompute the model scoring behind risk_scores_df.
# F.* is used explicitly so the aggregates do not depend on the star import.
risk_score = F.col("readmission_risk_score")
risk_totals = risk_scores_df.agg(
    F.count("*").alias("total_patients"),
    F.sum(F.when(risk_score >= 0.5, 1).otherwise(0)).alias("high_risk_count"),
    F.sum(F.when(risk_score >= 0.7, 1).otherwise(0)).alias("very_high_risk"),
).first()

total_patients = risk_totals["total_patients"]
# The sum over an empty frame is NULL, so fall back to 0
high_risk_count = risk_totals["high_risk_count"] or 0
very_high_risk = risk_totals["very_high_risk"] or 0

print(f"Total patients scored: {total_patients:,}")
print(f"High risk (≥50% probability): {high_risk_count:,} ({_percent_of_total(high_risk_count, total_patients):.1f}%)")
print(f"Very high risk (≥70% probability): {very_high_risk:,} ({_percent_of_total(very_high_risk, total_patients):.1f}%)")
print()
print("Actionable Outputs:")
print(f"Gold table created: {future_risk_table}")
print(f"Top 100 high-risk patients identified")
print(f"Recommended actions assigned")
print(f"Care coordination list ready")
print("=" * 70)