In [33]:
import sqlite3
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import mean_absolute_error, r2_score
from datetime import timedelta
import matplotlib.pyplot as plt

# 1) Load Training Data

In [34]:
# Load every PatientFeatures row that has a known recovery_days value.
# The connection is only needed for this one query, so it is closed in a
# `finally` block and released even if the read raises.
conn = sqlite3.connect("RFID.db")
try:
    df = pd.read_sql("""
SELECT 
    age, sex, bmi, diagnosis, blood_type,
    gcs_total, wfns_grade, stopbang_score,
    sodium, potassium, creatinine, gfr, alt, ast, bilirubin,
    hemoglobin, wbc, platelets, blood_sugar,
    num_medications, num_investigations, imaging_abnormal_count, comorbidity_count,
    severity_score,
    admission_date,
    recovery_days
FROM PatientFeatures
WHERE recovery_days IS NOT NULL
""", conn)
finally:
    conn.close()

print("Loaded rows:", len(df))

Loaded rows: 925


# 2) Preprocessing

In [None]:
# Columns that are one-hot encoded; every remaining feature column is
# treated as numeric.
categorical = ["sex", "blood_type", "diagnosis"]

# The target (recovery_days) and the raw admission_date are excluded from the
# model inputs. Iterating df.columns keeps the original column order.
non_features = set(categorical) | {"recovery_days", "admission_date"}
numeric = [c for c in df.columns if c not in non_features]

X = df[categorical + numeric]
y = df["recovery_days"]

# Preprocessing: one-hot encode the categorical columns (categories unseen at
# fit time are encoded as all zeros) and standardise the numeric columns.
# sparse_threshold=0 makes the transformer always return a dense array, so the
# output type does not flip to a sparse matrix if the one-hot block grows; the
# gradient-boosting regressor used downstream is fed this matrix directly.
preprocessor = ColumnTransformer(
    [
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical),
        ("num", StandardScaler(), numeric),
    ],
    sparse_threshold=0,
)

# 3) Train & Test

In [41]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Baseline: preprocessing followed by a histogram gradient-boosting regressor
# with default hyperparameters.
model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("regressor", HistGradientBoostingRegressor(random_state=42)),
    ]
)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print("MAE:", mean_absolute_error(y_test, y_pred))
print("R²:", r2_score(y_test, y_pred))


# ---- Hyperparameter search space (keys address the "regressor" step) ----
param_dist = dict(
    regressor__learning_rate=[0.01, 0.05, 0.1, 0.2],
    regressor__max_iter=[200, 500, 1000],            # number of boosting stages
    regressor__max_depth=[None, 5, 10, 20],
    regressor__min_samples_leaf=[20, 50, 100],       # regularization
    regressor__l2_regularization=[0.0, 0.1, 1.0, 10.0],
    regressor__max_bins=[64, 128],                   # histogram binning
    regressor__early_stopping=[True],                # prevent overfitting
)

# ---- Randomized search: 25 sampled combinations, 3-fold CV, scored by MAE ----
random_search_hgb = RandomizedSearchCV(
    model,
    param_dist,
    n_iter=25,
    cv=3,
    scoring="neg_mean_absolute_error",
    verbose=2,
    random_state=42,
    n_jobs=-1,
)
random_search_hgb.fit(X_train, y_train)

# ---- Pipeline refit with the best-scoring combination ----
best_hgb = random_search_hgb.best_estimator_
print("Best parameters:", random_search_hgb.best_params_)

# ---- Score the tuned pipeline on the held-out rows ----
y_pred_hgb = best_hgb.predict(X_test)
mae_hgb = mean_absolute_error(y_test, y_pred_hgb)
r2_hgb = r2_score(y_test, y_pred_hgb)
print("Tuned HGB MAE:", mae_hgb)
print("Tuned HGB R²:", r2_hgb)

MAE: 4.996753716518559
R²: 0.6103936393752393
Fitting 3 folds for each of 25 candidates, totalling 75 fits
Best parameters: {'regressor__min_samples_leaf': 20, 'regressor__max_iter': 500, 'regressor__max_depth': None, 'regressor__max_bins': 64, 'regressor__learning_rate': 0.05, 'regressor__l2_regularization': 10.0, 'regressor__early_stopping': True}
Tuned HGB MAE: 4.638230100313117
Tuned HGB R²: 0.6391247818799596


In [60]:
from sklearn.inspection import permutation_importance

# The importance is computed against the regressor step alone, so the
# held-out rows are first pushed through the fitted preprocessor step.
fitted_steps = best_hgb.named_steps
X_test_transformed = fitted_steps['preprocessor'].transform(X_test)

# 10 shuffles per transformed column, seeded for repeatability, using all cores.
result = permutation_importance(
    estimator=fitted_steps['regressor'],
    X=X_test_transformed,
    y=y_test,
    n_repeats=10,
    random_state=42,
    n_jobs=-1,
)

def get_feature_names(preprocessor):
    """Collect the output feature names of a fitted column transformer.

    Walks ``preprocessor.transformers_`` in order. Each entry is a
    ``(name, transformer, columns)`` tuple.

    * The entry named ``'remainder'`` is skipped.
    * Entries whose transformer is the string ``'drop'`` are skipped, because
      dropped columns produce no output features.
    * Transformers exposing ``get_feature_names_out`` are asked for their
      output names, given ``columns`` as the input feature names.
    * Anything else (e.g. ``'passthrough'``) contributes ``columns`` as-is.

    Parameters
    ----------
    preprocessor : object
        Fitted object with a ``transformers_`` attribute, such as a
        ``ColumnTransformer``.

    Returns
    -------
    list
        Feature names in the order the transformers are listed.
    """
    feature_names = []
    for name, transformer, columns in preprocessor.transformers_:
        if name == 'remainder':
            continue
        # A 'drop' entry removes its columns from the output entirely.
        if isinstance(transformer, str) and transformer == 'drop':
            continue
        if hasattr(transformer, 'get_feature_names_out'):
            names = transformer.get_feature_names_out(columns)
        else:
            names = columns
        # A single column may be given as a bare string; wrap it so extend()
        # does not split it into characters.
        if isinstance(names, str):
            names = [names]
        feature_names.extend(names)
    return feature_names

# Labels for the transformed columns, taken from the tuned pipeline's
# preprocessor step.
feature_names = get_feature_names(best_hgb.named_steps['preprocessor'])

# Pair each label with its mean importance, then rank from largest to smallest.
importances = result.importances_mean
feature_importance_dict = {
    label: score for label, score in zip(feature_names, importances)
}
sorted_importances = sorted(
    feature_importance_dict.items(),
    key=lambda pair: pair[1],
    reverse=True,
)

print("Top 10 features by permutation importance:")
for feature, importance in sorted_importances[:10]:
    print(f"{feature}: {importance:.4f}")

Top 10 features by permutation importance:
gcs_total: 0.3601
diagnosis_Migraine/Headache: 0.0834
diagnosis_Epilepsy/Seizure: 0.0796
diagnosis_Hemorrhagic stroke: 0.0413
severity_score: 0.0354
diagnosis_Brain tumor: 0.0292
diagnosis_Ischemic stroke: 0.0273
wfns_grade: 0.0137
diagnosis_Other: 0.0116
creatinine: 0.0069


# 4) Save the Model

In [42]:
# Serialise the fitted pipeline (preprocessor + regressor) to disk.
# NOTE(review): `model` is the baseline pipeline; the tuned `best_hgb` found by
# the randomized search is not the object written here — confirm that is intended.
joblib.dump(value=model, filename="model.pkl")
print("Model saved as model.pkl")

Model saved as model.pkl


# 5) Example prediction & insert into Predictions table

In [58]:
admission_id = 2  # change to a valid admission_id in your DB

# Metadata stored alongside each prediction.
# NOTE(review): the tag reads "RF" although the pipeline's regressor is a
# HistGradientBoostingRegressor, and the confidence is a fixed placeholder
# rather than a value computed from the model — confirm both.
MODEL_VERSION = "RF_v1.0"
PREDICTION_CONFIDENCE = 0.85

# Identifier, target and prediction-output columns that must not be passed to
# the model (leakage / non-feature columns).
drop_cols = ["feature_id", "admission_id", "patient_id", "recovery_days",
             "discharge_date", "predicted_recovery_days", "predicted_discharge_date",
             "model_version", "prediction_confidence", "created_at"]

conn = sqlite3.connect("RFID.db")
try:
    # Bind admission_id as a query parameter instead of formatting it into
    # the SQL text.
    row = pd.read_sql(
        "SELECT * FROM PatientFeatures WHERE admission_id = ?",
        conn,
        params=(admission_id,),
    )

    if row.empty:
        print(f"No PatientFeatures row found for admission_id={admission_id}")
    else:
        X_new = row.drop(columns=[c for c in drop_cols if c in row.columns])

        # NOTE(review): `model` is the baseline pipeline, not the tuned
        # `best_hgb` — confirm which one should produce stored predictions.
        # int() truncates the predicted value toward zero (no rounding).
        pred_days = int(model.predict(X_new)[0])

        # .iloc[0] extracts scalars positionally; calling int() directly on a
        # one-element Series is deprecated in pandas.
        admission_date = pd.to_datetime(row["admission_date"].iloc[0])
        pred_discharge = (admission_date + timedelta(days=pred_days)).strftime("%Y-%m-%d")
        print(pred_days)
        print(pred_discharge)
        print(admission_date)

        cursor = conn.cursor()
        cursor.execute("""
        INSERT INTO Predictions (admission_id, patient_id, predicted_recovery_days,
                                 predicted_discharge_date, model_version, confidence)
        VALUES (?, ?, ?, ?, ?, ?)
        """, (int(row["admission_id"].iloc[0]), row["patient_id"].iloc[0],
              pred_days, pred_discharge, MODEL_VERSION, PREDICTION_CONFIDENCE))
        conn.commit()
        print(f"Prediction inserted: {pred_days} days, discharge {pred_discharge}")
finally:
    # Always release the connection, including when the query, prediction or
    # insert raises.
    conn.close()


18
2023-11-14
2023-10-27 16:39:20
2 P0001 18 2023-11-14 RF_v1.0 0.85
Prediction inserted: 18 days, discharge 2023-11-14


  print(int(row["admission_id"]), row["patient_id"][0],
  """, (int(row["admission_id"]), row["patient_id"][0],
