In [1]:
import joblib
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from tensorflow.keras.models import load_model

# ------------------------------
# 1. Load Preprocessing Objects and Models
# ------------------------------
# NOTE: joblib files are pickles; only load artifacts from a trusted source.

# Disease-prediction artifacts: the Keras network first, then the
# preprocessing objects stored next to it (the per-column label encoders,
# which include the one for 'diagnosis', the numeric scaler, and the
# multi-label binarizer).
disease_model = load_model('pred_model/model2/disease_prediction_model.h5')
disease_label_encoders, disease_scaler, mlb = (
    joblib.load(artifact_path)
    for artifact_path in (
        'pred_model/model2/label_encoders.pkl',
        'pred_model/model2/robust_scaler.pkl',
        'pred_model/model2/multi_label_binarizer.pkl',
    )
)

# Recommendation artifacts (scikit-learn): the model, the encoder for its
# predicted labels, and the encoder for the 'outcome' column.
rec_model, rec_le, outcome_le = (
    joblib.load(artifact_path)
    for artifact_path in (
        'recom_model/proto_model/next_recommendation_model.pkl',
        'recom_model/proto_model/next_recommendation_labelencoder.pkl',
        'recom_model/proto_model/outcome_labelencoder.pkl',
    )
)

# ------------------------------
# 2. Load Dataset
# ------------------------------
# Read the patient records into `df` and report the row count and the
# column names so the schema is visible in the cell output.
dataset_path = 'dataset/million_patients.csv'
df = pd.read_csv(dataset_path)
print(f"Loaded dataset with {len(df):,} records")
print("Columns in dataset:", list(df.columns))

# ------------------------------
# 3. Preprocess for Disease Prediction Model
# ------------------------------
# Build the disease-model frame from `df` without mutating it. Columns that
# are not used for prediction are dropped (absent ones are skipped); `drop`
# returns a new frame, so no explicit copy or in-place edit is needed.
cols_to_drop = ['patient_id', 'admit_date', 'discharge_date', 'los']
df_disease = df.drop(columns=[name for name in cols_to_drop if name in df.columns])

# Categorical columns used in disease model. Each present column is encoded
# with the saved encoder stored under the same column name.
cat_cols = ['gender', 'region', 'diagnosis', 'category', 'medication', 'outcome']
for col in cat_cols:
    if col in df_disease.columns:
        df_disease[col] = disease_label_encoders[col].transform(df_disease[col])

# Multi-hot encode 'symptoms' with the saved MultiLabelBinarizer.
# A non-string cell (e.g. NaN from an empty CSV field) is treated as "no
# symptoms" rather than raising AttributeError on `.split`.
symptom_lists = df_disease['symptoms'].apply(
    lambda cell: cell.split(', ') if isinstance(cell, str) else []
)
# Reuse df_disease's index: pd.concat(axis=1) aligns on index labels, so a
# fresh RangeIndex would misalign rows (and add NaN rows) whenever the frame
# does not carry the default 0..n-1 index.
symptom_encoded = pd.DataFrame(
    mlb.transform(symptom_lists),
    columns=mlb.classes_,
    index=df_disease.index,
)
df_disease = pd.concat([df_disease.drop(columns='symptoms'), symptom_encoded], axis=1)

# Determine numeric columns: the glucose measurement may be stored under
# either name; 'lab_test_glucose' takes precedence when both are present.
glucose_col = next(
    (name for name in ('lab_test_glucose', 'glucose') if name in df_disease.columns),
    None,
)
if glucose_col is None:
    raise KeyError("No glucose column found.")
num_cols = ['age', 'bmi', glucose_col, 'systolic_bp', 'diastolic_bp', 'wbc']

# Scale numeric features using the scaler fitted for the disease model
df_disease[num_cols] = disease_scaler.transform(df_disease[num_cols])

# The disease model was trained to predict the diagnosis, so drop that column for features.
# NOTE(review): the remaining column order is assumed to match the order used
# at training time; confirm against the training notebook.
X_disease = df_disease.drop(columns='diagnosis').to_numpy()

# Predict with the TensorFlow model, take the highest-scoring class per row,
# and map the class indices back to diagnosis labels.
disease_preds = disease_model.predict(X_disease)
predicted_disease_indices = np.argmax(disease_preds, axis=1)
diagnosis_le = disease_label_encoders['diagnosis']
predicted_diseases = diagnosis_le.inverse_transform(predicted_disease_indices)

print("\nSample Disease Predictions:")
print(predicted_diseases[:10])

# ------------------------------
# 4. Preprocess for Recommendation Model
# ------------------------------
# For recommendation, we use a subset of features:
# ['age', 'bmi', 'glucose' (or 'lab_test_glucose'), 'systolic_bp', 'diastolic_bp', 'wbc', 'severity', 'outcome_enc']
df_rec = df.copy()

# Encode 'outcome' for recommendation model using outcome_le
df_rec['outcome_enc'] = outcome_le.transform(df_rec['outcome'])

# Determine which glucose column exists ('lab_test_glucose' takes precedence).
rec_glucose_col = next(
    (name for name in ('lab_test_glucose', 'glucose') if name in df_rec.columns),
    None,
)
if rec_glucose_col is None:
    raise KeyError("No glucose column found in recommendation data.")
rec_numeric_cols = ['age', 'bmi', rec_glucose_col, 'systolic_bp', 'diastolic_bp', 'wbc', 'severity']

# Scale the numeric features with a RobustScaler fitted on these exact columns.
# The scaler is fitted only when no persisted one exists; afterwards the saved
# scaler is reused. Refitting on every run would make the scaling depend on
# whatever batch is being scored (a single-row batch, for example, scales to
# all zeros) and would overwrite the saved artifact each time.
# Delete the file to force a refit on the current data.
# NOTE(review): this assumes the first fit happens on data representative of
# what rec_model was trained on; confirm against the training notebook.
REC_SCALER_PATH = 'rec_robust_scaler.pkl'
try:
    rec_scaler = joblib.load(REC_SCALER_PATH)
except FileNotFoundError:
    rec_scaler = RobustScaler().fit(df_rec[rec_numeric_cols])
    joblib.dump(rec_scaler, REC_SCALER_PATH)
df_rec[rec_numeric_cols] = rec_scaler.transform(df_rec[rec_numeric_cols])
print("Numeric features scaled for recommendation model.")

# Build the feature matrix including outcome_enc
rec_features = rec_numeric_cols + ['outcome_enc']
X_rec = df_rec[rec_features].to_numpy()

# Predict with the recommendation model and map the encoded predictions back
# to their recommendation labels.
rec_preds = rec_model.predict(X_rec)
predicted_recommendations = rec_le.inverse_transform(rec_preds)

print("\nSample Recommendation Predictions:")
print(predicted_recommendations[:10])




Loaded dataset with 1,000,000 records
Columns in dataset: ['patient_id', 'age', 'gender', 'region', 'bmi', 'diagnosis', 'category', 'severity', 'symptoms', 'glucose', 'systolic_bp', 'diastolic_bp', 'wbc', 'admit_date', 'los', 'discharge_date', 'medication', 'outcome']
[1m31250/31250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 1ms/step

Sample Disease Predictions:
['Heart Failure' 'HIV/AIDS' 'COPD' 'COVID-19' 'Hypertension' 'Cirrhosis'
 'Alzheimer' 'Peripheral Artery Disease' 'Heart Failure' 'Cardiomyopathy']
Numeric features scaled for recommendation model.





Sample Recommendation Predictions:
['Surgery Required' 'Surgery Required' 'Prescribe Medication'
 'Prescribe Medication' 'Home Care Monitoring' 'Admit to ICU'
 'Surgery Required' 'Prescribe Medication' 'Surgery Required'
 'Admit to ICU']
