In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, RobustScaler
from sklearn.metrics import accuracy_score, classification_report
import joblib

# -------------------------------------
# 1. Load the Dataset
# -------------------------------------
df = pd.read_csv('dataset/million_patients.csv')
print(f"Loaded dataset with {len(df):,} records")

# The dataset columns (for example) include:
# ['patient_id', 'age', 'gender', 'region', 'bmi', 'diagnosis', 'category', 
#  'severity', 'symptoms', 'glucose', 'systolic_bp', 'diastolic_bp', 'wbc', 
#  'admit_date', 'los', 'discharge_date', 'medication', 'outcome']

# -------------------------------------
# 2. Create a Recommendation Column
# -------------------------------------
def generate_recommendation(row):
    """
    Generate a next best action recommendation based on severity and outcome.
    For demonstration:
      - If severity >= 4 and outcome in ['Critical', 'Deteriorated']: Admit to ICU.
      - If severity >= 4 and outcome is not critical: Surgery Required.
      - If severity == 3 and outcome is 'Deteriorated': Refer to Specialist.
      - If severity == 3 and outcome is not deteriorated: Prescribe Medication.
      - Otherwise: Home Care Monitoring.
    """
    outcome = row['outcome']  # e.g., 'Recovered', 'Stable', 'Deteriorated', 'Critical', 'Deceased'
    severity = row['severity']
    if severity >= 4:
        if outcome in ['Critical', 'Deteriorated']:
            return 'Admit to ICU'
        else:
            return 'Surgery Required'
    elif severity == 3:
        if outcome == 'Deteriorated':
            return 'Refer to Specialist'
        else:
            return 'Prescribe Medication'
    else:
        return 'Home Care Monitoring'

df['recommendation'] = df.apply(generate_recommendation, axis=1)
print("Recommendation column added.")

# -------------------------------------
# 3. Preprocess Features for Recommendation Model
# -------------------------------------
# We will use a subset of features.
# Available numeric columns in the dataset are:
#   'age', 'bmi', 'glucose', 'systolic_bp', 'diastolic_bp', 'wbc', 'severity'
# We'll also include 'outcome' (encoded) as an additional feature.
features = ['age', 'bmi', 'glucose', 'systolic_bp', 'diastolic_bp', 'wbc', 'severity']

# Encode 'outcome' as a numeric feature
le_outcome = LabelEncoder()
df['outcome_enc'] = le_outcome.fit_transform(df['outcome'])
features.append('outcome_enc')

# The target variable is "recommendation" – we need to encode it.
le_rec = LabelEncoder()
df['recommendation_enc'] = le_rec.fit_transform(df['recommendation'])

# -------------------------------------
# 4. Build Feature Matrix and Target
# -------------------------------------
X = df[features]
y = df['recommendation_enc']

# Scale numeric features (we exclude outcome_enc since it's already encoded)
scaler = RobustScaler()
X[features[:-1]] = scaler.fit_transform(X[features[:-1]])
joblib.dump(scaler, 'next_recommendation_scaler.pkl')

# -------------------------------------
# 5. Train-Test Split
# -------------------------------------
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"Training size: {X_train.shape}, Testing size: {X_test.shape}")

# -------------------------------------
# 6. Train the Recommendation Model
# -------------------------------------
rec_model = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
rec_model.fit(X_train, y_train)

# -------------------------------------
# 7. Evaluate the Model
# -------------------------------------
y_pred = rec_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Next Recommendation Model Accuracy: {:.4f}".format(accuracy))
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=le_rec.classes_))

# -------------------------------------
# 8. Save the Model and Preprocessing Objects
# -------------------------------------
joblib.dump(rec_model, 'recom_model/proto_model/next_recommendation_model.pkl')
joblib.dump(le_rec, 'recom_model/proto_model/next_recommendation_labelencoder.pkl')
joblib.dump(le_outcome, 'recom_model/proto_model/outcome_labelencoder.pkl')

print("✅ Next Recommendation Model and related artifacts saved successfully.")


Loaded dataset with 1,000,000 records
Recommendation column added.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[features[:-1]] = scaler.fit_transform(X[features[:-1]])


Training size: (800000, 8), Testing size: (200000, 8)
Next Recommendation Model Accuracy: 0.9999
Classification Report:
                      precision    recall  f1-score   support

        Admit to ICU       1.00      1.00      1.00     18577
Home Care Monitoring       1.00      1.00      1.00     21397
Prescribe Medication       1.00      1.00      1.00     68548
 Refer to Specialist       1.00      1.00      1.00     12154
    Surgery Required       1.00      1.00      1.00     79324

            accuracy                           1.00    200000
           macro avg       1.00      1.00      1.00    200000
        weighted avg       1.00      1.00      1.00    200000

✅ Next Recommendation Model and related artifacts saved successfully.
