In [5]:
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

# ----------------------------
# LOAD DATA
# ----------------------------
df = pd.read_csv("medical_dataset.csv")

# ----------------------------
# RULE-BASED SAFE LABELING
# ----------------------------
def rule_label(text):
    text = str(text).lower()
    if any(k in text for k in ["breathing", "seizure", "chest pain", "unconscious"]):
        return "Emergency"
    if any(k in text for k in ["rash", "vomiting", "dizziness", "high fever"]):
        return "See_Doctor"
    return "Continue"

df["decision"] = df["side_effects"].apply(rule_label)

# ----------------------------
# FEATURE ENGINEERING
# ----------------------------
le_drug = LabelEncoder()
le_cond = LabelEncoder()
le_gender = LabelEncoder()
le_target = LabelEncoder()

df["drug_enc"] = le_drug.fit_transform(df["drug"])
df["cond_enc"] = le_cond.fit_transform(df["chronic_diseases"])
df["gender_enc"] = le_gender.fit_transform(df["gender"])

# Binary features
df["smoker"] = df["smoker"].map({"Yes":1,"No":0})
df["has_chronic"] = df["chronic_diseases"].apply(lambda x: 0 if str(x).lower() in ["none","no"] else 1)

# Numeric
numeric_features = ["age","weight"]
scaler = StandardScaler()
X_num = scaler.fit_transform(df[numeric_features])

# ----------------------------
# NLP TEXT FEATURES
# ----------------------------
vectorizer = TfidfVectorizer(max_features=1200, ngram_range=(1,2), stop_words="english")
X_text = vectorizer.fit_transform(df["side_effects"])

# ----------------------------
# COMBINE ALL FEATURES
# ----------------------------
X = np.hstack([
    X_text.toarray(),
    df[["drug_enc","cond_enc","gender_enc","smoker","has_chronic"]].values,
    X_num
])
y = le_target.fit_transform(df["decision"])

# ----------------------------
# TRAIN/TEST SPLIT
# ----------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# ----------------------------
# MODEL
# ----------------------------
model = XGBClassifier(
    n_estimators=500,
    max_depth=7,
    learning_rate=0.04,
    subsample=0.85,
    colsample_bytree=0.85,
    objective="multi:softprob",
    num_class=len(le_target.classes_),
    eval_metric="mlogloss",
    random_state=42
)

model.fit(X_train, y_train)

# ----------------------------
# EVALUATION
# ----------------------------
print(classification_report(y_test, model.predict(X_test), target_names=le_target.classes_))

# ----------------------------
# SAVE ARTIFACTS
# ----------------------------
joblib.dump(model,"medical_model.pkl")
joblib.dump(vectorizer,"tfidf_vectorizer.pkl")
joblib.dump(le_drug,"drug_encoder.pkl")
joblib.dump(le_cond,"condition_encoder.pkl")
joblib.dump(le_gender,"gender_encoder.pkl")
joblib.dump(le_target,"decision_encoder.pkl")
joblib.dump(scaler,"numeric_scaler.pkl")

print("✅ Model trained with demographics & risk factors")



  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


              precision    recall  f1-score   support

    Continue       0.64      0.78      0.70         9
   Emergency       1.00      0.99      1.00       520
  See_Doctor       0.94      0.97      0.95        31

    accuracy                           0.99       560
   macro avg       0.86      0.91      0.88       560
weighted avg       0.99      0.99      0.99       560

✅ Model trained with demographics & risk factors
