In [1]:
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

# ----------------------------
# LOAD DATA
# ----------------------------
df = pd.read_csv("medical_dataset.csv")

# ----------------------------
# RULE-BASED SAFE LABELING
# ----------------------------
def rule_label(text):
    text = str(text).lower()
    if any(k in text for k in ["breathing", "seizure", "chest pain", "unconscious"]):
        return "Emergency"
    if any(k in text for k in ["rash", "vomiting", "dizziness", "high fever"]):
        return "See_Doctor"
    return "Continue"

df["decision"] = df["side_effects"].apply(rule_label)

# ----------------------------
# FEATURE ENGINEERING
# ----------------------------
le_drug = LabelEncoder()
le_gender = LabelEncoder()
le_target = LabelEncoder()

df["drug_enc"] = le_drug.fit_transform(df["drug"])
df["gender_enc"] = le_gender.fit_transform(df["gender"])
y = le_target.fit_transform(df["decision"])

# ----------------------------
# Chronic Diseases Multi-hot Encoding
# ----------------------------
chronic_options = ["Diabetes","Hypertension","Heart Disease","Kidney Disease","None"]

# إذا العمود يحتوي على أكثر من مرض مفصول بفواصل
def encode_chronic(x):
    x_list = [d.strip() for d in str(x).split(",")]
    return [1 if opt in x_list else 0 for opt in chronic_options]

chronic_encoded = np.array(df["chronic_diseases"].apply(encode_chronic).to_list())

# ----------------------------
# Numeric Features
# ----------------------------
numeric_features = ["age","weight","smoker"]  # smoker بالفعل 0/1
df["smoker"] = df["smoker"].map({"Yes":1,"No":0})
X_num = StandardScaler().fit_transform(df[["age","weight","smoker"]])

# ----------------------------
# NLP TEXT FEATURES
# ----------------------------
vectorizer = TfidfVectorizer(max_features=1200, ngram_range=(1,2), stop_words="english")
X_text = vectorizer.fit_transform(df["side_effects"])

# ----------------------------
# COMBINE ALL FEATURES
# ----------------------------
X = np.hstack([
    X_text.toarray(),
    df[["drug_enc","gender_enc"]].values,
    chronic_encoded,
    X_num
])

# ----------------------------
# TRAIN/TEST SPLIT
# ----------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# ----------------------------
# MODEL
# ----------------------------
model = XGBClassifier(
    n_estimators=500,
    max_depth=7,
    learning_rate=0.04,
    subsample=0.85,
    colsample_bytree=0.85,
    objective="multi:softprob",
    num_class=len(le_target.classes_),
    eval_metric="mlogloss",
    random_state=42
)

model.fit(X_train, y_train)

# ----------------------------
# EVALUATION
# ----------------------------
print(classification_report(y_test, model.predict(X_test), target_names=le_target.classes_))

# ----------------------------
# SAVE ARTIFACTS
# ----------------------------
joblib.dump(model,"medical_model.pkl")
joblib.dump(vectorizer,"tfidf_vectorizer.pkl")
joblib.dump(le_drug,"drug_encoder.pkl")
joblib.dump(le_gender,"gender_encoder.pkl")
joblib.dump(le_target,"decision_encoder.pkl")
joblib.dump(StandardScaler().fit(df[["age","weight","smoker"]]),"numeric_scaler.pkl")

print("✅ Model trained for multi-chronic, multi-feature Streamlit app")



  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


              precision    recall  f1-score   support

    Continue       0.64      0.78      0.70         9
   Emergency       1.00      0.99      1.00       520
  See_Doctor       0.97      0.97      0.97        31

    accuracy                           0.99       560
   macro avg       0.87      0.91      0.89       560
weighted avg       0.99      0.99      0.99       560

✅ Model trained for multi-chronic, multi-feature Streamlit app


  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
