In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
import joblib

# -----------------------------
# LOAD DATA
# -----------------------------
# The path is relative, so it is resolved against the current working directory.
# Later steps read the `side_effects`, `drug_name` and `medical_condition` columns.
DATASET_CSV = "medical_dataset.csv"
df = pd.read_csv(DATASET_CSV)

# -----------------------------
# RULE-BASED LABELS FROM SIDE EFFECTS
# -----------------------------
def label_decision(text):
    """Map free-text side effects to a triage label using keyword rules.

    The input is coerced with ``str()`` and lower-cased, so non-string
    values (e.g. ``None`` or NaN) are accepted. Matching is plain
    substring containment, not whole-word matching.

    Args:
        text: Side-effect description; any object convertible with ``str()``.

    Returns:
        ``'Emergency'`` if any emergency keyword occurs in the text,
        otherwise ``'See_Doctor'`` if any doctor keyword occurs,
        otherwise ``'Continue'``.
    """
    lowered = str(text).lower()

    urgent_terms = (
        'breathing', 'chest pain', 'seizure', 'unconscious',
        'swelling of face', 'anaphylaxis', 'fainting',
    )
    review_terms = (
        'rash', 'vomiting', 'severe', 'high fever',
        'dizziness', 'blurred vision', 'fast heartbeat',
    )

    # Emergency terms are checked first so they win when both kinds are present.
    if any(term in lowered for term in urgent_terms):
        return 'Emergency'
    if any(term in lowered for term in review_terms):
        return 'See_Doctor'
    return 'Continue'

# Derive the rule-based decision label for every row from its side-effect text.
df['decision'] = df['side_effects'].apply(label_decision)

# -----------------------------
# LABEL ENCODING FOR CATEGORICAL FEATURES
# -----------------------------
# One encoder per column, kept as separate objects because each is saved later.
le_drug, le_cond, le_target = LabelEncoder(), LabelEncoder(), LabelEncoder()

# (source column, encoded column, encoder) for the two categorical features.
feature_encodings = (
    ('drug_name', 'drug_enc', le_drug),
    ('medical_condition', 'cond_enc', le_cond),
)
for source_col, encoded_col, encoder in feature_encodings:
    df[encoded_col] = encoder.fit_transform(df[source_col])

# Integer-coded target; le_target.classes_ holds the matching label names.
y = le_target.fit_transform(df['decision'])

# -----------------------------
# TF-IDF VECTORIZE SIDE EFFECTS + TRAIN/TEST SPLIT
# -----------------------------
# TfidfVectorizer rejects NaN documents, so missing side-effect text is
# replaced by an empty string (label_decision already tolerates missing text).
side_effect_text = df['side_effects'].fillna('').astype(str)

# Split row positions first, with the same test_size / random_state / stratify
# settings as before, so the TF-IDF vocabulary and IDF weights can be learned
# from the training rows only. Fitting on all rows would let the held-out rows
# influence feature extraction.
row_positions = np.arange(len(df))
train_idx, test_idx = train_test_split(
    row_positions, test_size=0.2, random_state=42, stratify=y
)

vectorizer = TfidfVectorizer(max_features=500, stop_words='english')
vectorizer.fit(side_effect_text.iloc[train_idx])

# Transform every row with the train-fitted vectorizer so X still covers the
# whole dataset.
X_text = vectorizer.transform(side_effect_text)

# Combine features: dense TF-IDF columns followed by the two encoded columns.
X_struct = df[['drug_enc', 'cond_enc']].values
X = np.hstack([X_text.toarray(), X_struct])

# -----------------------------
# TRAIN/TEST SPLIT
# -----------------------------
X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

# -----------------------------
# TRAIN XGBOOST MODEL
# -----------------------------
# Hyper-parameters are gathered in one mapping and unpacked into the constructor.
xgb_params = {
    'n_estimators': 300,
    'max_depth': 6,
    'learning_rate': 0.05,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'objective': 'multi:softprob',
    # One output per decision label known to the target encoder.
    'num_class': len(le_target.classes_),
    'eval_metric': 'mlogloss',
    'random_state': 42,
}
model = XGBClassifier(**xgb_params)
model.fit(X_train, y_train)

# -----------------------------
# EVALUATION
# -----------------------------
preds = model.predict(X_test)

# LabelEncoder codes the classes as 0..n_classes-1. Passing those ids explicitly
# keeps the report rows aligned with target_names even when a rare class is
# absent from both y_test and preds; without `labels`, classification_report
# raises on the length mismatch.
class_ids = np.arange(len(le_target.classes_))
print(
    classification_report(
        y_test,
        preds,
        labels=class_ids,
        target_names=le_target.classes_,
    )
)

# -----------------------------
# SAVE ARTIFACTS
# -----------------------------
# Output filename -> fitted object; written in this order to the working directory.
artifacts = {
    'medical_model.pkl': model,
    'tfidf_vectorizer.pkl': vectorizer,
    'drug_encoder.pkl': le_drug,
    'condition_encoder.pkl': le_cond,
    'decision_encoder.pkl': le_target,
}
for artifact_path, fitted_obj in artifacts.items():
    joblib.dump(fitted_obj, artifact_path)

print("✅ Training completed and artifacts saved successfully!")




              precision    recall  f1-score   support

    Continue       1.00      0.83      0.91         6
   Emergency       1.00      1.00      1.00       521
  See_Doctor       0.94      0.97      0.96        33

    accuracy                           0.99       560
   macro avg       0.98      0.93      0.95       560
weighted avg       0.99      0.99      0.99       560

✅ Training completed and artifacts saved successfully!
