In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.calibration import CalibratedClassifierCV
from xgboost import XGBClassifier
import joblib

# =============================
# LOAD DATA
# =============================
df = pd.read_csv("medical_dataset_updated.csv")
df['side_effects'] = df['side_effects'].fillna("").astype(str)

# =============================
# MEDICAL-GRADE LABELING
# =============================
def label_decision(text):
    text = text.lower()

    emergency = [
        'difficulty breathing', 'chest pain', 'seizure',
        'loss of consciousness', 'anaphylaxis',
        'swelling of face', 'throat closing'
    ]

    see_doctor = [
        'severe', 'persistent vomiting', 'high fever',
        'confusion', 'blurred vision', 'fast heartbeat',
        'rash spreading', 'severe dizziness'
    ]

    for k in emergency:
        if k in text:
            return 'Emergency'

    for k in see_doctor:
        if k in text:
            return 'See_Doctor'

    return 'Continue'


df['decision'] = df['side_effects'].apply(label_decision)

# =============================
# ENCODERS
# =============================
le_drug = LabelEncoder()
df['drug_enc'] = le_drug.fit_transform(df['drug_name'])

le_cond = LabelEncoder()
df['cond_enc'] = le_cond.fit_transform(df['medical_condition'])

le_target = LabelEncoder()
y = le_target.fit_transform(df['decision'])

# =============================
# TF-IDF (STRONG NLP)
# =============================
vectorizer = TfidfVectorizer(
    max_features=1200,
    ngram_range=(1, 2),
    min_df=3,
    max_df=0.9,
    stop_words='english',
    sublinear_tf=True
)

X_text = vectorizer.fit_transform(df['side_effects'])

# =============================
# CORE FEATURES ONLY
# =============================
X_struct = df[['drug_enc', 'cond_enc']].values
X = np.hstack([X_text.toarray(), X_struct])

# =============================
# TRAIN / TEST SPLIT
# =============================
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

# =============================
# BASE MODEL
# =============================
base_model = XGBClassifier(
    n_estimators=400,
    max_depth=7,
    learning_rate=0.03,
    subsample=0.85,
    colsample_bytree=0.85,
    objective='multi:softprob',
    num_class=len(le_target.classes_),
    eval_metric='mlogloss',
    random_state=42
)

# =============================
# PROBABILITY CALIBRATION
# =============================
model = CalibratedClassifierCV(
    base_model,
    method='isotonic',
    cv=3
)

model.fit(X_train, y_train)

# =============================
# EVALUATION
# =============================
preds = model.predict(X_test)

print("\nðŸ“Š MEDICAL-GRADE REPORT\n")
print(classification_report(
    y_test,
    preds,
    target_names=le_target.classes_
))

# =============================
# SAVE ARTIFACTS
# =============================
joblib.dump(model, 'medical_model.pkl')
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')
joblib.dump(le_drug, 'drug_encoder.pkl')
joblib.dump(le_cond, 'condition_encoder.pkl')
joblib.dump(le_target, 'decision_encoder.pkl')

print("âœ… Model trained & saved successfully (Clinical-Grade)")







ðŸ“Š MEDICAL-GRADE REPORT

              precision    recall  f1-score   support

    Continue       0.99      1.00      0.99        95
   Emergency       1.00      1.00      1.00       297
  See_Doctor       0.99      0.99      0.99       168

    accuracy                           0.99       560
   macro avg       0.99      0.99      0.99       560
weighted avg       0.99      0.99      0.99       560

âœ… Model trained & saved successfully (Clinical-Grade)
