In [4]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
import joblib

# -----------------------------
# Load Dataset
# -----------------------------
# Read the CSV, keep only the three columns the pipeline uses, and drop
# every row that is missing a value in any of them.
data = (
    pd.read_csv("medical_dataset.csv")
    .loc[:, ['drug_name', 'medical_condition', 'side_effects']]
    .dropna()
)

# -----------------------------
# RULE-BASED LABEL CREATION (HIGH EMERGENCY SENSITIVITY)
# -----------------------------
def label_decision(text):
    """Map a side-effects description to a rule-based triage label.

    The text is lower-cased and searched for keyword substrings (plain
    ``in`` checks, not whole-word matches). Tiers are evaluated in order,
    so a text containing both an emergency keyword and a doctor keyword
    is labelled 'Emergency'.

    Args:
        text: Side-effects description; must support ``.lower()``.

    Returns:
        'Emergency', 'See_Doctor', or 'Continue' when no keyword matches.
    """
    lowered = text.lower()

    # Ordered from highest to lowest risk; the first tier with a hit wins.
    risk_tiers = (
        # HIGH-RISK symptoms (Emergency)
        ('Emergency', (
            'breathing', 'shortness of breath', 'difficulty breathing',
            'chest pain', 'seizure', 'unconscious', 'loss of consciousness',
            'swelling of face', 'anaphylaxis', 'fainting',
        )),
        # MEDIUM-RISK symptoms (See Doctor)
        ('See_Doctor', (
            'rash', 'vomiting', 'severe', 'high fever',
            'dizziness', 'blurred vision', 'fast heartbeat',
        )),
    )

    for label, keywords in risk_tiers:
        if any(keyword in lowered for keyword in keywords):
            return label

    return 'Continue'


# Derive the rule-based target label for every row from its side-effects text.
data['decision'] = data['side_effects'].map(label_decision)

# -----------------------------
# Encode categorical columns
# -----------------------------
# NOTE: the encoders only know the categories present when they are fitted.
# No fallback (such as -1) is implemented here for drug names or conditions
# that were not in the dataset; LabelEncoder.transform() rejects unseen
# values, so inference code must handle them explicitly.
le_drug = LabelEncoder()
data['drug_name_enc'] = le_drug.fit_transform(data['drug_name'])

le_cond = LabelEncoder()
data['condition_enc'] = le_cond.fit_transform(data['medical_condition'])

le_target = LabelEncoder()
y = le_target.fit_transform(data['decision'])

# NOTE(review): `decision` is derived by keyword rules from `side_effects`,
# and the same text is vectorized below as the main feature. Test metrics
# therefore mostly measure how well the model reproduces those rules.

# -----------------------------
# Train/Test Split (row positions)
# -----------------------------
# Split positional row indices BEFORE fitting the vectorizer so that the
# TF-IDF vocabulary and IDF weights are learned from training rows only.
# Fitting on all rows would leak test-set text statistics into evaluation.
train_idx, test_idx = train_test_split(
    np.arange(len(data)), test_size=0.2, random_state=42, stratify=y
)

# -----------------------------
# Text Vectorization (Side Effects)
# -----------------------------
vectorizer = TfidfVectorizer(
    max_features=500,
    stop_words='english'
)

# Use .iloc: train_idx holds positions, and the DataFrame index is not
# guaranteed to be contiguous after dropna().
vectorizer.fit(data['side_effects'].iloc[train_idx])
X_text = vectorizer.transform(data['side_effects'])

# Combine features: dense TF-IDF columns followed by the two integer-encoded
# categorical columns.
X = np.hstack([
    X_text.toarray(),
    data[['drug_name_enc', 'condition_enc']].values
])

# Materialise the split from the positions chosen above.
X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

# -----------------------------
# XGBoost Model
# -----------------------------
# Hyperparameters are collected in one mapping and unpacked into the
# classifier constructor.
xgb_params = {
    'n_estimators': 400,
    'max_depth': 6,
    'learning_rate': 0.05,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'objective': 'multi:softprob',
    'num_class': 3,  # Continue / Emergency / See_Doctor
    'eval_metric': 'mlogloss',
    'random_state': 42,
}
model = XGBClassifier(**xgb_params)

# Train on the training split only.
model.fit(X_train, y_train)

# -----------------------------
# Evaluation
# -----------------------------
# Predict on the held-out split and print per-class precision/recall/F1,
# using the original decision names as row labels.
preds = model.predict(X_test)
report = classification_report(
    y_test,
    preds,
    target_names=le_target.classes_,
)
print(report)

# -----------------------------
# Save artifacts
# -----------------------------
# Persist the model together with every fitted preprocessing object, each
# to its own file in the current working directory.
artifacts = (
    ('xgb_medical_model.pkl', model),
    ('tfidf_vectorizer.pkl', vectorizer),
    ('drug_encoder.pkl', le_drug),
    ('condition_encoder.pkl', le_cond),
    ('decision_encoder.pkl', le_target),
)
for artifact_path, artifact in artifacts:
    joblib.dump(artifact, artifact_path)

print("Training completed and files saved")




              precision    recall  f1-score   support

    Continue       1.00      0.83      0.91         6
   Emergency       0.99      1.00      1.00       525
  See_Doctor       0.93      0.93      0.93        29

    accuracy                           0.99       560
   macro avg       0.98      0.92      0.95       560
weighted avg       0.99      0.99      0.99       560

Training completed and files saved
