In [16]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_predict, StratifiedKFold
from sklearn.metrics import fbeta_score
import pickle
import os
import warnings

warnings.filterwarnings("ignore")

In [17]:
df = pd.read_csv("../dataset/Training.csv")
df['grouped_prognosis'] = df['grouped_prognosis'].str.strip()

In [18]:
selected_features = [
    'increased_appetite',
    'polyuria',
    'itching',
    'skin_rash',
    'continuous_sneezing'
]

In [19]:
target_groups = [
    'Cardiovascular Diseases',
    'Gastrointestinal & Liver Diseases',
    'Infectious Diseases'
]

In [20]:
for target in target_groups:
    print(f"\n{'='*10} Processing: {target} {'='*10}")
    
    # Define X (Features) and y (Target)
    X = df[selected_features]  # <--- LOCKED to only your 5 features
    y = (df['grouped_prognosis'] == target).astype(int)
    
    # Split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    
    # Scale weight for imbalance
    ratio = float(np.sum(y_train == 0)) / np.sum(y_train == 1)
    
    # Model Setup
    model = XGBClassifier(
        scale_pos_weight=ratio,
        random_state=42,
        verbosity=0,
        objective='binary:logistic',
        n_jobs=-1
    )
    
    xgb_param_grid = {
        'n_estimators': [50, 100],
        'max_depth': [3, 5],
        'learning_rate': [0.1], 
        'min_child_weight': [1]
    }
    
    cv_strategy = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    
    # Train
    grid = GridSearchCV(
        model, xgb_param_grid, cv=cv_strategy, scoring='roc_auc', n_jobs=-1
    )
    grid.fit(X_train, y_train)
    best_model = grid.best_estimator_
    
    # Optimize Threshold
    oof_probs = cross_val_predict(
        best_model, X_train, y_train, cv=cv_strategy, method='predict_proba', n_jobs=-1
    )[:, 1]
    
    thresholds = np.arange(0.05, 0.95, 0.01)
    best_f05 = 0
    best_thresh = 0.5
    
    for thresh in thresholds:
        oof_preds = (oof_probs >= thresh).astype(int)
        current_f05 = fbeta_score(y_train, oof_preds, beta=0.5)
        if current_f05 > best_f05:
            best_f1 = current_f05
            best_thresh = thresh
            
    print(f"Features Used: {selected_features}")
    print(f"Best CV AUC: {grid.best_score_:.4f}")
    print(f"Optimal Threshold: {best_thresh:.3f}")

    # Save ONLY the model object
    safe_name = target.replace(" & ", "_").replace(" ", "_").lower()
    filename = os.path.join("../models/", f"model_side_{safe_name}_xgb.pkl")
    
    with open(filename, 'wb') as f:
        pickle.dump(best_model, f)
        
    print(f"Saved model to: {filename}")


Features Used: ['increased_appetite', 'polyuria', 'itching', 'skin_rash', 'continuous_sneezing']
Best CV AUC: 0.6645
Optimal Threshold: 0.590
Saved model to: ../models/model_side_cardiovascular_diseases_xgb.pkl

Features Used: ['increased_appetite', 'polyuria', 'itching', 'skin_rash', 'continuous_sneezing']
Best CV AUC: 0.7388
Optimal Threshold: 0.940
Saved model to: ../models/model_side_gastrointestinal_liver_diseases_xgb.pkl

Features Used: ['increased_appetite', 'polyuria', 'itching', 'skin_rash', 'continuous_sneezing']
Best CV AUC: 0.6971
Optimal Threshold: 0.890
Saved model to: ../models/model_side_infectious_diseases_xgb.pkl
