In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score
import xgboost as xgb
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_curve, auc
import warnings
# Install a global "ignore" filter: every Python warning raised after this
# line is suppressed for the rest of the session.
warnings.filterwarnings('ignore')

In [2]:

# Read the dataset CSV into a DataFrame.
print("Loading dataset...")
df = pd.read_csv('predict_improvement_Dataset.csv')

# Missing entries in these two columns are replaced by the literal label
# 'None' so they are treated as an ordinary category further down.
# NOTE(review): presumably the CSV stores the text "None", which read_csv's
# default NA parsing may load as NaN — confirm against the raw file.
for col_with_gaps in ('allergies', 'chronic conditions'):
    df[col_with_gaps] = df[col_with_gaps].fillna('None')

# Quick overview: dimensions, column names and the leading rows.
print(
    f"Dataset shape: {df.shape}",
    f"Columns: {list(df.columns)}",
    "\nFirst few rows:",
    df.head(),
    sep="\n",
)

Loading dataset...
Dataset shape: (15000, 8)
Columns: ['age', 'gender', 'diagnosis', 'medicine', 'allergies', 'chronic conditions', 'severity_score', 'improved']

First few rows:
   age  gender                     diagnosis     medicine   allergies  \
0   41    Male  Type 2 Diabetes_uncontrolled      Insulin  Penicillin   
1   54    Male            Hypertension_51-70   Amlodipine        None   
2   42    Male  Type 2 Diabetes_uncontrolled      Insulin        None   
3   16  Female                 Asthma_severe  Fluticasone        None   
4   22    Male                 Asthma_severe  Fluticasone        None   

  chronic conditions  severity_score improved  
0               None       38.189022      yes  
1               None       59.748302       no  
2                CKD       68.142473       no  
3               None       36.109226      yes  
4               None       64.668066       no  


In [3]:

# Relative frequency of each label in the 'improved' column.
print("\nTarget distribution:", df['improved'].value_counts(normalize=True), sep="\n")


Target distribution:
improved
no     0.5426
yes    0.4574
Name: proportion, dtype: float64


In [4]:

# Preprocessing
print("\nPreprocessing data...")

# The 'improved' column is the label; every remaining column is a feature.
y = df['improved']
X = df.drop(columns=['improved'])


Preprocessing data...


In [5]:

# Columns that get label-encoded, plus a registry (column name -> fitted
# encoder) that the encoding loop fills in.
categorical_columns = [
    'gender',
    'diagnosis',
    'medicine',
    'allergies',
    'chronic conditions',
]
label_encoders = dict()

In [6]:

# Work on a copy so the raw feature frame X stays untouched.
X_encoded = X.copy()
for col in categorical_columns:
    # One encoder per column, fitted on that column's values and stored in
    # label_encoders under the column name.
    le = LabelEncoder().fit(X[col])
    label_encoders[col] = le
    X_encoded[col] = le.transform(X[col])

In [7]:

# Encode target variable.
# The fitted encoder is kept (instead of being thrown away right after
# fit_transform) so the integer codes can be mapped back to the original
# labels with target_encoder.inverse_transform / target_encoder.classes_.
# LabelEncoder numbers the classes in sorted order of their labels.
target_encoder = LabelEncoder()
y_encoded = target_encoder.fit_transform(y)

print(f"Features after encoding: {list(X_encoded.columns)}")
print(f"Feature shapes: {X_encoded.shape}")

# Split the data: 20% hold-out, fixed seed, stratified on the encoded target
# so both parts keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

Features after encoding: ['age', 'gender', 'diagnosis', 'medicine', 'allergies', 'chronic conditions', 'severity_score']
Feature shapes: (15000, 7)


In [8]:

# Standardise features for SVM. The scaler is fitted on the training split
# only and then applied unchanged to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(
    f"Training set size: {X_train.shape}",
    f"Test set size: {X_test.shape}",
    sep="\n",
)

Training set size: (12000, 7)
Test set size: (3000, 7)


In [9]:

# Candidate classifiers, keyed by the display name that later cells use for
# reporting and for choosing a tuning grid. Disabled candidates are kept as
# comments; uncomment a line to include that model.
models = {}
models['XGBoost'] = xgb.XGBClassifier(eval_metric='logloss', random_state=42)
# models['Random Forest'] = RandomForestClassifier(random_state=42)
# models['SVM'] = SVC(random_state=42, probability=True)
models['LightGBM'] = lgb.LGBMClassifier(random_state=42)
# models['CatBoost'] = CatBoostClassifier(random_seed=42, verbose=0)

In [10]:

# Fit every candidate, score it on the hold-out split and cross-validate it
# on the training split.
results = {}          # model name -> metrics and hold-out predictions
trained_models = {}   # model name -> fitted estimator

print("\n" + "=" * 50, "TRAINING AND EVALUATING MODELS", "=" * 50, sep="\n")

for name, model in models.items():
    print(f"\n🔹 Training {name}...")

    # Only the SVM is fed the standardised matrices; every other model gets
    # the unscaled features.
    X_train_use, X_test_use = (
        (X_train_scaled, X_test_scaled) if name == 'SVM' else (X_train, X_test)
    )

    # Fit and remember the estimator.
    model.fit(X_train_use, y_train)
    trained_models[name] = model

    # Hard labels plus the predicted probability of class 1.
    y_pred = model.predict(X_test_use)
    y_pred_proba = model.predict_proba(X_test_use)[:, 1]

    # Hold-out metrics.
    accuracy = accuracy_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred_proba)

    # 5-fold cross-validated accuracy on the training split.
    cv_scores = cross_val_score(model, X_train_use, y_train, scoring='accuracy', cv=5)

    results[name] = dict(
        accuracy=accuracy,
        roc_auc=roc_auc,
        cv_mean=cv_scores.mean(),
        cv_std=cv_scores.std(),
        y_pred=y_pred,
        y_pred_proba=y_pred_proba,
    )

    # The +/- figure printed here is two standard deviations of the fold scores.
    print(
        f"✅ {name} Results:",
        f"   Accuracy: {accuracy:.4f}",
        f"   ROC-AUC: {roc_auc:.4f}",
        f"   CV Accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std()*2:.4f})",
        sep="\n",
    )


TRAINING AND EVALUATING MODELS

🔹 Training XGBoost...
✅ XGBoost Results:
   Accuracy: 0.9290
   ROC-AUC: 0.9791
   CV Accuracy: 0.9183 (+/- 0.0028)

🔹 Training LightGBM...
[LightGBM] [Info] Number of positive: 5489, number of negative: 6511
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003228 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 390
[LightGBM] [Info] Number of data points in the train set: 12000, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.457417 -> initscore=-0.170747
[LightGBM] [Info] Start training from score -0.170747
[LightGBM] [Info] Number of positive: 4392, number of negative: 5208
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003642 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 389
[LightGBM] [Info] Number of data points in the train set: 9600, number of

In [11]:

# Per-model classification report and confusion matrix on the hold-out split.
print("\n" + "=" * 50, "DETAILED MODEL EVALUATION", "=" * 50, sep="\n")

# Display names passed to classification_report for the two encoded classes.
class_labels = ['no improvement', 'improvement']

for name in models:
    holdout_pred = results[name]['y_pred']

    print(f"\n🔸 {name} Detailed Results:", "-" * 30, "Classification Report:", sep="\n")
    print(classification_report(y_test, holdout_pred, target_names=class_labels))

    print("\nConfusion Matrix:")
    cm = confusion_matrix(y_test, holdout_pred)
    print(cm)


DETAILED MODEL EVALUATION

🔸 XGBoost Detailed Results:
------------------------------
Classification Report:
                precision    recall  f1-score   support

no improvement       0.93      0.94      0.93      1628
   improvement       0.93      0.92      0.92      1372

      accuracy                           0.93      3000
     macro avg       0.93      0.93      0.93      3000
  weighted avg       0.93      0.93      0.93      3000


Confusion Matrix:
[[1530   98]
 [ 115 1257]]

🔸 LightGBM Detailed Results:
------------------------------
Classification Report:
                precision    recall  f1-score   support

no improvement       0.94      0.95      0.94      1628
   improvement       0.93      0.92      0.93      1372

      accuracy                           0.94      3000
     macro avg       0.93      0.93      0.93      3000
  weighted avg       0.93      0.94      0.93      3000


Confusion Matrix:
[[1540   88]
 [ 107 1265]]


In [12]:

# Feature importance for tree-based models
print("\n" + "="*50)
print("FEATURE IMPORTANCE")
print("="*50)

feature_names = list(X_encoded.columns)

# Walk over whatever was actually trained instead of a hard-coded name list:
# a fixed list raises KeyError as soon as one of its models is disabled in
# `models`, and silently leaves out any model that gets enabled later.
for name, model in trained_models.items():
    # Estimators that do not expose feature_importances_ are skipped entirely.
    if not hasattr(model, 'feature_importances_'):
        continue

    print(f"\n🔸 {name} Feature Importance:")
    # NOTE: each library reports importances on its own scale, so compare the
    # ranking within one model rather than raw values across models.
    importance_df = pd.DataFrame({
        'feature': feature_names,
        'importance': model.feature_importances_
    }).sort_values('importance', ascending=False)

    print(importance_df)


FEATURE IMPORTANCE

🔸 XGBoost Feature Importance:
              feature  importance
6      severity_score    0.780545
1              gender    0.038526
0                 age    0.037191
5  chronic conditions    0.037068
3            medicine    0.036226
2           diagnosis    0.035282
4           allergies    0.035163

🔸 LightGBM Feature Importance:
              feature  importance
6      severity_score         998
0                 age         931
2           diagnosis         319
5  chronic conditions         254
3            medicine         229
4           allergies         150
1              gender         119


In [13]:

# Section banner for the hyperparameter-tuning stage of the best model.
print("\n" + "=" * 50, "HYPERPARAMETER TUNING", "=" * 50, sep="\n")


HYPERPARAMETER TUNING


In [14]:

# Pick the candidate with the highest ROC-AUC recorded in `results`
# (on a tie, max() keeps the first one in insertion order).
# NOTE(review): that ROC-AUC was measured on the hold-out split, so the
# hold-out data takes part in model selection — consider selecting on a
# cross-validated score instead.
best_model_name = max(results, key=lambda model_name: results[model_name]['roc_auc'])
best_roc_auc = results[best_model_name]['roc_auc']
print(f"Best performing model: {best_model_name} (ROC-AUC: {best_roc_auc:.4f})")

Best performing model: LightGBM (ROC-AUC: 0.9812)


In [15]:
# Tuning setup per model name: (parameter grid, factory for an untrained
# estimator). Factories are lambdas so that only the selected estimator is
# actually constructed.
tuning_setups = {
    'XGBoost': (
        {
            'n_estimators': [100, 200],
            'max_depth':    [3, 5, 7],
            'learning_rate': [0.01, 0.1, 0.2]
        },
        lambda: xgb.XGBClassifier(random_state=42, eval_metric='logloss'),
    ),
    'Random Forest': (
        {
            'n_estimators':     [100, 200, 300],
            'max_depth':        [10, 20, None],
            'min_samples_split': [2, 5, 10]
        },
        lambda: RandomForestClassifier(random_state=42),
    ),
    'LightGBM': (
        {
            'n_estimators':     [100, 200, 300],
            'num_leaves':       [31, 50, 100],
            'learning_rate':    [0.01, 0.05, 0.1]
        },
        lambda: lgb.LGBMClassifier(random_state=42),
    ),
    'CatBoost': (
        {
            'iterations':        [200, 500],
            'depth':             [4, 6, 8],
            'learning_rate':     [0.01, 0.05]
        },
        lambda: CatBoostClassifier(random_seed=42, verbose=0),
    ),
    'SVM': (
        {
            'C':      [0.1, 1, 10],
            'gamma':  ['scale','auto'],
            'kernel': ['rbf','linear']
        },
        lambda: SVC(random_state=42, probability=True),
    ),
}

# Any name without its own entry falls back to the SVM setup.
param_grid, make_base_model = tuning_setups.get(best_model_name, tuning_setups['SVM'])
base_model = make_base_model()

# Choose appropriate data: standardised matrices for the SVM, unscaled
# features for everything else.
X_tune, X_test_final = (
    (X_train_scaled, X_test_scaled) if best_model_name == 'SVM' else (X_train, X_test)
)

# Exhaustive grid search with 3-fold CV, ranked by ROC-AUC, n_jobs=-1.
print(f"\nTuning hyperparameters for {best_model_name}...")
grid_search = GridSearchCV(base_model, param_grid, scoring='roc_auc', cv=3, n_jobs=-1)
grid_search.fit(X_tune, y_train)

print("Best parameters:", grid_search.best_params_)
print("Best CV ROC-AUC:", grid_search.best_score_)



Tuning hyperparameters for LightGBM...
[LightGBM] [Info] Number of positive: 5489, number of negative: 6511
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000830 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 390
[LightGBM] [Info] Number of data points in the train set: 12000, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.457417 -> initscore=-0.170747
[LightGBM] [Info] Start training from score -0.170747
Best parameters: {'learning_rate': 0.01, 'n_estimators': 200, 'num_leaves': 31}
Best CV ROC-AUC: 0.9804384160527948


In [16]:

# Score the grid search's best estimator on the hold-out split.
final_model = grid_search.best_estimator_
y_pred_proba_final = final_model.predict_proba(X_test_final)[:, 1]  # probability of class 1
y_pred_final = final_model.predict(X_test_final)

final_accuracy = accuracy_score(y_test, y_pred_final)
final_roc_auc = roc_auc_score(y_test, y_pred_proba_final)

print(
    f"\nFinal tuned {best_model_name} Results:",
    f"Accuracy: {final_accuracy:.4f}",
    f"ROC-AUC: {final_roc_auc:.4f}",
    sep="\n",
)


Final tuned LightGBM Results:
Accuracy: 0.9353
ROC-AUC: 0.9808


In [17]:

# Side-by-side table of the untuned models, followed by the tuned model's
# hold-out numbers and a short verdict on whether tuning helped.
print("\n" + "=" * 50, "FINAL MODEL COMPARISON SUMMARY", "=" * 50, sep="\n")

summary_columns = ['Model', 'Accuracy', 'ROC-AUC', 'CV Accuracy', 'CV Std']
summary_df = pd.DataFrame(
    [
        (model_name, scores['accuracy'], scores['roc_auc'], scores['cv_mean'], scores['cv_std'])
        for model_name, scores in results.items()
    ],
    columns=summary_columns,
)

print(summary_df.round(4))

# Tuned model results, reported separately from the table.
print(
    f"\nTuned {best_model_name}:",
    f"Accuracy: {final_accuracy:.4f}",
    f"ROC-AUC: {final_roc_auc:.4f}",
    sep="\n",
)

print("\n🎯 RECOMMENDATIONS:")
print(f"• Best base model: {best_model_name}")
print(f"• After tuning, {best_model_name} achieved {final_roc_auc:.4f} ROC-AUC")

# Compare the tuned ROC-AUC with the untuned score of the same model.
baseline_roc_auc = results[best_model_name]['roc_auc']
if final_roc_auc > baseline_roc_auc:
    improvement = final_roc_auc - baseline_roc_auc
    print(f"• Hyperparameter tuning improved performance by {improvement:.4f}")
else:
    print("• Hyperparameter tuning did not significantly improve performance")


FINAL MODEL COMPARISON SUMMARY
      Model  Accuracy  ROC-AUC  CV Accuracy  CV Std
0   XGBoost     0.929   0.9791       0.9183  0.0014
1  LightGBM     0.935   0.9812       0.9246  0.0015

Tuned LightGBM:
Accuracy: 0.9353
ROC-AUC: 0.9808

🎯 RECOMMENDATIONS:
• Best base model: LightGBM
• After tuning, LightGBM achieved 0.9808 ROC-AUC
• Hyperparameter tuning did not significantly improve performance


In [18]:

# Sample predictions
print("\n" + "="*50)
print("SAMPLE PREDICTIONS")
print("="*50)

# Draw the showcase rows from a seeded generator so that Restart & Run All
# prints the same samples every time (the unseeded global RNG did not), and
# cap the count so a hold-out split with fewer than 5 rows does not raise.
n_samples = min(5, len(X_test))
sample_indices = np.random.default_rng(42).choice(len(X_test), size=n_samples, replace=False)
for i in sample_indices:
    # i is a position within the hold-out arrays; encoded class 1 is reported
    # as 'improvement', anything else as 'no improvement'.
    actual = 'improvement' if y_test[i] == 1 else 'no improvement'
    predicted = 'improvement' if y_pred_final[i] == 1 else 'no improvement'
    probability = y_pred_proba_final[i]

    # The printed sample number is the 1-based position in the hold-out split.
    print(f"\nSample {i+1}:")
    print(f"  Actual: {actual}")
    print(f"  Predicted: {predicted}")
    print(f"  Probability of improvement: {probability:.3f}")


SAMPLE PREDICTIONS

Sample 376:
  Actual: improvement
  Predicted: improvement
  Probability of improvement: 0.923

Sample 408:
  Actual: no improvement
  Predicted: no improvement
  Probability of improvement: 0.064

Sample 2492:
  Actual: improvement
  Predicted: improvement
  Probability of improvement: 0.916

Sample 2002:
  Actual: no improvement
  Predicted: no improvement
  Probability of improvement: 0.237

Sample 2543:
  Actual: improvement
  Predicted: improvement
  Probability of improvement: 0.923


In [19]:
import joblib

# Persist the tuned model and the encoders needed to prepare inputs for it.
# Each path is defined once and reused in its log line, so the messages always
# name the file that was really written (they previously named files such as
# best_model.pkl / target.pkl that are never created).
MODEL_PATH = 'predict_improvement_model.pkl'
ENCODERS_PATH = 'predict_improvement_encoders.pkl'
TARGET_ENCODER_PATH = 'predict_improvement_target.pkl'

joblib.dump(final_model, MODEL_PATH)
print(f"▶ Saved best model to {MODEL_PATH}")

# Dict of column name -> fitted LabelEncoder for the categorical features.
joblib.dump(label_encoders, ENCODERS_PATH)
print(f"▶ Saved label encoders to {ENCODERS_PATH}")

# Encoder for the target column, fitted on the raw labels in y.
target_encoder = LabelEncoder().fit(y)
joblib.dump(target_encoder, TARGET_ENCODER_PATH)
print(f"▶ Saved target encoder to {TARGET_ENCODER_PATH}")

# NOTE(review): the StandardScaler is not persisted. If the SVM is ever the
# selected model (it is tuned on scaled features), `scaler` must be saved too.


▶ Saved best model to best_model.pkl
▶ Saved label encoders to predict_improvement_encoders
▶ Saved target encoder to target.pkl
