In [19]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_predict, StratifiedKFold
from sklearn.metrics import fbeta_score
import pickle
import os
import warnings

warnings.filterwarnings("ignore")

In [20]:
df = pd.read_csv("../dataset/Training.csv")
df['prognosis'] = df['prognosis'].str.strip()
df['grouped_prognosis'] = df['grouped_prognosis'].str.strip()

In [21]:
df['diabetes_alert'] = (df['prognosis'] == 'Diabetes').astype(int)

In [22]:
# Prepare temporary X and y for Lasso
X_temp = df.drop(['grouped_prognosis', 'diabetes_alert', 'prognosis'], axis=1)
y_temp = df['diabetes_alert']

# Scale data (crucial for Lasso)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_temp)

# Fit Lasso
lasso = Lasso(alpha=0.1, random_state=42)
lasso.fit(X_scaled, y_temp)

# Extract Top 5 Features
feature_importance = pd.DataFrame({
    'Feature': X_temp.columns,
    'Abs_Coefficient': np.abs(lasso.coef_)
})

top_5_features_diabetes = feature_importance.sort_values(by='Abs_Coefficient', ascending=False).head(5)['Feature'].tolist()
print(f"Selected Top 5 Features: {top_5_features_diabetes}")

Selected Top 5 Features: ['increased_appetite', 'polyuria', 'itching', 'skin_rash', 'continuous_sneezing']


In [23]:
X = df[top_5_features_diabetes]
y = df['diabetes_alert'].values

# Split Data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print(f"Training shape: {X_train.shape}, Test shape: {X_test.shape}")

# Calculate Class Weight
ratio = float(np.sum(y_train == 0)) / np.sum(y_train == 1)

# Initialize Model
model = XGBClassifier(
    scale_pos_weight=ratio,
    random_state=42,
    verbosity=0,
    objective='binary:logistic',
    eval_metric='auc',
    n_jobs=-1
)

# Hyperparameter Grid
xgb_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'min_child_weight': [1, 3, 5]
}

cv_strategy = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Grid Search
grid = GridSearchCV(
    estimator=model,
    param_grid=xgb_param_grid,
    cv=cv_strategy,
    scoring='roc_auc',
    n_jobs=-1
)

grid.fit(X_train, y_train)
best_model = grid.best_estimator_

print(f"Best AUC Params: {grid.best_params_}")
print(f"Best CV AUC Score: {grid.best_score_:.4f}")

Training shape: (3936, 5), Test shape: (984, 5)
Best AUC Params: {'learning_rate': 0.01, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 50}
Best CV AUC Score: 1.0000


In [24]:
oof_probs = cross_val_predict(
    best_model, 
    X_train, 
    y_train, 
    cv=cv_strategy, 
    method='predict_proba', 
    n_jobs=-1
)[:, 1]

thresholds = np.arange(0.05, 0.95, 0.01)
best_f15 = 0
best_thresh = 0.5

for thresh in thresholds:
    oof_preds = (oof_probs >= thresh).astype(int)
    current_f15 = fbeta_score(y_train, oof_preds, beta=1.5)
    if current_f15 > best_f15:
        best_f1 = current_f15
        best_thresh = thresh

print(f"Optimal Threshold for F1: {best_thresh:.3f}")
print(f"Best OOF F1 Score: {best_f15:.4f}")

Optimal Threshold for F1: 0.690
Best OOF F1 Score: 0.0000


In [25]:
filename = os.path.join("../models/", "model_alert_xgb.pkl")
with open(filename, 'wb') as file:
    pickle.dump(best_model, file)

In [26]:
print("\n--- Testing Model Load ---")
with open(filename, 'rb') as file:
    model_loaded = pickle.load(file)

# Creating a dummy array based on the 5 selected features
array_example = np.array([0, 0, 0, 0, 0]).reshape(1, -1)

prob_neg = model_loaded.predict_proba(array_example)[0][0]
prob_pos = model_loaded.predict_proba(array_example)[0][1]

print(f"Input Features: {top_5_features_diabetes}")
print(f"Class 0 (Negative): {prob_neg:.4f}")
print(f"Class 1 (Positive): {prob_pos:.4f}")


--- Testing Model Load ---
Input Features: ['increased_appetite', 'polyuria', 'itching', 'skin_rash', 'continuous_sneezing']
Class 0 (Negative): 0.6971
Class 1 (Positive): 0.3029
