In [40]:
import pandas as pd
import numpy as np
import duckdb
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_predict, StratifiedKFold
from sklearn.metrics import fbeta_score, classification_report, confusion_matrix, roc_auc_score

In [41]:
import pickle

In [42]:
import warnings
warnings.filterwarnings("ignore")

# Import data

In [43]:
df = pd.read_csv("./dataset/Training.csv")

diabetes

In [44]:
df_diabetes = duckdb.query("""
select *
, case when prognosis = 'Diabetes ' then 1 else 0 end diabetes_alert
from df
""").to_df()

In [45]:
df_diabetes = df_diabetes.drop(columns='prognosis')

In [46]:
df_diabetes['diabetes_alert'].value_counts()

diabetes_alert
0    4800
1     120
Name: count, dtype: int64

typhoid

In [47]:
df_typhoid = duckdb.query("""
select *
, case when prognosis = 'Typhoid' then 1 else 0 end typhoid_alert
from df
""").to_df()

In [48]:
df_typhoid = df_typhoid.drop(columns='prognosis')

In [49]:
df_typhoid['typhoid_alert'].value_counts()

typhoid_alert
0    4800
1     120
Name: count, dtype: int64

# Lasso (Top 5 Feature)

diabetes

In [50]:
X = df_diabetes.drop('diabetes_alert', axis=1)
y = df_diabetes['diabetes_alert']

In [51]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

lasso = Lasso(alpha=0.1, random_state=42)
lasso.fit(X_scaled, y)

feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'Coefficient': lasso.coef_,
    'Abs_Coefficient': np.abs(lasso.coef_)
})

top_5_features = feature_importance.sort_values(by='Abs_Coefficient', ascending=False).head(5)
top_5_features_diabetes = top_5_features['Feature'].to_list()

In [52]:
top_5_features_diabetes

['increased_appetite',
 'polyuria',
 'itching',
 'skin_rash',
 'continuous_sneezing']

typhoid

In [53]:
X = df_typhoid.drop('typhoid_alert', axis=1)
y = df_typhoid['typhoid_alert']

In [54]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

lasso = Lasso(alpha=0.1, random_state=42)
lasso.fit(X_scaled, y)

feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'Coefficient': lasso.coef_,
    'Abs_Coefficient': np.abs(lasso.coef_)
})

top_5_features = feature_importance.sort_values(by='Abs_Coefficient', ascending=False).head(5)
top_5_features_typhoid = top_5_features['Feature'].to_list()

In [55]:
top_5_features_typhoid

['toxic_look_(typhos)',
 'belly_pain',
 'itching',
 'skin_rash',
 'continuous_sneezing']

# Training

diabetes

In [56]:
X = df_diabetes[top_5_features_diabetes]
y = df_diabetes['diabetes_alert'].values

In [57]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

In [58]:
print(f"Training shape: {X_train.shape}, Test shape: {X_test.shape}")

Training shape: (3444, 5), Test shape: (1476, 5)


In [59]:
cv_strategy = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

ratio = float(np.sum(y_train == 0)) / np.sum(y_train == 1)
model = XGBClassifier(
    scale_pos_weight=ratio,
    random_state=42,
    verbosity=0,
    objective='binary:logistic',
    eval_metric='auc',
    n_jobs=-1
)

xgb_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'min_child_weight': [1, 3, 5]
}

grid = GridSearchCV(
    estimator=model,
    param_grid=xgb_param_grid,
    cv=cv_strategy,
    scoring='roc_auc',
    n_jobs=-1
)

grid.fit(X_train, y_train)

best_model = grid.best_estimator_
print(f"Best AUC Params: {grid.best_params_}")
print(f"Best CV AUC Score: {grid.best_score_:.4f}")

Best AUC Params: {'learning_rate': 0.01, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 50}
Best CV AUC Score: 1.0000


In [60]:
oof_probs = cross_val_predict(
    best_model, 
    X_train, 
    y_train, 
    cv=cv_strategy, 
    method='predict_proba', 
    n_jobs=-1
)[:, 1]

thresholds = np.arange(0.05, 0.95, 0.01)
best_f1 = 0
best_thresh = 0.5

for thresh in thresholds:
    oof_preds = (oof_probs >= thresh).astype(int)
    current_f1 = fbeta_score(y_train, oof_preds, beta=1)
    if current_f1 > best_f1:
        best_f1 = current_f1
        best_thresh = thresh

print(f"Optimal Threshold for F1: {best_thresh:.3f}")
print(f"Best OOF F1 Score: {best_f1:.4f}")

Optimal Threshold for F1: 0.310
Best OOF F1 Score: 1.0000


In [61]:
with open('./trained_model/lgb_model_diabetes.pkl', 'wb') as file:
    pickle.dump(best_model, file)

typhoid

In [62]:
X = df_typhoid[top_5_features_typhoid]
y = df_typhoid['typhoid_alert'].values

In [63]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

In [64]:
print(f"Training shape: {X_train.shape}, Test shape: {X_test.shape}")

Training shape: (3444, 5), Test shape: (1476, 5)


In [65]:
cv_strategy = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

ratio = float(np.sum(y_train == 0)) / np.sum(y_train == 1)
model = XGBClassifier(
    scale_pos_weight=ratio,
    random_state=42,
    verbosity=0,
    objective='binary:logistic',
    eval_metric='auc',
    n_jobs=-1
)

xgb_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'min_child_weight': [1, 3, 5]
}

grid = GridSearchCV(
    estimator=model,
    param_grid=xgb_param_grid,
    cv=cv_strategy,
    scoring='roc_auc',
    n_jobs=-1
)

grid.fit(X_train, y_train)

best_model = grid.best_estimator_
print(f"Best AUC Params: {grid.best_params_}")
print(f"Best CV AUC Score: {grid.best_score_:.4f}")

Best AUC Params: {'learning_rate': 0.01, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 50}
Best CV AUC Score: 1.0000


In [66]:
oof_probs = cross_val_predict(
    best_model, 
    X_train, 
    y_train, 
    cv=cv_strategy, 
    method='predict_proba', 
    n_jobs=-1
)[:, 1]

thresholds = np.arange(0.05, 0.95, 0.01)
best_f1 = 0
best_thresh = 0.5

for thresh in thresholds:
    oof_preds = (oof_probs >= thresh).astype(int)
    current_f1 = fbeta_score(y_train, oof_preds, beta=1)
    if current_f1 > best_f1:
        best_f1 = current_f1
        best_thresh = thresh

print(f"Optimal Threshold for F1: {best_thresh:.3f}")
print(f"Best OOF F1 Score: {best_f1:.4f}")

Optimal Threshold for F1: 0.310
Best OOF F1 Score: 1.0000


In [67]:
with open('./trained_model/lgb_model_typhoid.pkl', 'wb') as file:
    pickle.dump(best_model, file)

# Test

diabetes

In [68]:
with open('./trained_model/lgb_model_diabetes.pkl', 'rb') as file:
    model_diabetes = pickle.load(file)

In [69]:
array_example = np.array([1,0,1,0,0]).reshape(1, -1)

In [70]:
model_diabetes.predict_proba(array_example)[0][0]

np.float32(0.30295193)

In [71]:
model_diabetes.predict_proba(array_example)[0][1]

np.float32(0.69704807)

typhoid

In [72]:
with open('./trained_model/lgb_model_typhoid.pkl', 'rb') as file:
    model_typhoid = pickle.load(file)

In [73]:
array_example = np.array([0,0,0,0,0]).reshape(1, -1)

In [74]:
model_typhoid.predict_proba(array_example)[0][0]

np.float32(0.697048)

In [75]:
model_typhoid.predict_proba(array_example)[0][1]

np.float32(0.302952)