In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

# Load the raw data and keep only rows with age >= 18 and bmi <= 40.
# The explicit .copy() detaches the filtered frame from the raw one, so the
# column assignments further down operate on an independent DataFrame and
# cannot trigger pandas' SettingWithCopyWarning.
df = pd.read_csv('diabetes_old.csv')
df = df[(df['age'] >= 18) & (df['bmi'] <= 40)].copy()

# Preprocess the data
# Partition the columns: a column counts as "numeric" only when it has a
# numeric dtype AND at least 5 distinct values; everything else (strings and
# low-cardinality numeric flags) is treated as categorical.
numeric_col = []
non_numeric_col = []
for column in df.columns:
    is_continuous = pd.api.types.is_numeric_dtype(df[column]) and df[column].nunique() >= 5
    if is_continuous:
        numeric_col.append(column)
    else:
        non_numeric_col.append(column)

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

# Collapse overlapping smoking categories before encoding.
df['smoking_history'] = df['smoking_history'].replace({'not current': 'former', 'ever': 'never'})

# Snapshot of the frame taken before label encoding.
df_copy = df.copy()

# Integer-encode every categorical column in place. The encoder is refit for
# each column, so after the loop `le` only holds the classes of the last one.
for col in non_numeric_col:
    df[col] = le.fit_transform(df[col])



In [2]:
# Feature matrix: every column except the target; target vector: 'diabetes'.
X = df.drop(columns=['diabetes'])
y = df['diabetes']

In [3]:
# Ratio of label-0 rows to label-1 rows in the target. It is later handed to
# XGBoost as `scale_pos_weight`.
class_counts = y.value_counts()
scale_pos_weight = class_counts[0] / class_counts[1]

In [4]:
# Split the data 70 / 15 / 15 into train / test / validation sets, stratified
# on the target so every split keeps the same class balance.
# A fixed random_state makes the splits (and therefore every downstream metric)
# reproducible across kernel restarts.
SPLIT_SEED = 42
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=SPLIT_SEED
)
X_test, X_val, y_test, y_val = train_test_split(
    X_holdout, y_holdout, test_size=0.5, stratify=y_holdout, random_state=SPLIT_SEED
)

# Normalize the data: the min-max scaler is fitted on the training split only
# and then applied unchanged to the test and validation splits.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_val_scaled = scaler.transform(X_val)

In [None]:
import optuna
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import precision_recall_curve, fbeta_score


# Assumes X_train_scaled, X_val_scaled, y_train, y_val and scale_pos_weight
# have been defined by earlier cells.
optuna.logging.set_verbosity(optuna.logging.WARNING)


def objective(trial):
    """Optuna objective: train one XGBoost model and score it on the validation set.

    Samples `max_depth`, `gamma`, `learning_rate` and `subsample` from the
    trial, trains a DART booster for 100 rounds on the training split, and
    returns the best F-beta score (beta = 1.8) reachable over all decision
    thresholds of the validation precision-recall curve.

    Args:
        trial: the `optuna` trial used to sample hyper-parameters.

    Returns:
        The maximum F-beta score over all thresholds (higher is better).
    """
    dtrain = xgb.DMatrix(X_train_scaled, label=y_train)
    dval = xgb.DMatrix(X_val_scaled, label=y_val)

    param = {
        'booster': 'dart',
        # NOTE(review): 'gpu_hist' is deprecated in XGBoost >= 2.0 in favour of
        # tree_method='hist' with device='cuda' - confirm the installed version.
        'tree_method': 'gpu_hist',
        # xgb.train defaults to 'reg:squarederror'; this is a binary
        # classification task, so request the logistic objective explicitly so
        # that predict() returns class-1 probabilities.
        'objective': 'binary:logistic',
        'scale_pos_weight': scale_pos_weight,
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        'gamma': trial.suggest_float('gamma', 0, 1),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1),
        'subsample': trial.suggest_float('subsample', 0.5, 0.8),
        'eval_metric': 'auc',
    }

    num_boost_round = 100
    model = xgb.train(param, dtrain, num_boost_round=num_boost_round)
    preds = model.predict(dval)

    precision, recall, _ = precision_recall_curve(y_val, preds)
    beta = 1.8

    # F-beta at every point of the PR curve. Points where precision and recall
    # are both 0 produce 0/0 = NaN; silence the warning and map them to 0 so
    # they can never win the max below.
    with np.errstate(divide='ignore', invalid='ignore'):
        f_beta_scores = (1 + beta**2) * (precision * recall) / ((beta**2 * precision) + recall)
    f_beta_scores = np.nan_to_num(f_beta_scores)

    return np.max(f_beta_scores)


# Run the hyper-parameter search, maximising the value returned by `objective`.
# The TPE sampler is given a fixed seed so that repeated runs propose the same
# sequence of trials.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=20)

print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

# To use the best parameters: unpack them into individual variables.
best_params = study.best_trial.params

max_depth = best_params['max_depth']
gamma = best_params['gamma']
learning_rate = best_params['learning_rate']
subsample = best_params['subsample']



In [None]:
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
# Train the final model with the tuned hyper-parameters.
params = {
    'tree_method': 'gpu_hist',
    'booster': 'dart',
    # xgb.train defaults to 'reg:squarederror'; this is a binary classification
    # task, so request the logistic objective explicitly so that predict()
    # returns class-1 probabilities.
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'scale_pos_weight': scale_pos_weight,
    'max_depth': max_depth,
    'gamma': gamma,
    'learning_rate': learning_rate,
    'subsample': subsample,
}

dtrain = xgb.DMatrix(X_train_scaled, label=y_train)
dtest = xgb.DMatrix(X_test_scaled, label=y_test)
dval = xgb.DMatrix(X_val_scaled, label=y_val)

model = xgb.train(params, dtrain, num_boost_round=100)

# Raw (un-thresholded) model scores on the validation split.
y_pred_val_original = model.predict(dval)

In [None]:
from sklearn.metrics import precision_recall_curve, fbeta_score
import numpy as np

precision, recall, thresholds = precision_recall_curve(y_val, y_pred_val_original)
beta = 1.8

# Calculate F-beta scores for each threshold.
# Points where precision and recall are both 0 evaluate to 0/0 = NaN, and
# np.argmax would return the index of the first NaN instead of the true
# maximum. Silence the warning and map those points to 0 so they cannot win.
with np.errstate(divide='ignore', invalid='ignore'):
    f_beta_scores = (1 + beta**2) * (precision * recall) / ((beta**2 * precision) + recall)
f_beta_scores = np.nan_to_num(f_beta_scores)

# Find the index of the maximum F-beta score.
# precision/recall have one more entry than thresholds (the final
# precision=1, recall=0 point), hence the bounds check.
opt_idx = np.argmax(f_beta_scores)
opt_threshold = thresholds[opt_idx] if opt_idx < len(thresholds) else 1.0
opt_f_beta_score = f_beta_scores[opt_idx]

print(f"Optimal threshold: {opt_threshold}, F{beta} Score: {opt_f_beta_score}")

# Use the optimal threshold to convert scores to binary predictions
# (score >= threshold counts as positive).
y_pred_optimal = (y_pred_val_original >= opt_threshold).astype(int)

# Calculate the final F-beta score using the optimal threshold
final_f_beta_score = fbeta_score(y_val, y_pred_optimal, beta=beta)
print(f"Final F{beta} Score with Optimal Threshold: {final_f_beta_score}")


In [None]:
# Binarise the validation scores. `>=` is used because the thresholds returned
# by precision_recall_curve are defined for "score >= threshold"; a strict `>`
# would drop the sample that sits exactly on the chosen threshold.
y_pred_val = (y_pred_val_original >= opt_threshold).astype(int)
print(classification_report(y_val, y_pred_val))
# ROC AUC is a ranking metric, so it is computed on the continuous scores
# rather than on the thresholded 0/1 labels.
print(f' AUC score is : {roc_auc_score(y_val, y_pred_val_original)}')

In [None]:
# Score the held-out test split and keep the raw scores separately from the
# thresholded labels so both are available below.
y_score_test = model.predict(dtest)
# `>=` matches the "score >= threshold" convention under which the threshold
# was selected on the validation precision-recall curve.
y_pred_test = (y_score_test >= opt_threshold).astype(int)
print(classification_report(y_test, y_pred_test))
# ROC AUC is a ranking metric, so it is computed on the continuous scores
# rather than on the thresholded 0/1 labels.
print(f' AUC score is : {roc_auc_score(y_test, y_score_test)}')