In [42]:
import warnings
from pathlib import Path

import joblib
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier

# Install a global "ignore" filter: every warning category is silenced for
# the rest of the kernel session.
# NOTE(review): a blanket filter also hides diagnostics emitted by pandas and
# scikit-learn in the cells below; consider narrowing it to specific
# categories/modules once the notebook runs clean.
warnings.filterwarnings("ignore")

In [43]:
# Diabetes CSV hosted in plotly's public `datasets` repository; fetched over
# HTTP each time this cell runs.
DATA_URL = 'https://github.com/plotly/datasets/raw/refs/heads/master/diabetes.csv'
df = pd.read_csv(DATA_URL)

In [44]:
def bmi_category(bmi):
    """Map a BMI value to its conventional adult weight-status label.

    Bands (lower bound inclusive, upper bound exclusive):
        bmi < 18.5         -> 'Underweight'
        18.5 <= bmi < 25   -> 'Normal'
        25 <= bmi < 30     -> 'Overweight'
        bmi >= 30          -> 'Obese'

    The cut-offs are 25 and 30 (not 24.9 / 29.9) so that values such as 24.9
    and 29.9 stay in the 'Normal' and 'Overweight' bands respectively.

    Parameters
    ----------
    bmi : float
        Body-mass index.

    Returns
    -------
    str
        One of 'Underweight', 'Normal', 'Overweight', 'Obese'.

    Notes
    -----
    NaN compares False against every bound and therefore falls through to
    'Obese'; clean missing values before calling if that matters.
    """
    if bmi < 18.5:
        return 'Underweight'
    elif bmi < 25:
        return 'Normal'
    elif bmi < 30:
        return 'Overweight'
    else:
        return 'Obese'

# Label every row with its BMI bucket, then preview the raw value next to the
# derived label as a quick sanity check.
df['BMI_category'] = df['BMI'].map(bmi_category)
df[['BMI', 'BMI_category']].head()


Unnamed: 0,BMI,BMI_category
0,33.6,Obese
1,26.6,Overweight
2,23.3,Normal
3,28.1,Overweight
4,43.1,Obese


In [45]:
# Hold out 20% of the rows for validation; the fixed seed makes the split
# reproducible across runs.
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

In [46]:
numeric_features = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'Age']
categorical_features = ['BMI_category']

# Standardise the numeric columns. The scaler is fitted on the training split
# only, and the scaled values go into NEW frames: train_df / val_df keep their
# raw measurements, so this cell is safe to re-run and raw rows stay available
# to any code that applies the fitted scaler itself.
scaler = StandardScaler()
train_numeric_df = pd.DataFrame(
    scaler.fit_transform(train_df[numeric_features]),
    columns=numeric_features,
)
val_numeric_df = pd.DataFrame(
    scaler.transform(val_df[numeric_features]),
    columns=numeric_features,
)

# One-hot encode the BMI bucket. handle_unknown='ignore' encodes a category
# that never appeared in the training split as all zeros instead of raising.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
train_encoded = encoder.fit_transform(train_df[categorical_features])
val_encoded = encoder.transform(val_df[categorical_features])

encoded_feature_names = encoder.get_feature_names_out(categorical_features)
train_encoded_df = pd.DataFrame(train_encoded, columns=encoded_feature_names)
val_encoded_df = pd.DataFrame(val_encoded, columns=encoded_feature_names)

# Every piece carries a fresh 0..n-1 index, so the column-wise concat lines
# the rows up by position.
X_train = pd.concat([train_numeric_df, train_encoded_df], axis=1)
X_val = pd.concat([val_numeric_df, val_encoded_df], axis=1)

# Reset the target index too, so X and y share the same row labels.
y_train = train_df['Outcome'].reset_index(drop=True)
y_val = val_df['Outcome'].reset_index(drop=True)

print(X_train.head())

    Glucose  BloodPressure  SkinThickness   Insulin       BMI       Age  \
0 -1.151398      -3.752683      -1.322774 -0.701206 -4.135256 -1.035940   
1 -0.276643       0.680345       0.233505 -0.701206 -0.489169  1.487101   
2  0.566871      -1.265862      -0.090720  0.013448 -0.424522 -0.948939   
3  1.254179      -1.049617      -1.322774 -0.701206 -1.303720  2.792122   
4  0.410665       0.572222       1.076490  2.484601  1.838121  1.139095   

   BMI_category_Normal  BMI_category_Obese  BMI_category_Overweight  \
0                  0.0                 0.0                      0.0   
1                  0.0                 0.0                      1.0   
2                  0.0                 0.0                      1.0   
3                  1.0                 0.0                      0.0   
4                  0.0                 1.0                      0.0   

   BMI_category_Underweight  
0                       1.0  
1                       0.0  
2                       0.0  
3 

In [47]:
# Sweep a few neighbourhood sizes and remember the one with the highest
# validation F1. The strict '>' keeps the earliest k on ties.
# Note: once the loop ends, `knn` is the LAST candidate fitted, which is not
# necessarily the best-scoring one.
best_knn_f1, best_k = 0, 0
for k in (3, 5, 7):
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    preds = knn.predict(X_val)
    f1 = f1_score(y_val, preds)
    print(f"k={k}, F1 Score: {f1}")
    best_knn_f1, best_k = (f1, k) if f1 > best_knn_f1 else (best_knn_f1, best_k)

print(f"Best K: {best_k}, Best F1 Score: {best_knn_f1}")


k=3, F1 Score: 0.5454545454545454
k=5, F1 Score: 0.6037735849056604
k=7, F1 Score: 0.6605504587155964
Best K: 7, Best F1 Score: 0.6605504587155964


In [48]:
# Sweep the tree depth and remember the setting with the highest validation
# F1. The strict '>' keeps the shallowest depth on ties.
# Note: once the loop ends, `dt` is the LAST candidate fitted, which is not
# necessarily the best-scoring one.
best_dt_f1, best_depth = 0, 0
for depth in (3, 5, 7):
    dt = DecisionTreeClassifier(max_depth=depth, random_state=42).fit(X_train, y_train)
    preds = dt.predict(X_val)
    f1 = f1_score(y_val, preds)
    print(f"max_depth={depth}, F1 Score: {f1}")
    best_dt_f1, best_depth = (f1, depth) if f1 > best_dt_f1 else (best_dt_f1, best_depth)

print(f"Best max_depth: {best_depth}, Best F1 Score: {best_dt_f1}")


max_depth=3, F1 Score: 0.6476190476190476
max_depth=5, F1 Score: 0.6379310344827587
max_depth=7, F1 Score: 0.5535714285714286
Best max_depth: 3, Best F1 Score: 0.6476190476190476


In [49]:
# Refit the winning model family with its best hyper-parameter. After the
# tuning loops, `knn` and `dt` hold the LAST candidate tried (k=7 /
# max_depth=7), which is not necessarily the one that scored best, so they
# must not be persisted directly.
if best_knn_f1 > best_dt_f1:
    best_model = KNeighborsClassifier(n_neighbors=best_k)
else:
    best_model = DecisionTreeClassifier(max_depth=best_depth, random_state=42)
best_model.fit(X_train, y_train)

# joblib.dump does not create missing parent directories, so make sure the
# output folder exists before writing the artifacts.
Path('models').mkdir(parents=True, exist_ok=True)
joblib.dump(scaler, 'models/scaler.pkl')
joblib.dump(encoder, 'models/encoder.pkl')
joblib.dump(best_model, 'models/best_model.pkl')

def inference(sample):
    """Predict the outcome for raw (unscaled) samples using the saved artifacts.

    Loads the scaler, encoder and model from the 'models/' folder, applies the
    same preprocessing used for training, and returns the model's prediction.

    Parameters
    ----------
    sample : pandas.DataFrame
        One or more rows containing the columns listed in the module-level
        `numeric_features` and `categorical_features`. Numeric columns must be
        the raw measurements: the scaler is applied here, so passing values
        that were already scaled would scale them twice.

    Returns
    -------
    The result of `model.predict` for the transformed rows (one prediction
    per input row).

    Notes
    -----
    joblib.load unpickles the files; only load artifacts you trust.
    """
    scaler = joblib.load('models/scaler.pkl')
    encoder = joblib.load('models/encoder.pkl')
    model = joblib.load('models/best_model.pkl')

    # Rebuild the frames with the same column names the model was fitted on,
    # instead of anonymous integer labels (which would also be duplicated
    # across the two parts after the concat).
    numeric_part = pd.DataFrame(
        scaler.transform(sample[numeric_features]),
        columns=numeric_features,
    )
    categorical_part = pd.DataFrame(
        encoder.transform(sample[categorical_features]),
        columns=encoder.get_feature_names_out(categorical_features),
    )

    # Both parts carry a fresh 0..n-1 index, so rows align by position.
    sample_transformed = pd.concat([numeric_part, categorical_part], axis=1)

    prediction = model.predict(sample_transformed)
    return prediction

# inference() applies the fitted scaler itself, so it must receive raw
# measurements. Pull the first validation rows from the original `df` by index
# label rather than from val_df, which should not be assumed to still hold
# unscaled values at this point.
for i in range(5):
    sample = df.loc[[val_df.index[i]]]
    print(f"Sample {i + 1} Prediction: {inference(sample)}")

Sample 1 Prediction: [0]
Sample 2 Prediction: [0]
Sample 3 Prediction: [0]
Sample 4 Prediction: [0]
Sample 5 Prediction: [0]
