In [1]:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.discriminant_analysis import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


df = pd.read_csv("data/diabetes.csv")

# convert all the values to float 

df['gender'] = df['gender'].map({'female': 0, 'male': 1})
df['diabetes'] = df['diabetes'].map({'No diabetes': 0, 'Diabetes': 1})

for column in ['chol_hdl_ratio', 'bmi', 'waist_hip_ratio']:
    df[column] = df[column].str.replace(',', '.').astype(float)

for column in ['cholesterol', 'glucose', 'hdl_chol', 'age', 'height', 'weight', 'systolic_bp', 'diastolic_bp', 'waist', 'hip']:
    df[column] = df[column].astype(float)


df.head()

Unnamed: 0,patient_number,cholesterol,glucose,hdl_chol,chol_hdl_ratio,age,gender,height,weight,bmi,systolic_bp,diastolic_bp,waist,hip,waist_hip_ratio,diabetes
0,1,193.0,77.0,49.0,3.9,19.0,0,61.0,119.0,22.5,118.0,70.0,32.0,38.0,0.84,0
1,2,146.0,79.0,41.0,3.6,19.0,0,60.0,135.0,26.4,108.0,58.0,33.0,40.0,0.83,0
2,3,217.0,75.0,54.0,4.0,20.0,0,67.0,187.0,29.3,110.0,72.0,40.0,45.0,0.89,0
3,4,226.0,97.0,70.0,3.2,20.0,0,64.0,114.0,19.6,122.0,64.0,31.0,39.0,0.79,0
4,5,164.0,91.0,67.0,2.4,20.0,0,70.0,141.0,20.2,122.0,86.0,32.0,39.0,0.82,0


In [3]:
df.describe()

Unnamed: 0,patient_number,cholesterol,glucose,hdl_chol,chol_hdl_ratio,age,gender,height,weight,bmi,systolic_bp,diastolic_bp,waist,hip,waist_hip_ratio,diabetes
count,390.0,390.0,390.0,390.0,390.0,390.0,390.0,390.0,390.0,390.0,390.0,390.0,390.0,390.0,390.0,390.0
mean,195.5,207.230769,107.338462,50.266667,4.524615,46.774359,0.415385,65.951282,177.407692,28.775641,137.133333,83.289744,37.869231,42.992308,0.881385,0.153846
std,112.727548,44.666005,53.798188,17.279069,1.736634,16.435911,0.493421,3.918867,40.407824,6.600915,22.859528,13.498192,5.760947,5.664342,0.073212,0.361265
min,1.0,78.0,48.0,12.0,1.5,19.0,0.0,52.0,99.0,15.2,90.0,48.0,26.0,30.0,0.68,0.0
25%,98.25,179.0,81.0,38.0,3.2,34.0,0.0,63.0,150.25,24.1,122.0,75.0,33.0,39.0,0.83,0.0
50%,195.5,203.0,90.0,46.0,4.2,44.5,0.0,66.0,173.0,27.8,136.0,82.0,37.0,42.0,0.88,0.0
75%,292.75,229.0,107.75,59.0,5.4,60.0,1.0,69.0,200.0,32.275,148.0,90.0,41.0,46.0,0.93,0.0
max,390.0,443.0,385.0,120.0,19.3,92.0,1.0,76.0,325.0,55.8,250.0,124.0,56.0,64.0,1.14,1.0


In [4]:
df.dtypes

patient_number       int64
cholesterol        float64
glucose            float64
hdl_chol           float64
chol_hdl_ratio     float64
age                float64
gender               int64
height             float64
weight             float64
bmi                float64
systolic_bp        float64
diastolic_bp       float64
waist              float64
hip                float64
waist_hip_ratio    float64
diabetes             int64
dtype: object

In [5]:
df["diabetes"]

0      0
1      0
2      0
3      0
4      0
      ..
385    0
386    1
387    0
388    1
389    0
Name: diabetes, Length: 390, dtype: int64

In [6]:
# Encode categorical variables


# Define features and target variable
X = df.drop(['patient_number', 'diabetes'], axis=1)
y = df['diabetes'].fillna(0)


In [7]:
def calculate_cross_val_accuracy(X, y, model, cv=5):
    accuracies = cross_val_score(model, X, y, cv=cv, scoring='accuracy')
    mean_accuracy = round(100 * accuracies.mean(), 2)
    return mean_accuracy


models = {'KNeighborsClassifier': KNeighborsClassifier(),
'LogisticRegression': LogisticRegression(max_iter=2500),
'DecisionTreeClassifier': DecisionTreeClassifier(),
'SVC': SVC(),
'RandomForestClassifier': RandomForestClassifier()}

for model_name, model in models.items():
    mean_accuracy = calculate_cross_val_accuracy(X, y, model, cv=5)
    print(f"The cross-validated accuracy of {model_name} is {mean_accuracy}.")

        


The cross-validated accuracy of KNeighborsClassifier is 92.82.
The cross-validated accuracy of LogisticRegression is 84.1.
The cross-validated accuracy of DecisionTreeClassifier is 78.21.
The cross-validated accuracy of SVC is 92.31.
The cross-validated accuracy of RandomForestClassifier is 78.97.


In [8]:
preprocessing_steps = [
    ('scaling', StandardScaler()),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
]

models = {
    'RandomForest': {
        'model': RandomForestClassifier(),
        'params': {
            'model__n_estimators': [100, 200, 300],
            'model__max_depth': [None, 5, 10]         
        }
    },
    'SVM': {
        'model': SVC(),
        'params': {
            'model__C': [0.1, 1, 10],                
            'model__kernel': ['linear', 'rbf']
        }
    }
}

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

for preprocess_name, preprocess_step in preprocessing_steps:
    for model_name, model_info in models.items():
        model = model_info['model']
        params = model_info['params']
        
        pipeline = Pipeline([
            (preprocess_name, preprocess_step),
            ('model', model)
        ])
        
        grid_search = GridSearchCV(pipeline, params, cv=5, scoring='accuracy')
        grid_search.fit(X_train, y_train)
        
        print(f"Best hyperparameters for {preprocess_name} + {model_name}:")
        print(grid_search.best_params_)
        
        cv_scores = cross_val_score(grid_search.best_estimator_, X_train, y_train, cv=5, scoring='accuracy')
        mean_cv_score = cv_scores.mean()
        print(f"Cross-validated accuracy: {mean_cv_score}")

        best_model = grid_search.best_estimator_
        best_model.fit(X_train, y_train)
        y_pred = best_model.predict(X_test)
        test_accuracy = accuracy_score(y_test, y_pred)
        print(f"Test accuracy: {test_accuracy}")

Best hyperparameters for scaling + RandomForest:
{'model__max_depth': 5, 'model__n_estimators': 300}
Cross-validated accuracy: 0.9133640552995391
Test accuracy: 0.8846153846153846
Best hyperparameters for scaling + SVM:
{'model__C': 10, 'model__kernel': 'linear'}
Cross-validated accuracy: 0.9230926779313876
Test accuracy: 0.9102564102564102


KeyboardInterrupt: 