In [44]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [24]:
RANDOM_STATE = 42

In [19]:
data = pd.read_csv("./diabetes_prediction_dataset.csv")

In [20]:
data.describe()

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,41.885856,0.07485,0.03942,27.320767,5.527507,138.05806,0.085
std,22.51684,0.26315,0.194593,6.636783,1.070672,40.708136,0.278883
min,0.08,0.0,0.0,10.01,3.5,80.0,0.0
25%,24.0,0.0,0.0,23.63,4.8,100.0,0.0
50%,43.0,0.0,0.0,27.32,5.8,140.0,0.0
75%,60.0,0.0,0.0,29.58,6.2,159.0,0.0
max,80.0,1.0,1.0,95.69,9.0,300.0,1.0


In [22]:
print("Missing values by column:")
data.isna().sum()

Missing values by column:


gender                 0
age                    0
hypertension           0
heart_disease          0
smoking_history        0
bmi                    0
HbA1c_level            0
blood_glucose_level    0
diabetes               0
dtype: int64

An example sample

In [31]:
data.loc[0]

gender                 Female
age                      80.0
hypertension                0
heart_disease               1
smoking_history         never
bmi                     25.19
HbA1c_level               6.6
blood_glucose_level       140
diabetes                    0
Name: 0, dtype: object

There are two categorical features (gender & smoking_history) that need to be discretized

In [36]:
categorical_cols = ["gender", "smoking_history"]
for col in categorical_cols:
    unique_vals = data[col].unique()
    print(f"Possible values for {col}: ", unique_vals)
    print(f"Count: {len(unique_vals)}")

    # Convert to numerical values
    mapping = dict(zip(unique_vals, range(len(unique_vals))))
    data[col] = data[col].map(mapping)

Possible values for gender:  [0 1 2]
Count: 3
Possible values for smoking_history:  [0 1 2 3 4 5]
Count: 6


In [37]:
labels = data["diabetes"]
features = data.drop(columns="diabetes")

In [38]:
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.1, random_state=RANDOM_STATE)

In [39]:
minmax = MinMaxScaler() # Scale values between 0 and 1 (default)
standardize = StandardScaler() # Remove mean and scale to variance of 1

X_train = pd.DataFrame(standardize.fit_transform(X_train), columns=X_train.columns)
X_test = pd.DataFrame(standardize.fit_transform(X_test), columns=X_test.columns)

X_train = pd.DataFrame(minmax.fit_transform(X_train), columns=X_train.columns)
X_test = pd.DataFrame(minmax.fit_transform(X_test), columns=X_test.columns)

A random forest classifier

In [42]:
forest = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)
forest.fit(X_train, y_train)

In [47]:
predictions = forest.predict(X_test)
print("Accuracy: ", accuracy_score(y_test, predictions))

Accuracy:  0.9706
