In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from torch.utils.data import DataLoader
import tqdm as notebook_tqdm
import matplotlib.pyplot as plt
import numpy as np
import kagglehub


# Dataset

In [2]:
path = kagglehub.dataset_download("iammustafatz/diabetes-prediction-dataset")
diabetes_df = pd.read_csv(path + '/diabetes_prediction_dataset.csv')
diabetes_df.head()


Downloading from https://www.kaggle.com/api/v1/datasets/download/iammustafatz/diabetes-prediction-dataset?dataset_version_number=1...


100%|██████████| 734k/734k [00:00<00:00, 57.5MB/s]

Extracting files...





Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


# OneHot Encoding & Drop Nan

In [9]:
diabetes_df['gender'] = diabetes_df['gender'].replace({'Male':1, 'Female':0, 'Other': np.nan})
diabetes_df['smoking_history'] = diabetes_df['smoking_history'].replace(
    {'never': 0, 'current': 1, 'former': 2, 'ever': 3, 'not current': 4, 'No Info': np.nan}
)

diabetes_df = diabetes_df.dropna()
diabetes_df.head()

  diabetes_df['gender'] = diabetes_df['gender'].replace({'Male':1, 'Female':0, 'Other': np.nan})
  diabetes_df['smoking_history'] = diabetes_df['smoking_history'].replace(


Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,0.0,80.0,0,1,0.0,25.19,6.6,140,0
2,1.0,28.0,0,0,0.0,27.32,5.7,158,0
3,0.0,36.0,0,0,1.0,23.45,5.0,155,0
4,1.0,76.0,1,1,1.0,20.14,4.8,155,0
5,0.0,20.0,0,0,0.0,27.32,6.6,85,0


In [10]:
diabetes_df.describe(include='all').T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
gender,64172.0,0.394565,0.488761,0.0,0.0,0.0,1.0,1.0
age,64172.0,46.547268,19.539695,0.16,31.0,47.0,61.0,80.0
hypertension,64172.0,0.097909,0.297194,0.0,0.0,0.0,0.0,1.0
heart_disease,64172.0,0.047045,0.211738,0.0,0.0,0.0,0.0,1.0
smoking_history,64172.0,1.024668,1.35815,0.0,0.0,0.0,2.0,4.0
bmi,64172.0,28.424262,6.515975,10.08,24.6,27.32,31.1,91.82
HbA1c_level,64172.0,5.564279,1.095535,3.5,4.8,5.8,6.2,9.0
blood_glucose_level,64172.0,139.629792,42.166693,80.0,100.0,140.0,159.0,300.0
diabetes,64172.0,0.109799,0.312641,0.0,0.0,0.0,0.0,1.0


# Split Train-Valid-Test

In [None]:
X = diabetes_df.drop(["diabetes"], axis=1)
y = diabetes_df["diabetes"]

sc_X = StandardScaler()
X = pd.DataFrame(sc_X.fit_transform(X), columns=X.columns)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42, shuffle=True)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.10, random_state=42, shuffle=True)

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [None]:
def create_dataloader(x_set, y_set, batch_size):
  X = torch.tensor(x_set.values, dtype=torch.float32).to(device)
  y = torch.tensor(y_set.to_numpy(), dtype=torch.float32).reshape(-1, 1).to(device)

  return DataLoader(list(zip(X, y)), shuffle=True, batch_size=batch_size)

train_loader = create_dataloader(X_train, y_train, 64)
valid_loader = create_dataloader(X_valid, y_valid, len(X_valid))
test_loader = create_dataloader(X_test, y_test, len(X_test))

## **Multulayer Perceptron**

In [None]:
class MultiLayerNet1(torch.nn.Module):
    def __init__(self, input_size, frst_lyr, scnd_lyr, output_size):
        super(MultiLayerNet1, self).__init__()
        self.input = torch.nn.Linear(input_size, frst_lyr)
        self.hidden_layer = torch.nn.Linear(frst_lyr, scnd_lyr)
        self.hidden_layer_2 = torch.nn.Linear(scnd_lyr, scnd_lyr)

        self.output_layer = torch.nn.Linear(scnd_lyr, output_size)

    def forward(self, x):
        frst_hidden_output = torch.sigmoid(self.input(x))
        scnd_hidden_output = torch.sigmoid(self.hidden_layer(frst_hidden_output))

        y_pred = torch.sigmoid(self.output_layer(scnd_hidden_output))

        return y_pred

In [None]:
import torch
import torch.nn as nn

class MultiLayerNet2(nn.Module):
    def __init__(self, input_size, frst_lyr, scnd_lyr, third_lyr, output_size, dropout_rate=0.7):
        super(MultiLayerNet2, self).__init__()

        self.first_hidden_layer = nn.Linear(input_size, frst_lyr)
        self.dropout2 = nn.Dropout(0.2)

        self.second_hidden_layer = nn.Linear(frst_lyr, scnd_lyr)
        self.dropout3 = nn.Dropout(0.3)

        self.third_hidden_layer = nn.Linear(scnd_lyr, third_lyr)

        self.output_layer = nn.Linear(third_lyr, output_size)

    def forward(self, x):
        frst_hidden_output = torch.relu(self.first_hidden_layer(x))
        frst_hidden_output = self.dropout2(frst_hidden_output)

        scnd_hidden_output = torch.relu(self.second_hidden_layer(frst_hidden_output))
        scnd_hidden_output = self.dropout3(scnd_hidden_output)

        third_hidden_output = torch.relu(self.third_hidden_layer(scnd_hidden_output))

        y_pred = torch.sigmoid(self.output_layer(third_hidden_output))

        return y_pred


In [None]:
learningRate = 0.01
epochs = 1500

model = MultiLayerNet2(8, 128, 64, 16, 1).to(device)

criterion = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learningRate)

# Training

In [None]:
training_loss = []
eval_loss = []

for epoch in range(epochs):
    model.train()
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
    training_loss.append(loss.item())

    if epoch % 100 == 0:
      print('epoch {}, train loss {}'.format(epoch, loss.item()))

    # Validation
    model.eval()
    with torch.no_grad():  # Disable gradient computation during evaluation
        for x__val_batch, y__val_batch in valid_loader:
            val_outputs = model(x__val_batch)
            val_loss = criterion(val_outputs, y__val_batch)

        if epoch % 100 == 0:
          acc = (val_outputs.round() == y__val_batch).float().mean()
          print("Model accuracy: %.2f%%" % (float(acc)*100))
          print('epoch {}, val_loss {}\n'.format(epoch, val_loss.item()))

        eval_loss.append(val_loss.item())


epoch 0, train loss 0.6931471824645996
Model accuracy: 89.25%
epoch 0, val_loss 0.6931471824645996

epoch 100, train loss 0.6931471824645996
Model accuracy: 89.25%
epoch 100, val_loss 0.6931471824645996

epoch 200, train loss 0.6931471824645996
Model accuracy: 89.25%
epoch 200, val_loss 0.6931471824645996



KeyboardInterrupt: 

In [None]:
plt.figure(figsize=(10, 5))
plt.subplot(1, 2, 1)
plt.plot(training_loss, linestyle='-', color='b', label='Training Loss')
plt.plot(eval_loss, linestyle=':', color='r', label='Validation Loss')
# plt.plot(acc_list, linestyle='-', color='g', label='Accuracy')

plt.xlabel('Epochs')
plt.ylabel('Cross Entropy Loss')
plt.title('Training Progress')

plt.grid(True)

# Show legend
plt.legend()

In [None]:
test_loader = create_dataloader(X_test, y_test, len(X_test))

model.eval()
with torch.no_grad():  # Disable gradient computation during evaluation
    for x__test_batch, y__test_batch in test_loader:
        test_outputs = model(x__test_batch)
        test_loss = criterion(test_outputs, y__test_batch)

        acc = (test_outputs.round() == y__test_batch).float().mean()
        print("Model accuracy: %.2f%%" % (float(acc)*100))
        print('epoch {}, test_loss {}\n'.format(epoch, test_loss.item()))

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neural_network import MLPClassifier
from scipy.stats import uniform

# Define MLP and hyperparameter distribution
mlp = MLPClassifier(max_iter=1000, random_state=42)
param_distributions = {
    'hidden_layer_sizes': [
        (128, 64, 16),
        (128, 64, 32), (64, 128, 32)
        ],
    'activation': ['relu', 'tanh', 'logistic'],
    'solver': ['adam', 'sgd'],
    'alpha': uniform(0.0001, 0.01),
    'learning_rate': ['adaptive']
}

# Perform Randomized Search
random_search = RandomizedSearchCV(mlp, param_distributions, n_iter=20, cv=2, scoring='accuracy',
                                    verbose=2, n_jobs=3, random_state=42)
random_search.fit(X_train, y_train)

print("Best Parameters:", random_search.best_params_)
print("Best Score:", random_search.best_score_)

# evaluate_model(best_mlp, X_test, y_test)

Fitting 2 folds for each of 20 candidates, totalling 40 fits
Best Parameters: {'activation': 'logistic', 'alpha': 0.003924619912671627, 'hidden_layer_sizes': (128, 64, 16), 'learning_rate': 'adaptive', 'solver': 'adam'}
Best Score: 0.9616760937319635
