<a href="https://colab.research.google.com/github/MdSaifulIslamSajol/GDdemo/blob/main/%F0%9F%92%8A_Diabetes_Prediction_with_DNN_(PyTorch)_%F0%9F%A9%BA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested per read while streaming each download.
CHUNK_SIZE = 40960
# Comma-separated "<directory>:<percent-encoded download URL>" entries; the
# import loop splits each entry on ':' and URL-decodes the second part.
DATA_SOURCE_MAPPING = 'diabetes-health-indicators-dataset:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F1703281%2F2789260%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240409%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240409T155716Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3Dc391096f064c4a6ac9c5843191792f87584b8984d2d46f1d39649fc35cc4823f81d3b9447ab0883e6648a0b40d888e7d41c6b41a9a9f952dbc1ebbee3df760b0c8c9d746b789bd75cc15ada2ff99dfa9f591d33600ecc62fdda63ac8cf7e58f3e632a990d26737d1a2f06a340c1b9aef8273231fd2bb746950b333b1773698f5783dc966d637f821acda7a71af5f01cc38e95034fe96a1d0466041edf11836080eb92f2f4cacaeefaa64f861a28bdf4dcf24e3275d41e737c89c01b478f2c85f07c4ffc68fab6b37f3a8572af55056fc42f1aef19d45702ecd9ff157356897b4ee688553fcb6656a314e4f2667ba4f99cbee4fc3ef096a66c500625d7124146d'

# Directories created for the downloaded data and for notebook output.
KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
# NOTE(review): not referenced anywhere in the visible code; confirm it is still needed.
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every configured data source into KAGGLE_INPUT_PATH/<directory> and
# unpack it there. A failed download is reported and skipped so the remaining
# sources are still attempted.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Each entry is "<directory>:<percent-encoded URL>"; split once so only the
    # first ':' is treated as the separator.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            # Content-Length is optional; without it the progress bar cannot
            # be scaled, so only the running byte count is shown.
            total_length = fileres.headers['content-length']
            total_bytes = int(total_length) if total_length else 0
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            for data in iter(lambda: fileres.read(CHUNK_SIZE), b''):
                dl += len(data)
                tfile.write(data)
                if total_bytes > 0:
                    done = min(50, int(50 * dl / total_bytes))
                    sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                else:
                    sys.stdout.write(f"\r{dl} bytes downloaded")
                sys.stdout.flush()
            # Rewind so the archive readers see the whole payload through the
            # already-open handle (no reopen by name, no unflushed buffer).
            tfile.seek(0)
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name: reusing `tarfile` here
                # would replace the module and break later iterations.
                with tarfile.open(fileobj=tfile) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


Let's predict diabetes using the ```'diabetes_binary_5050split_health_indicators_BRFSS2015.csv'``` file.

In [None]:
import numpy as np
import pandas as pd

import torch
import torch.nn as nn

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt

In [None]:
# Seed PyTorch for repeatable runs and pick the compute device, preferring a
# CUDA GPU when one is available.
torch.manual_seed(0)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
if device == 'cuda':
    # Seed every visible GPU as well.
    torch.cuda.manual_seed_all(0)

In [None]:
# Load the full dataset (the `5050split` BRFSS 2015 health-indicators CSV)
# through the ../input path.
df_train = pd.read_csv('../input/diabetes-health-indicators-dataset/diabetes_binary_5050split_health_indicators_BRFSS2015.csv')

In [None]:
# Show the table dimensions as (rows, columns).
df_train.shape

In [None]:
# Preview the first rows to see the column names and typical values.
df_train.head()

In [None]:
# Count how many rows fall into each value of the target column.
df_train['Diabetes_binary'].value_counts()

In [None]:
# Print the per-column dtype and non-null count summary.
df_train.info()

As ```df_train.head()``` and ```df_train.info()``` show, it would be better to convert the values to **int**.

In [None]:
# Cast every column to an integer dtype.
# NOTE(review): astype(int) drops any fractional part; this assumes all columns
# hold whole numbers, so confirm against the head()/info() output.
df_train = df_train.astype(int)

# Data Preparation

## Dataset Split

Train set + Validation set + Test set

In [None]:
# Separate the label column (y) from the remaining feature columns (X)
y = df_train['Diabetes_binary']
X = df_train.drop(columns='Diabetes_binary')

In [None]:
# Use 10 % of total data as Test set and the rest as (Train + Validation) set.
# random_state pins the shuffle so the same rows land in each split on every
# run; without it the split changes per run even though torch is seeded.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.1, random_state=0)

# Use 20 % of (Train + Validation) set as Validation set
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.2, random_state=0)

## Scaling

In [None]:
# Learn the per-feature min/max from the training split only, then apply that
# same mapping to both the training and the validation features.
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)

## Convert into Tensor

In [None]:
# Features become float tensors on the chosen device
X_train, X_val = (torch.FloatTensor(features).to(device) for features in (X_train, X_val))

# Labels become integer (long) class-index tensors on the same device
y_train, y_val = (torch.LongTensor(labels.values).to(device) for labels in (y_train, y_val))

# Model

In [None]:
# Hyperparameter
learning_rate = 1e-1  # step size passed to the SGD optimizer
n_epochs = 500        # number of passes made by each training loop
drop_prob = 0.3       # dropout probability used inside Net

In [None]:
# Check the training tensor's dimensions; the second one must equal the input
# width of the network's first layer.
X_train.shape

In [None]:
# Model
class Net(nn.Module):
    """Fully connected classifier: four hidden ReLU + dropout layers and a linear head.

    Args:
        in_features: Number of input features per sample. Defaults to 21, the
            width this notebook uses.
        n_classes: Number of output logits. Defaults to 2.
        p: Dropout probability applied after every hidden layer. When None,
            the notebook-level ``drop_prob`` hyperparameter is used.

    Calling ``Net()`` with no arguments builds the original architecture.
    """

    def __init__(self, in_features=21, n_classes=2, p=None):
        super().__init__()

        if p is None:
            # Fall back to the notebook-level hyperparameter.
            p = drop_prob

        # Attribute names fc1..fc5 are kept so saved state_dict keys still match.
        self.fc1 = nn.Linear(in_features, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, 256)
        self.fc4 = nn.Linear(256, 64)
        self.fc5 = nn.Linear(64, n_classes)

        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=p)

        # Xavier-uniform initialisation for every linear layer's weights;
        # biases keep the nn.Linear default.
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)

    def forward(self, x):
        """Return raw class logits of shape (batch, n_classes) for input ``x``."""
        out = x
        for hidden in (self.fc1, self.fc2, self.fc3, self.fc4):
            out = self.dropout(self.relu(hidden(out)))
        # No activation on the head: CrossEntropyLoss expects unnormalised logits.
        return self.fc5(out)

In [None]:
# Build the network and move its parameters to the selected device
model = Net()
model = model.to(device)

# Loss function and Optimizer: cross-entropy on the logits, SGD with momentum
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), momentum=0.9, lr=learning_rate)

# Train

Train with Validation

In [None]:
# Per-epoch loss histories, plotted afterwards
train_loss, val_loss = [], []
log_template = 'Epoch {:4d} / {}, Cost : {:.4f}, Acc : {:.2f} %, Val Cost : {:.4f}, Val Acc : {:.2f} %'

for epoch in range(1, n_epochs+1):
    # Full-batch forward pass in training mode (dropout active)
    model.train()
    logits = model(X_train)
    loss = loss_fn(logits, y_train)

    epoch_cost = loss.item()
    train_loss.append(epoch_cost)
    acc = (logits.argmax(dim=1) == y_train).float().mean().item()

    # Single gradient step on the whole training split
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Score the updated model on the validation split (dropout off, no grads)
    model.eval()
    with torch.no_grad():
        val_logits = model(X_val)
        val_cost = loss_fn(val_logits, y_val).item()
        acc_val = (val_logits.argmax(dim=1) == y_val).float().mean().item()
    val_loss.append(val_cost)

    # Report every 50th epoch only
    if epoch % 50 != 0:
        continue
    print(log_template.format(
        epoch, n_epochs, epoch_cost, acc*100, val_cost, acc_val*100))

Let's visualize the Train loss and Validation loss.

In [None]:
# Draw both loss histories on one figure, one line per split
plt.figure(figsize=(12, 6))
for curve, curve_label in ((train_loss, 'Train'), (val_loss, 'Validation')):
    plt.plot(curve, label=curve_label)

plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()

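Validation loss is lower than Train loss because the Dropout layers are active while the Train loss is computed (`model.train()`) but disabled during validation (`model.eval()`).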

# Predict Test set

Use the whole training set (Train + Validation from above).

In [None]:
# Scaling: fit a fresh scaler on the combined (Train + Validation) features,
# then apply that same mapping to both those features and the test features.
scaler = MinMaxScaler().fit(X_train_val)
X_train_val = scaler.transform(X_train_val)
X_test = scaler.transform(X_test)

In [None]:
# To Tensor: features as float tensors on the chosen device
X_train_val, X_test = (torch.FloatTensor(features).to(device) for features in (X_train_val, X_test))

# Labels as integer (long) class-index tensors on the same device
y_train_val, y_test = (torch.LongTensor(labels.values).to(device) for labels in (y_train_val, y_test))

In [None]:
# Start over with a freshly initialised network on the selected device
model = Net()
model = model.to(device)

# Loss function and Optimizer (same settings as the validation run)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), momentum=0.9, lr=learning_rate)

In [None]:
# Train the final model on the whole (Train + Validation) set.
# X_train_val / y_train_val are the tensors scaled by the same scaler that was
# applied to X_test, so the features seen in training and at test time share
# one scaling. Training on X_train here would reuse data scaled by the
# earlier, train-only scaler and leave the validation rows out.
for epoch in range(1, n_epochs+1):
    model.train()
    H = model(X_train_val)
    loss = loss_fn(H, y_train_val)

    # Single full-batch gradient step
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Accuracy of the predictions made before this step's weight update
    acc = (torch.argmax(H, dim=1) == y_train_val).float().mean().item()

    # Report every 50th epoch only
    if epoch % 50 == 0:
        print('Epoch {:4d} / {}, Cost : {:.4f}, Acc : {:.2f} %'.format(
            epoch, n_epochs, loss.item(), acc*100))

In [None]:
# Predict Test set
# eval() switches the dropout layers off; no_grad() skips gradient tracking
# because these outputs are only used for scoring.
model.eval()
with torch.no_grad():
    pred = model(X_test)

Let's check Test loss and Test accuracy.

In [None]:
# Score the test predictions. .item() turns the 0-dim loss tensor into a plain
# float, matching how losses are reported in the training loops.
test_loss = loss_fn(pred, y_test).item()
test_acc = (torch.argmax(pred, dim=1) == y_test).float().mean().item()

print('Test Loss : {:.4f}'.format(test_loss))
# Label spelling corrected from 'Accuacy'.
print('Test Accuracy : {:.2f} %'.format(test_acc*100))

### Please **Upvote** if you like my notebook!
### Thank you!