<a href="https://colab.research.google.com/github/LivaIg/Diabetes-classification/blob/main/diabetes_classification_neural__networks.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Diabetes Prediction Challange with neural networks**
Task is to build a machine learning classifier that predicts whether a patient is diagnosed with diabetes or not, based on a set of health indicators.

The dataset includes 21 features describing lifestyle, demographic, and health-related factors (e.g., BMI, smoking status, physical activity, age, blood pressure, cholesterol levels, etc.).

In [2]:
#loading packages
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input, BatchNormalization, Dropout, Activation
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras import regularizers
import pandas as pd
import tensorflow as tf
from matplotlib import pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.preprocessing import StandardScaler


# Loding data sets

In [13]:
X_test = pd.read_csv('X_test.csv')
y_train_df = pd.read_csv('y_train.csv')
X_train_df = pd.read_csv('X_train.csv')

Droping ID column

In [None]:
if 'ID' in X_train_df.columns:
    X_train_df = X_train_df.drop('ID', axis=1)
if 'ID' in X_test.columns:
    X_test = X_test.drop('ID', axis=1)

print(X_train_df.head())
print(X_train_df.info())

# Prepare data and split into train-test data

In [15]:
y_train_mapped = y_train_df['Diabetes'].map({'Yes': 1, 'No': 0}).values

# Split training data into training and validation sets
X_train, x_val, y_train, y_val = train_test_split(
    X_train_df,
    y_train_mapped,
    test_size=0.1,
    random_state=42,
    stratify=y_train_mapped
)

# One hot encoder for gender column

In [None]:

enc = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# 1. Encode Gender for TRAINING set (X_train)
gender_encoded_train = enc.fit_transform(X_train[["Gender"]])
gender_encoded_train_df = pd.DataFrame(
    gender_encoded_train,
    columns=enc.get_feature_names_out(["Gender"]),
    index=X_train.index
)

# 2. Encode Gender for VALIDATION set (x_val)
gender_encoded_val = enc.transform(x_val[["Gender"]])
gender_encoded_val_df = pd.DataFrame(
    gender_encoded_val,
    columns=enc.get_feature_names_out(["Gender"]),
    index=x_val.index
)

# 3. Drop original Gender column and concatenate encoded columns
X_train = pd.concat([X_train.drop("Gender", axis=1), gender_encoded_train_df], axis=1)
x_val = pd.concat([x_val.drop("Gender", axis=1), gender_encoded_val_df], axis=1)


# Performing Scaling

In [None]:



# Identify all numerical columns (all columns remaining should be numerical/scaled features)
numerical_cols = X_train.columns

# Initialize and fit the scaler on the TRAINING data ONLY
scaler = StandardScaler()
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])

# Apply the same scaler (using .transform) to the validation data
x_val[numerical_cols] = scaler.transform(x_val[numerical_cols])

# Adjusting to the right format for TensorFlow
X_train = X_train.astype('float32')
x_val = x_val.astype('float32')
y_train = np.array(y_train).reshape(-1).astype('float32')
y_val = np.array(y_val).reshape(-1).astype('float32')



# Model testing with diverging parameters(layers, batch size, epochs, dropaout, normalization etc.)

In [22]:
early_stopping = EarlyStopping(
    min_delta=0.001, # minimium amount of change
    monitor='val_loss',
    patience=15, # how many epochs to wait before stopping
    restore_best_weights=True,
)


_epochs = 100
_batch_size = 16
_lr = 0.0001


# Build medium model
model_small = Sequential([
    Input(shape=(X_train.shape[1],)),
    Dense(64, kernel_regularizer=regularizers.l2(0.001)),
    BatchNormalization(),
    Activation('relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid')
])


model_small.compile(optimizer=Adam(learning_rate=_lr),
              loss='binary_crossentropy',
              metrics=['accuracy'])


history_small = model_small.fit(X_train, y_train,
                                  validation_data=(x_val, y_val),
                                  callbacks=[early_stopping],
                                  epochs=100, verbose=1)

Epoch 1/100
[1m1492/1492[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 4ms/step - accuracy: 0.6071 - loss: 0.7142 - val_accuracy: 0.7346 - val_loss: 0.5538
Epoch 2/100
[1m1492/1492[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.7276 - loss: 0.5718 - val_accuracy: 0.7452 - val_loss: 0.5394
Epoch 3/100
[1m1492/1492[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.7342 - loss: 0.5601 - val_accuracy: 0.7471 - val_loss: 0.5329
Epoch 4/100
[1m1492/1492[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 4ms/step - accuracy: 0.7423 - loss: 0.5480 - val_accuracy: 0.7522 - val_loss: 0.5285
Epoch 5/100
[1m1492/1492[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.7416 - loss: 0.5438 - val_accuracy: 0.7516 - val_loss: 0.5254
Epoch 6/100
[1m1492/1492[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.7427 - loss: 0.5420 - val_accuracy: 0.7524 - val_loss: 0.5226
Epoch 7/10