In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader, random_split
from torchvision.transforms import Normalize

from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

import random

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv("Data/water_potability.csv")
df = df.dropna()

X = torch.tensor(df[['ph', 'Sulfate', 'Conductivity', 'Organic_carbon']].values).float()
y = torch.tensor(df['Potability'].values).float()


X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

X_train = torch.tensor(X_train, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.float32).reshape(-1, 1)
X_val = torch.tensor(X_val, dtype=torch.float32)
y_val = torch.tensor(y_val, dtype=torch.float32).reshape(-1, 1)

  X_train = torch.tensor(X_train, dtype=torch.float32)
  y_train = torch.tensor(y_train, dtype=torch.float32).reshape(-1, 1)
  X_val = torch.tensor(X_val, dtype=torch.float32)
  y_val = torch.tensor(y_val, dtype=torch.float32).reshape(-1, 1)


Loading the data retrieved from Kaggle

In [3]:
# Define the model
model = nn.Sequential(
    nn.Linear(4, 12),
    nn.ReLU(),
    nn.Linear(12, 8),
    nn.ReLU(),
    nn.Linear(8, 1),
    nn.Sigmoid()
)
print(model)


criterion = nn.BCELoss()  # binary cross-entropy
optimizer = optim.Adam(model.parameters(), lr=0.001)




Sequential(
  (0): Linear(in_features=4, out_features=12, bias=True)
  (1): ReLU()
  (2): Linear(in_features=12, out_features=8, bias=True)
  (3): ReLU()
  (4): Linear(in_features=8, out_features=1, bias=True)
  (5): Sigmoid()
)


### Training and Evaluation Loop

Looking at loss function and accuracy

In [4]:
n_epochs = 100
batch_size = 10

for epoch in range(n_epochs):
    # Training loop
    for i in range(0, len(X_train), batch_size):
        Xbatch = X_train[i:i + batch_size]
        y_pred = model(Xbatch)
        ybatch = y_train[i:i + batch_size]
        loss = criterion(y_pred, ybatch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    # Validation loop
    model.eval()
    with torch.no_grad():
        y_val_pred = model(X_val)
        val_loss = criterion(y_val_pred, y_val)
    
    # Print training and validation loss for each epoch
    print(f'Epoch {epoch + 1}/{n_epochs}, Training Loss: {loss.item()}, Validation Loss: {val_loss.item()}')

# Compute accuracy 
with torch.no_grad():
    y_pred = model(X_train)
accuracy = (y_pred.round() == y_train).float().mean()
print(f"Training Accuracy {accuracy}")

with torch.no_grad():
    y_val_pred = model(X_val)
accuracy_val = (y_val_pred.round() == y_val).float().mean()
print(f"Validation Accuracy {accuracy_val}")


Epoch 1/100, Training Loss: 1.0866477489471436, Validation Loss: 0.8872309327125549
Epoch 2/100, Training Loss: 0.6821507215499878, Validation Loss: 0.6924028992652893
Epoch 3/100, Training Loss: 0.6694460511207581, Validation Loss: 0.6896106600761414
Epoch 4/100, Training Loss: 0.6574300527572632, Validation Loss: 0.6872086524963379
Epoch 5/100, Training Loss: 0.64174485206604, Validation Loss: 0.6854843497276306
Epoch 6/100, Training Loss: 0.6270387768745422, Validation Loss: 0.683859646320343
Epoch 7/100, Training Loss: 0.6111711859703064, Validation Loss: 0.6829855442047119
Epoch 8/100, Training Loss: 0.5968443751335144, Validation Loss: 0.6822667121887207
Epoch 9/100, Training Loss: 0.5842128992080688, Validation Loss: 0.6815661787986755
Epoch 10/100, Training Loss: 0.5753598809242249, Validation Loss: 0.6815177798271179
Epoch 11/100, Training Loss: 0.5686315894126892, Validation Loss: 0.6819702386856079
Epoch 12/100, Training Loss: 0.5660653114318848, Validation Loss: 0.682112872

### Overfitting

To reduce overfitting 
    - Reduce model size
    - Using weights decay
    - Obtain new data or augmentation