In [6]:
# import dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

In [7]:
!pip install watermark

[0m

In [8]:
# use watermark to keep track of package versions
%load_ext watermark
%watermark -v -p pandas,numpy,matplotlib,torch

Python implementation: CPython
Python version       : 3.9.16
IPython version      : 8.5.0

pandas    : 1.5.0
numpy     : 1.23.4
matplotlib: 3.6.1
torch     : 1.12.1+cu116



In [9]:
# load data
df = pd.read_csv("data_banknote_authentication.txt", header=None)
df.head()

Unnamed: 0,0,1,2,3,4
0,3.6216,8.6661,-2.8073,-0.44699,0
1,4.5459,8.1674,-2.4586,-1.4621,0
2,3.866,-2.6383,1.9242,0.10645,0
3,3.4566,9.5228,-4.0112,-3.5944,0
4,0.32924,-4.4552,4.5718,-0.9888,0


In [10]:
# split data into features and labels
X_features = df[[0,1,2,3]].values
y_labels = df[4].values

In [11]:
# check shape of features and labels
print(X_features.shape)

(1372, 4)


In [12]:
# check label distribution
np.bincount(y_labels)

array([762, 610])

### Define the dataloader

In [13]:
# define dataset class
class MyDataset(Dataset):
    def __init__(self, X, y):

        # define features and labels
        self.features = torch.tensor(X, dtype = torch.float32)
        self.labels = torch.tensor(y, dtype = torch.float32)

    def __getitem__(self, index):
        x = self.features[index]
        y = self.labels[index]
        return x, y
    
    def __len__(self):
        return self.labels.shape[0]

In [14]:
# train size
train_size = int(0.8 * len(df))
train_size

1097

In [15]:
# validation size
val_size = X_features.shape[0] - train_size
val_size

275

In [16]:
# use torch.utils to generate dataloader
dataset = MyDataset(X_features, y_labels)

# split dataset into train and validation
train_set, val_set = torch.utils.data.random_split(dataset, [train_size, val_size])

In [17]:
# make train dataloader
train_loader = DataLoader(dataset = train_set,
                          batch_size = 10,
                          shuffle = True,
                          )

# make validation dataloader
val_loader = DataLoader(dataset = val_set,
                        batch_size = 10,
                        shuffle = False,)

### We will create the standardization process here

In [27]:
# create standardization


train_mean = torch.zeros(X_features.shape[1])

for x, y in train_loader:
    train_mean += x.sum(dim=0)
    
train_mean /= len(train_set)

train_std = torch.zeros(X_features.shape[1])
for x, y in train_loader:
    train_std += ((x - train_mean)**2).sum(dim=0)

train_std = torch.sqrt(train_std / (len(train_set)-1))



train_std /= len(train_set)

In [28]:
# print mean and std
print(f"Feature means: {train_mean}")
print(f"Feature stds: {train_std}")

Feature means: tensor([ 0.4858,  2.0245,  1.3277, -1.1991])
Feature stds: tensor([0.0026, 0.0054, 0.0039, 0.0019])


In [29]:
# for smaller datasets
all_x = []
for x, y in train_loader:
    all_x.append(x)
    
train_std = torch.concat(all_x).std(dim=0)
train_mean = torch.concat(all_x).mean(dim=0)

print(train_std)
print(train_mean)

tensor([2.8454, 5.8915, 4.2544, 2.1216])
tensor([ 0.4858,  2.0245,  1.3277, -1.1991])


In [30]:
# standardization function
def standardize(x, mean, std):
    return (x - mean) / std

In [21]:
# implement model
class LogisticRegression(nn.Module):

    def __init__(self, num_features):
        super().__init__()
        self.linear = nn.Linear(num_features, 1)

    def forward(self, x):
        logits = self.linear(x)
        preds = torch.sigmoid(logits)
        return preds

In [22]:
# model agnostic code
device = "cuda" if torch.cuda.is_available() else 'cpu'

In [23]:
# define model
model = LogisticRegression(num_features = 4).to(device)

# define optimizer
optimizer = optim.SGD(params = model.parameters(), lr = 0.1)

In [31]:
# define training loop
epochs = 10

for epoch in range(epochs):
    model.train()

    for batch_idx, (features, class_labels) in enumerate(train_loader):

        # standardize features
        features = standardize(features, train_mean, train_std)
        # make predictions
        preds = model(features.to(device))

        # calculate loss
        loss = F.binary_cross_entropy(preds, class_labels.view(preds.shape).to(device))

        # zero gradients
        optimizer.zero_grad()

        # backprop
        loss.backward()

        # optimizer step (gradient descent)
        optimizer.step()

        ### LOGGING
        if not batch_idx % 20: # every 20 batches
            print(f"Epoch: {epoch+1:03d}/{epochs:03d} | "
                  f"Batch {batch_idx:03d}/{len(train_loader):03d} | "
                  f"Loss: {loss:.2f}")

Epoch: 001/010 | Batch 000/110 | Loss: 0.63
Epoch: 001/010 | Batch 020/110 | Loss: 0.54
Epoch: 001/010 | Batch 040/110 | Loss: 0.38
Epoch: 001/010 | Batch 060/110 | Loss: 0.14
Epoch: 001/010 | Batch 080/110 | Loss: 0.41
Epoch: 001/010 | Batch 100/110 | Loss: 0.11
Epoch: 002/010 | Batch 000/110 | Loss: 0.11
Epoch: 002/010 | Batch 020/110 | Loss: 0.18
Epoch: 002/010 | Batch 040/110 | Loss: 0.06
Epoch: 002/010 | Batch 060/110 | Loss: 0.10
Epoch: 002/010 | Batch 080/110 | Loss: 0.13
Epoch: 002/010 | Batch 100/110 | Loss: 0.13
Epoch: 003/010 | Batch 000/110 | Loss: 0.08
Epoch: 003/010 | Batch 020/110 | Loss: 0.08
Epoch: 003/010 | Batch 040/110 | Loss: 0.06
Epoch: 003/010 | Batch 060/110 | Loss: 0.11
Epoch: 003/010 | Batch 080/110 | Loss: 0.12
Epoch: 003/010 | Batch 100/110 | Loss: 0.07
Epoch: 004/010 | Batch 000/110 | Loss: 0.04
Epoch: 004/010 | Batch 020/110 | Loss: 0.04
Epoch: 004/010 | Batch 040/110 | Loss: 0.08
Epoch: 004/010 | Batch 060/110 | Loss: 0.16
Epoch: 004/010 | Batch 080/110 |

### Check Accuaracy

In [34]:
# create function to compute accuracy
def compute_accuracy(model, dataloader):
    model.eval()

    correct = 0.0
    num_examples = 0

    for idx, (features, class_labels) in enumerate(dataloader):
        # standardize features
        features = standardize(features, train_mean, train_std)
        # make predictions
        with torch.inference_mode():
            pred = model(features.to(device))

        # calculate accuracy
        preds = torch.where(pred > 0.5, 1, 0)
        lab = class_labels.view(preds.shape).to(preds.dtype).to(device)
        compare = lab == preds
        correct += torch.sum(compare)
        num_examples += len(compare)
    
    return correct / num_examples

In [35]:
# compute accuracy on train and validation
train_acc = compute_accuracy(model, train_loader)
print(f"Train Accuracy: {train_acc*100:.2f}%")

train_val = compute_accuracy(model, val_loader)
print(f"Val Accuray {train_val * 100:2f}%")

Train Accuracy: 97.99%
Val Accuray 98.181824%
