## Import all the necessary dependencies.

In [59]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

## Set the relevant hyper-parameters

In [60]:
dataset_path = "data\processed\processed.cleveland.csv"
train_test_ratio = 0.9  # Ratio of training examples to test examples
batch_size = 128
epochs = 275
learning_rate = 1.0e-4

## Data Ingestion Check

In [61]:
file = pd.read_csv(dataset_path)

X_shape, y_shape = file.shape
print("File Shape: (" + str(X_shape) + ", " + str(y_shape) + ") \n")

X = file.iloc[:, :y_shape - 1]
y = file.iloc[:, y_shape - 1]

print("X:\n")
print(X)
print("\ny:\n")
print(y)

File Shape: (302, 14) 

X:

     63.0  1.0  1.0.1  145.0  233.0  1.0.2  2.0  150.0  0.0  2.3  3.0  0.0.1  \
0    67.0  1.0    4.0  160.0  286.0    0.0  2.0  108.0  1.0  1.5  2.0    3.0   
1    67.0  1.0    4.0  120.0  229.0    0.0  2.0  129.0  1.0  2.6  2.0    2.0   
2    37.0  1.0    3.0  130.0  250.0    0.0  0.0  187.0  0.0  3.5  3.0    0.0   
3    41.0  0.0    2.0  130.0  204.0    0.0  2.0  172.0  0.0  1.4  1.0    0.0   
4    56.0  1.0    2.0  120.0  236.0    0.0  0.0  178.0  0.0  0.8  1.0    0.0   
..    ...  ...    ...    ...    ...    ...  ...    ...  ...  ...  ...    ...   
297  45.0  1.0    1.0  110.0  264.0    0.0  0.0  132.0  0.0  1.2  2.0    0.0   
298  68.0  1.0    4.0  144.0  193.0    1.0  0.0  141.0  0.0  3.4  2.0    2.0   
299  57.0  1.0    4.0  130.0  131.0    0.0  0.0  115.0  1.0  1.2  2.0    1.0   
300  57.0  0.0    2.0  130.0  236.0    0.0  2.0  174.0  0.0  0.0  2.0    1.0   
301  38.0  1.0    3.0  138.0  175.0    0.0  0.0  173.0  0.0  0.0  1.0    0.0   

     6.0  


__Note:__ The given datasets are contained in one file, so we will split into training and validation examples with the defined ratio.

In [62]:
split = int(np.floor(y.shape[0] * train_test_ratio))

X_train = X.iloc[:split, :y_shape - 1]
X_test = X.iloc[split:, :y_shape - 1]
print("\nX_train")
print(X_train)
print("\nX_test")
print(X_test)

y_train = y.iloc[:split]
y_test = y.iloc[split:]
print("\ny_train")
print(y_train)
print("\ny_test")
print(y_test)


X_train
     63.0  1.0  1.0.1  145.0  233.0  1.0.2  2.0  150.0  0.0  2.3  3.0  0.0.1  \
0    67.0  1.0    4.0  160.0  286.0    0.0  2.0  108.0  1.0  1.5  2.0    3.0   
1    67.0  1.0    4.0  120.0  229.0    0.0  2.0  129.0  1.0  2.6  2.0    2.0   
2    37.0  1.0    3.0  130.0  250.0    0.0  0.0  187.0  0.0  3.5  3.0    0.0   
3    41.0  0.0    2.0  130.0  204.0    0.0  2.0  172.0  0.0  1.4  1.0    0.0   
4    56.0  1.0    2.0  120.0  236.0    0.0  0.0  178.0  0.0  0.8  1.0    0.0   
..    ...  ...    ...    ...    ...    ...  ...    ...  ...  ...  ...    ...   
266  59.0  1.0    3.0  126.0  218.0    1.0  0.0  134.0  0.0  2.2  2.0    1.0   
267  40.0  1.0    4.0  152.0  223.0    0.0  0.0  181.0  0.0  0.0  1.0    0.0   
268  42.0  1.0    3.0  130.0  180.0    0.0  0.0  150.0  0.0  0.0  1.0    0.0   
269  61.0  1.0    4.0  140.0  207.0    0.0  2.0  138.0  1.0  1.9  1.0    1.0   
270  66.0  1.0    4.0  160.0  228.0    0.0  2.0  138.0  0.0  2.3  1.0    0.0   

     6.0  
0    3.0  
1    7.0

## Pytorch Datasets
Custom datasets for training and validation sets.
The input parameters are given as Pandas dataframes representing given __X_train/y_train__ and __X_test/y_test sets__.

In [63]:
class HeartTrainSet(Dataset):
    def __init__(self, X, y):
        X_train = X.to_numpy()
        y_train = y.to_numpy()

        self.X_train = torch.tensor(X_train, dtype=torch.float32)
        self.y_train = torch.tensor(y_train)

    def __getitem__(self, index):
        return self.X_train[index], self.y_train[index]

    def __len__(self):
        return len(self.y_train)


class HeartTestSet(Dataset):
    def __init__(self, X, y):
        X_test = X.to_numpy()
        y_test = y.to_numpy()

        self.X_test = torch.tensor(X_test, dtype=torch.float32)
        self.y_test = torch.tensor(y_test)

    def __getitem__(self, index):
        return self.X_test[index], self.y_test[index]

    def __len__(self):
        return len(self.y_test)

## Logistic Regression Model built using Pytorch built-in nn.Module

In [64]:
class LogisticRegression(torch.nn.Module):
    def __init__(self, input_dim, output_dim):
        super(LogisticRegression, self).__init__()
        self.linear = torch.nn.Linear(input_dim, output_dim)

    def forward(self, x):
        outputs = self.linear(x)
        return outputs

## Initialize Pytorch train and test datasets

In [65]:
heart_train_set = HeartTrainSet(X_train, y_train)
heart_test_set = HeartTestSet(X_test, y_test)

## Initialize Pytorch DataLoaders for iterating

In [66]:
train_dataloader = DataLoader(heart_train_set, batch_size=batch_size)
test_dataloader = DataLoader(heart_test_set, batch_size=batch_size)

## Initalize model, loss function, and optimization algorithm

In [67]:
model = LogisticRegression(13, 5)
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

## Detect avalible devices for Pytorch

In [68]:
#"cuda" if torch.cuda.is_available() else
device = "cpu"

print("Using {} device".format(device))

Using cpu device


## Train and Test methods

In [76]:
def train(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)

        # Prediction error
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 100 == 0:
            loss, current = loss.item(), batch * len(X)
            print(f"Loss: {loss:>7f}")


def test(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0

    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()

    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \nAccuracy: {(100 * correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

## Train and Test

In [79]:
for t in range(epochs):
    print(f"Epoch {t + 1}:")

    train(train_dataloader, model, loss_fn, optimizer)
    test(test_dataloader, model, loss_fn)

print("Training Complete")

Epoch 1:
Loss: 1.904030
Test Error: 
Accuracy: 48.4%, Avg loss: 1.568614 

Epoch 2:
Loss: 1.377953
Test Error: 
Accuracy: 25.8%, Avg loss: 1.782386 

Epoch 3:
Loss: 1.903441
Test Error: 
Accuracy: 48.4%, Avg loss: 1.568027 

Epoch 4:
Loss: 1.377518
Test Error: 
Accuracy: 25.8%, Avg loss: 1.781756 

Epoch 5:
Loss: 1.902852
Test Error: 
Accuracy: 48.4%, Avg loss: 1.567439 

Epoch 6:
Loss: 1.377082
Test Error: 
Accuracy: 25.8%, Avg loss: 1.781126 

Epoch 7:
Loss: 1.902263
Test Error: 
Accuracy: 48.4%, Avg loss: 1.566853 

Epoch 8:
Loss: 1.376647
Test Error: 
Accuracy: 25.8%, Avg loss: 1.780497 

Epoch 9:
Loss: 1.901675
Test Error: 
Accuracy: 48.4%, Avg loss: 1.566267 

Epoch 10:
Loss: 1.376214
Test Error: 
Accuracy: 25.8%, Avg loss: 1.779869 

Epoch 11:
Loss: 1.901086
Test Error: 
Accuracy: 48.4%, Avg loss: 1.565682 

Epoch 12:
Loss: 1.375779
Test Error: 
Accuracy: 25.8%, Avg loss: 1.779240 

Epoch 13:
Loss: 1.900497
Test Error: 
Accuracy: 48.4%, Avg loss: 1.565099 

Epoch 14:
Loss: 1.375

## Save Model

In [73]:
torch.save(model, "models/logistic-regression")

__Note:__ The average accuracy of the Logistic Regression model ranges from 40-50%. Due to the high irregularity of the small dataset and the nature of the data, low accuracy rates with logistic regression models are expected. However, further hyper-parameter tuning may improve performance.

__Known Issues:__ Consistent oscillations in accuracy and testing error as well as fast but inconsistent convergence.