## Import all the necessary dependencies.

In [228]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

## Set the relevant hyper-parameters

In [229]:
dataset_path = "data\processed\processed.cleveland.csv"
train_test_ratio = 0.9  # Ratio of training examples to test examples
batch_size = 128
epochs = 50
learning_rate = 1.0e-4

## Data Ingestion Check

In [230]:
file = pd.read_csv(dataset_path)

X_shape, y_shape = file.shape
print("File Shape: (" + str(X_shape) + ", " + str(y_shape) + ") \n")

X = file.iloc[0:, :y_shape - 1]
y = file.iloc[0:, y_shape - 1]

print("X:\n")
print(X)
print("\ny:\n")
print(y)

File Shape: (302, 14) 

X:

     63.0  1.0  1.0.1  145.0  233.0  1.0.2  2.0  150.0  0.0  2.3  3.0  0.0.1  \
0    67.0  1.0    4.0  160.0  286.0    0.0  2.0  108.0  1.0  1.5  2.0    3.0   
1    67.0  1.0    4.0  120.0  229.0    0.0  2.0  129.0  1.0  2.6  2.0    2.0   
2    37.0  1.0    3.0  130.0  250.0    0.0  0.0  187.0  0.0  3.5  3.0    0.0   
3    41.0  0.0    2.0  130.0  204.0    0.0  2.0  172.0  0.0  1.4  1.0    0.0   
4    56.0  1.0    2.0  120.0  236.0    0.0  0.0  178.0  0.0  0.8  1.0    0.0   
..    ...  ...    ...    ...    ...    ...  ...    ...  ...  ...  ...    ...   
297  45.0  1.0    1.0  110.0  264.0    0.0  0.0  132.0  0.0  1.2  2.0    0.0   
298  68.0  1.0    4.0  144.0  193.0    1.0  0.0  141.0  0.0  3.4  2.0    2.0   
299  57.0  1.0    4.0  130.0  131.0    0.0  0.0  115.0  1.0  1.2  2.0    1.0   
300  57.0  0.0    2.0  130.0  236.0    0.0  2.0  174.0  0.0  0.0  2.0    1.0   
301  38.0  1.0    3.0  138.0  175.0    0.0  0.0  173.0  0.0  0.0  1.0    0.0   

     6.0  


__Note:__ The given datasets are contained in one file, so we will split into training and validation examples with the defined ratio.

In [231]:
split = int(np.floor(y.shape[0] * train_test_ratio))

X_train = X.iloc[:split, :y_shape - 1]
X_test = X.iloc[split:, :y_shape - 1]
print("\nX_train")
print(X_train)
print("\nX_test")
print(X_test)

y_train = y.iloc[:split]
y_test = y.iloc[split:]
print("\ny_train")
y_train = y_train.replace(to_replace=[2,3,4], value=[1,1,1])
print(y_train)
print("\ny_test")
y_test = y_test.replace(to_replace=[2,3,4], value=[1,1,1])
print(y_test)


X_train
     63.0  1.0  1.0.1  145.0  233.0  1.0.2  2.0  150.0  0.0  2.3  3.0  0.0.1  \
0    67.0  1.0    4.0  160.0  286.0    0.0  2.0  108.0  1.0  1.5  2.0    3.0   
1    67.0  1.0    4.0  120.0  229.0    0.0  2.0  129.0  1.0  2.6  2.0    2.0   
2    37.0  1.0    3.0  130.0  250.0    0.0  0.0  187.0  0.0  3.5  3.0    0.0   
3    41.0  0.0    2.0  130.0  204.0    0.0  2.0  172.0  0.0  1.4  1.0    0.0   
4    56.0  1.0    2.0  120.0  236.0    0.0  0.0  178.0  0.0  0.8  1.0    0.0   
..    ...  ...    ...    ...    ...    ...  ...    ...  ...  ...  ...    ...   
266  59.0  1.0    3.0  126.0  218.0    1.0  0.0  134.0  0.0  2.2  2.0    1.0   
267  40.0  1.0    4.0  152.0  223.0    0.0  0.0  181.0  0.0  0.0  1.0    0.0   
268  42.0  1.0    3.0  130.0  180.0    0.0  0.0  150.0  0.0  0.0  1.0    0.0   
269  61.0  1.0    4.0  140.0  207.0    0.0  2.0  138.0  1.0  1.9  1.0    1.0   
270  66.0  1.0    4.0  160.0  228.0    0.0  2.0  138.0  0.0  2.3  1.0    0.0   

     6.0  
0    3.0  
1    7.0

## Pytorch Datasets
Custom datasets for training and validation sets.
The input parameters are given as Pandas dataframes representing given __X_train/y_train__ and __X_test/y_test sets__.

In [232]:
class HeartTrainSet(Dataset):
    def __init__(self, X, y):
        X_train = X.to_numpy()
        y_train = y.to_numpy()

        self.X_train = torch.tensor(X_train, dtype=torch.float32)
        self.y_train = torch.tensor(y_train)

    def __getitem__(self, index):
        return self.X_train[index], self.y_train[index]

    def __len__(self):
        return len(self.y_train)


class HeartTestSet(Dataset):
    def __init__(self, X, y):
        X_test = X.to_numpy()
        y_test = y.to_numpy()

        self.X_test = torch.tensor(X_test, dtype=torch.float32)
        self.y_test = torch.tensor(y_test)

    def __getitem__(self, index):
        return self.X_test[index], self.y_test[index]

    def __len__(self):
        return len(self.y_test)

## Logistic Regression Model built using Pytorch built-in nn.Module

In [233]:
class LogisticRegression(torch.nn.Module):
    def __init__(self, input_dim, output_dim):
        super(LogisticRegression, self).__init__()
        self.linear = torch.nn.Linear(input_dim, output_dim)
        torch.nn.init.xavier_normal(self.linear.weight)

    def forward(self, x):
        outputs = self.linear(x)
        return outputs

## Initialize Pytorch train and test datasets

In [234]:
heart_train_set = HeartTrainSet(X_train, y_train)
heart_test_set = HeartTestSet(X_test, y_test)

## Initialize Pytorch DataLoaders for iterating

In [235]:
train_dataloader = DataLoader(heart_train_set, batch_size=batch_size)
test_dataloader = DataLoader(heart_test_set, batch_size=batch_size)

## Initalize model, loss function, and optimization algorithm

In [236]:
model = LogisticRegression(13, 2)
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)

  torch.nn.init.xavier_normal(self.linear.weight)


## Detect avalible devices for Pytorch

In [237]:
#"cuda" if torch.cuda.is_available() else
device = "cpu"

print("Using {} device".format(device))

Using cpu device


## Train and Test methods

In [238]:
def train(dataloader, model, loss_fn, optimizer, scheduler):
    size = len(dataloader.dataset)
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)

        # Prediction error
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 100 == 0:
            loss, current = loss.item(), batch * len(X)
            print(f"Loss: {loss:>7f}")
    scheduler.step()


def test(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0

    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()

    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \nAccuracy: {(100 * correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

## Train and Test

In [239]:
for t in range(epochs):
    print(f"Epoch {t + 1}:")

    train(train_dataloader, model, loss_fn, optimizer, scheduler)
    test(test_dataloader, model, loss_fn)

print("Training Complete")

Epoch 1:
Loss: 7.273410
Test Error: 
Accuracy: 64.5%, Avg loss: 3.418936 

Epoch 2:
Loss: 3.502120
Test Error: 
Accuracy: 64.5%, Avg loss: 3.322720 

Epoch 3:
Loss: 3.263786
Test Error: 
Accuracy: 64.5%, Avg loss: 3.294054 

Epoch 4:
Loss: 3.217453
Test Error: 
Accuracy: 64.5%, Avg loss: 3.271757 

Epoch 5:
Loss: 3.189638
Test Error: 
Accuracy: 64.5%, Avg loss: 3.252384 

Epoch 6:
Loss: 3.167383
Test Error: 
Accuracy: 64.5%, Avg loss: 3.235251 

Epoch 7:
Loss: 3.148445
Test Error: 
Accuracy: 64.5%, Avg loss: 3.220037 

Epoch 8:
Loss: 3.132063
Test Error: 
Accuracy: 64.5%, Avg loss: 3.206480 

Epoch 9:
Loss: 3.117783
Test Error: 
Accuracy: 64.5%, Avg loss: 3.194387 

Epoch 10:
Loss: 3.105279
Test Error: 
Accuracy: 64.5%, Avg loss: 3.183587 

Epoch 11:
Loss: 3.094291
Test Error: 
Accuracy: 64.5%, Avg loss: 3.173931 

Epoch 12:
Loss: 3.084606
Test Error: 
Accuracy: 61.3%, Avg loss: 3.165294 

Epoch 13:
Loss: 3.076045
Test Error: 
Accuracy: 61.3%, Avg loss: 3.157562 

Epoch 14:
Loss: 3.068

## Save Model

In [240]:
torch.save(model, "models/logistic-regression")

__Note:__ The average accuracy of the Logistic Regression model ranges from 40-50%. Due to the high irregularity of the small dataset and the nature of the data, low accuracy rates with logistic regression models are expected. However, further hyper-parameter tuning may improve performance.

__Known Issues:__ Consistent oscillations in accuracy and testing error as well as fast but inconsistent convergence.