In [120]:
import pandas as pd
import numpy as np
import torch
from torch import nn
import seaborn as sns

In [1]:
# Get raw, windowed data
import os
import urllib.request

# Download and unpack the dataset only when either CSV is missing, so re-running
# this cell is cheap once the files are on disk.
if not os.path.exists('./data/raw/smoking_input.csv') or not os.path.exists('./data/raw/smoking_targets.csv'):
    # Pure-Python equivalent of `mkdir -p`; does not depend on a POSIX shell.
    os.makedirs('data/raw', exist_ok=True)
    urllib.request.urlretrieve("http://ifestos.cse.sc.edu/datasets/smoking_data.tar.gz", "data/smoking_data.tar.gz")

    # os.system returns the command's exit status; fail loudly here rather than
    # letting a later read_csv trip over files that were never extracted.
    status = os.system('tar -xzvf data/smoking_data.tar.gz -C data/raw/ --strip-components=1')
    if status != 0:
        raise RuntimeError(f'tar extraction failed with exit status {status}')

smoking_data/smoking_input.csv
smoking_data/smoking_targets.csv


In [5]:
# Load data: both CSVs are read without a header row (columns get integer labels).
X, y = (
    pd.read_csv(csv_path, header=None)
    for csv_path in ('data/raw/smoking_input.csv', 'data/raw/smoking_targets.csv')
)

In [114]:
# Formatting & Preprocessing

from sklearn.model_selection import train_test_split

# Fixed seed so the train/test split (and any torch initialisation that runs
# after this cell) is repeatable from one run to the next.
SEED = 42
torch.manual_seed(SEED)

# Get cpu or gpu device for training
device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
print(f"Using {device} device")

y.columns = ["label"]

# Fill NaNs: back-fill down each column (same direction as the previous
# fillna(method='bfill'), which newer pandas deprecates), then forward-fill so
# NaNs in the last rows -- which back-fill cannot reach -- do not survive.
X = X.bfill().ffill()

# Put in torch tensors (float32) on the selected device
X_pt = torch.from_numpy(X.to_numpy()).float().to(device)
y_pt = torch.from_numpy(y.to_numpy()).float().to(device)

# train-test split: half of the rows are held out; random_state pins the shuffle
(X_train, X_test, y_train, y_test) = train_test_split(X_pt, y_pt, test_size=0.5, random_state=SEED)

Using cuda device


## PyTorch Model
- input layer 300 -> 10
- activation function ReLU
- output layer 10 -> 1
- output: raw logit (the model has no output activation; `BCEWithLogitsLoss` applies the sigmoid when computing the loss)

In [115]:
class MLP(nn.Module):
    """Small multi-layer perceptron: Linear -> ReLU -> Linear.

    The network returns raw logits; no activation is applied to the output
    layer, so it should be paired with a loss that accepts logits.

    Args:
        in_features: Width of each input row. Defaults to 300.
        hidden_features: Width of the hidden layer. Defaults to 10.
        out_features: Number of output logits per row. Defaults to 1.
    """

    def __init__(self, in_features=300, hidden_features=10, out_features=1):
        super().__init__()
        # Attribute name is kept so existing state_dict keys stay valid.
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(in_features, hidden_features),
            nn.ReLU(),
            nn.Linear(hidden_features, out_features)
        )

    def forward(self, x):
        """Return the logits for a batch `x` whose last dimension is `in_features`."""
        logits = self.linear_relu_stack(x)
        return logits

### Train Model

In [209]:
# Fresh network placed on the selected device
model = MLP().to(device)

# Binary cross-entropy computed on raw logits, optimised with plain SGD
loss_fn = nn.BCEWithLogitsLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

model.train()

epochs = 1000
for epoch in range(epochs):
    # Forward pass over the whole training split in a single batch
    pred = model(X_train)
    loss = loss_fn(pred, y_train)

    # Clear stale gradients, backpropagate, then update the weights
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Progress is reported on every 50th epoch only
    if epoch % 50 != 0:
        continue
    print(f"Epoch {epoch}: ", end='')
    print(f'\tLoss={loss.item()}')

Epoch 0: 	Loss=0.7582908272743225
Epoch 50: 	Loss=0.21632617712020874
Epoch 100: 	Loss=0.17030876874923706
Epoch 150: 	Loss=0.14946845173835754
Epoch 200: 	Loss=0.13691826164722443
Epoch 250: 	Loss=0.12829017639160156
Epoch 300: 	Loss=0.12156131118535995
Epoch 350: 	Loss=0.1160607784986496
Epoch 400: 	Loss=0.11162348091602325
Epoch 450: 	Loss=0.10797412693500519
Epoch 500: 	Loss=0.10483480989933014
Epoch 550: 	Loss=0.10212817788124084
Epoch 600: 	Loss=0.09969791769981384
Epoch 650: 	Loss=0.09751300513744354
Epoch 700: 	Loss=0.09551876783370972
Epoch 750: 	Loss=0.093685083091259
Epoch 800: 	Loss=0.0919940248131752
Epoch 850: 	Loss=0.09033877402544022
Epoch 900: 	Loss=0.08869588375091553
Epoch 950: 	Loss=0.08707770705223083


### Test Model

In [210]:
# Evaluate on the held-out split with gradients disabled.
model.eval()
with torch.no_grad():
    pred = model(X_test)
    loss = loss_fn(pred, y_test).item()

    # The model emits logits, and sigmoid(logit) >= 0.5 exactly when logit >= 0,
    # so thresholding the logit at 0 is the usual 0.5-probability decision rule.
    pred_labels = (pred >= 0).to(y_test.dtype)
    # Integer count of matches divided by the number of test rows.
    accuracy = (pred_labels == y_test).sum().item() / len(y_test)

print(f'Test Loss: {loss:.3}')
# Fixed-point format: the previous ':.3' (3 significant digits) rendered
# exactly 100% accuracy as '1e+02%'.
print(f'Accuracy: {100*accuracy:.1f}%')

Test Loss: 0.0856
Accuracy: 97.6%


# Test on Continuous Signal

In [None]:
# TODO: test the trained model on a continuous signal (not implemented yet)