In [None]:
import pandas as pd
import numpy as np
import torch
from torch import nn
import seaborn as sns
import plotly.express as px
import json
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import os

# Read Data

In [None]:
json_labels = []    # parsed annotation json, one entry per loaded recording
raw_dfs = []        # columns 2, 3 and 4 of the matching raw data file, same order

nfiles = 71         # stop after this many annotated recordings
data_root = './data'

# Sort the listing: os.listdir returns entries in arbitrary order, which would
# make the "index -> recording" mapping differ between machines and runs.
for subdir in sorted(os.listdir(data_root)):
    subdir_path = os.path.join(data_root, subdir)
    if not os.path.isdir(subdir_path):
        # stray file sitting next to the recording folders; nothing to list
        continue

    files = sorted(os.listdir(subdir_path))
    annot_files = [name for name in files if name.endswith('.json')]
    raw_files = [name for name in files if name.startswith('raw_data')]

    # A recording is only usable when it has BOTH an annotation file and a raw
    # data file. Looking both up before reading anything means the result no
    # longer depends on which of the two the directory listing yields first,
    # and json_labels[k] always belongs to raw_dfs[k].
    if not annot_files or not raw_files:
        print(f'Skipped {subdir}')
        continue

    # NOTE(review): assumes one annotation file and one raw data file per
    # folder; if a folder holds several, only the alphabetically first is used.
    with open(os.path.join(subdir_path, annot_files[0]), 'r') as f:
        json_labels.append(json.load(f))

    df = pd.read_csv(os.path.join(subdir_path, raw_files[0]), header=None)
    # Copy the column selection so later cells never write into a slice of df
    raw_dfs.append(df[[2, 3, 4]].copy())
    print(subdir)

    if len(json_labels) == nfiles:
        break

In [None]:
# Window data
# Every recording is cut into overlapping windows of WINDOW_SIZE consecutive
# samples with a stride of one sample. Each window is flattened channel by
# channel: all samples of the first column, then the second, then the third.

WINDOW_SIZE = 100

windowed = []

for i, df in enumerate(raw_dfs):
    print(f'Raw {i}: {df.shape}')

    signal = df.to_numpy(dtype=float)       # (n_samples, n_channels)
    if len(signal) < WINDOW_SIZE:
        # Fail with a readable message instead of a negative-dimension error
        raise ValueError(
            f'Recording {i} has only {len(signal)} samples; '
            f'at least {WINDOW_SIZE} are needed for one window'
        )

    # sliding_window_view yields shape (n_windows, n_channels, WINDOW_SIZE)
    # without a Python-level loop. Merging the last two axes reproduces the
    # layout of df[j:j+WINDOW_SIZE].to_numpy().T.flatten() for every j.
    views = np.lib.stride_tricks.sliding_window_view(signal, WINDOW_SIZE, axis=0)
    w = views.reshape(len(views), -1)       # reshape copies -> independent array

    print(f'Windowed {i}: {w.shape}')
    windowed.append(w)

X = np.concatenate(windowed)
print(f'X: {X.shape}')

In [None]:
# Labels
all_labels = []

for i, annot in enumerate(json_labels):
    # Each json file corresponds to the windowed data at the same index.
    # labels[j] == 1 when index j lies inside an annotated puff.
    labels = np.zeros(len(windowed[i]))
    print(i)
    for puff in annot['puffs']:
        # Integer indices j with annot['start'] <= j < annot['end'] and
        # puff['start'] <= j <= puff['end'], clipped to [0, len(labels)).
        # Clipping matters because `labels` has one entry per window: a puff
        # reaching past the last window used to raise IndexError, and a
        # negative index used to wrap around to the end of the array.
        first = max(annot['start'], int(np.ceil(puff['start'])), 0)
        stop = min(annot['end'], int(np.floor(puff['end'])) + 1, len(labels))
        if first < stop:
            labels[first:stop] = 1
    all_labels.append(labels)

y = np.concatenate(all_labels)
print(f'y: {y.shape}')

In [None]:
# Plot the true labels on top of the continuous signal
df = pd.concat(raw_dfs, ignore_index=True)

# Each label array gets 99 trailing zeros appended before the arrays are joined
# (presumably to bring it back to the length of its raw recording — confirm
# against the windowing step). Labels are multiplied by 10, presumably so the
# 0/1 track is visible next to the signal values.
df['labels'] = np.concatenate(
    [np.concatenate([label_track * 10, np.zeros(99)]) for label_track in all_labels]
)

# Only every 100th row is drawn
fig = px.line(data_frame=df.loc[::100])
fig.show(renderer='browser')

# Train Model

In [None]:
# Choose the device used for training: CUDA if available, otherwise the MPS
# backend if available, otherwise the CPU
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device} device")

In [None]:
# Convert the numpy feature matrix and label vector to float32 tensors
X_pt = torch.from_numpy(X).to(torch.float32)
# Labels become a single-column 2-D tensor of shape (N, 1)
y_pt = torch.from_numpy(y).to(torch.float32).reshape(-1, 1)

In [None]:
# Define Model
class MLP(nn.Module):
    """Small fully connected network: 300 inputs -> 10 hidden units (ReLU) -> 1 output."""

    def __init__(self):
        super().__init__()
        layers = [
            nn.Linear(300, 10),
            nn.ReLU(),
            nn.Linear(10, 1),
        ]
        self.linear_relu_stack = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the stack and return the raw output (no sigmoid applied)."""
        return self.linear_relu_stack(x)


In [None]:
def train(model, epochs, batch_size, loss_fn, optimizer, X_train, X_test, y_train, y_test):
    """Train ``model`` with mini-batches and record a held-out loss after every step.

    Args:
        model: ``nn.Module`` to optimise; batches are moved to the device its
            parameters already live on.
        epochs: number of passes over ``X_train``.
        batch_size: number of rows per mini-batch.
        loss_fn: callable ``loss_fn(logits, targets)`` returning a scalar tensor.
        optimizer: optimizer built over ``model.parameters()``.
        X_train, y_train: training inputs and targets, indexable by row.
        X_test, y_test: held-out inputs and targets, indexable by row.

    Returns:
        ``(losses, test_losses)``: two lists of floats of equal length, one
        entry per training batch. ``test_losses`` holds NaN when ``X_test`` is
        empty.
    """
    # Use the model's own device so the function does not depend on a global.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device('cpu')

    n_train = len(X_train)
    n_test = len(X_test)

    losses = []
    test_losses = []

    model.train()
    for i in range(epochs):
        print(f'Epoch: {i}')
        for k, j in enumerate(range(0, n_train, batch_size)):
            # Slicing clamps at the end of the tensor, so the last batch is
            # simply shorter.
            X_train_batch = X_train[j:j + batch_size].to(run_device)
            y_train_batch = y_train[j:j + batch_size].to(run_device)

            # Forward pass
            logits = model(X_train_batch)
            loss = loss_fn(logits, y_train_batch)

            # Backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            losses.append(loss.item())

            # Held-out loss on one test batch. The offset wraps around the test
            # set: indexing it with the raw training offset produced an empty
            # slice (and a NaN loss) once j ran past len(X_test).
            if n_test > 0:
                t = j % n_test
                model.eval()
                with torch.no_grad():   # monitoring only, no graph needed
                    test_logits = model(X_test[t:t + batch_size].to(run_device))
                    test_loss_value = loss_fn(
                        test_logits, y_test[t:t + batch_size].to(run_device)
                    ).item()
                model.train()
            else:
                test_loss_value = float('nan')
            test_losses.append(test_loss_value)

            if k % 100 == 0:
                print(f"Batch {k}: ", end='')
                print(f'\tLoss={loss.item():.5}', end='')
                print(f'\tTest Loss={test_loss_value:.5}')

    return (losses, test_losses)

In [None]:
# Build the network, then move its parameters to the selected device
model = MLP()
model = model.to(device)

In [None]:
# Train model

# Hold out half of the windows, stratified so both halves keep the same class
# ratio. random_state pins the shuffle so Restart & Run All reproduces the
# same split.
# NOTE(review): windows are cut with a stride of one sample, so neighbouring
# windows are near-duplicates. A random window-level split puts such
# neighbours on both sides, which likely makes the held-out loss optimistic.
# Consider splitting by recording instead.
(X_train, X_test, y_train, y_test) = train_test_split(
    X_pt, y_pt, test_size=0.5, stratify=y_pt, random_state=42
)

# Loss function and optimizer
loss_fn = nn.BCEWithLogitsLoss()    # takes raw logits; applies the sigmoid internally
optimizer = torch.optim.Adam(model.parameters(), lr=0.00001)

epochs = 2
batch_size = 128

(losses, test_losses) = train(model, epochs, batch_size, loss_fn, optimizer, X_train, X_test, y_train, y_test)


# One point per training batch for both curves
figure = px.line(pd.DataFrame({"loss": losses, "validation loss":test_losses}))
figure.show(renderer='browser')

In [None]:
# Test by Predicting with Model
# NOTE(review): the windows of this recording were part of the pool handed to
# train_test_split, so roughly half of them were seen during training. Treat
# the numbers below as a sanity check, not as a generalisation estimate.

# Get one file
index = 1
X_pred = torch.tensor(windowed[index]).float().to(device)
y_true = torch.tensor(all_labels[index]).float().reshape(-1,1).to(device)

model.eval()

step = 10000    # for memory
preds = []
correct = 0
loss = 0.0
with torch.no_grad():
    for i in range(0, len(X_pred), step):
        # slicing clamps at the end, so the last chunk may be shorter
        targets = y_true[i:i + step]
        logits = model(X_pred[i:i + step])
        pred = torch.round(torch.sigmoid(logits))
        # tensor-side reduction; the builtin sum() iterated row by row
        correct += (pred == targets).sum().item()
        preds += pred.flatten().tolist()
        # Assumes loss_fn averages over the chunk (the BCEWithLogitsLoss
        # default): weight by chunk size so that dividing by the number of
        # windows below yields the per-window mean.
        loss += loss_fn(logits, targets).item() * len(targets)

preds = np.array(preds).reshape(-1,1)
# Divide by the number of windows in THIS recording, not by len(y_pt), which
# counts the windows of every recording.
accuracy = correct / len(y_true)
loss = loss/len(X_pred)
print(f'Accuracy: {100*accuracy:.4}%')
print(f'Loss: {loss:.4}')

In [None]:
# Visualize predictions
# Work on a copy: adding the column to raw_dfs[index] itself would leave a
# fourth column in the raw data, so re-running the windowing cell afterwards
# would see a different input.
df_pred = raw_dfs[index].copy()
# Predictions are scaled by 10 and followed by 99 trailing zeros
# (presumably to match the length of the raw recording — confirm).
df_pred['preds'] = np.pad(preds.flatten()*10, (0, 99), mode='constant', constant_values=0)

# Only every 10th row is drawn
figure = px.line(df_pred.loc[::10])
figure.show(renderer='browser')