In [93]:
import pandas as pd
import numpy as np
import torch
from torch import nn
import seaborn as sns
import plotly.express as px
import json
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import os

# Read Data

In [94]:
# make directories
if not os.path.isdir('pipeline'):
    os.system('mkdir -p pipeline')
else:
    print("pipeline directory already exists - delete or rename it")

In [111]:
json_labels = {}

total_length = 0
nfiles = 5
i=0

raw_dir = './data'

os.system('mkdir pipeline/1_xyz')

for subdir in os.listdir(raw_dir):
    is_annot = False
    for file in os.listdir(f'./{raw_dir}/{subdir}'):
        if file.endswith('.json'):      
            # json files with annotations - dataset is annotated
            i += 1
            is_annot = True

            # Read labels
            with open(f'{raw_dir}/{subdir}/{file}', 'r') as f:
                json_labels[subdir] = json.load(f)
        
        if is_annot and file.startswith('raw_data'):
            # if dataset is annotated, read raw data file and save just x,y,z in 1_xyz dir
            df = pd.read_csv(f'{raw_dir}/{subdir}/{file}', header=None, usecols=[2,3,4], names=['x','y','z'])
            json_labels[subdir]['length'] = len(df)
            total_length += len(df)
            df.to_csv(f'pipeline/1_xyz/{subdir}.csv', index=False)

    if(is_annot):
        print(subdir)
    else:
        print(f'Skipped {subdir}')
        
    if i == nfiles:
        break

mkdir: cannot create directory ‘pipeline/1_xyz’: File exists


16
Skipped 10
Skipped 38
47
56
61
Skipped 26
Skipped 52
59


In [96]:
# Window data

os.system('mkdir pipeline/2_windowed')
num_rows = 5000

for i,file in enumerate(os.listdir('pipeline/1_xyz/')):
    df = pd.read_csv(f'pipeline/1_xyz/{file}')
    print(f'Before {df.shape}')
    w = np.empty((len(df)-99, 300), dtype=float)
    for j in range(len(df)-99):
        w[j] = df[j:j+100].to_numpy().T.flatten()
    print(f'After {w.shape}')
    np.save(f'pipeline/2_windowed/{file.split(".")[0]}.npy', w)

Before (156000, 3)
After (155901, 300)
Before (206000, 3)
After (205901, 300)
Before (142500, 3)
After (142401, 300)
Before (207500, 3)
After (207401, 300)
Before (156000, 3)
After (155901, 300)


In [97]:
# Labels

os.system('mkdir pipeline/labels')

for i, (key, annot) in enumerate(json_labels.items()):
    l = np.zeros(annot['length'] - 99)
    for j in range(annot['start'], annot['end']):
        for puff in annot['puffs']:
            if j >= puff['start'] and j <= puff['end']:
                l[j] = 1
    
    np.save(f'pipeline/labels/{key}.npy', l)

In [99]:
# visualize true labels on continous signal
i = 47
labels = np.load(f'pipeline/labels/{i}.npy')

df = pd.read_csv(f'pipeline/1_xyz/{i}.csv')
df['label'] = np.pad(labels*10, (0,99), mode='constant', constant_values=0)

fig = px.line(data_frame=df.loc[::10])
fig.show(renderer='browser')

Opening in existing browser session.


Gtk-Message: 13:58:25.289: Failed to load module "xapp-gtk3-module"
Gtk-Message: 13:58:25.290: Failed to load module "appmenu-gtk-module"


# Train Model

In [100]:
# Get cpu or gpu device for training
device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
print(f"Using {device} device")

Using cuda device


In [None]:
# # Prepare Data for Pytorch
# X_pt = torch.from_numpy(X).float()
# y_pt = torch.from_numpy(y).reshape(-1,1).float()

In [75]:
# Define Model
class MLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(300, 10),
            nn.ReLU(),
            nn.Linear(10, 1)
        )
    
    def forward(self, x):
        logits = self.linear_relu_stack(x)
        return logits  


In [76]:
def train(model, epochs, batch_size, loss_fn, optimizer, X_train, X_test, y_train, y_test):
    losses = []
    test_losses = []

    model.train()
    for i in range(epochs):
        print(f'Epoch: {i}')
        k = 0
        for j in range(0, len(X_train), batch_size):
            end = j+batch_size if j+batch_size < len(X_train) else len(X_train)

            X_train_batch = X_train[j:end].to(device)
            y_train_batch = y_train[j:end].to(device)

            # Forward pass
            logits = model(X_train_batch)

            loss = loss_fn(logits, y_train_batch)

            # Backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            losses.append(loss.item())


            # Test
            test_end = j+batch_size if j+batch_size < len(X_test) else len(X_test)

            X_test_batch = X_test[j:test_end].to(device)
            y_test_batch = y_test[j:test_end].to(device)
            test_logits = model(X_test_batch)
            test_loss = loss_fn(test_logits, y_test_batch)
            test_losses.append(test_loss.item())

            if k%100 == 0:
                print(f"Batch {k}: ", end='')
                print(f'\tLoss={loss.item():.5}', end='')
                print(f'\tTest Loss={test_loss.item():.5}')
            k += 1
    
    return (losses, test_losses)

In [77]:
model = MLP().to(device)

In [124]:
X = np.zeros((0, 300))
y = np.zeros((0, 1))

# Get X and y
for file in os.listdir('pipeline/2_windowed'):
    X = np.concatenate([X, np.load(f'pipeline/2_windowed/{file}')])
    y = np.concatenate([y, np.load(f'pipeline/labels/{file}').reshape(-1,1)])

(867505, 1)
[-1.9003906 -1.8066406 -1.7373047 -1.6845703 -1.7207031 -1.8525391
 -1.9462891 -1.9316406 -1.8476562 -1.8740234 -2.0253906 -2.0683594
 -2.0175781 -1.8984375 -1.8857422 -1.9101562 -1.9267578 -1.9697266
 -1.953125  -1.9199219 -1.9365234 -1.953125  -1.9384766 -1.9072266
 -1.9775391 -2.0703125 -2.0585938 -2.0175781 -1.9814453 -1.9599609
 -1.9003906 -1.8691406 -1.8623047 -1.8144531 -1.6875    -1.6181641
 -1.7089844 -1.7734375 -1.7929688 -1.9052734 -2.0410156 -2.0898438
 -2.0820312 -2.0957031 -2.1601562 -2.1054688 -2.0742188 -2.0175781
 -2.0292969 -1.9863281 -1.9101562 -1.8330078 -1.8291016 -1.9199219
 -1.9365234 -1.8886719 -1.8935547 -1.8740234 -1.8310547 -1.8027344
 -1.65625   -1.5869141 -1.5869141 -1.6201172 -1.7666016 -1.890625
 -1.921875  -1.9052734 -1.8330078 -1.7763672 -1.8046875 -1.8574219
 -1.8769531 -1.9580078 -2.078125  -2.0917969 -2.1035156 -2.1914062
 -2.1542969 -2.1582031 -2.1855469 -2.1816406 -2.0996094 -2.0253906
 -1.9990234 -1.9599609 -1.9580078 -1.9697266 -1.984

867505

In [106]:
# Train model

loss_fn = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

epochs = 2
batch_size = 128

losses = []
test_losses = []

for file in os.listdir('pipeline/2_windowed'):
    print(f'Starting file {file.split(".")[0]}')
    X = torch.from_numpy(np.load(f'pipeline/2_windowed/{file}')).float()
    y = torch.from_numpy(np.load(f'pipeline/labels/{file}')).reshape(-1,1).float()

    (X_train, X_test, y_train, y_test) = train_test_split(X, y, test_size=0.5, stratify=y)

    # (l, tl) = train(model, epochs, batch_size, loss_fn, optimizer, X_train, X_test, y_train, y_test)
    k = 0
    for j in range(0, len(X_train), batch_size):
        end = j+batch_size if j+batch_size < len(X_train) else len(X_train)

        X_train_batch = X_train[j:end].to(device)
        y_train_batch = y_train[j:end].to(device)

        # Forward pass
        logits = model(X_train_batch)

        loss = loss_fn(logits, y_train_batch)

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        losses.append(loss.item())


        # Test
        test_end = j+batch_size if j+batch_size < len(X_test) else len(X_test)

        X_test_batch = X_test[j:test_end].to(device)
        y_test_batch = y_test[j:test_end].to(device)
        test_logits = model(X_test_batch)
        test_loss = loss_fn(test_logits, y_test_batch)
        test_losses.append(test_loss.item())

        if k % 100 == 0:
            print(f"Batch {k}: ", end='')
            print(f'\tLoss={loss.item():.5}', end='')
            print(f'\tTest Loss={test_loss.item():.5}')
        k += 1

figure = px.line(pd.DataFrame({"loss": losses, "validation loss":test_losses}))
figure.show(renderer='browser')

Starting file 47
Batch 0: 	Loss=0.36105	Test Loss=0.25988
Batch 100: 	Loss=0.13024	Test Loss=0.11171
Batch 200: 	Loss=0.059678	Test Loss=0.10581
Batch 300: 	Loss=0.051927	Test Loss=0.022507
Batch 400: 	Loss=0.084085	Test Loss=0.12683
Batch 500: 	Loss=0.060093	Test Loss=0.10556
Batch 600: 	Loss=0.030768	Test Loss=0.092163
Starting file 59
Batch 0: 	Loss=0.43432	Test Loss=0.41763
Batch 100: 	Loss=0.073547	Test Loss=0.11882
Batch 200: 	Loss=0.13922	Test Loss=0.088461
Batch 300: 	Loss=0.080599	Test Loss=0.049551
Batch 400: 	Loss=0.045343	Test Loss=0.087375
Batch 500: 	Loss=0.13691	Test Loss=0.12193
Batch 600: 	Loss=0.10053	Test Loss=0.059013
Batch 700: 	Loss=0.098748	Test Loss=0.040098
Batch 800: 	Loss=0.09064	Test Loss=0.076462
Starting file 61
Batch 0: 	Loss=0.439	Test Loss=0.37457
Batch 100: 	Loss=0.20838	Test Loss=0.11925
Batch 200: 	Loss=0.13362	Test Loss=0.20924
Batch 300: 	Loss=0.097438	Test Loss=0.19026
Batch 400: 	Loss=0.078356	Test Loss=0.097806
Batch 500: 	Loss=0.15348	Test Loss

Gtk-Message: 14:10:00.034: Failed to load module "xapp-gtk3-module"
Gtk-Message: 14:10:00.034: Failed to load module "appmenu-gtk-module"


In [None]:
# Test by Predicting with Model

# Get one file
index = 1
X_pred = torch.tensor(windowed[index]).float().to(device)
y_true = torch.tensor(all_labels[index]).float().reshape(-1,1).to(device)

model.eval()

step = 10000    # for memory
preds = []
correct = 0
loss = 0
for i in range(0, len(X_pred), step):
    end = i+step if i+step < len(X_pred) else len(X_pred)
    with torch.no_grad():
        logits = model(X_pred[i:end])
        pred = torch.round(nn.Sigmoid()(logits))
        correct += sum(y_true[i:end] == pred)
        preds += pred.flatten().tolist()
        loss += loss_fn(logits, y_true[i:end]).item()

preds = np.array(preds).reshape(-1,1)
accuracy = (correct / len(y_pt)).item()
loss = loss/len(X_pred)
print(f'Accuracy: {100*accuracy:.4}%')
print(f'Loss: {loss:.4}')

In [None]:
# Visualize predictions
df_pred = raw_dfs[index]
df_pred['preds'] = np.pad(preds.flatten()*10, (0, 99), mode='constant', constant_values=0)

figure = px.line(df_pred.loc[::10])
figure.show(renderer='browser')