In [1]:
import pandas as pd
import numpy as np
import torch
from torch import nn
import seaborn as sns
import plotly.express as px
import json
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import os
from torch.utils.data import TensorDataset,DataLoader
from tqdm import tqdm

# Read Data

In [4]:
# make directories
if not os.path.isdir('pipeline'):
    os.system('mkdir -p pipeline')
else:
    print("pipeline directory already exists - delete or rename it")

In [6]:
json_labels = {}

total_length = 0
nfiles = 10
i=0

raw_dir = './data'

os.system('mkdir pipeline/1_dm')

for subdir in os.listdir(raw_dir):
    if os.path.exists(f'{raw_dir}/{subdir}/{subdir}_data.json'):
        # json files with annotations exists - dataset is annotated
        i += 1

        # Read labels
        with open(f'{raw_dir}/{subdir}/{subdir}_data.json', 'r') as f:
            json_labels[subdir] = json.load(f)
        
        # if dataset is annotated, read raw data file, decimate from 100 to 20 Hz, and save just x,y,z in 1_dm dir
        df = pd.read_csv(f'{raw_dir}/{subdir}/raw_data.csv', header=None, usecols=[2,3,4], names=['x','y','z'])
        json_labels[subdir]['length'] = len(df[::5])
        total_length += len(df[::5])
        df[::5].to_csv(f'pipeline/1_dm/{subdir}.csv', index=False)

        print(f'{subdir} - Length: {len(df[::5])}')

    else:
        print(f'Skipped {subdir}')
        
    if i == nfiles:
        break

mkdir: cannot create directory ‘pipeline/1_dm’: File exists


27 - Length: 28400
58 - Length: 41100
Skipped 29
67 - Length: 34600
Skipped 13
53 - Length: 31000
Skipped 7
61 - Length: 31200
41 - Length: 26700
Skipped 9
Skipped 46
Skipped 19
Skipped 5
Skipped 10
21 - Length: 38700
50 - Length: 23100
65 - Length: 36000
Skipped 6
Skipped 70
22 - Length: 39100


In [9]:
# Window data

os.system('mkdir pipeline/2_windowed')

for i,file in enumerate(os.listdir('pipeline/1_dm/')):
    df = pd.read_csv(f'pipeline/1_dm/{file}')
    print(f'{file} - Before {df.shape}')
    w = np.empty((len(df)-99, 300), dtype=float)
    for j in range(len(df)-99):
        w[j] = df[j:j+100].to_numpy().T.flatten()
    print(f'After {w.shape}')
    np.save(f'pipeline/2_windowed/{file.split(".")[0]}.npy', w)

61.csv - Before (156000, 3)
After (155901, 300)
50.csv - Before (115500, 3)
After (115401, 300)
67.csv - Before (173000, 3)
After (172901, 300)
58.csv - Before (205500, 3)
After (205401, 300)
41.csv - Before (133500, 3)
After (133401, 300)
65.csv - Before (180000, 3)
After (179901, 300)
22.csv - Before (195500, 3)
After (195401, 300)
53.csv - Before (155000, 3)
After (154901, 300)
27.csv - Before (142000, 3)
After (141901, 300)
21.csv - Before (193500, 3)
After (193401, 300)


In [13]:
# Labels

os.system('mkdir pipeline/labels')

for i, (key, annot) in enumerate(json_labels.items()):
    l = np.zeros(annot['length'] - 99)
    for j in range(annot['start']//5, annot['end']//5):
        for puff in annot['puffs']:
            if j >= puff['start']//5 and j <= puff['end']//5:
                l[j] = 1
    
    np.save(f'pipeline/labels/{key}.npy', l)

mkdir: cannot create directory ‘pipeline/labels’: File exists


In [19]:
# visualize true labels on continous signal
i = 21
labels = np.load(f'pipeline/labels/{i}.npy')

df = pd.read_csv(f'pipeline/1_dm/{i}.csv')
df['label'] = np.pad(labels*10, (0,99), mode='constant', constant_values=0)

fig = px.line(data_frame=df)
fig.show(renderer='browser')

193500


ValueError: Length of values (38700) does not match length of index (193500)

# Prepare Tensorflow Datasets

In [None]:
X = np.zeros((0,300))
y = np.zeros((0,1))
for file in os.listdir('pipeline/2_windowed'):
    X = np.concatenate([X, np.load(f'pipeline/2_windowed/{file}')])
    y = np.concatenate([y, np.load(f'pipeline/labels/{file}').reshape(-1,1)])

(X_train, X_test, y_train, y_test) = train_test_split(X, y, test_size=0.3, stratify=y, shuffle=True, random_state=0)

train_dataset = TensorDataset(torch.from_numpy(X_train).float(), torch.from_numpy(y_train).float())
test_dataset = TensorDataset(torch.from_numpy(X_test).float(), torch.from_numpy(y_test).float())

os.system('mkdir pipeline/datasets')
torch.save(train_dataset, 'pipeline/datasets/train_dataset.pt')
torch.save(test_dataset, 'pipeline/datasets/test_dataset.pt')

# Train Model

In [None]:
# Get cpu or gpu device for training
device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
print(f"Using {device} device")

In [None]:
# Define Model
class MLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(300, 10),
            nn.ReLU(),
            nn.Linear(10, 1)
        )
    
    def forward(self, x):
        logits = self.linear_relu_stack(x)
        return logits  

model = MLP().to(device)
    
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

In [None]:
# Train model

epochs = 10
batch_size = 64

losses = []
test_losses = []

train_dataloader = DataLoader(torch.load('pipeline/datasets/train_dataset.pt'), batch_size=batch_size)
test_dataloader = DataLoader(torch.load('pipeline/datasets/test_dataset.pt'))

In [None]:
# Train

for epoch in range(epochs):
    for X_train_batch, y_train_batch in train_dataloader:
        
        X_train_batch = X_train_batch.to(device)
        y_train_batch = y_train_batch.to(device)
        
        # Forward Pass
        logits = model(X_train_batch)

        loss = criterion(logits, y_train_batch)

        # Backward Pass
        optimizer.zero_grad
        loss.backward()
        optimizer.step()

        losses.append(loss.cpu().item())
    print(f'Epoch {epoch} - loss: {sum(losses[-batch_size:])/batch_size}')

In [None]:
res = 100
plt.plot(torch.tensor(losses)[:len(losses)-len(losses)%res].view(-1,res).mean(1))

# Test

In [None]:
# Test by Predicting with Model

# Get one file
index = 53
X_pred = torch.from_numpy(np.load(f'pipeline/2_windowed/{index}.npy')).float()
y_true = torch.from_numpy(np.load(f'pipeline/labels/{index}.npy')).reshape(-1,1).float()

model.eval()

step = 10000    # for memory
preds = []
correct = 0
loss = 0
for i in range(0, len(X_pred), step):
    end = i+step if i+step < len(X_pred) else len(X_pred)

    X_batch = X_pred[i:end].to(device)    
    y_true_batch = y_true[i:end].to(device)

    logits = model(X_batch)
    pred = torch.round(nn.Sigmoid()(logits))
    correct += sum(y_true_batch == pred)
    preds += pred.flatten().tolist()
    loss += loss_fn(logits, y_true_batch).item()

preds = np.array(preds).reshape(-1,1)
accuracy = (correct / len(y_true)).item()
loss = loss/len(X_pred)
print(f'Accuracy: {100*accuracy:.4}%')
print(f'Loss: {loss:.4}')

In [None]:
# Visualize predictions
df_pred = pd.read_csv(f'pipeline/1_xyz/{index}.csv')
df_pred['preds'] = np.pad(preds.flatten()*10, (0, 99), mode='constant', constant_values=0)

figure = px.line(df_pred.loc[::10])
figure.show(renderer='browser')

In [None]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_true.detach().to('cpu'), preds)
sns.heatmap(cm, annot=True)