In [91]:
%load_ext autoreload
%autoreload 2
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from utils import *
plt.style.use('ggplot')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [96]:
import torch
from torch.utils.data import Dataset
import numpy as np
from tqdm import tqdm

class PolymerDataset(Dataset):
    def __init__(self, data_paths, timesteps=64, diff_threshold=0) -> None:
        self.raw_data = [np.load(data_path, allow_pickle=True) for data_path in data_paths]
        self.prepare(timesteps=timesteps, diff_threshold=diff_threshold)
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]
    
    def _process_event(self, event, timesteps=64, diff_threshold=0):
        compressed_event = []
        step_size = int(np.ceil(len(event) / timesteps))
        for i in range(timesteps):
            sub_event = event[i*step_size:(i+1)*step_size]
            features = build_features(sub_event, diff_threshold=diff_threshold)
            compressed_event.append(np.array(list(features.values())))
        return np.array(compressed_event)

    def prepare(self, timesteps=64, diff_threshold=0, seed=42):
        data = []
        labels = []
        data_size = min([len(d) for d in self.raw_data])
        np.random.seed(seed)

        for data_index, raw_data in enumerate(self.raw_data):
            indices = np.random.permutation(len(raw_data))
            raw_data = raw_data[indices[:data_size]]
            data_lens = [len(event) for event in raw_data]
            min_event_len = np.quantile(data_lens, 0.05)
            max_event_len = np.quantile(data_lens, 0.95)

            for event in tqdm(raw_data):
                if len(event) > min_event_len and len(event) < max_event_len:
                    processed_event = self._process_event(event, timesteps=timesteps, diff_threshold=diff_threshold)
                    data.append(processed_event)
                    labels.append(data_index)

        self.data = torch.tensor(np.array(data), dtype=torch.float)
        self.labels = torch.tensor(np.array(labels), dtype=torch.long)
        return self

In [97]:
TIMESTEPS = 4
DIFF_THRESHOLD = 10
dataset = PolymerDataset(['../data/AA66466AA.npy', '../data/AA66566AA.npy'], timesteps=TIMESTEPS, diff_threshold=DIFF_THRESHOLD)


100%|██████████| 43074/43074 [13:22<00:00, 53.68it/s]
100%|██████████| 43074/43074 [05:16<00:00, 135.91it/s]


In [6]:
# torch.save(dataset.data, f'../data/AA6245_T{TIMESTEPS}.pt')

In [98]:
dataset.data.shape, dataset.labels.shape

(torch.Size([77464, 4, 13]), torch.Size([77464]))

In [69]:
dataset.data = (dataset.data - dataset.data.mean(dim=1, keepdim=True)) / dataset.data.std(dim=1, keepdim=True)

In [25]:
from torch.utils.data import random_split
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_data, test_data = random_split(dataset, [train_size, test_size])

In [99]:
(dataset.labels == 0).sum(), (dataset.labels == 1).sum(), (dataset.labels == 2).sum()

(tensor(38736), tensor(38728), tensor(0))

In [82]:
from torch.utils.data import DataLoader

class PolymerLSTM(torch.nn.Module):
    def __init__(self, num_features, num_classes, num_layers=1, hidden_size=32) -> None:
        super().__init__()
        self.lstm = torch.nn.LSTM(input_size=num_features, num_layers=num_layers, hidden_size=hidden_size, batch_first=True)
        self.linear = torch.nn.Linear(hidden_size, num_classes)
    
    def forward(self, X):
        lstm_out, _ = self.lstm(X)
        outputs = lstm_out[:, -1, :]
        outputs = self.linear(outputs)
        probs = torch.nn.functional.log_softmax(outputs, dim=1)
        return probs
    
    def predict(self, X):
        probs = self.forward(X)
        preds = torch.argmax(probs, dim=1, keepdim=False)
        return preds


def train(dataset, num_epochs=100, batch_size=64, num_features=2, num_classes=2, hidden_size=32, num_layers=1, lr_rate=0.05):
    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    model = PolymerLSTM(num_features, num_classes, num_layers=num_layers, hidden_size=hidden_size)
    loss_function = torch.nn.NLLLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr_rate)

    for epoch in range(num_epochs):
        num_correct = 0
        for X, y in iter(data_loader):
            model.zero_grad()
            probs = model(X)
            loss = loss_function(probs, y)
            loss.backward()
            optimizer.step()
            preds = torch.argmax(probs, dim=1, keepdim=False)
            num_correct += (preds == y).sum()
        print(f'epoch={epoch}/{num_epochs}, loss={loss}, accuracy={num_correct*100/len(dataset)}')
    
    return model

In [100]:
model = train(dataset, num_features=dataset.data.shape[2], num_classes=len(torch.unique(dataset.labels)), batch_size=128, hidden_size=8, num_layers=2)

epoch=0/100, loss=0.5998442769050598, accuracy=66.56640625
epoch=1/100, loss=0.6362658739089966, accuracy=67.23639678955078
epoch=2/100, loss=0.6862271428108215, accuracy=67.49974060058594
epoch=3/100, loss=0.4076294004917145, accuracy=68.09356689453125
epoch=4/100, loss=0.6180543303489685, accuracy=68.52344512939453
epoch=5/100, loss=0.6069925427436829, accuracy=67.83409118652344
epoch=6/100, loss=0.7957097887992859, accuracy=67.7876205444336
epoch=7/100, loss=0.6977307796478271, accuracy=68.03934478759766
epoch=8/100, loss=0.6789999604225159, accuracy=67.82376098632812
epoch=9/100, loss=0.720350444316864, accuracy=69.2347412109375
epoch=10/100, loss=0.5439063906669617, accuracy=68.6628646850586
epoch=11/100, loss=0.5567768216133118, accuracy=67.9851303100586
epoch=12/100, loss=0.6033696532249451, accuracy=68.28720092773438
epoch=13/100, loss=0.5770800709724426, accuracy=68.44340515136719
epoch=14/100, loss=0.6622921824455261, accuracy=65.43555450439453
epoch=15/100, loss=0.6147159934

KeyboardInterrupt: 