In [1]:

## classic pydata stack
import os 
import numpy as np
from tqdm import tqdm
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf

%matplotlib inline 

plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (15,7)



## torch
import torch
from torch.utils.data import Dataset
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

## SEEDING
# One seed shared by the RNGs this notebook has in scope, so that a
# "Restart Kernel -> Run All" reproduces the same numbers.
SEED = 1
np.random.seed(SEED)     # NumPy's global RNG was previously left unseeded
torch.manual_seed(SEED)  # same seed value the notebook already used for torch


# Nothing in the visible cells reads this flag.
# NOTE(review): presumably consumed by the data-building code -- confirm or remove.
REBUILD_DATA = True

In [2]:
import sys

# Put the sibling ../lucas directory on the import path so that the
# project-local `NN` module imported below can be found there
# (presumably where it lives -- confirm).
# Guarded so that re-running this cell does not keep stacking duplicate
# entries at the front of sys.path.
NN_MODULE_DIR = '../lucas'
if NN_MODULE_DIR not in sys.path:
    sys.path.insert(0, NN_MODULE_DIR)

import NN

## Report notes

- Need to filter out signals that are too long, because how signal length is distributed across the two datasets is very important.
- Need to balance the datasets, because one dataset is much bigger than the other.
- Need to process the data into a form that an RNN (or any other NN) can consume.

In [3]:
from torch.utils.data import random_split

# Build the dataset from the two .npy files.
num_blocks = 3
dataset = NN.PolymerDataset(
    data_paths=["AA00400AA.npy", "AA66466AA.npy"],
    num_blocks=num_blocks,
    lstm=True,
)
# Feature count is read from the second axis of the first sample.
num_features = dataset.data[0].shape[1]

# 80/20 train/test split.
# The split uses its own seeded generator instead of the global torch RNG:
# re-running this cell (or running cells out of order) then always yields the
# same partition, so a model trained earlier is never evaluated on samples
# that a different shuffle had placed in its training set.
TRAIN_FRACTION = 0.8
SPLIT_SEED = 1
train_size = int(TRAIN_FRACTION * len(dataset))
test_size = len(dataset) - train_size  # remainder, so the two sizes always sum to len(dataset)
train_data, test_data = random_split(
    dataset,
    [train_size, test_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)


In [12]:
# Train the LSTM on the training split with the hyper-parameters below.
# NOTE(review): verbose='vv' is presumably what enables the per-epoch
# loss/accuracy log printed under this cell -- confirm in NN.LSTM.train.
model = NN.LSTM.train(
    dataset=train_data,
    num_features=num_features,
    num_blocks=num_blocks,
    hidden_dim=12,
    num_epochs=300,
    batch_size=64,
    lr=0.001,
    verbose='vv',
)

epoch=0/299, loss=0.07545173913240433, accuracy=80.03883361816406
epoch=1/299, loss=0.22779029607772827, accuracy=90.08782196044922
epoch=2/299, loss=0.013155046850442886, accuracy=91.06315612792969
epoch=3/299, loss=0.004949873313307762, accuracy=91.44269561767578
epoch=4/299, loss=0.16400231420993805, accuracy=91.97669982910156
epoch=5/299, loss=0.4144808053970337, accuracy=92.29886627197266
epoch=6/299, loss=0.0021090880036354065, accuracy=92.30769348144531
epoch=7/299, loss=0.0050432742573320866, accuracy=92.50187683105469
epoch=8/299, loss=0.011515204794704914, accuracy=92.5548324584961
epoch=9/299, loss=0.0015961830504238605, accuracy=92.66516876220703
epoch=10/299, loss=0.15547989308834076, accuracy=92.75343322753906
epoch=11/299, loss=0.10935588926076889, accuracy=92.71371459960938
epoch=12/299, loss=0.13839741051197052, accuracy=92.73136138916016
epoch=13/299, loss=0.0012398171238601208, accuracy=92.98733520507812
epoch=14/299, loss=0.31085148453712463, accuracy=92.94761657714

In [14]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score,confusion_matrix

from torch.utils.data import DataLoader

# Evaluate the trained model on the held-out split and print the
# classification metrics as percentages.
data_loader = DataLoader(test_data, batch_size=64, shuffle=False)

# Put the network in inference mode so training-only layers (e.g. dropout)
# do not perturb the predictions. Guarded because the type returned by
# NN.LSTM.train is not visible from this notebook.
if isinstance(model, nn.Module):
    model.eval()

# Collect per-batch results in lists and concatenate once at the end;
# growing a NumPy array inside the loop re-copies it on every batch.
batch_predictions = []
batch_labels = []

with torch.no_grad():
    for X, y in data_loader:
        probs = model(X)
        # Predicted class = index of the largest output along dim 1.
        preds = torch.argmax(probs, dim=1)
        batch_predictions.append(preds.detach().cpu().numpy())
        batch_labels.append(y.detach().cpu().numpy())

# axis=None flattens everything into 1-D arrays, as the original code did.
predictions = np.concatenate(batch_predictions, axis=None) if batch_predictions else np.array([])
labels = np.concatenate(batch_labels, axis=None) if batch_labels else np.array([])

# Each metric is computed exactly once and reused for both the named
# variables and the printed report.
metric_functions = {
    "Accuracy": accuracy_score,
    "F1 Score": f1_score,
    "Precision": precision_score,
    "Recall": recall_score,
}
scores = {name: func(labels, predictions) for name, func in metric_functions.items()}

# Names kept from the original cell in case other cells read them.
accuracy = scores["Accuracy"]
f1 = scores["F1 Score"]
precision = scores["Precision"]
recall = scores["Recall"]
names = list(metric_functions)
functions = list(metric_functions.values())

for name, score in scores.items():
    print(f"{name}: {score*100:.2f}%")




Accuracy: 92.16%
F1 Score: 92.21%
Precision: 91.25%
Recall: 93.19%


List of runs:

With SGD:

model = NN.LSTM.train(dataset=train_data, num_features=num_features, num_blocks=num_blocks, hidden_dim=4, num_epochs=100, batch_size=64, lr=0.05, verbose='vv') 

92.62%

model = NN.LSTM.train(dataset=train_data, num_features=num_features, num_blocks=num_blocks, hidden_dim=6, num_epochs=100, batch_size=64, lr=0.05, verbose='vv') 

93.318% (increasing hidden_dim from 4 to 6)


With Adam:

model = NN.LSTM.train(dataset=train_data, num_features=num_features, num_blocks=num_blocks, hidden_dim=4, num_epochs=100, batch_size=64, lr=0.05, verbose='vv') 

- Accuracy: 91.81%
- F1 Score: 91.82%
- Precision: 91.30%
- Recall: 92.34%

model = NN.LSTM.train(dataset=train_data, num_features=num_features, num_blocks=num_blocks, hidden_dim=9, num_epochs=100, batch_size=64, lr=0.001, verbose='vv')

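(changed hidden_dim from 4 to 9 and lr from 0.05 to 0.001, per the command above; this note originally read "from 4 to 6" — verify which hidden_dim this run actually used)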


- Accuracy: 92.23%
- F1 Score: 92.35%
- Precision: 90.53%
- Recall: 94.25%


model = NN.LSTM.train(dataset=train_data, num_features=num_features, num_blocks=num_blocks, hidden_dim=6, num_epochs=100, batch_size=64, lr=0.001, verbose='vv')

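(changed hidden_dim from 4 to 6, per the command above; this note originally read "from 4 to 9" — verify which hidden_dim this run actually used)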


- Accuracy: 92.48%
- F1 Score: 92.56%
- Precision: 91.22%
- Recall: 93.93%


model = NN.LSTM.train(dataset=train_data, num_features=num_features, num_blocks=num_blocks, hidden_dim=12, num_epochs=300, batch_size=64, lr=0.001, verbose='vv')

(changed hidden_dim from 4 to 12 and num_epochs from 100 to 300)

- Accuracy: 92.16%
- F1 Score: 92.21%
- Precision: 91.25%
- Recall: 93.19%


