In [29]:
%load_ext autoreload
%autoreload 2


## classic pydata stack
import os 
import numpy as np
from tqdm import tqdm
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf

%matplotlib inline 

# Global plot appearance: ggplot theme, with a wide default canvas so every
# figure in the notebook shares the same size.
plt.style.use('ggplot')
plt.rcParams.update({'figure.figsize': (15, 7)})



## torch
import torch
from torch.utils.data import Dataset
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

## SEEDING
# Seed the global RNGs up front so that everything executed after this cell
# (dataset construction included) is reproducible under Restart & Run All.
# The value matches the torch seed already used before the train/test split.
SEED = 12
np.random.seed(SEED)
torch.manual_seed(SEED)

# Configuration flag kept from the original notebook; it is not read anywhere
# in the cells visible here.
REBUILD_DATA = True

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [30]:
import sys
import vanilla_NN as NN

## Report notes

- Need to filter out signals that are too long, because the length distribution of the two datasets is very important.
- Need to balance the datasets, because one dataset is much bigger than the other.
- Need to process the data so that an RNN (or any other NN) can consume it.
- TODO: add further notes.

In [32]:
# Block count is shared between the dataset and the model built later on.
num_blocks = 3

# Build the dataset from the two recordings in its LSTM layout.
dataset = NN.PolymerDataset(
    data_paths=["AA00400AA.npy", "AA66466AA.npy"],
    num_blocks=num_blocks,
    lstm=True,
)

# Feature count is taken from axis 1 of the first stored sample
# (assumes every sample shares that width -- TODO confirm).
num_features = dataset.data[0].shape[1]



In [33]:
from torch.utils.data import random_split

# Reseed torch immediately before splitting so the 80/20 partition (and any
# torch randomness that follows) is the same on every run.
torch.manual_seed(12)

# 80% of the samples go to training; the remainder is held out for testing.
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_data, test_data = random_split(dataset, lengths=[train_size, test_size])


In [34]:
# Fit the LSTM classifier on the training split only; the held-out split is
# evaluated in a later cell. One keyword per line keeps run-to-run
# hyper-parameter changes easy to diff.
model = NN.LSTM.train(
    dataset=train_data,
    num_features=num_features,
    num_blocks=num_blocks,
    hidden_dim=12,
    num_epochs=50,
    batch_size=64,
    lr=1e-3,
    weight_decay=1e-4,
    verbose='vv',
)

epoch=0/49, loss=0.478305846452713, accuracy=81.67174530029297
epoch=1/49, loss=0.019303172826766968, accuracy=89.6200180053711
epoch=2/49, loss=0.07077828794717789, accuracy=90.55562591552734
epoch=3/49, loss=0.06077583506703377, accuracy=91.00578308105469
epoch=4/49, loss=0.015926342457532883, accuracy=91.2838134765625
epoch=5/49, loss=0.5224441885948181, accuracy=91.54861450195312
epoch=6/49, loss=0.001695299637503922, accuracy=91.80458068847656
epoch=7/49, loss=0.4605109393596649, accuracy=92.04290008544922
epoch=8/49, loss=0.0007409860263578594, accuracy=92.15763854980469
epoch=9/49, loss=0.3232841491699219, accuracy=92.27238464355469
epoch=10/49, loss=0.16323672235012054, accuracy=92.47098541259766
epoch=11/49, loss=0.16561339795589447, accuracy=92.59014129638672
epoch=12/49, loss=0.3311363160610199, accuracy=92.6695785522461
epoch=13/49, loss=0.3978916108608246, accuracy=92.79315185546875
epoch=14/49, loss=0.013608536683022976, accuracy=92.8019790649414
epoch=15/49, loss=0.08564

In [35]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score,confusion_matrix

from torch.utils.data import DataLoader

# Evaluate on the held-out split in its stored order (no shuffling needed).
data_loader = DataLoader(test_data, batch_size=64, shuffle=False)

# Collect per-batch results in lists and join them once after the loop.
# Growing a NumPy array with np.concatenate on every batch re-copies all
# previous results each time and silently casts the class indices to float64.
batch_predictions = []
batch_labels = []

# NOTE(review): the model is not switched to eval mode here. NN.LSTM.train has
# a custom signature, so whether model.eval() is safe depends on vanilla_NN --
# confirm before adding it.
with torch.no_grad():
    for X, y in data_loader:
        probs = model(X)
        # Predicted class = index of the largest output along dim 1.
        preds = torch.argmax(probs, dim=1)
        batch_predictions.append(preds.cpu().numpy())
        batch_labels.append(np.asarray(y.cpu() if torch.is_tensor(y) else y))

# axis=None flattens every batch, so labels shaped (batch,) or (batch, 1)
# both end up as one flat vector aligned with the predictions.
predictions = np.concatenate(batch_predictions, axis=None) if batch_predictions else np.array([])
labels = np.concatenate(batch_labels, axis=None) if batch_labels else np.array([])

names =["Accuracy", "F1 Score", "Precision", "Recall"]
functions = [accuracy_score, f1_score, precision_score, recall_score]

# Compute every metric exactly once and reuse the values for both the named
# variables and the printed report.
scores = {name: func(labels, predictions) for name, func in zip(names, functions)}
accuracy, f1, precision, recall = (scores[name] for name in names)

for name, score in scores.items():
    print(f"{name}: {score*100:.2f}%")




Accuracy: 93.19%
F1 Score: 93.33%
Precision: 92.28%
Recall: 94.41%


List of runs:

With SGD:

model = NN.LSTM.train(dataset=train_data, num_features=num_features, num_blocks=num_blocks, hidden_dim=4, num_epochs=100, batch_size=64, lr=0.05, verbose='vv') 

92.62%

model = NN.LSTM.train(dataset=train_data, num_features=num_features, num_blocks=num_blocks, hidden_dim=6, num_epochs=100, batch_size=64, lr=0.05, verbose='vv') 

93.318% (increasing hidden_dim from 4 to 6)


With Adam:

model = NN.LSTM.train(dataset=train_data, num_features=num_features, num_blocks=num_blocks, hidden_dim=4, num_epochs=100, batch_size=64, lr=0.05, verbose='vv') 

- Accuracy: 91.81%
- F1 Score: 91.82%
- Precision: 91.30%
- Recall: 92.34%

model = NN.LSTM.train(dataset=train_data, num_features=num_features, num_blocks=num_blocks, hidden_dim=9, num_epochs=100, batch_size=64, lr=0.001, verbose='vv')

(changed hidden_dim from 4 to 6 — NOTE: the command above shows hidden_dim=9; confirm which value produced these metrics)


- Accuracy: 92.23%
- F1 Score: 92.35%
- Precision: 90.53%
- Recall: 94.25%


model = NN.LSTM.train(dataset=train_data, num_features=num_features, num_blocks=num_blocks, hidden_dim=6, num_epochs=100, batch_size=64, lr=0.001, verbose='vv')

(changed hidden_dim from 4 to 9 — NOTE: the command above shows hidden_dim=6; confirm which value produced these metrics)


- Accuracy: 92.48%
- F1 Score: 92.56%
- Precision: 91.22%
- Recall: 93.93%


model = NN.LSTM.train(dataset=train_data, num_features=num_features, num_blocks=num_blocks, hidden_dim=12, num_epochs=300, batch_size=64, lr=0.001, verbose='vv')

(changed hidden_dim from 4 to 12 and num_epochs from 100 to 300)

- Accuracy: 92.16%
- F1 Score: 92.21%
- Precision: 91.25%
- Recall: 93.19%


