In [2]:
%load_ext autoreload
%autoreload 2


## classic pydata stack
import os 
import numpy as np
from tqdm import tqdm
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf

%matplotlib inline 

# Plot appearance: apply the ggplot style sheet first, then override the
# default figure size so the override is not replaced by the style sheet.
plt.style.use('ggplot')
plt.rcParams.update({'figure.figsize': (15, 7)})



## torch
import torch
from torch.utils.data import Dataset
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

## SEEDING
# One shared seed for the global RNGs of numpy and torch, set before any data
# is built, so that a Restart & Run All reproduces the same results.
# The value matches the literal passed to torch.manual_seed in the
# train/test split cell further down.
SEED = 12
np.random.seed(SEED)
torch.manual_seed(SEED)

# NOTE(review): REBUILD_DATA is not read by any of the cells visible here;
# presumably it tells a data-preparation step to regenerate cached data — confirm.
REBUILD_DATA = True

In [3]:
import sys
import vanilla_NN as NN

## Report notes

- Need to filter out signals that are too long, because the difference in signal-length distribution between the two datasets is very important.
- Need to balance the datasets, because one dataset is much bigger than the other.
- Need to process the data so that an RNN (or any other NN) can consume it.
- 

In [4]:
# Build the dataset from the two .npy files; num_blocks is reused later as a
# training argument, so it is kept as a named variable.
num_blocks = 10
data_paths = ["AA00400AA.npy", "AA66466AA.npy"]
dataset = NN.PolymerDataset(
    data_paths=data_paths,
    num_blocks=num_blocks,
    lstm=True,
)

# Size of the second axis of the first entry in dataset.data
# (presumably the per-step feature dimension — TODO confirm).
first_sample = dataset.data[0]
num_features = first_sample.shape[1]


In [5]:
from torch.utils.data import random_split

# Seed torch's global RNG immediately before splitting so the 80/20
# partition is the same on every run.
torch.manual_seed(12)

n_total = len(dataset)
train_size = int(0.8 * n_total)
test_size = n_total - train_size  # remainder, so the two sizes always sum to n_total
train_data, test_data = random_split(dataset, [train_size, test_size])


In [6]:
# Train on the training split only; hyperparameters are listed one per line
# so that changes between runs are easy to spot.
model = NN.LSTM.train(
    dataset=train_data,
    num_features=num_features,
    num_blocks=num_blocks,
    hidden_dim=12,
    num_epochs=200,
    batch_size=64,
    lr=0.001,
    weight_decay=0.0001,
    verbose='vv',
)

epoch=0/199, loss=0.1767188310623169, accuracy=83.5738525390625
epoch=1/199, loss=0.05627529323101044, accuracy=90.69244384765625
epoch=2/199, loss=0.24273580312728882, accuracy=91.9104995727539
epoch=3/199, loss=0.6765901446342468, accuracy=92.47981262207031
epoch=4/199, loss=1.4545215368270874, accuracy=92.72254180908203
epoch=5/199, loss=0.207328200340271, accuracy=92.85052490234375
epoch=6/199, loss=0.030906764790415764, accuracy=92.93437194824219
epoch=7/199, loss=0.0054834745824337006, accuracy=93.14620971679688
epoch=8/199, loss=0.03030676580965519, accuracy=92.99616241455078
epoch=9/199, loss=0.0008962685242295265, accuracy=93.13297271728516
epoch=10/199, loss=0.41894564032554626, accuracy=93.13738250732422
epoch=11/199, loss=0.16134415566921234, accuracy=93.15945434570312
epoch=12/199, loss=0.0009645950049161911, accuracy=93.19034576416016
epoch=13/199, loss=0.0013784439070150256, accuracy=93.1815185546875
epoch=14/199, loss=0.5213385224342346, accuracy=93.31391143798828
epoch

In [7]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score,confusion_matrix

from torch.utils.data import DataLoader

# Evaluate the trained model on the held-out split, in a fixed batch order.
data_loader = DataLoader(test_data, batch_size=64, shuffle=False)

# Collect per-batch results and join them once after the loop, rather than
# re-allocating a growing array on every iteration.
batch_predictions = []
batch_labels = []

# NOTE(review): the model is not switched to eval mode here. `NN.LSTM.train`
# shares its name with `nn.Module.train`, which `nn.Module.eval()` calls
# internally — confirm how `NN.LSTM` is defined before adding `model.eval()`.
with torch.no_grad():
    for X, y in data_loader:
        probs = model(X)
        # Predicted class = index of the largest score along dim 1.
        preds = torch.argmax(probs, dim=1)
        batch_predictions.append(np.asarray(preds))
        batch_labels.append(np.asarray(y))

# axis=None flattens everything into 1-D arrays; class indices keep their
# integer dtype.
predictions = np.concatenate(batch_predictions, axis=None)
labels = np.concatenate(batch_labels, axis=None)

# Compute each metric exactly once (sklearn defaults) and reuse the values
# for reporting.
accuracy = accuracy_score(labels, predictions)
f1 = f1_score(labels, predictions)
precision = precision_score(labels, predictions)
recall = recall_score(labels, predictions)

names = ["Accuracy", "F1 Score", "Precision", "Recall"]

for name, score in zip(names, (accuracy, f1, precision, recall)):
    print(f"{name}: {score*100:.2f}%")




Accuracy: 93.22%
F1 Score: 93.32%
Precision: 92.84%
Recall: 93.81%


List of runs:

With SGD:

model = NN.LSTM.train(dataset=train_data, num_features=num_features, num_blocks=num_blocks, hidden_dim=4, num_epochs=100, batch_size=64, lr=0.05, verbose='vv') 

92.62%

model = NN.LSTM.train(dataset=train_data, num_features=num_features, num_blocks=num_blocks, hidden_dim=6, num_epochs=100, batch_size=64, lr=0.05, verbose='vv') 

93.318% (increasing hidden_dim from 4 to 6)


With Adam:

model = NN.LSTM.train(dataset=train_data, num_features=num_features, num_blocks=num_blocks, hidden_dim=4, num_epochs=100, batch_size=64, lr=0.05, verbose='vv') 

- Accuracy: 91.81%
- F1 Score: 91.82%
- Precision: 91.30%
- Recall: 92.34%

model = NN.LSTM.train(dataset=train_data, num_features=num_features, num_blocks=num_blocks, hidden_dim=9, num_epochs=100, batch_size=64, lr=0.001, verbose='vv')

(changed hidden_dim from 4 to 9 and lr from 0.05 to 0.001)


- Accuracy: 92.23%
- F1 Score: 92.35%
- Precision: 90.53%
- Recall: 94.25%


model = NN.LSTM.train(dataset=train_data, num_features=num_features, num_blocks=num_blocks, hidden_dim=6, num_epochs=100, batch_size=64, lr=0.001, verbose='vv')

(changed hidden_dim from 4 to 6 and lr from 0.05 to 0.001)


- Accuracy: 92.48%
- F1 Score: 92.56%
- Precision: 91.22%
- Recall: 93.93%


model = NN.LSTM.train(dataset=train_data, num_features=num_features, num_blocks=num_blocks, hidden_dim=12, num_epochs=300, batch_size=64, lr=0.001, verbose='vv')

(changed hidden_dim from 4 to 12 and num_epochs from 100 to 300)

- Accuracy: 92.16%
- F1 Score: 92.21%
- Precision: 91.25%
- Recall: 93.19%


