In [2]:
%load_ext autoreload
%autoreload 2


## classic pydata stack
import os 
import numpy as np
from tqdm import tqdm
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf

%matplotlib inline 

# Notebook-wide plot defaults: ggplot theme and a 15x7 default figure size.
plt.style.use('ggplot')
plt.rc('figure', figsize=(15, 7))



## torch
import torch
from torch.utils.data import Dataset
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

## SEEDING
import random

# Seed every RNG the notebook may touch so that Restart & Run All is repeatable.
# The train/test split cell re-seeds torch with this same value (12) right
# before splitting, so the split itself is unchanged by seeding here.
SEED = 12
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# NOTE(review): not read by any cell visible here; presumably toggles
# regenerating the preprocessed data -- confirm before relying on it.
REBUILD_DATA = True

In [8]:
import sys
import vanilla_NN as NN

## Report notes

- Need to filter out signals that are too long, because the length distribution of the two datasets is very important.
- Need to balance the datasets, because one dataset is much bigger than the other.
- Need to process the data so that an RNN (or any other NN) can consume it.

In [9]:
# Build one PolymerDataset from the two .npy files.
num_blocks = 10
polymer_files = ["AA66466AA.npy", "AA66266AA.npy"]
dataset = NN.PolymerDataset(
    data_paths=polymer_files,
    num_blocks=num_blocks,
    lstm=True,  # NOTE(review): presumably selects the layout the LSTM expects -- confirm in vanilla_NN
)

# The feature count is read from the second axis of the first stored sample.
first_sample = dataset.data[0]
num_features = first_sample.shape[1]


In [10]:
from torch.utils.data import random_split

# Fix torch's global RNG so the random 80/20 split below is repeatable.
torch.manual_seed(12)

n_samples = len(dataset)
train_size = int(0.8 * n_samples)
# The test split takes the remainder, so both sizes always sum to len(dataset).
test_size = n_samples - train_size
train_data, test_data = random_split(dataset, lengths=[train_size, test_size])


In [11]:
# Hyper-parameters for this run, grouped so they are easy to compare with the
# list of earlier runs at the end of the notebook.
lstm_hparams = dict(
    hidden_dim=12,
    num_epochs=50,
    batch_size=64,
    lr=0.001,
    weight_decay=0.0001,
)

# Train on the training split only; the test split is held back for evaluation.
model = NN.LSTM.train(
    dataset=train_data,
    num_features=num_features,
    num_blocks=num_blocks,
    verbose='vv',
    **lstm_hparams,
)

epoch=0/49, loss=0.31304627656936646, accuracy=73.79396057128906
epoch=1/49, loss=0.3067774474620819, accuracy=81.02664184570312
epoch=2/49, loss=0.3299972414970398, accuracy=81.7904052734375
epoch=3/49, loss=0.26985618472099304, accuracy=82.62877655029297
epoch=4/49, loss=0.27726274728775024, accuracy=82.92717742919922
epoch=5/49, loss=0.28162452578544617, accuracy=83.22557830810547
epoch=6/49, loss=0.33179762959480286, accuracy=83.24689483642578
epoch=7/49, loss=0.32970964908599854, accuracy=83.69449615478516
epoch=8/49, loss=0.21187373995780945, accuracy=83.75843811035156
epoch=9/49, loss=0.2885851562023163, accuracy=83.83303833007812
epoch=10/49, loss=0.41887956857681274, accuracy=84.08525848388672
epoch=11/49, loss=0.31746137142181396, accuracy=84.03197479248047
epoch=12/49, loss=0.2851174473762512, accuracy=84.28419494628906
epoch=13/49, loss=0.2714103162288666, accuracy=84.22024536132812
epoch=14/49, loss=0.3374994099140167, accuracy=84.42273712158203
epoch=15/49, loss=0.2538404

In [12]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score,confusion_matrix

from torch.utils.data import DataLoader

# Score the trained model on the held-out split, batch by batch, in a fixed order.
# NOTE(review): the model is not switched to inference mode here; if the network
# contains dropout/batch-norm layers, confirm that NN.LSTM.train returns it in
# eval mode.
data_loader = DataLoader(test_data, batch_size=64, shuffle=False)

# Gather per-batch results in lists and join them once at the end; calling
# np.concatenate inside the loop re-copies everything collected so far.
pred_batches = []
label_batches = []

with torch.no_grad():  # inference only: no gradient bookkeeping needed
    for X, y in data_loader:
        probs = model(X)
        # Predicted class = index of the largest score along dim 1.
        preds = torch.argmax(probs, dim=1, keepdim=False)
        pred_batches.append(preds.cpu().numpy())
        label_batches.append(np.asarray(y))

# axis=None flattens every batch, giving one 1-D array per quantity.
predictions = np.concatenate(pred_batches, axis=None)
labels = np.concatenate(label_batches, axis=None)

names =["Accuracy", "F1 Score", "Precision", "Recall"]
functions = [accuracy_score, f1_score, precision_score, recall_score]

# Compute each metric exactly once and reuse the value for both the named
# variables and the printed report.
scores = {name: func(labels, predictions) for name, func in zip(names, functions)}
accuracy, f1, precision, recall = (scores[name] for name in names)

for name, score in scores.items():
    print(f"{name}: {score*100:.2f}%")




Accuracy: 84.06%
F1 Score: 84.19%
Precision: 83.88%
Recall: 84.50%


List of runs:

With SGD:

model = NN.LSTM.train(dataset=train_data, num_features=num_features, num_blocks=num_blocks, hidden_dim=4, num_epochs=100, batch_size=64, lr=0.05, verbose='vv') 

92.62%

model = NN.LSTM.train(dataset=train_data, num_features=num_features, num_blocks=num_blocks, hidden_dim=6, num_epochs=100, batch_size=64, lr=0.05, verbose='vv') 

93.318% (increasing hidden_dim from 4 to 6)


With Adam:

model = NN.LSTM.train(dataset=train_data, num_features=num_features, num_blocks=num_blocks, hidden_dim=4, num_epochs=100, batch_size=64, lr=0.05, verbose='vv') 

- Accuracy: 91.81%
- F1 Score: 91.82%
- Precision: 91.30%
- Recall: 92.34%

model = NN.LSTM.train(dataset=train_data, num_features=num_features, num_blocks=num_blocks, hidden_dim=9, num_epochs=100, batch_size=64, lr=0.001, verbose='vv')

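(changed hidden_dim from 4 to 6 and lr from 0.05 to 0.001; NOTE: the command above shows hidden_dim=9, so confirm which value produced these metrics)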


- Accuracy: 92.23%
- F1 Score: 92.35%
- Precision: 90.53%
- Recall: 94.25%


model = NN.LSTM.train(dataset=train_data, num_features=num_features, num_blocks=num_blocks, hidden_dim=6, num_epochs=100, batch_size=64, lr=0.001, verbose='vv')

(changed hidden_dim from 4 to 9; NOTE: the command above shows hidden_dim=6, so confirm which value produced these metrics)


- Accuracy: 92.48%
- F1 Score: 92.56%
- Precision: 91.22%
- Recall: 93.93%


model = NN.LSTM.train(dataset=train_data, num_features=num_features, num_blocks=num_blocks, hidden_dim=12, num_epochs=300, batch_size=64, lr=0.001, verbose='vv')

(changed hidden_dim from 4 to 12 and num_epochs from 100 to 300)

- Accuracy: 92.16%
- F1 Score: 92.21%
- Precision: 91.25%
- Recall: 93.19%


