In [1]:

## classic pydata stack
import os 
import numpy as np
from tqdm import tqdm
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf

%matplotlib inline 

plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (15,7)



## torch
import torch
from torch.utils.data import Dataset
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

## SEEDING

torch.manual_seed(1)


REBUILD_DATA = True

In [2]:
import sys
sys.path.insert(0, '../lucas')

import NN

## Report notes

- need to filter out signals that are too long because the length distribution between the two datasets is very important
- need to balance datasets because one dataset is much bigger than the other
- process the data such that an RNN or any NN can process it
- 

In [3]:
num_blocks=3
dataset = NN.PolymerDataset(data_paths=["AA00400AA.npy","AA66466AA.npy"],num_blocks=num_blocks,lstm=True)
num_features = dataset.data[0].shape[1]

from torch.utils.data import random_split

train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_data, test_data = random_split(dataset, [train_size, test_size])


In [8]:
model = NN.LSTM.train(dataset=train_data, num_features=num_features, num_blocks=num_blocks, hidden_dim=6, num_epochs=100, batch_size=64, lr=0.001, verbose='vv')

epoch=0/99, loss=0.08563226461410522, accuracy=75.316650390625
epoch=1/99, loss=0.3211606442928314, accuracy=89.31991577148438
epoch=2/99, loss=0.045108065009117126, accuracy=90.6438980102539
epoch=3/99, loss=0.04653136432170868, accuracy=90.7939453125
epoch=4/99, loss=0.07093808799982071, accuracy=91.12935638427734
epoch=5/99, loss=0.07249695062637329, accuracy=91.27940368652344
epoch=6/99, loss=0.05818578228354454, accuracy=91.42945098876953
epoch=7/99, loss=0.07147309929132462, accuracy=91.6633529663086
epoch=8/99, loss=0.1706058233976364, accuracy=91.9104995727539
epoch=9/99, loss=0.09629851579666138, accuracy=92.08702850341797
epoch=10/99, loss=0.7861974835395813, accuracy=92.20177459716797
epoch=11/99, loss=0.0058463639579713345, accuracy=92.19735717773438
epoch=12/99, loss=0.07166001200675964, accuracy=92.32534790039062
epoch=13/99, loss=0.17320960760116577, accuracy=92.43126678466797
epoch=14/99, loss=0.8215713500976562, accuracy=92.43567657470703
epoch=15/99, loss=0.3009438216

In [9]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score,confusion_matrix

from torch.utils.data import DataLoader

data_loader = DataLoader(test_data, batch_size=64, shuffle=False)

predictions = np.array([])
labels = np.array([])

with torch.no_grad():
    for X, y in iter(data_loader):
        probs = model(X)
        preds = torch.argmax(probs, dim=1, keepdim=False)
        predictions = np.concatenate((predictions,preds), axis=None)
        labels= np.concatenate((labels,y),axis=None)


accuracy = accuracy_score(labels,predictions)
f1 = f1_score(labels,predictions)
precision = precision_score(labels,predictions)
recall = recall_score(labels,predictions)

names =["Accuracy", "F1 Score", "Precision", "Recall"]
functions = [accuracy_score, f1_score, precision_score, recall_score]

for name, func in zip(names,functions):
    score = func(labels,predictions)
    print(f"{name}: {score*100:.2f}%")




Accuracy: 92.23%
F1 Score: 92.35%
Precision: 90.53%
Recall: 94.25%


Lists of runs:

With SGD:

model = NN.LSTM.train(dataset=train_data, num_features=num_features, num_blocks=num_blocks, hidden_dim=4, num_epochs=100, batch_size=64, lr=0.05, verbose='vv') 

92.62%

model = NN.LSTM.train(dataset=train_data, num_features=num_features, num_blocks=num_blocks, hidden_dim=6, num_epochs=100, batch_size=64, lr=0.05, verbose='vv') 

93.318% (increasing hidden_dim from 4 to 6)


With Adam:

model = NN.LSTM.train(dataset=train_data, num_features=num_features, num_blocks=num_blocks, hidden_dim=4, num_epochs=100, batch_size=64, lr=0.05, verbose='vv') 

- Accuracy: 91.81%
- F1 Score: 91.82%
- Precision: 91.30%
- Recall: 92.34%

model = NN.LSTM.train(dataset=train_data, num_features=num_features, num_blocks=num_blocks, hidden_dim=6, num_epochs=100, batch_size=64, lr=0.001, verbose='vv')

(changed hidden_dim from 4 to 6)


- Accuracy: 92.23%
- F1 Score: 92.35%
- Precision: 90.53%
- Recall: 94.25%


