In [2]:
## classic pydata stack
%load_ext autoreload
%autoreload 2

import os 
import numpy as np
from tqdm import tqdm
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
import sys
%matplotlib inline 

from NN import *

# Notebook-wide plotting defaults: ggplot look and wide 15x7 figures.
plt.style.use('ggplot')
plt.rcParams.update({'figure.figsize': (15, 7)})



## torch
import torch
from torch.utils.data import Dataset
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

## SEEDING
# One named seed feeds every RNG this notebook has imported, so that a
# Restart & Run All reproduces the random train/test split and the training run.
SEED = 1
torch.manual_seed(SEED)
np.random.seed(SEED)

# NOTE(review): REBUILD_DATA is not read by any cell visible here -- presumably
# consumed elsewhere (e.g. by the NN module's data pipeline); confirm or remove.
REBUILD_DATA = True

In [2]:
from torch.utils.data import random_split

# Number of blocks handed to the dataset builder and, later, to LSTM.train.
num_blocks = 5

# Build the dataset from the two .npy recordings.
dataset = PolymerDataset(
    data_paths=["../data/AA66266AA.npy", "../data/AA662266AA.npy"],
    num_blocks=num_blocks,
    lstm=True,
)
# Second dimension of the first sample's array is used as the feature count.
num_features = dataset.data[0].shape[1]

# 80 % of the samples go to training; whatever is left becomes the test split.
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_data, test_data = random_split(dataset, [train_size, test_size])


In [3]:
# Baseline run on the training split with hand-picked hyper-parameters.
model = LSTM.train(
    dataset=train_data,
    num_features=num_features,
    num_blocks=num_blocks,
    hidden_dim=4,
    num_epochs=300,
    batch_size=64,
    lr=0.002,
)

epoch=0/299, loss=0.5182722806930542, accuracy=76.92717742919922
epoch=50/299, loss=0.26967522501945496, accuracy=90.32682037353516
epoch=100/299, loss=0.21589510142803192, accuracy=90.96980285644531
epoch=150/299, loss=0.17336080968379974, accuracy=91.42095947265625
epoch=200/299, loss=0.1254599392414093, accuracy=91.66607666015625
epoch=250/299, loss=0.21873962879180908, accuracy=91.82948303222656
epoch=299/299, loss=0.2792649269104004, accuracy=92.06749725341797


In [3]:
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

In [4]:
def test_model(model, data_loader, show_confusion=True):
    """Evaluate a classifier on every batch yielded by ``data_loader``.

    Parameters
    ----------
    model : callable
        Called as ``model(X)``; must return a tensor of per-class scores with
        the classes along dimension 1 (the predicted class is the argmax).
    data_loader : iterable
        Yields ``(X, y)`` batches, ``y`` holding the true labels.
    show_confusion : bool, default True
        Print the confusion matrix before returning. In the original code the
        print statement was placed after ``return`` and therefore never ran.

    Returns
    -------
    tuple
        ``(accuracy, f1)`` as computed by scikit-learn's ``accuracy_score`` and
        ``f1_score`` (called with their default settings).
    """
    # NOTE(review): the model is not switched to eval mode here; if it contains
    # dropout / batch-norm layers, confirm the caller has done so.
    pred_chunks = []
    label_chunks = []

    with torch.no_grad():
        for X, y in data_loader:
            probs = model(X)
            preds = torch.argmax(probs, dim=1, keepdim=False)
            # Move to host memory explicitly so device tensors work too.
            pred_chunks.append(preds.cpu().numpy())
            label_chunks.append(torch.as_tensor(y).cpu().numpy())

    # Concatenate once at the end instead of re-allocating on every batch;
    # axis=None flattens, matching the original accumulation.
    predictions = np.concatenate(pred_chunks, axis=None) if pred_chunks else np.array([])
    labels = np.concatenate(label_chunks, axis=None) if label_chunks else np.array([])

    if show_confusion:
        print(confusion_matrix(labels, predictions))

    return (accuracy_score(labels, predictions), f1_score(labels, predictions))


In [5]:
import itertools as it

In [6]:
# Hyper-parameter grid. itertools.product varies the LAST key fastest, so the
# dataset-level keys are listed first: they then change least often and the
# dataset has to be rebuilt only a handful of times during the search.
params_list = {
    # --- feature engineering (dataset-level) ---
    "auto_corr": [True, False],
    "num_blocks": [5, 6, 7],
    # --- optimizer and model ---
    "lr": [0.005, 0.001, 0.0005],
    "hidden_dim": [4, 5, 6],
    "num_epochs": [10, 50, 80],
}

# Cartesian product of all value lists, in key order; one tuple per run.
param_combinations = list(it.product(*params_list.values()))
print(len(param_combinations))

162


In [7]:
def grid_search(params_list):
    """Exhaustively train and evaluate one LSTM per hyper-parameter combination.

    Parameters
    ----------
    params_list : dict
        Maps a parameter name to the list of values to try. ``"auto_corr"`` and
        ``"num_blocks"`` configure the dataset; every key except ``"auto_corr"``
        is forwarded to ``LSTM.train`` as a keyword argument. Put dataset-level
        keys first so that they change least often and the dataset is rebuilt
        as rarely as possible.

    Returns
    -------
    tuple
        ``(best_model, best_params)`` for the combination with the highest
        accuracy on the held-out split. ``best_params`` contains every grid key
        (including ``"auto_corr"``). Both are ``None`` if no run scored above 0.

    Side effects
    ------------
    Prints each combination before training it, and rewrites
    ``best_params.txt`` every time a new best accuracy is reached.
    """
    # Imported locally so the function does not depend on names leaking in
    # from other cells or from a star import.
    from torch.utils.data import DataLoader, random_split

    param_names = list(params_list)

    best_accuracy = 0
    # Initialised up front so the final report cannot hit an unbound name.
    best_f1 = None
    best_model = None
    best_params = None
    # (num_blocks, auto_corr) of the dataset currently in memory.
    last_dataset_key = None

    for values in it.product(*(params_list[name] for name in param_names)):
        params = dict(zip(param_names, values))
        # auto_corr only configures the dataset; LSTM.train must not receive it.
        train_kwargs = {key: value for key, value in params.items() if key != "auto_corr"}

        num_blocks = params.get("num_blocks", 1)
        # Only pass auto_corr when the grid defines it, otherwise rely on the
        # dataset's own default.
        dataset_kwargs = {"auto_corr": params["auto_corr"]} if "auto_corr" in params else {}

        # Rebuild whenever ANY dataset-level setting changed, not just num_blocks.
        dataset_key = (num_blocks, params.get("auto_corr"))
        if dataset_key != last_dataset_key:
            dataset = PolymerDataset(
                data_paths=["../data/AA66266AA.npy", "../data/AA662266AA.npy"],
                num_blocks=num_blocks,
                lstm=True,
                **dataset_kwargs,
            )
            # Derive the feature count from the dataset actually built instead
            # of a global computed for a different configuration.
            num_features = dataset.data[0].shape[1]
            train_size = int(0.8 * len(dataset))
            test_size = len(dataset) - train_size
            train_data, test_data = random_split(dataset, [train_size, test_size])
            data_loader = DataLoader(test_data, batch_size=64, shuffle=False)
            last_dataset_key = dataset_key

        print(params)
        model = LSTM.train(dataset=train_data, num_features=num_features, batch_size=64, **train_kwargs)
        # NOTE(review): the same held-out split is used to pick the winner, so
        # the reported best accuracy is optimistic; consider a validation split.
        test_accuracy, test_f1 = test_model(model, data_loader)

        if test_accuracy > best_accuracy:
            best_accuracy = test_accuracy
            best_f1 = test_f1
            best_model = model
            best_params = params
            # Persist immediately so an interrupted search still leaves a result.
            with open('best_params.txt', 'w') as f:
                f.write("Best accuracy ({}) and f1 ({}) were reached with params {}".format(best_accuracy, best_f1, best_params))

    print("Best accuracy ({}) and f1 ({}) were reached with params {}".format(best_accuracy, best_f1, best_params))
    return best_model, best_params


In [8]:
# Bind the winning model and its settings to names so that the result of the
# (long) search stays available to later cells instead of only being echoed.
best_model, best_params = grid_search(params_list)
best_params