In [1]:
## classic pydata stack
%load_ext autoreload
%autoreload 2

import numpy as np
from tqdm import tqdm
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
%matplotlib inline 

from NN_LSTM import *

plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (15,7)

from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
import itertools as it
from datetime import datetime

## torch
import torch
from torch.utils.data import Dataset
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import random_split

## SEEDING

torch.manual_seed(1)


REBUILD_DATA = True

In [2]:
def test_model(model, data_loader):
    predictions = np.array([])
    labels = np.array([])

    with torch.no_grad():
        for X, y in iter(data_loader):
            probs = model(X)
            preds = torch.argmax(probs, dim=1, keepdim=False)
            predictions = np.concatenate((predictions,preds), axis=None)
            labels= np.concatenate((labels,y),axis=None)

    return(accuracy_score(labels,predictions), f1_score(labels,predictions))

    print(confusion_matrix(labels,predictions))


In [4]:


###
# Feature engineering param
dataset_params_list={
    "psd":[False],
    "auto_corr":[False],
    "num_blocks":[3,5],
}


####
# Optimizer and model params
model_params_list={

"nn_specs": #Make sure that there are as much linear layers dims than activation function
    [
    ([8,8],[nn.Tanh(),nn.Tanh()]), 
    ([2],[nn.Tanh()]),
    ([12,2],[nn.Tanh(),nn.Tanh()]),
    ([10,10,2],[nn.Tanh(),nn.LeakyReLU(),nn.LeakyReLU()])
    ],


"hidden_dim":[1,8],
"lr":[0.001],
"rstm_layers":[1,2],
"num_epochs":[1,200],

}

dataset_param_combinations = list(it.product(*(dataset_params_list[param_name] for param_name in dataset_params_list.keys())))

model_param_combinations = list(it.product(*(model_params_list[param_name] for param_name in model_params_list.keys())))

print(len(model_param_combinations)*len(dataset_param_combinations))

80


In [5]:
def grid_search(dataset_params_list, model_params_list):

    dataset_param_combinations = list(it.product(*(dataset_params_list[param_name] for param_name in dataset_params_list.keys())))

    model_param_combinations = list(it.product(*(model_params_list[param_name] for param_name in model_params_list.keys())))

    best_accuracy=0

    with open("cv_logs_{}.txt".format(datetime.now()), "a") as f_logs:
        ds_kwargs={}
        for ds_params in dataset_param_combinations:
            for i,key in enumerate(dataset_params_list):
                ds_kwargs[key]=ds_params[i]

            dataset = PolymerDataset(data_paths=["../data/AA66266AA.npy","../data/AA662266AA.npy"],lstm=True, **ds_kwargs)
            train_size = int(0.8 * len(dataset))
            test_size = len(dataset) - train_size
            train_data, test_data = random_split(dataset, [train_size, test_size])
            data_loader = DataLoader(test_data, batch_size=64, shuffle=False)
            num_features = dataset.data[0].shape[1]
            
            for model_params in model_param_combinations:
                model_kwargs={}
                for i,key in enumerate(model_params_list):
                    model_kwargs[key]=model_params[i]

                print(ds_kwargs | model_kwargs)

                model = LSTM.train(dataset=train_data, num_features=num_features, batch_size=64, **model_kwargs)
                test_accuracy, test_f1 = test_model(model, data_loader)

                f_logs.write("Accuracy = {} | F1 = {} with params : {} \n".format(test_accuracy,test_f1, ds_kwargs| model_kwargs))
                f_logs.flush()

                if test_accuracy>best_accuracy:
                    best_accuracy = test_accuracy
                    best_f1=test_f1
                    best_model=model
                    best_params=ds_kwargs | model_kwargs

                    with open('best_params.txt', 'w') as f:
                        f.write("Best accuracy ({}) and f1 ({}) were reached with params {} \n".format(best_accuracy,best_f1, best_params))        
                        f.close()

        print("Best accuracy ({}) and f1 ({}) were reached with params {}".format(best_accuracy,best_f1, best_params))
        f_logs.close()
        return best_model, best_params


In [6]:
best_model, best_params= grid_search(dataset_params_list, model_params_list)

{'psd': False, 'auto_corr': False, 'num_blocks': 3, 'nn_specs': ([100, 100, 100], [Tanh(), LeakyReLU(negative_slope=0.01), LeakyReLU(negative_slope=0.01)]), 'hidden_dim': 1, 'lr': 0.001, 'rstm_layers': 1, 'num_epochs': 1}
New modules built with (1,100) Tanh()
New modules built with (100,100) LeakyReLU(negative_slope=0.01)
New modules built with (100,100) LeakyReLU(negative_slope=0.01)
[Parameter containing:
tensor([[ 0.7909,  0.9330,  0.7739,  0.3491, -0.6841,  0.5173, -0.9033, -0.3524,
         -0.6519],
        [ 0.5250,  0.9519, -0.4292, -0.0577, -0.4756, -0.5283,  0.4718, -0.9660,
         -0.1330],
        [ 0.8567,  0.8823,  0.8850, -0.7960, -0.5525,  0.0149, -0.0027, -0.6186,
          0.6106],
        [ 0.0460,  0.7217,  0.3579,  0.0361, -0.2516,  0.4631,  0.8234, -0.7105,
          0.4357]], requires_grad=True), Parameter containing:
tensor([[ 0.9843],
        [ 0.7864],
        [-0.6223],
        [ 0.0926]], requires_grad=True), Parameter containing:
tensor([-0.8449,  0.5301,

In [None]:
for dim, af in  list(map(list, zip(*params_list["nn_specs"][2]))):
    print(dim, af)

2 Tanh()


In [None]:

af=[]
fc=[]
for dims, a in   list(map(list, zip(*params_list["nn_specs"][2]))):
    print(a)
    print(dims)
    af.append(a)
    fc.append(nn.LazyLinear(dims))
list(zip(fc,af))

Tanh()
12
Tanh()
2


[(LazyLinear(in_features=0, out_features=12, bias=True), Tanh()),
 (LazyLinear(in_features=0, out_features=2, bias=True), Tanh())]

In [None]:
m = nn.ModuleList()

In [None]:
m.append(nn.Linear(3,2)).append(nn.ReLU())

ModuleList(
  (0): Linear(in_features=3, out_features=2, bias=True)
  (1): ReLU()
)