In [2]:
## classic pydata stack
%load_ext autoreload
%autoreload 2

import numpy as np
from tqdm import tqdm
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
%matplotlib inline 

from NN_LSTM import *

plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (15,7)

from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
import itertools as it
from datetime import datetime

## torch 
from torch.utils.data import Dataset
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import random_split

## SEEDING

torch.manual_seed(1)


REBUILD_DATA = True

In [3]:
def test_model(model, data_loader):
    predictions = np.array([])
    labels = np.array([])

    with torch.no_grad():
        for X, y in iter(data_loader):
            probs = model(X)
            preds = torch.argmax(probs, dim=1, keepdim=False)
            predictions = np.concatenate((predictions,preds), axis=None)
            labels= np.concatenate((labels,y),axis=None)

    return(accuracy_score(labels,predictions), f1_score(labels,predictions))

    print(confusion_matrix(labels,predictions))


In [6]:


###
# Feature engineering param
dataset_params_list={
    "psd":[True,False],
    "auto_corr":[False],
    "num_blocks":[3,5],
}


####
# Optimizer and model params
model_params_list={

"nn_specs": #Make sure that there are as much linear layers dims than activation function
    [
    ([12,2],[nn.Tanh(),nn.Tanh()]),
    ([10 for i in range(10)],[nn.LeakyReLU() for i in range(10)]),
    ([8,8],[nn.Tanh(),nn.Tanh()]), 
    ([2],[nn.Sigmoid()]),
    ([10,10,2],[nn.Tanh(),nn.LeakyReLU(),nn.LeakyReLU()])
    ],


"hidden_dim":[8,4],
"lr":[0.001],
"rstm_layers":[1,5],
"num_epochs":[350],

}

dataset_param_combinations = list(it.product(*(dataset_params_list[param_name] for param_name in dataset_params_list.keys())))

model_param_combinations = list(it.product(*(model_params_list[param_name] for param_name in model_params_list.keys())))

print(len(model_param_combinations)*len(dataset_param_combinations))

80


In [7]:
def grid_search(dataset_params_list, model_params_list):

    dataset_param_combinations = list(it.product(*(dataset_params_list[param_name] for param_name in dataset_params_list.keys())))

    model_param_combinations = list(it.product(*(model_params_list[param_name] for param_name in model_params_list.keys())))

    best_accuracy=0

    with open("cv_logs_{}.txt".format(datetime.now()), "a") as f_logs:
        ds_kwargs={}
        for ds_params in dataset_param_combinations:
            for i,key in enumerate(dataset_params_list):
                ds_kwargs[key]=ds_params[i]

            dataset = PolymerDataset(data_paths=["../data/AA66266AA.npy","../data/AA662266AA.npy"],lstm=True, **ds_kwargs)
            train_size = int(0.8 * len(dataset))
            test_size = len(dataset) - train_size
            train_data, test_data = random_split(dataset, [train_size, test_size])
            data_loader = DataLoader(test_data, batch_size=64, shuffle=False)
            num_features = dataset.data[0].shape[1]
            
            for model_params in model_param_combinations:
                model_kwargs={}
                for i,key in enumerate(model_params_list):
                    model_kwargs[key]=model_params[i]

                print(ds_kwargs | model_kwargs)

                model = LSTM.train(dataset=train_data, num_features=num_features, batch_size=64, **model_kwargs)
                test_accuracy, test_f1 = test_model(model, data_loader)

                f_logs.write("Accuracy = {} | F1 = {} with params : {} \n".format(test_accuracy,test_f1, ds_kwargs| model_kwargs))
                f_logs.flush()

                if test_accuracy>best_accuracy:
                    best_accuracy = test_accuracy
                    best_f1=test_f1
                    best_model=model
                    best_params=ds_kwargs | model_kwargs

                    with open('best_params.txt', 'w') as f:
                        f.write("Best accuracy ({}) and f1 ({}) were reached with params {} \n".format(best_accuracy,best_f1, best_params))  
                        #for param in best_model.parameters():
                        #    f.write(param.data) 
                        f.close()

        print("Best accuracy ({}) and f1 ({}) were reached with params {}".format(best_accuracy,best_f1, best_params))
        f_logs.close()
        return best_model, best_params


In [8]:
best_model, best_params= grid_search(dataset_params_list, model_params_list)

{'psd': True, 'auto_corr': False, 'num_blocks': 3, 'nn_specs': ([12, 2], [Tanh(), Tanh()]), 'hidden_dim': 8, 'lr': 0.001, 'rstm_layers': 1, 'num_epochs': 350}
epoch=0/349, loss=0.7099433541297913, accuracy=57.07992935180664
epoch=50/349, loss=0.502008855342865, accuracy=84.13143920898438
epoch=100/349, loss=0.46110591292381287, accuracy=85.03730010986328
epoch=150/349, loss=0.2906324863433838, accuracy=85.62344360351562
epoch=200/349, loss=0.3311106264591217, accuracy=86.17761993408203
epoch=250/349, loss=0.2917272448539734, accuracy=85.95026397705078
epoch=300/349, loss=0.28054431080818176, accuracy=86.51509857177734
epoch=349/349, loss=0.18826869130134583, accuracy=86.9129638671875
{'psd': True, 'auto_corr': False, 'num_blocks': 3, 'nn_specs': ([12, 2], [Tanh(), Tanh()]), 'hidden_dim': 8, 'lr': 0.001, 'rstm_layers': 5, 'num_epochs': 350}
epoch=0/349, loss=0.6888331770896912, accuracy=50.710479736328125
epoch=50/349, loss=0.27603980898857117, accuracy=81.35701751708984
epoch=100/349, 

In [9]:
for dim, af in  list(map(list, zip(*params_list["nn_specs"][2]))):
    print(dim, af)

NameError: name 'params_list' is not defined

In [None]:
af=[]
fc=[]
for dims, a in   list(map(list, zip(*params_list["nn_specs"][2]))):
    print(a)
    print(dims)
    af.append(a)
    fc.append(nn.LazyLinear(dims))
list(zip(fc,af))

Tanh()
12
Tanh()
2


[(LazyLinear(in_features=0, out_features=12, bias=True), Tanh()),
 (LazyLinear(in_features=0, out_features=2, bias=True), Tanh())]

In [None]:
m = nn.ModuleList()

In [None]:
m.append(nn.Linear(3,2)).append(nn.ReLU())

ModuleList(
  (0): Linear(in_features=3, out_features=2, bias=True)
  (1): ReLU()
)