# Import

In [None]:
# Load IPython's autoreload extension and switch it to mode 2, which reloads
# imported modules before each cell is executed, so that edits to project
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import glob
import os
from os.path import join, isfile, isdir
import pickle

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from models.RNN import RNN

from plot import Plotter

# Loading data

In [None]:
# Notebook configuration: choose which prediction target and problem type
# the performance tables below are built for.

pred = ['drive']
prob = ['classification']

# Tuning results are stored under output/<targets joined by '_'>/tuning
out = join('output', '_'.join(pred), 'tuning')

# Names of the test-metric columns, which depend on the problem type
_metric_names = (['MAE', 'RMSE', 'Max AE', 'R2'] if prob[0] == 'regression'
                 else ['AUC', 'F1', 'Accuracy'])
measure_labels = ['test_{}_{}'.format(pred[0], metric) for metric in _metric_names]

# Retrieve logs and performance (metrics appendix)

In [None]:
# Make one big pd dataframe of the tuned arguments and models.
# Every trained model contributes one dict of training arguments, which is
# merged with its row of the per-model-type log.csv.
# This makes the appendix of all the variable metrics.

# Model types whose tuning output is collected. Other types that the
# type_to_name mapping below knows a display name for:
#   'RandomForestClass', 'RandomForestReg', 'LinearSVMClass', 'LinearSVMReg',
#   'Constant', 'NN', 'LSTM_RNN', '1d_CNN', 'SimpleRNN_RNN'
model_types = ['GRU_RNN']

logs = []    # one log dataframe per model type that was found on disk
values = []  # one dict of training arguments per trained model

# Retrieve log for each model type
for model_type in model_types:
    model_type_dir = join(out, model_type)
    if not isdir(model_type_dir):
        print(model_type_dir, 'not found')
        continue

    # Read log and tag its rows with the model type (a merge key further down)
    log = pd.read_csv(join(model_type_dir, 'log.csv'))  # , usecols=['model'] + measure_labels)
    log['model_type'] = model_type
    logs.append(log)

    # Loop through all trained models (every entry matching 'model*')
    for model in glob.glob(join(model_type_dir, 'model*')):
        model_name = os.path.split(model)[-1]
        args_path = join(model, 'train_args.pickle')

        if isfile(args_path):
            # Get training parameters of model to change some column names
            # and to show which different hyperparameters are tuned.
            # NOTE(review): unpickling can execute arbitrary code; only load
            # train_args.pickle files written by trusted training runs.
            with open(args_path, 'rb') as args_file:
                model_args = pickle.load(args_file)

            # If recurrent, derive the model type from the recurrent layer's
            # __name__ (e.g. SimpleRNN, LSTM or GRU) instead of the directory
            if 'recurrent_layer' in model_args:
                model_args['model_type'] = model_args.pop('recurrent_layer').__name__ + '_RNN'
            else:
                model_args['model_type'] = model_type

            # Put n_filters and hidden_size in one combined column (same thing)
            if 'n_filters' in model_args:
                model_args['hidden_size/n_filters'] = model_args.pop('n_filters')
            elif 'hidden_size' in model_args:
                model_args['hidden_size/n_filters'] = model_args.pop('hidden_size')

        # Baseline model (no training args)
        else:
            model_args = {'model_type': model_type}

        model_args['model'] = model_name
        values.append(model_args)

# Concatenate all logs once instead of growing a dataframe inside the loop
performance = pd.concat(logs) if logs else pd.DataFrame([])

# Show pandas dataframe of trained models evaluation metrics and the information in the log file
df = pd.DataFrame(values)
df = df.merge(performance, on=['model', 'model_type'])
df = df.rename(columns={'model': 'model_dir'})
# Optional filters, kept for reference:
# df = df.loc[df['layers'] != 5] # Remove layer=5, only trained for some
# df = df.drop(['model'], axis='columns')
# df = df.loc[:, ((~df.isnull().all()) & (df.nunique() != 1))]

# Rename model type to a more interpretable name
type_to_name = {'RandomForestClass': 'RF', 'RandomForestReg': 'RF',
                'LinearSVMClass': 'SVM', 'LinearSVMReg': 'SVM', 'Constant': 'Constant', 'NN': 'NN',
                'GRU_RNN': 'GRU', 'LSTM_RNN': 'LSTM', '1d_CNN': 'CNN', 'SimpleRNN_RNN': 'SimpleRNN'}
df['model'] = df['model_type'].apply(lambda x: type_to_name[x])

# Display the first row of every model as a quick sanity check
df.groupby('model').first()

<b> Make the appendix </b>

In [None]:
# Build the appendix table: one row per tuned model with its hyperparameters
# and train/test metrics, formatted as strings for presentation.

# Sort the big dataframe by the first measure label (MAE/AUC) to define best model.
# .copy() makes the column assignments below act on an independent frame.
stats_all_models = df.sort_values(by=measure_labels[0])\
        .loc[:, ~df.columns.isin(['model_type', 'model_dir'])]\
        .copy()

# Reformat hyperparameters: scientific notation for L2 and epsilon, general
# format for the size-like parameters. Only columns that are present are
# touched, because not every model type has every hyperparameter.
hyperparameter_formats = {
    'L2': '{:.0e}',
    'hidden_size/n_filters': '{:g}',
    'n_estimators': '{:g}',
    'epsilon': '{:.0e}',
}
for column, fmt in hyperparameter_formats.items():
    if column in stats_all_models.columns:
        stats_all_models[column] = stats_all_models[column].map(fmt.format)

# For each measure (MAE, RMSE, etc.) shorten the test/train column names,
# e.g. 'test_<pred>_AUC' -> 'test AUC' and 'train_<pred>_AUC' -> 'train AUC'.
# The values stay numeric for now so that the sort below orders them by value.
metric_columns = []
for m in measure_labels:
    new_m = '_'.join(['train'] + m.split('_')[1:])
    test_name = 'test ' + m.split('_')[-1]
    train_name = 'train ' + new_m.split('_')[-1]
    stats_all_models = stats_all_models.rename(columns={m: test_name, new_m: train_name})
    metric_columns += [test_name, train_name]

# Drop some uninformative info (ignored when a log does not contain it)
stats_all_models = stats_all_models.drop(['epochs', 'best epoch', 'elapsed time'],
                                         axis='columns', errors='ignore')

# Sort the dataframe by model, test MAE for regression (ascending)
# Sort the dataframe by model, test AUC, test F1, test Accuracy otherwise (descending)
# This is done BEFORE the metrics are turned into strings: sorting formatted
# strings is lexicographic ('10.000' < '9.500') and mixes in '-' placeholders.
if prob[0] == 'regression':
    sort_columns, sort_ascending = ['model', 'test MAE'], True
else:
    sort_columns, sort_ascending = ['model', 'test AUC', 'test F1', 'test Accuracy'], False
stats_all_models = stats_all_models.sort_values(by=sort_columns, ascending=sort_ascending)

# Reformat precision of every metric to .3
for column in metric_columns:
    stats_all_models[column] = stats_all_models[column].map('{:.3f}'.format)

# Replace nans with -, looks nicer ('nan' is what formatting a NaN produces)
stats_all_models = stats_all_models.replace(np.nan, '-')
stats_all_models = stats_all_models.replace('nan', '-')

stats_all_models.head()

In [None]:
# Save appendix as CSV, without the 'bidirect', 'drop' and 'rdrop' columns
# (ignored when absent). The output directory is created if it does not exist.
appendix_path = 'additional_files/Additional file 6.csv'
os.makedirs(os.path.dirname(appendix_path), exist_ok=True)
stats_all_models.drop(['bidirect', 'drop', 'rdrop'], axis=1, errors='ignore')\
                .to_csv(appendix_path, index=False)