# Imports

In [None]:
import warnings
warnings.filterwarnings("ignore") # ignore messy numpy warnings

In [None]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
import pandas as pd
import csv
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import glob
import pprint
from scipy import stats
import random
from datetime import datetime
import json
from sklearn.ensemble import RandomForestRegressor
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
from keras.callbacks import EarlyStopping
from pandas import read_csv
from datetime import datetime
from math import sqrt
from numpy import concatenate
from matplotlib import pyplot
from pandas import read_csv
from pandas import DataFrame
from pandas import concat
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import RFE
import os
import json
import time
import math
import matplotlib.pyplot as plt
from sklearn.model_selection import ParameterGrid
import os.path
from keras import backend as K

# Functions

In [None]:
def pre_process_df(df):
    '''Drop bookkeeping columns and rows without a usable price.

    Steps, in order:
      1. Remove a fixed list of identifier/metadata columns (missing ones
         are ignored).
      2. Remove every remaining column whose values are all equal to zero.
      3. Convert 'price_amount' to a numeric dtype; values that cannot be
         parsed become NaN.
      4. Remove rows whose 'price_amount' is NaN or zero.

    Input:
        df: pandas DataFrame containing at least a 'price_amount' column.

    Returns:
        A new DataFrame; the frame passed in is not modified.
    '''
    unwanted_columns = ['Unnamed: 0',
                        'stash_id', 'stash_feed', 'item_category',
                        '_id',
                        'date_month', 'date_year', 'item_name',
                        'league', 'rarity',
                        'price_currency', 'price_raw', 'date_day',
                        'time_minutes']
    df_dropped = df.drop(columns=unwanted_columns, errors='ignore')

    # Keep only columns holding at least one non-zero value. The mask is
    # built from the reduced frame so it lines up with its columns, and the
    # copy guarantees the assignment below works on an independent frame.
    proc_df = df_dropped.loc[:, (df_dropped != 0).any(axis=0)].copy()

    # Prices are read as 'object', not 'float'. Assign to the single column
    # label: assigning a Series to a one-element list of labels is rejected
    # by current pandas ("Columns must be same length as key").
    proc_df['price_amount'] = pd.to_numeric(proc_df['price_amount'], errors='coerce')

    # Remove rows where price_amount is NaN or zero.
    has_price = proc_df['price_amount'].notna() & (proc_df['price_amount'] != 0)
    return proc_df[has_price]

def split_df_to_unique_item_names(df):
    '''Split a DataFrame into one cleaned DataFrame per item name.

    Item names are visited in the order returned by value_counts (most
    frequent first). For each name the matching rows are kept, columns that
    contain only zeros for that item are removed, rows with a missing
    'price_amount' are removed, and the index is reset (the old index is
    kept as an 'index' column).

    Input:
        df: pandas DataFrame with 'item_name' and 'price_amount' columns.

    Returns:
        dict mapping item name -> cleaned DataFrame.
    '''
    names_by_frequency = df['item_name'].value_counts(ascending=False).index

    frames_per_name = {}
    for name in names_by_frequency:
        subset = df.loc[df['item_name'] == name]
        if subset.empty or name in frames_per_name:
            continue

        # Drop columns that never take a non-zero value for this item.
        nonzero_columns = (subset != 0).any(axis=0)
        subset = subset.loc[:, nonzero_columns]

        # Drop rows without a price, then renumber the rows.
        subset = subset[subset['price_amount'].notna()]
        frames_per_name[name] = subset.reset_index()

    return frames_per_name

def compute_corr(df,method='spearman',filename=''):
    '''Compute the pairwise correlation matrix of the numeric columns of df.

    Zeros in the item-property columns selected by the regex below are
    treated as "property absent" and replaced by NaN before correlating.
    A pair of columns needs at least 10% of the rows to be jointly non-NaN,
    otherwise its correlation is NaN. Rows and columns of the result that
    are entirely NaN are removed.

    NOTE: the zero -> NaN replacement is written back into `df`, i.e. the
    caller's frame is modified in place.

    Input:
        df: pandas DataFrame
        method: correlation method passed to DataFrame.corr (default = 'spearman')
        filename: unused

    Returns:
        pandas DataFrame holding the correlation matrix.
    '''
    # DataFrame.corr needs an integer here (a float is rejected by the
    # pearson/spearman code paths). Rounding up keeps the cut-off of
    # "count >= 10% of rows" exactly as it was with the fractional value.
    min_periods = math.ceil(len(df) * 0.1)

    cols = list(df.filter(regex='(Attacks per Second|Energy Shield|Elemental Damage|Critical Strike Chance|Physical Damage|influence|Armour|sockets_number|linked_sockets|Evasion Rating)|(?=^co_|ex_|im_|en_$)(^.*$)').columns.values)
    df[cols] = df[cols].replace({0: np.nan})

    # Correlate numeric/boolean columns only: recent pandas versions no
    # longer skip string columns (e.g. 'item_name') silently.
    numeric_df = df.select_dtypes(include=['number', 'bool'])
    corr = numeric_df.corr(method=method, min_periods=min_periods)

    # Keyword form: positional arguments to dropna were removed in pandas 2.
    corr = corr.dropna(axis='columns', how='all')
    corr = corr.dropna(axis='index', how='all')

    return corr

def remove_outliers_IQR(item_dataframe,column_label = 'price_amount',high_quantile=0.75):
    '''Drop rows whose value in one column lies outside the IQR fences.

    The fences are (low quantile - 1.5 * IQR) and (high quantile + 1.5 * IQR),
    where the low quantile is 1 - high_quantile and IQR is the distance
    between the two quantiles. Rows whose value is NaN are kept.

    Input:
        item_dataframe: pandas DataFrame
        column_label: column to check for outliers (default = 'price_amount')
        high_quantile: upper quantile of the box (default = 0.75)

    Returns:
        pandas DataFrame without the outlier rows.
    '''
    values = item_dataframe[column_label]

    lower_quantile = values.quantile(1-high_quantile)
    upper_quantile = values.quantile(high_quantile)
    spread = upper_quantile - lower_quantile

    lower_fence = lower_quantile - 1.5 * spread
    upper_fence = upper_quantile + 1.5 * spread

    is_outlier = (values < lower_fence) | (values > upper_fence)
    return item_dataframe[~is_outlier]

def remove_outliers_zscore(item_dataframe,column_labels = ['price_amount'],threshold=3,show_results=False):
    '''Remove outlier rows from a dataframe using the z-score.

    A row is kept only when the absolute z-score of every checked column is
    strictly below `threshold`. Rows whose z-score is NaN (for example when
    a checked column contains NaN or is constant) are therefore removed too.

    Input:
        item_dataframe: pandas DataFrame
        column_labels: columns to check for outliers (default = ['price_amount'])
        threshold: z-score limit; rows at or above it are removed (default = 3)
        show_results: print the removed rows and their count (default = False);
            requires an 'item_name' column

    Returns:
        pandas DataFrame without the outlier rows.
    '''
    z_score = np.abs(np.asarray(stats.zscore(item_dataframe[column_labels])))
    if z_score.ndim == 1:
        # A single column label (not a list) yields a 1-D result.
        z_score = z_score.reshape(-1, 1)

    # Reduce to one boolean per row. Indexing the frame with the raw 2-D mask
    # would repeat a row once for every column in which it passes.
    keep_row = (z_score < threshold).all(axis=1)
    new_df = item_dataframe[keep_row]

    if show_results:
        first_label = column_labels if isinstance(column_labels, str) else column_labels[0]
        if len(item_dataframe) > 0:
            # Positional access: the frame's index need not start at 0.
            print('Data outliers for "{}":'.format(item_dataframe['item_name'].iloc[0]))
        # Report exactly the rows that were removed.
        for position in np.flatnonzero(~keep_row):
            print('index: {:<10d}{}: {:<10f}'.format(position, first_label, item_dataframe[first_label].iloc[position]))
        print('Removed {} rows'.format(item_dataframe.shape[0]-new_df.shape[0]))
    return new_df


def produce_decision_dataframe(item_df,correlations_df=None,incl_outliers=True,method='z-score',threshold=2,quantile=0.8):
    '''Append one row per price-correlated feature of an item to a summary frame.

    The item's Kendall correlation matrix is computed with compute_corr, the
    'price_amount' column of that matrix is restricted to the feature names
    matched by the regex below, and NaN correlations are dropped. Every
    remaining feature yields one row with the columns
    item_name, feature, corr_value, no_features, transactions, st_div, variance.

    Input:
        item_df: pandas DataFrame of a single item; needs 'item_name' and
            'price_amount' columns
        correlations_df: summary frame to extend; when omitted or empty a
            new frame is started
        incl_outliers: when False, outliers are removed first (default = True)
        method: outlier method, 'z-score' or 'IQR' (default = 'z-score')
        threshold: z-score threshold (default = 2)
        quantile: high quantile for the IQR method (default = 0.8)

    Returns:
        pandas DataFrame: previous rows followed by the rows of this item.

    Raises:
        ValueError: if outliers are to be removed with an unknown method.
    '''
    d_df = item_df

    if not incl_outliers:
        if method == 'z-score':
            d_df = remove_outliers_zscore(d_df,threshold=threshold)
        elif method == 'IQR':
            d_df = remove_outliers_IQR(d_df,high_quantile=quantile)
        else:
            raise ValueError('\t\tWrong outlier mode. Valid options mode = [z-score | IQR]')

    columns = ['item_name','feature','corr_value','no_features','transactions','st_div','variance']
    # The historical default was the DataFrame class itself; treat it, None
    # and an empty frame alike as "no previous results".
    no_previous = (correlations_df is None
                   or correlations_df is pd.DataFrame
                   or correlations_df.empty)

    corr = compute_corr(d_df,method='kendall')

    rows = []
    # compute_corr drops all-NaN columns, so the price column may be missing
    # when nothing could be correlated; such an item contributes no rows.
    if 'price_amount' in corr.columns:
        corr_filtered = corr['price_amount'].filter(regex='(date_day|item_category|corrupted|Attacks per Second|Energy Shield|Elemental Damage|Critical Strike Chance|Physical Damage|influence|Armour|sockets_number|linked_sockets|Quality|Evasion Rating)|(?=^co_|ex_|im_|en_$)(^.*$)').dropna()
        if len(corr_filtered) > 0:
            # Per-item values are identical for every row: compute them once.
            item_name = d_df['item_name'].unique()[0]
            transactions = d_df.groupby('item_name')['item_name'].count().values[0]
            st_div = d_df['price_amount'].std()
            # Scalar variance (previously a one-element Series ended up in the cell).
            variance = d_df['price_amount'].var()
            rows = [{'item_name': item_name,
                     'feature': feature,
                     'corr_value': corr_filtered[feature],
                     'no_features': len(corr_filtered),
                     'transactions': transactions,
                     'st_div': st_div,
                     'variance': variance} for feature in corr_filtered.index]

    new_rows = pd.DataFrame(rows, columns=columns)
    if no_previous:
        return new_rows
    if new_rows.empty:
        return correlations_df
    # DataFrame.append was removed in pandas 2; a single concat also avoids
    # re-copying the accumulated frame once per feature.
    return pd.concat([correlations_df, new_rows], ignore_index=True)

def produce_corr_based_df(df_per_item_name,method='z-score',threshold=2,quantile=0.8):
    '''Build the correlation summary for every item in a per-item mapping.

    Each item frame is passed to produce_decision_dataframe with outlier
    removal enabled, accumulating the rows in a single DataFrame. Progress
    is printed every 200 items.

    Input:
        df_per_item_name: mapping of item name -> item DataFrame
        method, threshold, quantile: outlier settings forwarded unchanged

    Returns:
        pandas DataFrame with the columns item_name, feature, corr_value,
        no_features, transactions, st_div, variance.
    '''
    summary = pd.DataFrame(columns=['item_name','feature','corr_value','no_features',
                                    'transactions','st_div','variance'])

    for processed, item_key in enumerate(df_per_item_name, start=1):
        if processed % 200 == 0:
            print("Processed {} item_names".format(processed))
        summary = produce_decision_dataframe(df_per_item_name[item_key],
                                             summary,
                                             incl_outliers=False,
                                             method=method,
                                             threshold=threshold,
                                             quantile=quantile)

    return summary

def filter_decision_df(df, days=7, min_corr=0.1, min_no_features=2, min_std=5.0):
    '''Keep only the feature rows of items worth modelling.

    A row survives when its absolute correlation is at least `min_corr`,
    its item has more than days * 24 transactions and a price standard
    deviation above `min_std`. 'no_features' is then recomputed as the
    number of surviving rows per item, and items with fewer than
    `min_no_features` surviving rows are removed.

    Input:
        df: DataFrame with 'item_name', 'corr_value', 'transactions', 'st_div'
        days, min_corr, min_no_features, min_std: thresholds described above

    Returns:
        The filtered pandas DataFrame.
    '''
    min_trx = days*24

    correlated = df['corr_value'].abs() >= min_corr
    traded = df['transactions'] > min_trx
    volatile = df['st_div'] > min_std
    kept = df[correlated & traded & volatile]

    rows_per_item = kept.groupby('item_name')['item_name'].transform('count')
    kept = kept.assign(no_features=rows_per_item)

    return kept[kept['no_features'] >= min_no_features]

def convert_column_values_string_to_rankInt(df) -> pd.DataFrame:
    '''Replace every object-dtype column of df by integer label codes.

    Each object column is encoded independently with a fresh
    sklearn LabelEncoder. The frame is modified in place and also returned.

    Input:
        df: pandas DataFrame

    Returns:
        The same DataFrame object, with object columns label-encoded.
    '''
    for column in df.columns:
        # Compare against the object dtype itself (not `type(object)`).
        if df[column].dtype == object:
            # LabelEncoder is imported directly at the top of the file;
            # there is no `preprocessing` name in scope.
            le = LabelEncoder()
            df[column] = le.fit_transform(df[column])
    return df

def flatten_column(df_column,method='median',round_base=2):
    '''Collapse a column into a single value.

    Input:
        df_column: pandas Series
        method: 'median' (returned unrounded) or 'mean' (rounded)
        round_base: number of decimals used when rounding the mean

    Returns:
        The aggregated value, or None for any other method.
    '''
    if method == 'mean':
        return round(df_column.mean(), round_base)
    if method == 'median':
        return df_column.median()
    return None
    
def interpolate_df(df,config):
    '''Collapse every column of df into one value using flatten_column.

    The aggregation of a column is config['features'][column] when that
    entry is 'median' or 'mean'; in every other case (entry missing or
    holding another value) config['default_flatten'] is used.

    Input:
        df: pandas DataFrame (e.g. the rows of one time bucket)
        config: dict with a 'features' mapping and a 'default_flatten' entry

    Returns:
        pandas Series indexed by column name holding the aggregated values.
    '''
    labels = []
    values = []
    for feature in df.columns:
        configured = config['features'][feature] if feature in config['features'] else None
        if configured in ('median', 'mean'):
            how = configured
        else:
            how = config['default_flatten']

        labels.append(feature)
        values.append(flatten_column(df[feature], method=how))
    return pd.Series(values, labels)

def fill_and_plot(df,method='default',order=3):
    '''Interpolate missing values of df and plot the filled 'price_amount'.

    Input:
        df: pandas DataFrame with a 'price_amount' column
        method: interpolation method passed to DataFrame.interpolate;
            'default' selects linear interpolation
        order: order used by the 'spline' and 'polynomial' methods

    Returns:
        The interpolated pandas DataFrame.
    '''
    # 'default' is not a pandas interpolation method (interpolate() rejects
    # it), so map it to the method pandas itself uses by default.
    interp_method = 'linear' if method == 'default' else method

    if interp_method in ['spline','polynomial']:
        df_inter = df.interpolate(method=interp_method,order=order)
    else:
        df_inter = df.interpolate(method=interp_method)
    plt.figure(figsize=(20,10))
    df_inter['price_amount'].plot()
    # The legend shows the method name exactly as the caller passed it.
    plt.legend([method])
    return df_inter


def fill_time_periods(df,method = 'pchip',order=3):
    '''Fill missing values of df by interpolation.

    Input:
        df: pandas DataFrame
        method: interpolation method passed to DataFrame.interpolate
        order: forwarded only for the 'spline' and 'polynomial' methods

    Returns:
        The interpolated pandas DataFrame.
    '''
    options = {'method': method}
    if method in ('spline', 'polynomial'):
        options['order'] = order
    return df.interpolate(**options)

def feature_selection(df, method="decision_tree",verbose=0,importance_threshold=0.15,max_no_of_features = 5):
    '''Select the features most useful for predicting 'price_amount'.

    The columns 'price_amount', 'time', 'date', 'socket_colors' and
    'time_hours' are never candidates.

    Input:
        df: pandas DataFrame with a 'price_amount' column
        method:
            'decision_tree' - fit a RandomForestRegressor on the one-hot
                encoded candidates and keep, among the max_no_of_features
                most important ones, those whose importance exceeds
                importance_threshold
            'rfe' - recursive feature elimination with a
                DecisionTreeRegressor, keeping max_no_of_features - 1 features
        verbose: print the importance / ranking of the inspected features
        importance_threshold: minimum importance ('decision_tree' only)
        max_no_of_features: size limit of the selection (see method)

    Returns:
        list of selected column names, always ending with 'price_amount'.

    Raises:
        ValueError: if method is neither 'decision_tree' nor 'rfe'.
    '''
    if method not in ('decision_tree', 'rfe'):
        # Previously an unknown method fell through and returned None.
        raise ValueError("Unknown feature selection method {!r}. "
                         "Valid options: 'decision_tree' | 'rfe'".format(method))

    important_features = []
    target = df['price_amount']
    new_df = df.drop(['price_amount','time','date','socket_colors','time_hours'],axis=1,errors='ignore')

    if method=='decision_tree':
        model = RandomForestRegressor(random_state=12,max_depth=100)
        new_df = pd.get_dummies(new_df)
        model.fit(new_df,target)
        features = new_df.columns
        importances = model.feature_importances_
        # Positions of the max_no_of_features largest importances, ascending.
        indices = np.argsort(importances)[-max_no_of_features:]
        if verbose:
            for i in indices:
                print("Feature : {:40} --->importance [{}]".format(features[i][:40],importances[i].round(3)))
        for i in indices:
            if importances[i] > importance_threshold:
                important_features.append(features[i])
    else:
        # n_features_to_select must be given by keyword: scikit-learn no
        # longer accepts it as a positional argument.
        rfe = RFE(DecisionTreeRegressor(), n_features_to_select=max_no_of_features-1)
        rfe = rfe.fit(new_df, target)
        # Rank 1 marks the selected features; ties are ordered by column name.
        sorted_ranking = sorted(zip(rfe.ranking_, new_df.columns))

        for rank, column in sorted_ranking:
            if verbose==1: print("Feature : {:40} has weight [{}]".format(column[:35],rank))
            if len(important_features) < (max_no_of_features-1):
                important_features.append(column)

    important_features.append('price_amount')
    return important_features
    
    
def plot_results(predicted_data, true_data,index,directory,file_name):
    '''Plot true values against point-by-point predictions and save the figure.

    The figure is written to
    <directory>/Point to point forecast_<file_name>.png and then shown.

    Input:
        predicted_data: predicted values, one per true value
        true_data: true values
        index: sequence of x-axis labels; only the last len(true_data)
            labels are used
        directory: output directory of the image
        file_name: suffix used in the title and in the image file name
    '''
    filename = os.path.join(directory,'Point to point forecast_{}.png'.format(file_name))
    fig = plt.figure(facecolor='white',figsize=(30,20))
    ax = fig.add_subplot(111)
    plt.title('Point to point forecast_{}'.format(file_name),fontsize=25)
    ax.plot(true_data, label='True Data')
    plt.plot(index[-len(true_data):],predicted_data, label='Prediction', linestyle='-',linewidth=3)
    indexes_len = len(index[-len(true_data):])
    # Show roughly ten ticks. The step must be at least 1: with fewer than
    # ten points the integer division yields 0 and `i % 0` would raise.
    modulo = max(1, indexes_len//10)
    ticks = [i for i in range(0,indexes_len) if i%modulo==0]
    plt.xticks(ticks=ticks,fontsize=18,rotation=45)
    ax.tick_params(direction='out',pad=15)
    plt.yticks(fontsize=18)
    plt.xlabel('time_period',fontsize=25)
    ax = plt.gca()
    ax.xaxis.set_label_coords(0.5,-0.08)
    # Legend is attached to the figure and placed in the right-hand margin
    # that subplots_adjust reserves below.
    fig.legend(loc=7,prop={'size': 25})
    fig.tight_layout()
    fig.subplots_adjust(right=0.88)
    plt.savefig(filename)
    plt.show()
    
def plot_results_full_seq(predicted_data, true_data,index,directory,file_name):
    '''Plot true values, full-sequence predictions and their per-point RMSE.

    The figure is written to
    <directory>/Full sequence forecast_<file_name>.png and then shown.

    Input:
        predicted_data: predicted values, one per true value
        true_data: true values; true_data[i] is compared with the i-th
            prediction
        index: sequence of x-axis labels; only the last len(true_data)
            labels are used
        directory: output directory of the image
        file_name: suffix used in the title and in the image file name
    '''
    # Error of every single prediction against its true value.
    rmse = []
    reshaped_predicted = np.reshape(predicted_data, (-1, 1))
    for i in range(len(predicted_data)):
        rmse.append(sqrt(mean_squared_error(reshaped_predicted[i], true_data[i])))

    filename = os.path.join(directory,'Full sequence forecast_{}.png'.format(file_name))
    fig = plt.figure(facecolor='white',figsize=(30,20))
    ax = fig.add_subplot(111)
    plt.title('Full Sequence forecast_{}'.format(file_name),fontsize=25)
    ax.plot(rmse, label='RMSE')
    ax.plot(true_data, label='True Data')
    plt.plot(index[-len(true_data):],predicted_data, label='Prediction', linestyle='-',linewidth=3)
    indexes_len = len(index[-len(true_data):])
    # Show roughly ten ticks. The step must be at least 1: with fewer than
    # ten points the integer division yields 0 and `i % 0` would raise.
    modulo = max(1, indexes_len//10)
    ticks = [i for i in range(0,indexes_len) if i%modulo==0]
    plt.xticks(ticks=ticks,fontsize=18,rotation=45)
    ax.tick_params(direction='out',pad=15)
    plt.yticks(fontsize=18)
    plt.xlabel('time_period',fontsize=25)
    ax = plt.gca()
    ax.xaxis.set_label_coords(0.5,-0.08)
    fig.legend(loc=7,prop={'size': 25})
    fig.tight_layout()
    fig.subplots_adjust(right=0.88)
    plt.savefig(filename)
    plt.show()
    
    


def plot_results_multiple(predicted_data, true_data, prediction_len,index,directory,file_name):
    '''Plot true values together with consecutive multi-step forecasts.

    The i-th forecast is drawn starting at position i * prediction_len, and
    the x-axis ticks mark the start and end label of every forecast. The
    figure is written to
    <directory>/Multistep forecasts_<file_name>.png and then shown.

    Input:
        predicted_data: list of forecasts, each a list of predicted values
        true_data: true values
        prediction_len: number of steps covered by each forecast
        index: sequence of x-axis labels; only the last len(true_data)
            labels are used
        directory: output directory of the image
        file_name: suffix used in the title and in the image file name
    '''
    filename = os.path.join(directory,'Multistep forecasts_{}.png'.format(file_name))
    fig = plt.figure(facecolor='white',figsize=(30,20))
    ax = fig.add_subplot(111)
    plt.title('Multistep forecasts_{}'.format(file_name),fontsize=25)
    labels = index[-len(true_data):]
    ax.plot(labels,true_data, label='True Data', linestyle='-',linewidth=3)
    plt.yticks(fontsize=18)
    plt.xlabel('time_period',fontsize=25)
    ax = plt.gca()
    ax.xaxis.set_label_coords(0.5,-0.08)
    ax.tick_params(direction='out',pad=15)

    last_position = len(labels) - 1
    ticks=[]
    for i, data in enumerate(predicted_data):
        # Pad the list of predictions to shift it in the graph to its correct start
        start_position = i * prediction_len
        padding = [None] * start_position
        # A forecast may end exactly one step past the last label (when the
        # forecasts tile the whole test range); clamp instead of raising.
        end_position = min(start_position + prediction_len, last_position)
        ticks.append(labels[min(start_position, last_position)])
        ticks.append(labels[end_position])
        plt.plot((padding + data), label='Prediction')
    plt.xticks(ticks=ticks,rotation=70,fontsize=18)
    fig.legend(loc=7,prop={'size': 25})
    fig.tight_layout()
    fig.subplots_adjust(right=0.88)
    plt.savefig(filename)
    plt.show()

def append_to_csv_file(filename,index,string_row,rmse,mode='a',case=1):
    '''Append one '^'-separated result row to a results file.

    The header line is written first when the file does not exist yet or
    when mode is 'w' (which truncates an existing file).

    Input:
        filename: path of the results file
        index: identifier written as the first field of the row
        string_row: remaining fields of the row, already joined with '^'
        rmse: error value written as the last field
        mode: 'a' appends the row; 'w' (re)creates the file with the header
            only and writes no row
        case: header layout; 1 has two neuron columns, 2 has three. Any
            other value writes no header line.
    '''
    header_columns = {
        1: ["index","sequence_length","train_test_split","epochs","neurons0","neurons1",
            "dropout_rate","activation_function","optimizer","learning_rate"],
        2: ["index","sequence_length","train_test_split","epochs","neurons0","neurons1","neurons2",
            "dropout_rate","activation_function","optimizer","learning_rate"],
    }

    if mode=='w' or not os.path.isfile(filename):
        with open(filename,'w') as f:
            if case in header_columns:
                f.write('^'.join(header_columns[case]) + '^rmse\n')
        if mode=='w':
            return
        # mode 'a' on a missing file: fall through so the row is written
        # after the fresh header instead of being silently dropped.

    with open(filename,'a') as f:
        f.write('{}^{}^{}\n'.format(index, string_row, rmse))

def _path_key(container, token):
    '''Translate one path segment into the key/index to use on container.'''
    if isinstance(container, (list, tuple)):
        return int(token)
    try:
        if token in container:
            return token
    except TypeError:
        pass
    # Not present as a string key: integer-looking segments address integer
    # keys, everything else is used verbatim.
    try:
        return int(token)
    except ValueError:
        return token


def change_dict_path_value(dotted_path, org,value,delim='.'):
    '''Set a value inside a nested dict/list structure addressed by a path.

    Example: change_dict_path_value('model.layers.1.neurons', cfg, 64)
    performs cfg['model']['layers'][1]['neurons'] = 64.

    Input:
        dotted_path: path segments joined by delim
        org: nested structure to modify (modified in place)
        value: value to store at the path
        delim: segment separator (default = '.')

    Returns:
        org, the same object that was passed in.
    '''
    paths, current = dotted_path.split(sep=delim), org
    for p in paths[:-1]:
        current = current[_path_key(current, p)]

    # Lists need an integer position for the final segment as well; for
    # mappings the segment is used as the key as-is.
    last = paths[-1]
    if isinstance(current, list):
        last = int(last)
    current[last] = value
    return org

def is_number(s):
    '''Return True when float(s) succeeds, False when it raises ValueError.'''
    try:
        float(s)
    except ValueError:
        return False
    else:
        return True


def get_params_abbrev(configs,case=1):
    '''Build the identifier and the '^'-separated row of a configuration.

    The values are read from configs in this order: sequence length,
    train/test split, epochs, the 'neurons' of the leading layers, the
    'rate' of the following layer, the 'activation' of the layer after
    that, optimizer and learning rate.

    Input:
        configs: configuration dict with 'data', 'training' and 'model' sections
        case: 1 reads 'neurons' from the first two layers,
              2 reads 'neurons' from the first three layers

    Returns:
        (index, string_row): index looks like 'sl10_spl0.8_ep5_n050_...',
        string_row holds the same values joined by '^'.

    Raises:
        ValueError: if case is neither 1 nor 2.
    '''
    if case == 1:
        neuron_layers = 2
    elif case == 2:
        neuron_layers = 3
    else:
        # Previously this ended in an UnboundLocalError at the return.
        raise ValueError('Unsupported case {!r}; expected 1 or 2'.format(case))

    layers = configs['model']['layers']

    # (tag, value) pairs in output order; tags are only used by the index.
    tagged_values = [('sl', configs['data']['sequence_length']),
                     ('spl', configs['data']['train_test_split']),
                     ('ep', configs['training']['epochs'])]
    tagged_values += [('n{}'.format(i), layers[i]['neurons']) for i in range(neuron_layers)]
    tagged_values += [('dr', layers[neuron_layers]['rate']),
                      ('ac', layers[neuron_layers + 1]['activation']),
                      ('opt', configs['model']['optimizer']),
                      ('lr', configs['model']['learning_rate'])]

    string_row = '^'.join('{}'.format(value) for _, value in tagged_values)
    index = '_'.join('{}{}'.format(tag, value) for tag, value in tagged_values)
    return index,string_row

# Classes

In [None]:
import math
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer


class DataLoader():
    """A class for loading and transforming data for the lstm model"""

    def __init__(self, filename, split,config,inter_config):
        """Read a '^'-delimited csv file, prepare it and split it into train/test.

        Input:
            filename: path of the csv file
            split: fraction of the prepared rows used for training
            config: model/data configuration dict (see prepare_data)
            inter_config: per-item interpolation configuration dict
        """
        dataframe = pd.read_csv(filename,delimiter='^').round(2)
        dataframe = self.prepare_data(dataframe,config,inter_config)
        dataframe = self.rearrange_yhat_to_first_column(dataframe)
        i_split = int(len(dataframe) * split)
        # Index of the prepared frame (train and test rows together).
        self.indexes = dataframe.index
        self.data_train = dataframe.values[:i_split]
        self.data_test  = dataframe.values[i_split:]
        self.len_train  = len(self.data_train)
        self.len_test   = len(self.data_test)
        self.len_train_windows = None

    def prepare_data(self,df,config,inter_config):
        """Clean, feature-select and resample the raw item data.

        Reads config['data'] ('no_of_features', 'corr_threshold',
        'outliers') and inter_config[<item name>] ('frequency',
        'fill_method', plus the entries used by interpolate_df). The item
        name is the first unique value of the 'item_name' column.

        Returns:
            DataFrame indexed by 'date_time' at the configured frequency,
            restricted to the selected features, gaps filled, rounded to 2
            decimals. Also sets self.no_features.
        """
        item_name = df['item_name'].unique()[0]
        item_inter_conf = inter_config[item_name]
        no_of_features = config['data']['no_of_features']
        corr_threshold = config['data']['corr_threshold']
        inter_frequency = item_inter_conf['frequency']

        #remove unwanted columns and rows
        df = pre_process_df(df)

        #Remove outliers
        # NOTE(review): this compares against 'z_score', while
        # produce_decision_dataframe in this file spells the method
        # 'z-score' - confirm which spelling the config files use.
        outlier_conf = config['data']['outliers']
        if outlier_conf['method']=='IQR':
            df = remove_outliers_IQR(df,high_quantile=outlier_conf['high_quantile'])
        if outlier_conf['method']=='z_score':
            df = remove_outliers_zscore(df,threshold=outlier_conf['threshold'])

        #Feature selection prep
        important_features = feature_selection(df,method='rfe',verbose=0,importance_threshold=corr_threshold,max_no_of_features=no_of_features)

        #Make a dateTime type column to interpolate with later
        d = df['date'] + '-'+df['time']
        df['date_time'] =  pd.to_datetime(d, format='%Y-%m-%d-%H-%M')
        df = df.set_index("date_time")

        #Actual feature selection
        df = df[important_features]
        self.no_features = df.shape[1]

        #Interpolation
        # Collapse every time bucket into one row, re-expand to a regular
        # grid (empty buckets become NaN rows) and fill those gaps.
        df1 = df.groupby(pd.Grouper(freq=inter_frequency,closed='left')).apply(lambda x: interpolate_df(x,item_inter_conf))
        df1 = df1.resample(inter_frequency).asfreq()
        filled_df = fill_time_periods(df1,item_inter_conf['fill_method']).round(2)


        return filled_df

    def rearrange_yhat_to_first_column(self,df,yhat_name='price_amount'):
        """Return df with the target column moved to the first position.

        Input:
            df: pandas DataFrame
            yhat_name: name of the target column (default = 'price_amount')
        """
        # Honour yhat_name (it used to be ignored in favour of a
        # hard-coded 'price_amount').
        rearranged_columns = [yhat_name] + [c for c in df.columns if c != yhat_name]
        return df[rearranged_columns]

    def get_test_data(self, seq_len, normalise):
        '''
        Create x, y test data windows
        Warning: batch method, not generative, make sure you have enough memory to
        load data, otherwise reduce size of the training split.

        Every window holds seq_len consecutive test rows; x is all but the
        last row of a window, y is column 0 of its last row.
        '''
        data_windows = []
        for i in range(self.len_test - seq_len):
            data_windows.append(self.data_test[i:i+seq_len])

        data_windows = np.array(data_windows).astype(float)
        data_windows = self.normalise_windows(data_windows, single_window=False) if normalise else data_windows

        x = data_windows[:, :-1]
        y = data_windows[:, -1, [0]]
        return x,y

    def get_train_data(self, seq_len, normalise):
        '''
        Create x, y train data windows
        Warning: batch method, not generative, make sure you have enough memory to
        load data, otherwise use generate_training_window() method.

        Returns two arrays built from one window per start position
        0 .. len_train - seq_len - 1 (see _next_window).
        '''
        data_x = []
        data_y = []
        for i in range(self.len_train - seq_len):
            x, y = self._next_window(i, seq_len, normalise)
            data_x.append(x)
            data_y.append(y)
        return np.array(data_x), np.array(data_y)

    def generate_train_batch(self, seq_len, batch_size, normalise):
        '''Yield batches (x, y) of up to batch_size training windows.

        NOTE(review): when the windows run out in the middle of a batch the
        partial batch is yielded and the position restarts at 0, but the
        batch lists are not cleared, so the next yielded batch still
        contains the windows already yielded - confirm this is intended.
        '''
        i = 0
        while i < (self.len_train - seq_len):
            x_batch = []
            y_batch = []
            for b in range(batch_size):
                if i >= (self.len_train - seq_len):
                    # stop-condition for a smaller final batch if data doesn't divide evenly
                    yield np.array(x_batch), np.array(y_batch)
                    i = 0
                x, y = self._next_window(i, seq_len, normalise)
                x_batch.append(x)
                y_batch.append(y)
                i += 1
            yield np.array(x_batch), np.array(y_batch)

    def _next_window(self, i, seq_len, normalise):
        '''Generates the next data window from the given index location i

        Returns (x, y): x is the first seq_len - 1 rows of the window,
        y is column 0 of its last row.
        '''
        window = self.data_train[i:i+seq_len]
        window = self.normalise_windows(window, single_window=True)[0] if normalise else window
        x = window[:-1]
        y = window[-1, [0]]
        return x, y

    def normalise_windows(self, window_data, single_window=False):
        '''Normalise window with a base value of zero

        Every value p of a column becomes p / p0 - 1, where p0 is the value
        of that column in the first row of the same window. A p0 of zero
        raises ZeroDivisionError.

        Input:
            window_data: one 2-D window (single_window=True) or a sequence
                of 2-D windows
        Returns:
            numpy array of normalised windows (always with a leading
            window axis).
        '''
        normalised_data = []
        window_data = [window_data] if single_window else window_data
        for window in window_data:
            normalised_window = []
            for col_i in range(window.shape[1]):
                normalised_col = [((float(p) / float(window[0, col_i])) - 1) for p in window[:, col_i]]
                normalised_window.append(normalised_col)
            normalised_window = np.array(normalised_window).T # reshape and transpose array back into original multidimensional format
            normalised_data.append(normalised_window)
        return np.array(normalised_data)

In [None]:
import os
import math
import numpy as np
import datetime as dt
from numpy import newaxis
from keras.layers import Dense, Activation, Dropout, LSTM
from keras.models import Sequential, load_model
from keras.callbacks import EarlyStopping, ModelCheckpoint

class Model():
    """A class for building an lstm model and running inference with it"""

    def __init__(self):
        self.model = Sequential()

    def load_model(self, filepath):
        """Replace the wrapped model by the one stored at filepath."""
        print('[Model] Loading model from file %s' % filepath)
        self.model = load_model(filepath)

    def build_model(self, configs):
        """Add the configured layers to the model and compile it.

        Reads configs['model'] ('layers', 'optimizer', 'learning_rate',
        'loss') and configs['data'] ('sequence_length', 'no_of_features',
        'feature_selection'). Each layer entry has a 'type' of 'dense',
        'lstm' or 'dropout' plus the optional keys 'neurons', 'rate',
        'activation', 'return_seq' and 'input_dim'.
        """
        optimizer = configs['model']['optimizer']
        # Read once, outside the loop, so the value is bound even when the
        # layer list is empty.
        learning_rate = configs['model']['learning_rate']
        # A window of sequence_length rows is split into
        # sequence_length - 1 input rows and one target row.
        input_timesteps = configs['data']['sequence_length']-1

        for layer in configs['model']['layers']:
            neurons = layer['neurons'] if 'neurons' in layer else None
            dropout_rate = layer['rate'] if 'rate' in layer else None
            activation = layer['activation'] if 'activation' in layer else None
            return_seq = layer['return_seq'] if 'return_seq' in layer else None
            # The layer's own 'input_dim' value is not used: its presence
            # only switches on the configured number of features.
            input_dim = configs['data']['no_of_features'] if ('input_dim' in layer and configs['data']['feature_selection']) else None

            if layer['type'] == 'dense':
                self.model.add(Dense(neurons, activation=activation))
            if layer['type'] == 'lstm':
                self.model.add(LSTM(neurons, input_shape=(input_timesteps, input_dim), return_sequences=return_seq))
            if layer['type'] == 'dropout':
                self.model.add(Dropout(dropout_rate))

        if optimizer == 'SGD':
            # SGD is not imported at file level; import it where it is needed.
            from keras.optimizers import SGD
            optimizer = SGD(lr=learning_rate)

        self.model.compile(loss=configs['model']['loss'], optimizer=optimizer,metrics=['mse', 'mae', 'mape'])

        print('[Model] Model Compiled')

    def train(self, x, y, epochs, batch_size, save_dir,filename):
        """Fit the model on (x, y) and save it to <save_dir>/<filename>.h5.

        NOTE(review): both callbacks monitor 'val_loss', but fit() is called
        without validation data - confirm they have any effect.
        """
        print('[Model] Training Started')
        print('[Model] %s epochs, %s batch size' % (epochs, batch_size))

        save_fname = os.path.join(save_dir, filename+'.h5')
        callbacks = [
            EarlyStopping(monitor='val_loss', patience=3),
            ModelCheckpoint(filepath=save_fname, monitor='val_loss', save_best_only=True)
        ]
        self.model.fit(
            x,
            y,
            epochs=epochs,
            batch_size=batch_size,
            callbacks=callbacks,
            verbose=0
        )
        self.model.save(save_fname)

        print('[Model] Training Completed. Model saved as %s' % save_fname)

    def predict_point_by_point(self, data):
        """Predict one step ahead for every window of true data.

        Returns the predictions flattened to a 1-D array.
        """
        print('[Model] Predicting Point-by-Point...')
        predicted = self.model.predict(data)
        predicted = np.reshape(predicted, (predicted.size,))
        return predicted

    def predict_sequences_multiple(self, data, window_size, prediction_len):
        """Predict consecutive sequences of prediction_len steps.

        For every start position i * prediction_len the window at that
        position is taken from data; each prediction is fed back into the
        window (oldest row dropped, a row filled with the prediction
        inserted at position window_size - 2) before predicting the next
        step.

        Returns:
            list of lists, one list of prediction_len predictions per start.
        """
        print('[Model] Predicting Sequences Multiple...')
        prediction_seqs = []
        for i in range(int(len(data)/prediction_len)):
            curr_frame = data[i*prediction_len]
            predicted = []
            for j in range(prediction_len):
                predicted.append(self.model.predict(curr_frame[newaxis,:,:])[0,0])
                curr_frame = curr_frame[1:]
                curr_frame = np.insert(curr_frame, [window_size-2], predicted[-1], axis=0)
            prediction_seqs.append(predicted)
        return prediction_seqs

    def predict_sequence_full(self, data, window_size):
        """Predict len(data) steps starting from the first window only.

        Every prediction is fed back into the window (oldest row dropped,
        a row filled with the prediction inserted at position
        window_size - 2) before predicting the next step.

        Returns:
            list of predictions.
        """
        print('[Model] Predicting Full Sequence...')
        curr_frame = data[0]
        predicted = []
        for i in range(len(data)):
            predicted.append(self.model.predict(curr_frame[newaxis,:,:])[0,0])
            curr_frame = curr_frame[1:]
            curr_frame = np.insert(curr_frame, [window_size-2], predicted[-1], axis=0)
        return predicted

In [None]:
def run_item_analysis(configs,inter_conf,index):
    """Train one model for ``configs``, plot its predictions, return the RMSE.

    Steps: build a ``DataLoader`` and a ``Model`` from ``configs``, train on
    the training windows, then produce and plot three kinds of predictions on
    the test windows (multi-sequence, full-sequence, point-by-point).

    Args:
        configs: Nested configuration dict; the ``data``, ``training`` and
            ``model`` sections are read here.
        inter_conf: Configuration passed through unchanged to ``DataLoader``.
        index: Identifier of this run; forwarded as ``filename`` to
            ``Model.train`` and as ``file_name`` to the plotting helpers.

    Returns:
        Root-mean-squared error between ``y_test`` and the point-by-point
        predictions.
    """
    date_format = "%d-%m_%Hh"
    data_cfg = configs['data']
    training_cfg = configs['training']
    model_cfg = configs['model']

    data = DataLoader(
        data_cfg['filename'],
        data_cfg['train_test_split'],
        configs,
        inter_conf
    )

    model = Model()
    model.build_model(configs)

    x, y = data.get_train_data(
        seq_len=data_cfg['sequence_length'],
        normalise=data_cfg['normalise']
    )
    model.train(
        x,
        y,
        epochs=training_cfg['epochs'],
        batch_size=training_cfg['batch_size'],
        save_dir=model_cfg['save_dir'],
        filename=index
    )

    x_test, y_test = data.get_test_data(
        seq_len=data_cfg['sequence_length'],
        normalise=data_cfg['normalise']
    )

    # Multi-sequence predictions: run length equals the window length.
    multi_predictions = model.predict_sequences_multiple(
        x_test,
        data_cfg['sequence_length'],
        data_cfg['sequence_length']
    )
    plot_results_multiple(
        multi_predictions,
        y_test,
        data_cfg['sequence_length'],
        data.indexes.strftime(date_format),
        directory=model_cfg['save_dir'],
        file_name=index
    )

    # Full-sequence prediction seeded from the first test window.
    predictions_fullseq = model.predict_sequence_full(x_test, data_cfg['sequence_length'])
    plot_results_full_seq(
        predictions_fullseq,
        y_test,
        data.indexes.strftime(date_format),
        directory=model_cfg['save_dir'],
        file_name=index
    )

    # Point-by-point predictions are the ones scored.
    point_predictions = model.predict_point_by_point(x_test)
    rmse = sqrt(mean_squared_error(y_test, point_predictions))
    print('Test RMSE: %.3f' % rmse)

    plot_results(
        point_predictions,
        y_test,
        data.indexes.strftime(date_format),
        directory=model_cfg['save_dir'],
        file_name=index
    )

    return rmse

# Tabula multi analysis

## Training

In [None]:
# Load the Tabula experiment configuration; the context manager closes the
# file handle instead of leaving it to the garbage collector.
with open('multi_tabula_config.json', 'r') as config_file:
    configs = json.load(config_file)

# exist_ok avoids the check-then-create race of a separate os.path.exists test.
os.makedirs(configs['model']['save_dir'], exist_ok=True)
filename = os.path.join(configs['model']['save_dir'],'lstm_tabula_results.csv')

# mode='w' starts a fresh results file. Presumably this writes the header row
# for case 2 (helper defined elsewhere in the notebook) - the training loop
# reads the file back with an 'index' column.
append_to_csv_file(filename,'','',0,mode='w',case=2)

In [None]:
# disable interactive jupyter notebook printing
%%capture

inter_conf = json.load(open('interpolation_config.json', 'r'))

grid = ParameterGrid({"data.sequence_length": [i for i in range(8,10)],
                      "data.train_test_split": [0.3,0.4,0.5],
                      "training.epochs": [500],
                     "model.optimizer":['adam'],
                     "model.layers.0.neurons":[500],
                     "model.layers.1.neurons":[500],
                     "model.layers.3.rate":[0],
                     "model.layers.4.activation":['linear',]
                     })

for param_list in list(grid):
        
    for key,value in param_list.items():
        configs = change_dict_path_value(key,configs,value)
    index,string_row = get_params_abbrev(configs,case=2)
    
    results_df = pd.read_csv(filename,sep='^',index_col='index')
    if index in results_df.index: 
        print("Index --{}-- exists".format(index))
        continue
    try:
        rmse = run_item_analysis(configs,inter_conf,index)
        append_to_csv_file(filename,index,string_row,rmse)
    except Exception as e:
        print(e)
    finally:
        if K.backend() == 'tensorflow':
            K.clear_session()

## Results

In [None]:
# Reload the configuration only to locate the results file; the context
# manager closes the file handle deterministically.
with open('multi_tabula_config.json', 'r') as config_file:
    configs = json.load(config_file)
filename = os.path.join(configs['model']['save_dir'],'lstm_tabula_results.csv')
df = pd.read_csv(filename,sep='^',index_col='index')
# Last expression of the cell: displays the runs ordered by ascending RMSE.
df.sort_values('rmse')

# Windripper multi analysis

## Training

In [None]:
# Load the Windripper experiment configuration; the context manager closes the
# file handle instead of leaving it to the garbage collector.
with open('multi_windripper_config.json', 'r') as config_file:
    configs = json.load(config_file)

# exist_ok avoids the check-then-create race of a separate os.path.exists test.
os.makedirs(configs['model']['save_dir'], exist_ok=True)
filename = os.path.join(configs['model']['save_dir'],'lstm_windripper_results.csv')

# mode='w' starts a fresh results file. Presumably this writes the header row
# for case 1 (helper defined elsewhere in the notebook) - the training loop
# reads the file back with an 'index' column.
append_to_csv_file(filename,'','',0,mode='w',case=1)

In [None]:
%%capture
inter_conf = json.load(open('interpolation_config.json', 'r'))

grid = ParameterGrid({"data.sequence_length": [i for i in range(6,15)],
                      "data.train_test_split": [0.3,0.4,0.5],
                      "training.epochs": [100,200,300],
                     "model.optimizer":['adam'],
                     "model.layers.0.neurons":[100],
                     "model.layers.1.neurons":[50],
                     "model.layers.2.rate":np.arange(0.1,0.31,0.1),
                     "model.layers.3.activation":['linear']
                     })

for param_list in list(grid):
        
    for key,value in param_list.items():
        configs = change_dict_path_value(key,configs,value)
    index,string_row = get_params_abbrev(configs,case=1)
    
    results_df = pd.read_csv(filename,sep='^',index_col='index')
    if index in results_df.index: 
        print("Index --{}-- exists".format(index))
        continue
    try:
        rmse = run_item_analysis(configs,inter_conf,index)
        append_to_csv_file(filename,index,string_row,rmse)
    except Exception as e:
        print(e)
    finally:
        if K.backend() == 'tensorflow':
            K.clear_session()

## Results

In [None]:
# Reload the configuration only to locate the results file; the context
# manager closes the file handle deterministically.
with open('multi_windripper_config.json', 'r') as config_file:
    configs = json.load(config_file)
filename = os.path.join(configs['model']['save_dir'],'lstm_windripper_results.csv')
df = pd.read_csv(filename,sep='^',index_col='index')
# Last expression of the cell: displays the runs ordered by ascending RMSE.
df.sort_values('rmse')

# Testing

In [None]:
# Single end-to-end run with the Windripper configuration as loaded from disk.
with open('multi_windripper_config.json', 'r') as config_file:
    configs = json.load(config_file)
os.makedirs(configs['model']['save_dir'], exist_ok=True)

# config = json.load(open('tabula_config.json', 'r'))
with open('interpolation_config.json', 'r') as inter_conf_file:
    inter_conf = json.load(inter_conf_file)

# Candidate search space. It is only defined here; nothing below iterates it.
# range() accepts integers only, so the fractional steps are built from
# integer ranges and scaled (the original float range() calls raised TypeError).
grid = ParameterGrid({"data.sequence_length": list(range(6, 21)),
                      "data.no_of_features": [2, 3],
                      # 0.30, 0.35, ..., 0.75
                      "data.train_test_split": [i / 100 for i in range(30, 80, 5)],
                      "training.epochs": [100, 200, 300],
                      # 0.6, 0.7, 0.8, 0.9
                      "data.outliers.high_quantile": [i / 10 for i in range(6, 10)],
                      "model.loss": ['mse', 'mae', 'rmse', 'rmae'],
                      "model.optimizer": ['adam', 'sgd'],
                      "model.learning_rate": [0.01, 0.05, 0.1, 0.2],
                      "model.layers.0.neurons": [50, 100, 200, 300],
                      "model.layers.0.input_timesteps": list(range(6, 20)),
                      "model.layers.0.input_dim": [2, 3],
                      "model.layers.1.neurons": [25, 50, 100, 150],
                      # 0.10, 0.15, ..., 0.50
                      "model.layers.2.rate": [i / 100 for i in range(10, 51, 5)],
                      "model.layers.3.activation": ['linear', 'sigmoid', 'relu'],
                      })

data = DataLoader(configs['data']['filename'],
    configs['data']['train_test_split'],
    configs,
    inter_conf
)

model = Model()
model.build_model(configs)

x, y = data.get_train_data(seq_len=configs['data']['sequence_length'], normalise=configs['data']['normalise'])

#in-memory training
model.train(
    x,
    y,
    epochs = configs['training']['epochs'],
    batch_size = configs['training']['batch_size'],
    save_dir = configs['model']['save_dir']
)

x_test, y_test = data.get_test_data(
    seq_len=configs['data']['sequence_length'],
    normalise=configs['data']['normalise']
)

predictions = model.predict_sequences_multiple(x_test, configs['data']['sequence_length'], configs['data']['sequence_length'])
plot_results_multiple(predictions, y_test, configs['data']['sequence_length'],data.indexes.strftime("%d-%m_%Hh"))

# predictions = model.predict_sequence_full(x_test, configs['data']['sequence_length'])
# plot_results(predictions, y_test)

# Point-by-point predictions are the ones scored with RMSE.
predictions = model.predict_point_by_point(x_test)

rmse = sqrt(mean_squared_error(y_test, predictions))

print('Test RMSE: %.3f' % rmse)

plot_results(predictions, y_test,data.indexes.strftime("%d-%m_%Hh"))
