In [None]:
import pandas as pd
pd.Series

import math
import csv
from datetime import datetime

import numpy as np
import scipy as sc

import statsmodels
import sklearn
from sklearn import preprocessing

import matplotlib.pylab as plt
%matplotlib inline
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 25, 20
import time

import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.utils import np_utils
from matplotlib import pyplot

from keras.regularizers import l1
from sklearn.model_selection import TimeSeriesSplit

# import BatchNormalization
from keras.layers.normalization import BatchNormalization
from keras.layers import LeakyReLU
from keras.layers import Activation
from keras.regularizers import l2

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score 

### Load file, print info and select columns

In [None]:
#function to load files
def load_file(filepath):
    """Read a tab-separated file into a float32 DataFrame.

    The first column becomes the index (pandas is asked to parse it as
    dates) and the rows are ordered by that index.

    Parameters
    ----------
    filepath : str or file-like
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        All remaining columns cast to ``float32``, without the
        ``SALE_AMOUNT_AFTER_CANCELLATIONS`` column.
    """
    ordered = pd.read_csv(
        filepath, sep='\t', index_col=0, parse_dates=True
    ).sort_index()
    # Per the original author's note this column duplicates
    # SALE_AMOUNT_BEFORE_CANCELLATIONS; it can be verified with
    # any(df['SALE_AMOUNT_BEFORE_CANCELLATIONS'] != df['SALE_AMOUNT_AFTER_CANCELLATIONS'])
    trimmed = ordered.drop(columns=['SALE_AMOUNT_AFTER_CANCELLATIONS'])
    #...
    return trimmed.astype('float32')

#function to create a new df with selected columns
def create_small_df(df, columns):
    """Return an independent copy of ``df`` restricted to ``columns``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is not modified.
    columns : list of column labels (or any key accepted by ``df[...]``)
        Selection to keep.

    Returns
    -------
    pandas.DataFrame
        A new object holding only the selected columns.
    """
    # Select first, then copy: only the requested columns are duplicated
    # instead of deep-copying the whole frame and discarding most of it.
    return df[columns].copy()

#function to print info about the data
def print_info_df(df, print_columns = False):
    """Print a short summary of ``df``.

    Reports the time span between the first and last index entries, the
    frame's shape, whether any value is missing, and either the column
    names or just their count.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index entries support subtraction yielding an object
        with a ``.days`` attribute (e.g. a datetime index). Must contain
        at least one row, since the first and last index entries are read.
    print_columns : bool, optional
        When truthy the list of column names is printed; otherwise only
        the number of columns is printed.
    """
    # Span between the first and the last index entries
    first_stamp = df.index[0]
    last_stamp = df.index[-1]
    span = last_stamp - first_stamp
    print('Number of days is ' + str(span.days) + ' from ' + str(first_stamp) + ' to '+ str(last_stamp))
    print('The shape of the data: %d*%d' %(df.shape[0],df.shape[1]))
    print('Check for Nan values: %s'%(df.isnull().values.any()))
    # Plain truthiness instead of `== True`, so any truthy flag is honoured
    if print_columns:
        print(list(df.columns))
    else:
        print('Number of columns: %d'%(df.shape[1]))

In [None]:
# Load the dataset and print its summary (column count only, no names)
df = load_file('...')
print_info_df(df, print_columns=False)

# Column name kept for later use; presumably the forecast target -- it is
# not referenced in the cells visible here, so confirm against later cells.
feature = 'SALE_AMOUNT'

### Error Metrics

In [None]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Mean absolute percentage error, expressed in percent.

    Both inputs are converted to NumPy arrays; the error of each element
    is taken relative to ``y_true``, so zeros in ``y_true`` lead to a
    division by zero (inf/nan in the result).
    """
    actual = np.array(y_true)
    predicted = np.array(y_pred)
    relative_errors = np.abs((actual - predicted) / actual)
    return np.mean(relative_errors) * 100

def root_mean_square_error(y_true, y_pred):
    """Square root of ``mean_squared_error(y_true, y_pred)``.

    The inputs are handed to ``mean_squared_error`` unchanged (no explicit
    array conversion is done here).
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

def relative_rmse(y_true, y_pred):
    """RMSE as a percentage of the mean of ``y_true``.

    Both inputs are converted to NumPy arrays first. A zero mean of
    ``y_true`` results in a division by zero.
    """
    actual = np.array(y_true)
    predicted = np.array(y_pred)
    error = root_mean_square_error(actual, predicted)
    baseline = np.mean(actual)
    return (error / baseline) * 100

#def f_smape(A, F):
#    return 100/len(A) * np.sum(2 * np.abs(F - A) / (np.abs(A) + np.abs(F)))

def f_smape(y_true, y_pred):
    """Symmetric MAPE with a floored denominator.

    Each term is ``2 * |pred - true| / d`` where
    ``d = max(|true| + |pred| + 0.1, 0.6)``; the mean of the terms is
    returned as a fraction (it is not multiplied by 100).
    """
    actual = np.array(y_true)
    forecast = np.array(y_pred)
    eps = 0.1
    # The floor keeps the denominator away from zero when both values are tiny
    denominator = np.maximum(np.abs(actual) + np.abs(forecast) + eps, 0.5 + eps)
    per_point = np.abs(forecast - actual) / denominator * 2.0
    return np.mean(per_point)

def summarize_results(scores):
    """Return ``(mean, standard deviation)`` of ``scores`` as computed by NumPy."""
    center = np.mean(scores)
    spread = np.std(scores)
    return center, spread

### Data Preparation

In [None]:
#LSTM Data Preparation
# convert series to supervised learning
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    """Frame a (multivariate) series as a supervised-learning table.

    Each output row holds ``n_in`` lagged copies of every variable followed
    by ``n_out`` copies for the current and future steps.

    Parameters
    ----------
    data : list, list of rows, 1-D/2-D array, Series or DataFrame
        Observations ordered in time; anything ``pandas.DataFrame`` accepts.
    n_in : int, optional
        Number of lag steps (t-n_in ... t-1) used as inputs.
    n_out : int, optional
        Number of steps (t ... t+n_out-1) used as outputs.
    dropnan : bool, optional
        Drop the rows that contain NaN introduced by shifting.

    Returns
    -------
    pandas.DataFrame
        Columns are named ``var<j>(t-<i>)``, ``var<j>(t)`` and
        ``var<j>(t+<i>)``.
    """
    df = pd.DataFrame(data)
    # Count variables on the constructed frame rather than on the raw input:
    # `data.shape[1]` fails for 1-D arrays/Series and assuming a single
    # variable for every list is wrong for a list of rows.
    n_vars = df.shape[1]
    cols, names = [], []
    # input sequence (t-n, ... t-1)
    for lag in range(n_in, 0, -1):
        cols.append(df.shift(lag))
        names += ['var%d(t-%d)' % (j + 1, lag) for j in range(n_vars)]
    # forecast sequence (t, t+1, ... t+n)
    for lead in range(n_out):
        cols.append(df.shift(-lead))
        if lead == 0:
            names += ['var%d(t)' % (j + 1) for j in range(n_vars)]
        else:
            names += ['var%d(t+%d)' % (j + 1, lead) for j in range(n_vars)]
    # put it all together
    agg = pd.concat(cols, axis=1)
    agg.columns = names
    # drop rows with NaN values
    if dropnan:
        agg = agg.dropna()
    return agg

def preprocessing_data(data, n, s_columns):
    """Scale ``data`` to [0, 1] and reframe it with ``n`` lag steps.

    Parameters
    ----------
    data : 2-D array-like
        Raw observations passed to ``MinMaxScaler.fit_transform``.
    n : int
        Number of lag steps given to ``series_to_supervised``.
    s_columns : int
        Number of variables; the last ``s_columns - 1`` columns of the
        reframed table are removed so that a single current-step column
        remains at the end.

    Returns
    -------
    (scaler, values)
        The fitted scaler and the reframed table as a NumPy array.
    """
    # NOTE(review): the scaler is fitted on everything passed in, before any
    # train/test split is made downstream -- confirm this is intended.
    scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
    supervised = series_to_supervised(scaler.fit_transform(data), n, 1)
    # Negative positions -(s_columns-1) .. -1 address the trailing columns
    # that are not predicted; the list is empty when s_columns == 1.
    trailing_positions = list(range(-s_columns + 1, 0))
    surplus_labels = supervised.columns[trailing_positions]
    trimmed = supervised.drop(surplus_labels, axis=1)
    return scaler, trimmed.values

def split_data(values, n_steps, s_columns, n_train_days, n_test_days):
    """Split a supervised table into 3-D LSTM inputs and 1-D targets.

    The first ``n_train_days`` rows form the training set and all remaining
    rows the test set. In each row the last column is the target and the
    others are reshaped to ``(n_steps, s_columns)``.

    ``n_test_days`` is accepted but not used by the split.

    Returns
    -------
    train_X, train_y, test_X, test_y
    """
    def _inputs_and_target(rows):
        # Everything but the last column feeds the network
        inputs, target = rows[:, :-1], rows[:, -1]
        # [samples, timesteps, features]
        return inputs.reshape((inputs.shape[0], n_steps, s_columns)), target

    train_X, train_y = _inputs_and_target(values[:n_train_days, :])
    test_X, test_y = _inputs_and_target(values[n_train_days:, :])
    return train_X, train_y, test_X, test_y

def difference_seria(dataset, order=1):
    """Lagged difference of a sequence.

    Element ``k`` of the result is ``dataset[k + order] - dataset[k]``,
    i.e. a single difference at lag ``order``; the result has
    ``len(dataset) - order`` entries and is returned as a NumPy array.
    """
    return np.array([
        dataset[pos] - dataset[pos - order]
        for pos in range(order, len(dataset))
    ])

def sergey(last, yhat, order=1):
    """Rebuild a series from increments by cumulative summation.

    Starts from ``last`` and adds the entries of ``yhat`` one after another.
    The returned list has ``len(yhat) + 1`` items, the first being ``last``.

    ``order`` is accepted but not used.
    """
    running = last
    restored = [running]
    for step in yhat:
        running = running + step
        restored.append(running)
    return restored

In [None]:
def _plot_top_panel(curves, title, legend_size):
    """Plot labelled curves in the first cell of a 2x1 subplot grid and show it."""
    plt.subplot(2, 1, 1)
    for series, label, extra in curves:
        plt.plot(series, label=label, lw=2, **extra)
    plt.title(title)
    plt.legend(prop={'size': legend_size})
    plt.grid(True)
    plt.show()


def _invert_target_scaling(scaler, target, flat_inputs, s_columns):
    """Undo the scaling of a target column and return it as a 1-D array."""
    target_column = np.asarray(target).reshape((len(target), 1))
    # The scaler expects s_columns columns, so the target is padded with the
    # last (s_columns - 1) input columns. An explicit start index is used
    # because the slice `1 - s_columns:` turns into `0:` (all columns) when
    # s_columns == 1, which made inverse_transform fail for one feature.
    filler_start = flat_inputs.shape[1] - (s_columns - 1)
    padded = np.concatenate((target_column, flat_inputs[:, filler_start:]), axis=1)
    return scaler.inverse_transform(padded)[:, 0]


def define_fit_lstm(train_X, train_y, test_X,
                    test_y, n_steps, scaler, s_columns):
    """Build, train and evaluate a single-layer LSTM regressor.

    The network is ``LSTM(100)`` followed by ``Dense(1)``, compiled with MAE
    loss and the Adagrad optimizer, and trained for 100 epochs without
    shuffling. The test split is also passed as validation data during
    fitting. Loss curves and observed-vs-predicted plots are shown.

    Parameters
    ----------
    train_X, test_X : ndarray, shape (samples, timesteps, features)
    train_y, test_y : ndarray, shape (samples,)
    n_steps : int
        Number of timesteps per sample.
    scaler : fitted scaler
        Object providing ``inverse_transform``; used to map targets and
        predictions back to original units.
    s_columns : int
        Number of features per timestep.

    Returns
    -------
    (loss, rmse, mape, rel_rmse)
        ``loss`` is the value reported by ``model.evaluate`` on the test
        split (scaled units); the other three are computed on the
        inverse-scaled test targets and predictions.
    """
    # Batch size is tied to the flattened window width (timesteps * features)
    n_input = n_steps * s_columns

    model = Sequential()
    model.add(LSTM(100, input_shape=(train_X.shape[1], train_X.shape[2])))
    model.add(Dense(1))
    model.compile(loss='mae', optimizer='Adagrad', metrics=['mse', 'mae', 'mape'])

    # fit network
    history = model.fit(train_X, train_y, epochs=100, verbose=0,
                        batch_size=n_input, shuffle=False,
                        validation_data=(test_X, test_y))
    # plot history
    _plot_top_panel(
        [(history.history['loss'], 'loss_MAE', {}),
         (history.history['val_loss'], 'val_loss_MAE', {})],
        'Training and Validation Loss', 15)

    # evaluate model: the loss comes first, then the compiled metrics in order
    loss, mse, mae, mape = model.evaluate(test_X, test_y, verbose=0)
    print('loss=%.3f, mse=%.3f, mae=%.3f, mape=%.3f' %(loss, mse, mae, mape))
    print ('History results: ')
    print('Loss: %.3f - %.3f' % (history.history['loss'][0],history.history['loss'][-1]))
    print('Validation Loss: %.3f - %.3f' % (history.history['val_loss'][0],history.history['val_loss'][-1]))

    # make a prediction
    yhat = model.predict(test_X)
    flat_test_X = test_X.reshape((test_X.shape[0], n_steps*s_columns))
    # invert scaling for forecast and for actual values
    inv_yhat = _invert_target_scaling(scaler, yhat, flat_test_X, s_columns)
    inv_y = _invert_target_scaling(scaler, test_y, flat_test_X, s_columns)

    # calculate RMSE
    rmse = root_mean_square_error(inv_y, inv_yhat)
    rel_rmse = relative_rmse(inv_y, inv_yhat)
    # MAPE recomputed on the unscaled values; replaces the Keras-reported one
    mape = mean_absolute_percentage_error(inv_y, inv_yhat)

    dashed = {'linestyle': '--'}
    _plot_top_panel(
        [(inv_yhat, 'yhat', dashed), (inv_y, 'y', {})],
        'Observed and Predicted Values', 18)
    # Same comparison restricted to the first 30 test points
    _plot_top_panel(
        [(inv_yhat[:30], 'yhat', dashed), (inv_y[:30], 'y', {})],
        'Observed and Predicted Values', 18)

    return loss, rmse, mape, rel_rmse

def run_model(n, s, values):
    """Run one full preprocess / split / train / evaluate cycle.

    Parameters
    ----------
    n : int
        Number of lag steps used as model input.
    s : int
        Number of feature columns in ``values``.
    values : ndarray
        Raw 2-D data; 90% of its row count (rounded) is used as the
        training-set size.

    Returns
    -------
    (loss, rmse, mape, rel_rmse) as produced by ``define_fit_lstm``.
    """
    total_rows = values.shape[0]
    n_train_days = round(total_rows * 0.9)
    n_test_days = total_rows - n_train_days
    # Scale and reframe the raw data
    scaler, supervised = preprocessing_data(values, n, s)
    # Separate train and test sets
    train_X, train_y, test_X, test_y = split_data(
        supervised, n, s, n_train_days, n_test_days)
    # Define and fit the LSTM model
    loss, rmse, mape, rel_rmse = define_fit_lstm(
        train_X, train_y, test_X, test_y, n, scaler, s)
    return loss, rmse, mape, rel_rmse

In [None]:
def run_experiment(n, df, repeats=3):
    """Train and evaluate the model ``repeats`` times and print a summary.

    Parameters
    ----------
    n : int
        Number of lag steps passed on to ``run_model``.
    df : pandas.DataFrame
        Data set; every column is used as a feature.
    repeats : int, optional
        How many independent runs to perform.

    Prints the per-run metrics and, at the end, the mean and standard
    deviation of each metric across runs. Nothing is returned.
    """
    values = df.values.astype('float32')
    feature_set = list(df.columns)
    s_columns = df.shape[1]
    separator = '--------------------------------------------------------------------------------------------------------'
    print("Run experiment with " + str(repeats) + " repeats")
    print('Features set:'+ str(feature_set))
    # repeat experiment
    losses = []
    rmses = []
    mapes = []
    relative_rmses = []
    for r in range(repeats):
        print(separator)
        print('Run #%d' % (r+1))
        # `rel_rmse` rather than `relative_rmse`, which would shadow the
        # module-level metric function of the same name
        loss, rmse, mape, rel_rmse = run_model(n, s_columns, values)
        # The loss returned by run_model is evaluated on the test split,
        # so it is reported as a test loss
        print('>#%d Test Loss: %.3f' % (r+1, loss))
        print('>#%d Test RMSE: %.3f' % (r+1, rmse))
        print('>#%d Test Relative RMSE: %.3f' % (r+1, rel_rmse))
        print('>#%d Test MAPE: %.3f' % (r+1, mape))
        losses.append(loss)
        rmses.append(rmse)
        mapes.append(mape)
        relative_rmses.append(rel_rmse)
    print(separator)
    print('Final Average Results: ')
    summary = (
        ('Loss', losses),
        ('RMSE', rmses),
        ('Relative RMSE', relative_rmses),
        ('MAPE', mapes),
    )
    for label, scores in summary:
        mean, std = summarize_results(scores)
        print('%s: %.4f (s=%.4f)' % (label, mean, std))

In [None]:
## Full data set: column names, feature count and the raw float32 matrix
f_set = df.columns.tolist()
n_features = len(f_set)
values = df.values.astype('float32')
df.head()

In [None]:
# One lag step, all columns of df, two repeated runs
run_experiment(1, df, repeats=2)