In [None]:
import pandas as pd
import numpy as np
from pandas import read_csv
from pandas import datetime
from matplotlib import pyplot as plt
from pandas.plotting import autocorrelation_plot
from statsmodels.tsa.arima_model import ARIMA
from pandas import Series
from sklearn.metrics import mean_squared_error
from datetime import datetime 

In [None]:
import pywt

In [None]:
import warnings
# Install a blanket "ignore" filter: from here on no warning of any category is
# shown in this session. NOTE(review): this also hides deprecation and
# convergence warnings emitted while fitting models — confirm that is intended.
warnings.filterwarnings('ignore')

### GLOBAL VARIABLES

In [None]:
# --- Input: the processed dataset is read from f'{INPUT_PATH}/{INPUT_FILE_NAME}.pkl'.
INPUT_PATH = "../../../data/processed"
INPUT_FILE_NAME = "dataproc_v001"

# --- Output: the grid-search log is written to f'{OUTPUT_PATH}/{LOG_NAME}{NRUN}.csv'.
OUTPUT_PATH = "../../../models/arima/hyperparameters/"
LOG_NAME = "gsearch_arima_logs_r"
NRUN = 1  # run number appended to the log file name

# Not referenced by the cells visible here — presumably used further down; verify.
HYPERPARAM_NAME = "best_hyperparam_arima_r"
OUTPUT_FILE_NAME = "gsearch_arima_logs_d"

# --- Evaluation settings.
DAYS_PRED = 28   # subtracted from the largest `d` to obtain the train/test cutoff
METRIC = "rmse"  # metric label

### FUNCTIONS

In [None]:
def rmse(y_true, y_pred):
    """Return the root mean squared error between `y_true` and `y_pred`.

    Computed as the square root of `mean_squared_error(y_true, y_pred)`.
    """
    squared_error = mean_squared_error(y_true, y_pred)
    return np.sqrt(squared_error)

In [None]:
def parser(x):
    """Parse a date string after prefixing it with '190'.

    The prefixed text is read with the format '%Y-%m', so an input such as
    '1-01' is interpreted as January 1901.
    """
    stamped = '190' + x
    return datetime.strptime(stamped, '%Y-%m')

In [None]:
def evaluate_arima_model(train, test, arima_order, metric='rmse'):
    """Walk-forward evaluation of one ARIMA order over several held-out series.

    For every series in `test` the history is reset to a copy of `train`.
    Then, for each observed value of that series, a new
    `ARIMA(history, order=arima_order)` is fitted, the first element of
    `forecast()` is stored as the prediction, and the observed value is
    appended to the history. The error of each series is accumulated and the
    average over all series is returned.

    Parameters
    ----------
    train : iterable
        Training observations used to seed the history of every series.
    test : sized iterable of iterables
        Held-out series, one per element (e.g. the rows of a 2-D array).
    arima_order : tuple
        Passed unchanged as `order=` to `ARIMA`.
    metric : {'rmse', 'mse'}
        Error measure computed per series.

    Returns
    -------
    float
        Mean of the per-series errors.

    Raises
    ------
    ValueError
        If `metric` is not supported or `test` contains no series.
    """
    # Validate first: an unknown metric used to accumulate nothing and return
    # 0.0, which a grid search would then mistake for a perfect score.
    if metric not in ('mse', 'rmse'):
        raise ValueError("unsupported metric %r; expected 'mse' or 'rmse'" % (metric,))

    n_test = len(test)
    if n_test == 0:
        raise ValueError("test must contain at least one series")

    total_error = 0
    for series in test:
        # Every series starts from the same training history.
        history = list(train)
        predictions = []
        for observed in series:
            model_fit = ARIMA(history, order=arima_order).fit(disp=0)
            predictions.append(model_fit.forecast()[0])
            # Reveal the true value before predicting the next step.
            history.append(observed)

        # Out-of-sample error for this series.
        if metric == 'mse':
            total_error += mean_squared_error(series, predictions)
        else:
            total_error += rmse(series, predictions)

    return total_error / n_test

In [None]:
def gsearch_arima_models(train, test, p_values, d_values, q_values, metric='rmse'):
    """Grid-search ARIMA orders, logging every successfully evaluated candidate.

    Every combination `(p, d, q)` drawn from the three value collections is
    scored with `evaluate_arima_model(train, test, order, metric)`. Each
    successful evaluation is printed and appended to
    f'{OUTPUT_PATH}/{LOG_NAME}{NRUN}.csv' as
    "<dd/mm/YYYY>, p, d, q, metric, score". Candidates whose evaluation raises
    are reported on stdout and skipped (they are not logged).

    Parameters
    ----------
    train : array-like
        Training observations; converted to a float32 array.
    test : array-like
        Held-out series; converted to a float32 array.
    p_values, d_values, q_values : iterable
        Candidate values for the three components of the ARIMA order.
    metric : str
        Metric name forwarded to `evaluate_arima_model` and written to the log.

    Returns
    -------
    tuple
        `(best_cfg, best_score)`; `best_cfg` is None and `best_score` is
        infinity when no candidate could be evaluated.
    """
    # np.asarray also accepts plain Python lists, which have no .astype().
    train = np.asarray(train, dtype='float32')
    test = np.asarray(test, dtype='float32')

    log_path = f'{OUTPUT_PATH}/{LOG_NAME}{NRUN}.csv'
    best_score, best_cfg = float("inf"), None

    for p in p_values:
        for d in d_values:
            for q in q_values:
                order = (p, d, q)
                run_date = datetime.now().strftime("%d/%m/%Y")

                try:
                    score = evaluate_arima_model(train, test, order, metric)
                except Exception as exc:
                    # Fitting can fail for individual orders; keep searching, but
                    # say so. KeyboardInterrupt is deliberately not caught, so
                    # the search can still be stopped by hand.
                    print('ARIMA%s skipped: %r' % (order, exc))
                    continue

                if score < best_score:
                    best_score, best_cfg = score, order

                print('ARIMA%s %s=%.3f' % (order, metric, score))

                # Append right away so partial results survive an interrupted
                # run; the context manager guarantees the handle is closed.
                with open(log_path, 'a+') as f:
                    f.write(f"{run_date}, {p}, {d}, {q}, {metric}, {score}\n")

    print('Best ARIMA%s %s=%.3f' % (best_cfg, metric, best_score))
    return best_cfg, best_score

In [None]:
def maddest(d, axis=None):
    """Mean absolute deviation of `d` around its mean, taken along `axis`.

    Note that, despite the name, the deviation is measured with means, not
    medians.

    Parameters
    ----------
    d : array-like
        Input values.
    axis : int or None
        Axis to reduce over; None reduces over all elements.

    Returns
    -------
    numpy scalar or ndarray
        The mean absolute deviation, with `axis` removed.
    """
    values = np.asarray(d)
    # keepdims=True keeps the centre broadcastable against `values` for every
    # axis; without it, reducing over a non-leading axis either fails to
    # broadcast or subtracts the wrong means.
    centre = np.mean(values, axis=axis, keepdims=True)
    return np.mean(np.absolute(values - centre), axis=axis)

def denoise_signal(x, wavelet='db4', level=1):
    """Denoise `x` by hard-thresholding its wavelet detail coefficients.

    The signal is decomposed with `pywt.wavedec` (periodization mode). A noise
    scale is derived from the `level`-th coefficient array counted from the
    end, via `maddest` scaled by 1/0.6745, and turned into the threshold
    `sigma * sqrt(2 * log(len(x)))`. Every coefficient array except the first
    is hard-thresholded with it, and the signal is rebuilt with `pywt.waverec`.

    Parameters
    ----------
    x : sequence
        Input signal.
    wavelet : str
        Wavelet name handed to pywt.
    level : int
        Which coefficient array, counted from the end, feeds the noise estimate.

    Returns
    -------
    The reconstruction returned by `pywt.waverec`.
    """
    coefficients = pywt.wavedec(x, wavelet, mode="per")

    noise_scale = (1/0.6745) * maddest(coefficients[-level])
    threshold = noise_scale * np.sqrt(2*np.log(len(x)))

    # The first array (approximation) is left untouched; only the rest is thresholded.
    coefficients[1:] = [
        pywt.threshold(band, value=threshold, mode='hard')
        for band in coefficients[1:]
    ]

    return pywt.waverec(coefficients, wavelet, mode='per')

In [None]:
def average_smoothing(signal, kernel_size=3, stride=1):
    """Smooth `signal` by replacing each window with copies of its mean.

    Windows of `kernel_size` samples start at index 0 and advance by `stride`;
    only windows lying entirely inside the signal are used. Each window
    contributes `kernel_size` copies of its mean to the output, so the output
    has `kernel_size * number_of_windows` elements.

    Parameters
    ----------
    signal : sequence
        Input values (must support `len` and slicing).
    kernel_size : int
        Window length, at least 1.
    stride : int
        Step between consecutive window starts, at least 1.

    Returns
    -------
    numpy.ndarray
        Concatenated window means; empty if the signal is shorter than one window.

    Raises
    ------
    ValueError
        If `kernel_size` or `stride` is smaller than 1.
    """
    if kernel_size < 1 or stride < 1:
        # A non-positive stride would never advance the window.
        raise ValueError("kernel_size and stride must be positive integers")

    sample = []
    # Read each window *before* advancing: moving first would skip the window at
    # index 0 and make the final window run past the end of the signal.
    for start in range(0, len(signal) - kernel_size + 1, stride):
        window_mean = np.mean(signal[start:start + kernel_size])
        sample.extend(np.ones(kernel_size) * window_mean)
    return np.array(sample)

### LOAD DATASET

In [None]:
# Load the processed dataset. Unpickling can execute arbitrary code, so only
# point this at files produced by this project.
data = pd.read_pickle(
    f'{INPUT_PATH}/{INPUT_FILE_NAME}.pkl'
)

In [None]:
# Keep only the rows flagged as belonging to the 'train' part.
data = data.loc[data["part"] == "train"]

In [None]:
# Largest `d` that still belongs to the training window: the last DAYS_PRED
# values of `d` are held out.
date_cutoff = data["d"].max() - DAYS_PRED

In [None]:
# Training rows: everything up to and including the cutoff.
X_train = data.loc[data["d"] <= date_cutoff]

In [None]:
# Held-out rows: everything strictly after the cutoff.
X_test = data.loc[data["d"] > date_cutoff]

In [None]:
# Drop the name `data`; the cells below work with the X_train / X_test
# selections taken from it.
del data

In [None]:
# Grouping keys used to aggregate the training rows.
LEVEL = ["state_id", "d"]

In [None]:
# Mean demand per LEVEL group, returned as a flat frame (group keys become columns).
train_agg = (
    X_train
    .groupby(LEVEL)["demand"]
    .mean()
    .reset_index()
)

In [None]:
# Preview the first rows of the aggregated training frame.
train_agg.head(n=5)

In [None]:
# Start this run's log from scratch: 'w+' truncates an existing file, then the
# CSV header row is written.
with open(f'{OUTPUT_PATH}/{LOG_NAME}{NRUN}.csv', mode='w+') as f:
    f.write("date, p, d, q, metric, score\n")

### TRAIN MODEL

In [None]:
# Candidate values for the three ARIMA order components (p, d, q).
p_values = [0, 1, 2, 4, 6, 8, 10]
d_values = range(3)  # 0, 1, 2
q_values = range(3)  # 0, 1, 2

In [None]:
# state_id value selected for this run.
ID = 0
# Aggregated demand values of that state, as a plain list.
train = train_agg.loc[train_agg["state_id"] == ID, "demand"].tolist()

In [None]:
# Held-out demand of the selected state as a 2-D array: one row per `id`,
# one column per `d`.
test = (
    X_test.loc[X_test["state_id"] == ID]
    .pivot(index='id', columns='d', values='demand')
    .values
)

In [None]:
# Number of held-out series (rows of the pivoted array).
nrows = len(test)

In [None]:
# Pick up to 100 distinct row indices. A fixed seed makes the sample the same on
# every run, and capping the size at `nrows` avoids the ValueError that sampling
# without replacement raises when fewer than 100 rows are available.
ridx = np.random.RandomState(0).choice(nrows, size=min(100, nrows), replace=False)

In [None]:
# Keep only the sampled rows: evaluating the model on every row takes too long.
test = test[ridx]