In [1]:
import pandas as pd
import numpy as np
from pandas import read_csv
from pandas import datetime
from matplotlib import pyplot as plt
from statsmodels.tsa.arima_model import ARIMA
from pandas import Series
from sklearn.metrics import mean_squared_error

  from pandas import datetime


In [2]:
import warnings
# Ignore every warning raised from this point on, regardless of category.
warnings.filterwarnings('ignore')

### GLOBAL VARIABLES

In [3]:
# Relative path of the directory that holds the processed dataset.
INPUT_PATH = '../../../data/processed'
# Base name (without extension) of the processed dataset file.
INPUT_FILE_NAME = 'dataproc_v001'
# Amount subtracted from the largest `d` value to obtain the train/test cutoff.
DAYS_PRED = 28

### FUNCTIONS

In [4]:
def rmse(y_true, y_pred):
    """Return the root mean squared error between `y_true` and `y_pred`.

    Both arguments are handed unchanged to `mean_squared_error`; the square
    root of that value is returned.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

In [5]:
def parser(x):
    """Prepend '190' to `x` and parse the result with the '%Y-%m' format.

    Example: '1-01' is parsed as January 1901.
    """
    stamp = '190' + x
    return datetime.strptime(stamp, '%Y-%m')

In [6]:
def evaluate_arima_model(train, test, arima_order, metric='rmse'):
    """Walk-forward evaluation of one ARIMA order over several test series.

    For every row of `test` the history is re-initialised from `train`. Then,
    for each observation of that row, an ARIMA model of order `arima_order`
    is fitted on the history, its forecast (default arguments) is stored and
    the observed value is appended to the history before the next step.

    Parameters
    ----------
    train : iterable of float
        Observations that seed the history of every test series.
    test : array-like
        One test series per row; a 1-D input is treated as a single series.
    arima_order : tuple
        (p, d, q) order passed to `ARIMA`.
    metric : {'rmse', 'mse'}, default 'rmse'
        Error computed for each test series.

    Returns
    -------
    float
        Mean of the per-series errors.

    Raises
    ------
    ValueError
        If `metric` is not supported or `test` holds no observations.
    """
    # Validate before fitting anything: an unknown metric would otherwise
    # accumulate no error at all and return 0, which reads as a perfect score.
    if metric not in ('mse', 'rmse'):
        raise ValueError("metric must be 'mse' or 'rmse', got %r" % (metric,))

    # Accept lists and 1-D input as well as the 2-D array used in the notebook.
    test_series = np.atleast_2d(np.asarray(test))
    if test_series.size == 0:
        raise ValueError('test must contain at least one observation')

    score_fn = mean_squared_error if metric == 'mse' else rmse
    seed_history = list(train)

    total_error = 0.0
    for series in test_series:
        # Each series starts from the same training history.
        history = list(seed_history)
        predictions = []
        for observed in series:
            model_fit = ARIMA(history, order=arima_order).fit(disp=0)
            # [0] keeps the first item returned by forecast()
            # (presumably the point forecast — see statsmodels docs).
            predictions.append(model_fit.forecast()[0])
            history.append(observed)

        # out-of-sample error for this series
        total_error += score_fn(series, predictions)

    return total_error / len(test_series)

In [7]:
def gsearch_arima_models(train, test, p_values, d_values, q_values, metric='rmse'):
    """Grid-search ARIMA (p, d, q) orders and report the best-scoring one.

    Every combination of `p_values`, `d_values` and `q_values` is scored with
    `evaluate_arima_model`; the score of each order and the best order found
    are printed.

    Parameters
    ----------
    train, test : array-like
        Converted to float32 arrays and forwarded to `evaluate_arima_model`.
    p_values, d_values, q_values : iterable of int
        Candidate values for each component of the ARIMA order.
    metric : {'rmse', 'mse'}, default 'rmse'
        Metric forwarded to `evaluate_arima_model` and used in the printout.

    Returns
    -------
    tuple
        `(best_cfg, best_score)`; `(None, inf)` when no order was evaluated
        successfully.

    Raises
    ------
    ValueError
        If `metric` is not 'rmse' or 'mse'.
    """
    # Fail once, up front, instead of failing inside every grid point.
    if metric not in ('mse', 'rmse'):
        raise ValueError("metric must be 'mse' or 'rmse', got %r" % (metric,))

    # np.asarray also accepts plain lists, which have no .astype method.
    train = np.asarray(train, dtype='float32')
    test = np.asarray(test, dtype='float32')
    best_score, best_cfg = float("inf"), None
    for p in p_values:
        for d in d_values:
            for q in q_values:
                order = (p, d, q)
                try:
                    # Forward the metric so the printed label matches the score.
                    score = evaluate_arima_model(train, test, order, metric=metric)
                except Exception as exc:
                    # Best effort: an order that cannot be fitted is skipped,
                    # but reported, and Ctrl-C is no longer swallowed.
                    print('ARIMA%s skipped: %s' % (order, exc))
                    continue
                if score < best_score:
                    best_score, best_cfg = score, order
                print('ARIMA%s %s=%.3f' % (order, metric, score))
    print('Best ARIMA%s %s=%.3f' % (best_cfg, metric, best_score))
    return best_cfg, best_score

### LOAD DATASET

In [8]:
# Load the processed dataset from its pickle file.
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
data = pd.read_pickle(f'{INPUT_PATH}/{INPUT_FILE_NAME}.pkl')

In [10]:
# Keep only the rows flagged as training data.
# The dataset is deliberately not read from disk again after this point:
# re-assigning `data` from a fresh load would discard the filter, letting
# non-train rows into the cutoff computation and the train/test split.
data = data[data.part == 'train']

In [11]:
# Last `d` value that belongs to the training window: rows with a larger `d`
# form the test window (assumes `d` is a numeric day index — TODO confirm).
date_cutoff = data.d.max() - DAYS_PRED

In [12]:
# Training rows: every row whose `d` is at or before the cutoff.
X_train = data[data.d <= date_cutoff]

In [13]:
# Test rows: every row whose `d` is after the cutoff.
X_test = data[data.d > date_cutoff]

In [14]:
# Drop the name `data`; the cells visible below only use X_train and X_test.
del data

In [15]:
# Columns used as grouping keys when aggregating the training data.
LEVEL = ['state_id', 'd']

In [16]:
# Mean `demand` for every combination of the LEVEL columns, with the group
# keys turned back into regular columns.
train_agg = X_train.groupby(LEVEL).demand.mean().reset_index()

In [17]:
# Preview the first rows of the aggregated training data.
train_agg.head()

Unnamed: 0,state_id,d,demand
0,0,1211,1.21548
1,0,1212,1.389882
2,0,1213,1.341423
3,0,1214,1.335766
4,0,1215,1.148327


### TRAIN MODEL

In [18]:
# Candidate ARIMA order components. The names match the p_values / d_values /
# q_values parameters of gsearch_arima_models (no call is visible in this section).
p_values = [0, 1, 2, 4, 6, 8, 10]
d_values = range(0, 3) # candidate d values: 0, 1, 2
q_values = range(0, 3) # candidate q values: 0, 1, 2

In [19]:
# Value of `state_id` whose series is modelled.
ID = 0
# Aggregated mean demand of that state (one value per `d`) as a plain list.
train = train_agg[train_agg.state_id == ID].demand.tolist()

In [20]:
# Test demand of the selected state as a 2-D array: one row per `id`,
# one column per `d` value.
test = X_test[X_test.state_id == ID].pivot(index='id', columns='d', values='demand').values

In [21]:
# Number of rows (one per `id`) in the test array.
nrows = test.shape[0]

In [22]:
# Draw a random subset of row indices without replacement.
# A fixed seed keeps the sampled rows (and the score computed on them)
# identical across kernel restarts; capping the size avoids a ValueError
# when fewer than 100 rows are available.
rng = np.random.default_rng(42)
ridx = rng.choice(nrows, size=min(100, nrows), replace=False)

In [23]:
test = test[ridx, :] # keep only the sampled rows, because evaluating the model on every row takes too long

In [24]:
# Display the sampled test array.
test[:,:]

array([[ 0,  0,  1, ...,  0,  1,  0],
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0],
       ...,
       [ 0,  0,  0, ...,  2,  0,  1],
       [ 0,  2,  0, ...,  1,  3,  2],
       [ 0,  2,  2, ...,  9,  6, 11]], dtype=int16)

In [27]:
# Score a single ARIMA order on the sampled test rows.
# NOTE(review): the recorded output of this cell is a ValueError
# ("On entry to DLASCL parameter number 4 had an illegal value"); presumably
# the fit runs into NaN/inf values for this order — TODO confirm.
# NOTE(review): q=5 is not among the q_values candidates defined earlier.
evaluate_arima_model(train, test, (10, 1, 5), metric='rmse')

ValueError: On entry to DLASCL parameter number 4 had an illegal value