In [15]:
import numpy as np
import pandas as pd
from matplotlib import pyplot
from pandas import Series
from pandas import datetime
from pandas import read_csv
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.arima_model import ARIMA

In [2]:
# Install a blanket 'ignore' filter so that no warning (of any category) is
# shown from this point on.
# NOTE(review): this also hides warnings that may signal real problems
# (e.g. model-fitting issues) — consider narrowing the filter.
import warnings
warnings.filterwarnings('ignore')

### GLOBAL VARIABLES

In [26]:
# Relative path of the directory the data files are read from.
INPUT_PATH = '../../../data/processed'
# Base name (extension is appended at load time) of the processed dataset file.
INPUT_FILE_NAME = 'dataproc_v001'
# NOTE(review): presumably the forecast horizon in days; it is not referenced
# by any cell visible in this notebook — confirm it is still needed.
DAYS_PRED = 28

### FUNCTIONS

In [4]:
def rmse(y_true, y_pred):
    """Return the root-mean-squared error between two sequences.

    Parameters
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Predicted values, passed to ``mean_squared_error`` together with
        ``y_true``.

    Returns
    -------
    The square root (``np.sqrt``) of ``mean_squared_error(y_true, y_pred)``.

    Notes
    -----
    Requires ``numpy`` to be imported as ``np`` in the notebook's import cell.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

In [5]:
def parser(x):
    """Turn a partial date string into a ``datetime``.

    The literal prefix ``'190'`` is prepended to ``x`` and the result is
    parsed with the ``'%Y-%m'`` (year-month) format.

    NOTE(review): presumably ``x`` looks like ``'1-01'`` (single year digit,
    dash, month) — confirm against the data this parser is applied to.
    """
    stamp = '190' + x
    return datetime.strptime(stamp, '%Y-%m')

In [6]:
def evaluate_arima_model(train, test, arima_order, metric='rmse'):
    """Score one ARIMA order with walk-forward (one-step-ahead) validation.

    For every observation in ``test`` a fresh ARIMA model is fitted on the
    history seen so far, a forecast is taken, and the true observation is
    then appended to the history before the next step.

    Parameters
    ----------
    train : iterable of numbers
        Initial history used to fit the first model.
    test : iterable of numbers
        Held-out observations, consumed in order.
    arima_order : tuple
        Passed unchanged as ``order=`` to ``ARIMA``.
    metric : {'rmse', 'mse'}, default 'rmse'
        Error measure computed between ``test`` and the forecasts.

    Returns
    -------
    The out-of-sample error for the requested metric.

    Raises
    ------
    ValueError
        If ``metric`` is neither ``'mse'`` nor ``'rmse'``.
    """
    # Fail fast: an unsupported metric would otherwise only surface after all
    # the (expensive) model fits, as an unbound-variable error.
    if metric not in ('mse', 'rmse'):
        raise ValueError("metric must be 'mse' or 'rmse', got %r" % (metric,))

    # Materialise both inputs as plain lists so they are walked by position.
    # Indexing a pandas Series with an integer (`test[t]`) is label-based and
    # breaks when the index does not start at 0.
    history = list(train)
    actuals = list(test)

    predictions = []
    for observed in actuals:
        model = ARIMA(history, order=arima_order)
        model_fit = model.fit(disp=0)
        # First element of forecast()'s result.
        # NOTE(review): presumably the point forecast in the legacy
        # statsmodels ARIMA API — confirm for the installed version.
        yhat = model_fit.forecast()[0]
        predictions.append(yhat)
        # Reveal the true value before forecasting the next step.
        history.append(observed)

    # Out-of-sample error over the whole test window.
    if metric == 'mse':
        return mean_squared_error(actuals, predictions)
    return rmse(actuals, predictions)

In [7]:
def evaluate_models(train, test, p_values, d_values, q_values, metric='rmse'):
    """Grid-search ARIMA orders and report the best-scoring one.

    Every combination ``(p, d, q)`` drawn from the three candidate iterables
    is scored with ``evaluate_arima_model``; the score of each successful
    configuration and the overall best are printed.

    Parameters
    ----------
    train, test : objects exposing ``.astype``
        Training and hold-out series; both are cast to ``float32`` first.
    p_values, d_values, q_values : iterable
        Candidate values for each position of the ARIMA order tuple.
    metric : {'rmse', 'mse'}, default 'rmse'
        Error measure forwarded to ``evaluate_arima_model`` and used as the
        label in the printed report.

    Returns
    -------
    tuple
        ``(best_cfg, best_score)``; ``(None, inf)`` when no configuration
        could be evaluated.

    Raises
    ------
    ValueError
        If ``metric`` is neither ``'mse'`` nor ``'rmse'``.
    """
    # Reject a bad metric once, here, instead of letting it fail (and be
    # swallowed) separately for every configuration in the grid.
    if metric not in ('mse', 'rmse'):
        raise ValueError("metric must be 'mse' or 'rmse', got %r" % (metric,))

    train = train.astype('float32')
    test = test.astype('float32')
    best_score, best_cfg = float("inf"), None
    for p in p_values:
        for d in d_values:
            for q in q_values:
                order = (p, d, q)
                try:
                    # Forward `metric` so the printed label matches the score.
                    score = evaluate_arima_model(train, test, order, metric=metric)
                    if score < best_score:
                        best_score, best_cfg = score, order
                    print('ARIMA%s %s=%.3f' % (order, metric, score))
                except Exception as exc:
                    # Best-effort search: a configuration that cannot be
                    # evaluated is skipped, but the reason is reported so
                    # systematic failures are not silently hidden.
                    print('ARIMA%s skipped: %s: %s' % (order, type(exc).__name__, exc))
                    continue
    print('Best ARIMA%s %s=%.3f' % (best_cfg, metric, best_score))
    return best_cfg, best_score

### LOAD DATASET

data = pd.read_pickle(f'{INPUT_PATH}/{INPUT_FILE_NAME}.pkl')

In [28]:
# Load the processed dataset from its semicolon-delimited CSV file.
data = pd.read_csv(f'{INPUT_PATH}/{INPUT_FILE_NAME}.csv', sep=';')

In [29]:
# Preview the first rows to check that the columns were parsed as expected.
data.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,demand,part,date,wm_yr_wk,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price
0,HOBBIES_1_001_CA_1_validation,1437,3,1,0,0,1211,1,train,2014-05-23,11416,,,,,0,0,0,8.26
1,HOBBIES_1_002_CA_1_validation,1438,3,1,0,0,1211,0,train,2014-05-23,11416,,,,,0,0,0,3.97
2,HOBBIES_1_003_CA_1_validation,1439,3,1,0,0,1211,0,train,2014-05-23,11416,,,,,0,0,0,2.97
3,HOBBIES_1_004_CA_1_validation,1440,3,1,0,0,1211,1,train,2014-05-23,11416,,,,,0,0,0,4.64
4,HOBBIES_1_005_CA_1_validation,1441,3,1,0,0,1211,2,train,2014-05-23,11416,,,,,0,0,0,3.08


In [31]:
# List the distinct values present in the state_id column.
data.state_id.unique()

array([0, 1, 2])

In [None]:
# List the distinct values present in the part column.
data.part.unique()

In [None]:
# Load the saved X_train array from the processed-data directory.
# NOTE(review): the original note says this holds the folds; the array layout
# is not visible here — confirm before use.
X_train_vector = np.load(f'{INPUT_PATH}/X_train.npy') # It loads a vector with the folds

In [None]:
# Load the saved Y_train array from the processed-data directory.
# NOTE(review): presumably the targets matching X_train.npy — confirm.
Y_train_vector = np.load(f'{INPUT_PATH}/Y_train.npy')

### TRAIN MODEL

In [11]:
# Candidate grid for the ARIMA order tuple (p, d, q) searched by evaluate_models.
p_values = [0, 1, 2, 4, 6, 8, 10]
d_values = range(0, 3) # d candidates: 0, 1, 2
q_values = range(0, 3) # q candidates: 0, 1, 2

In [None]:
# Run the grid search over every (p, d, q) combination from the lists above.
# NOTE(review): `train` and `test` are not assigned in any visible cell, so
# this call raises NameError on a fresh kernel unless they are defined
# elsewhere — confirm and add the split cell.
evaluate_models(train,
                test, p_values, d_values, q_values)