#### Importing Dataset

In [1]:
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline

from preprocessing.ImputeMean import ImputeMean
from preprocessing.TrainTestSplit import TrainTestSplit
from preprocessing.ZeroSales import ZeroSales
from preprocessing.DataAggregator import DataAggregator

from feature_engineering.Lag import Lag
from feature_engineering.Log import Log

calendar_df = pd.read_csv('E:/Documents/TanXor/Dataset/calendar.csv')
sales_df = pd.read_csv('E:/Documents/TanXor/Dataset/sales_train_validation.csv')

#### Data Transformation

In [2]:
date = calendar_df['date'].iloc[:1913]
col1 = 'store_id'
col2 = 'dept_id'
data = sales_df

In [3]:
aggreg = DataAggregator(data, col1, col2, date)
sales = aggreg.aggregate()
sales = aggreg.transform()

zero_neg = ZeroSales(sales).zero_sales()
ImputeMean(sales, 0).imputer()
train_data, test_data = TrainTestSplit(sales, 0.2).data_split()

### Feature Engineering

In [4]:
train_data = Lag(train_data).lag_transform(7, train_data.columns[0])
#train_data = Log(train_data).log_transform(train_data.columns[0])

In [5]:
train_data.columns[-1]

'CA_1_FOODS_1_lag7'

### Stationarity Test

In [6]:
%pip install statsmodels

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.1.2 -> 23.2.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [7]:
### Testing For Stationarity

from statsmodels.tsa.stattools import adfuller
import statsmodels.api.tsa.statespace.SARIMAX as SARIMAX

ModuleNotFoundError: No module named 'statsmodels.api.tsa'; 'statsmodels.api' is not a package

In [None]:
def adfuller_test(data):
    result=adfuller(data)
    return result[1]

In [None]:
for i in sales.columns[:70]:
    if (adfuller_test(sales[i]) <= 0.05):
        print(i, "Pass")
    else:
        print(i, "Fail")

### Model Training

In [None]:
sarimax = SARIMAX(train_data, 
                    order=(3,1,3), 
                    seasonal_order=(0,1,1,7),
                    freq='D')
                                    
# Fit the model
output = sarimax.fit()

In [None]:
train_data['forecast']=results.predict(start=1780,end=1913,dynamic=True)
train_data[['Sales','forecast']].iloc[1740:].plot()

### Hyperparamter Tuning

In [None]:
import warnings
from statsmodels.tools.sm_exceptions import ValueWarning

# Ignore ValueWarning from statsmodels
warnings.simplefilter('ignore', ValueWarning)

In [None]:
### Import Packages ###
'''
import itertools

### Define Parameter Ranges to Test ###

# Note: higher numbers will result in code taking much longer to run
# Here we have it set to test p,d,q each = 0, 1 & 2

# Define the p, d and q parameters to take any value between 0 and 3 (exclusive)
p = range(1, 6)
q = range(0, 6)
d = range(1, 2)
P = range(0, 4)
Q = range(0, 4)

# Generate all different combinations of p, q and q triplets
pdq = list(itertools.product(p, d, q))

# Generate all different combinations of seasonal p, q and q triplets
# Note: here we have 12 in the 's' position as we have monthly data
# You'll want to change this according to your time series' frequency
pdqs = [(x[0], x[1], x[2], 7) for x in list(itertools.product(P, d, Q))]

### Run Grid Search ###

# Note: this code will take a while to run

# Define function
def sarimax_gridsearch(ts, pdq, pdqs, freq='D'):
    
    Input: 
        ts : your time series data
        pdq : ARIMA combinations from above
        pdqs : seasonal ARIMA combinations from above
        maxiter : number of iterations, increase if your model isn't converging
        frequency : default='M' for month. Change to suit your time series frequency
            e.g. 'D' for day, 'H' for hour, 'Y' for year. 
        
    Return:
        Prints out top 5 parameter combinations
        Returns dataframe of parameter combinations ranked by BIC
    

    # Run a grid search with pdq and seasonal pdq parameters and get the best BIC value
    ans = []
    for comb in pdq:
        for combs in pdqs:
            try:
                mod = sm.tsa.statespace.SARIMAX(ts, # this is your time series you will input
                                                order=comb,
                                                seasonal_order=combs,
                                                enforce_stationarity=False, 
                                                enforce_invertibility=False,
                                                freq=freq)

                output = mod.fit(maxiter=1000)
                predictions = output.predict(start=1800,end=1913,dynamic=True)

                test_data = ts.iloc[1800:1913]
                mape = np.mean(np.abs((test_data - predictions) / test_data)) * 100

                ans.append([comb, combs, output.bic, mape])
                print('SARIMAX {} x {}12 : MAPE Calculated ={}'.format(comb, combs, mape))
            except:
                continue
            
    # Find the parameters with minimal BIC value

    # Convert into dataframe
    ans_df = pd.DataFrame(ans, columns=['pdq', 'pdqs', 'bic', 'mape'])

    # Sort and return top 5 combinations
    ans_df = ans_df.sort_values(by=['mape'],ascending=True)
    
    return ans_df
    
'''

In [None]:
### Apply function to your time series data ###

# Remember to change frequency to match your time series data
# best_params = sarimax_gridsearch(df_1['Sales'], pdq, pdqs, freq='D')

In [None]:
# best_params.head(20)

In [None]:
# Build SARIMAX model w/optimal parameters
sarimax=SARIMAX(df_1['Sales'],
                order=(3,1,3), 
                seasonal_order=(0,1,1,7), 
                enforce_stationarity=False, 
                enforce_invertibility=False,
                freq='D')
                                    
# Fit the model
output = sarimax.fit()
    
# Print output summary
#print(output.summary())

# Plot diagnostics
#output.plot_diagnostics(figsize=(16,10));

In [None]:
df_1['forecast']=output.predict(start=1800,end=1913,dynamic=True)
df_1[['Sales','forecast']].iloc[1800:].plot()

In [None]:
%pip install optuna

In [None]:
import optuna
from sklearn.model_selection import cross_val_score
#from sklearn import datasets

def objective(trial):
    p = trial.suggest_int('p', 0, 6)
    # d = trial.suggest_int('d', 1, 3)
    q = trial.suggest_int('q', 0, 6)
    P = trial.suggest_int('P', 0, 6)
    Q = trial.suggest_int('Q', 0, 6)
    # m = trial.suggest_int('m', 3, 8)
    srx = sm.tsa.statespace.SARIMAX(df_1['Sales'], 
                                    order=(p,1,q), 
                                    seasonal_order=(P,1,Q,7), 
                                    enforce_stationarity=False, 
                                    enforce_invertibility=False,
                                    freq='D')
    
    output = srx.fit(maxiter=1000)
    predictions = output.predict(start=1800,end=1913,dynamic=True)

    test_data = df_1['Sales'].iloc[1800:1913]
    mape = np.mean(np.abs((test_data - predictions) / test_data)) * 100

    return mape

In [None]:
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=10)

In [None]:
trial = study.best_trial
print('Mape: {}'.format(trial.value))

In [None]:
print("Best hyperparameters: {}".format(trial.params))