In [1]:
import pandas as pd
from statsmodels.tsa.stattools import adfuller
import numpy as np
from warnings import filterwarnings
import matplotlib.pyplot as plt
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
import math
import statistics as st
import statsmodels.api as sm
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.graphics.tsaplots import plot_pacf
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error
from datetime import datetime

In [2]:
def mse(labels, predictions):
    """Return the mean squared error between ``labels`` and ``predictions``.

    Both arguments must support element-wise subtraction with each other
    (e.g. NumPy arrays or pandas Series); the squared differences are
    averaged with ``np.mean``.
    """
    residuals = labels - predictions
    return np.mean(residuals ** 2)

def prediction_scores(df, step, cfg, return_predictions=False, return_labels=False):
    """Walk-forward evaluation of one (S)ARIMA configuration.

    A window of ``timesteps + horizon`` observations is rolled over ``df``.
    For every complete window a SARIMAX model is fitted on the first
    ``timesteps`` observations, the following ``horizon`` observations are
    forecast, and the forecast is scored against the rest of the window
    with ``mse``. The per-window scores are then averaged.

    Parameters
    ----------
    df : pandas Series (or DataFrame)
        Data to roll over. A month-start ('MS') frequency is assigned to the
        index of every window, so the index is assumed to be a monthly
        DatetimeIndex -- TODO confirm for other callers.
    step : sequence of two ints
        ``(timesteps, horizon)``: history length used for fitting and number
        of steps forecast.
    cfg : sequence
        Either ``[order, seasonal_order, trend]`` or ``[order, trend]``; the
        two-item form is fitted without a seasonal component.
    return_predictions : bool, default False
        Also return the list of per-window forecast Series.
    return_labels : bool, default False
        Also return the list of per-window label Series.

    Returns
    -------
    The mean score, or a tuple ``(score[, predictions][, labels])`` when any
    of the ``return_*`` flags is set.
    """
    timesteps, horizon = step
    window = timesteps + horizon

    # Accept ARIMA-style [order, trend] as well as SARIMA-style
    # [order, seasonal_order, trend] configurations.
    if len(cfg) == 2:
        order, trend = cfg
        seasonal_order = (0, 0, 0, 0)
    else:
        order, seasonal_order, trend = cfg

    predictions_all = []
    labels_all = []

    def _get_score(x, metric):
        """Fit on the head of window ``x`` and score the forecast of its tail."""
        # NaN never compares equal to anything, so `x != np.nan` is True for
        # every element; count the observed values explicitly instead.
        if pd.notna(x).sum() < window:
            return np.nan

        inputs = x[:timesteps]
        inputs.index.freq = 'MS'

        labels = x[timesteps:]
        labels_all.append(labels)

        model = SARIMAX(
            inputs,
            order=order,
            seasonal_order=seasonal_order,
            trend=trend,
            enforce_stationarity=False,
            enforce_invertibility=False,
        )
        # Forecast positions timesteps .. timesteps + horizon - 1, i.e. the
        # `horizon` steps immediately after the fitted sample.
        predictions = model.fit(disp=False).predict(timesteps, timesteps + horizon - 1)
        predictions_all.append(predictions)
        return metric(labels, predictions)

    window_scores = df.rolling(window).apply(lambda x: _get_score(x, mse))
    result = [window_scores.mean(axis=0)]
    if return_predictions:
        result.append(predictions_all)
    if return_labels:
        result.append(labels_all)
    if len(result) == 1:
        return result[0]
    return tuple(result)

# cfg = [(1,0,1), 'n']
# step = [24, 6]
# scores, pred, lab = prediction_scores(df_test['price'], step, cfg, True, True)

In [3]:
def grid_search(df, cfg_list, step):
    """Score every configuration and return the one with the lowest score.

    Parameters
    ----------
    df : data passed unchanged to ``prediction_scores``.
    cfg_list : iterable of model configurations to try.
    step : ``(timesteps, horizon)`` passed unchanged to ``prediction_scores``.

    Returns
    -------
    (best_cfg, best_score)
        The configuration with the lowest score and that score. When no
        configuration beats infinity (``cfg_list`` is empty, or every score
        is NaN) the result is ``(None, np.inf)``.

    Notes
    -----
    ``filterwarnings("ignore")`` is called here and is not undone afterwards.
    Progress is printed for every configuration.
    """
    filterwarnings("ignore")
    # Initialised up front so the return statement is valid even when no
    # configuration ever improves on the starting score.
    best_cfg = None
    best_score = np.inf

    for cfg in cfg_list:
        score = prediction_scores(df, step, cfg)
        print(f'{cfg=}, {score=}')

        # A NaN score compares False here and is therefore never selected.
        if score < best_score:
            print("Found an improved score", score,"is better than", best_score )
            best_score = score
            best_cfg = cfg

    return best_cfg, best_score

In [4]:
#Function to  create a set of arimax configs to try
def arima_config(p,d,q):
    """Build the grid of ARIMA configurations to try.

    ``p``, ``d`` and ``q`` are exclusive upper bounds: the orders generated
    are ``0 .. p-1``, ``0 .. d-1`` and ``0 .. q-1``. Every entry of the
    returned list has the form ``[(p, d, q), trend]`` and the only trend
    generated is ``'n'``.
    """
    ar_orders = range(0, p)
    diff_orders = range(0, d)
    ma_orders = range(0, q)
    trends = ['n']

    return [
        [(ar, diff, ma), trend]
        for ar in ar_orders
        for diff in diff_orders
        for ma in ma_orders
        for trend in trends
    ]

In [5]:
#Function to  create a set of sarima configs to try
def sarima_configs(p,d,q,P,D,Q, m_params=(12,), t_params=('n',)):
    """Build the grid of SARIMA configurations to try.

    Parameters
    ----------
    p, d, q : int
        Exclusive upper bounds of the non-seasonal orders (``0 .. p-1`` etc.).
    P, D, Q : int
        Exclusive upper bounds of the seasonal orders.
    m_params : iterable of int, default ``(12,)``
        Seasonal period(s) to include in the grid.
    t_params : iterable of str, default ``('n',)``
        Trend specification(s) to include in the grid.

    Returns
    -------
    list
        One ``[(p, d, q), (P, D, Q, m), trend]`` entry per combination.
    """
    # Materialise the option lists first so that generators can be passed and
    # invalid bounds fail before any configuration is produced.
    ar_orders, diff_orders, ma_orders = range(0, p), range(0, d), range(0, q)
    seasonal_ar, seasonal_diff, seasonal_ma = range(0, P), range(0, D), range(0, Q)
    trends = list(t_params)
    periods = list(m_params)

    return [
        [(ar, diff, ma), (s_ar, s_diff, s_ma, period), trend]
        for ar in ar_orders
        for diff in diff_orders
        for ma in ma_orders
        for trend in trends
        for s_ar in seasonal_ar
        for s_diff in seasonal_diff
        for s_ma in seasonal_ma
        for period in periods
    ]

In [6]:
#Function take log of price and standardize train set
def transform_data_train(df):
    """Log-transform ``price`` and standardise every column of the train set.

    ``df`` is modified in place and also returned. For each column the mean
    and the population standard deviation (``ddof=0``) are computed and the
    column is replaced by ``(column - mean) / sd``. Missing values are ignored
    when computing the statistics, so a single NaN no longer turns the whole
    column into NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``price`` column.

    Returns
    -------
    (df, mean_list, sd_list)
        The transformed frame plus the per-column means and standard
        deviations, in column order.
    """
    df["price"] = np.log(df["price"])
    mean_list = []
    sd_list = []

    for col in df.columns:
        mean = df[col].mean()
        # ddof=0 keeps the population standard deviation used before.
        sd = df[col].std(ddof=0)
        df[col] = (df[col] - mean) / sd
        mean_list.append(mean)
        sd_list.append(sd)

    return df, mean_list, sd_list

In [7]:
#Function take log of price and standardize test set
def transform_data_test(df,mean_list, sd_list):
    """Apply the train-set transformation to the test set.

    ``price`` is replaced by its natural logarithm, then the column at
    position ``k`` is standardised with ``mean_list[k]`` and ``sd_list[k]``.
    ``df`` is modified in place and also returned.
    """
    df.price = np.log(df.price)

    for position, column in enumerate(df.columns):
        centred = df[column] - mean_list[position]
        df[column] = centred / sd_list[position]

    return df

In [8]:
#Function to split dataset based on date selected
def train_test_split(df, enddate_train,startdate_test):
    """Return ``(df_train, df_test)`` sliced out of ``df``.

    The train part is ``df[:enddate_train]`` and the test part is
    ``df[startdate_test:]``; the two slices are taken independently, so they
    overlap whenever ``startdate_test`` lies before ``enddate_train``.
    """
    train_rows = slice(None, enddate_train)
    test_rows = slice(startdate_test, None)
    return df[train_rows], df[test_rows]

In [9]:
def _ith_step_series(windows, ith_step, horizon, name):
    """Pick the ``ith_step``-th value of every window and return them as one Series."""
    frame = pd.concat(windows, axis=1)
    # Number of windows, derived from the number of rows of the combined
    # frame: consecutive windows are assumed to be shifted by one row.
    n_windows = len(frame) - horizon + 1
    first_row = ith_step - 1
    values = [frame.iat[i + first_row, i] for i in range(n_windows)]
    # One index entry per window. The slice previously always spanned
    # horizon + 1 entries, which only matched when n_windows == horizon + 1.
    series = pd.Series(values, index=frame.index[first_row:first_row + n_windows])
    series.name = name
    return series


def ith_step_prediction_label(predictions, labels=None, ith_step=6, horizon=6):
    """Collect the ``ith_step``-ahead value of every rolling forecast.

    Parameters
    ----------
    predictions : list of pandas.Series
        One Series per forecast window, each holding ``horizon`` values.
        Consecutive windows are assumed to start one index step apart.
    labels : list of pandas.Series, optional
        Observed values laid out like ``predictions``.
    ith_step : int, default 6
        Which forecast step (1-based) to extract from every window.
    horizon : int, default 6
        Number of steps contained in every window.

    Returns
    -------
    pandas.Series named ``'pred'`` when ``labels`` is None, otherwise a
    DataFrame with the columns ``'pred'`` and ``'label'``.
    """
    predictions = _ith_step_series(predictions, ith_step, horizon, 'pred')
    if labels is None:
        return predictions

    labels = _ith_step_series(labels, ith_step, horizon, 'label')
    return pd.concat([predictions, labels], axis=1)


In [10]:
import pathlib
# Project paths are resolved relative to the directory the kernel runs in.
cwd = pathlib.Path.cwd()

# Grandparent of the working directory; a "notebooks" folder is expected there.
code_directory = cwd.parents[1]

bas_directory = code_directory / "notebooks" / "Bas"
gonem_directory = code_directory / "notebooks" / "Gonem"
# data_file = bas_directory / "cadeautjevoorGonemenLiza.xlsx"
# Folder the forecasting cell writes its per-product/country CSV files to.
results_directory = gonem_directory / 'arima_results'

# Input workbooks (read from gonem_directory) and the product label that
# belongs to each one; the two lists are paired by position.
df_files = ['MAIZE_FILTERED_2023-03-03_02-09-43.xlsx', 'SUNFLOWER_FILTERED_2023-03-03_02-19-29.xlsx', 'WHEAT_FILTERED_2023-03-03_02-44-24.xlsx']
products = ['Maize', 'Sunflower', 'Wheat']


In [15]:
# For every product workbook and every country in it: standardise the data,
# grid-search a SARIMA configuration on the train period and export the
# 6-step-ahead forecasts of the test period next to the observed prices.

# Dates used to split the data into train and test.
# NOTE(review): the test period starts before the train period ends, so the
# two overlap -- presumably to give the first test windows their history;
# confirm this is intended.
enddate_train = "2022-04-01"
startdate_test = "2019-11-01"

# Model configs; identical for every product/country, so built once.
p = 8
d = 2
q = 8
# cfg_list = arima_config(p,d, q)
cfg_list = sarima_configs(3,2,3,2,2,2)

step = [24,6] #history horizon and multistepforecast

# to_csv does not create missing folders; make sure the output folder exists
# before the (slow) grid search starts rather than failing after it.
results_directory.mkdir(parents=True, exist_ok=True)

for product, path_file in zip(products, df_files):
    df_all = pd.read_excel(gonem_directory / path_file, header=[0, 1], index_col=0)
    countries = df_all.columns.get_level_values(1).unique()

    for country in countries:
        # Columns of this country only (second level of the column header).
        df = df_all.xs(country, axis=1, level=1, drop_level=True)

        df_train_unedited, df_test_unedited = train_test_split(df, enddate_train, startdate_test)

        # Take log price and standardise; the test set reuses the train statistics.
        df_train = df_train_unedited.copy()
        df_train, mean_list, sd_list = transform_data_train(df_train)

        df_test = df_test_unedited.copy()
        df_test = transform_data_test(df_test, mean_list, sd_list)

        # Grid search on the train dataset for the minimum MSE.
        best_cfg, lowest_score = grid_search(df_train['price'], cfg_list, step)
        print('Gridsearch found with best parameters', best_cfg, "with MSE equal to", lowest_score)

        # The train results are not used further in this cell; they are kept
        # so they stay available for inspection.
        score_train, predictions_train, labels_train = prediction_scores(df_train['price'], step, best_cfg, return_predictions=True, return_labels=True)
        score_test, predictions_test, labels_test = prediction_scores(df_test['price'], step, best_cfg, return_predictions=True, return_labels=True)

        df_out = ith_step_prediction_label(predictions_test, labels_test, ith_step=6)
        # Undo the standardisation and the log transform of the price column.
        price_pos = df.columns.get_loc("price")
        df_out = np.exp(df_out*sd_list[price_pos]+mean_list[price_pos])

        data_file = results_directory / f'{product}_{country}_{best_cfg=}_{lowest_score=}.csv'
        df_out.to_csv(data_file)


Index(['Brazil', 'France', 'Germany', 'Hungary', 'Ukraine', 'Global'], dtype='object', name='PARTNER_Labels')
Brazil
France
Germany
Hungary
Ukraine
Global
Index(['Belgium', 'Germany', 'Hungary', 'Ukraine', 'Argentina', 'Global'], dtype='object', name='PARTNER_Labels')
Belgium
Germany
Hungary
Ukraine
Argentina
Global
Index(['Belgium', 'France', 'Germany', 'Romania', 'United Kingdom', 'Global'], dtype='object', name='PARTNER_Labels')
Belgium
France
Germany
Romania
United Kingdom
Global
