In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import dateutil.easter as easter # Used to get Easter date each year, which is a significant holiday in Nordic Countries

# Load all of the datasets that will be used.
# Train and test tables from the Tabular Playground Series (Jan 2022) input folder.
train = pd.read_csv('../input/tabular-playground-series-jan-2022/train.csv')
test = pd.read_csv('../input/tabular-playground-series-jan-2022/test.csv')
# NOTE(review): this frame is named gdp_df but is read from a CPI file; dateProcess1
# reads its 'GDP' column by row position -- confirm the file holds the intended series.
gdp_df = pd.read_csv('../input/consumer-price-index-20152019-nordic-countries/Best_CPI.csv')
# dateProcess1 reads the 'Finland' / 'Norway' / 'Sweden' columns of this table by row position.
gdp_pc_df = pd.read_csv('../input/gdp-per-capita-finland-norway-sweden-201519/GDP_per_capita_2015_to_2019_Finland_Norway_Sweden.csv')
# dateProcess1 reads the 'macro_comp' column of this table by row position.
macro_df = pd.read_csv('../input/macroeconomic-composite-finland-norway-sweden/macro_economic_idx.csv')

# Function to process the date column, including getting holidays
def dateProcess1(df, gdp_df, gdp_pc_table=None, macro_table=None):
    """Add calendar, holiday and economic feature columns to ``df`` in place.

    The ``date`` column is parsed, used to derive every feature, and then
    dropped. All lookups into the economic tables are positional (row
    number), exactly as laid out below.

    Args:
        df: Frame with ``date`` (``YYYY-MM-DD``), ``country`` and ``product``
            columns. It is modified in place.
        gdp_df: Table with a ``GDP`` column. Row ``3 * (year - 2015) + c`` is
            used, where ``c`` is 0 for Finland, 1 for Norway, 2 for Sweden.
        gdp_pc_table: Table with ``Finland``/``Norway``/``Sweden`` columns;
            row ``year - 2015`` is used. Defaults to the module-level
            ``gdp_pc_df``.
        macro_table: Table with a ``macro_comp`` column. Rows are laid out in
            blocks of 5 years: country block (Finland 0, Sweden 15,
            Norway 30) plus product block (Hat 0, Mug 5, Sticker 10).
            Defaults to the module-level ``macro_df``.

    Returns:
        None. New columns: ``day_of_month``, ``month``, ``year``,
        ``gdp_list``, ``gdp_per_capita``, ``macro_data``, ``weekend``,
        ``friday``, ``day_of_year``, ``xmas_adjust``, ``easter_adj``,
        ``days_from_black_friday``, ``days_from_wed_jun``,
        ``days_from_sun_nov``.

    Raises:
        ValueError: If a ``country`` or ``product`` value is not one of the
            labels listed above.
    """
    # Imported here so the function does not depend on an alias created in
    # another notebook cell.
    from dateutil.easter import easter as easter_sunday

    # Fall back to the notebook-level tables when none are passed explicitly.
    if gdp_pc_table is None:
        gdp_pc_table = gdp_pc_df
    if macro_table is None:
        macro_table = macro_df

    dates = pd.to_datetime(df['date'])
    years = dates.dt.year

    # Basic date parts; the year is stored as an offset from 2015 because it
    # doubles as a row index into the economic tables.
    year_idx = (years - 2015).to_numpy(dtype='int64')
    df['day_of_month'] = dates.dt.day.to_numpy(dtype='int64')
    df['month'] = dates.dt.month.to_numpy(dtype='int64')
    df['year'] = year_idx

    # GDP data: gdp_df holds one row per (year, country), three rows per year.
    country_order = ('Finland', 'Norway', 'Sweden')
    country_pos = _codes_for(
        df['country'],
        {name: pos for pos, name in enumerate(country_order)},
        'country',
    )
    df['gdp_list'] = gdp_df['GDP'].to_numpy()[3 * year_idx + country_pos]
    per_capita = gdp_pc_table[list(country_order)].to_numpy()
    df['gdp_per_capita'] = per_capita[year_idx, country_pos]

    # Macro data: one 5-row block per (country, product) pair.
    macro_country_base = {'Finland': 0, 'Sweden': 15, 'Norway': 30}
    macro_product_base = {'Kaggle Hat': 0, 'Kaggle Mug': 5, 'Kaggle Sticker': 10}
    macro_row = (
        year_idx
        + _codes_for(df['country'], macro_country_base, 'country')
        + _codes_for(df['product'], macro_product_base, 'product')
    )
    df['macro_data'] = macro_table['macro_comp'].to_numpy()[macro_row]

    # Day-of-week flags (Monday == 0, so 5 and 6 are Saturday and Sunday).
    weekday = dates.dt.weekday
    df['weekend'] = weekday >= 5
    df['friday'] = weekday == 4
    df['day_of_year'] = dates.dt.dayofyear

    # Christmas: signed distance in days to 25 December of the same year.
    xmas_date = pd.to_datetime(years.astype(str) + '-12-25')
    df['xmas_adjust'] = (
        (dates - xmas_date).dt.days.clip(lower=-20, upper=16).astype(float)
    )

    # Easter: computed once per distinct year rather than once per row.
    easter_by_year = {
        int(year): pd.Timestamp(easter_sunday(int(year)))
        for year in years.unique()
    }
    easter_date = pd.to_datetime(years.map(easter_by_year))
    easter_adj = (
        (dates - easter_date).dt.days.clip(lower=-3, upper=60).astype(float)
    )
    # Days 12..38 after Easter are collapsed into the single value 12.
    df['easter_adj'] = easter_adj.mask(easter_adj.between(12, 38), 12.0)

    # Black Friday
    black_fri_date = years.map({2015: pd.Timestamp('2015-11-27'),
                                2016: pd.Timestamp('2016-11-25'),
                                2017: pd.Timestamp('2017-11-24'),
                                2018: pd.Timestamp('2018-11-23'),
                                2019: pd.Timestamp('2019-11-29')})
    df['days_from_black_friday'] = (dates - black_fri_date).dt.days.clip(-5, 5)

    # Last Wednesday of June
    wed_june_date = years.map({2015: pd.Timestamp('2015-06-24'),
                               2016: pd.Timestamp('2016-06-29'),
                               2017: pd.Timestamp('2017-06-28'),
                               2018: pd.Timestamp('2018-06-27'),
                               2019: pd.Timestamp('2019-06-26')})
    df['days_from_wed_jun'] = (dates - wed_june_date).dt.days.clip(-5, 5)

    # First Sunday of November (second Sunday is Father's Day)
    sun_nov_date = years.map({2015: pd.Timestamp('2015-11-1'),
                              2016: pd.Timestamp('2016-11-6'),
                              2017: pd.Timestamp('2017-11-5'),
                              2018: pd.Timestamp('2018-11-4'),
                              2019: pd.Timestamp('2019-11-3')})
    df['days_from_sun_nov'] = (dates - sun_nov_date).dt.days.clip(-1, 9)

    df.drop(columns=['date'], inplace=True)


def _codes_for(labels, mapping, what):
    """Translate each label to its integer code, rejecting unmapped labels."""
    codes = labels.map(mapping)
    unmapped = codes.isna()
    if unmapped.any():
        unknown = sorted(set(labels[unmapped].astype(str)))
        raise ValueError(f"Unexpected {what} value(s): {unknown}")
    return codes.to_numpy(dtype='int64')

# Add the date, holiday and economic feature columns to both datasets in place.
for frame in (train, test):
    dateProcess1(frame, gdp_df)

In [None]:
from sklearn.preprocessing import OneHotEncoder
import category_encoders as ce

# Nearly all of our data is categorical, and we do not know a clear correlation between categories and num_sold, so we one-hot encode it using the OneHotEncoder from category_encoders
def dataProcess(x):
    """Return ``x`` with 'country', 'store' and 'product' one-hot encoded.

    A fresh category_encoders OneHotEncoder is fitted on ``x`` for each
    column, one after another, so the encoding depends only on the frame
    that is passed in.
    """
    for column in ('country', 'store', 'product'):
        encoder = ce.OneHotEncoder(cols=[column])
        x = encoder.fit_transform(x)
    return x

In [None]:

# NOTE(review): 'gdp_per_capita' is produced by dateProcess1 but is not listed here -- confirm that is intended.
features = ['country', 'store', 'product', 'day_of_month', 'month', 'year', 'day_of_year', 'weekend', 'friday', 'xmas_adjust', 'easter_adj', 'days_from_black_friday', 'days_from_wed_jun', 'days_from_sun_nov', 'gdp_list', 'macro_data']
labels = ['num_sold']
x_train = train[features]
y_train = np.ravel(train[labels])  # scikit-learn expects a 1-D target array
x_test = test[features]

# Fit the one-hot encoder on the training data only and reuse it for the test
# data. Fitting a second encoder on the test set can assign the categories to
# different columns, which silently mis-aligns train and test features.
one_hot = ce.OneHotEncoder(cols=['country', 'store', 'product'])
x_train = one_hot.fit_transform(x_train)
x_test = one_hot.transform(x_test)

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Convert True/False values to numerical data
object_cols = ['weekend', 'friday']

# Fit once on the training data and apply the same mapping to the test data.
# (Transforming a second time would feed already-encoded values back into the
# encoder, and re-fitting on the test set could produce a different mapping.)
ordinal_encoder = OrdinalEncoder()
x_train[object_cols] = ordinal_encoder.fit_transform(x_train[object_cols])
x_test[object_cols] = ordinal_encoder.transform(x_test[object_cols])

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
from sklearn.ensemble import RandomForestRegressor
# X-treme gradient boost
from xgboost import XGBRegressor

# CatBoost
import catboost
from catboost import CatBoostRegressor

# Light Gradient Boosting Machine
from lightgbm import LGBMRegressor
from sklearn.ensemble import StackingRegressor

In [None]:
# This is the way that the competition will grade our predictions
def SMAPE(y_true, y_pred):
    """Return the symmetric mean absolute percentage error, in percent.

    Each element contributes ``200 * |y_true - y_pred| / (|y_true| + |y_pred|)``.
    An element where both values are 0 contributes 0 instead of NaN.

    Args:
        y_true: Actual values (array-like).
        y_pred: Predicted values (array-like, same shape as ``y_true``).

    Returns:
        The mean of the per-element errors as a float.
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    abs_error = np.abs(actual - predicted)
    scale = np.abs(actual) + np.abs(predicted)
    # Where scale is 0 both values are 0, so the error is defined as 0.
    ratio = np.divide(abs_error, scale, out=np.zeros_like(abs_error), where=scale != 0)
    return (ratio * 200).mean()

import optuna


def objective(trial):
    """Optuna objective: hold-out SMAPE of a random forest.

    Samples the number of trees from the trial, fits the forest on 80% of
    ``x_train``/``y_train`` and scores the rounded predictions on the
    remaining 20% with ``SMAPE``.

    Args:
        trial: The Optuna trial used to sample ``rf_n_estimators``.

    Returns:
        The SMAPE on the hold-out split (lower is better).
    """
    n_trees = trial.suggest_int("rf_n_estimators", 10, 5000)
    # Seeded so that trials differ only in their hyperparameters.
    forest = RandomForestRegressor(n_estimators=n_trees, random_state=0)

    # A fixed split means every trial is scored on the same hold-out rows;
    # otherwise split noise is mixed into the objective and trials are not comparable.
    x_fit, x_holdout, y_fit, y_holdout = train_test_split(
        x_train, y_train, test_size=0.2, random_state=0
    )

    forest.fit(x_fit, y_fit)
    # Predictions are rounded to whole numbers before scoring.
    y_pred = np.round(forest.predict(x_holdout))
    return SMAPE(y_holdout, y_pred)

# Step 4: Running it
# previous best is 4.522593002459165
# Create a study that minimises the value returned by `objective` and run 100 trials.
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=100)

def build_stacking_regressor(trial):
    """Build a stacked ensemble whose hyperparameters are sampled from ``trial``.

    The base estimators are a random forest, XGBoost, CatBoost and LightGBM;
    a second CatBoost model combines their predictions. Every sampled
    parameter has its own Optuna name, because suggesting the same name twice
    in one trial returns the same value for both.

    Args:
        trial: The Optuna trial to sample hyperparameters from.

    Returns:
        An unfitted ``StackingRegressor``.
    """
    # NOTE(review): this code previously sat at module level with mixed
    # indentation and used `trial` and `grad_boost` outside any function.
    # The forest below mirrors the one built in `objective` -- confirm.
    grad_boost = RandomForestRegressor(
        n_estimators=trial.suggest_int("rf_n_estimators", 10, 5000)
    )

    rf_n_estimators_xg = trial.suggest_int("rf_n_estimators_xg", 10, 5000)
    rf_max_depth_xg = trial.suggest_int("rf_max_depth_xg", 2, 16)
    rf_learning_rate_xg = trial.suggest_float("rf_learning_rate_xg", 0.001, 0.5)
    xg_boost = XGBRegressor(max_depth=rf_max_depth_xg, n_estimators=rf_n_estimators_xg, learning_rate=rf_learning_rate_xg)

    rf_learning_rate_cat = trial.suggest_float("rf_learning_rate_cat", 0.01, 0.5)
    rf_max_depth_cat = trial.suggest_int("rf_max_depth_cat", 2, 16)
    rf_n_estimators_cat = trial.suggest_int("rf_n_estimators_cat", 10, 5000)
    cat_boost = CatBoostRegressor(learning_rate=rf_learning_rate_cat, max_depth=rf_max_depth_cat, n_estimators=rf_n_estimators_cat)

    rf_learning_rate_lg = trial.suggest_float("rf_learning_rate_lg", 0.01, 0.5)
    rf_max_depth_lg = trial.suggest_int("rf_max_depth_lg", 2, 16)
    rf_n_estimators_lg = trial.suggest_int("rf_n_estimators_lg", 10, 5000)
    lg_boost = LGBMRegressor(learning_rate=rf_learning_rate_lg, max_depth=rf_max_depth_lg, n_estimators=rf_n_estimators_lg)

    # Meta-learner that combines the base estimators' predictions.
    learning_rate_fin = trial.suggest_float("rf_learning_rate_fin", 0.01, 0.5)
    max_depth_fin = trial.suggest_int("rf_max_depth_fin", 2, 16)
    n_estimators_fin = trial.suggest_int("rf_n_estimators_fin", 10, 5000)
    final_boost = CatBoostRegressor(learning_rate=learning_rate_fin, max_depth=max_depth_fin, n_estimators=n_estimators_fin)

    regressor_obj = StackingRegressor([('grad', grad_boost), ('xg', xg_boost), ('cat', cat_boost), ('lg', lg_boost)], final_estimator=final_boost)
    return regressor_obj
    