In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import dateutil.easter as easter

# Load the competition's training and test tables from the Kaggle input directory.
train = pd.read_csv('../input/tabular-playground-series-jan-2022/train.csv')
test = pd.read_csv('../input/tabular-playground-series-jan-2022/test.csv')
# External per-country economic table. dateProcess1 reads its 'GDP' column by row position.
# NOTE(review): the file is named "Best_CPI.csv" while the frame is called gdp_df and a
# 'GDP' column is read from it -- confirm which indicator this file actually holds.
gdp_df = pd.read_csv('../input/consumer-price-index-20152019-nordic-countries/Best_CPI.csv')

# Nearly all of our data is categorical, and we do not know of a clear correlation between the categories and num_sold, so we one-hot encode them (the function below uses category_encoders' OneHotEncoder)
from sklearn.preprocessing import OneHotEncoder
import category_encoders as ce
def dataProcess(x):
    """One-hot encode the ``country``, ``store`` and ``product`` columns of ``x``.

    A single category_encoders ``OneHotEncoder`` handles all three columns in
    one fit/transform pass instead of building and fitting one encoder per
    column. Each encoded column is replaced by its indicator columns; every
    other column is passed through.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame containing at least the ``country``, ``store`` and ``product``
        columns.

    Returns
    -------
    pandas.DataFrame
        The one-hot encoded frame returned by the encoder's ``fit_transform``.

    Notes
    -----
    The encoder is fitted afresh on every call, so two frames encoded by two
    separate calls are only guaranteed to share the same indicator columns if
    the encoder assigns them identically for both inputs.
    NOTE(review): presumably indicator numbering follows the order in which
    categories first appear -- confirm against the category_encoders docs.
    """
    encoder = ce.OneHotEncoder(cols=['country', 'store', 'product'])
    return encoder.fit_transform(x)
    

def dateProcess1(df, gdp_df):
    """Add calendar, holiday-distance and GDP features to ``df`` in place.

    New columns, in this order: ``day_of_month``, ``month``, ``year`` (years
    since 2015), ``gdp_list``, ``weekend``, ``friday``, ``day_of_year``,
    ``xmas_adjust``, ``easter_adj``, ``days_from_black_friday``,
    ``days_from_wed_jun`` and ``days_from_sun_nov``. The ``date`` column is
    dropped at the end.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``date`` column (``YYYY-MM-DD`` strings) and a
        ``country`` column whose values are ``Finland``, ``Norway`` or
        ``Sweden``. Mutated in place.
    gdp_df : pandas.DataFrame
        Must contain a ``GDP`` column. It is read by row position:
        row ``3 * (year - 2015) + offset`` with offsets Finland=0, Norway=1,
        Sweden=2.
        NOTE(review): this assumes the table is sorted by year and then by
        country in exactly that order -- confirm against the CSV.

    Returns
    -------
    None
        ``df`` is modified in place.

    Raises
    ------
    ValueError
        If ``country`` holds a value with no GDP row offset. The check runs
        before any column is added, so ``df`` is left untouched.
    IndexError
        If a computed GDP row position lies outside ``gdp_df``.
    """
    # Local import keeps the helper independent of notebook cell order.
    from dateutil.easter import easter as easter_sunday

    dates = pd.to_datetime(df['date'])
    years = dates.dt.year
    years_since_2015 = (years - 2015).astype('int64')

    # --- GDP lookup (vectorised; validated before df is touched) -------------
    country_offset = df['country'].map({'Finland': 0, 'Norway': 1, 'Sweden': 2})
    missing = country_offset.isna()
    if missing.any():
        unknown = sorted(df.loc[missing, 'country'].astype(str).unique())
        raise ValueError(
            'dateProcess1: no GDP row offset for country value(s) {}'.format(unknown)
        )
    gdp_rows = (3 * years_since_2015 + country_offset.astype('int64')).to_numpy()
    gdp_values = gdp_df['GDP'].to_numpy()[gdp_rows]

    # --- Plain calendar features ---------------------------------------------
    df['day_of_month'] = dates.dt.day.astype('int64')
    df['month'] = dates.dt.month.astype('int64')
    df['year'] = years_since_2015
    df['gdp_list'] = gdp_values

    df['weekend'] = dates.dt.weekday >= 5  # Saturday and Sunday
    df['friday'] = dates.dt.weekday == 4  # Friday
    df['day_of_year'] = dates.dt.dayofyear

    # --- Christmas: signed day distance to Dec 25 of the same year -----------
    xmas_date = pd.to_datetime(years.astype(str) + '-12-25')
    # Stored as strings, exactly as before, so downstream code sees the same dtype.
    df['xmas_adjust'] = (dates - xmas_date).dt.days.clip(lower=-20, upper=16).astype(str)

    # --- Easter: computed once per distinct year, not once per row -----------
    easter_by_year = {int(y): pd.Timestamp(easter_sunday(int(y))) for y in years.unique()}
    easter_date = pd.to_datetime(years.map(easter_by_year))
    easter_adj = (dates - easter_date).dt.days.clip(lower=-3, upper=60).astype(float)
    # Days 12..38 after Easter are collapsed into the single value 12.
    df['easter_adj'] = easter_adj.mask(easter_adj.isin(range(12, 39)), 12)

    def _days_from(anchor_by_year, lower, upper):
        """Clipped signed day distance from each row's date to its year's anchor date."""
        anchor = years.map({y: pd.Timestamp(d) for y, d in anchor_by_year.items()})
        return (dates - anchor).dt.days.clip(lower, upper)

    # Anchor dates are only listed for 2015-2019; other years yield NaN.
    # Black Friday
    df['days_from_black_friday'] = _days_from(
        {2015: '2015-11-27', 2016: '2016-11-25', 2017: '2017-11-24',
         2018: '2018-11-23', 2019: '2019-11-29'}, -5, 5)

    # Last Wednesday of June
    df['days_from_wed_jun'] = _days_from(
        {2015: '2015-06-24', 2016: '2016-06-29', 2017: '2017-06-28',
         2018: '2018-06-27', 2019: '2019-06-26'}, -5, 5)

    # First Sunday of November (second Sunday is Father's Day)
    df['days_from_sun_nov'] = _days_from(
        {2015: '2015-11-1', 2016: '2016-11-6', 2017: '2017-11-5',
         2018: '2018-11-4', 2019: '2019-11-3'}, -1, 9)

    # The raw date is fully represented by the derived columns now.
    df.drop(columns=['date'], inplace=True)

    
# Engineer the date/GDP features on both frames, train first and then test.
# dateProcess1 adds its columns to the frame it is given and returns nothing.
for _frame in (train, test):
    dateProcess1(_frame, gdp_df)

In [None]:


# Columns fed to the model, and the target column.
features = ['country', 'store', 'product', 'day_of_month', 'month', 'year', 'day_of_year', 'weekend', 'friday', 'xmas_adjust', 'easter_adj', 'days_from_black_friday', 'days_from_wed_jun', 'days_from_sun_nov', 'gdp_list']
labels = ['num_sold']

# scikit-learn wants a 1-D target, so flatten the single-column frame.
y_train = np.ravel(train[labels])

# Encode train and test in ONE pass so both frames get exactly the same
# one-hot columns. Fitting a separate encoder on each frame (as before) only
# lines up if both frames happen to present their categories identically.
_n_train_rows = len(train)
_encoded = dataProcess(
    pd.concat([train[features], test[features]], ignore_index=True)
)

# Split the encoded rows back apart and restore each frame's original index.
x_train = _encoded.iloc[:_n_train_rows].copy()
x_train.index = train.index
x_test = _encoded.iloc[_n_train_rows:].copy()
x_test.index = test.index

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the training rows so we can get an idea of our model's performance.
# The fixed random_state makes the split -- and therefore every validation
# score computed on it -- the same on each Restart & Run All.
x_train_train, x_train_test, y_train_train, y_train_test = train_test_split(
    x_train, y_train, test_size=0.3, random_state=42
)

In [None]:
from sklearn.ensemble import AdaBoostRegressor

In [None]:
# This is the way that the competition will grade our predictions
def SMAPE(y_true, y_pred):
    """Symmetric mean absolute percentage error on a 0-200 scale.

    Each pair contributes ``200 * |y_true - y_pred| / (|y_true| + |y_pred|)``;
    the mean over all pairs is returned.

    Parameters
    ----------
    y_true : array-like
        Actual values.
    y_pred : array-like
        Predicted values, same shape as ``y_true``.

    Returns
    -------
    float
        Mean SMAPE. A pair where both values are zero counts as zero error
        instead of producing NaN.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    abs_error = np.abs(y_true - y_pred)
    # Use |y_true| as well, so the denominator can never go negative or cancel out.
    scale = np.abs(y_true) + np.abs(y_pred)

    # Where scale == 0 both values are zero: keep the pre-filled 0 rather than 0/0.
    ratio = np.divide(abs_error, scale, out=np.zeros_like(abs_error), where=scale != 0)
    return (ratio * 200).mean()

import optuna


def objective(trial):
    """Optuna objective: hold-out SMAPE of an AdaBoostRegressor.

    Samples ``n_estimators`` and ``learning_rate`` from ``trial``, fits the
    regressor on ``x_train_train`` / ``y_train_train``, predicts
    ``x_train_test``, rounds the predictions to whole numbers and scores them
    against ``y_train_test`` with ``SMAPE``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object supplied by ``study.optimize``.

    Returns
    -------
    float
        SMAPE on the hold-out split (lower is better).
    """
    # The "rf_" parameter keys are kept as-is so the study's parameter names
    # do not change, even though the model is AdaBoost.
    n_estimators = trial.suggest_int("rf_n_estimators", 10, 5000)
    learning_rate = trial.suggest_float("rf_learning_rate", 0.001, 1.0)

    # Fixed random_state: the score then depends only on the sampled
    # hyperparameters, not on the boosting procedure's random draws.
    regressor = AdaBoostRegressor(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        random_state=42,
    )

    # Step 3: Scoring method:
    regressor.fit(x_train_train, y_train_train)
    # Round all predictions at once; np.round rounds halves to even, exactly
    # like the built-in round() that was previously applied element by element.
    y_pred = np.round(regressor.predict(x_train_test))
    return SMAPE(y_train_test, y_pred)

# Step 4: Running it
# Create a study that minimizes the value returned by `objective`, then evaluate
# 100 trials. Each trial fits a fresh AdaBoostRegressor with between 10 and 5000
# estimators.
# NOTE(review): this cell is presumably slow to run -- confirm the runtime.
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=100)