In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
sns.set(rc = {'figure.figsize':(30,16)})
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file found.
kaggle_inputs = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in kaggle_inputs:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

This worksheet is heavily inspired by: https://www.kaggle.com/carlmcbrideellis/tps-jan-2022-a-simple-average-model-no-ml

In many cases I copied and pasted code chunks from it.

In [None]:
# https://www.kaggle.com/c/web-traffic-time-series-forecasting/discussion/36414

def SMAPE(y_true, y_pred):
    """Symmetric mean absolute percentage error, in percent (0 to 200).

    Each element contributes ``|y_true - y_pred| / ((|y_true| + |y_pred|) / 2) * 100``.
    Pairs where both values are zero contribute 0 instead of a division by zero.

    Parameters
    ----------
    y_true, y_pred : array-like
        Actual and predicted values (list, NumPy array or pandas Series).
        Values are compared positionally; a pandas index is ignored.

    Returns
    -------
    numpy.float64
        The mean of the per-element errors, rounded to 5 decimals.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    # Absolute values on BOTH terms: without abs() on y_true, opposite-sign
    # pairs cancel to a zero denominator and are scored as a perfect match.
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 200.0

    # Divide only where the denominator is non-zero; the remaining entries keep
    # the 0.0 they were initialised with, so no divide-by-zero warning is raised.
    diff = np.divide(
        np.abs(y_true - y_pred),
        denominator,
        out=np.zeros_like(denominator),
        where=denominator != 0,
    )
    return np.round(np.mean(diff), 5)

In [None]:
# Load the competition data plus the two auxiliary datasets (GDP per capita and
# Nordic holidays). Every path is built from one absolute root, so the cell no
# longer depends on the kernel's working directory (previously '/kaggle/input/...'
# and '../input/...' were mixed).
INPUT_DIR = '/kaggle/input'
COMPETITION_DIR = os.path.join(INPUT_DIR, 'tabular-playground-series-jan-2022')

train_all = pd.read_csv(os.path.join(COMPETITION_DIR, 'train.csv'), parse_dates=['date'])
test_all = pd.read_csv(os.path.join(COMPETITION_DIR, 'test.csv'), parse_dates=['date'])
GDP_data = pd.read_csv(
    os.path.join(
        INPUT_DIR,
        'gdp-per-capita-finland-norway-sweden-201519',
        'GDP_per_capita_2015_to_2019_Finland_Norway_Sweden.csv',
    )
)
festivities = pd.read_csv(
    os.path.join(INPUT_DIR, 'festivities-in-finland-norway-sweden-tsp-0122', 'nordic_holidays.csv'),
    parse_dates=['date'],
)

display(train_all)
festivities

In [None]:
display(train_all)

# Attach the holiday name (NaN on ordinary days) to every row by date and country.
merge_keys = ['date', 'country']

# Keep at most one holiday per (date, country). A duplicated key in the lookup
# would duplicate sales rows in a left merge, inflating the training data and
# producing repeated row_ids in the submission.
holiday_lookup = festivities[['date', 'holiday', 'country']].drop_duplicates(subset=merge_keys)

# Dropping an existing 'holiday' column first keeps the cell idempotent:
# re-running it would otherwise create 'holiday_x' / 'holiday_y'.
train_all = pd.merge(
    train_all.drop(columns='holiday', errors='ignore'),
    holiday_lookup,
    how='left',
    on=merge_keys,
    validate='many_to_one',
)
test_all = pd.merge(
    test_all.drop(columns='holiday', errors='ignore'),
    holiday_lookup,
    how='left',
    on=merge_keys,
    validate='many_to_one',
)
display(train_all)

## Let's add more features

In [None]:
from dateutil.easter import easter

# Easter window size: the window spans 2*n + 1 days, counted from Easter Sunday onwards
n = 4

def engineer_features(series, easter_window=None, gdp_data=None):
    """Add calendar, Easter and GDP feature columns to ``series`` in place.

    Columns added: ``year``, ``quarter``, ``month``, ``week`` (ISO week),
    ``day``, ``dayofyear``, ``daysinmonth``, ``dayofweek``, ``weekend``
    (1 for Saturday/Sunday), ``easter`` and ``GDP``.

    Parameters
    ----------
    series : pandas.DataFrame
        Frame with a datetime ``date`` column and a ``country`` column.
        It is modified in place.
    easter_window : int, optional
        The ``easter`` column holds the number of days since Easter Sunday for
        dates 0 .. 2 * easter_window days after it, and -1 everywhere else.
        Defaults to the notebook-level ``n``.
    gdp_data : pandas.DataFrame, optional
        Table with a ``year`` column and one column per country name.
        Defaults to the notebook-level ``GDP_data``.

    Returns
    -------
    pandas.DataFrame
        The same ``series`` object, returned for convenience.

    Raises
    ------
    ValueError
        If ``gdp_data`` does not contain exactly one row for a year in ``series``.
    """
    # source: https://www.kaggle.com/c/tabular-playground-series-jan-2022/discussion/298300
    if easter_window is None:
        easter_window = n
    if gdp_data is None:
        gdp_data = GDP_data

    dates = series['date'].dt
    series['year'] = dates.year
    series['quarter'] = dates.quarter
    series['month'] = dates.month
    series['week'] = dates.isocalendar().week.astype(int)
    series['day'] = dates.day
    series['dayofyear'] = dates.dayofyear
    series['daysinmonth'] = dates.days_in_month
    series['dayofweek'] = dates.dayofweek
    # dayofweek 5 and 6 are Saturday and Sunday
    series['weekend'] = (dates.dayofweek // 5 == 1).astype(int)

    # Easter: compute the day offset from that year's Easter Sunday for every
    # row at once instead of scanning the whole frame once per window day.
    easter_by_year = {
        year: pd.Timestamp(easter(int(year))) for year in series['year'].unique()
    }
    easter_sunday = pd.to_datetime(series['year'].map(easter_by_year))
    days_after_easter = (series['date'] - easter_sunday).dt.days
    in_window = days_after_easter.between(0, 2 * easter_window)
    series['easter'] = days_after_easter.where(in_window, -1).astype(int)

    # GDP: start with a float column so the looked-up values never have to be
    # forced into an integer column.
    series['GDP'] = 0.0
    for c in series['country'].unique():
        for y in series['year'].unique():
            # .item() insists on exactly one matching row for the year
            gdp = gdp_data.loc[gdp_data.year == y, c].item()
            series.loc[(series.country == c) & (series.year == y), 'GDP'] = gdp

    # Holiday names are not added here: they are merged into the frames before
    # this function is called.
    return series
 

In [None]:
# Check which columns are available before the feature engineering is applied.
train_all.columns

In [None]:
# Run the shared feature engineering on both frames (they are modified in place).
frames = (train_all, test_all)
for frame in frames:
    engineer_features(frame)

# Shapes after the new columns were added.
for frame in frames:
    print(frame.shape)

# First rows of each engineered frame.
for frame in frames:
    display(frame.head())

In [None]:
# Hold out the last training year (2018) as a validation set for comparing models.
decision_date = pd.Timestamp("2018-01-01")

before_cutoff = train_all["date"] < decision_date
from_cutoff = train_all["date"] >= decision_date

train = train_all.loc[before_cutoff].copy()
display(train.shape)
test = train_all.loc[from_cutoff].copy()
display(test.shape)

## Random Numbers model

In [None]:
# I. random numbers
# Baseline: uniformly random integer predictions in [50, 150).
# A seeded generator keeps the reported score identical on every "Run All".
rng = np.random.default_rng(0)
test['prediction'] = rng.integers(low=50, high=150, size=len(test))

# score
SMAPE(test["num_sold"], test["prediction"])

## Average over all training data

In [None]:
# II. constant baseline: predict the mean of all training sales for every row
overall_mean = train["num_sold"].mean()
test["prediction"] = overall_mean

# score
SMAPE(test["num_sold"], test["prediction"])

## Average for each category: country, store, product, ...

In [None]:
basic_categories = ['country', 'store', 'product']

# Mean sales per (country, store, product), stored as {tuple_key: mean}.
train_means = (
    train
    .groupby(basic_categories)['num_sold']
    .mean()
    .to_dict()
)

# Build the matching tuple key for every hold-out row and look up its group mean.
test_keys = test.set_index(basic_categories).index
test['prediction'] = test_keys.map(train_means.get)

# score
SMAPE(test["num_sold"], test["prediction"])


In [None]:
categories = ['country', 'store', 'product', 'week', 'dayofweek']

# Same group-mean model as before, now also split by ISO week and day of week.
train_means = (
    train
    .groupby(categories)['num_sold']
    .mean()
    .to_dict()
)
test_keys = test.set_index(categories).index
test['prediction'] = test_keys.map(train_means.get)

# score
SMAPE(test["num_sold"], test["prediction"])

## Special model for easter 

In [None]:
easter_categories = ['country', 'store', 'product']

# Start from the generic prediction, then overwrite every day of the Easter
# window with the training mean for that same Easter-relative day.
test["prediction_easter"] = test['prediction']
for easter_day in range(2*n + 1):
    train_mask = train['easter'] == easter_day
    test_mask = test['easter'] == easter_day

    easter_means = (
        train.loc[train_mask]
        .groupby(easter_categories)['num_sold']
        .mean()
        .to_dict()
    )
    day_keys = test.loc[test_mask].set_index(easter_categories).index
    test.loc[test_mask, 'prediction_easter'] = day_keys.map(easter_means.get)

# score
SMAPE(test["num_sold"], test["prediction_easter"])

In [None]:
# Plot actual sales (dashed) against the averaging model (solid) for one country/store.
country = "Finland"
store   = "KaggleMart"
in_selection = (test["country"] == country) & (test["store"] == store)
one_country_and_store = test.loc[in_selection].copy()

fig, ax = plt.subplots(figsize=(20, 7))
sns.lineplot(
    data=one_country_and_store, x="date", y="num_sold", hue="product",
    linewidth=2, linestyle='--', ax=ax,
)
sns.lineplot(
    data=one_country_and_store, x="date", y="prediction", hue="product",
    linewidth=3.5, ax=ax,
)
# An empty legend hides the per-product entries that both plots would add.
ax.legend([], [], frameon=False);

## Random Forest

Let's see how the Random Forest model compares to the averages.

In [None]:
# Forest initialization
from sklearn.ensemble import RandomForestRegressor

# One regressor with default hyper-parameters and a fixed random_state.
# This same object is re-fitted by each of the forest cells that follow.
model = RandomForestRegressor(random_state=2)

## Only Basic Categories

In [None]:
# Random Forest on the one-hot encoded basic categories only.
forest_categories = basic_categories
f_target = train['num_sold']

# get_dummies returns a new frame, so the inputs are left untouched.
f_train = pd.get_dummies(train[forest_categories])
f_predict = pd.get_dummies(test[forest_categories])

display(f_predict)

# Fit on the training split and predict the held-out rows.
model.fit(f_train, f_target)
preds = model.predict(f_predict)
test["prediction_forest"] = preds

# score
SMAPE(test["num_sold"], test["prediction_forest"])

**Lesson learned:** The Random Forest score for the given categories is equal to the score for averaging over these categories.

## More categories - features from datetime

### the same categories which I use for averaging

In [None]:
# Random Forest on the averaging categories plus the holiday name.
forest_categories = categories + ['holiday']
f_train = pd.get_dummies(train[forest_categories])
f_target = train['num_sold']

# A holiday name that occurs in only one of the two splits yields a dummy column
# in only one frame. Give the prediction frame exactly the training columns, in
# the same order: dummies missing from it become 0 and ones the model has never
# seen are dropped.
f_predict = pd.get_dummies(test[forest_categories]).reindex(columns=f_train.columns, fill_value=0)

model.fit(f_train, f_target)
preds = model.predict(f_predict)

test["prediction_forest"] = preds

# score
SMAPE(test["num_sold"], test["prediction_forest"])

In [None]:
# Plot actual sales (dashed) against the forest predictions (solid) for one country/store.
country = "Finland"
store   = "KaggleMart"
in_selection = (test["country"] == country) & (test["store"] == store)
one_country_and_store = test.loc[in_selection].copy()

fig, ax = plt.subplots(figsize=(20, 7))
sns.lineplot(
    data=one_country_and_store, x="date", y="num_sold", hue="product",
    linewidth=2, linestyle='--', ax=ax,
)
sns.lineplot(
    data=one_country_and_store, x="date", y="prediction_forest", hue="product",
    linewidth=3.5, ax=ax,
)
# An empty legend hides the per-product entries that both plots would add.
ax.legend([], [], frameon=False);

## And now with all features engineered from datetime

In [None]:
# Random Forest on every engineered date feature plus the holiday name.
forest_categories = basic_categories + ['year', 'quarter','month', 'week','day','dayofyear', 'daysinmonth', 'dayofweek', 'weekend', 'easter']
forest_categories = forest_categories + ['holiday']

f_train = pd.get_dummies(train[forest_categories])  # encode categorical data
display(f_train.columns)
f_target = train['num_sold']

f_predict = pd.get_dummies(test[forest_categories])

# Some holiday dummies may exist in only one of the two frames, so align them on
# the union of columns. fill_value=0 encodes an absent dummy as "not this
# holiday" instead of leaving NaN.
f_train_aligned, f_predict_aligned = f_train.align(f_predict, join="outer", axis=1, fill_value=0)
display(f_train_aligned.shape)
display(f_predict_aligned.shape)

# Fit and predict on the ALIGNED frames so both have identical columns
# (previously the unaligned frames were passed to the model).
model.fit(f_train_aligned, f_target)
preds = model.predict(f_predict_aligned)

test["prediction_forest"] = preds

# score
SMAPE(test["num_sold"], test["prediction_forest"])

In [None]:
# Plot actual sales (dashed) against the forest predictions (solid) for one country/store.
country = "Finland"
store   = "KaggleMart"
in_selection = (test["country"] == country) & (test["store"] == store)
one_country_and_store = test.loc[in_selection].copy()

fig, ax = plt.subplots(figsize=(20, 7))
sns.lineplot(
    data=one_country_and_store, x="date", y="num_sold", hue="product",
    linewidth=2, linestyle='--', ax=ax,
)
sns.lineplot(
    data=one_country_and_store, x="date", y="prediction_forest", hue="product",
    linewidth=3.5, ax=ax,
)
# An empty legend hides the per-product entries that both plots would add.
ax.legend([], [], frameon=False);

#TODO: add GDP column, print residuals-> find holidays for each country, boosting from tutorial

### Try to blend avg and forest

In [None]:
# blending: row-wise mean of the Easter-adjusted averages and the forest predictions
predictions = ['prediction_easter', 'prediction_forest']
to_blend = test.loc[:, predictions]
test['blended'] = to_blend.mean(axis='columns')

# score
SMAPE(test["num_sold"], test["blended"])

Result: much worse than Forest itself

## XGBoost


In [None]:
from xgboost import XGBRegressor

# Gradient-boosted trees on the same dummy-encoded features as the last forest cell.
xgb_model = XGBRegressor(n_estimators=500, learning_rate=0.1)
xgb_model.fit(f_train, f_target)

# The prediction frame can have different dummy columns than the training frame
# (holiday names that occur in only one split). Give it exactly the training
# columns, in the same order: missing dummies become 0 and unseen ones are dropped.
f_predict_xgb = f_predict.reindex(columns=f_train.columns, fill_value=0)
preds = xgb_model.predict(f_predict_xgb)

test["prediction_xgb"] = preds

# score
SMAPE(test["num_sold"], test["prediction_xgb"])


In [None]:
# blending: row-wise mean of the XGBoost and Random Forest predictions
predictions = ['prediction_xgb', 'prediction_forest']
to_blend = test.loc[:, predictions]
test['blended'] = to_blend.mean(axis='columns')

# score
SMAPE(test["num_sold"], test["blended"])

## CatBoost


In [None]:
from catboost import CatBoostRegressor

# CatBoost regressor with a fixed seed, trained without log output on the
# column-aligned dummy frames.
cb_model = CatBoostRegressor(random_state=142)
cb_model.fit(f_train_aligned, f_target, verbose=False)

# Store the hold-out predictions both on the frame and in `preds`.
test["prediction_cb"] = preds = cb_model.predict(f_predict_aligned)

# score
SMAPE(test["num_sold"], test["prediction_cb"])

In [None]:
# blending: row-wise mean of the XGBoost and CatBoost predictions
predictions = ['prediction_xgb', 'prediction_cb']
to_blend = test.loc[:, predictions]
test['blended'] = to_blend.mean(axis='columns')

# score
SMAPE(test["num_sold"], test["blended"])

## Apply model to the whole dataset

In [None]:
# prepare datasets: one-hot encode the full training data and the competition test set
f_train = pd.get_dummies(train_all[forest_categories])
f_target = train_all['num_sold']
display(f_train.shape)

f_predict = pd.get_dummies(test_all[forest_categories])
display(f_predict.shape)

# Holiday dummies can differ between the two frames, so align them on the union
# of columns. fill_value=0 encodes an absent dummy as "not this holiday" instead
# of leaving NaN in the model input.
f_train_aligned, f_predict_aligned = f_train.align(f_predict, join="outer", axis=1, fill_value=0)
display(f_train_aligned.shape)
display(f_predict_aligned.shape)

# Only XGBoost is fitted here. Random Forest (noted as not working well with the
# current number of features) and CatBoost / blending were tried in this cell
# but are not used for the submission.

## XGB
xgb_model.fit(f_train_aligned, f_target)
preds = xgb_model.predict(f_predict_aligned)
test_all["prediction_xgb"] = preds

# show in graph: XGBoost predictions for one country/store
# (previously both plot calls were commented out, leaving an empty figure)
country = "Finland"
store   = "KaggleMart"
one_country_and_store = test_all.query("country == @country & store == @store").copy()

fig, ax = plt.subplots(figsize=(20, 7))
sns.lineplot(data=one_country_and_store, x="date", y="prediction_xgb", hue="product", linewidth=3.5, ax=ax)
ax.legend([], [], frameon=False);


In [None]:
## model
#train_means = train.groupby(categories)['num_sold'].mean().to_dict()
#test_all['prediction'] = test_all.set_index(categories).index.map(train_means.get)
#
## show in graph
#country = "Finland"
#store   = "KaggleMart"
#one_country_and_store = test_all.query("country == @country & store == @store").copy()
#
#fig, ax = plt.subplots(figsize=(20, 7))
##sns.lineplot(data=one_country_and_store, x="date", y="num_sold", hue="product", linewidth = 2, linestyle='--')
#sns.lineplot(data=one_country_and_store,  x="date", y="prediction", hue="product", linewidth = 3.5)
#plt.legend([],[], frameon=False);
#


### Adjust Easter

In [None]:
## special model for easter 
#test_all['prediction_easter'] = test_all['prediction']
#for easter_day in range(2*n + 1):
#    #print(f"{easter_day}\n")
#    train_subset = train_all[train_all.easter == easter_day]
#    easter_means = train_subset.groupby(easter_categories)['num_sold'].mean().to_dict()
#    #print(f"{easter_means}\n")
#    test_subset = test_all[test_all.easter == easter_day]
#    test_all.loc[test_all.easter == easter_day, 'prediction_easter'] = test_subset.set_index(easter_categories).index.map(easter_means.get)
#    
#
## show in graph
#country = "Finland"
#store   = "KaggleMart"
#one_country_and_store = test_all.query("country == @country & store == @store").copy()
#
#fig, ax = plt.subplots(figsize=(20, 7))
#sns.lineplot(data=one_country_and_store, x="date", y="prediction", hue="product", linewidth = 2, linestyle='--')
#sns.lineplot(data=one_country_and_store,  x="date", y="prediction_easter", hue="product", linewidth = 3.5)
#plt.legend([],[], frameon=False);

## Save Submission

In [None]:
# Build the submission: one XGBoost prediction per test row id, written without the index.
output = (
    test_all[['row_id', 'prediction_xgb']]
    .rename(columns={'prediction_xgb': 'num_sold'})
)
output.to_csv('submission.csv', index=False)