In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from darts import TimeSeries
from darts.dataprocessing import Pipeline
from darts.dataprocessing.transformers import Scaler, StaticCovariatesTransformer, MissingValuesFiller, InvertibleMapper
from darts.utils.missing_values import fill_missing_values
from darts.models import XGBModel
from darts.models import MovingAverageFilter
from darts.utils.timeseries_generation import datetime_attribute_timeseries
from darts.metrics import mae, mse, rmse, rmsle, r2_score
import matplotlib.pyplot as plt

TypeError: object.__init__() takes exactly one argument (the instance to initialize)

In [None]:
# Directory holding the competition CSV files; the read_csv calls below build their paths from it.
DATAPATH = "../data/store-sales"

In [None]:
# --- Raw tables -------------------------------------------------------------
# Tables with a calendar column get it parsed to datetime at load time.
train_df = pd.read_csv(f"{DATAPATH}/train.csv", parse_dates=['date'])
test_df = pd.read_csv(f"{DATAPATH}/test.csv", parse_dates=['date'])
oil_df = pd.read_csv(f"{DATAPATH}/oil.csv", parse_dates=['date'])
transactions_df = pd.read_csv(f"{DATAPATH}/transactions.csv", parse_dates=['date'])
df_sample_submission = pd.read_csv(f"{DATAPATH}/sample_submission.csv")

# Both of these tables ship a column called 'type'; give each its own name.
stores_df = pd.read_csv(f"{DATAPATH}/stores.csv").rename(columns={'type': 'store_type'})
holidays_df = (
    pd.read_csv(f"{DATAPATH}/holidays_events.csv", parse_dates=['date'])
    .rename(columns={'type': 'holiday_type'})
)

# Test rows grouped per (store, family); used later to line forecasts up with ids.
df_test_sorted = test_df.sort_values(by=['store_nbr', 'family'])

# Distinct product families and store numbers, in order of first appearance.
family_list = train_df['family'].unique()
store_list = stores_df['store_nbr'].unique()

In [None]:
# Attach the columns of stores_df to every training row (left join on store_nbr).
# NOTE: train_df is rebound to the merged frame, so executing this cell a second time
# merges the store columns again and pandas suffixes the duplicates (_x/_y).
# Re-run from the load cell rather than re-executing this cell on its own.
train_df = train_df.merge(stores_df, on='store_nbr', how='left')

In [None]:
# Build, for every product family, the list of per-store sales series.
# ts_dict maps family name -> list of TimeSeries ordered by store number.
ts_dict = {}

for family in family_list:
    df_family = train_df.loc[train_df['family'] == family]

    # One series per (store_nbr, family) group; gaps in the daily index are inserted as NaN.
    family_ts = TimeSeries.from_group_dataframe(
        df_family,
        time_col="date",
        group_cols=["store_nbr", "family"],
        static_cols=["city", "state", "store_type", "cluster"],
        value_cols="sales",
        fill_missing_dates=True,
        freq='D')

    # TimeSeries.astype returns a new series. The previous loop only rebound its loop
    # variable, so the float32 cast never reached the list; rebuild the list instead.
    family_ts = [ts.astype(np.float32) for ts in family_ts]

    # Order by the first static covariate, read here as the integer store number.
    ts_dict[family] = sorted(family_ts, key=lambda ts: int(ts.static_covariates_values()[0, 0]))

In [None]:
# Per-family preprocessing: the fitted pipeline is kept (for inverse_transform later)
# next to the transformed training series.
pipeline_dict = {}
transform_dict = {}

for family_name, family_series in ts_dict.items():
    steps = [
        MissingValuesFiller(verbose=False, n_jobs=-1, name="Fill NAs"),
        StaticCovariatesTransformer(verbose=False, transformer_cat=OneHotEncoder(), name="Encoder"),
        InvertibleMapper(np.log1p, np.expm1, verbose=False, n_jobs=-1, name="Log-Transform"),
        Scaler(verbose=False, n_jobs=-1, name="Scaling"),
    ]
    family_pipeline = Pipeline(steps)

    # Fit on, and transform, the family's full list of store series in one call.
    transform_dict[family_name] = family_pipeline.fit_transform(family_series)
    pipeline_dict[family_name] = family_pipeline

In [None]:
# Calendar covariates over the whole train + forecast horizon.
full_time_period = pd.date_range(start='2013-01-01', end='2017-08-31', freq='D')

# One single-component series per calendar attribute.
year = datetime_attribute_timeseries(time_index=full_time_period, attribute="year")
month = datetime_attribute_timeseries(time_index=full_time_period, attribute="month")
day = datetime_attribute_timeseries(time_index=full_time_period, attribute="day")
dayofyear = datetime_attribute_timeseries(time_index=full_time_period, attribute="dayofyear")
weekday = datetime_attribute_timeseries(time_index=full_time_period, attribute="dayofweek")
weekofyear = datetime_attribute_timeseries(time_index=full_time_period, attribute="weekofyear")

# Plain running day counter (0, 1, 2, ...) as a trend feature.
timesteps = TimeSeries.from_times_and_values(
    times=full_time_period,
    values=np.arange(len(full_time_period)),
    columns=["linear_increase"],
)

# Stack the components left to right, starting from `year`, then cast to float32.
time_cov = year
for component in (month, day, dayofyear, weekday, weekofyear, timesteps):
    time_cov = time_cov.stack(component)
time_cov = time_cov.astype(np.float32)

In [None]:
# Fit the scaler only on the part before 2017-08-16, then apply it to the full range.
time_cov_train, time_cov_val = time_cov.split_before(pd.Timestamp('20170816'))

time_cov_scaler = Scaler(verbose=False, n_jobs=-1, name="Scaler")
time_cov_scaler.fit(time_cov_train)

time_cov_transformed = time_cov_scaler.transform(time_cov)

In [None]:
# Daily oil price as a float32 series.
oil = TimeSeries.from_dataframe(
    oil_df,
    time_col='date',
    value_cols=['dcoilwtico'],
    freq='D',
).astype(np.float32)

# Fill gaps, then scale.
oil_filler = MissingValuesFiller(verbose=False, n_jobs=-1, name="Filler")
oil_scaler = Scaler(verbose=False, n_jobs=-1, name="Scaler")
oil_pipeline = Pipeline([oil_filler, oil_scaler])
oil_transformed = oil_pipeline.fit_transform(oil)

# 7-day and 28-day moving averages of the transformed oil price.
oil_moving_average_7 = MovingAverageFilter(window=7)
oil_moving_average_28 = MovingAverageFilter(window=28)

ma_7 = oil_moving_average_7.filter(oil_transformed).astype(np.float32)
ma_28 = oil_moving_average_28.filter(oil_transformed).astype(np.float32)

# Give each average an explicit component name before stacking them together.
ma_7 = ma_7.with_columns_renamed(col_names=ma_7.components, col_names_new="oil_ma_7")
ma_28 = ma_28.with_columns_renamed(col_names=ma_28.components, col_names_new="oil_ma_28")

oil_moving_averages = ma_7.stack(ma_28)

In [None]:
def holiday_list(df_stores):
    """Build one holiday-indicator DataFrame per store.

    Reads the module-level ``holidays_df`` (columns used: ``date``, ``holiday_type``,
    ``locale``, ``locale_name``, ``description``).

    Parameters
    ----------
    df_stores : pandas.DataFrame
        One row per store; the ``state`` and ``city`` columns are used to flag
        local holidays.

    Returns
    -------
    list of pandas.DataFrame
        One frame per row of ``df_stores``, in row order, sharing the index of
        ``holidays_df`` and holding ``date`` plus the 0/1 columns
        ``national_holiday``, ``earthquake_relief``, ``christmas``,
        ``football_event``, ``national_event``, ``work_day`` and ``local_holiday``.
    """
    holiday_type = holidays_df["holiday_type"]
    locale_name = holidays_df["locale_name"]
    description = holidays_df["description"]

    is_holiday = holiday_type == "Holiday"
    is_national = holidays_df["locale"] == "National"
    is_earthquake = description.str.contains('Terremoto Manabi')
    is_football = description.str.contains('futbol')

    # Everything except `local_holiday` is identical for all stores: compute it once.
    shared = pd.DataFrame({"date": holidays_df["date"]})
    shared["national_holiday"] = np.where(is_holiday & is_national, 1, 0)
    shared["earthquake_relief"] = np.where(is_earthquake, 1, 0)
    shared["christmas"] = np.where(description.str.contains('Navidad'), 1, 0)
    shared["football_event"] = np.where(is_football, 1, 0)
    # National events that are neither the earthquake relief nor a football event.
    shared["national_event"] = np.where(
        (holiday_type == "Event") & is_national & (~is_earthquake) & (~is_football), 1, 0)
    # The raw 'type' column is renamed to 'holiday_type' when holidays_df is loaded,
    # so the work-day flag has to read the renamed column.
    shared["work_day"] = np.where(holiday_type == "Work Day", 1, 0)

    listofseries = []

    # Iterate positionally so a non-default index on df_stores cannot break the lookup.
    for state, city in zip(df_stores['state'], df_stores['city']):
        df_holiday_dummies = shared.copy()
        df_holiday_dummies["local_holiday"] = np.where(
            is_holiday & ((locale_name == state) | (locale_name == city)), 1, 0)
        listofseries.append(df_holiday_dummies)

    return listofseries

In [None]:
def remove_0_and_duplicates(holiday_list):
    """Drop all-zero rows and collapse duplicate dates in each holiday frame.

    Parameters
    ----------
    holiday_list : list of pandas.DataFrame
        Frames with a ``date`` column and the seven 0/1 indicator columns
        aggregated below.

    Returns
    -------
    list of pandas.DataFrame
        One frame per input frame, in the same order, with one row per remaining
        date (``date`` restored as a column) and each indicator set to its
        maximum over the rows that shared that date.
    """
    listofseries = []

    # Work on the frames that were passed in; the function used to read the
    # module-level `list_of_holidays_per_store` and ignore its argument.
    for df_holiday in holiday_list:
        df_holiday_per_store = df_holiday.set_index('date')

        # Keep only the dates on which at least one indicator is non-zero.
        df_holiday_per_store = df_holiday_per_store.loc[~(df_holiday_per_store == 0).all(axis=1)]

        # A date can appear several times; 'max' keeps a flag if any of its rows set it.
        df_holiday_per_store = df_holiday_per_store.groupby('date').agg(
            {'national_holiday': 'max', 'earthquake_relief': 'max',
             'christmas': 'max', 'football_event': 'max',
             'national_event': 'max', 'work_day': 'max',
             'local_holiday': 'max'}).reset_index()

        listofseries.append(df_holiday_per_store)

    return listofseries

In [None]:
def holiday_TS_list_54(holiday_list):
    """Convert per-store holiday frames into daily float32 TimeSeries.

    Parameters
    ----------
    holiday_list : list of pandas.DataFrame
        Frames with a ``date`` column; every other column becomes a component.

    Returns
    -------
    list of TimeSeries
        One series per input frame, in the same order, on a daily index with
        missing dates filled with 0 and restricted to 2013-01-01 .. 2017-08-31.
    """
    window_start = pd.Timestamp('20130101')
    window_end = pd.Timestamp('20170831')

    listofseries = []

    # Iterate over the argument itself. The function used to read the module-level
    # `list_of_holidays_per_store` and assume exactly 54 entries.
    for df_holiday_per_store in holiday_list:
        holidays_ts = TimeSeries.from_dataframe(df_holiday_per_store,
                                                time_col='date',
                                                fill_missing_dates=True,
                                                fillna_value=0,
                                                freq='D')

        holidays_ts = holidays_ts.slice(window_start, window_end)
        holidays_ts = holidays_ts.astype(np.float32)
        listofseries.append(holidays_ts)

    return listofseries

In [None]:
# Holiday covariates, built in three stages. Keep these statements separate and in this
# order: each stage consumes the result the previous line bound to
# `list_of_holidays_per_store`.
list_of_holidays_per_store = holiday_list(stores_df)
list_of_holidays_per_store = remove_0_and_duplicates(list_of_holidays_per_store)
list_of_holidays_store = holiday_TS_list_54(list_of_holidays_per_store)

holidays_filler = MissingValuesFiller(verbose=False, n_jobs=-1, name="Filler")
holidays_scaler = Scaler(verbose=False, n_jobs=-1, name="Scaler")

# Fill then scale the whole list of per-store holiday series in one call.
holidays_pipeline = Pipeline([holidays_filler, holidays_scaler])
holidays_transformed = holidays_pipeline.fit_transform(list_of_holidays_store)

In [None]:
# Promotion counts are available for both the training and the test period, so the
# two frames are concatenated before the per-store series are built.
df_promotion = pd.concat([train_df, test_df], axis=0)
df_promotion = df_promotion.sort_values(["store_nbr","family","date"])

# family name -> list of per-store 'onpromotion' series
family_promotion_dict = {}

for family in family_list:
    df_family = df_promotion.loc[df_promotion['family'] == family]

    list_of_TS_promo = TimeSeries.from_group_dataframe(
        df_family,
        time_col="date",
        group_cols=["store_nbr","family"],
        value_cols="onpromotion",
        fill_missing_dates=True,
        freq='D')

    # TimeSeries.astype returns a new series. The previous loop only rebound its loop
    # variable, so the float32 cast was discarded; rebuild the list instead.
    list_of_TS_promo = [ts.astype(np.float32) for ts in list_of_TS_promo]

    family_promotion_dict[family] = list_of_TS_promo

In [None]:
# family name -> list of per-store series holding the scaled promotion count plus its
# 7-day and 28-day moving averages.
promotion_transformed_dict = {}

for family_name, promo_series in family_promotion_dict.items():
    promo_pipeline = Pipeline([
        MissingValuesFiller(verbose=False, n_jobs=-1, name="Fill NAs"),
        Scaler(verbose=False, n_jobs=-1, name="Scaling"),
    ])
    promotion_transformed = promo_pipeline.fit_transform(promo_series)

    # Moving-average filters, each paired with the component name of its output.
    smoothers = (
        (MovingAverageFilter(window=7), "promotion_ma_7"),
        (MovingAverageFilter(window=28), "promotion_ma_28"),
    )

    promotion_covs = []

    for ts in promotion_transformed:
        promo_and_mas = ts
        for smoother, component_name in smoothers:
            # Round-trip through a pandas Series to get a bare series before renaming.
            averaged = TimeSeries.from_series(smoother.filter(ts).pd_series())
            averaged = averaged.astype(np.float32)
            averaged = averaged.with_columns_renamed(col_names=averaged.components,
                                                     col_names_new=component_name)
            promo_and_mas = promo_and_mas.stack(averaged)
        promotion_covs.append(promo_and_mas)

    promotion_transformed_dict[family_name] = promotion_covs

In [None]:
# Covariates shared by every store: scaled calendar features, scaled oil price,
# and the oil moving averages, stacked as components of one series.
general_covariates = time_cov_transformed.stack(oil_transformed).stack(oil_moving_averages)

In [None]:
# Per store: that store's holiday indicators followed by the shared covariates.
store_covariates_future = [
    holidays_transformed[store].stack(general_covariates)
    for store in range(len(store_list))
]

In [None]:
# family name -> list of per-store future covariates
# (promotion features stacked with the store-level covariates at the same list position).
future_covariates_dict = {}

for family_name, promotion_family in promotion_transformed_dict.items():
    future_covariates_dict[family_name] = [
        promotion_family[i].stack(store_covariates_future[i])
        for i in range(len(promotion_family))
    ]

In [None]:
# Past covariates: daily transaction counts per store.
# Rebind instead of sorting in place so the cell gives the same result when re-run.
transactions_df = transactions_df.sort_values(["store_nbr","date"])

TS_transactions_list = TimeSeries.from_group_dataframe(
    transactions_df,
    time_col="date",
    group_cols=["store_nbr"],
    value_cols="transactions",
    fill_missing_dates=True,
    freq='D')

# Round-trip through a pandas Series to get bare float32 series.
transactions_list = [
    TimeSeries.from_series(ts.pd_series()).astype(np.float32)
    for ts in TS_transactions_list
]

# The series at list position 24 is cut to 2013-01-02 .. 2017-08-15 so that it is
# zero-padded below like the others.
transactions_list[24] = transactions_list[24].slice(start_ts=pd.Timestamp('20130102'), end_ts=pd.Timestamp('20170815'))

from datetime import datetime, timedelta

transactions_list_full = []

for ts in transactions_list:
    if ts.start_time() > pd.Timestamp('20130101'):
        # Pad with zeros from 2013-01-01 up to the day before the first observation.
        end_time = (ts.start_time() - timedelta(days=1))
        delta = end_time - pd.Timestamp('20130101')
        zero_series = TimeSeries.from_times_and_values(
            times=pd.date_range(start=pd.Timestamp('20130101'),
                                end=end_time, freq="D"),
            values=np.zeros(delta.days+1))
        ts = zero_series.append(ts)
    # Rename and append unconditionally. When this happened only inside the `if`, a
    # series already starting on 2013-01-01 was silently dropped, which would shift
    # every later store's covariates by one position.
    ts = ts.with_columns_renamed(col_names=ts.components, col_names_new="transactions")
    transactions_list_full.append(ts)

transactions_filler = MissingValuesFiller(verbose=False, n_jobs=-1, name="Filler")
transactions_scaler = Scaler(verbose=False, n_jobs=-1, name="Scaler")

transactions_pipeline = Pipeline([transactions_filler, transactions_scaler])
transactions_transformed = transactions_pipeline.fit_transform(transactions_list_full)

In [None]:
# One model per product family, trained on all of that family's store series at once.
# NOTE: the variable names say "LGBM" but the estimator is darts' XGBModel.
LGBM_Models_Submission = {}

display("Training...")

for family in family_list:

    sales_family = transform_dict[family]
    training_data = [ts for ts in sales_family]
    TCN_covariates = future_covariates_dict[family]
    # Restrict each target series to the time span covered by its future covariates.
    train_sliced = [training_data[i].slice_intersect(TCN_covariates[i]) for i in range(0,len(training_data))]

    # `gpu_use_dp` was dropped: it is a LightGBM option, not an XGBoost one, so it
    # could not influence this model.
    LGBM_Model_Submission = XGBModel(lags=63,
                                     lags_future_covariates=(14, 1),
                                     # Transactions lagged by 16..22 days.
                                     lags_past_covariates=[-16, -17, -18, -19, -20, -21, -22],
                                     output_chunk_length=1,
                                     random_state=2022,
                                     )

    LGBM_Model_Submission.fit(series=train_sliced,
                              future_covariates=TCN_covariates,
                              past_covariates=transactions_transformed)

    LGBM_Models_Submission[family] = LGBM_Model_Submission

In [None]:
display("Predictions...")

# family name -> list of 16-step forecasts, one per store (still in transformed scale).
LGBM_Forecasts_Families_Submission = {}

for family in family_list:

    sales_family = transform_dict[family]
    training_data = [ts for ts in sales_family]
    LGBM_covariates = future_covariates_dict[family]
    # Slice against this family's own covariates. The previous code used
    # `TCN_covariates`, a variable left over from the training cell that still
    # pointed at the last family trained (and is undefined if that cell was skipped).
    train_sliced = [training_data[i].slice_intersect(LGBM_covariates[i]) for i in range(0,len(training_data))]

    forecast_LGBM = LGBM_Models_Submission[family].predict(n=16,
                                                           series=train_sliced,
                                                           future_covariates=LGBM_covariates,
                                                           past_covariates=transactions_transformed)

    LGBM_Forecasts_Families_Submission[family] = forecast_LGBM

In [None]:
# Undo each family's preprocessing pipeline to bring the forecasts back to the sales scale.
# partial=True is passed through to the pipeline's inverse_transform.
LGBM_Forecasts_Families_back_Submission = {
    family: pipeline_dict[family].inverse_transform(
        LGBM_Forecasts_Families_Submission[family], partial=True)
    for family in family_list
}

In [None]:
# Zero-forecast rule: when the last 21 observed sales values of a series are all 0,
# replace its forecast with zeros.
for family, family_forecasts in LGBM_Forecasts_Families_back_Submission.items():
    for n in range(len(family_forecasts)):
        recent_sales = ts_dict[family][n].univariate_values()[-21:]
        if not (recent_sales == 0).all():
            continue
        family_forecasts[n] = family_forecasts[n].map(lambda x: x * 0)

# Collect the forecasts store by store and, within a store, family by family, so the
# row order lines up with df_test_sorted (sorted by store_nbr, then family).
listofseries = []

for store in range(54):
    for family in family_list:
        oneforecast = LGBM_Forecasts_Families_back_Submission[family][store].pd_dataframe()
        oneforecast.columns = ['fcast']
        listofseries.append(oneforecast)

# Single column of forecasts on a fresh 0..n-1 index.
df_forecasts = pd.concat(listofseries, ignore_index=True)

# No Negative Forecasts
df_forecasts = df_forecasts.clip(lower=0)

# Attach the forecasts positionally to the sorted test rows, then restore id order
# and keep only what the submission needs.
forecasts_kaggle = pd.concat(
    [df_test_sorted, df_forecasts.set_index(df_test_sorted.index)], axis=1)
forecasts_kaggle_sorted = (
    forecasts_kaggle
    .sort_values(by=['id'])
    .drop(columns=['date', 'store_nbr', 'family'])
    .rename(columns={"fcast": "sales"})
    .reset_index(drop=True)
)

# Submission
submission_kaggle = forecasts_kaggle_sorted
submission_kaggle.to_csv('submission.csv', index=False)

In [None]:
# Disabled evaluation snippet: it is a bare string literal, so none of the calls below run.
# NOTE(review): y_val / y_pred are not defined in the cells visible here — define them
# before turning this back into code.
"""print("MAE = {:.2f}%".format(mae(y_val, y_pred)))
print("MSE = {:.2f}%".format(mse(y_val, y_pred)))
print("RMSE = {:.2f}%".format(rmse(y_val, y_pred)))
print("RMSLE = {:.2f}%".format(rmsle(y_val, y_pred)))
print("R2 = {:.2f}%".format(r2_score(y_val, y_pred)))"""
