In [None]:
import pandas as pd
import numpy as np
from pandas import read_csv
from pandas import datetime
from matplotlib import pyplot as plt
from pandas import Series
from sklearn.metrics import mean_squared_error
from datetime import datetime
from ipywidgets import IntProgress

In [None]:
from math import sqrt
from multiprocessing import cpu_count
from joblib import Parallel
from joblib import delayed
from warnings import catch_warnings
from warnings import filterwarnings
from fbprophet import Prophet
from tqdm import tqdm_notebook
import os

In [None]:
import warnings
# Install a global "ignore" filter: every warning raised after this cell runs
# is suppressed for the remainder of the kernel session.
warnings.filterwarnings('ignore')

### GLOBAL VARIABLES

In [None]:
# Directory holding the pickled inputs, and the base name (no extension)
# of the preprocessed dataset pickle read from it.
INPUT_PATH = '../../../data/processed'
INPUT_FILE_NAME = 'dataproc_v001'
# Directory that receives the submission CSV when not running on Kaggle.
OUTPUT_PATH = '../../../data/submission'
# Directory that receives the pickled forecast frame.
FEATURES_FILE_PATH = '../../../data/features'
# Base names (no extension) of the submission CSV and of the forecast pickle.
OUTPUT_NAME = 'submission_002'
FEATURES_FILE_NAME = 'prophet_001'
# NOTE(review): NRUN is not referenced in the visible cells - confirm it is still needed.
NRUN = 2
# Forecast horizon: default `period` of score_model, number of F-columns in
# the submission, and number of trailing forecast rows kept per id.
DAYS_PRED = 28
# NOTE(review): METRIC is not referenced in the visible cells - confirm it is still needed.
METRIC = 'rmse'

### FUNCTIONS

In [None]:
def on_kaggle():
    """Return True when the ``KAGGLE_KERNEL_RUN_TYPE`` environment variable is set."""
    return os.environ.get("KAGGLE_KERNEL_RUN_TYPE") is not None

In [None]:
def score_model(train, cfg, period=DAYS_PRED):
    """Fit a Prophet model on one series and forecast ``period`` steps ahead.

    Parameters
    ----------
    train : pandas.DataFrame
        Training frame handed to ``Prophet.fit``; the caller in this notebook
        passes the columns ``ds`` and ``y``.
    cfg : dict
        Keyword arguments forwarded unchanged to the ``Prophet`` constructor.
    period : int, default DAYS_PRED
        Number of future periods passed to ``make_future_dataframe``.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``Prophet.predict`` for the future dataframe.
        With Prophet's default ``include_history=True`` this contains the
        historical dates followed by the ``period`` future ones.

    Notes
    -----
    Errors are NOT swallowed: any exception raised while building, fitting or
    predicting propagates to the caller. Only warnings emitted during fit and
    predict are silenced, and only inside this call.
    """
    model = Prophet(**cfg)

    # catch_warnings restores the previous warning filters on exit, so the
    # "ignore" filter below is scoped to the fit/predict calls only.
    with catch_warnings():
        filterwarnings("ignore")
        model.fit(train)
        future = model.make_future_dataframe(periods=period)
        forecast = model.predict(future)

    return forecast

In [None]:
def make_submission(test, submission):
    """Reshape long-format predictions into the submission layout and write a CSV.

    Parameters
    ----------
    test : pandas.DataFrame
        Long-format predictions with columns ``id``, ``date`` and ``demand``.
    submission : pandas.DataFrame
        Frame whose ``id`` column fixes the rows (and their order) of the output.

    The predictions are pivoted to one row per ``id`` with the date columns
    renamed ``F1`` .. ``F<DAYS_PRED>``, inner-joined onto ``submission['id']``,
    and followed by the rows of ``submission`` whose id ends with
    ``"evaluation"``. Two assertions check that no value is missing and that
    the resulting ``id`` column equals ``submission['id']``. The CSV goes to
    ``submission.csv`` on Kaggle, otherwise to ``OUTPUT_PATH/OUTPUT_NAME.csv``.
    """
    wide = (
        test[["id", "date", "demand"]]
        .pivot(index="id", columns="date", values="demand")
        .reset_index()
    )
    wide.columns = ["id"] + [f"F{day}" for day in range(1, DAYS_PRED + 1)]

    validation_rows = submission[["id"]].merge(wide, how="inner", on="id")
    evaluation_rows = submission[submission["id"].str.endswith("evaluation")]
    final = pd.concat([validation_rows, evaluation_rows])

    # Sanity checks: complete forecasts, and ids identical to the reference frame.
    assert final.drop("id", axis=1).isnull().sum().sum() == 0
    assert final["id"].equals(submission["id"])

    target = "submission.csv" if on_kaggle() else f"{OUTPUT_PATH}/{OUTPUT_NAME}.csv"
    final.to_csv(target, index=False)

### LOAD DATASET

In [None]:
# Load the preprocessed dataset.
# NOTE: unpickling can execute arbitrary code - only load files from a trusted source.
data = pd.read_pickle(f'{INPUT_PATH}/{INPUT_FILE_NAME}.pkl')

In [None]:
# Load the submission frame; its `id` column defines the rows that
# make_submission must reproduce.
submission = pd.read_pickle(f'{INPUT_PATH}/submission.pkl')

In [None]:
# Training subset: the rows whose `part` column equals 'train'.
X_train = data[data.part == 'train'] # select only train data

In [None]:
# (id, date) pairs of all non-train rows, re-indexed from 0.
# NOTE(review): id_date is not referenced again in the visible cells - confirm it is needed.
id_date = data[data.part != 'train'][["id", "date"]].reset_index(drop=True)

In [None]:
# Drop the name bound to the full dataset now that the train and non-train
# slices have been extracted from it.
del data

In [None]:
# Columns kept for modelling: series id, date, and the `demand` target.
features = ['id','date', 'demand']

In [None]:
# Restrict the training frame to the selected columns (rebinds X_train).
X_train = X_train[features]


### TRAIN MODEL

In [None]:
# Keyword arguments that score_model forwards to the Prophet constructor.
cfg_prophet = dict()

In [None]:
# Force the yearly seasonal component on and the daily one off; weekly
# seasonality is not set here, so Prophet's own default applies.
cfg_prophet['yearly_seasonality'] = True
cfg_prophet['daily_seasonality'] = False
# Prior scale for the seasonal components.
# NOTE(review): Prophet's documented default is 10; 0.1 presumably aims at much
# stronger regularisation of seasonality - confirm this is intended.
cfg_prophet['seasonality_prior_scale'] = 0.1

In [None]:
# Fit one Prophet model per series id and gather all forecasts in `preds`.
# The per-id forecasts are collected in a list and concatenated once:
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, and
# appending inside the loop re-copies the accumulated frame on every iteration.
forecasts = []

for idx in tqdm_notebook(X_train.id.unique()):
    # Prophet expects the time column to be named `ds` and the target `y`.
    train = X_train[X_train.id == idx][['date', 'demand']]
    train.columns = ['ds', 'y']

    pred = score_model(train, cfg_prophet)
    pred['id'] = idx  # tag the forecast rows with the series they belong to

    forecasts.append(pred)

# pd.concat raises on an empty list, so fall back to an empty frame when
# X_train contains no ids (matches the original initial value of `preds`).
preds = pd.concat(forecasts) if forecasts else pd.DataFrame()

    

In [None]:
# Keep only the series id, the timestamp and the point forecast (`yhat`).
test = preds[['id','ds','yhat']]

In [None]:
# Rename to the column names make_submission selects: id, date, demand.
test.columns = ['id', 'date', 'demand']

In [None]:
# Keep the last DAYS_PRED rows of every id.
# NOTE(review): assumes each id's rows are in chronological order with the
# forecast horizon last, as returned by Prophet's predict - confirm.
test = test.groupby(['id']).tail(DAYS_PRED) # we only need the last DAYS_PRED for submission

### SAVE RESULTS

In [None]:
# Build the wide submission table, validate it against `submission`, and write the CSV.
make_submission(test, submission)

### SAVE FORECASTS

In [None]:
# Persist the complete forecast frame (every Prophet output column plus `id`),
# presumably for later use as model features given the target directory.
preds.to_pickle(f'{FEATURES_FILE_PATH}/{FEATURES_FILE_NAME}.pkl')