In [5]:
import pandas as pd
import numpy as np
from datetime import date, datetime, time, timedelta
import matplotlib.pyplot as plt
import matplotlib as mpl

from sklearn.metrics import mean_squared_error

from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder


%config InlineBackend.figure_format = 'retina' 
%matplotlib inline

#models
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.ensemble import AdaBoostRegressor
from catboost import CatBoostRegressor

 
# Use a thicker default line width for every matplotlib line drawn in this notebook.
mpl.rcParams['lines.linewidth'] = 2

# NOTE(review): `figsize` is not referenced by any cell visible here — presumably a
# default figure size; confirm it is still needed.
figsize=(12,9)

# Seed NumPy's global random generator.
np.random.seed(238746)


import warnings
# NOTE(review): this silences *all* warnings, including deprecation notices emitted by
# pandas / scikit-learn for calls used further down.
warnings.filterwarnings("ignore")

In [6]:
# NOTE(review): not referenced by any cell visible here — presumably the date used to
# split train from test; confirm or remove.
SplitTestDate = "2018-12-02"

In [57]:
# Base learners of the stacking ensemble, keyed by the name that is also used to look
# up their input columns in `features_for_model`.
models = dict(
    #"linear" : LinearRegression(),
    xgb=xgb.XGBRegressor(n_estimators=100, learning_rate=0.06),
    forest=RandomForestRegressor(n_estimators=60, random_state=1234),
    #"ada": AdaBoostRegressor(random_state=123, n_estimators=500)
    #"cat": CatBoostRegressor(iterations=500,learning_rate=0.5,eval_metric='MAPE')
)

# Input columns per base learner. Both learners currently receive the same columns;
# the comprehension builds a separate list object for each of them.
features_for_model = {
    learner_name: [
        'scaled_month',
        "group_A", "group_B", "BRAND2", "BRAND4",
        'scaled_dayofyear', 'scaled_dayofmonth', 'scaled_weekofyear',
        "scaled_sales1", "scaled_sales2", "scaled_sales3",
        'scaled_promo', 'scaled_diff1', 'scaled_diff2', 'percentage_diff1',
        'scaled_price', "scaled_rolling2",
    ]
    for learner_name in ("xgb", "forest")
}

# Model passed to StackingPred as the final (second-stage) regressor.
finalreg = xgb.XGBRegressor(n_estimators=500, learning_rate=0.5)

In [58]:
# NOTE(review): `quantile_transformer` is not used by any cell visible here.
quantile_transformer = preprocessing.QuantileTransformer(output_distribution='normal', random_state=123)
# Shared scaler instance; it is the default `scaler` argument of scale() and unscale().
min_max_scaler = preprocessing.MinMaxScaler()

def scale(feature, scaler = min_max_scaler):
    """Fit ``scaler`` on ``feature`` and return the transformed values.

    Args:
        feature: 1-D sequence of values (anything ``len()`` and ``np.array`` accept).
        scaler: object exposing ``fit_transform``; defaults to the shared
            ``min_max_scaler``. It is (re)fitted by this call.

    Returns:
        1-D array with one transformed value per element of ``feature``.
    """
    n_values = len(feature)
    # The scaler expects a 2-D column, so present the values as shape (n_values, 1).
    as_column = np.array([feature]).reshape(n_values, 1)
    transformed = scaler.fit_transform(as_column)
    # Back to a flat vector: transpose to a single row and take it.
    return transformed.T[0]
def unscale(scaled, original, scaler = min_max_scaler):
    """Map ``scaled`` values back to the range of ``original``.

    The scaler is refitted on ``original`` and its inverse transform is then
    applied to ``scaled``.

    Args:
        scaled: 1-D sequence of scaled values to convert back.
        original: 1-D sequence of unscaled reference values the scaler is fitted on.
        scaler: object exposing ``fit`` and ``inverse_transform``; defaults to the
            shared ``min_max_scaler``. It is refitted by this call.

    Returns:
        1-D array with one unscaled value per element of ``scaled``.
    """
    n_scaled = len(scaled)
    n_original = len(original)
    # Both inputs are presented to the scaler as single-column 2-D arrays.
    reference_column = np.array([original]).reshape(n_original, 1)
    fitted = scaler.fit(reference_column)
    scaled_column = np.array([scaled]).reshape(n_scaled, 1)
    restored = fitted.inverse_transform(scaled_column)
    return restored.T[0]

In [59]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error of ``y_pred`` against ``y_true``.

    Both inputs are converted to NumPy arrays, so they are compared element by
    element in positional order. The result is expressed in percent.
    """
    actual = np.array(y_true)
    forecast = np.array(y_pred)
    relative_error = (actual - forecast) / actual
    return np.abs(relative_error).mean() * 100

def plot_results(ytrue, prediction, label):
    """Plot actual values against predictions and return their MAPE.

    Args:
        ytrue: data used to build a DataFrame that contains the column ``label``.
        prediction: predicted values, stored in a new ``'predicted'`` column.
        label: name of the column of ``ytrue`` holding the actual values.

    Returns:
        The mean absolute percentage error between the two columns, which is
        also shown in the plot title.
    """
    frame = pd.DataFrame(ytrue)
    frame['predicted'] = prediction

    # Draw the actual series first, then the forecast, on the current figure.
    for column, legend_name in ((label, 'data'), ('predicted', 'predicted')):
        plt.plot(frame[column], label=legend_name)
    plt.xticks(rotation=45)

    mape = mean_absolute_percentage_error(frame[label], frame['predicted'])
    plt.title("Forecasting on Test Set MAPE=%.3f"%mape)
    plt.legend()
    plt.show()
    return mape

def _rescaled_results(prediction, y_test, rescale):
    """Build the per-row table of scaled and unscaled targets and predictions.

    For every SKU in ``y_test`` the matching predictions are converted back to
    the original target range with ``unscale``, which refits the scaler on that
    SKU's ``target`` values found in ``rescale``.

    Args:
        prediction: 1-D array-like of scaled predictions, one per row of
            ``y_test`` and in the same row order.
        y_test: DataFrame with columns ``date``, ``sku``, ``scaled_target`` and
            ``target``.
        rescale: DataFrame with columns ``sku`` and ``target`` holding the
            unscaled reference targets.

    Returns:
        DataFrame indexed by ``date`` with columns ``sku``, ``target``,
        ``prediction``, ``scaled_target`` and ``scaled_prediction``; rows that
        contain missing values are dropped.

    Raises:
        ValueError: if ``prediction`` and ``y_test`` differ in length.
    """
    prediction = np.asarray(prediction)
    if len(prediction) != len(y_test):
        raise ValueError(
            "prediction has %d rows but y_test has %d" % (len(prediction), len(y_test))
        )

    per_sku = []
    for sku in y_test["sku"].unique():
        # Positional mask: predictions are matched to y_test by row order, not by label.
        is_sku = (y_test["sku"] == sku).to_numpy()
        rows = y_test.loc[is_sku]
        pred_sku = prediction[is_sku]
        originals = rescale.loc[rescale["sku"] == sku, "target"]

        per_sku.append(pd.DataFrame({
            "date" : rows["date"],
            "sku" : rows["sku"],
            "target" : rows["target"],
            "prediction" : unscale(pred_sku, originals),
            "scaled_target" : rows["scaled_target"],
            "scaled_prediction" : pred_sku,
        }))

    if not per_sku:
        # No rows at all: return an empty table with the usual layout.
        empty = pd.DataFrame(columns=["date", "sku", "target", "prediction",
                                      "scaled_target", "scaled_prediction"])
        return empty.set_index("date")

    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(per_sku).dropna().set_index("date")


def scaled_mape(prediction, y_test, rescale):
    """Return the MAPE of the predictions after undoing the target scaling.

    Args:
        prediction: 1-D array-like of scaled predictions aligned row by row with
            ``y_test``.
        y_test: DataFrame with columns ``date``, ``sku``, ``scaled_target`` and
            ``target``.
        rescale: DataFrame with columns ``sku`` and ``target`` used to refit the
            scaler per SKU.

    Returns:
        Mean absolute percentage error (in percent) between ``target`` and the
        unscaled predictions.
    """
    results = _rescaled_results(prediction, y_test, rescale)
    return mean_absolute_percentage_error(results["target"],results["prediction"])


def plot_scaled_results(prediction, y_test, rescale):
    """Plot unscaled predictions against the actual targets.

    Takes the same arguments as ``scaled_mape``.

    Returns:
        Tuple ``(mape, results)``: the MAPE returned by ``plot_results`` and the
        date-indexed table of targets and predictions it was computed from.
    """
    results = _rescaled_results(prediction, y_test, rescale)
    return plot_results(results["target"], results["prediction"], "target"), results

In [60]:
def predict_training(model,X,y,label = "scaled_target",span = 24):
    """Produce out-of-fold predictions for every row of the training set.

    The rows are cut into consecutive folds of ``span`` rows (the last fold
    also takes the remainder). For each fold the model is fitted on all other
    rows and used to predict the held-out fold.

    Args:
        model: estimator exposing ``fit(X, y)`` and ``predict(X)``; it is
            refitted once per fold.
        X: feature DataFrame.
        y: DataFrame aligned row by row with ``X``, containing the columns
            ``date``, ``sku`` and ``label``.
        label: name of the target column in ``y``.
        span: number of rows per fold.

    Returns:
        DataFrame with columns ``date``, ``sku`` and ``pred``, in the row order
        of ``y`` and carrying ``y``'s index. It is empty when there are fewer
        than ``span`` rows.

    Raises:
        ValueError: if ``X`` and ``y`` differ in length.
    """
    n_rows = len(X)
    if len(y) != n_rows:
        raise ValueError("X has %d rows but y has %d" % (n_rows, len(y)))

    #k = number of folds
    k = int(n_rows/span)

    fold_predictions = []
    for fold in range(0,k):
        start = fold*span
        # The last fold absorbs the rows left over after the integer division.
        stop = (fold+1)*span if fold != k - 1 else n_rows

        # Folds are selected by position, so duplicate index labels cannot leak
        # rows between the training and validation parts.
        held_out = np.zeros(n_rows, dtype=bool)
        held_out[start:stop] = True

        validation = X.iloc[held_out]
        y_val = y.iloc[held_out]

        model.fit(X.iloc[~held_out], y.iloc[~held_out][label])

        fold_predictions.append(pd.DataFrame({
            "date" : y_val["date"].to_numpy(),
            "sku" : y_val["sku"].to_numpy(),
            "pred" : model.predict(validation),
        }, index = y_val.index))

    if not fold_predictions:
        return pd.DataFrame(columns = ["date","sku","pred"])

    # DataFrame.append was removed in pandas 2.0; concatenate all folds once.
    return pd.concat(fold_predictions)

In [61]:
def StackingPred(models,final_model, X_train, y_train, X_test,y_test,features_for_model,rescale = None, span = 24, label = "scaled_target"):
    """Forecast the test period week by week with a two-stage stacked model.

    For every distinct date in ``y_test`` (in order of appearance):

    1. each base model produces out-of-fold predictions on the current
       training set via ``predict_training``;
    2. each base model is refitted on the whole training set and predicts the
       rows of that date;
    3. ``final_model`` is fitted on SKU indicator columns plus the base-model
       training predictions, and predicts the rows of that date from the same
       indicator columns plus the base-model test predictions;
    4. the rows of that date are appended to the training set.

    Args:
        models: dict mapping a model name to an estimator with ``fit``/``predict``.
        final_model: second-stage estimator with ``fit``/``predict``.
        X_train: training features.
        y_train: training frame aligned with ``X_train``; must contain ``sku``
            and the ``label`` column (``predict_training`` also reads ``date``).
        X_test: test features.
        y_test: test frame aligned row by row with ``X_test``, with columns
            ``date``, ``sku``, ``scaled_target`` and ``target``.
        features_for_model: dict mapping each model name to its feature columns.
        rescale: optional frame with ``sku``/``target`` columns; when given, the
            unscaled MAPE of each week is printed.
        span: fold size (in rows) forwarded to ``predict_training``.
        label: name of the target column in ``y_train``.

    Returns:
        1-D array with the final-model predictions of all weeks, concatenated
        in the order in which the dates were processed.
    """
    model_names = list(models)
    final_prediction = np.empty(0)
    for week in y_test.date.unique():
        print(week)
        # Positional mask: X_test and y_test are row-aligned.
        in_week = (y_test.date == week).to_numpy()
        test = X_test.loc[in_week]
        y_week = y_test.loc[in_week]

        #TRAIN SET PREDICTION WITH MODELS
        train_preds = {}
        for name, model in models.items():
            print("train with %s" %name)
            X = X_train[features_for_model[name]]
            train_preds[name] = predict_training(model, X, y_train, label, span)["pred"]

        #TEST SET PREDICTION WITH MODELS
        test_preds = {}
        for name, model in models.items():
            print("test with %s" %name)
            X = X_train[features_for_model[name]]
            fitted = model.fit(X, y_train[label])
            test_preds[name] = fitted.predict(test[features_for_model[name]])

        #DUMMY VARIABLES FOR FINAL TRAIN AND TEST
        final_train = pd.get_dummies(y_train["sku"]) > 0
        # Give the test rows exactly the indicator columns the final model is
        # trained on (same SKUs, same order); SKUs unseen in training get all-False.
        final_test = (pd.get_dummies(y_week["sku"]) > 0).reindex(
            columns = final_train.columns, fill_value = False)
        for name in model_names:
            # Series assignment aligns on the index; when the out-of-fold frame
            # carries the training index unchanged the values are taken as-is.
            final_train[name] = train_preds[name].astype(float)
            final_test[name] = test_preds[name]

        #TRAIN ON TRAIN PREDICTIONS AND PREDICTION ON TEST PREDICTIONS
        print("final prediction")

        final_model.fit(final_train, y_train[label])
        p =  final_model.predict(final_test)
        final_prediction = np.append(final_prediction,p)
        if rescale is not None:
            print("MAPE %.5f" % scaled_mape(p, y_week, rescale))

        # The week just predicted becomes training data for the following weeks.
        X_train = pd.concat([X_train,test])
        y_train = pd.concat([y_train,y_week])
    return final_prediction

In [62]:
# Load the pre-processed train and test tables; the first CSV column becomes the index.
df_train = pd.read_csv('data/processed_train.csv',index_col = 0)
df_final_test = pd.read_csv('data/processed_test.csv',index_col = 0)

In [63]:
# Parse the 'YYYY-MM-DD' strings into datetimes. pd.to_datetime is vectorised and also
# accepts values that are already datetimes, so this step no longer fails when re-run.
df_final_test["date"] = pd.to_datetime(df_final_test["date"], format='%Y-%m-%d')
df_train["date"] = pd.to_datetime(df_train["date"], format='%Y-%m-%d')

# Keep only the training rows whose `scope` column equals 1.
df_train = df_train.loc[df_train.scope == 1]

# Order both tables by date, then by SKU within each date.
df_train = df_train.sort_values(by=["date",'sku'])
df_final_test = df_final_test.sort_values(by=["date",'sku'])

In [64]:
# Unscaled targets with their SKU from both tables (rows missing either value are
# dropped). Passed as `rescale` to StackingPred / plot_scaled_results, where unscale()
# refits the scaler on each SKU's targets to undo the scaling of the predictions.
rescale_df = pd.concat([df_train,df_final_test])[["target","sku"]].dropna()

In [65]:
# Restrict both tables to the model features plus the identifying/target columns and
# drop every row with a missing value. The same column list is applied to each table.
df_train, df_final_test = (
    frame[[
        'scaled_month',
        "group_A", "group_B", "BRAND2", "BRAND4",
        'scaled_dayofyear', 'scaled_dayofmonth', 'scaled_weekofyear',
        "scaled_sales1", "scaled_sales2", "scaled_sales3",
        'scaled_promo', 'scaled_diff1', 'scaled_diff2', 'percentage_diff1',
        'scaled_price', "scaled_rolling2",
        "date", "sku", "scaled_target", "target",
    ]].dropna()
    for frame in (df_train, df_final_test)
)

In [66]:
# Display the prepared test table for a visual check.
df_final_test

Unnamed: 0,scaled_month,group_A,group_B,BRAND2,BRAND4,scaled_dayofyear,scaled_dayofmonth,scaled_weekofyear,scaled_sales1,scaled_sales2,...,scaled_promo,scaled_diff1,scaled_diff2,percentage_diff1,scaled_price,scaled_rolling2,date,sku,scaled_target,target
50,0.454545,False,True,True,False,0.484765,0.933333,0.490196,0.046024,0.066291,...,0.012491,0.020267,0.019989,0.170479,0.698690,0.056157,2019-06-29,144,0.063781,16228.0
100,0.454545,True,False,True,False,0.484765,0.933333,0.490196,0.115566,0.171742,...,0.333606,0.056176,0.046470,0.175005,0.766667,0.143654,2019-06-29,546,0.071891,61407.0
150,0.454545,True,False,True,False,0.484765,0.933333,0.490196,0.163286,0.217162,...,0.163658,0.053876,0.040684,0.206540,0.962963,0.190224,2019-06-29,549,0.108526,33777.0
200,0.454545,True,False,True,False,0.484765,0.933333,0.490196,0.218248,0.284785,...,0.306082,0.066537,0.078971,0.148918,0.766667,0.251516,2019-06-29,554,0.155733,122673.0
225,0.454545,False,True,False,True,0.484765,0.933333,0.490196,0.065936,0.096883,...,0.017095,0.030947,0.031441,0.234120,0.677570,0.081409,2019-06-29,686,0.087701,21177.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48,1.000000,True,False,True,False,0.930748,0.200000,0.941176,0.081806,0.079006,...,0.005047,-0.002800,0.238812,-0.011562,0.961538,0.080406,2019-12-07,1035,0.098481,38087.0
98,1.000000,False,True,False,True,0.930748,0.200000,0.941176,0.605486,0.623549,...,1.000000,0.018063,-0.158155,0.025972,0.299065,0.614518,2019-12-07,1051,0.312093,24597.0
148,1.000000,True,False,False,True,0.930748,0.200000,0.941176,0.099803,0.097210,...,0.176090,-0.002593,0.223117,-0.010457,0.961538,0.098507,2019-12-07,1058,0.122768,27794.0
198,1.000000,True,False,False,True,0.930748,0.200000,0.941176,0.083216,0.078051,...,0.181504,-0.005164,0.225759,-0.016140,0.750000,0.080634,2019-12-07,1065,0.113069,79456.0


In [67]:
# Training feature matrix: the model input columns only (no date, sku or target columns).
X_train = df_train[['scaled_month',
            "group_A","group_B","BRAND2","BRAND4",
           'scaled_dayofyear','scaled_dayofmonth','scaled_weekofyear',"scaled_sales1","scaled_sales2","scaled_sales3",
            'scaled_promo','scaled_diff1','scaled_diff2','percentage_diff1','scaled_price',"scaled_rolling2"]]

In [68]:
# Training targets (scaled and unscaled) together with the date and SKU of each row;
# shares its index and row order with X_train.
y_train = df_train[["date","sku","scaled_target","target"]]

In [69]:
# Test feature matrix with the same columns, in the same order, as X_train.
X_test = df_final_test[['scaled_month',
            "group_A","group_B","BRAND2","BRAND4",
           'scaled_dayofyear','scaled_dayofmonth','scaled_weekofyear',"scaled_sales1","scaled_sales2","scaled_sales3",
            'scaled_promo','scaled_diff1','scaled_diff2','percentage_diff1','scaled_price',"scaled_rolling2"]]

In [70]:
# Test targets (scaled and unscaled) together with the date and SKU of each row;
# shares its index and row order with X_test.
y_test = df_final_test[["date","sku","scaled_target","target"]]

In [71]:
# Display the feature columns configured for the random-forest base learner.
features_for_model["forest"]

['scaled_month',
 'group_A',
 'group_B',
 'BRAND2',
 'BRAND4',
 'scaled_dayofyear',
 'scaled_dayofmonth',
 'scaled_weekofyear',
 'scaled_sales1',
 'scaled_sales2',
 'scaled_sales3',
 'scaled_promo',
 'scaled_diff1',
 'scaled_diff2',
 'percentage_diff1',
 'scaled_price',
 'scaled_rolling2']

In [72]:
# Run the week-by-week stacked forecast over the test period. `span=48` is the fold size
# (in rows) forwarded to predict_training; passing `rescale_df` makes StackingPred print
# the unscaled MAPE of every week.
res = StackingPred(models,finalreg, X_train, y_train, X_test,y_test,features_for_model,rescale = rescale_df, span = 48, label = "scaled_target")


2019-06-29T00:00:00.000000000


KeyboardInterrupt: 

In [None]:
# Display the raw (scaled) predictions returned by StackingPred.
# NOTE(review): the line below is leftover commented-out code — remove if no longer needed.
#res = res.set_index(res["i"])
res

In [45]:
# Undo the per-SKU scaling, plot predictions against the actual targets and keep the MAPE
# together with the detailed results table. Requires `res` from the StackingPred cell.
mape,results = plot_scaled_results(res, y_test, rescale_df)

NameError: name 'res' is not defined