In [None]:
import pandas as pd
import numpy as np
import importlib
from sklearn.metrics import root_mean_squared_error

import utils

importlib.reload(utils)

from utils import load_data

In [2]:
def reduce_mem_usage(df):
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [3]:
data = load_data(prepare=False)
data = data.sort_values(by=["id","day_num"])
data.loc[:, 'lag'] = data.groupby('id')['value'].shift(1)

In [4]:
data[data["lag"].isna()]["day_num"].describe()

count    30490.0
mean         1.0
std          0.0
min          1.0
25%          1.0
50%          1.0
75%          1.0
max          1.0
Name: day_num, dtype: float64

In [5]:
data.groupby("id").size().describe()

count    30490.0
mean      1941.0
std          0.0
min       1941.0
25%       1941.0
50%       1941.0
75%       1941.0
max       1941.0
dtype: float64

In [6]:
data["train"] = data["day_num"]<=1913

In [7]:
data.head(10)

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,value,day_num,date,...,year,event_name_1,event_type_1,event_name_2,event_type_2,snap,sell_price,total_volume,train,lag
1612,FOODS_1_001_CA_1_evaluation,FOODS_1_001,FOODS_1,FOODS,CA_1,CA,d_1,3,1,2011-01-29,...,2011,,,,,0,2.0,6.0,True,
32102,FOODS_1_001_CA_1_evaluation,FOODS_1_001,FOODS_1,FOODS,CA_1,CA,d_2,0,2,2011-01-30,...,2011,,,,,0,2.0,0.0,True,3.0
62592,FOODS_1_001_CA_1_evaluation,FOODS_1_001,FOODS_1,FOODS,CA_1,CA,d_3,0,3,2011-01-31,...,2011,,,,,0,2.0,0.0,True,0.0
93082,FOODS_1_001_CA_1_evaluation,FOODS_1_001,FOODS_1,FOODS,CA_1,CA,d_4,1,4,2011-02-01,...,2011,,,,,1,2.0,2.0,True,0.0
123572,FOODS_1_001_CA_1_evaluation,FOODS_1_001,FOODS_1,FOODS,CA_1,CA,d_5,4,5,2011-02-02,...,2011,,,,,1,2.0,8.0,True,1.0
154062,FOODS_1_001_CA_1_evaluation,FOODS_1_001,FOODS_1,FOODS,CA_1,CA,d_6,2,6,2011-02-03,...,2011,,,,,1,2.0,4.0,True,4.0
184552,FOODS_1_001_CA_1_evaluation,FOODS_1_001,FOODS_1,FOODS,CA_1,CA,d_7,0,7,2011-02-04,...,2011,,,,,1,2.0,0.0,True,2.0
215042,FOODS_1_001_CA_1_evaluation,FOODS_1_001,FOODS_1,FOODS,CA_1,CA,d_8,2,8,2011-02-05,...,2011,,,,,1,2.0,4.0,True,0.0
245532,FOODS_1_001_CA_1_evaluation,FOODS_1_001,FOODS_1,FOODS,CA_1,CA,d_9,0,9,2011-02-06,...,2011,SuperBowl,Sporting,,,1,2.0,0.0,True,2.0
276022,FOODS_1_001_CA_1_evaluation,FOODS_1_001,FOODS_1,FOODS,CA_1,CA,d_10,0,10,2011-02-07,...,2011,,,,,1,2.0,0.0,True,0.0


In [8]:
data["events"] = data["event_name_1"].notna()+data["event_name_2"].notna()

In [9]:
data[["lag","value","events"]].corr()

Unnamed: 0,lag,value,events
lag,1.0,0.767991,0.006919
value,0.767991,1.0,-0.00366
events,0.006919,-0.00366,1.0


In [10]:
dummies = pd.get_dummies(data[["event_name_1","event_type_1","event_name_2","event_type_2"]])

dummie_columns = dummies.columns

In [11]:
data = pd.concat([data,dummies],axis=1)

In [12]:
del dummies

In [13]:
data['sell_price_filled'] = data.groupby('id')['sell_price'].ffill().bfill()

In [14]:
train = data[data['train']].dropna(subset="sell_price")
train = train[train["day_num"]>1]

test = data[~data['train']]

In [15]:
import category_encoders as ce

categories = ["event_name_1", "event_type_1", "event_name_2", "event_type_2", "cat_id", "state_id","wday","snap"]
target_enc = ce.TargetEncoder(cols=categories)
encoded_train = target_enc.fit_transform(train[categories], train['value'])
encoded_train = encoded_train.rename(columns=lambda col: f"{col}_encoded")

encoded_columns = encoded_train.columns

In [16]:
train = pd.concat([train,encoded_train],axis=1)

In [17]:
encoded_test = target_enc.fit_transform(test[categories], test['value'])
encoded_test = encoded_test.rename(columns=lambda col: f"{col}_encoded")

test = pd.concat([test,encoded_test],axis=1)

In [18]:
del encoded_train, encoded_test

In [19]:
features = list(encoded_columns)+["day_num", "wday", "month", "year", "snap", "lag", "sell_price"]

In [20]:
X_train = train[features]
y_train = train['value']


X_test = test[features]
y_test = test['value']

In [21]:
import lightgbm as lgb


In [22]:
gbm = lgb.LGBMRegressor(objective ='poisson' )

gbm.fit(X_train, y_train,
        eval_set=[(X_test, y_test)],
        eval_metric='l2'
        )

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.246942 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 690
[LightGBM] [Info] Number of data points in the train set: 46017025, number of used features: 15
[LightGBM] [Info] Start training from score 0.355521


In [23]:
from sklearn import linear_model
clf = linear_model.PoissonRegressor(max_iter=200)
clf.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)


In [24]:
clf.score(X_train, y_train)

0.20320674610716738

In [25]:
train["id"].nunique()

30490

In [26]:
def make_prediciton(regressor):
    predictions = []

    for day in test["day_num"].unique():
        # Combine the current lags and additional features
        t = test.loc[test["day_num"]==day,:]    
        
        # Make the prediction

        if type(regressor) == "lightgbm.sklearn.LGBMRegressor":
            next_pred = gbm.predict(X_train, num_iteration=gbm.best_iteration_)
        else:
            next_pred = regressor.predict(t[features])

        predictions.append(next_pred)

        if day < 1941:
            test.loc[ (test["day_num"]==day+1) ,"lag"] = next_pred

    print("Recursive predictions:", predictions)
    return( np.array(predictions).T.flatten())

In [27]:
test["fc_lin"] = make_prediciton(clf)

Recursive predictions: [array([1.69637782, 1.79922011, 1.69637782, ..., 0.70205049, 0.70205049,
       0.70205049]), array([1.67746798, 1.68000837, 1.67746798, ..., 0.68414133, 0.68414133,
       0.68414133]), array([1.61937001, 1.61943054, 1.61937001, ..., 0.66045628, 0.66045628,
       0.66045628]), array([1.56332891, 1.56333031, 1.56332891, ..., 0.63792299, 0.63792299,
       0.63792299]), array([1.51387005, 1.51387008, 1.51387005, ..., 0.61804574, 0.61804574,
       0.61804574]), array([1.87055479, 1.87055479, 1.87055479, ..., 0.76399672, 0.76399672,
       0.76399672]), array([1.86805731, 1.86805731, 1.86805731, ..., 0.73716585, 0.73716585,
       0.73716585]), array([1.78832767, 1.78832767, 1.78832767, ..., 0.72789038, 0.72789038,
       0.72789038]), array([1.7227754 , 1.7227754 , 1.7227754 , ..., 0.70193642, 0.70193642,
       0.70193642]), array([1.6619672 , 1.6619672 , 1.6619672 , ..., 0.65666708, 0.65666708,
       0.65666708]), array([1.6032147 , 1.6032147 , 1.6032147 , ...

In [28]:
test["fc_gbm"] = make_prediciton(gbm)

Recursive predictions: [array([0.5966539 , 2.92796583, 0.5966539 , ..., 0.34367006, 0.34367006,
       0.34367006]), array([1.15022544, 1.94083801, 1.15022544, ..., 0.63154029, 0.63154029,
       0.63154029]), array([1.1603609 , 1.50652159, 1.1603609 , ..., 0.63548392, 0.63548392,
       0.63548392]), array([1.1603609 , 1.50652159, 1.1603609 , ..., 0.63548392, 0.63548392,
       0.63548392]), array([1.35621272, 1.79634885, 1.35621272, ..., 0.75118781, 0.75118781,
       0.75118781]), array([1.47847183, 1.9303135 , 1.47847183, ..., 0.90683424, 0.90683424,
       0.90683424]), array([1.43283688, 1.84013163, 1.43283688, ..., 0.8148436 , 0.8148436 ,
       0.8148436 ]), array([1.36855604, 1.80601959, 1.36855604, ..., 0.75315241, 0.75315241,
       0.75315241]), array([1.18146217, 1.56642825, 1.18146217, ..., 0.63685999, 0.63685999,
       0.63685999]), array([1.1859428 , 1.58030176, 1.1859428 , ..., 0.63548392, 0.63548392,
       0.63548392]), array([1.1859428 , 1.58030176, 1.1859428 , ...

In [29]:
test["fc_lin"].describe()

count    853720.000000
mean          1.178480
std           0.552837
min           0.003305
25%           0.750488
50%           1.201842
75%           1.613466
max          12.956938
Name: fc_lin, dtype: float64

In [30]:
test["fc_gbm"].describe()

count    853720.000000
mean          1.499840
std           1.875776
min           0.268423
25%           0.761277
50%           1.066078
75%           1.506522
max         161.913563
Name: fc_gbm, dtype: float64

In [None]:
root_mean_squared_error(test["value"],test["fc_lin"])

3.5622449127451574

In [32]:
root_mean_squared_error(test["value"],test["fc_gbm"])

3.117888245617378

In [34]:
weights = (
    train[train["day_num"]>(1914-28)]
        .groupby("id")["total_volume"]
        .sum()
)

In [35]:
weights = weights/weights.sum()

In [36]:
weights.sum()

1.0

In [37]:
train["squared_diff"] = (train["value"] - train["lag"]) **2

mse_train = (
    train.groupby("id")["squared_diff"]
        .mean()
).rename("mse_train")



In [38]:
mse_train.describe()

count    30490.000000
mean         7.873469
std         50.214720
min          0.025105
25%          0.678340
50%          1.660325
75%          4.230648
max       4929.767782
Name: mse_train, dtype: float64

In [64]:
test["squared_diff"] = (test["value"] - test["fc_gbm"])**2

mse_test = (
    test
        .groupby("id")["squared_diff"]
        .mean()
).rename("mse_test")



In [65]:
mse_test.describe()

count    30490.000000
mean         9.721227
std         69.153293
min          0.255920
25%          0.668457
50%          1.314442
75%          3.209080
max       3441.847522
Name: mse_test, dtype: float64

In [66]:
t = (
    pd.concat([mse_test, mse_train, weights], axis=1)
)

In [67]:
result = (
    t["total_volume"] * np.sqrt( (t["mse_test"] / t["mse_train"]) )
).sum()

In [68]:
result

1.1819592632263867

In [None]:
train.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,value,day_num,date,...,sell_price_filled,event_name_1_encoded,event_type_1_encoded,event_name_2_encoded,event_type_2_encoded,cat_id_encoded,state_id_encoded,wday_encoded,snap_encoded,squared_diff
32102,FOODS_1_001_CA_1_evaluation,FOODS_1_001,FOODS_1,FOODS,CA_1,CA,d_2,0,2,2011-01-30,...,2.0,1.432388,1.432388,1.426622,1.426622,2.106858,1.573651,1.708244,1.368049,9.0
62592,FOODS_1_001_CA_1_evaluation,FOODS_1_001,FOODS_1,FOODS,CA_1,CA,d_3,0,3,2011-01-31,...,2.0,1.432388,1.432388,1.426622,1.426622,2.106858,1.573651,1.365807,1.368049,0.0
93082,FOODS_1_001_CA_1_evaluation,FOODS_1_001,FOODS_1,FOODS,CA_1,CA,d_4,1,4,2011-02-01,...,2.0,1.432388,1.432388,1.426622,1.426622,2.106858,1.573651,1.262531,1.547151,1.0
123572,FOODS_1_001_CA_1_evaluation,FOODS_1_001,FOODS_1,FOODS,CA_1,CA,d_5,4,5,2011-02-02,...,2.0,1.432388,1.432388,1.426622,1.426622,2.106858,1.573651,1.247616,1.547151,9.0
154062,FOODS_1_001_CA_1_evaluation,FOODS_1_001,FOODS_1,FOODS,CA_1,CA,d_6,2,6,2011-02-03,...,2.0,1.432388,1.432388,1.426622,1.426622,2.106858,1.573651,1.255723,1.547151,4.0
