In [2]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)
from typing import Union
from tqdm.auto import tqdm as tqdm
from sklearn import preprocessing
import gc
import lightgbm as lgb
import random
import os
import re 
import lightgbm as lgb
import dask.dataframe as dd
from sklearn import preprocessing, metrics
from sklearn.model_selection import StratifiedKFold, KFold, RepeatedKFold, GroupKFold, GridSearchCV, train_test_split, TimeSeriesSplit

In [3]:
data = pd.read_pickle('all_data_first.pkl')
sample_submission = pd.read_csv('sample_submission.csv')
data.shape

(22079892, 63)

Because of the size of our data, we will be using LGB to do our modeling and feature selection.

To ensure we don't violate the assumption of multicollinearity, before we run our models, let's take a glimpse of correlation between features.

In [10]:
#Those that return 'False' are those that are correlated with another feature by at least 83%.

corr = data.corr()
m = ~(corr.mask(np.eye(len(corr), dtype=bool)).abs() > 0.83).any()
m

ATR                 False
RS                   True
ann_vol             False
beta                 True
bollinger           False
cat_id              False
dayofweek            True
demand              False
dept_id             False
entropy              True
event                True
info_ratio           True
is_weekend           True
item_id             False
month               False
quarter             False
relative_vol         True
rolling_kurt_t30    False
rolling_max_14      False
rolling_max_180     False
rolling_max_30      False
rolling_max_60      False
rolling_max_90      False
rolling_mean_14     False
rolling_mean_180    False
rolling_mean_30     False
rolling_mean_60     False
rolling_mean_90     False
rolling_min_14      False
rolling_min_180      True
rolling_min_30      False
rolling_min_60      False
rolling_min_90      False
rolling_skew_t30    False
rolling_std_14      False
rolling_std_180     False
rolling_std_30      False
rolling_std_60      False
rolling_std_

In [3]:
# ca_model = data.loc[data['state_id']==0]
# tx_model = data.loc[data['state_id']==1]
# wi_model = data.loc[data['state_id']==2]

Running lgb on the entire dataset is too memory-intensive. So, we will first model the data by state to find feature importance. Contingent on the amount of features we drop, we hope that we can model the full data set.

In [4]:
# wi_items = '[A-Z]+_\d{1}\d?_\d+_Wi\w+'
# sample_submission = sample_submission.loc[sample_submission['id'].str.contains(wi_items)]
# sample_submission.head()

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,F11,F12,F13,F14,F15,F16,F17,F18,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
12196,HOBBIES_1_001_TX_1_validation,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
12197,HOBBIES_1_002_TX_1_validation,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
12198,HOBBIES_1_003_TX_1_validation,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
12199,HOBBIES_1_004_TX_1_validation,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
12200,HOBBIES_1_005_TX_1_validation,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [3]:
'''Columns we can drop due to no importance'''

to_drop = ['is_quarter_start','is_quarter_end', 'is_year_start',
           'gap_placement', 'gap_size', 'gap_start',
           'number_of_gaps', 'is_month_start', 'is_year_end', 'is_month_end']

data.drop(to_drop, axis=1, inplace=True)

KeyError: "['is_quarter_start' 'is_quarter_end' 'is_year_start' 'gap_placement'\n 'gap_size' 'gap_start' 'number_of_gaps' 'is_month_start' 'is_year_end'\n 'is_month_end'] not found in axis"

In [4]:
columns = [col for col in data.columns if col not in 
            ['id', 'demand', 'date', 'wm_yr_wk']]

The following lgb code was provided by: https://www.kaggle.com/ragnar123/asymmetric-loss-functions-lgbm

In [None]:
def run_lgb(data, features, custom_asymmetric_train, custom_asymmetric_valid):
    
    # going to evaluate with the last 28 days
    x_train = data[data['date'] <= '2016-03-27']
    y_train = x_train['demand']
    x_val = data[(data['date'] > '2016-03-27') & (data['date'] <= '2016-04-24')]
    y_val = x_val['demand']
    test = data[(data['date'] > '2016-04-24')]
    del data
    gc.collect()

    # define random hyperparammeters
    params = {
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'n_jobs': -1,
        'seed': 42,
        'learning_rate': 0.1,
        'bagging_fraction': 0.75,
        'bagging_freq': 10, 
        'colsample_bytree': 0.75}

    train_set = lgb.Dataset(x_train[features], y_train)
    val_set = lgb.Dataset(x_val[features], y_val)
    
    del x_train, y_train

    model = lgb.train(params, train_set, num_boost_round = 2500, early_stopping_rounds = 50, 
                      valid_sets = [train_set, val_set], verbose_eval = 100, fobj = custom_asymmetric_train, 
                      feval = custom_asymmetric_valid)
    
    val_pred = model.predict(x_val[features])
    y_pred = model.predict(test[features])
    x_val['demand'] = val_pred
    test['demand'] = y_pred
    x_val = x_val[['id', 'date', 'demand']]
    test = test[['id', 'date', 'demand']]

    return x_val, test

In [None]:
def evaluate(x_val, train_fold_df, valid_fold_df, calendar, sell_prices):
    x_val = pd.pivot(x_val, index = 'id', columns = 'date', values = 'demand').reset_index()
    x_val.columns = ['id'] + ['d_' + str(i) for i in range(1886, 1914)]
    x_val = train_fold_df[['id']].merge(x_val, on = 'id')
    x_val.drop('id', axis = 1, inplace = True)
    evaluator = WRMSSEEvaluator(train_fold_df, valid_fold_df, calendar, sell_prices)
    score = evaluator.score(x_val)
    return score

def predict(test, submission):
    predictions = test[['id', 'date', 'demand']]
    predictions = pd.pivot(predictions, index = 'id', columns = 'date', values = 'demand').reset_index()
    predictions.columns = ['id'] + ['F' + str(i + 1) for i in range(28)]

    evaluation_rows = [row for row in submission['id'] if 'evaluation' in row] 
    evaluation = submission[submission['id'].isin(evaluation_rows)]

    validation = submission[['id']].merge(predictions, on = 'id')
    final = pd.concat([validation, evaluation])
    final.to_csv('submission.csv', index = False)

In [None]:
# define cost and eval functions
def custom_asymmetric_train(y_pred, y_true):
    y_true = y_true.get_label()
    residual = (y_true - y_pred).astype("float")
    grad = np.where(residual < 0, -2 * residual, -2 * residual * 1.15)
    hess = np.where(residual < 0, 2, 2 * 1.15)
    return grad, hess

def custom_asymmetric_valid(y_pred, y_true):
    y_true = y_true.get_label()
    residual = (y_true - y_pred).astype("float")
    loss = np.where(residual < 0, (residual ** 2) , (residual ** 2) * 1.15) 
    return "custom_asymmetric_eval", np.mean(loss), False

In [None]:
# seed everything
seed_everything(42)
# reading data
data, calendar, sell_prices, submission, train_fold_df, valid_fold_df = read_data()
# get feature columns, also ignoring some features because we dont have enough ram and they overffit
features = [col for col in data.columns if col not in ['id', 'demand', 'part', 'date', 'wm_yr_wk', 'mean_demand_month', 'std_demand_month', 'max_demand_month', 'mean_demand_week', 'std_demand_week', 
                                                       'max_demand_week']]

print(f'We are training with {len(features)} fetures')
data.tail()

# run model with asymmetric loss function
x_val, test = run_lgb(data, features, custom_asymmetric_train, custom_asymmetric_valid)
score = evaluate(x_val, train_fold_df, valid_fold_df, calendar, sell_prices)
print(f'Our wrmsse is {score}')
# predict test
predict(test, submission)

Now that we have our predictions, it is time to add back our slope and intercepts since we stripped them for feature engineering and modeling.

In [62]:
''' Grabbing the regression attributes from the original sales df
so we can add the slope back and scale back our submission values.'''

from scipy.stats import linregress

columns =['id','slope','intercept','std']
index = range(0, len(sales))
regress = pd.DataFrame(columns = columns, index = index)
regress.fillna(0, inplace=True)
d_cols = sales.columns[-712:]#since our data is only from d_1202 and beyond, we will only grab columns beyond those days.

for j, item in enumerate(sales.id.unique()):
    d_vals = np.squeeze(sales.loc[sales['id']==item][d_cols].T)
    item_len = range(0,len(d_vals))
    regress['id'].iloc[j] = item
    regress['slope'].iloc[j] = linregress(d_vals, item_len)[0]
    regress['intercept'].iloc[j] = linregress(d_vals, item_len)[1]
    regress['std'].iloc[j] = linregress(d_vals, item_len)[-1]

In [63]:
regress.head()

Unnamed: 0,id,slope,intercept,std
0,HOBBIES_1_001_CA_1_validation,24.833733,339.211582,8.431746
1,HOBBIES_1_002_CA_1_validation,-11.273509,359.315893,11.544826
2,HOBBIES_1_003_CA_1_validation,79.57182,327.225182,10.603538
3,HOBBIES_1_004_CA_1_validation,-8.532467,374.170764,3.337409
4,HOBBIES_1_005_CA_1_validation,11.664122,342.705507,6.167046


Now to repeat the same process for our predictions so we can scale properly.

In [99]:
data.loc[data['bollinger'].isna(), 'bollinger'] = data['demand']

In [105]:
''' Grabbing the regression attributes from our predictions
so we can add the slope back and scale back our submission values.'''

columns =['id','slope','intercept','std']
index = range(0, len(loop_df))
final_reg = pd.DataFrame(columns = columns, index = index)
final_reg.fillna(0, inplace=True)

for j, item in enumerate(data['id'].unique()):
    d_vals = np.squeeze(data.loc[data['id']==item]['bollinger'])
    item_len = range(0,len(d_vals))
    final_reg['id'].iloc[j] = item
    final_reg['slope'].iloc[j] = linregress(d_vals, item_len)[0]
    final_reg['intercept'].iloc[j] = linregress(d_vals, item_len)[1]
    final_reg['std'].iloc[j] = linregress(d_vals, item_len)[-1]

In [108]:
regress = regress.set_index('id')
regress = regress.reindex(index=final_reg['id'])
regress = regress.reset_index()
regress.head()

Unnamed: 0,id,slope,intercept,std
0,HOUSEHOLD_1_030_CA_4_validation,-33.492083,375.962157,9.239507
1,HOUSEHOLD_1_032_CA_4_validation,25.275446,319.539289,4.39703
2,HOUSEHOLD_1_033_CA_4_validation,147.501947,308.887727,10.088969
3,HOUSEHOLD_1_034_CA_4_validation,1.020886,355.091359,10.804175
4,HOUSEHOLD_1_035_CA_4_validation,207.8328,344.9916,26.329572
