In [116]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import gc 
from sklearn.linear_model import LinearRegression, Ridge, Lasso, SGDRegressor
from lightgbm import LGBMRegressor 
from sklearn.neighbors import KNeighborsRegressor

In [630]:
# Read the training and hold-out CSVs the same way, parsing the `date`
# column into datetimes so the `.dt` accessors used later are available.
train, test = (
    pd.read_csv(csv_name, parse_dates=['date'])
    for csv_name in ("dataset_train.csv", "dataset_valid.csv")
)

# Preparing data and creating new features

In [631]:
def smape(preds, target):
    '''
    Symmetric mean absolute percentage error, expressed in percent.

    Pairs where prediction and target are both exactly zero are left out of
    the sum (they would be 0/0) but are still counted in the divisor, so
    they contribute zero error to the average.

    preds, target: equally sized arrays supporting boolean-mask indexing
    and element-wise arithmetic (e.g. numpy arrays).
    Returns the score as a float.
    '''
    total = len(preds)  # taken before masking on purpose, see docstring
    both_zero = (preds == 0) & (target == 0)
    keep = ~both_zero
    kept_preds = preds[keep]
    kept_target = target[keep]
    ratios = np.abs(kept_preds - kept_target) / (np.abs(kept_preds) + np.abs(kept_target))
    return (200 * np.sum(ratios)) / total

def add_date_features(df):
    '''
    Return a copy of `df` with calendar features derived from its `date` column.

    Added columns: year, month, weekofyear (ISO week), day, quarter,
    dayofyear, dayofweek, is_year_start, is_year_end.

    df: DataFrame with a datetime-typed `date` column.
    The input frame is left untouched; the features are written to the copy
    that is returned.
    '''
    out = df.copy()
    dates = out['date']

    out['year'] = dates.dt.year
    out['month'] = dates.dt.month
    if hasattr(dates.dt, 'isocalendar'):
        # `.dt.weekofyear` was deprecated and later removed from pandas;
        # `isocalendar().week` is the replacement. It yields a nullable
        # unsigned dtype, so cast back to a plain numeric column.
        iso_week = dates.dt.isocalendar().week
        if iso_week.isna().any():
            out['weekofyear'] = iso_week.astype('float64')
        else:
            out['weekofyear'] = iso_week.astype('int64')
    else:
        # Older pandas without `isocalendar`.
        out['weekofyear'] = dates.dt.weekofyear
    out['day'] = dates.dt.day
    out['quarter'] = dates.dt.quarter
    out['dayofyear'] = dates.dt.dayofyear
    out['dayofweek'] = dates.dt.dayofweek
#     out['is_month_start'] = dates.dt.is_month_start
#     out['is_month_end'] = dates.dt.is_month_end
#     out['is_quarter_start'] = dates.dt.is_quarter_start
#     out['is_quarter_end'] = dates.dt.is_quarter_end
    out['is_year_start'] = dates.dt.is_year_start
    out['is_year_end'] = dates.dt.is_year_end

    return out

def meanf_(df, columns,  test_exist = True, test=None):
    '''
    Add mean-target encodings of `sales` for each column in `columns`.

    For every column `c`, the mean of `sales` per distinct value of `c` is
    computed on `df` and attached as a new column named `meanf_<c>`.

    df: frame containing `sales` and every column in `columns`.
    columns: iterable of column names (strings) to encode.
    test_exist: when truthy, the same per-value means are also attached to
        `test` with a left merge, so values unseen in `df` become NaN there.
    test: frame to receive the encodings; required when `test_exist` is truthy.

    Returns `(df, test)` when `test_exist` is truthy, otherwise `df` alone.
    Raises TypeError if `test_exist` is truthy but `test` is None.

    Note: `df` is combined with the means through an inner merge, so the
    returned frame has a fresh index.
    '''
    if test_exist and test is None:
        # Fail up front with a clear message instead of deep inside pd.merge.
        raise TypeError("meanf_: `test` must be a DataFrame when test_exist is True")

    for col in columns:
        encoded_name = "meanf_" + col
        group_means = (
            df.loc[:, [col, 'sales']]
            .groupby(col)
            .agg('mean')
            .reset_index()
            .rename(columns={'sales': encoded_name})
        )
        df = pd.merge(df, group_means, on=col)
        if test_exist:
            test = pd.merge(test, group_means, on=col, how='left')

    if test_exist:
        return df, test
    return df

In [632]:
# Attach the calendar feature columns to both frames.
train, test = (add_date_features(frame) for frame in (train, test))

In [635]:
# Mean-encode `sales` over every calendar/id column below; the means are
# computed on `train` and merged into both frames. The month/quarter
# start/end flags are intentionally not part of the list.
train, test = meanf_(
    train,
    [
        'day', 'dayofweek', 'dayofyear', 'item', 'month', 'quarter', 'shop',
        'weekofyear', 'year',
        # 'is_month_start', 'is_month_end',
        # 'is_quarter_start', 'is_quarter_end',
        'is_year_start',
        'is_year_end',
    ],
    True,
    test,
)

In [636]:
# Months covered by the hold-out file; reused later to carve out a
# validation window with the same months.
test_months = set(test['month'].unique())
print('Test months', test_months)

# Stack both frames into one, tagging each row with its origin so the
# pieces can be separated again after feature engineering.
df = pd.concat(
    [
        train.assign(train_or_test='train'),
        test.assign(train_or_test='test'),
    ],
    ignore_index=True,
)
print('Combined df shape:{}'.format(df.shape))

# The separate frames are no longer needed; release their memory.
del train, test
gc.collect()

Test months {4, 5, 6, 7, 8, 9, 10, 11, 12}
Combined df shape:(913000, 25)


310

In [637]:
# Model log(1 + sales); predictions are mapped back with np.expm1 later.
# Not idempotent: re-running this cell transforms the column again.
df['sales'] = np.log1p(df['sales'])

In [638]:
# Rows whose month also occurs in the hold-out file.
in_test_months = df['month'].isin(test_months)

# 2016 rows in those months become the validation split; 2017 rows outside
# those months are tagged `no_train` so they are excluded from fitting.
valid_mask = in_test_months & (df['year'] == 2016)
throw_mask = ~in_test_months & (df['year'] == 2017)

df.loc[valid_mask, 'train_or_test'] = 'val'
df.loc[throw_mask, 'train_or_test'] = 'no_train'

In [639]:
def noise(scale, size):
    '''
    Draw Gaussian noise from numpy's global random state.

    scale: standard deviation of the noise.
    size: output shape (an int gives a 1-D array of that length).

    The mean is fixed at 0 so that adding the noise to a feature jitters it
    without shifting its average.
    '''
    zero_mean = 0
    return np.random.normal(zero_mean, scale, size)

def lag_features(df, groupcols, target, lags, noise_scale=50):
    '''
    Add noisy per-group lag columns of `target` to `df` (in place).

    For each `lag`, `target` is shifted by `lag` rows within every
    `groupcols` group and written to a column named `<target>_lag_<lag>`.
    Rows are shifted in their current order, so `df` should already be
    sorted the way the lags are meant to be taken.

    df: frame to extend; it is modified and also returned.
    groupcols: column name(s) identifying one series.
    target: name of the column to lag.
    lags: iterable of integer row offsets.
    noise_scale: standard deviation of the zero-mean Gaussian noise added
        to every lag column. The default of 50 reproduces the previous
        hard-coded value; pass 0 (or None) to add no noise at all.
        NOTE(review): this notebook log1p-transforms `sales` before its lag
        features would be built, so 50 looks far larger than the feature
        values themselves - confirm the intended scale.

    Returns the same `df` object.
    '''
    grouped_target = df.groupby(groupcols)[target]
    for lag in lags:
        name = '_'.join([target, 'lag', str(lag)])
        lagged = grouped_target.shift(lag).values
        if noise_scale:
            lagged = lagged + noise(noise_scale, len(df))
        df[name] = lagged
    return df

def rolling_mean(df, groupcols, target, windows, 
                 min_periods=2, shift=1, win_type=None):
    '''
    Add per-group rolling means of the lagged `target` to `df` (in place).

    For every combination of window `w` and shift `s`, `target` is shifted
    by `s` rows inside each `groupcols` group, a rolling mean of width `w`
    is taken inside that same group, and the result is stored in a column
    named `<target>_lag_<s>_rmean_<w>`.

    Fixes compared with the earlier version:
      * the rolling window is evaluated per group, so it no longer averages
        in rows belonging to the preceding group;
      * `shift` may be a single int (the default) or an iterable of ints;
      * the shift is part of the column name, so several shifts no longer
        overwrite one another.

    df: frame to extend; it is modified and also returned.
    groupcols: column name(s) identifying one series.
    target: name of the column to average.
    windows: iterable of window widths.
    min_periods: minimum number of non-NaN observations per window.
    shift: int or iterable of ints, row offsets applied before rolling.
    win_type: forwarded to pandas `rolling`.

    Returns the same `df` object.
    '''
    shifts = [shift] if np.isscalar(shift) else list(shift)
    grouped_target = df.groupby(groupcols)[target]
    for w in windows:
        for s in shifts:
            name = '_'.join([target, 'lag', str(s), 'rmean', str(w)])
            # Bind w/s as lambda defaults so each column uses its own values.
            df[name] = grouped_target.transform(
                lambda series, w=w, s=s: series.shift(s)
                .rolling(w, min_periods=min_periods, win_type=win_type)
                .mean()
            ).values #+ noise(0.56, len(df))
    return df

def emw_mean(df, groupcols, target, alpha=[0.9], shift=[1]):
    '''
    Add per-group exponentially weighted means of the lagged `target`
    to `df` (in place).

    For every combination of smoothing factor `a` and shift `s`, `target`
    is shifted by `s` rows inside each `groupcols` group, an exponentially
    weighted mean with `alpha=a` is computed inside that same group, and the
    result is stored in a column named `<target>_lag_<s>_ewm_<a>`.

    The weighted mean is evaluated per group; previously it ran over the
    flat shifted column and therefore carried state from one group into the
    start of the next.

    df: frame to extend; it is modified and also returned.
    groupcols: column name(s) identifying one series.
    target: name of the column to smooth.
    alpha: iterable of smoothing factors (the default list is only read).
    shift: iterable of integer row offsets (the default list is only read).

    Returns the same `df` object.
    '''
    grouped_target = df.groupby(groupcols)[target]
    for a in alpha:
        for s in shift:
            name = '_'.join([target, 'lag', str(s), 'ewm', str(a)])
            # Bind a/s as lambda defaults so each column uses its own values.
            df[name] = grouped_target.transform(
                lambda series, a=a, s=s: series.shift(s).ewm(alpha=a).mean()
            ).values
    return df

In [640]:
# df = lag_features(
#     df, 
#     groupcols=['shop','item'],
#     target='sales', 
#     lags=[365, 365+7, 365-7, 365+14, 365-14, 365*2, 365*2+7, 365*2-7, 365*2+14, 365-14]
# )#0.15

# df = rolling_mean(
#     df, 
#     groupcols=['shop','item'], 
#     target='sales', 
#     windows = [2, 3, 4, 5, 6, 7,],
#     shift=[365, 365+7, 365-7, 365+14, 365-14],#, 365*2, 365*2+7, 365*2-7, 365*2+14, 365-14],
#     win_type='triang'
# )#0.17

# df = emw_mean(
#     df, 
#     groupcols=['shop','item'],
#     target='sales', 
#     alpha=[0.99, 0.95, 0.9, 0.8, 0.7, 0.6, 0.5],
#     shift=[365, 300]#, 365+7, 365-7, 365+14, 365-14]#, 365*2, 365*2+7, 365*2-7, 365*2+14, 365*2-14]
# )

In [641]:
# Keep April-December only; the hold-out file covers months 4-12
# (see the printed test months above).
df = df.loc[df['month'] >= 4]

In [642]:
# Discard the rows tagged `no_train`, then pull out the (log1p) targets of
# the training and validation splits as flat numpy arrays.
temp = df[df['train_or_test'].isin(['train', 'val', 'test'])]
y_train = temp.loc[temp['train_or_test'] == 'train', 'sales'].values.ravel()
y_valid = temp.loc[temp['train_or_test'] == 'val', 'sales'].values.ravel()

In [643]:
# Split the combined frame back into its three parts using the origin tag.
train, val, test = (
    temp[temp['train_or_test'] == split_label]
    for split_label in ('train', 'val', 'test')
)

In [644]:
# Keep only fully populated training rows and re-derive the target from
# the same index labels so features and target stay aligned.
idx = train.dropna(how='any').index
train = train.loc[idx]
y_train = train.loc[idx, 'sales']

In [645]:
# Remove the target, the split tag and the raw date from every feature
# matrix; only model inputs remain.
train, val, test = (
    frame.drop(['sales', 'train_or_test', 'date'], axis=1)
    for frame in (train, val, test)
)

In [646]:
# Drop any training row that still has a missing value.
train = train.dropna(axis=0, how='any')

In [647]:
# Sanity check of the split sizes: validation, training, hold-out.
print(val.shape, train.shape, test.shape, sep='\n')

(137500, 22)
(412500, 22)
(137500, 22)


# LightGBM

In [648]:
# Mean-encoded columns that are withheld from LightGBM (the two
# `meanf_is_year_*` columns are not in this list and stay in).
lgb_excluded = [
    'meanf_day', 'meanf_dayofweek', 'meanf_dayofyear', 'meanf_item',
    'meanf_month', 'meanf_quarter', 'meanf_shop', 'meanf_weekofyear',
    'meanf_year',
]

# NOTE(review): `n_estimators` and `num_boost_round` are both set, to
# different values; LightGBM documents them as aliases of one parameter,
# so which one takes effect should be confirmed for the installed version.
lgb1 = LGBMRegressor(
    verbose=0,
    task='train',
    boosting_type='gbdt',
    objective='regression',
    num_leaves=300,
    n_estimators=800,
    bagging_freq=15,
    num_boost_round=500,
    early_stopping_rounds=15,
    seed=42,
)

# Fit on the training split and monitor RMSE on the validation split.
lgb1.fit(
    train.drop(lgb_excluded, axis=1),
    y_train,
#     feval=lambda x, data: ('smape', smape(x, data.get_label()), False),
    eval_metric='rmse',
    eval_set=[(val.drop(lgb_excluded, axis=1), y_valid)],
    verbose=True,
)

[1]	valid_0's rmse: 0.528629
Training until validation scores don't improve for 15 rounds.
[2]	valid_0's rmse: 0.489428
[3]	valid_0's rmse: 0.456164
[4]	valid_0's rmse: 0.426496
[5]	valid_0's rmse: 0.401199
[6]	valid_0's rmse: 0.38024
[7]	valid_0's rmse: 0.36059
[8]	valid_0's rmse: 0.343879
[9]	valid_0's rmse: 0.329093
[10]	valid_0's rmse: 0.316697
[11]	valid_0's rmse: 0.304837
[12]	valid_0's rmse: 0.294521
[13]	valid_0's rmse: 0.285846
[14]	valid_0's rmse: 0.277192
[15]	valid_0's rmse: 0.268181
[16]	valid_0's rmse: 0.260371
[17]	valid_0's rmse: 0.253361
[18]	valid_0's rmse: 0.248344
[19]	valid_0's rmse: 0.242887
[20]	valid_0's rmse: 0.238854
[21]	valid_0's rmse: 0.235091
[22]	valid_0's rmse: 0.231937
[23]	valid_0's rmse: 0.2266
[24]	valid_0's rmse: 0.223432
[25]	valid_0's rmse: 0.218856
[26]	valid_0's rmse: 0.215468
[27]	valid_0's rmse: 0.21179
[28]	valid_0's rmse: 0.20921
[29]	valid_0's rmse: 0.206058
[30]	valid_0's rmse: 0.203426
[31]	valid_0's rmse: 0.201257
[32]	valid_0's rmse: 0.

LGBMRegressor(bagging_freq=15, boosting_type='gbdt', class_weight=None,
       colsample_bytree=1.0, early_stopping_rounds=15, learning_rate=0.1,
       max_depth=-1, min_child_samples=20, min_child_weight=0.001,
       min_split_gain=0.0, n_estimators=800, n_jobs=-1,
       num_boost_round=500, num_leaves=300, objective='regression',
       random_state=None, reg_alpha=0.0, reg_lambda=0.0, seed=42,
       silent=True, subsample=1.0, subsample_for_bin=200000,
       subsample_freq=1, task='train', verbose=0)

In [649]:
# Validation predictions (still on the log1p scale), using the same
# feature subset the LightGBM model was fitted on.
lgb_val_pr = lgb1.predict(
    val.drop(
        columns=[
            'meanf_day', 'meanf_dayofweek', 'meanf_dayofyear', 'meanf_item',
            'meanf_month', 'meanf_quarter', 'meanf_shop', 'meanf_weekofyear',
            'meanf_year',
        ]
    )
)

In [650]:
# Validation SMAPE of LightGBM, computed after undoing the log1p transform.
smape(preds=np.expm1(lgb_val_pr), target=np.expm1(y_valid))

13.91563599350167

In [651]:
# Hold-out predictions (log1p scale), using the same feature subset the
# LightGBM model was fitted on.
lgb_pred = lgb1.predict(
    test.drop(
        columns=[
            'meanf_day', 'meanf_dayofweek', 'meanf_dayofyear', 'meanf_item',
            'meanf_month', 'meanf_quarter', 'meanf_shop', 'meanf_weekofyear',
            'meanf_year',
        ]
    )
)

In [652]:
# Ridge regression on the full feature matrix, mean encodings included.
ridge = Ridge(alpha=7.0)
ridge.fit(X=train, y=y_train)

Ridge(alpha=7.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [653]:
# Validation SMAPE of the ridge model, computed after undoing log1p.
ridge_val_pr = ridge.predict(X=val)
smape(preds=np.expm1(ridge_val_pr), target=np.expm1(y_valid))

13.707361691561523

In [654]:
# Ridge predictions for the hold-out rows (log1p scale).
pr_ridge = ridge.predict(X=test)

In [655]:
# Blend the two models with equal weights on the original sales scale and
# write the submission file. The predictions are assigned positionally, so
# the hold-out rows must be in the same order as the sample submission.
subm = pd.read_csv("sample_submission.csv")
lgb_sales = np.expm1(lgb_pred)
ridge_sales = np.expm1(pr_ridge)
subm['sales'] = (lgb_sales + ridge_sales) / 2
subm.to_csv('25_11_lgb_and_ridge_meanf_end_start_year.csv', index=False)