In [19]:
# !pip install lightgbm
import time
import pandas as pd
import numpy as np
from itertools import product
import gc
import tqdm.notebook as tqdm
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import lightgbm as lgb
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder

start = time.time()

#### utils

In [20]:
def downcast_dtypes(df):
    '''
        Changes column types in the dataframe: 
                
                `float64` type to `float32`
                `int64`   type to `int32`
    '''
    
    # Select columns to downcast
    float_cols = [c for c in df if df[c].dtype == "float64"]
    int_cols =   [c for c in df if df[c].dtype == "int64"]
    
    # Downcast
    df[float_cols] = df[float_cols].astype(np.float32)
    df[int_cols]   = df[int_cols].astype(np.int32)
    
    return df

def concat_df(train_data, test_data):
    return pd.concat([train_data, test_data], sort=True).reset_index(drop=True)

def divide_df(all_data, train_size):
    return all_data.loc[:train_size-1], all_data.loc[train_size:].drop(['target'], axis=1)

def get_result_df(predict, round=False):
    result = pd.DataFrame(data={'ID': range(0, 214200), 'item_cnt_month': predict})
#     clip after aggregation?
    result['item_cnt_month'] = result['item_cnt_month'].clip(0, 20)
    if round:
        result['item_cnt_month'] = result['item_cnt_month'].round()
    return result

#### Data read

In [21]:
df_train = pd.read_csv('sales_train.csv')
df_test = pd.read_csv('test.csv')
df_shops = pd.read_csv('shops.csv')
df_items = pd.read_csv('items.csv')
df_item_cats = pd.read_csv('item_categories.csv')

In [22]:
# FIX SHOPS

# Якутск Орджоникидзе, 56
df_train.loc[df_train.shop_id == 0, 'shop_id'] = 57
df_test.loc[df_test.shop_id == 0, 'shop_id'] = 57
# Якутск ТЦ "Центральный"
df_train.loc[df_train.shop_id == 1, 'shop_id'] = 58
df_test.loc[df_test.shop_id == 1, 'shop_id'] = 58
# Жуковский ул. Чкалова 39м²
df_train.loc[df_train.shop_id == 10, 'shop_id'] = 11
df_test.loc[df_test.shop_id == 10, 'shop_id'] = 11

In [23]:
# Create cats from shops

# Extract type and sub type code
df_item_cats['split'] = df_item_cats['item_category_name'].str.split('-')
df_item_cats['type'] = df_item_cats['split'].map(lambda x: x[0].strip())
df_item_cats['type_code'] = LabelEncoder().fit_transform(df_item_cats['type'])
# if subtype is nan then type
df_item_cats['subtype'] = df_item_cats['split'].map(lambda x: x[1].strip() if len(x) > 1 else x[0].strip())
df_item_cats['subtype_code'] = LabelEncoder().fit_transform(df_item_cats['subtype'])
df_item_cats = df_item_cats[['item_category_id','type_code', 'subtype_code']]

# Extract city
df_shops.loc[df_shops['shop_name'] == 'Сергиев Посад ТЦ "7Я"', 'shop_name'] = 'СергиевПосад ТЦ "7Я"'
df_shops['city'] = df_shops['shop_name'].str.split(' ').map(lambda x: x[0])
df_shops.loc[df_shops.city == '!Якутск', 'city'] = 'Якутск'
df_shops['city_code'] = LabelEncoder().fit_transform(df_shops['city'])
df_shops = df_shops[['shop_id','city_code']]

#### Verify wether train and test data from same distribution 
df_train_unique_shop_item = df_train[['shop_id', 'item_id']].drop_duplicates()
df_train_unique_shop_item['count'] = 1
matched_pair_count = pd.merge(df_test, df_train_unique_shop_item, on=['shop_id', 'item_id'], how='left')['count'].sum()

same_distribution = df_test.shape[0] == matched_pair_count
fraction_of_misshig = round(1 - matched_pair_count/df_test.shape[0], 2)
print(f'Test and traind data are from same distribution: {same_distribution}')
print(f'Fraction of test data missing in train data: {fraction_of_misshig}')

#### Prepare mean encodings

In [24]:
sales = df_train.copy()
# Create "grid" with columns
index_cols = ['shop_id', 'item_id', 'date_block_num']

# For every month we create a grid from all shops/items combinations from that month
grid = [] 
for block_num in sales['date_block_num'].unique():
    cur_shops = sales.loc[sales['date_block_num'] == block_num, 'shop_id'].unique()
    cur_items = sales.loc[sales['date_block_num'] == block_num, 'item_id'].unique()
    grid.append(np.array(list(product(*[cur_shops, cur_items, [block_num]])),dtype='int32'))

# Turn the grid into a dataframe
grid = pd.DataFrame(np.vstack(grid), columns = index_cols,dtype=np.int32)

# Groupby data to get shop-item-month aggregates
gb = sales.groupby(index_cols,as_index=False).agg({'item_cnt_day':'sum'})
gb['item_cnt_day'] = gb['item_cnt_day'].clip(0, 20)
gb.rename(columns={'item_cnt_day': 'target'}, inplace=True)

# Fix column names
#gb.columns = [col[0] if col[-1]=='' else col[-1] for col in gb.columns.values] 
# Join it to the grid
all_data = pd.merge(grid, gb, how='left', on=index_cols).fillna(0)

# Same as above but with shop-month aggregates
gb = sales.groupby(['shop_id', 'date_block_num'],as_index=False).agg({'item_cnt_day':'sum'})
gb['item_cnt_day'] = gb['item_cnt_day'].clip(0, 20)
gb.rename(columns={'item_cnt_day': 'target_shop'}, inplace=True)
#gb.columns = [col[0] if col[-1]=='' else col[-1] for col in gb.columns.values]
all_data = pd.merge(all_data, gb, how='left', on=['shop_id', 'date_block_num']).fillna(0)

# Same as above but with item-month aggregates
gb = sales.groupby(['item_id', 'date_block_num'],as_index=False).agg({'item_cnt_day':'sum'})
gb['item_cnt_day'] = gb['item_cnt_day'].clip(0, 20)
gb.rename(columns={'item_cnt_day': 'target_item'}, inplace=True)
#gb.columns = [col[0] if col[-1] == '' else col[-1] for col in gb.columns.values]
all_data = pd.merge(all_data, gb, how='left', on=['item_id', 'date_block_num']).fillna(0)


TRAIN_SIZE = all_data.shape[0]

df_test_concat = df_test.drop(columns=['ID'])
df_test_concat['date_block_num'] = 34
all_data = concat_df(all_data, df_test_concat)

print(TRAIN_SIZE)
print(df_test_concat.shape[0])
print(TRAIN_SIZE + df_test_concat.shape[0])
print(len(df_all))

all_data = downcast_dtypes(all_data)
del grid, gb, df_test_concat, sales
gc.collect();

10913850
214200
11128050
11128050


#### Prepare historical lags item

In [25]:
# List of columns that we will use to create lags
cols_to_rename = list(all_data.columns.difference(index_cols)) 

shift_range = [1, 2, 3, 4, 5, 12]

for month_shift in tqdm.tqdm(shift_range):
    train_shift = all_data[index_cols + cols_to_rename].copy()
    
    train_shift['date_block_num'] = train_shift['date_block_num'] + month_shift
    
    foo = lambda x: '{}_lag_{}'.format(x, month_shift) if x in cols_to_rename else x
    train_shift = train_shift.rename(columns=foo)

    all_data = pd.merge(all_data, train_shift, on=index_cols, how='left').fillna(0)

del train_shift



# List of all lagged features
fit_cols = [col for col in all_data.columns if col[-1] in [str(item) for item in shift_range]] 
# We will drop these at fitting stage
to_drop_cols = list(set(list(all_data.columns)) - (set(fit_cols)|set(index_cols))) + ['date_block_num'] 

# Category for each item
item_category_mapping = df_items[['item_id','item_category_id']].drop_duplicates()

all_data = pd.merge(all_data, item_category_mapping, how='left', on='item_id')
all_data = pd.merge(all_data, df_item_cats, how='left', on='item_category_id')
all_data = pd.merge(all_data, df_shops, how='left', on='shop_id')

all_data = downcast_dtypes(all_data)
gc.collect();

HBox(children=(FloatProgress(value=0.0, max=6.0), HTML(value='')))




In [27]:
all_data.shape

(11128050, 28)

In [28]:
# WHAT THE FUCK MAN???
# df_model = pd.merge(df_all, all_data, on=index_cols, how='left').fillna(0)

# df_model['shop_id'] = df_model['shop_id'].astype('category')
# df_model['item_category_id'] = df_model['item_category_id'].astype('category')
# df_model = pd.get_dummies(df_model, drop_first=True)

target_col = 'target'
train, X_test = divide_df(all_data, TRAIN_SIZE)

print(train.shape[0])
print(X_test.shape[0])
print(train.shape[0] + X_test.shape[0])

# Don't use old data from year 2013
train = train[train['date_block_num'] >= 12] 

X_train = train.drop(columns=[target_col])
Y_train = train[target_col]

del train, df_all, all_data
gc.collect();

10913850
214200
11128050


#### Get validation set

In [29]:
import random
cv_fraction = 1
item_set = X_train['item_id'].unique()
random.shuffle(item_set)
l = int(len(item_set) * cv_fraction)
cv_item_set = item_set[:l]
cv_items_filter = X_train['item_id'].isin(cv_item_set)

cv_X_train = X_train[cv_items_filter]
cv_Y_train = Y_train[cv_items_filter]

max_date = cv_X_train['date_block_num'].max()

cv_X_test = cv_X_train[cv_X_train['date_block_num'] == max_date]
cv_Y_test = cv_Y_train[cv_X_train['date_block_num'] == max_date]

cv_Y_train = cv_Y_train[cv_X_train['date_block_num'] < max_date]
cv_X_train = cv_X_train[cv_X_train['date_block_num'] < max_date]

In [30]:
cv_X_test.shape[0]

238172

#### HP optimization and prediction with lightgbm

In [31]:
def train_and_predict_lgbm(X, Y, X_val, Y_val, X_test, params = None):
    from scipy.stats import randint as sp_randint
    from scipy.stats import uniform as sp_uniform

    if not params:
#         lgb_params = {
#                        'feature_fraction': [0.55, 0.65, 0.75, 0.85, 0.95],
#                        'metric': ['rmse'],
#                        'nthread':[-1], 
#                        'min_data_in_leaf': [2**4, 2**5, 2**6, 2**7, 2**8, 2**9, 2**10], 
#                        'bagging_fraction': [0.65, 0.75, 0.85], 
#                        'learning_rate': [0.1, 0.01, 0.03], 
#                        'objective': ['mse'], 
#                        'bagging_seed': [2**7], 
#                        'num_leaves': sp_randint(100, 1000),
#                        'bagging_freq':[1],
#                        'verbose':[0] 
#                       }
        lgb_params ={'num_leaves': sp_randint(6, 50), 
             'min_child_samples': sp_randint(100, 500), 
             'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
             'subsample': sp_uniform(loc=0.2, scale=0.8), 
             'colsample_bytree': sp_uniform(loc=0.4, scale=0.6),
             'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
             'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100]}
        fit_params={"early_stopping_rounds":30, 
                    "eval_metric" : 'rmse', 
                    "eval_set" : [(X_val, Y_val)],
                    'eval_names': ['valid'],
                    #'callbacks': [lgb.reset_parameter(learning_rate=learning_rate_010_decay_power_099)],
                    'verbose': 100,
                    'categorical_feature': 'auto'}
        
        n_HP_points_to_test = 10
        
        clf = lgb.LGBMRegressor(max_depth=-1, random_state=314, silent=True, metric='None', n_jobs=4, n_estimators=10)
        gs = RandomizedSearchCV(
            estimator=clf, param_distributions=lgb_params, 
            n_iter=n_HP_points_to_test,
            scoring='r2',
            cv=4,
            refit=True,
            random_state=314,
            verbose=False)
        gs.fit(X, Y, **fit_params)
        print('Best score reached: {} with params: {} '.format(gs.best_score_, gs.best_params_))
        params = gs.best_params_
    
#     lgb_params = {
#                'feature_fraction': params['feature_fraction'],
#                'metric': params['metric'],
#                'nthread': params['nthread'], 
#                'min_data_in_leaf': params['min_data_in_leaf'], 
#                'bagging_fraction': params['bagging_fraction'], 
#                'learning_rate': params['learning_rate'], 
#                'objective': params['objective'], 
#                'bagging_seed': params['bagging_seed'], 
#                'num_leaves': params['num_leaves'],
#                'bagging_freq':params['bagging_freq'],
#                'verbose':params['verbose'] 
#               }

    model = lgb.train(params, lgb.Dataset(X, label=Y), 100)
    pred_lgb = model.predict(X_test)
    return pred_lgb, params
    

In [32]:
#get_result_df(Y_predict1, round=True).to_csv('result_rfc_26_features_round.csv', index=False)
# Nice, but the solution can be improved! Your public and private LB scores are: 1.107601 and 1.105683.

In [33]:
def train_and_predict_lr(X, Y, X_test):
    lr = LinearRegression()
    lr.fit(X.values, Y)
    pred_lr = lr.predict(X_test.values)
    return pred_lr

In [34]:
def get_mix(alpha, X):
    return (alpha * X[:,0]) + ((1-alpha) * X[:,1])

def get_best_alpha(X_train_level2, target):
    alphas_to_try = np.linspace(0, 1, 1001)
    max_r2 = 0
    best_alpha = 1
    
    for alpha in alphas_to_try:
        mix = get_mix(alpha, X_train_level2)
        r2 = r2_score(target, mix)
        if max_r2 < r2:
            max_r2 = r2
            best_alpha = alpha
    return best_alpha

In [35]:
X_test.shape

(214200, 27)

In [None]:
pred_lgb, params = train_and_predict_lgbm(cv_X_train, cv_Y_train, cv_X_test, cv_Y_test, cv_X_test)
print('Test R-squared for lgb is %f' % r2_score(cv_Y_test, pred_lgb))
pred_lgb, _ = train_and_predict_lgbm(X_train, Y_train, None, None, X_test, params=params)

get_result_df(pred_lgb, round=True).to_csv('result_lgb_26_features.csv', index=False)
# OK Baseline Nice, public and private LB scores are: 1.126919 and 1.127538.

Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[10]	valid's rmse: 0.931902
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[10]	valid's rmse: 0.937028
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[10]	valid's rmse: 0.934138
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[10]	valid's rmse: 0.932592
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[10]	valid's rmse: 0.9295
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[10]	valid's rmse: 0.930752
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[10]	valid's rmse: 0.929825
Training until validation scores don't impr

cv_fraction = 0.2
HP = 50
Clip AFTER pred but not after grouping

Best score reached: 0.41605717389124625 with params: {'colsample_bytree': 0.55607546409401, 'min_child_samples': 103, 'min_child_weight': 10.0, 'num_leaves': 22, 'reg_alpha': 2, 'reg_lambda': 1, 'subsample': 0.8939112927620336} 
Test R-squared for lgb is 0.829127

<b>================</b>

cv_fraction = 0.2
HP = 50
Clip AFTER pred and after grouping
Best score reached: 0.38987888420722855 with params: {'colsample_bytree': 0.8754369812451743, 'min_child_samples': 372, 'min_child_weight': 10.0, 'num_leaves': 44, 'reg_alpha': 1, 'reg_lambda': 0, 'subsample': 0.568664015245299} 
Test R-squared for lgb is 0.594777


<b>================</b>

cv_fraction = 0.2
HP = 50
No clip AFTER pred but not after grouping
Best score reached: 0.392851604813763 with params: {'colsample_bytree': 0.9731668400523877, 'min_child_samples': 171, 'min_child_weight': 1e-05, 'num_leaves': 41, 'reg_alpha': 10, 'reg_lambda': 100, 'subsample': 0.5575732396028996} 
Test R-squared for lgb is 0.678374

<b>================</b>
Best score reached: 0.5112108383831551 with params: {'colsample_bytree': 0.952164731370897, 'min_child_samples': 111, 'min_child_weight': 0.01, 'num_leaves': 38, 'reg_alpha': 0, 'reg_lambda': 0.1, 'subsample': 0.3029313662262354} 
Test R-squared for lgb is 0.537906

In [17]:
# pred_tree, params = train_and_predict_tree(cv_X_train, cv_Y_train, cv_X_test)
pred_tree, params = train_and_predict_lgbm(cv_X_train, cv_Y_train, cv_X_test, cv_Y_test, cv_X_test)
print('Test R-squared for RF is %f' % r2_score(cv_Y_test, pred_tree))

pred_lr = train_and_predict_lr(cv_X_train, cv_Y_train, cv_X_test)
print('Test R-squared for linreg is %f' % r2_score(cv_Y_test, pred_lr))

X_train_level2 = np.c_[pred_tree, pred_lr]

best_alpha = get_best_alpha(X_train_level2, cv_Y_test)
r2_train_simple_mix = r2_score(cv_Y_test, get_mix(best_alpha, X_train_level2))

print('Best alpha: %f; Corresponding r2 score on train: %f' % (best_alpha, r2_train_simple_mix))

#### Submision
pred_tree, _ = train_and_predict_lgbm(X_train, Y_train, None, None, X_test, params=params)
pred_lr = train_and_predict_lr(X_train, Y_train, X_test)

X_train_level2 = np.c_[pred_tree, pred_lr]

result = get_mix(best_alpha, X_train_level2)

get_result_df(result, round=True).to_csv('result_lgb+lr_26_features.csv', index=False)

Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[10]	valid's rmse: 2.79235
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[10]	valid's rmse: 2.80813
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[10]	valid's rmse: 2.8962
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[10]	valid's rmse: 2.84771
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[10]	valid's rmse: 2.79324
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[10]	valid's rmse: 2.78456
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[10]	valid's rmse: 2.89732
Training until validation scores don't improve fo

Best score reached: 0.5112108383831551 with params: {'colsample_bytree': 0.952164731370897, 'min_child_samples': 111, 'min_child_weight': 0.01, 'num_leaves': 38, 'reg_alpha': 0, 'reg_lambda': 0.1, 'subsample': 0.3029313662262354}

Test R-squared for RF is 0.537906

Test R-squared for linreg is 0.374154

Best alpha: 1.000000; Corresponding r2 score on train: 0.537906

In [18]:
print('Elapsed: {}'.format(round((time.time()-start))))

Elapsed: 233


In [19]:
# get_result_df(cv_Y_test, round=True).to_csv('result_rfc_26_features_round.csv', index=False)

# Don't use old data from year 2013