In [1]:
import pandas as pd
import numpy as np
import os
import data
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline 
from tqdm import tqdm_notebook as tn
import gc

import sklearn
from sklearn.model_selection import KFold
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from math import sqrt

import category_encoders as ce
from hyperopt import tpe, fmin, hp, Trials

import lightgbm as lgb

from itertools import product

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
def downcast_dtypes(df):
    '''
        Changes column types in the dataframe: 
                
                `float64` type to `float32`
                `int64`   type to `int32`
    '''
    
    # Select columns to downcast
    float_cols = [c for c in df if df[c].dtype == "float64"]
    int_cols =   [c for c in df if df[c].dtype == "int64"]
    
    # Downcast
    df[float_cols] = df[float_cols].astype(np.float32)
    df[int_cols]   = df[int_cols].astype(np.int32)
    
    return df

## Read Data

In [3]:
DATA_FOLDER = os.path.join(data.DATA_ROOT, "readonly")
TMP_FOLDER = os.path.join(data.DATA_ROOT, "tmp")
#DATA_FOLDER = "../input"
#TMP_FOLDER = os.path.join(data.DATA_ROOT, "tmp")

VALIDATION_FOLDER = os.path.join(TMP_FOLDER, "validation")
SUBMISSION_FOLDER = os.path.join(TMP_FOLDER, "submission")

In [4]:
FOLDER = VALIDATION_FOLDER
#FOLDER = SUBMISSION_FOLDER

train_data_path = os.path.join(FOLDER, "train.csv")
test_data_path = os.path.join(FOLDER, "test.csv")

train_data = pd.read_csv(train_data_path).drop(columns=['month', 'date_block_num'])
test_data = pd.read_csv(test_data_path).drop(columns=['month', 'date_block_num'])

train_data = downcast_dtypes(train_data)
test_data = downcast_dtypes(test_data)

print("train data shape: " + str(train_data.shape))
train_data.dtypes

train data shape: (6186922, 88)


item_category_id                         int32
shop_id                                  int32
item_id                                  int32
target                                 float32
shop_id_count_month_lag_1              float32
shop_id_sale_month_lag_1               float32
item_id_count_month_lag_1              float32
item_id_sale_month_lag_1               float32
item_price_mean_month_lag_1            float32
item_price_std_month_lag_1             float32
item_shop_cnt_sum_month_lag_1          float32
item_shop_cnt_std_month_lag_1          float32
item_shop_sale_sum_month_lag_1         float32
item_shop_sale_std_month_lag_1         float32
item_category_id_count_month_lag_1     float32
item_category_id_sale_month_lag_1      float32
shop_id_count_month_lag_2              float32
shop_id_sale_month_lag_2               float32
item_id_count_month_lag_2              float32
item_id_sale_month_lag_2               float32
item_price_mean_month_lag_2            float32
item_price_st

## Make X and y

In [5]:
train_y = train_data['target']
train_x = train_data.drop(['target'], axis='columns')
feature_names = train_x.columns.tolist()

if 'target' in test_data.columns.tolist():
    test_x = test_data.drop(['target'], axis='columns')
    test_y = test_data['target']
else:
    test_x = test_data

## Clipping

In [6]:
train_y.loc[train_y > 40] = 40
if test_y is not None:
    test_y.loc[test_y > 40] = 40

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


## Categorical Feature Encoding

In [7]:
categorical_columns = [
    'shop_id', 'item_id', 'item_category_id'
]

In [8]:
# Leave-One-Out Encoding
categorical_encoder = ce.LeaveOneOutEncoder(cols=categorical_columns, impute_missing=False, drop_invariant=True)
categorical_encoder.fit(train_x, train_y)

train_x = categorical_encoder.transform(train_x)
test_x = categorical_encoder.transform(test_x)

train_x = downcast_dtypes(train_x)
test_x = downcast_dtypes(test_x)

train_x.dtypes

item_category_id                       float32
shop_id                                float32
item_id                                float32
shop_id_count_month_lag_1              float32
shop_id_sale_month_lag_1               float32
item_id_count_month_lag_1              float32
item_id_sale_month_lag_1               float32
item_price_mean_month_lag_1            float32
item_price_std_month_lag_1             float32
item_shop_cnt_sum_month_lag_1          float32
item_shop_cnt_std_month_lag_1          float32
item_shop_sale_sum_month_lag_1         float32
item_shop_sale_std_month_lag_1         float32
item_category_id_count_month_lag_1     float32
item_category_id_sale_month_lag_1      float32
shop_id_count_month_lag_2              float32
shop_id_sale_month_lag_2               float32
item_id_count_month_lag_2              float32
item_id_sale_month_lag_2               float32
item_price_mean_month_lag_2            float32
item_price_std_month_lag_2             float32
item_shop_cnt

## Imputation

In [9]:
train_x.fillna(0, inplace=True)
test_x.fillna(0, inplace=True)

train_y.fillna(0, inplace=True)
test_y.fillna(0, inplace=True)

## Normalization

In [10]:
normalizer = RobustScaler()
normalizer.fit(train_x)

train_x = normalizer.transform(train_x)
test_x = normalizer.transform(test_x)

# train_y = train_y / 20
# if test_y is not None:
#     test_y = test_y / 20.0

## Model Training

In [11]:
lgb_params = {
               'feature_fraction': 0.75,
               'metric': 'rmse',
               'nthread':3, 
               'min_data_in_leaf': 58470, 
               'bagging_fraction': 0.75, 
               'learning_rate': 0.03, 
               'objective': 'mse', 
               'bagging_seed': 2**7, 
               'num_leaves': 2**9,
               'bagging_freq':1,
               'max_depth': 10,
               'verbose':0 
              }

lgb_dataset = lgb.Dataset(train_x, label=train_y)

print("start training...")
model = lgb.train(lgb_params, lgb_dataset, 100)

pred_train_y = model.predict(train_x)
#print('Train R-squared for LightGBM is %f' % r2_score(train_y, pred_y))
    
pred_test_y = model.predict(test_x)



start training...


In [12]:
train_y_tmp = train_y.copy()
test_y_tmp = test_y.copy()

train_y_tmp[train_y_tmp>20] = 20.0
pred_train_y[pred_train_y>20] = 20.0 
print('Train R-squared for LightGBM is %f' % r2_score(train_y_tmp, pred_train_y))
print('Train msre is: ' + str(sqrt(mean_squared_error(train_y_tmp, pred_train_y))))

if test_y is not None:
    test_y_tmp[test_y_tmp>20] = 20.0
    pred_test_y[pred_test_y>20] = 20.0 
    print('Test R-squared for LightGBM is %f' % r2_score(test_y_tmp, pred_test_y))
    print('Test msre is: ' + str(sqrt(mean_squared_error(test_y_tmp, pred_test_y))))
    
gc.collect();

Train R-squared for LightGBM is 0.268050
Train msre is: 1.0166271399383389
Test R-squared for LightGBM is 0.107186
Test msre is: 1.0735659013397745


### Pick Most Important Features

In [13]:
feature_importances = pd.DataFrame(
    {
        'feature_name': feature_names,
        'feature_importance': model.feature_importance().tolist()
    }
)

feature_importances.sort_values('feature_importance', ascending=False, inplace=True)
important_feature_names = set(feature_importances[:30]['feature_name'].tolist())

important_feature_names

feature_boolean_mask = np.asarray(list(
    map(
        lambda feature_name: feature_name in important_feature_names,
        feature_names
    )
))

train_x = train_x[:, feature_boolean_mask]
test_x = test_x[:, feature_boolean_mask]

lgb_dataset = lgb.Dataset(train_x, label=train_y)

In [14]:
del model, train_x
gc.collect();

## Hyper-parameter Tuning

In [15]:
test_y_pred_list = list()
def objective(hyper_param):
    
    print("hyper_params: " + str(hyper_param))
    max_depth, num_leaves, min_data_in_leaf = hyper_param
    
    lgb_params = {
               'feature_fraction': 0.75,
               'metric': 'rmse',
               'nthread':3, 
               'min_data_in_leaf': int(min_data_in_leaf), 
               'bagging_fraction': 0.75, 
               'learning_rate': 0.03, 
               'objective': 'mse', 
               'bagging_seed': 2**7, 
               'num_leaves': int(num_leaves),
               'max_depth': int(max_depth),
               'bagging_freq':1,
               'verbose':0 
              }
    print("start training..." + str(len(test_y_pred_list)))
    model = lgb.train(lgb_params, lgb_dataset, 100)
    pred_test_y = model.predict(test_x)
    
    pred_test_y[pred_test_y>20] = 20.0 
    result = sqrt(mean_squared_error(test_y_tmp, pred_test_y))
    test_y_pred_list.append(pred_test_y)
    
    gc.collect();
    
    return result


# tuning algorithm
tpe_algo = tpe.suggest

# search space
space = [
    hp.quniform('max_depth', 5, 10, 1),
    hp.quniform('num_leaves', 2**5, 2**10, 5),
    hp.quniform('min_data_in_leaf', 2**5, 10**4, 5)
]

# history
tpe_trials = Trials()


In [16]:
# Run 2000 evals with the tpe algorithm
tpe_best = fmin(fn=objective, space=space, algo=tpe_algo, trials=tpe_trials, max_evals=1000, rstate= np.random.RandomState(50))

print(tpe_best)

hyper_params: (8.0, 1020.0, 5860.0)
start training...0
hyper_params: (8.0, 885.0, 4530.0)
start training...1
hyper_params: (9.0, 775.0, 9835.0)
start training...2
hyper_params: (7.0, 435.0, 655.0)
start training...3
hyper_params: (6.0, 40.0, 8950.0)
start training...4
hyper_params: (6.0, 405.0, 4390.0)
start training...5
hyper_params: (9.0, 660.0, 3755.0)
start training...6
hyper_params: (7.0, 305.0, 7800.0)
start training...7
hyper_params: (9.0, 555.0, 4425.0)
start training...8
hyper_params: (9.0, 775.0, 5535.0)
start training...9
hyper_params: (6.0, 585.0, 7245.0)
start training...10
hyper_params: (6.0, 835.0, 200.0)
start training...11
hyper_params: (10.0, 130.0, 2365.0)
start training...12
hyper_params: (6.0, 670.0, 6905.0)
start training...13
hyper_params: (7.0, 950.0, 4545.0)
start training...14
hyper_params: (5.0, 355.0, 7075.0)
start training...15
hyper_params: (7.0, 85.0, 5785.0)
start training...16
hyper_params: (10.0, 195.0, 9615.0)
start training...17
hyper_params: (5.0, 5

hyper_params: (7.0, 90.0, 540.0)
start training...150
hyper_params: (8.0, 35.0, 1320.0)
start training...151
hyper_params: (7.0, 175.0, 2130.0)
start training...152
hyper_params: (7.0, 55.0, 40.0)
start training...153
hyper_params: (8.0, 135.0, 260.0)
start training...154
hyper_params: (7.0, 195.0, 800.0)
start training...155
hyper_params: (8.0, 215.0, 50.0)
start training...156
hyper_params: (7.0, 260.0, 1615.0)
start training...157
hyper_params: (8.0, 35.0, 1085.0)
start training...158
hyper_params: (7.0, 235.0, 2490.0)
start training...159
hyper_params: (7.0, 105.0, 1985.0)
start training...160
hyper_params: (8.0, 60.0, 3375.0)
start training...161
hyper_params: (8.0, 910.0, 1260.0)
start training...162
hyper_params: (7.0, 295.0, 895.0)
start training...163
hyper_params: (7.0, 170.0, 320.0)
start training...164
hyper_params: (8.0, 320.0, 7480.0)
start training...165
hyper_params: (7.0, 150.0, 1710.0)
start training...166
hyper_params: (7.0, 115.0, 45.0)
start training...167
hyper_pa

hyper_params: (8.0, 145.0, 9310.0)
start training...299
hyper_params: (7.0, 80.0, 5255.0)
start training...300
hyper_params: (6.0, 115.0, 815.0)
start training...301
hyper_params: (7.0, 95.0, 4920.0)
start training...302
hyper_params: (6.0, 195.0, 6760.0)
start training...303
hyper_params: (7.0, 60.0, 395.0)
start training...304
hyper_params: (8.0, 160.0, 4475.0)
start training...305
hyper_params: (7.0, 280.0, 1395.0)
start training...306
hyper_params: (7.0, 30.0, 1015.0)
start training...307
hyper_params: (8.0, 245.0, 1980.0)
start training...308
hyper_params: (6.0, 135.0, 9645.0)
start training...309
hyper_params: (7.0, 70.0, 3545.0)
start training...310
hyper_params: (7.0, 805.0, 3115.0)
start training...311
hyper_params: (6.0, 210.0, 560.0)
start training...312
hyper_params: (8.0, 990.0, 1680.0)
start training...313
hyper_params: (7.0, 740.0, 270.0)
start training...314
hyper_params: (7.0, 675.0, 755.0)
start training...315
hyper_params: (6.0, 105.0, 2615.0)
start training...316
hy

hyper_params: (8.0, 630.0, 920.0)
start training...448
hyper_params: (8.0, 65.0, 55.0)
start training...449
hyper_params: (8.0, 95.0, 295.0)
start training...450
hyper_params: (9.0, 50.0, 2005.0)
start training...451
hyper_params: (8.0, 175.0, 8715.0)
start training...452
hyper_params: (8.0, 520.0, 1485.0)
start training...453
hyper_params: (8.0, 35.0, 705.0)
start training...454
hyper_params: (9.0, 265.0, 1190.0)
start training...455
hyper_params: (8.0, 80.0, 1790.0)
start training...456
hyper_params: (10.0, 130.0, 940.0)
start training...457
hyper_params: (8.0, 160.0, 2785.0)
start training...458
hyper_params: (9.0, 495.0, 45.0)
start training...459
hyper_params: (8.0, 315.0, 400.0)
start training...460
hyper_params: (10.0, 55.0, 1410.0)
start training...461
hyper_params: (8.0, 100.0, 2305.0)
start training...462
hyper_params: (9.0, 230.0, 225.0)
start training...463
hyper_params: (8.0, 205.0, 1130.0)
start training...464
hyper_params: (8.0, 865.0, 1715.0)
start training...465
hyper_

hyper_params: (8.0, 105.0, 820.0)
start training...597
hyper_params: (8.0, 70.0, 1510.0)
start training...598
hyper_params: (8.0, 200.0, 235.0)
start training...599
hyper_params: (8.0, 160.0, 1305.0)
start training...600
hyper_params: (9.0, 35.0, 1030.0)
start training...601
hyper_params: (8.0, 135.0, 1090.0)
start training...602
hyper_params: (9.0, 120.0, 355.0)
start training...603
hyper_params: (8.0, 30.0, 615.0)
start training...604
hyper_params: (8.0, 95.0, 1870.0)
start training...605
hyper_params: (9.0, 55.0, 1005.0)
start training...606
hyper_params: (9.0, 75.0, 1610.0)
start training...607
hyper_params: (8.0, 185.0, 530.0)
start training...608
hyper_params: (8.0, 150.0, 1360.0)
start training...609
hyper_params: (8.0, 110.0, 835.0)
start training...610
hyper_params: (9.0, 50.0, 205.0)
start training...611
hyper_params: (8.0, 85.0, 1150.0)
start training...612
hyper_params: (8.0, 170.0, 9600.0)
start training...613
hyper_params: (9.0, 205.0, 405.0)
start training...614
hyper_pa

hyper_params: (10.0, 145.0, 630.0)
start training...745
hyper_params: (10.0, 110.0, 920.0)
start training...746
hyper_params: (10.0, 475.0, 215.0)
start training...747
hyper_params: (10.0, 30.0, 420.0)
start training...748
hyper_params: (10.0, 50.0, 370.0)
start training...749
hyper_params: (10.0, 160.0, 3460.0)
start training...750
hyper_params: (10.0, 80.0, 765.0)
start training...751
hyper_params: (10.0, 195.0, 45.0)
start training...752
hyper_params: (10.0, 100.0, 1260.0)
start training...753
hyper_params: (10.0, 55.0, 1045.0)
start training...754
hyper_params: (10.0, 120.0, 430.0)
start training...755
hyper_params: (10.0, 425.0, 1465.0)
start training...756
hyper_params: (10.0, 245.0, 620.0)
start training...757
hyper_params: (10.0, 135.0, 45.0)
start training...758
hyper_params: (10.0, 950.0, 1625.0)
start training...759
hyper_params: (10.0, 65.0, 260.0)
start training...760
hyper_params: (10.0, 95.0, 4830.0)
start training...761
hyper_params: (10.0, 170.0, 6265.0)
start training

hyper_params: (10.0, 30.0, 2590.0)
start training...893
hyper_params: (10.0, 65.0, 1160.0)
start training...894
hyper_params: (10.0, 85.0, 1675.0)
start training...895
hyper_params: (9.0, 50.0, 370.0)
start training...896
hyper_params: (10.0, 500.0, 210.0)
start training...897
hyper_params: (9.0, 105.0, 575.0)
start training...898
hyper_params: (10.0, 35.0, 1310.0)
start training...899
hyper_params: (10.0, 445.0, 760.0)
start training...900
hyper_params: (9.0, 135.0, 1490.0)
start training...901
hyper_params: (10.0, 80.0, 955.0)
start training...902
hyper_params: (10.0, 210.0, 2290.0)
start training...903
hyper_params: (9.0, 160.0, 40.0)
start training...904
hyper_params: (10.0, 65.0, 430.0)
start training...905
hyper_params: (10.0, 30.0, 200.0)
start training...906
hyper_params: (9.0, 240.0, 1140.0)
start training...907
hyper_params: (10.0, 120.0, 655.0)
start training...908
hyper_params: (10.0, 90.0, 7860.0)
start training...909
hyper_params: (6.0, 190.0, 1650.0)
start training...910

In [19]:
test_y_pred_agg = np.zeros(shape=test_y.shape)
for test_y_pred in test_y_pred_list:
    test_y_pred_agg += test_y_pred
test_y_pred_agg /= len(test_y_pred_list)

print('Test msre is: ' + str(sqrt(mean_squared_error(test_y_tmp, test_y_pred_agg))))

Test msre is: 0.0


In [22]:
tpe_trials.results

[{'loss': 1.0489852283746777, 'status': 'ok'},
 {'loss': 1.0484295488979563, 'status': 'ok'},
 {'loss': 1.0529061439142497, 'status': 'ok'},
 {'loss': 1.0256654398984903, 'status': 'ok'},
 {'loss': 1.0533371222391328, 'status': 'ok'},
 {'loss': 1.0539926454246016, 'status': 'ok'},
 {'loss': 1.0467743984110414, 'status': 'ok'},
 {'loss': 1.052725270175449, 'status': 'ok'},
 {'loss': 1.046422771902942, 'status': 'ok'},
 {'loss': 1.0480135168797728, 'status': 'ok'},
 {'loss': 1.053601648433423, 'status': 'ok'},
 {'loss': 1.0272364714298896, 'status': 'ok'},
 {'loss': 1.0380943002252152, 'status': 'ok'},
 {'loss': 1.0526738807409093, 'status': 'ok'},
 {'loss': 1.0469845904950983, 'status': 'ok'},
 {'loss': 1.0548726142570182, 'status': 'ok'},
 {'loss': 1.0498507141721223, 'status': 'ok'},
 {'loss': 1.0531865191298027, 'status': 'ok'},
 {'loss': 1.0277236464899961, 'status': 'ok'},
 {'loss': 1.051521088763826, 'status': 'ok'},
 {'loss': 1.0278288494588421, 'status': 'ok'},
 {'loss': 1.03408

In [None]:
# lgb_params = {
#                'feature_fraction': 0.75,
#                'metric': 'rmse',
#                'nthread':1, 
#                'min_data_in_leaf': 2**7, 
#                'bagging_fraction': 0.75, 
#                'learning_rate': 0.03, 
#                'objective': 'mse', 
#                'bagging_seed': 2**7, 
#                'num_leaves': 2**6,
#                'bagging_freq':1,
#                'verbose':0 
#               }
# model = lgb.train(lgb_params, lgb.Dataset(train_x, label=train_y), 100)

# pred_y = model.predict(train_x)
# print('Train R-squared for LightGBM is %f' % r2_score(train_y, pred_y))
    
# pred_y = model.predict(test_x)
# if test_y is not None:
#     print('Test R-squared for LightGBM is %f' % r2_score(test_y, pred_y))

In [None]:
# sqrt(mean_squared_error(test_y, pred_y))

In [None]:
# model = LinearRegression().fit(train_x, train_y)

# pred_y = model.predict(train_x)
# print('Train R-squared for LinearRegression is %f' % r2_score(train_y, pred_y))
    
# pred_y = model.predict(test_x)
# if test_y is not None:
#     print('Test R-squared for LinearRegression is %f' % r2_score(test_y, pred_y))


In [None]:
# sqrt(mean_squared_error(test_y, pred_y))