In [1]:
import pandas as pd
import numpy as np
import os
import data
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline 
from tqdm import tqdm_notebook as tn
import gc

import sklearn
from sklearn.model_selection import KFold
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from math import sqrt

import category_encoders as ce
from hyperopt import tpe, fmin, hp, Trials

import lightgbm as lgb

from itertools import product

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
def downcast_dtypes(df):
    '''
        Changes column types in the dataframe: 
                
                `float64` type to `float32`
                `int64`   type to `int32`
    '''
    
    # Select columns to downcast
    float_cols = [c for c in df if df[c].dtype == "float64"]
    int_cols =   [c for c in df if df[c].dtype == "int64"]
    
    # Downcast
    df[float_cols] = df[float_cols].astype(np.float32)
    df[int_cols]   = df[int_cols].astype(np.int32)
    
    return df

## Read Data

In [3]:
DATA_FOLDER = os.path.join(data.DATA_ROOT, "readonly")
TMP_FOLDER = os.path.join(data.DATA_ROOT, "tmp")
#DATA_FOLDER = "../input"
#TMP_FOLDER = os.path.join(data.DATA_ROOT, "tmp")

VALIDATION_FOLDER = os.path.join(TMP_FOLDER, "validation")
SUBMISSION_FOLDER = os.path.join(TMP_FOLDER, "submission")

In [4]:
FOLDER = VALIDATION_FOLDER
#FOLDER = SUBMISSION_FOLDER

train_data_path = os.path.join(FOLDER, "train.csv")
test_data_path = os.path.join(FOLDER, "test.csv")

train_data = pd.read_csv(train_data_path).drop(columns=['month', 'date_block_num'])
test_data = pd.read_csv(test_data_path).drop(columns=['month', 'date_block_num'])

train_data = downcast_dtypes(train_data)
test_data = downcast_dtypes(test_data)

print("train data shape: " + str(train_data.shape))
train_data.dtypes

train data shape: (6186922, 88)


item_category_id                         int32
shop_id                                  int32
item_id                                  int32
target                                 float32
shop_id_count_month_lag_1              float32
shop_id_sale_month_lag_1               float32
item_id_count_month_lag_1              float32
item_id_sale_month_lag_1               float32
item_price_mean_month_lag_1            float32
item_price_std_month_lag_1             float32
item_shop_cnt_sum_month_lag_1          float32
item_shop_cnt_std_month_lag_1          float32
item_shop_sale_sum_month_lag_1         float32
item_shop_sale_std_month_lag_1         float32
item_category_id_count_month_lag_1     float32
item_category_id_sale_month_lag_1      float32
shop_id_count_month_lag_2              float32
shop_id_sale_month_lag_2               float32
item_id_count_month_lag_2              float32
item_id_sale_month_lag_2               float32
item_price_mean_month_lag_2            float32
item_price_st

## Make X and y

In [None]:
train_y = train_data['target']
train_x = train_data.drop(['target'], axis='columns')
feature_names = train_x.columns.tolist()

if 'target' in test_data.columns.tolist():
    test_x = test_data.drop(['target'], axis='columns')
    test_y = test_data['target']
else:
    test_x = test_data

## Clipping

In [None]:
train_y.loc[train_y > 40] = 40
if test_y is not None:
    test_y.loc[test_y > 40] = 40

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


## Categorical Feature Encoding

In [None]:
categorical_columns = [
    'shop_id', 'item_id', 'item_category_id'
]

In [None]:
# Leave-One-Out Encoding
categorical_encoder = ce.LeaveOneOutEncoder(cols=categorical_columns, impute_missing=False, drop_invariant=True)
categorical_encoder.fit(train_x, train_y)

train_x = categorical_encoder.transform(train_x)
test_x = categorical_encoder.transform(test_x)

train_x = downcast_dtypes(train_x)
test_x = downcast_dtypes(test_x)

train_x.dtypes

item_category_id                       float32
shop_id                                float32
item_id                                float32
shop_id_count_month_lag_1              float32
shop_id_sale_month_lag_1               float32
item_id_count_month_lag_1              float32
item_id_sale_month_lag_1               float32
item_price_mean_month_lag_1            float32
item_price_std_month_lag_1             float32
item_shop_cnt_sum_month_lag_1          float32
item_shop_cnt_std_month_lag_1          float32
item_shop_sale_sum_month_lag_1         float32
item_shop_sale_std_month_lag_1         float32
item_category_id_count_month_lag_1     float32
item_category_id_sale_month_lag_1      float32
shop_id_count_month_lag_2              float32
shop_id_sale_month_lag_2               float32
item_id_count_month_lag_2              float32
item_id_sale_month_lag_2               float32
item_price_mean_month_lag_2            float32
item_price_std_month_lag_2             float32
item_shop_cnt

## Imputation

In [None]:
train_x.fillna(0, inplace=True)
test_x.fillna(0, inplace=True)

train_y.fillna(0, inplace=True)
test_y.fillna(0, inplace=True)

## Normalization

In [None]:
normalizer = RobustScaler()
normalizer.fit(train_x)

train_x = normalizer.transform(train_x)
test_x = normalizer.transform(test_x)

# train_y = train_y / 20
# if test_y is not None:
#     test_y = test_y / 20.0

## Model Training

In [None]:
lgb_params = {
               'feature_fraction': 0.75,
               'metric': 'rmse',
               'nthread':3, 
               'min_data_in_leaf': 58470, 
               'bagging_fraction': 0.75, 
               'learning_rate': 0.03, 
               'objective': 'mse', 
               'bagging_seed': 2**7, 
               'num_leaves': 2**9,
               'bagging_freq':1,
               'max_depth': 10,
               'verbose':0 
              }

lgb_dataset = lgb.Dataset(train_x, label=train_y)

print("start training...")
model = lgb.train(lgb_params, lgb_dataset, 100)

pred_train_y = model.predict(train_x)
#print('Train R-squared for LightGBM is %f' % r2_score(train_y, pred_y))
    
pred_test_y = model.predict(test_x)



start training...


In [None]:
train_y_tmp = train_y.copy()
test_y_tmp = test_y.copy()

train_y_tmp[train_y_tmp>20] = 20.0
pred_train_y[pred_train_y>20] = 20.0 
print('Train R-squared for LightGBM is %f' % r2_score(train_y_tmp, pred_train_y))
print('Train msre is: ' + str(sqrt(mean_squared_error(train_y_tmp, pred_train_y))))

if test_y is not None:
    test_y_tmp[test_y_tmp>20] = 20.0
    pred_test_y[pred_test_y>20] = 20.0 
    print('Test R-squared for LightGBM is %f' % r2_score(test_y_tmp, pred_test_y))
    print('Test msre is: ' + str(sqrt(mean_squared_error(test_y_tmp, pred_test_y))))
    
del train_y_tmp, test_y_tmp
gc.collect();

Train R-squared for LightGBM is 0.268050
Train msre is: 1.0166271399383389
Test R-squared for LightGBM is 0.107186
Test msre is: 1.0735659013397745


### Pick Most Important Features

In [None]:
feature_importances = pd.DataFrame(
    {
        'feature_name': feature_names,
        'feature_importance': model.feature_importance().tolist()
    }
)

feature_importances.sort_values('feature_importance', ascending=False, inplace=True)
important_feature_names = set(feature_importances[:30]['feature_name'].tolist())

important_feature_names

feature_boolean_mask = np.asarray(list(
    map(
        lambda feature_name: feature_name in important_feature_names,
        feature_names
    )
))

train_x = train_x[:, feature_boolean_mask]
test_x = test_x[:, feature_boolean_mask]

lgb_dataset = lgb.Dataset(train_x, label=train_y)

In [None]:
del model, train_x
gc.collect();

## Hyper-parameter Tuning

In [None]:
def objective(hyper_param):
    
    print("hyper_params: " + str(hyper_param))
    max_depth, num_leaves, min_data_in_leaf = hyper_param
    
    lgb_params = {
               'feature_fraction': 0.75,
               'metric': 'rmse',
               'nthread':3, 
               'min_data_in_leaf': int(min_data_in_leaf), 
               'bagging_fraction': 0.75, 
               'learning_rate': 0.03, 
               'objective': 'mse', 
               'bagging_seed': 2**7, 
               'num_leaves': int(num_leaves),
               'max_depth': int(max_depth),
               'bagging_freq':1,
               'verbose':0 
              }
    print("start training...")
    model = lgb.train(lgb_params, lgb_dataset, 100)
    print("start prediction...")
    pred_test_y = model.predict(test_x)
    print("start estimation...")
    
    train_y_tmp = train_y.copy()
    test_y_tmp = test_y.copy()
    test_y_tmp[test_y_tmp>20] = 20.0
    pred_test_y[pred_test_y>20] = 20.0 
    result = sqrt(mean_squared_error(test_y_tmp, pred_test_y))
    print('Test msre is: ' + str(result))
    del train_y_tmp, test_y_tmp
    gc.collect();
    print("estimation done!")
    
    return result


# tuning algorithm
tpe_algo = tpe.suggest

# search space
space = [
    hp.quniform('max_depth', 5, 10, 1),
    hp.quniform('num_leaves', 2**5, 2**10, 5),
    hp.quniform('min_data_in_leaf', 2**5, 10**4, 5)
]

# history
tpe_trials = Trials()


In [None]:
# Run 2000 evals with the tpe algorithm
tpe_best = fmin(fn=objective, space=space, algo=tpe_algo, trials=tpe_trials, max_evals=1000, rstate= np.random.RandomState(50))

print(tpe_best)

hyper_params: (8.0, 1020.0, 5860.0)
start training...
start prediction...
start estimation...
Test msre is: 1.0489852283746777
estimation done!
hyper_params: (8.0, 885.0, 4530.0)
start training...
start prediction...
start estimation...
Test msre is: 1.0484295488979563
estimation done!
hyper_params: (9.0, 775.0, 9835.0)
start training...
start prediction...
start estimation...
Test msre is: 1.0529061439142497
estimation done!
hyper_params: (7.0, 435.0, 655.0)
start training...
start prediction...
start estimation...
Test msre is: 1.0256654398984903
estimation done!
hyper_params: (6.0, 40.0, 8950.0)
start training...
start prediction...
start estimation...
Test msre is: 1.0533371222391328
estimation done!
hyper_params: (6.0, 405.0, 4390.0)
start training...
start prediction...
start estimation...
Test msre is: 1.0539926454246016
estimation done!
hyper_params: (9.0, 660.0, 3755.0)
start training...
start prediction...
start estimation...
Test msre is: 1.0467743984110414
estimation done!


start estimation...
Test msre is: 1.0322290598415191
estimation done!
hyper_params: (7.0, 220.0, 2450.0)
start training...
start prediction...
start estimation...
Test msre is: 1.0385169501365437
estimation done!
hyper_params: (9.0, 335.0, 730.0)
start training...
start prediction...
start estimation...
Test msre is: 1.0288418737225933
estimation done!
hyper_params: (5.0, 455.0, 1900.0)
start training...
start prediction...
start estimation...
Test msre is: 1.0388397383835295
estimation done!
hyper_params: (8.0, 290.0, 4405.0)
start training...
start prediction...
start estimation...
Test msre is: 1.045876116531368
estimation done!
hyper_params: (6.0, 575.0, 455.0)
start training...
start prediction...
start estimation...
Test msre is: 1.025870115221238
estimation done!
hyper_params: (7.0, 660.0, 3615.0)
start training...
start prediction...
start estimation...
Test msre is: 1.0478235175958002
estimation done!
hyper_params: (10.0, 755.0, 5250.0)
start training...
start prediction...
st

start prediction...
start estimation...
Test msre is: 1.0372510492798677
estimation done!
hyper_params: (9.0, 310.0, 715.0)
start training...
start prediction...
start estimation...
Test msre is: 1.0267870454612273
estimation done!
hyper_params: (8.0, 785.0, 3230.0)
start training...
start prediction...
start estimation...
Test msre is: 1.0428968934165246
estimation done!
hyper_params: (7.0, 520.0, 2510.0)
start training...
start prediction...
start estimation...
Test msre is: 1.039868419505434
estimation done!
hyper_params: (5.0, 115.0, 9515.0)
start training...
start prediction...
start estimation...
Test msre is: 1.0569481639426284
estimation done!
hyper_params: (6.0, 700.0, 7170.0)
start training...
start prediction...
start estimation...
Test msre is: 1.0525995119908187
estimation done!
hyper_params: (10.0, 855.0, 40.0)
start training...
start prediction...
start estimation...
Test msre is: 1.0488863745044643
estimation done!
hyper_params: (7.0, 435.0, 4225.0)
start training...
st

In [None]:
# lgb_params = {
#                'feature_fraction': 0.75,
#                'metric': 'rmse',
#                'nthread':1, 
#                'min_data_in_leaf': 2**7, 
#                'bagging_fraction': 0.75, 
#                'learning_rate': 0.03, 
#                'objective': 'mse', 
#                'bagging_seed': 2**7, 
#                'num_leaves': 2**6,
#                'bagging_freq':1,
#                'verbose':0 
#               }
# model = lgb.train(lgb_params, lgb.Dataset(train_x, label=train_y), 100)

# pred_y = model.predict(train_x)
# print('Train R-squared for LightGBM is %f' % r2_score(train_y, pred_y))
    
# pred_y = model.predict(test_x)
# if test_y is not None:
#     print('Test R-squared for LightGBM is %f' % r2_score(test_y, pred_y))

In [None]:
# sqrt(mean_squared_error(test_y, pred_y))

In [None]:
# model = LinearRegression().fit(train_x, train_y)

# pred_y = model.predict(train_x)
# print('Train R-squared for LinearRegression is %f' % r2_score(train_y, pred_y))
    
# pred_y = model.predict(test_x)
# if test_y is not None:
#     print('Test R-squared for LinearRegression is %f' % r2_score(test_y, pred_y))


In [None]:
# sqrt(mean_squared_error(test_y, pred_y))