In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline 
from tqdm import tqdm_notebook as tn
import gc

import sklearn
from sklearn.model_selection import KFold
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from math import sqrt

import category_encoders as ce
from hyperopt import tpe, fmin, hp, Trials

import lightgbm as lgb

from itertools import product

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
def downcast_dtypes(df):
    '''
        Changes column types in the dataframe: 
                
                `float64` type to `float32`
                `int64`   type to `int32`
    '''
    
    # Select columns to downcast
    float_cols = [c for c in df if df[c].dtype == "float64"]
    int_cols =   [c for c in df if df[c].dtype == "int64"]
    
    # Downcast
    df[float_cols] = df[float_cols].astype(np.float32)
    df[int_cols]   = df[int_cols].astype(np.int32)
    
    return df

## Read Data

In [3]:
DATA_FOLDER = "../input"
TMP_FOLDER = os.path.join(DATA_FOLDER, "futuresalepredictiontmp")

In [4]:
FOLDER = TMP_FOLDER
#FOLDER = SUBMISSION_FOLDER

train_data_path = os.path.join(FOLDER, "train.csv")
test_data_path = os.path.join(FOLDER, "test.csv")

train_data = pd.read_csv(train_data_path).drop(columns=['month', 'date_block_num'])
test_data = pd.read_csv(test_data_path).drop(columns=['month', 'date_block_num'])

train_data = downcast_dtypes(train_data)
test_data = downcast_dtypes(test_data)

print("train data shape: " + str(train_data.shape))
train_data.dtypes

train data shape: (6186922, 88)


item_category_id                         int32
shop_id                                  int32
item_id                                  int32
target                                 float32
shop_id_count_month_lag_1              float32
shop_id_sale_month_lag_1               float32
item_id_count_month_lag_1              float32
item_id_sale_month_lag_1               float32
item_price_mean_month_lag_1            float32
item_price_std_month_lag_1             float32
item_shop_cnt_sum_month_lag_1          float32
item_shop_cnt_std_month_lag_1          float32
item_shop_sale_sum_month_lag_1         float32
item_shop_sale_std_month_lag_1         float32
item_category_id_count_month_lag_1     float32
item_category_id_sale_month_lag_1      float32
shop_id_count_month_lag_2              float32
shop_id_sale_month_lag_2               float32
item_id_count_month_lag_2              float32
item_id_sale_month_lag_2               float32
item_price_mean_month_lag_2            float32
item_price_st

## Make X and y

In [5]:
train_y = train_data['target']
train_x = train_data.drop(['target'], axis='columns')
feature_names = train_x.columns.tolist()

if 'target' in test_data.columns.tolist():
    test_x = test_data.drop(['target'], axis='columns')
    test_y = test_data['target']
else:
    test_x = test_data

## Clipping

In [6]:
train_y.loc[train_y > 40] = 40
if test_y is not None:
    test_y.loc[test_y > 40] = 40

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


## Categorical Feature Encoding

In [7]:
categorical_columns = [
    'shop_id', 'item_id', 'item_category_id'
]

In [8]:
# Leave-One-Out Encoding
categorical_encoder = ce.LeaveOneOutEncoder(cols=categorical_columns, impute_missing=False, drop_invariant=True)
categorical_encoder.fit(train_x, train_y)

train_x = categorical_encoder.transform(train_x)
test_x = categorical_encoder.transform(test_x)

train_x = downcast_dtypes(train_x)
test_x = downcast_dtypes(test_x)

train_x.dtypes

item_category_id                       float32
shop_id                                float32
item_id                                float32
shop_id_count_month_lag_1              float32
shop_id_sale_month_lag_1               float32
item_id_count_month_lag_1              float32
item_id_sale_month_lag_1               float32
item_price_mean_month_lag_1            float32
item_price_std_month_lag_1             float32
item_shop_cnt_sum_month_lag_1          float32
item_shop_cnt_std_month_lag_1          float32
item_shop_sale_sum_month_lag_1         float32
item_shop_sale_std_month_lag_1         float32
item_category_id_count_month_lag_1     float32
item_category_id_sale_month_lag_1      float32
shop_id_count_month_lag_2              float32
shop_id_sale_month_lag_2               float32
item_id_count_month_lag_2              float32
item_id_sale_month_lag_2               float32
item_price_mean_month_lag_2            float32
item_price_std_month_lag_2             float32
item_shop_cnt

## Imputation

In [9]:
train_x.fillna(0, inplace=True)
test_x.fillna(0, inplace=True)

train_y.fillna(0, inplace=True)
test_y.fillna(0, inplace=True)

## Normalization

In [10]:
normalizer = RobustScaler()
normalizer.fit(train_x)

train_x = normalizer.transform(train_x)
test_x = normalizer.transform(test_x)

# train_y = train_y / 20
# if test_y is not None:
#     test_y = test_y / 20.0

## Model Training

In [11]:
lgb_params = {
               'feature_fraction': 0.75,
               'metric': 'rmse',
               'nthread':3, 
               'min_data_in_leaf': 2**7, 
               'bagging_fraction': 0.75, 
               'learning_rate': 0.03, 
               'objective': 'mse', 
               'bagging_seed': 2**7, 
               'num_leaves': 2**7,
               'bagging_freq':1,
               'verbose':0 
              }

lgb_dataset = lgb.Dataset(train_x, label=train_y)

print("start training...")
model = lgb.train(lgb_params, lgb_dataset, 100)

pred_train_y = model.predict(train_x)
#print('Train R-squared for LightGBM is %f' % r2_score(train_y, pred_y))
    
pred_test_y = model.predict(test_x)



start training...


In [12]:
train_y_tmp = train_y.copy()
test_y_tmp = test_y.copy()

train_y_tmp[train_y_tmp>20] = 20.0
pred_train_y[pred_train_y>20] = 20.0 
print('Train R-squared for LightGBM is %f' % r2_score(train_y_tmp, pred_train_y))
print('Train msre is: ' + str(sqrt(mean_squared_error(train_y_tmp, pred_train_y))))

if test_y is not None:
    test_y_tmp[test_y_tmp>20] = 20.0
    pred_test_y[pred_test_y>20] = 20.0 
    print('Test R-squared for LightGBM is %f' % r2_score(test_y_tmp, pred_test_y))
    print('Test msre is: ' + str(sqrt(mean_squared_error(test_y_tmp, pred_test_y))))
    
gc.collect();

Train R-squared for LightGBM is 0.460630
Train msre is: 0.8726989153191355
Test R-squared for LightGBM is 0.139518
Test msre is: 1.0539477867028537


### Pick Most Important Features

In [13]:
feature_importances = pd.DataFrame(
    {
        'feature_name': feature_names,
        'feature_importance': model.feature_importance().tolist()
    }
)

feature_importances.sort_values('feature_importance', ascending=False, inplace=True)
important_feature_names = set(feature_importances[:30]['feature_name'].tolist())

important_feature_names

feature_boolean_mask = np.asarray(list(
    map(
        lambda feature_name: feature_name in important_feature_names,
        feature_names
    )
))

train_x = train_x[:, feature_boolean_mask]
test_x = test_x[:, feature_boolean_mask]

lgb_dataset = lgb.Dataset(train_x, label=train_y)

In [14]:
del model, train_x
gc.collect();

## Hyper-parameter Tuning

In [15]:
test_y_pred_list = list()
def objective(hyper_param):
    
    print("hyper_params: " + str(hyper_param))
    max_depth, num_leaves, min_data_in_leaf = hyper_param
    
    lgb_params = {
               'feature_fraction': 0.75,
               'metric': 'rmse',
               'nthread':3, 
               'min_data_in_leaf': int(min_data_in_leaf), 
               'bagging_fraction': 0.75, 
               'learning_rate': 0.03, 
               'objective': 'mse', 
               'bagging_seed': 2**7, 
               'num_leaves': int(num_leaves),
               'max_depth': int(max_depth),
               'bagging_freq':1,
               'verbose':0 
              }
    print("start training..." + str(len(test_y_pred_list)))
    model = lgb.train(lgb_params, lgb_dataset, 100)
    pred_test_y = model.predict(test_x)
    
    pred_test_y[pred_test_y>20] = 20.0 
    result = sqrt(mean_squared_error(test_y_tmp, pred_test_y))
    test_y_pred_list.append(pred_test_y)
    
    gc.collect();
    
    return result


# tuning algorithm
tpe_algo = tpe.suggest

# search space
space = [
    hp.quniform('max_depth', 5, 10, 1),
    hp.quniform('num_leaves', 2**5, 2**10, 5),
    hp.quniform('min_data_in_leaf', 2**5, 10**4, 5)
]

# history
tpe_trials = Trials()


In [16]:
# Run 2000 evals with the tpe algorithm
tpe_best = fmin(fn=objective, space=space, algo=tpe_algo, trials=tpe_trials, max_evals=100, rstate= np.random.RandomState(50))

print(tpe_best)

hyper_params: (8.0, 1020.0, 5860.0)
start training...0
hyper_params: (8.0, 885.0, 4530.0)
start training...1
hyper_params: (9.0, 775.0, 9835.0)
start training...2
hyper_params: (7.0, 435.0, 655.0)
start training...3
hyper_params: (6.0, 40.0, 8950.0)
start training...4
hyper_params: (6.0, 405.0, 4390.0)
start training...5
hyper_params: (9.0, 660.0, 3755.0)
start training...6
hyper_params: (7.0, 305.0, 7800.0)
start training...7
hyper_params: (9.0, 555.0, 4425.0)
start training...8
hyper_params: (9.0, 775.0, 5535.0)
start training...9
hyper_params: (6.0, 585.0, 7245.0)
start training...10
hyper_params: (6.0, 835.0, 200.0)
start training...11
hyper_params: (10.0, 130.0, 2365.0)
start training...12
hyper_params: (6.0, 670.0, 6905.0)
start training...13
hyper_params: (7.0, 950.0, 4545.0)
start training...14
hyper_params: (5.0, 355.0, 7075.0)
start training...15
hyper_params: (7.0, 85.0, 5785.0)
start training...16
hyper_params: (10.0, 195.0, 9615.0)
start training...17
hyper_params: (5.0, 5

In [28]:
losses = np.asarray(list(map(
    lambda trial_result: trial_result['loss'],
    tpe_trials.results
)))
pred_ys = np.asarray(test_y_pred_list)

indices = np.argsort(losses)
losses_sorted = losses[indices]
pred_ys_soretd = pred_ys[indices, :]
pred_ys_sorted = pred_ys_soretd[0:5, :].tolist()


In [29]:
test_y_pred_agg = np.zeros(shape=test_y.shape)
for test_y_pred in pred_ys_sorted:
    test_y_pred_agg += np.asarray(test_y_pred)
test_y_pred_agg /= len(pred_ys_sorted)

print('Test msre is: ' + str(sqrt(mean_squared_error(test_y_tmp, test_y_pred_agg))))

Test msre is: 1.0265782761097844


In [18]:
tpe_trials.results

[{'loss': 1.0558229477326573, 'status': 'ok'},
 {'loss': 1.0583271782471553, 'status': 'ok'},
 {'loss': 1.0574323928972862, 'status': 'ok'},
 {'loss': 1.031782203751633, 'status': 'ok'},
 {'loss': 1.059645805966958, 'status': 'ok'},
 {'loss': 1.0632516446551346, 'status': 'ok'},
 {'loss': 1.0558374707251503, 'status': 'ok'},
 {'loss': 1.0594748586785376, 'status': 'ok'},
 {'loss': 1.0563795126476294, 'status': 'ok'},
 {'loss': 1.055948506750416, 'status': 'ok'},
 {'loss': 1.0615185174700679, 'status': 'ok'},
 {'loss': 1.0293080718667411, 'status': 'ok'},
 {'loss': 1.0454800124655141, 'status': 'ok'},
 {'loss': 1.0561078045876906, 'status': 'ok'},
 {'loss': 1.0576964920493244, 'status': 'ok'},
 {'loss': 1.0622568725319321, 'status': 'ok'},
 {'loss': 1.0569191212396443, 'status': 'ok'},
 {'loss': 1.056333146990011, 'status': 'ok'},
 {'loss': 1.0319578831635274, 'status': 'ok'},
 {'loss': 1.057057118322009, 'status': 'ok'},
 {'loss': 1.040620873955733, 'status': 'ok'},
 {'loss': 1.0483360

In [19]:
# lgb_params = {
#                'feature_fraction': 0.75,
#                'metric': 'rmse',
#                'nthread':1, 
#                'min_data_in_leaf': 2**7, 
#                'bagging_fraction': 0.75, 
#                'learning_rate': 0.03, 
#                'objective': 'mse', 
#                'bagging_seed': 2**7, 
#                'num_leaves': 2**6,
#                'bagging_freq':1,
#                'verbose':0 
#               }
# model = lgb.train(lgb_params, lgb.Dataset(train_x, label=train_y), 100)

# pred_y = model.predict(train_x)
# print('Train R-squared for LightGBM is %f' % r2_score(train_y, pred_y))
    
# pred_y = model.predict(test_x)
# if test_y is not None:
#     print('Test R-squared for LightGBM is %f' % r2_score(test_y, pred_y))

In [20]:
# sqrt(mean_squared_error(test_y, pred_y))

In [21]:
# model = LinearRegression().fit(train_x, train_y)

# pred_y = model.predict(train_x)
# print('Train R-squared for LinearRegression is %f' % r2_score(train_y, pred_y))
    
# pred_y = model.predict(test_x)
# if test_y is not None:
#     print('Test R-squared for LinearRegression is %f' % r2_score(test_y, pred_y))


In [22]:
# sqrt(mean_squared_error(test_y, pred_y))