In [1]:
import pandas as pd
import numpy as np
import os
import data
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline 
from tqdm import tqdm_notebook as tn
import gc

import sklearn
from sklearn.model_selection import KFold
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import r2_score

import category_encoders as ce


import lightgbm as lgb

from itertools import product

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
def downcast_dtypes(df):
    '''
        Changes column types in the dataframe: 
                
                `float64` type to `float32`
                `int64`   type to `int32`
    '''
    
    # Select columns to downcast
    float_cols = [c for c in df if df[c].dtype == "float64"]
    int_cols =   [c for c in df if df[c].dtype == "int64"]
    
    # Downcast
    df[float_cols] = df[float_cols].astype(np.float32)
    df[int_cols]   = df[int_cols].astype(np.int32)
    
    return df

## Read Data

In [3]:
DATA_FOLDER = os.path.join(data.DATA_ROOT, "readonly")
TMP_FOLDER = os.path.join(data.DATA_ROOT, "tmp")
#DATA_FOLDER = "../input"
#TMP_FOLDER = os.path.join(data.DATA_ROOT, "tmp")

VALIDATION_FOLDER = os.path.join(TMP_FOLDER, "validation")
SUBMISSION_FOLDER = os.path.join(TMP_FOLDER, "submission")

In [4]:
FOLDER = VALIDATION_FOLDER
#FOLDER = SUBMISSION_FOLDER

train_data_path = os.path.join(FOLDER, "train.csv")
test_data_path = os.path.join(FOLDER, "test.csv")

train_data = pd.read_csv(train_data_path)
test_data = pd.read_csv(test_data_path)

train_data = downcast_dtypes(train_data)
test_data = downcast_dtypes(test_data)

print("train data shape: " + str(train_data.shape))
train_data.dtypes

train data shape: (889869, 90)


shop_id                                  int32
item_id                                  int32
item_category_id                         int32
date_block_num                           int32
month                                    int32
target                                 float32
shop_id_count_month_lag_1              float32
shop_id_sale_month_lag_1               float32
item_id_count_month_lag_1              float32
item_id_sale_month_lag_1               float32
item_category_id_count_month_lag_1     float32
item_category_id_sale_month_lag_1      float32
item_price_mean_month_lag_1            float32
item_price_std_month_lag_1             float32
item_shop_cnt_sum_month_lag_1          float32
item_shop_cnt_std_month_lag_1          float32
item_shop_sale_sum_month_lag_1         float32
item_shop_sale_std_month_lag_1         float32
shop_id_count_month_lag_2              float32
shop_id_sale_month_lag_2               float32
item_id_count_month_lag_2              float32
item_id_sale_

## Make X and y

In [5]:
train_y = train_data['target']
train_x = train_data.drop(['target'], axis='columns')

if 'target' in test_data.columns.tolist():
    test_x = test_data.drop(['target'], axis='columns')
    test_y = test_data['target']
else:
    test_x = test_data

## Categorical Feature Encoding

In [6]:
categorical_columns = [
    'shop_id', 'item_id', 'item_category_id'
]

In [7]:
# Leave-One-Out Encoding
categorical_encoder = ce.TargetEncoder(cols=categorical_columns, impute_missing=False, drop_invariant=True)
categorical_encoder.fit(train_x, train_y)

train_x = categorical_encoder.transform(train_x)
test_x = categorical_encoder.transform(test_x)

train_x.dtypes

train_x = downcast_dtypes(train_x)
test_x = downcast_dtypes(test_x)

## Imputation

In [8]:
train_x.fillna(-1, inplace=True)
test_x.fillna(-1, inplace=True)

## Clipping

In [9]:
train_y[train_y > 20] = 20
if test_y is not None:
    test_y[test_y > 20] = 20

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


## Normalization

In [11]:
normalizer = RobustScaler()
normalizer.fit(train_x)

train_x = normalizer.transform(train_x)
test_x = normalizer.transform(test_x)

train_y = train_y / 20
if test_y is not None:
    test_y = test_y / 20.0

## Model Training

In [14]:
lgb_params = {
               'feature_fraction': 0.75,
               'metric': 'rmse',
               'nthread':1, 
               'min_data_in_leaf': 2**7, 
               'bagging_fraction': 0.75, 
               'learning_rate': 0.03, 
               'objective': 'mse', 
               'bagging_seed': 2**7, 
               'num_leaves': 2**7,
               'bagging_freq':1,
               'verbose':0 
              }
model = lgb.train(lgb_params, lgb.Dataset(train_x, label=train_y), 100)

pred_y = model.predict(train_x)
print('Test R-squared for LightGBM is %f' % r2_score(train_y, pred_y))
    
pred_y = model.predict(test_x)
if test_y is not None:
    print('Test R-squared for LightGBM is %f' % r2_score(test_y, pred_y))

Test R-squared for LightGBM is 0.621857
Test R-squared for LightGBM is 0.327670
