In [1]:
import pandas as pd
import numpy as np
import os
import data
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline 
from tqdm import tqdm_notebook as tn
import gc

import sklearn
from sklearn.model_selection import KFold
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from math import sqrt

import category_encoders as ce
from hyperopt import tpe, fmin, hp, Trials

from xgboost import XGBRegressor
from xgboost import plot_importance

from itertools import product

In [2]:
def downcast_dtypes(df):
    """
    Shrink 64-bit numeric columns of ``df`` to their 32-bit counterparts.

    Columns whose dtype is ``float64`` become ``float32`` and columns whose
    dtype is ``int64`` become ``int32``; every other column is left alone.

    The frame is modified in place and the same object is returned, so the
    call can be used either as a statement or in an assignment.
    """
    # (dtype to look for, dtype to cast to) -- floats are handled first, then ints.
    conversions = (("float64", np.float32), ("int64", np.int32))

    for wide_dtype, narrow_dtype in conversions:
        matching = [name for name in df if df[name].dtype == wide_dtype]
        df[matching] = df[matching].astype(narrow_dtype)

    return df

## Read Data

In [3]:
# Folder layout, all resolved relative to the project's data root.
DATA_FOLDER = os.path.join(data.DATA_ROOT, "readonly")
TMP_FOLDER = os.path.join(data.DATA_ROOT, "tmp")
# Alternative location kept for reference:
#DATA_FOLDER = "../input"

# Two sub-folders of TMP_FOLDER, one per prepared train/test split.
VALIDATION_FOLDER, SUBMISSION_FOLDER = (
    os.path.join(TMP_FOLDER, split_name)
    for split_name in ("validation", "submission")
)

In [4]:
# Pick which prepared split to work on (swap the two lines to change it).
FOLDER = VALIDATION_FOLDER
#FOLDER = SUBMISSION_FOLDER

train_data_path, test_data_path = (
    os.path.join(FOLDER, file_name) for file_name in ("train.csv", "test.csv")
)

# Same pipeline for both files: read the CSV, drop the two columns that are
# not used further on, then shrink 64-bit numeric columns to 32-bit.
train_data, test_data = (
    downcast_dtypes(pd.read_csv(csv_path).drop(columns=['month', 'date_block_num']))
    for csv_path in (train_data_path, test_data_path)
)

print("train data shape: %s" % (train_data.shape,))
train_data.dtypes

train data shape: (6186922, 88)


item_category_id                         int32
shop_id                                  int32
item_id                                  int32
target                                 float32
shop_id_count_month_lag_1              float32
shop_id_sale_month_lag_1               float32
item_id_count_month_lag_1              float32
item_id_sale_month_lag_1               float32
item_price_mean_month_lag_1            float32
item_price_std_month_lag_1             float32
item_shop_cnt_sum_month_lag_1          float32
item_shop_cnt_std_month_lag_1          float32
item_shop_sale_sum_month_lag_1         float32
item_shop_sale_std_month_lag_1         float32
item_category_id_count_month_lag_1     float32
item_category_id_sale_month_lag_1      float32
shop_id_count_month_lag_2              float32
shop_id_sale_month_lag_2               float32
item_id_count_month_lag_2              float32
item_id_sale_month_lag_2               float32
item_price_mean_month_lag_2            float32
item_price_st

## Make X and y

In [5]:
# Split the training frame into labels and features.
# .copy() detaches the labels from train_data, so later modifications of
# train_y do not act on a slice of the frame (the source of pandas'
# SettingWithCopyWarning).
train_y = train_data['target'].copy()
train_x = train_data.drop(columns=['target'])
feature_names = train_x.columns.tolist()

# The test file may or may not carry labels. test_y is always bound -- to None
# when there are no labels -- so the `if test_y is not None` checks in later
# cells work instead of raising NameError.
if 'target' in test_data.columns:
    test_y = test_data['target'].copy()
    test_x = test_data.drop(columns=['target'])
else:
    test_y = None
    test_x = test_data

## Clipping

In [6]:
# Cap the target at 40. Series.clip returns a new Series, so nothing is
# assigned into a slice of the original DataFrame (the masked .loc assignment
# used previously triggered SettingWithCopyWarning). NaN values are left as
# NaN, exactly as the masked assignment left them.
train_y = train_y.clip(upper=40)
if test_y is not None:
    test_y = test_y.clip(upper=40)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


## Categorical Feature Encoding

In [7]:
# Identifier columns that are handed to the categorical encoder.
categorical_columns = ['shop_id', 'item_id', 'item_category_id']

In [8]:
# Leave-One-Out Encoding
categorical_encoder = ce.LeaveOneOutEncoder(cols=categorical_columns, impute_missing=False, drop_invariant=True)
categorical_encoder.fit(train_x, train_y)

train_x = categorical_encoder.transform(train_x)
test_x = categorical_encoder.transform(test_x)

train_x = downcast_dtypes(train_x)
test_x = downcast_dtypes(test_x)

train_x.dtypes

item_category_id                       float32
shop_id                                float32
item_id                                float32
shop_id_count_month_lag_1              float32
shop_id_sale_month_lag_1               float32
item_id_count_month_lag_1              float32
item_id_sale_month_lag_1               float32
item_price_mean_month_lag_1            float32
item_price_std_month_lag_1             float32
item_shop_cnt_sum_month_lag_1          float32
item_shop_cnt_std_month_lag_1          float32
item_shop_sale_sum_month_lag_1         float32
item_shop_sale_std_month_lag_1         float32
item_category_id_count_month_lag_1     float32
item_category_id_sale_month_lag_1      float32
shop_id_count_month_lag_2              float32
shop_id_sale_month_lag_2               float32
item_id_count_month_lag_2              float32
item_id_sale_month_lag_2               float32
item_price_mean_month_lag_2            float32
item_price_std_month_lag_2             float32
item_shop_cnt

## Imputation

In [9]:
# Replace missing values with 0. Results are re-assigned instead of using
# inplace=True so that no write goes through to a frame that another name
# still refers to (test_x can be the very same object as test_data).
train_x = train_x.fillna(0)
test_x = test_x.fillna(0)

train_y = train_y.fillna(0)
# test_y is None when the test file has no labels; guard like the other cells do.
if test_y is not None:
    test_y = test_y.fillna(0)

## Normalization

In [10]:
# Fit the scaler on the training features only, then apply the same fitted
# transformation to both feature sets.
normalizer = RobustScaler().fit(train_x)
train_x, test_x = (
    normalizer.transform(features) for features in (train_x, test_x)
)

# Target scaling is currently switched off:
# train_y = train_y / 20
# if test_y is not None:
#     test_y = test_y / 20.0

## Model Training

In [11]:
model = XGBRegressor(
    max_depth=8,
    n_estimators=1000,
    min_child_weight=300,
    colsample_bytree=0.8,
    subsample=0.8,
    eta=0.3,
    seed=42)

# The training set is always monitored. The test set is added only when it has
# labels; it goes last because the last entry of eval_set is the one used for
# early stopping.
has_test_labels = test_y is not None
eval_set = [(train_x, train_y)]
if has_test_labels:
    eval_set.append((test_x, test_y))

print("start training...")
model.fit(
    train_x,
    train_y,
    eval_metric="rmse",
    eval_set=eval_set,
    verbose=True,
    # Without held-out labels, early stopping would watch the training error,
    # so it is disabled in that case.
    early_stopping_rounds=10 if has_test_labels else None)

pred_train_y = model.predict(train_x)
# Fixed: this used the undefined name `pred_y` and raised NameError.
print('Train R-squared for XGBoost is %f' % r2_score(train_y, pred_train_y))

pred_test_y = model.predict(test_x)



start training...
[15:38:34] Tree method is automatically selected to be 'approx' for faster speed. To use old behavior (exact greedy algorithm on single machine), set tree_method to 'exact'.
[0]	validation_0-rmse:1.39602	validation_1-rmse:1.38101
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 10 rounds.
[1]	validation_0-rmse:1.34924	validation_1-rmse:1.35577


KeyboardInterrupt: 

In [None]:
def _report_clipped_scores(split_name, y_true, y_pred, upper=20.0):
    """
    Print R-squared and RMSE for one split after capping values at ``upper``.

    split_name -- label used in the printed lines (e.g. 'Train').
    y_true     -- pandas Series of labels; capped on a copy, the Series passed
                  in is left unchanged.
    y_pred     -- numpy array of predictions; capped IN PLACE, so the caller's
                  array stays capped after the call.
    upper      -- cap applied to both labels and predictions.
    """
    y_true_capped = y_true.clip(upper=upper)
    y_pred[y_pred > upper] = upper
    print('%s R-squared for XGBoost is %f' % (split_name, r2_score(y_true_capped, y_pred)))
    # The value is sqrt(MSE), i.e. the RMSE (the label used to read "msre").
    print('%s rmse is: %s' % (split_name, sqrt(mean_squared_error(y_true_capped, y_pred))))


_report_clipped_scores('Train', train_y, pred_train_y)

# test_y is None when the test file has no labels; nothing to score then.
if test_y is not None:
    _report_clipped_scores('Test', test_y, pred_test_y)

# The capped label copies were local to the helper; release them now.
gc.collect();