In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold,StratifiedKFold,GroupKFold
from lightgbm import LGBMClassifier,LGBMRegressor
import lightgbm as lgb
import tensorflow as tf
import dask.dataframe as dd
import math
import random
import os
import gc
import sys
plt.style.use('fivethirtyeight')
from sklearn.preprocessing import LabelEncoder

# Let pandas render up to 100 columns / 100 rows before truncating the display
# (the frames loaded below have 66 columns).
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

In [2]:
from tqdm import tqdm
tqdm.pandas()

In [3]:
def seed_everything(seed=51):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's legacy global generator and
    TensorFlow's global generator with the same value, and exports the value
    as ``PYTHONHASHSEED``.

    Args:
        seed: integer seed shared by all generators (default 51).

    NOTE(review): ``PYTHONHASHSEED`` is normally read when the interpreter
    starts, so setting it here presumably only affects child processes --
    confirm if hash-order reproducibility matters.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    # Same order as before: stdlib random, then NumPy, then TensorFlow.
    for apply_seed in (random.seed, np.random.seed, tf.random.set_seed):
        apply_seed(seed)

In [4]:
seed_everything(seed=51)

In [5]:
sample_submission = pd.read_csv('../input/sample_submission.csv')

In [6]:
# Flag rows whose id ends in the "evaluation" token (1) versus everything else (0);
# the token is whatever follows the last underscore of the id.
sample_submission['is_evaluation'] = [
    int(row_id.split('_')[-1] == 'evaluation')
    for row_id in sample_submission['id']
]

In [7]:
# Split the submission template by the is_evaluation flag, each part re-indexed from 0.
evaluation_flag = sample_submission['is_evaluation']
validation = sample_submission.loc[evaluation_flag == 0].reset_index(drop=True)
evaluation = sample_submission.loc[evaluation_flag == 1].reset_index(drop=True)

In [8]:
## Memory Reducer
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to the narrowest dtype that holds their range.

    Each column whose dtype is one of int16/int32/int64/float16/float32/float64
    is re-assigned, in place, with the smallest integer (int8..int64) or float
    (float16..float64) type whose limits contain the column's min and max.
    Limits are inclusive, so a column spanning exactly -128..127 becomes int8.
    Columns of any other dtype are left untouched.

    Args:
        df: pandas DataFrame to shrink; it is modified in place.
        verbose: when True, print the final size in Mb and the percentage saved.

    Returns:
        The same ``df`` object, after downcasting.

    NOTE(review): float columns are narrowed by range only, so float16 may
    round values that merely fit its range -- confirm that precision loss is
    acceptable for the features fed through here.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Narrowest integer type first; inclusive bounds keep limit values
            # (e.g. 127) in the smaller type instead of bumping them up.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Range exceeds float32 (or min/max is NaN/inf): keep full precision.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the ratio so a zero-byte frame does not raise ZeroDivisionError.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [9]:
# Load the pre-built training and validation frames from pickle files in the working directory
# and report their shapes.
# NOTE(review): presumably written by an earlier feature-engineering step -- confirm provenance.
# pd.read_pickle should only be pointed at trusted files, since unpickling can run arbitrary code.
train = pd.read_pickle('train1_1.pkl')
val = pd.read_pickle('val1_1.pkl')
print(train.shape,val.shape)

(3842020, 66) (788701, 66)


In [10]:
# Remove the 'sold' column from both frames in place.
for frame in (train, val):
    del frame['sold']

In [11]:
# Downcast both frames (train first, then val) to cut their memory footprint.
train, val = (reduce_mem_usage(frame) for frame in (train, val))

Mem. usage decreased to 534.95 Mb (33.3% reduction)
Mem. usage decreased to 109.82 Mb (33.3% reduction)


In [12]:
train.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales,date,wm_yr_wk,snap_CA,snap_TX,snap_WI,sell_price,sales_lag_28,sales_lag_56,sales_lag_91,sales_lag_182,sales_lag_364,sales_lag_728,turnover,turnover_lag_28,turnover_lag_56,turnover_lag_91,turnover_lag_182,turnover_lag_364,turnover_lag_728,sales_lag_28_roll_mean_7,sales_lag_28_roll_std_7,sales_lag_28_roll_max_7,sales_lag_28_roll_mean_28,sales_lag_28_roll_max_28,sales_lag_28_roll_mean_91,sales_lag_28_roll_std_91,sales_lag_28_roll_max_91,sales_lag_28_roll_mean_182,sales_lag_28_roll_std_182,sales_lag_91_roll_mean_7,sales_lag_91_roll_std_7,sales_lag_91_roll_mean_28,sales_lag_91_roll_mean_91,sales_lag_91_roll_mean_182,sales_lag_182_roll_mean_182,sales_lag_182_roll_std_182,sales_lag_364_roll_mean_7,sales_lag_364_roll_std_7,sales_lag_364_roll_max_7,sales_lag_364_roll_mean_28,sales_lag_364_roll_std_28,sales_lag_364_roll_max_28,turnover_lag_28_roll_mean_7,turnover_lag_28_roll_mean_91,turnover_lag_91_roll_mean_7,turnover_lag_91_roll_std_7,turnover_lag_91_roll_max_7,turnover_lag_91_roll_mean_182,turnover_lag_182_roll_mean_182,turnover_lag_182_roll_std_182,turnover_lag_364_roll_std_91,day_number,is_other,is_holiday,dayofyear,year,month
32563320,HOBBIES_1_001_CA_1,1437,3,1,0,0,d_1069,1.0,2014-01-01,11349,1,1,0,8.257812,1.0,1.0,0.0,0.0,0.0,0.0,8.257812,8.257812,8.257812,0.0,0.0,0.0,0.0,1.0,0.577148,2.0,0.714355,3.0,0.549316,0.703613,3.0,0.368164,0.623535,0.571289,0.786621,0.464355,0.32959,0.164795,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.257812,4.539062,4.71875,6.496094,16.515625,1.368164,0.0,0.0,0.0,1069,1,1,1,2014,1
32563321,HOBBIES_1_002_CA_1,1438,3,1,0,0,d_1069,0.0,2014-01-01,11349,1,1,0,3.970703,0.0,0.0,0.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,7.941406,0.0,3.970703,0.0,0.0,0.0,0.071411,1.0,0.12085,0.360107,2.0,0.203247,0.442627,0.428467,0.786621,0.142822,0.230713,0.236206,0.252686,0.495117,0.142822,0.37793,1.0,0.964355,1.169922,3.0,0.0,0.47998,1.702148,3.125,7.941406,0.937988,1.003906,1.96582,3.419922,1069,1,1,1,2014,1
32563323,HOBBIES_1_004_CA_1,1440,3,1,0,0,d_1069,2.0,2014-01-01,11349,1,1,0,4.640625,1.0,2.0,1.0,0.0,0.0,0.0,9.28125,4.640625,9.28125,4.640625,0.0,0.0,0.0,3.142578,2.671875,8.0,2.107422,8.0,2.033203,1.753906,8.0,1.642578,1.658203,1.713867,1.704102,1.642578,1.450195,1.439453,1.330078,1.566406,0.0,0.0,0.0,0.214233,0.567871,2.0,14.585938,9.4375,7.957031,7.910156,18.5625,6.679688,6.113281,7.226562,5.367188,1069,1,1,1,2014,1
32563324,HOBBIES_1_005_CA_1,1441,3,1,0,0,d_1069,1.0,2014-01-01,11349,1,1,0,3.080078,1.0,0.0,3.0,1.0,1.0,1.0,3.080078,3.080078,0.0,9.242188,3.080078,3.080078,2.480469,1.0,1.414062,4.0,0.75,4.0,1.0,1.095703,4.0,1.054688,1.210938,1.286133,1.253906,1.392578,1.274414,1.043945,1.022461,1.216797,1.857422,1.675781,5.0,1.463867,1.426758,5.0,3.080078,3.080078,3.960938,3.861328,9.242188,3.214844,3.148438,3.748047,4.445312,1069,1,1,1,2014,1
32563325,HOBBIES_1_006_CA_1,1442,3,1,0,0,d_1069,0.0,2014-01-01,11349,1,1,0,1.0,1.0,0.0,1.0,1.0,2.0,0.0,0.0,1.0,0.0,1.0,1.0,2.0,0.0,0.714355,0.755859,2.0,1.036133,10.0,0.966797,1.760742,10.0,0.944824,1.607422,1.0,0.816406,0.821289,0.856934,1.005859,1.422852,1.972656,1.857422,2.544922,7.0,2.357422,2.857422,9.0,0.714355,0.966797,1.0,0.816406,2.0,1.005859,1.422852,1.972656,2.371094,1069,1,1,1,2014,1


In [13]:
# Fold the validation rows into the training frame (fresh 0..n-1 index),
# printing the shape before and after, then release the validation frame.
print(train.shape)
train = pd.concat([train, val], axis=0, ignore_index=True)
print(train.shape)
del val
gc.collect()

(3842020, 65)
(4630721, 65)


28

In [14]:
# Columns handed to LightGBM as categorical features.
cat_feats = [
    'item_id',
    'dept_id',
    'store_id',
    'cat_id',
    'state_id',
]

# Columns excluded from the model inputs (identifiers, the target, raw calendar fields).
useless_cols = [
    "id",
    "date",
    "sales",
    "d",
    "wm_yr_wk",
    "dayofyear",
    "year",
    "turnover",
    "day_number",
    "month",
]

In [15]:
train.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales,date,wm_yr_wk,snap_CA,snap_TX,snap_WI,sell_price,sales_lag_28,sales_lag_56,sales_lag_91,sales_lag_182,sales_lag_364,sales_lag_728,turnover,turnover_lag_28,turnover_lag_56,turnover_lag_91,turnover_lag_182,turnover_lag_364,turnover_lag_728,sales_lag_28_roll_mean_7,sales_lag_28_roll_std_7,sales_lag_28_roll_max_7,sales_lag_28_roll_mean_28,sales_lag_28_roll_max_28,sales_lag_28_roll_mean_91,sales_lag_28_roll_std_91,sales_lag_28_roll_max_91,sales_lag_28_roll_mean_182,sales_lag_28_roll_std_182,sales_lag_91_roll_mean_7,sales_lag_91_roll_std_7,sales_lag_91_roll_mean_28,sales_lag_91_roll_mean_91,sales_lag_91_roll_mean_182,sales_lag_182_roll_mean_182,sales_lag_182_roll_std_182,sales_lag_364_roll_mean_7,sales_lag_364_roll_std_7,sales_lag_364_roll_max_7,sales_lag_364_roll_mean_28,sales_lag_364_roll_std_28,sales_lag_364_roll_max_28,turnover_lag_28_roll_mean_7,turnover_lag_28_roll_mean_91,turnover_lag_91_roll_mean_7,turnover_lag_91_roll_std_7,turnover_lag_91_roll_max_7,turnover_lag_91_roll_mean_182,turnover_lag_182_roll_mean_182,turnover_lag_182_roll_std_182,turnover_lag_364_roll_std_91,day_number,is_other,is_holiday,dayofyear,year,month
0,HOBBIES_1_001_CA_1,1437,3,1,0,0,d_1069,1.0,2014-01-01,11349,1,1,0,8.257812,1.0,1.0,0.0,0.0,0.0,0.0,8.257812,8.257812,8.257812,0.0,0.0,0.0,0.0,1.0,0.577148,2.0,0.714355,3.0,0.549316,0.703613,3.0,0.368164,0.623535,0.571289,0.786621,0.464355,0.32959,0.164795,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.257812,4.539062,4.71875,6.496094,16.515625,1.368164,0.0,0.0,0.0,1069,1,1,1,2014,1
1,HOBBIES_1_002_CA_1,1438,3,1,0,0,d_1069,0.0,2014-01-01,11349,1,1,0,3.970703,0.0,0.0,0.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,7.941406,0.0,3.970703,0.0,0.0,0.0,0.071411,1.0,0.12085,0.360107,2.0,0.203247,0.442627,0.428467,0.786621,0.142822,0.230713,0.236206,0.252686,0.495117,0.142822,0.37793,1.0,0.964355,1.169922,3.0,0.0,0.47998,1.702148,3.125,7.941406,0.937988,1.003906,1.96582,3.419922,1069,1,1,1,2014,1
2,HOBBIES_1_004_CA_1,1440,3,1,0,0,d_1069,2.0,2014-01-01,11349,1,1,0,4.640625,1.0,2.0,1.0,0.0,0.0,0.0,9.28125,4.640625,9.28125,4.640625,0.0,0.0,0.0,3.142578,2.671875,8.0,2.107422,8.0,2.033203,1.753906,8.0,1.642578,1.658203,1.713867,1.704102,1.642578,1.450195,1.439453,1.330078,1.566406,0.0,0.0,0.0,0.214233,0.567871,2.0,14.585938,9.4375,7.957031,7.910156,18.5625,6.679688,6.113281,7.226562,5.367188,1069,1,1,1,2014,1
3,HOBBIES_1_005_CA_1,1441,3,1,0,0,d_1069,1.0,2014-01-01,11349,1,1,0,3.080078,1.0,0.0,3.0,1.0,1.0,1.0,3.080078,3.080078,0.0,9.242188,3.080078,3.080078,2.480469,1.0,1.414062,4.0,0.75,4.0,1.0,1.095703,4.0,1.054688,1.210938,1.286133,1.253906,1.392578,1.274414,1.043945,1.022461,1.216797,1.857422,1.675781,5.0,1.463867,1.426758,5.0,3.080078,3.080078,3.960938,3.861328,9.242188,3.214844,3.148438,3.748047,4.445312,1069,1,1,1,2014,1
4,HOBBIES_1_006_CA_1,1442,3,1,0,0,d_1069,0.0,2014-01-01,11349,1,1,0,1.0,1.0,0.0,1.0,1.0,2.0,0.0,0.0,1.0,0.0,1.0,1.0,2.0,0.0,0.714355,0.755859,2.0,1.036133,10.0,0.966797,1.760742,10.0,0.944824,1.607422,1.0,0.816406,0.821289,0.856934,1.005859,1.422852,1.972656,1.857422,2.544922,7.0,2.357422,2.857422,9.0,0.714355,0.966797,1.0,0.816406,2.0,1.005859,1.422852,1.972656,2.371094,1069,1,1,1,2014,1


In [16]:
#cat_feats = cat_feats + ['dept_id_store_id','dept_id_state_id','cat_id_store_id','cat_id_state_id']

In [17]:
# Keep every column of `train` that is not listed in useless_cols, in frame order.
excluded_cols = set(useless_cols)
keep_mask = np.array([col not in excluded_cols for col in train.columns], dtype=bool)
train_cols = train.columns[keep_mask]

In [17]:
#dropped_list = []

In [18]:
#dropped_list2 = []

In [19]:
#train_cols = [x for x in train_cols if x not in dropped_list]

In [20]:
#train_cols = [x for x in train_cols if x not in dropped_list2]

In [21]:
#cat_feats.remove('cat_id')
#cat_feats.remove('state_id')

In [18]:
print(list(train_cols))

['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'snap_CA', 'snap_TX', 'snap_WI', 'sell_price', 'sales_lag_28', 'sales_lag_56', 'sales_lag_91', 'sales_lag_182', 'sales_lag_364', 'sales_lag_728', 'turnover_lag_28', 'turnover_lag_56', 'turnover_lag_91', 'turnover_lag_182', 'turnover_lag_364', 'turnover_lag_728', 'sales_lag_28_roll_mean_7', 'sales_lag_28_roll_std_7', 'sales_lag_28_roll_max_7', 'sales_lag_28_roll_mean_28', 'sales_lag_28_roll_max_28', 'sales_lag_28_roll_mean_91', 'sales_lag_28_roll_std_91', 'sales_lag_28_roll_max_91', 'sales_lag_28_roll_mean_182', 'sales_lag_28_roll_std_182', 'sales_lag_91_roll_mean_7', 'sales_lag_91_roll_std_7', 'sales_lag_91_roll_mean_28', 'sales_lag_91_roll_mean_91', 'sales_lag_91_roll_mean_182', 'sales_lag_182_roll_mean_182', 'sales_lag_182_roll_std_182', 'sales_lag_364_roll_mean_7', 'sales_lag_364_roll_std_7', 'sales_lag_364_roll_max_7', 'sales_lag_364_roll_mean_28', 'sales_lag_364_roll_std_28', 'sales_lag_364_roll_max_28', 'turnover_lag_28_rol

In [19]:
print(len(train_cols))

55


In [20]:
# Model inputs (selected feature columns) and the target (daily sales).
X_train = train.loc[:, train_cols]
y_train = train.loc[:, "sales"]

In [21]:
train[train_cols].head()

Unnamed: 0,item_id,dept_id,cat_id,store_id,state_id,snap_CA,snap_TX,snap_WI,sell_price,sales_lag_28,sales_lag_56,sales_lag_91,sales_lag_182,sales_lag_364,sales_lag_728,turnover_lag_28,turnover_lag_56,turnover_lag_91,turnover_lag_182,turnover_lag_364,turnover_lag_728,sales_lag_28_roll_mean_7,sales_lag_28_roll_std_7,sales_lag_28_roll_max_7,sales_lag_28_roll_mean_28,sales_lag_28_roll_max_28,sales_lag_28_roll_mean_91,sales_lag_28_roll_std_91,sales_lag_28_roll_max_91,sales_lag_28_roll_mean_182,sales_lag_28_roll_std_182,sales_lag_91_roll_mean_7,sales_lag_91_roll_std_7,sales_lag_91_roll_mean_28,sales_lag_91_roll_mean_91,sales_lag_91_roll_mean_182,sales_lag_182_roll_mean_182,sales_lag_182_roll_std_182,sales_lag_364_roll_mean_7,sales_lag_364_roll_std_7,sales_lag_364_roll_max_7,sales_lag_364_roll_mean_28,sales_lag_364_roll_std_28,sales_lag_364_roll_max_28,turnover_lag_28_roll_mean_7,turnover_lag_28_roll_mean_91,turnover_lag_91_roll_mean_7,turnover_lag_91_roll_std_7,turnover_lag_91_roll_max_7,turnover_lag_91_roll_mean_182,turnover_lag_182_roll_mean_182,turnover_lag_182_roll_std_182,turnover_lag_364_roll_std_91,is_other,is_holiday
0,1437,3,1,0,0,1,1,0,8.257812,1.0,1.0,0.0,0.0,0.0,0.0,8.257812,8.257812,0.0,0.0,0.0,0.0,1.0,0.577148,2.0,0.714355,3.0,0.549316,0.703613,3.0,0.368164,0.623535,0.571289,0.786621,0.464355,0.32959,0.164795,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.257812,4.539062,4.71875,6.496094,16.515625,1.368164,0.0,0.0,0.0,1,1
1,1438,3,1,0,0,1,1,0,3.970703,0.0,0.0,0.0,2.0,0.0,1.0,0.0,0.0,0.0,7.941406,0.0,3.970703,0.0,0.0,0.0,0.071411,1.0,0.12085,0.360107,2.0,0.203247,0.442627,0.428467,0.786621,0.142822,0.230713,0.236206,0.252686,0.495117,0.142822,0.37793,1.0,0.964355,1.169922,3.0,0.0,0.47998,1.702148,3.125,7.941406,0.937988,1.003906,1.96582,3.419922,1,1
2,1440,3,1,0,0,1,1,0,4.640625,1.0,2.0,1.0,0.0,0.0,0.0,4.640625,9.28125,4.640625,0.0,0.0,0.0,3.142578,2.671875,8.0,2.107422,8.0,2.033203,1.753906,8.0,1.642578,1.658203,1.713867,1.704102,1.642578,1.450195,1.439453,1.330078,1.566406,0.0,0.0,0.0,0.214233,0.567871,2.0,14.585938,9.4375,7.957031,7.910156,18.5625,6.679688,6.113281,7.226562,5.367188,1,1
3,1441,3,1,0,0,1,1,0,3.080078,1.0,0.0,3.0,1.0,1.0,1.0,3.080078,0.0,9.242188,3.080078,3.080078,2.480469,1.0,1.414062,4.0,0.75,4.0,1.0,1.095703,4.0,1.054688,1.210938,1.286133,1.253906,1.392578,1.274414,1.043945,1.022461,1.216797,1.857422,1.675781,5.0,1.463867,1.426758,5.0,3.080078,3.080078,3.960938,3.861328,9.242188,3.214844,3.148438,3.748047,4.445312,1,1
4,1442,3,1,0,0,1,1,0,1.0,1.0,0.0,1.0,1.0,2.0,0.0,1.0,0.0,1.0,1.0,2.0,0.0,0.714355,0.755859,2.0,1.036133,10.0,0.966797,1.760742,10.0,0.944824,1.607422,1.0,0.816406,0.821289,0.856934,1.005859,1.422852,1.972656,1.857422,2.544922,7.0,2.357422,2.857422,9.0,0.714355,0.966797,1.0,0.816406,2.0,1.005859,1.422852,1.972656,2.371094,1,1


In [22]:
# Wrap the training matrix for LightGBM; the raw data is kept alive (free_raw_data=False).
train_data = lgb.Dataset(
    data=X_train,
    label=y_train,
    categorical_feature=cat_feats,
    free_raw_data=False,
)
#valid_data = lgb.Dataset(X_val, label = y_val, categorical_feature=cat_feats,free_raw_data=False)

In [23]:
#del X_train, y_train, X_val, y_val ; gc.collect()

In [24]:
# LightGBM training configuration (key order kept as originally written).
params = dict(
    objective="poisson",        # count-style target
    metric="rmse",
    force_row_wise=False,
    learning_rate=0.02,
    feature_fraction=1.0,       # use every feature for every tree
    sub_row=0.75,               # LightGBM alias of bagging_fraction
    bagging_freq=1,
    lambda_l2=0.1,
    verbosity=1,
    num_iterations=1274,        # same number as the best iteration in the early-stopping log kept further down
    num_leaves=64,
    min_data_in_leaf=100,
)

In [25]:
# Fit the booster on the full training set.
# NOTE(review): no valid_sets are passed, so verbose_eval=50 presumably has nothing to report -- confirm.
m_lgb = lgb.train(
    params=params,
    train_set=train_data,
    verbose_eval=50,
)



In [26]:
# Persist the trained booster next to the notebook.
m_lgb.save_model(filename="model_1.lgb")

<lightgbm.basic.Booster at 0x7fdc2c881438>

In [28]:
# Gain-based feature importance, least important first.
pd.Series(
    data=m_lgb.feature_importance(importance_type='gain'),
    index=m_lgb.feature_name(),
).sort_values()

cat_id                            2.142956e+04
state_id                          2.476772e+04
turnover_lag_91_roll_max_7        4.081120e+04
is_other                          4.434182e+04
turnover_lag_91                   4.848092e+04
sales_lag_364_roll_std_7          6.185842e+04
turnover_lag_182_roll_mean_182    6.551583e+04
sales_lag_91_roll_std_7           6.999583e+04
snap_CA                           7.088671e+04
snap_TX                           7.413106e+04
is_holiday                        8.038890e+04
turnover_lag_91_roll_mean_182     9.405521e+04
turnover_lag_182                  1.006518e+05
turnover_lag_56                   1.019685e+05
turnover_lag_91_roll_std_7        1.150534e+05
sales_lag_364_roll_max_28         1.288162e+05
sales_lag_364_roll_max_7          1.332935e+05
turnover_lag_364                  1.426397e+05
sales_lag_364_roll_std_28         1.442993e+05
sales_lag_364_roll_mean_7         1.456658e+05
sales_lag_28_roll_max_91          1.570303e+05
sales_lag_182

In [None]:
#1274

In [None]:
"""
Training until validation scores don't improve for 25 rounds.
[50]	valid_0's rmse: 3.05769
[100]	valid_0's rmse: 2.69825
[150]	valid_0's rmse: 2.54086
[200]	valid_0's rmse: 2.47269
[250]	valid_0's rmse: 2.44135
[300]	valid_0's rmse: 2.42463
[350]	valid_0's rmse: 2.41536
[400]	valid_0's rmse: 2.40923
[450]	valid_0's rmse: 2.40532
[500]	valid_0's rmse: 2.40226
[550]	valid_0's rmse: 2.39882
[600]	valid_0's rmse: 2.39598
[650]	valid_0's rmse: 2.39383
[700]	valid_0's rmse: 2.3911
[750]	valid_0's rmse: 2.38946
[800]	valid_0's rmse: 2.38828
[850]	valid_0's rmse: 2.38754
[900]	valid_0's rmse: 2.38633
[950]	valid_0's rmse: 2.38612
[1000]	valid_0's rmse: 2.38547
[1050]	valid_0's rmse: 2.38431
[1100]	valid_0's rmse: 2.38393
[1150]	valid_0's rmse: 2.38373
[1200]	valid_0's rmse: 2.3829
[1250]	valid_0's rmse: 2.38252
Early stopping, best iteration is:
[1274]	valid_0's rmse: 2.38225
"""

In [None]:
"""
Training until validation scores don't improve for 25 rounds.
[50]	valid_0's rmse: 3.07038
[100]	valid_0's rmse: 2.7132
[150]	valid_0's rmse: 2.55708
[200]	valid_0's rmse: 2.48791
[250]	valid_0's rmse: 2.45609
[300]	valid_0's rmse: 2.43962
[350]	valid_0's rmse: 2.42953
[400]	valid_0's rmse: 2.42298
[450]	valid_0's rmse: 2.41843
[500]	valid_0's rmse: 2.41465
[550]	valid_0's rmse: 2.41161
[600]	valid_0's rmse: 2.40842
[650]	valid_0's rmse: 2.40572
[700]	valid_0's rmse: 2.40291
[750]	valid_0's rmse: 2.40021
[800]	valid_0's rmse: 2.39833
[850]	valid_0's rmse: 2.39731
[900]	valid_0's rmse: 2.39626
[950]	valid_0's rmse: 2.39547
[1000]	valid_0's rmse: 2.39463
[1050]	valid_0's rmse: 2.39432
[1100]	valid_0's rmse: 2.39379
Early stopping, best iteration is:
[1106]	valid_0's rmse: 2.39362
"""

In [None]:
# Reference validation RMSE, hard-coded: it equals the best-iteration value ([1274]) of the first
# training log pasted above.
# NOTE(review): it is not recomputed from the m_lgb currently in memory -- confirm it still applies.
base_score = 2.38225

In [None]:
from sklearn.metrics import mean_squared_error
from math import sqrt

In [None]:
import eli5
from eli5.sklearn import PermutationImportance

In [None]:
# Permutation importance: shuffle one feature at a time on the hold-out frame and
# compare the resulting RMSE with the un-shuffled baseline of the same model.
#
# X_val / y_val are rebuilt here because no earlier live cell defines them (their
# definitions are commented out and `val` is deleted after being merged into `train`).
# NOTE(review): 'val1_1.pkl' is presumably the hold-out frame the commented-out
# valid_data was built from -- confirm. Also note that m_lgb above is fitted on
# train + val, so these scores are in-sample for that model.
val_holdout = reduce_mem_usage(pd.read_pickle('val1_1.pkl'), verbose=False)
X_val = val_holdout[train_cols]
y_val = val_holdout["sales"]

# Baseline from the model actually in memory, not a number copied from an older log.
baseline_rmse = sqrt(mean_squared_error(y_val, m_lgb.predict(X_val)))

used_list = []
dropped_list = []
X_val_shuffled = X_val.copy()  # single working copy; each column is restored after use
for feature in tqdm(train_cols):
    original_values = X_val_shuffled[feature].copy()
    X_val_shuffled[feature] = np.random.permutation(original_values.to_numpy())
    predicted = m_lgb.predict(X_val_shuffled)
    X_val_shuffled[feature] = original_values

    feature_score = sqrt(mean_squared_error(y_val, predicted))
    score_delta = baseline_rmse - feature_score
    print(feature)
    print(score_delta)
    print("*"*50)
    # Shuffling did not hurt (RMSE equal or lower) -> the feature is a drop candidate.
    if score_delta >= 0:
        dropped_list.append(feature)
    else:
        used_list.append(feature)

In [None]:
# Number of features kept, then number flagged for dropping (one per line).
print(len(used_list), len(dropped_list), sep='\n')

In [None]:
print(used_list)

In [None]:
print(dropped_list)

In [None]:
# Split-count feature importance, least important first, drawn as a horizontal bar chart.
# The index comes from the booster itself so names and values cannot get out of step.
split_importance = pd.Series(m_lgb.feature_importance(), index=m_lgb.feature_name()).sort_values()

plt.figure(figsize=(20,10))
ax = split_importance.plot(kind='barh')  # draws onto the figure created above
ax.set_title('LightGBM feature importance (split count)')
ax.set_xlabel('number of splits')

split_importance