In [1]:
import datetime
import gc
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import seaborn as sns
import time
import warnings

from contextlib import contextmanager
from pandas.core.common import SettingWithCopyWarning
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, StratifiedKFold

warnings.simplefilter(action='ignore', category=SettingWithCopyWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

In [2]:
SEED = 51

In [3]:
FEATS_EXCLUDED = ['first_active_month', 'target', 'card_id', 'outliers',
                  'hist_purchase_date_max', 'hist_purchase_date_min', 'hist_card_id_size',
                  'new_purchase_date_max', 'new_purchase_date_min', 'new_card_id_size',
                  'OOF_PRED', 'month_0']

In [4]:
# rmse
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

In [5]:
# One-hot encoding for categorical columns with get_dummies
def one_hot_encoder(df, nan_as_category = True):
    original_columns = list(df.columns)
    categorical_columns = [col for col in df.columns if df[col].dtype == 'object']
    df = pd.get_dummies(df, columns= categorical_columns, dummy_na= nan_as_category)
    new_columns = [c for c in df.columns if c not in original_columns]
    return df, new_columns

In [6]:
# Display/plot feature importance
def display_importances(feature_importance_df_):
    cols = feature_importance_df_[["feature", "importance"]].groupby("feature").mean().sort_values(by="importance", ascending=False)[:40].index
    best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)]

    plt.figure(figsize=(8, 10))
    sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False))
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    plt.savefig('lgbm_importances.png')

In [7]:
# reduce memory
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [8]:
# load csv
train_df = pd.read_csv('../input/train.csv', index_col=['card_id'])
test_df = pd.read_csv('../input/test.csv', index_col=['card_id'])

print("Train samples: {}, test samples: {}".format(len(train_df), len(test_df)))

# outlier
train_df['outliers'] = 0
train_df.loc[train_df['target'] < -30, 'outliers'] = 1

# set target as nan
test_df['target'] = np.nan

# merge
df = train_df.append(test_df)

del train_df, test_df
gc.collect()

# to datetime
df['first_active_month'] = pd.to_datetime(df['first_active_month'])

# datetime features
df['quarter'] = df['first_active_month'].dt.quarter
df['elapsed_time'] = (datetime.datetime.today() - df['first_active_month']).dt.days

df['days_feature1'] = df['elapsed_time'] * df['feature_1']
df['days_feature2'] = df['elapsed_time'] * df['feature_2']
df['days_feature3'] = df['elapsed_time'] * df['feature_3']

df['days_feature1_ratio'] = df['feature_1'] / df['elapsed_time']
df['days_feature2_ratio'] = df['feature_2'] / df['elapsed_time']
df['days_feature3_ratio'] = df['feature_3'] / df['elapsed_time']

# one hot encoding
df, cols = one_hot_encoder(df, nan_as_category=False)

for f in ['feature_1','feature_2','feature_3']:
    order_label = df.groupby([f])['outliers'].mean()
    df[f] = df[f].map(order_label)

df['feature_sum'] = df['feature_1'] + df['feature_2'] + df['feature_3']
df['feature_mean'] = df['feature_sum']/3
df['feature_max'] = df[['feature_1', 'feature_2', 'feature_3']].max(axis=1)
df['feature_min'] = df[['feature_1', 'feature_2', 'feature_3']].min(axis=1)
df['feature_var'] = df[['feature_1', 'feature_2', 'feature_3']].std(axis=1)

Train samples: 201917, test samples: 123623


In [9]:
# preprocessing historical transactions
# load csv
hist_df = pd.read_csv('../input/historical_transactions.csv')

# fillna
hist_df['category_2'].fillna(1.0,inplace=True)
hist_df['category_3'].fillna('A',inplace=True)
hist_df['merchant_id'].fillna('M_ID_00a6ca8a8a',inplace=True)
hist_df['installments'].replace(-1, np.nan,inplace=True)
hist_df['installments'].replace(999, np.nan,inplace=True)

# trim
hist_df['purchase_amount'] = hist_df['purchase_amount'].apply(lambda x: min(x, 0.8))

# Y/N to 1/0
hist_df['authorized_flag'] = hist_df['authorized_flag'].map({'Y': 1, 'N': 0}).astype(int)
hist_df['category_1'] = hist_df['category_1'].map({'Y': 1, 'N': 0}).astype(int)
hist_df['category_3'] = hist_df['category_3'].map({'A':0, 'B':1, 'C':2})

# datetime features
hist_df['purchase_date'] = pd.to_datetime(hist_df['purchase_date'])
hist_df['month'] = hist_df['purchase_date'].dt.month
hist_df['day'] = hist_df['purchase_date'].dt.day
hist_df['hour'] = hist_df['purchase_date'].dt.hour
hist_df['weekofyear'] = hist_df['purchase_date'].dt.weekofyear
hist_df['weekday'] = hist_df['purchase_date'].dt.weekday
hist_df['weekend'] = (hist_df['purchase_date'].dt.weekday >=5).astype(int)

# additional features
hist_df['price'] = hist_df['purchase_amount'] / hist_df['installments']

#Christmas : December 25 2017
hist_df['Christmas_Day_2017']=(pd.to_datetime('2017-12-25')-hist_df['purchase_date']).dt.days.apply(lambda x: x if x > 0 and x < 100 else 0)
#Mothers Day: May 14 2017
hist_df['Mothers_Day_2017']=(pd.to_datetime('2017-06-04')-hist_df['purchase_date']).dt.days.apply(lambda x: x if x > 0 and x < 100 else 0)
#fathers day: August 13 2017
hist_df['fathers_day_2017']=(pd.to_datetime('2017-08-13')-hist_df['purchase_date']).dt.days.apply(lambda x: x if x > 0 and x < 100 else 0)
#Childrens day: October 12 2017
hist_df['Children_day_2017']=(pd.to_datetime('2017-10-12')-hist_df['purchase_date']).dt.days.apply(lambda x: x if x > 0 and x < 100 else 0)
#Valentine's Day : 12th June, 2017
hist_df['Valentine_Day_2017']=(pd.to_datetime('2017-06-12')-hist_df['purchase_date']).dt.days.apply(lambda x: x if x > 0 and x < 100 else 0)
#Black Friday : 24th November 2017
hist_df['Black_Friday_2017']=(pd.to_datetime('2017-11-24') - hist_df['purchase_date']).dt.days.apply(lambda x: x if x > 0 and x < 100 else 0)

#2018
#Mothers Day: May 13 2018
hist_df['Mothers_Day_2018']=(pd.to_datetime('2018-05-13')-hist_df['purchase_date']).dt.days.apply(lambda x: x if x > 0 and x < 100 else 0)

hist_df['month_diff'] = ((datetime.datetime.today() - hist_df['purchase_date']).dt.days)//30
hist_df['month_diff'] += hist_df['month_lag']

# additional features
hist_df['duration'] = hist_df['purchase_amount']*hist_df['month_diff']
hist_df['amount_month_ratio'] = hist_df['purchase_amount']/hist_df['month_diff']

# reduce memory usage
hist_df = reduce_mem_usage(hist_df)

col_unique =['subsector_id', 'merchant_id', 'merchant_category_id']
col_seas = ['month', 'hour', 'weekofyear', 'weekday', 'day']

aggs = {}
for col in col_unique:
    aggs[col] = ['nunique']

for col in col_seas:
    aggs[col] = ['nunique', 'mean', 'min', 'max']

aggs['purchase_amount'] = ['sum','max','min','mean','var','skew']
aggs['installments'] = ['sum','max','mean','var','skew']
aggs['purchase_date'] = ['max','min']
aggs['month_lag'] = ['max','min','mean','var','skew']
aggs['month_diff'] = ['max','min','mean','var','skew']
aggs['authorized_flag'] = ['mean']
aggs['weekend'] = ['mean'] # overwrite
aggs['weekday'] = ['mean'] # overwrite
aggs['day'] = ['nunique', 'mean', 'min'] # overwrite
aggs['category_1'] = ['mean']
aggs['category_2'] = ['mean']
aggs['category_3'] = ['mean']
aggs['card_id'] = ['size','count']
aggs['price'] = ['sum','mean','max','min','var']
aggs['Christmas_Day_2017'] = ['mean']
aggs['Mothers_Day_2017'] = ['mean']
aggs['fathers_day_2017'] = ['mean']
aggs['Children_day_2017'] = ['mean']
aggs['Valentine_Day_2017'] = ['mean']
aggs['Black_Friday_2017'] = ['mean']
aggs['Mothers_Day_2018'] = ['mean']
aggs['duration']=['mean','min','max','var','skew']
aggs['amount_month_ratio']=['mean','min','max','var','skew']

for col in ['category_2','category_3']:
    hist_df[col+'_mean'] = hist_df.groupby([col])['purchase_amount'].transform('mean')
    hist_df[col+'_min'] = hist_df.groupby([col])['purchase_amount'].transform('min')
    hist_df[col+'_max'] = hist_df.groupby([col])['purchase_amount'].transform('max')
    hist_df[col+'_sum'] = hist_df.groupby([col])['purchase_amount'].transform('sum')
    aggs[col+'_mean'] = ['mean']

hist_df = hist_df.reset_index().groupby('card_id').agg(aggs)

# change column name
hist_df.columns = pd.Index([e[0] + "_" + e[1] for e in hist_df.columns.tolist()])
hist_df.columns = ['hist_'+ c for c in hist_df.columns]

hist_df['hist_purchase_date_diff'] = (hist_df['hist_purchase_date_max']-hist_df['hist_purchase_date_min']).dt.days
hist_df['hist_purchase_date_average'] = hist_df['hist_purchase_date_diff']/hist_df['hist_card_id_size']
hist_df['hist_purchase_date_uptonow'] = (datetime.datetime.today()-hist_df['hist_purchase_date_max']).dt.days
hist_df['hist_purchase_date_uptomin'] = (datetime.datetime.today()-hist_df['hist_purchase_date_min']).dt.days

# reduce memory usage
hist_df = reduce_mem_usage(hist_df)

Memory usage after optimization is: 1832.41 MB
Decreased by 73.4%
Memory usage after optimization is: 56.19 MB
Decreased by 57.6%


In [10]:
# preprocessing new_merchant_transactions
# load csv
new_merchant_df = pd.read_csv('../input/new_merchant_transactions.csv')

# fillna
new_merchant_df['category_2'].fillna(1.0,inplace=True)
new_merchant_df['category_3'].fillna('A',inplace=True)
new_merchant_df['merchant_id'].fillna('M_ID_00a6ca8a8a',inplace=True)
new_merchant_df['installments'].replace(-1, np.nan,inplace=True)
new_merchant_df['installments'].replace(999, np.nan,inplace=True)

# trim
new_merchant_df['purchase_amount'] = new_merchant_df['purchase_amount'].apply(lambda x: min(x, 0.8))

# Y/N to 1/0
new_merchant_df['authorized_flag'] = new_merchant_df['authorized_flag'].map({'Y': 1, 'N': 0}).astype(int)
new_merchant_df['category_1'] = new_merchant_df['category_1'].map({'Y': 1, 'N': 0}).astype(int)
new_merchant_df['category_3'] = new_merchant_df['category_3'].map({'A':0, 'B':1, 'C':2}).astype(int)

# datetime features
new_merchant_df['purchase_date'] = pd.to_datetime(new_merchant_df['purchase_date'])
new_merchant_df['month'] = new_merchant_df['purchase_date'].dt.month
new_merchant_df['day'] = new_merchant_df['purchase_date'].dt.day
new_merchant_df['hour'] = new_merchant_df['purchase_date'].dt.hour
new_merchant_df['weekofyear'] = new_merchant_df['purchase_date'].dt.weekofyear
new_merchant_df['weekday'] = new_merchant_df['purchase_date'].dt.weekday
new_merchant_df['weekend'] = (new_merchant_df['purchase_date'].dt.weekday >=5).astype(int)

# additional features
new_merchant_df['price'] = new_merchant_df['purchase_amount'] / new_merchant_df['installments']

#Christmas : December 25 2017
new_merchant_df['Christmas_Day_2017']=(pd.to_datetime('2017-12-25')-new_merchant_df['purchase_date']).dt.days.apply(lambda x: x if x > 0 and x < 100 else 0)
#Childrens day: October 12 2017
new_merchant_df['Children_day_2017']=(pd.to_datetime('2017-10-12')-new_merchant_df['purchase_date']).dt.days.apply(lambda x: x if x > 0 and x < 100 else 0)
#Black Friday : 24th November 2017
new_merchant_df['Black_Friday_2017']=(pd.to_datetime('2017-11-24') - new_merchant_df['purchase_date']).dt.days.apply(lambda x: x if x > 0 and x < 100 else 0)

#Mothers Day: May 13 2018
new_merchant_df['Mothers_Day_2018']=(pd.to_datetime('2018-05-13')-new_merchant_df['purchase_date']).dt.days.apply(lambda x: x if x > 0 and x < 100 else 0)

new_merchant_df['month_diff'] = ((datetime.datetime.today() - new_merchant_df['purchase_date']).dt.days)//30
new_merchant_df['month_diff'] += new_merchant_df['month_lag']

# additional features
new_merchant_df['duration'] = new_merchant_df['purchase_amount']*new_merchant_df['month_diff']
new_merchant_df['amount_month_ratio'] = new_merchant_df['purchase_amount']/new_merchant_df['month_diff']

# reduce memory usage
new_merchant_df = reduce_mem_usage(new_merchant_df)

col_unique =['subsector_id', 'merchant_id', 'merchant_category_id']
col_seas = ['month', 'hour', 'weekofyear', 'weekday', 'day']

aggs = {}
for col in col_unique:
    aggs[col] = ['nunique']

for col in col_seas:
    aggs[col] = ['nunique', 'mean', 'min', 'max']

aggs['purchase_amount'] = ['sum','max','min','mean','var','skew']
aggs['installments'] = ['sum','max','mean','var','skew']
aggs['purchase_date'] = ['max','min']
aggs['month_lag'] = ['max','min','mean','var','skew']
aggs['month_diff'] = ['mean','var','skew']
aggs['weekend'] = ['mean']
aggs['month'] = ['mean', 'min', 'max']
aggs['weekday'] = ['mean', 'min', 'max']
aggs['category_1'] = ['mean']
aggs['category_2'] = ['mean']
aggs['category_3'] = ['mean']
aggs['card_id'] = ['size','count']
aggs['price'] = ['mean','max','min','var']
aggs['Christmas_Day_2017'] = ['mean']
aggs['Children_day_2017'] = ['mean']
aggs['Black_Friday_2017'] = ['mean']
aggs['Mothers_Day_2018'] = ['mean']
aggs['duration']=['mean','min','max','var','skew']
aggs['amount_month_ratio']=['mean','min','max','var','skew']

for col in ['category_2','category_3']:
    new_merchant_df[col+'_mean'] = new_merchant_df.groupby([col])['purchase_amount'].transform('mean')
    new_merchant_df[col+'_min'] = new_merchant_df.groupby([col])['purchase_amount'].transform('min')
    new_merchant_df[col+'_max'] = new_merchant_df.groupby([col])['purchase_amount'].transform('max')
    new_merchant_df[col+'_sum'] = new_merchant_df.groupby([col])['purchase_amount'].transform('sum')
    aggs[col+'_mean'] = ['mean']

new_merchant_df = new_merchant_df.reset_index().groupby('card_id').agg(aggs)

# change column name
new_merchant_df.columns = pd.Index([e[0] + "_" + e[1] for e in new_merchant_df.columns.tolist()])
new_merchant_df.columns = ['new_'+ c for c in new_merchant_df.columns]

new_merchant_df['new_purchase_date_diff'] = (new_merchant_df['new_purchase_date_max']-new_merchant_df['new_purchase_date_min']).dt.days
new_merchant_df['new_purchase_date_average'] = new_merchant_df['new_purchase_date_diff']/new_merchant_df['new_card_id_size']
new_merchant_df['new_purchase_date_uptonow'] = (datetime.datetime.today()-new_merchant_df['new_purchase_date_max']).dt.days
new_merchant_df['new_purchase_date_uptomin'] = (datetime.datetime.today()-new_merchant_df['new_purchase_date_min']).dt.days

# reduce memory usage
new_merchant_df = reduce_mem_usage(new_merchant_df)

Memory usage after optimization is: 117.94 MB
Decreased by 71.9%
Memory usage after optimization is: 44.53 MB
Decreased by 57.6%


In [11]:
df.head()

Unnamed: 0_level_0,feature_1,feature_2,feature_3,first_active_month,outliers,target,quarter,elapsed_time,days_feature1,days_feature2,days_feature3,days_feature1_ratio,days_feature2_ratio,days_feature3_ratio,feature_sum,feature_mean,feature_max,feature_min,feature_var
card_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
C_ID_92a2005557,0.013145,0.008752,0.011428,2017-06-01,0.0,-0.820283,2.0,634.0,3170.0,1268.0,634.0,0.007886,0.003155,0.001577,0.033324,0.011108,0.013145,0.008752,0.002214
C_ID_3d0044924f,0.010712,0.011385,0.010283,2017-01-01,0.0,0.392913,1.0,785.0,3140.0,785.0,0.0,0.005096,0.001274,0.0,0.032379,0.010793,0.011385,0.010283,0.000555
C_ID_d639edf6cd,0.01061,0.008752,0.010283,2016-08-01,0.0,0.688056,3.0,938.0,1876.0,1876.0,0.0,0.002132,0.002132,0.0,0.029645,0.009882,0.01061,0.008752,0.000992
C_ID_186d6a6901,0.010712,0.014166,0.010283,2017-09-01,0.0,0.142495,3.0,542.0,2168.0,1626.0,0.0,0.00738,0.005535,0.0,0.035161,0.01172,0.014166,0.010283,0.002129
C_ID_cdbd2c0db2,0.008058,0.014166,0.010283,2017-11-01,0.0,-0.159749,4.0,481.0,481.0,1443.0,0.0,0.002079,0.006237,0.0,0.032508,0.010836,0.014166,0.008058,0.003091


In [12]:
df = pd.merge(df, hist_df, on='card_id', how='outer')
df = pd.merge(df, new_merchant_df, on='card_id', how='outer')

In [13]:
# additional features
df['hist_first_buy'] = (df['hist_purchase_date_min'] - df['first_active_month']).dt.days
df['hist_last_buy'] = (df['hist_purchase_date_max'] - df['first_active_month']).dt.days
df['new_first_buy'] = (df['new_purchase_date_min'] - df['first_active_month']).dt.days
df['new_last_buy'] = (df['new_purchase_date_max'] - df['first_active_month']).dt.days

date_features=['hist_purchase_date_max','hist_purchase_date_min',
               'new_purchase_date_max', 'new_purchase_date_min']

for f in date_features:
    df[f] = df[f].astype(np.int64) * 1e-9

df['card_id_total'] = df['new_card_id_size']+df['hist_card_id_size']
df['card_id_cnt_total'] = df['new_card_id_count']+df['hist_card_id_count']
df['card_id_cnt_ratio'] = df['new_card_id_count']/df['hist_card_id_count']
df['purchase_amount_total'] = df['new_purchase_amount_sum']+df['hist_purchase_amount_sum']
df['purchase_amount_mean'] = df['new_purchase_amount_mean']+df['hist_purchase_amount_mean']
df['purchase_amount_max'] = df['new_purchase_amount_max']+df['hist_purchase_amount_max']
df['purchase_amount_min'] = df['new_purchase_amount_min']+df['hist_purchase_amount_min']
df['purchase_amount_ratio'] = df['new_purchase_amount_sum']/df['hist_purchase_amount_sum']
df['month_diff_mean'] = df['new_month_diff_mean']+df['hist_month_diff_mean']
df['month_diff_ratio'] = df['new_month_diff_mean']/df['hist_month_diff_mean']
df['month_lag_mean'] = df['new_month_lag_mean']+df['hist_month_lag_mean']
df['month_lag_max'] = df['new_month_lag_max']+df['hist_month_lag_max']
df['month_lag_min'] = df['new_month_lag_min']+df['hist_month_lag_min']
df['category_1_mean'] = df['new_category_1_mean']+df['hist_category_1_mean']
df['installments_total'] = df['new_installments_sum']+df['hist_installments_sum']
df['installments_mean'] = df['new_installments_mean']+df['hist_installments_mean']
df['installments_max'] = df['new_installments_max']+df['hist_installments_max']
df['installments_ratio'] = df['new_installments_sum']/df['hist_installments_sum']
df['price_total'] = df['purchase_amount_total'] / df['installments_total']
df['price_mean'] = df['purchase_amount_mean'] / df['installments_mean']
df['price_max'] = df['purchase_amount_max'] / df['installments_max']
df['duration_mean'] = df['new_duration_mean']+df['hist_duration_mean']
df['duration_min'] = df['new_duration_min']+df['hist_duration_min']
df['duration_max'] = df['new_duration_max']+df['hist_duration_max']
df['amount_month_ratio_mean']=df['new_amount_month_ratio_mean']+df['hist_amount_month_ratio_mean']
df['amount_month_ratio_min']=df['new_amount_month_ratio_min']+df['hist_amount_month_ratio_min']
df['amount_month_ratio_max']=df['new_amount_month_ratio_max']+df['hist_amount_month_ratio_max']
df['new_CLV'] = df['new_card_id_count'] * df['new_purchase_amount_sum'] / df['new_month_diff_mean']
df['hist_CLV'] = df['hist_card_id_count'] * df['hist_purchase_amount_sum'] / df['hist_month_diff_mean']
df['CLV_ratio'] = df['new_CLV'] / df['hist_CLV']

In [14]:
train_df = df[df['target'].notnull()]
test_df = df[df['target'].isnull()]
del df
gc.collect()

99

In [15]:
stratified = False

if stratified:
    folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=SEED)
else:
    folds = KFold(n_splits= 5, shuffle=True, random_state=SEED)

# Create arrays and dataframes to store results
oof_preds = np.zeros(train_df.shape[0])
sub_preds = np.zeros(test_df.shape[0])
feature_importance_df = pd.DataFrame()
feats = [f for f in train_df.columns if f not in FEATS_EXCLUDED]

In [16]:
print(train_df.shape,test_df.shape)

(201917, 202) (123623, 202)


In [17]:
# k-fold
for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['outliers'])):
    train_x, train_y = train_df[feats].iloc[train_idx], train_df['target'].iloc[train_idx]
    valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['target'].iloc[valid_idx]

    # set data structure
    lgb_train = lgb.Dataset(train_x,
                            label=train_y,
                            free_raw_data=False)
    lgb_test = lgb.Dataset(valid_x,
                           label=valid_y,
                           free_raw_data=False)

    # params optimized by optuna
    params ={
            'task': 'train',
            'boosting': 'goss',
            'objective': 'regression',
            'metric': 'rmse',
            'learning_rate': 0.01,
            'subsample': 0.9855232997390695,
            'max_depth': 7,
            'top_rate': 0.9064148448434349,
            'num_leaves': 63,
            'min_child_weight': 41.9612869171337,
            'other_rate': 0.0721768246018207,
            'reg_alpha': 9.677537745007898,
            'colsample_bytree': 0.5665320670155495,
            'min_split_gain': 9.820197773625843,
            'reg_lambda': 8.2532317400459,
            'min_data_in_leaf': 21,
            'verbose': -1,
            'seed':SEED
            }

    reg = lgb.train(
                    params,
                    lgb_train,
                    valid_sets=[lgb_train, lgb_test],
                    valid_names=['train', 'test'],
                    num_boost_round=10000,
                    early_stopping_rounds= 200,
                    verbose_eval=200
                    )

    oof_preds[valid_idx] = reg.predict(valid_x, num_iteration=reg.best_iteration)
    sub_preds += reg.predict(test_df[feats], num_iteration=reg.best_iteration) / folds.n_splits

    fold_importance_df = pd.DataFrame()
    fold_importance_df["feature"] = feats
    fold_importance_df["importance"] = np.log1p(reg.feature_importance(importance_type='gain', iteration=reg.best_iteration))
    fold_importance_df["fold"] = n_fold + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    print('Fold %2d RMSE : %.6f' % (n_fold + 1, rmse(valid_y, oof_preds[valid_idx])))
    del reg, train_x, train_y, valid_x, valid_y
    gc.collect()
print("Average RMSE = {}".format(rmse(train_df['target'],oof_preds)))

Training until validation scores don't improve for 200 rounds.
[200]	train's rmse: 3.55636	test's rmse: 3.75528
[400]	train's rmse: 3.47241	test's rmse: 3.73443
[600]	train's rmse: 3.42095	test's rmse: 3.72929
[800]	train's rmse: 3.38156	test's rmse: 3.72799
[1000]	train's rmse: 3.34693	test's rmse: 3.72727
[1200]	train's rmse: 3.31327	test's rmse: 3.72742
Early stopping, best iteration is:
[1120]	train's rmse: 3.32648	test's rmse: 3.72707
Fold  1 RMSE : 3.727073
Training until validation scores don't improve for 200 rounds.
[200]	train's rmse: 3.5938	test's rmse: 3.60188
[400]	train's rmse: 3.5073	test's rmse: 3.58409
[600]	train's rmse: 3.45639	test's rmse: 3.57919
[800]	train's rmse: 3.41665	test's rmse: 3.57808
[1000]	train's rmse: 3.38292	test's rmse: 3.57773
Early stopping, best iteration is:
[876]	train's rmse: 3.40383	test's rmse: 3.57747
Fold  2 RMSE : 3.577471
Training until validation scores don't improve for 200 rounds.
[200]	train's rmse: 3.55495	test's rmse: 3.74746
[400]

In [18]:
train_df = train_df.reset_index(drop=False)
del test_df['target']

In [19]:
print(train_df.shape,test_df.shape)

(201917, 203) (123623, 201)


In [None]:
#train_df.to_csv('train_3_961.csv',index=False)
#test_df.to_csv('test_3_961.csv',index=False)

In [None]:
feature_importance_df.groupby(['feature'])['importance'].mean().sort_values(ascending=False).head()

In [None]:
# save submission file
submission_file_name = '3_691.csv'
test_df.loc[:,'target'] = sub_preds
test_df = test_df.reset_index()
test_df[['card_id', 'target']].to_csv(submission_file_name, index=False)