In [1]:
#Importing libraries
import pandas as pd
import numpy as np
import dask.dataframe as dd
import os
import sys
import missingno as msno
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm, tqdm_notebook
import time
from collections import Counter
# Hooks tqdm into pandas (presumably to enable the progress_* methods — confirm
# against the installed tqdm version).
# NOTE(review): the tqdm_notebook class is passed positionally here; verify that
# this yields notebook-style bars as intended, or whether `tqdm_notebook.pandas()`
# was meant.
tqdm.pandas(tqdm_notebook)
import lightgbm as lgb
import gc
import datetime
# Allow up to 500 rows and 500 columns when a frame is displayed.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
import matplotlib.gridspec as gridspec
# Seed NumPy's global random generator with a fixed value.
SEED = 51
np.random.seed(SEED)
# Show floats with five decimal places in displayed frames.
pd.options.display.float_format = '{:.5f}'.format
%matplotlib inline

In [2]:
# Column names gathered under the name FEATS_EXCLUDED.
# 'outliers' appeared twice in the original list; it is now listed once, which
# leaves membership checks unchanged.
FEATS_EXCLUDED = [
    'first_active_month', 'target', 'card_id', 'outliers',
    'hist_purchase_date_max', 'hist_purchase_date_min', 'hist_card_id_size',
    'new_purchase_date_max', 'new_purchase_date_min', 'new_card_id_size',
    'OOF_PRED', 'month_0',
]

In [3]:
%%time
# Read each CSV through dask, materialise it with compute(), and replace the
# resulting index with a fresh 0..n-1 range.
train_df = dd.read_csv('../input/train_3_961.csv').compute().reset_index(drop=True)
test_df = dd.read_csv('../input/test_3_961.csv').compute().reset_index(drop=True)

CPU times: user 12.9 s, sys: 1.72 s, total: 14.6 s
Wall time: 5.7 s


In [4]:
# Snapshot of every column name currently in train_df, as a plain list.
used_features = train_df.columns.tolist()

In [5]:
# Show (rows, columns) of the train and test frames.
print(f"{train_df.shape} {test_df.shape}")

(201917, 203) (123623, 202)


In [6]:
%%time
# Read each CSV through dask, materialise it with compute(), and replace the
# resulting index with a fresh 0..n-1 range.
train_last = dd.read_csv('../input/train_last.csv').compute().reset_index(drop=True)
test_last = dd.read_csv('../input/test_last.csv').compute().reset_index(drop=True)

CPU times: user 18.1 s, sys: 2.3 s, total: 20.4 s
Wall time: 7.33 s


In [7]:
# Show (rows, columns) of the train_last and test_last frames.
print(f"{train_last.shape} {test_last.shape}")

(201917, 287) (123623, 286)


In [8]:
# Preview the first five rows of train_df.
train_df.head(n=5)

Unnamed: 0,card_id,feature_1,feature_2,feature_3,first_active_month,outliers,target,quarter,elapsed_time,days_feature1,days_feature2,days_feature3,days_feature1_ratio,days_feature2_ratio,days_feature3_ratio,feature_sum,feature_mean,feature_max,feature_min,feature_var,hist_subsector_id_nunique,hist_merchant_id_nunique,hist_merchant_category_id_nunique,hist_month_nunique,hist_month_mean,hist_month_min,hist_month_max,hist_hour_nunique,hist_hour_mean,hist_hour_min,hist_hour_max,hist_weekofyear_nunique,hist_weekofyear_mean,hist_weekofyear_min,hist_weekofyear_max,hist_weekday_mean,hist_day_nunique,hist_day_mean,hist_day_min,hist_purchase_amount_sum,hist_purchase_amount_max,hist_purchase_amount_min,hist_purchase_amount_mean,hist_purchase_amount_var,hist_purchase_amount_skew,hist_installments_sum,hist_installments_max,hist_installments_mean,hist_installments_var,hist_installments_skew,hist_purchase_date_max,hist_purchase_date_min,hist_month_lag_max,hist_month_lag_min,hist_month_lag_mean,hist_month_lag_var,hist_month_lag_skew,hist_month_diff_max,hist_month_diff_min,hist_month_diff_mean,hist_month_diff_var,hist_month_diff_skew,hist_authorized_flag_mean,hist_weekend_mean,hist_category_1_mean,hist_category_2_mean,hist_category_3_mean,hist_card_id_size,hist_card_id_count,hist_price_sum,hist_price_mean,hist_price_max,hist_price_min,hist_price_var,hist_Christmas_Day_2017_mean,hist_Mothers_Day_2017_mean,hist_fathers_day_2017_mean,hist_Children_day_2017_mean,hist_Valentine_Day_2017_mean,hist_Black_Friday_2017_mean,hist_Mothers_Day_2018_mean,hist_duration_mean,hist_duration_min,hist_duration_max,hist_duration_var,hist_duration_skew,hist_amount_month_ratio_mean,hist_amount_month_ratio_min,hist_amount_month_ratio_max,hist_amount_month_ratio_var,hist_amount_month_ratio_skew,hist_category_2_mean_mean,hist_category_3_mean_mean,hist_purchase_date_diff,hist_purchase_date_average,hist_purchase_date_uptonow,hist_purchase_date_uptomin,new_subsector_id_nunique,new_merchant_id_nunique,new_mercha
nt_category_id_nunique,new_month_mean,new_month_min,new_month_max,new_hour_nunique,new_hour_mean,new_hour_min,new_hour_max,new_weekofyear_nunique,new_weekofyear_mean,new_weekofyear_min,new_weekofyear_max,new_weekday_mean,new_weekday_min,new_weekday_max,new_day_nunique,new_day_mean,new_day_min,new_day_max,new_purchase_amount_sum,new_purchase_amount_max,new_purchase_amount_min,new_purchase_amount_mean,new_purchase_amount_var,new_purchase_amount_skew,new_installments_sum,new_installments_max,new_installments_mean,new_installments_var,new_installments_skew,new_purchase_date_max,new_purchase_date_min,new_month_lag_max,new_month_lag_min,new_month_lag_mean,new_month_lag_var,new_month_lag_skew,new_month_diff_mean,new_month_diff_var,new_month_diff_skew,new_weekend_mean,new_category_1_mean,new_category_2_mean,new_category_3_mean,new_card_id_size,new_card_id_count,new_price_mean,new_price_max,new_price_min,new_price_var,new_Christmas_Day_2017_mean,new_Children_day_2017_mean,new_Black_Friday_2017_mean,new_Mothers_Day_2018_mean,new_duration_mean,new_duration_min,new_duration_max,new_duration_var,new_duration_skew,new_amount_month_ratio_mean,new_amount_month_ratio_min,new_amount_month_ratio_max,new_amount_month_ratio_var,new_amount_month_ratio_skew,new_category_2_mean_mean,new_category_3_mean_mean,new_purchase_date_diff,new_purchase_date_average,new_purchase_date_uptonow,new_purchase_date_uptomin,hist_first_buy,hist_last_buy,new_first_buy,new_last_buy,card_id_total,card_id_cnt_total,card_id_cnt_ratio,purchase_amount_total,purchase_amount_mean,purchase_amount_max,purchase_amount_min,purchase_amount_ratio,month_diff_mean,month_diff_ratio,month_lag_mean,month_lag_max,month_lag_min,category_1_mean,installments_total,installments_mean,installments_max,installments_ratio,price_total,price_mean,price_max,duration_mean,duration_min,duration_max,amount_month_ratio_mean,amount_month_ratio_min,amount_month_ratio_max,new_CLV,hist_CLV,CLV_ratio
0,C_ID_92a2005557,0.01314,0.00875,0.01143,2017-06-01,0.0,-0.82028,2.0,633.0,3165.0,1266.0,633.0,0.0079,0.00316,0.00158,0.03332,0.01111,0.01314,0.00875,0.00221,21,94,41,9,8.055,1,12,23,13.31,0,23,35,33.06,1,52,3.21,31,15.51,1,-167.4,0.8,-0.7393,-0.644,0.02057,5.133,4.0,1.0,0.01538,0.01521,7.92,1519551075.0,1498573088.0,0,-8,-3.912,5.75,0.066,13,11,12.03,0.03766,3.775,0.95,0.3462,0.0,1.046,0.01538,260,260,,,inf,-inf,,13.125,0.0,6.266,26.77,0.0,16.47,7.754,-7.746,-9.46,9.6,2.994,5.062,-0.05353,-0.0616,0.06665,0.00014,5.105,-0.6064,-0.639,242,0.9307,364,606,10.0,23.0,14.0,3.479,3.0,4.0,8.0,12.87,8.0,16.0,7.0,13.305,10.0,17.0,3.13,0.0,6.0,17.0,16.44,5.0,31.0,-13.24,-0.2961,-0.7246,-0.5757,0.01843,0.896,0.0,0.0,0.0,0.0,0.0,1525000985.0,1520258676.0,2.0,1.0,1.479,0.261,0.09326,11.95,0.0435,-4.797,0.261,0.0,1.0,0.0,23.0,23.0,-inf,-inf,-inf,,0.0,0.0,0.0,41.75,-6.9,-8.695,-3.258,2.752,0.949,-0.0481,-0.06036,-0.02467,0.00012,0.8687,-0.585,-0.6123,54.0,2.348,300.0,355.0,26.0,269.0,277.0,332.0,283.0,283.0,0.08846,-180.6,-1.22,0.504,-1.464,0.0791,23.98,0.9937,-2.434,2.0,-7.0,0.0,4.0,0.01538,1.0,0.0,-45.16,-79.3,0.504,-14.64,-18.16,6.344,-0.1016,-0.12195,0.042,-25.48039,-3617.039,0.00704
1,C_ID_3d0044924f,0.01071,0.01138,0.01028,2017-01-01,0.0,0.39291,1.0,784.0,3136.0,784.0,0.0,0.0051,0.00128,0.0,0.03238,0.01079,0.01138,0.01028,0.00056,24,142,57,12,6.22,1,12,24,14.72,0,23,50,25.22,1,52,3.363,31,16.67,1,-215.4,0.8,-0.742,-0.615,0.0586,3.744,545.0,10.0,1.566,2.258,3.8,1517437869.0,1483720182.0,0,-12,-5.03,14.48,-0.258,14,12,13.07,0.09845,1.569,0.9688,0.3772,0.08856,1.0,1.2,350,350,-202.21849,-0.58109,0.4,-0.7424,0.06027,10.65,8.734,12.555,13.79,6.57,9.836,0.0,-8.04,-10.33,10.4,10.01,3.73,-0.04715,-0.06137,0.06152,0.00035,3.713,-0.6064,-0.533,390,1.114,388,778,4.0,6.0,5.0,2.5,2.0,3.0,5.0,11.164,6.0,17.0,4.0,9.0,5.0,13.0,1.5,0.0,4.0,4.0,13.5,1.0,30.0,-4.355,-0.7017,-0.7393,-0.726,0.00021,1.256,6.0,1.0,1.0,0.0,0.0,1522392506.0,1517504874.0,2.0,1.0,1.5,0.3,0.0,13.0,0.0,0.0,0.0,0.0,1.0,1.0,6.0,6.0,-0.72596,-0.70186,-0.73941,0.00021,0.0,0.0,0.0,56.84,-9.44,-9.61,-9.125,0.03442,1.215,-0.05585,-0.0569,-0.054,0.0,0.0,-0.585,-0.6187,56.0,9.336,331.0,387.0,5.0,395.0,396.0,453.0,356.0,356.0,0.01714,-219.8,-1.342,0.09814,-1.481,0.02022,26.06,0.9946,-3.531,2.0,-11.0,0.08856,551.0,2.566,11.0,0.01101,-0.399,-0.523,0.00892,-17.47,-19.94,1.273,-0.103,-0.1183,0.00754,-2.01022,-5767.3643,0.00035
2,C_ID_d639edf6cd,0.01061,0.00875,0.01028,2016-08-01,0.0,0.68806,3.0,937.0,1874.0,1874.0,0.0,0.00213,0.00213,0.0,0.02964,0.00988,0.01061,0.00875,0.00099,7,13,8,10,4.56,1,12,14,17.9,8,23,22,18.38,2,49,3.303,19,19.33,2,-29.17,-0.1459,-0.73,-0.678,0.00764,5.62,0.0,0.0,0.0,0.0,0.0,1519758505.0,1484122882.0,0,-13,-8.6,14.766,0.7256,13,12,12.02,0.02325,6.56,0.9536,0.2559,0.0,4.63,0.0,43,43,-inf,-inf,-inf,-inf,,9.67,17.0,8.91,7.51,17.66,8.91,1.721,-8.16,-9.46,-1.75,1.133,5.387,-0.05643,-0.06085,-0.01215,5e-05,5.668,-0.604,-0.6396,412,9.58,361,774,1.0,1.0,1.0,4.0,4.0,4.0,1.0,17.0,17.0,17.0,1.0,17.0,17.0,17.0,5.0,5.0,5.0,1.0,28.0,28.0,28.0,-0.7,-0.7,-0.7,-0.7,,,0.0,0.0,0.0,,,1524937391.0,1524937391.0,2.0,2.0,2.0,,,12.0,,,1.0,0.0,5.0,0.0,1.0,1.0,-inf,-inf,-inf,,0.0,0.0,0.0,14.0,-8.41,-8.41,-8.41,,,-0.05835,-0.05835,-0.05835,,,-0.5806,-0.6123,0.0,0.0,301.0,301.0,163.0,575.0,635.0,635.0,44.0,44.0,0.02326,-29.88,-1.379,-0.846,-1.43,0.024,24.03,0.998,-6.6,2.0,-11.0,0.0,0.0,0.0,0.0,,-inf,-inf,-inf,-16.56,-17.88,-10.16,-0.11475,-0.1192,-0.0705,-0.05835,-104.32878,0.00056
3,C_ID_186d6a6901,0.01071,0.01417,0.01028,2017-09-01,0.0,0.1425,3.0,541.0,2164.0,1623.0,0.0,0.00739,0.00555,0.0,0.03516,0.01172,0.01417,0.01028,0.00213,13,50,25,6,7.742,1,12,16,14.445,0,23,20,32.0,1,52,2.793,25,16.88,2,-50.12,0.8,-0.7407,-0.6514,0.03836,5.758,86.0,3.0,1.146,0.235,3.307,1519818280.0,1506442941.0,0,-5,-2.832,3.248,0.5576,12,11,11.984,0.01299,-8.77,1.0,0.1428,0.1559,3.064,1.064,77,77,-47.47444,-0.63299,0.8,-0.7409,0.05334,44.66,0.0,0.0,2.623,0.0,24.19,17.9,-7.805,-8.89,9.6,5.52,5.76,-0.05432,-0.0652,0.06665,0.00027,5.727,-0.608,-0.583,154,2.0,360,515,5.0,7.0,6.0,3.715,3.0,4.0,5.0,13.0,7.0,21.0,5.0,13.86,10.0,16.0,3.285,1.0,6.0,7.0,13.14,4.0,24.0,-4.656,-0.567,-0.7344,-0.665,0.00434,0.8086,6.0,1.0,1.0,0.0,0.0,1524049211.0,1520423706.0,2.0,1.0,1.714,0.238,-1.2295,12.0,0.0,0.0,0.4285,0.1428,3.572,0.857,7.0,7.0,-0.68127,-0.58165,-0.73414,0.00296,0.0,0.0,0.0,37.72,-7.98,-8.81,-6.8,0.6255,0.81,-0.05542,-0.0612,-0.04724,3e-05,0.9243,-0.5854,-0.6177,41.0,5.855,311.0,353.0,25.0,180.0,187.0,229.0,84.0,84.0,0.09091,-54.78,-1.316,0.2329,-1.475,0.0929,23.98,1.001,-1.118,2.0,-4.0,0.2988,92.0,2.146,4.0,0.06976,-0.595,-0.6133,0.05823,-15.78,-17.7,2.8,-0.10974,-0.1263,0.01941,-2.71615,-322.05475,0.00843
4,C_ID_cdbd2c0db2,0.00806,0.01417,0.01028,2017-11-01,0.0,-0.15975,4.0,480.0,480.0,1440.0,0.0,0.00208,0.00625,0.0,0.03251,0.01084,0.01417,0.00806,0.00309,17,66,26,4,5.406,1,12,22,13.05,0,23,17,21.78,1,52,3.24,30,16.1,1,-75.25,0.8,-0.746,-0.566,0.1039,3.268,182.0,12.0,1.368,3.598,5.418,1519850441.0,1510444800.0,0,-3,-1.286,1.054,-0.296,12,11,11.98,0.02222,-6.504,0.9624,0.3157,0.1128,3.443,1.053,133,133,-78.53185,-0.59047,0.3042,-0.74616,0.04965,7.79,0.0,0.0,0.0,0.0,0.6616,20.66,-6.773,-8.95,9.6,14.93,3.268,-0.04727,-0.06464,0.06665,0.00072,3.258,-0.6084,-0.599,108,0.812,360,469,10.0,36.0,17.0,3.555,3.0,4.0,14.0,14.72,5.0,23.0,8.0,13.36,9.0,17.0,3.277,0.0,6.0,22.0,14.586,1.0,31.0,-19.92,0.451,-0.7393,-0.5537,0.0501,2.893,36.0,2.0,1.028,0.02856,5.918,1524941425.0,1519991743.0,2.0,1.0,1.556,0.254,-0.2334,11.945,0.05396,-4.05,0.3333,0.05554,3.473,1.0,36.0,36.0,-0.55959,0.45089,-0.73939,0.0503,0.0,0.0,0.0,41.2,-6.605,-8.875,5.41,7.125,2.908,-0.04642,-0.0654,0.03757,0.00035,2.824,-0.5845,-0.607,57.0,1.583,301.0,358.0,11.0,119.0,121.0,178.0,169.0,169.0,0.27068,-95.2,-1.119,1.251,-1.485,0.2646,23.92,0.9976,0.2695,2.0,-2.0,0.1683,218.0,2.396,14.0,0.1978,-0.4365,-0.467,0.08936,-13.375,-17.83,15.016,-0.0937,-0.1301,0.10425,-60.03924,-835.65295,0.07185


In [9]:
# Join key first, then every train_last column that train_df does not already
# have, kept in train_last's column order.
used_columns = [
    'card_id',
    *(name for name in train_last.columns if name not in train_df.columns),
]

In [10]:
# Left-join the selected train_last columns onto train_df by card_id.
# validate='many_to_one' makes pandas raise MergeError if card_id is duplicated
# in the right-hand frame, instead of silently multiplying train rows.
train = train_df.merge(
    train_last[used_columns],
    how='left',
    on='card_id',
    validate='many_to_one',
)

In [11]:
# Left-join the selected test_last columns onto test_df by card_id.
# validate='many_to_one' makes pandas raise MergeError if card_id is duplicated
# in the right-hand frame, instead of silently multiplying test rows.
test = test_df.merge(
    test_last[used_columns],
    how='left',
    on='card_id',
    validate='many_to_one',
)

In [12]:
# Remove the 'outliers' column from both merged frames in place.
# The membership guards make the cell safe to re-run: a bare `del` raises
# KeyError once the column has already been removed.
if 'outliers' in train.columns:
    del train['outliers']
if 'outliers' in test.columns:
    del test['outliers']
gc.collect()

116

In [13]:
print(train.shape,test.shape)

(201917, 479) (123623, 478)


In [14]:
#del train_df,test_df,train_last,test_last
#gc.collect()

In [15]:
# Disabled helper kept as a string literal: this cell neither defines nor runs
# getDuplicateColumns, it only evaluates (and displays) the text below.
# The text describes a pairwise scan that compares every column with each later
# column via Series.equals and collects the names of the later duplicates.
# NOTE(review): the following cell hard-codes duplicateColumnNames, presumably
# the saved result of an earlier run of this helper — confirm before relying on it.
"""def getDuplicateColumns(df):
    '''
    Get a list of duplicate columns.
    It will iterate over all the columns in dataframe and find the columns whose contents are duplicate.
    :param df: Dataframe object
    :return: List of columns whose contents are duplicates.
    '''
    duplicateColumnNames = set()
    # Iterate over all the columns in dataframe
    for x in tqdm(list(range(df.shape[1]))):
        # Select column at xth index.
        col = df.iloc[:, x]
        # Iterate over all the columns in DataFrame from (x+1)th index till end
        for y in range(x + 1, df.shape[1]):
            # Select column at yth index.
            otherCol = df.iloc[:, y]
            # Check if two columns at x 7 y index are equal
            if col.equals(otherCol):
                duplicateColumnNames.add(df.columns.values[y])
 
    return list(duplicateColumnNames)
duplicateColumnNames = getDuplicateColumns(train)"""

"def getDuplicateColumns(df):\n    '''\n    Get a list of duplicate columns.\n    It will iterate over all the columns in dataframe and find the columns whose contents are duplicate.\n    :param df: Dataframe object\n    :return: List of columns whose contents are duplicates.\n    '''\n    duplicateColumnNames = set()\n    # Iterate over all the columns in dataframe\n    for x in tqdm(list(range(df.shape[1]))):\n        # Select column at xth index.\n        col = df.iloc[:, x]\n        # Iterate over all the columns in DataFrame from (x+1)th index till end\n        for y in range(x + 1, df.shape[1]):\n            # Select column at yth index.\n            otherCol = df.iloc[:, y]\n            # Check if two columns at x 7 y index are equal\n            if col.equals(otherCol):\n                duplicateColumnNames.add(df.columns.values[y])\n \n    return list(duplicateColumnNames)\nduplicateColumnNames = getDuplicateColumns(train)"

In [16]:
# Hard-coded list of column names that a later cell drops from train and test.
duplicateColumnNames = [
    'h_month_lag_max',
    'n_month_lag_min',
    'new_card_id_count',
    'new_transactions_count',
    'n_af_sum',
    'new_purchase_month_min',
    'n_af_mean',
    'h_max_min_diff',
    'n_af_count',
    'hist_card_id_count',
    'n_how_many_unique_subsector_id',
    'card_id_cnt_total',
    'n_max_min_diff',
    'installments_count_std',
    'new_purchase_month_max',
    'installments_count_mean',
    'n_ins_count',
    'n_af_max',
    'a_transactions_count',
    'n_how_many_merchant_category_id_visited',
    'h_how_many_merchant_category_id_visited',
    'n_af_median',
    'h_ins_count',
    'h_how_many_unique_subsector_id',
    'h_how_many_merchant_id_visited',
    'n_how_many_merchant_id_visited',
    'h_month_lag_min',
    'h_af_count',
    'n_month_lag_max',
]

In [17]:
# Echo the list of column names that will be dropped.
print(f"{duplicateColumnNames}")

['h_month_lag_max', 'n_month_lag_min', 'new_card_id_count', 'new_transactions_count', 'n_af_sum', 'new_purchase_month_min', 'n_af_mean', 'h_max_min_diff', 'n_af_count', 'hist_card_id_count', 'n_how_many_unique_subsector_id', 'card_id_cnt_total', 'n_max_min_diff', 'installments_count_std', 'new_purchase_month_max', 'installments_count_mean', 'n_ins_count', 'n_af_max', 'a_transactions_count', 'n_how_many_merchant_category_id_visited', 'h_how_many_merchant_category_id_visited', 'n_af_median', 'h_ins_count', 'h_how_many_unique_subsector_id', 'h_how_many_merchant_id_visited', 'n_how_many_merchant_id_visited', 'h_month_lag_min', 'h_af_count', 'n_month_lag_max']


In [18]:
# Drop the listed columns from both frames in place. An unknown column name
# still raises KeyError, as before.
train.drop(columns=duplicateColumnNames, inplace=True)
test.drop(columns=duplicateColumnNames, inplace=True)

In [19]:
# Show (rows, columns) of the merged train and test frames after the drop.
print(f"{train.shape} {test.shape}")

(201917, 450) (123623, 449)


In [20]:
# Preview the first five rows of the merged train frame.
train.head(n=5)

Unnamed: 0,card_id,feature_1,feature_2,feature_3,first_active_month,target,quarter,elapsed_time,days_feature1,days_feature2,days_feature3,days_feature1_ratio,days_feature2_ratio,days_feature3_ratio,feature_sum,feature_mean,feature_max,feature_min,feature_var,hist_subsector_id_nunique,hist_merchant_id_nunique,hist_merchant_category_id_nunique,hist_month_nunique,hist_month_mean,hist_month_min,hist_month_max,hist_hour_nunique,hist_hour_mean,hist_hour_min,hist_hour_max,hist_weekofyear_nunique,hist_weekofyear_mean,hist_weekofyear_min,hist_weekofyear_max,hist_weekday_mean,hist_day_nunique,hist_day_mean,hist_day_min,hist_purchase_amount_sum,hist_purchase_amount_max,hist_purchase_amount_min,hist_purchase_amount_mean,hist_purchase_amount_var,hist_purchase_amount_skew,hist_installments_sum,hist_installments_max,hist_installments_mean,hist_installments_var,hist_installments_skew,hist_purchase_date_max,hist_purchase_date_min,hist_month_lag_max,hist_month_lag_min,hist_month_lag_mean,hist_month_lag_var,hist_month_lag_skew,hist_month_diff_max,hist_month_diff_min,hist_month_diff_mean,hist_month_diff_var,hist_month_diff_skew,hist_authorized_flag_mean,hist_weekend_mean,hist_category_1_mean,hist_category_2_mean,hist_category_3_mean,hist_card_id_size,hist_price_sum,hist_price_mean,hist_price_max,hist_price_min,hist_price_var,hist_Christmas_Day_2017_mean,hist_Mothers_Day_2017_mean,hist_fathers_day_2017_mean,hist_Children_day_2017_mean,hist_Valentine_Day_2017_mean,hist_Black_Friday_2017_mean,hist_Mothers_Day_2018_mean,hist_duration_mean,hist_duration_min,hist_duration_max,hist_duration_var,hist_duration_skew,hist_amount_month_ratio_mean,hist_amount_month_ratio_min,hist_amount_month_ratio_max,hist_amount_month_ratio_var,hist_amount_month_ratio_skew,hist_category_2_mean_mean,hist_category_3_mean_mean,hist_purchase_date_diff,hist_purchase_date_average,hist_purchase_date_uptonow,hist_purchase_date_uptomin,new_subsector_id_nunique,new_merchant_id_nunique,new_merchant_category_id_nunique,new_m
onth_mean,new_month_min,new_month_max,new_hour_nunique,new_hour_mean,new_hour_min,new_hour_max,new_weekofyear_nunique,new_weekofyear_mean,new_weekofyear_min,new_weekofyear_max,new_weekday_mean,new_weekday_min,new_weekday_max,new_day_nunique,new_day_mean,new_day_min,new_day_max,new_purchase_amount_sum,new_purchase_amount_max,new_purchase_amount_min,new_purchase_amount_mean,new_purchase_amount_var,new_purchase_amount_skew,new_installments_sum,new_installments_max,new_installments_mean,new_installments_var,new_installments_skew,new_purchase_date_max,new_purchase_date_min,new_month_lag_max,new_month_lag_min,new_month_lag_mean,new_month_lag_var,new_month_lag_skew,new_month_diff_mean,new_month_diff_var,new_month_diff_skew,new_weekend_mean,new_category_1_mean,new_category_2_mean,new_category_3_mean,new_card_id_size,new_price_mean,new_price_max,new_price_min,new_price_var,new_Christmas_Day_2017_mean,new_Children_day_2017_mean,new_Black_Friday_2017_mean,new_Mothers_Day_2018_mean,new_duration_mean,new_duration_min,new_duration_max,new_duration_var,new_duration_skew,new_amount_month_ratio_mean,new_amount_month_ratio_min,new_amount_month_ratio_max,new_amount_month_ratio_var,new_amount_month_ratio_skew,new_category_2_mean_mean,new_category_3_mean_mean,new_purchase_date_diff,new_purchase_date_average,new_purchase_date_uptonow,new_purchase_date_uptomin,hist_first_buy,hist_last_buy,new_first_buy,new_last_buy,card_id_total,card_id_cnt_ratio,purchase_amount_total,purchase_amount_mean,purchase_amount_max,purchase_amount_min,purchase_amount_ratio,month_diff_mean,month_diff_ratio,month_lag_mean,month_lag_max,month_lag_min,category_1_mean,installments_total,installments_mean,installments_max,installments_ratio,price_total,price_mean,price_max,duration_mean,duration_min,duration_max,amount_month_ratio_mean,amount_month_ratio_min,amount_month_ratio_max,new_CLV,hist_CLV,CLV_ratio,active_year,active_month,h_af_sum,h_af_min,h_af_max,h_af_std,h_af_mean,h_af_median,h_how_many_city_visited,h_mos
t_visited_city,h_num_most_visited_city,h_second_most_visited_city,h_num_second_most_visited_city,h_third_most_visited_city,h_num_third_most_visited_city,h_most_common_category_1,h_num_most_common_category_1,h_second_most_common_category_1,h_num_second_most_common_category_1,h_ins_sum,h_ins_min,h_ins_max,h_ins_std,h_ins_mean,h_ins_median,h_most_common_category_3,h_num_most_common_category_3,h_second_most_common_category_3,h_num_second_most_common_category_3,h_third_most_common_category_3,h_num_third_most_common_category_3,h_most_common_merchant_category_id,h_num_most_common_merchant_category_id,h_second_most_common_merchant_category_id,h_num_second_most_common_merchant_category_id,h_third_most_common_merchant_category_id,h_num_third_most_common_merchant_category_id,h_most_common_merchant_id,h_num_most_common_merchant_id,h_second_most_common_merchant_id,h_num_second_most_common_merchant_id,h_third_most_common_merchant_id,h_num_third_most_common_merchant_id,h_binned_purchase_amount_sum,h_binned_purchase_amount_min,h_binned_purchase_amount_max,h_binned_purchase_amount_std,h_binned_purchase_amount_mean,h_binned_purchase_amount_median,h_month_lag_sum,h_month_lag_std,h_month_lag_mean,h_month_lag_median,h_purchase_amount_sum,h_purchase_amount_min,h_purchase_amount_max,h_purchase_amount_std,h_purchase_amount_mean,h_purchase_amount_median,h_max_75_percentile_diff,h_max_50_percentile_diff,h_max_25_percentile_diff,h_75_percentile_50_percentile_diff,h_75_percentile_25_percentile_diff,h_75_percentile_min_diff,h_50_percentile_25_percentile_diff,h_50_percentile_min_diff,h_25_percentile_min_diff,h_how_many_unique_category_2,h_most_common_category_2,h_num_most_common_category_2,h_second_most_common_category_2,h_num_second_most_common_category_2,h_third_most_common_category_2,h_num_third_most_common_category_2,h_how_many_unique_state_id,h_most_common_state_id,h_num_most_common_state_id,h_second_most_common_state_id,h_num_second_most_common_state_id,h_third_most_common_state_id,h_num_t
hird_most_common_state_id,h_most_common_subsector_id,h_num_most_common_subsector_id,h_second_most_common_subsector_id,h_num_second_most_common_subsector_id,h_third_most_common_subsector_id,h_num_third_most_common_subsector_id,n_af_min,n_af_std,n_how_many_city_visited,n_most_visited_city,n_num_most_visited_city,n_second_most_visited_city,n_num_second_most_visited_city,n_third_most_visited_city,n_num_third_most_visited_city,n_most_common_category_1,n_num_most_common_category_1,n_second_most_common_category_1,n_num_second_most_common_category_1,n_ins_sum,n_ins_min,n_ins_max,n_ins_std,n_ins_mean,n_ins_median,n_most_common_category_3,n_num_most_common_category_3,n_second_most_common_category_3,n_num_second_most_common_category_3,n_third_most_common_category_3,n_num_third_most_common_category_3,n_most_common_merchant_category_id,n_num_most_common_merchant_category_id,n_second_most_common_merchant_category_id,n_num_second_most_common_merchant_category_id,n_third_most_common_merchant_category_id,n_num_third_most_common_merchant_category_id,n_most_common_merchant_id,n_num_most_common_merchant_id,n_second_most_common_merchant_id,n_num_second_most_common_merchant_id,n_third_most_common_merchant_id,n_num_third_most_common_merchant_id,n_binned_purchase_amount_sum,n_binned_purchase_amount_min,n_binned_purchase_amount_max,n_binned_purchase_amount_std,n_binned_purchase_amount_mean,n_binned_purchase_amount_median,n_month_lag_sum,n_month_lag_std,n_month_lag_mean,n_month_lag_median,n_purchase_amount_sum,n_purchase_amount_min,n_purchase_amount_max,n_purchase_amount_std,n_purchase_amount_mean,n_purchase_amount_median,n_max_75_percentile_diff,n_max_50_percentile_diff,n_max_25_percentile_diff,n_75_percentile_50_percentile_diff,n_75_percentile_25_percentile_diff,n_75_percentile_min_diff,n_50_percentile_25_percentile_diff,n_50_percentile_min_diff,n_25_percentile_min_diff,n_how_many_unique_category_2,n_most_common_category_2,n_num_most_common_category_2,n_second_most_common_category_2,n_num_
second_most_common_category_2,n_third_most_common_category_2,n_num_third_most_common_category_2,n_how_many_unique_state_id,n_most_common_state_id,n_num_most_common_state_id,n_second_most_common_state_id,n_num_second_most_common_state_id,n_third_most_common_state_id,n_num_third_most_common_state_id,n_most_common_subsector_id,n_num_most_common_subsector_id,n_second_most_common_subsector_id,n_num_second_most_common_subsector_id,n_third_most_common_subsector_id,n_num_third_most_common_subsector_id,h_transactions_count,h_category_1_sum,h_category_1_mean,h_category_2_1.0_mean,h_category_2_2.0_mean,h_category_2_3.0_mean,h_category_2_4.0_mean,h_category_2_5.0_mean,h_category_3_A_mean,h_category_3_B_mean,h_category_3_C_mean,h_purchase_month_mean,h_purchase_month_max,h_purchase_month_min,h_purchase_month_std,h_month_diff_mean,a_category_1_sum,a_category_1_mean,a_category_2_1.0_mean,a_category_2_2.0_mean,a_category_2_3.0_mean,a_category_2_4.0_mean,a_category_2_5.0_mean,a_category_3_A_mean,a_category_3_B_mean,a_category_3_C_mean,a_purchase_month_mean,a_purchase_month_max,a_purchase_month_min,a_purchase_month_std,a_month_diff_mean,new_category_1_sum,new_category_2_1.0_mean,new_category_2_2.0_mean,new_category_2_3.0_mean,new_category_2_4.0_mean,new_category_2_5.0_mean,new_category_3_A_mean,new_category_3_B_mean,new_category_3_C_mean,new_purchase_month_mean,new_purchase_month_std,month_lag_std,purchase_amount_count_mean,purchase_amount_count_std,purchase_amount_sum_mean,purchase_amount_sum_std,purchase_amount_mean_mean,purchase_amount_mean_std,purchase_amount_min_mean,purchase_amount_min_std,purchase_amount_max_mean,purchase_amount_max_std,purchase_amount_std_mean,purchase_amount_std_std,installments_sum_mean,installments_sum_std,installments_mean_mean,installments_mean_std,installments_min_mean,installments_min_std,installments_max_mean,installments_max_std,installments_std_mean,installments_std_std,category_1_purchase_amount_mean,category_1_purchase_amount_min,category_1_purchas
e_amount_max,category_1_purchase_amount_std,installments_purchase_amount_mean,installments_purchase_amount_min,installments_purchase_amount_max,installments_purchase_amount_std,city_id_purchase_amount_mean,city_id_purchase_amount_min,city_id_purchase_amount_max,city_id_purchase_amount_std,category_1_installments_mean,category_1_installments_min,category_1_installments_max,category_1_installments_std
0,C_ID_92a2005557,0.01314,0.00875,0.01143,2017-06-01,-0.82028,2.0,633.0,3165.0,1266.0,633.0,0.0079,0.00316,0.00158,0.03332,0.01111,0.01314,0.00875,0.00221,21,94,41,9,8.055,1,12,23,13.31,0,23,35,33.06,1,52,3.21,31,15.51,1,-167.4,0.8,-0.7393,-0.644,0.02057,5.133,4.0,1.0,0.01538,0.01521,7.92,1519551075.0,1498573088.0,0,-8,-3.912,5.75,0.066,13,11,12.03,0.03766,3.775,0.95,0.3462,0.0,1.046,0.01538,260,,,inf,-inf,,13.125,0.0,6.266,26.77,0.0,16.47,7.754,-7.746,-9.46,9.6,2.994,5.062,-0.05353,-0.0616,0.06665,0.00014,5.105,-0.6064,-0.639,242,0.9307,364,606,10.0,23.0,14.0,3.479,3.0,4.0,8.0,12.87,8.0,16.0,7.0,13.305,10.0,17.0,3.13,0.0,6.0,17.0,16.44,5.0,31.0,-13.24,-0.2961,-0.7246,-0.5757,0.01843,0.896,0.0,0.0,0.0,0.0,0.0,1525000985.0,1520258676.0,2.0,1.0,1.479,0.261,0.09326,11.95,0.0435,-4.797,0.261,0.0,1.0,0.0,23.0,-inf,-inf,-inf,,0.0,0.0,0.0,41.75,-6.9,-8.695,-3.258,2.752,0.949,-0.0481,-0.06036,-0.02467,0.00012,0.8687,-0.585,-0.6123,54.0,2.348,300.0,355.0,26.0,269.0,277.0,332.0,283.0,0.08846,-180.6,-1.22,0.504,-1.464,0.0791,23.98,0.9937,-2.434,2.0,-7.0,0.0,4.0,0.01538,1.0,0.0,-45.16,-79.3,0.504,-14.64,-18.16,6.344,-0.1016,-0.12195,0.042,-25.48039,-3617.039,0.00704,2017,6,247,0,1,0.21837,0.95,1.0,7,69,248,131.0,4.0,231.0,3.0,N,260,,,4,0,1,0.12331,0.01538,0.0,A,256,B,4.0,,,560,61,80.0,31.0,454.0,27.0,M_ID_1a81c358a3,32,M_ID_b3c49066d8,23.0,M_ID_5ba019a379,17.0,1421,2,10,1.25616,5.46538,5.0,-1017,2.39769,-3.91154,-4.0,-165.96874,-0.73939,2.25839,0.21214,-0.63834,-0.69804,66,130,198,64,131,176,67,111,44,2,1.0,257,5.0,3.0,,,3,9,257,5.0,2.0,20.0,1.0,34,97,37.0,59.0,39.0,27.0,1.0,0.0,3.0,69.0,19.0,19.0,3.0,276.0,1.0,N,23.0,,,0.0,0.0,0.0,0.0,0.0,0.0,A,23.0,,,,,278.0,7.0,80.0,3.0,454.0,2.0,M_ID_56086ccdb1,1.0,M_ID_2637773dd2,1.0,M_ID_e1fd26e379,1.0,142.0,4.0,8.0,1.33662,6.17391,7.0,34.0,0.51075,1.47826,1.0,-13.2442,-0.72437,-0.29611,0.13581,-0.57583,-0.58118,15.0,28.0,44.0,12.0,29.0,38.0,16.0,26.0,9.0,1.0,1.0,23.0,,,,,1.0,9.0,23.0,,,,,37.0,11.0,39.0,2.0,19.0,2.0,13.0,0.0,0.0,1.0,0.0,0
.0,0.0,0.0,0.69231,0.30769,0.0,9.53846,12.0,7.0,1.6641,12.0,0,0.0,0.98785,0.0,0.0,0.0,0.01215,1.0,0.0,0.0,7.97976,12,1,3.52857,12.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,3.47826,0.51075,2.73861,27.44444,16.62912,-17.48856,10.01566,-0.65048,0.05226,-0.73269,0.0088,-0.15882,0.91602,0.10958,0.11922,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.57583,-0.57583,-0.57583,,-0.57583,-0.57583,-0.57583,,-0.45899,-0.60659,-0.29611,0.1558,0.0,0.0,0.0,
1,C_ID_3d0044924f,0.01071,0.01138,0.01028,2017-01-01,0.39291,1.0,784.0,3136.0,784.0,0.0,0.0051,0.00128,0.0,0.03238,0.01079,0.01138,0.01028,0.00056,24,142,57,12,6.22,1,12,24,14.72,0,23,50,25.22,1,52,3.363,31,16.67,1,-215.4,0.8,-0.742,-0.615,0.0586,3.744,545.0,10.0,1.566,2.258,3.8,1517437869.0,1483720182.0,0,-12,-5.03,14.48,-0.258,14,12,13.07,0.09845,1.569,0.9688,0.3772,0.08856,1.0,1.2,350,-202.21849,-0.58109,0.4,-0.7424,0.06027,10.65,8.734,12.555,13.79,6.57,9.836,0.0,-8.04,-10.33,10.4,10.01,3.73,-0.04715,-0.06137,0.06152,0.00035,3.713,-0.6064,-0.533,390,1.114,388,778,4.0,6.0,5.0,2.5,2.0,3.0,5.0,11.164,6.0,17.0,4.0,9.0,5.0,13.0,1.5,0.0,4.0,4.0,13.5,1.0,30.0,-4.355,-0.7017,-0.7393,-0.726,0.00021,1.256,6.0,1.0,1.0,0.0,0.0,1522392506.0,1517504874.0,2.0,1.0,1.5,0.3,0.0,13.0,0.0,0.0,0.0,0.0,1.0,1.0,6.0,-0.72596,-0.70186,-0.73941,0.00021,0.0,0.0,0.0,56.84,-9.44,-9.61,-9.125,0.03442,1.215,-0.05585,-0.0569,-0.054,0.0,0.0,-0.585,-0.6187,56.0,9.336,331.0,387.0,5.0,395.0,396.0,453.0,356.0,0.01714,-219.8,-1.342,0.09814,-1.481,0.02022,26.06,0.9946,-3.531,2.0,-11.0,0.08856,551.0,2.566,11.0,0.01101,-0.399,-0.523,0.00892,-17.47,-19.94,1.273,-0.103,-0.1183,0.00754,-2.01022,-5767.3643,0.00035,2017,1,339,0,1,0.17472,0.96857,1.0,9,69,289,-1.0,31.0,19.0,7.0,N,319,Y,31.0,543,-1,10,1.51078,1.55143,1.0,B,276,C,72.0,A,2.0,307,54,80.0,47.0,560.0,36.0,M_ID_940fb4498f,27,M_ID_2637773dd2,26.0,M_ID_5ba019a379,26.0,1849,2,10,1.68509,5.28286,5.0,-1761,3.80493,-5.03143,-5.0,-210.00634,-0.7424,4.6303,0.38497,-0.60002,-0.70859,56,180,248,124,191,333,67,209,142,1,1.0,350,,,,,3,9,309,-1.0,31.0,15.0,10.0,34,74,37.0,59.0,19.0,54.0,1.0,0.0,1.0,69.0,6.0,,,,,N,6.0,,,6.0,1.0,1.0,0.0,1.0,1.0,B,6.0,,,,,307.0,2.0,514.0,1.0,80.0,1.0,M_ID_4dbadbd1c9,1.0,M_ID_6e7c412a33,1.0,M_ID_7e93847d9a,1.0,21.0,2.0,5.0,1.22474,3.5,3.0,9.0,0.54772,1.5,1.5,-4.35573,-0.73941,-0.70186,0.01433,-0.72596,-0.73263,9.0,9.0,52.0,0.0,43.0,46.0,42.0,46.0,3.0,1.0,1.0,6.0,,,,,1.0,9.0,6.0,,,,,19.0,2.0,37.0,2.0,9.0,1.0,11.0,2.0,0.18182,0.81818,
0.0,0.0,0.0,0.0,0.0,0.36364,0.63636,8.54545,12.0,3.0,2.6968,13.09091,29,0.08555,0.91445,0.0,0.0,0.0,0.0,0.0,0.80236,0.19174,6.14454,12,1,3.85918,13.0413,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2.5,0.54772,3.89444,26.07692,13.76264,-16.06796,10.15388,-0.59336,0.09513,-0.73697,0.00514,0.51066,1.32703,0.28211,0.284,38.53846,13.34551,1.61839,0.45385,0.69231,0.75107,6.0,2.54951,1.36598,0.79249,-0.72596,-0.72596,-0.72596,,-0.72596,-0.72596,-0.72596,,-0.72596,-0.72596,-0.72596,,1.0,1.0,1.0,
2,C_ID_d639edf6cd,0.01061,0.00875,0.01028,2016-08-01,0.68806,3.0,937.0,1874.0,1874.0,0.0,0.00213,0.00213,0.0,0.02964,0.00988,0.01061,0.00875,0.00099,7,13,8,10,4.56,1,12,14,17.9,8,23,22,18.38,2,49,3.303,19,19.33,2,-29.17,-0.1459,-0.73,-0.678,0.00764,5.62,0.0,0.0,0.0,0.0,0.0,1519758505.0,1484122882.0,0,-13,-8.6,14.766,0.7256,13,12,12.02,0.02325,6.56,0.9536,0.2559,0.0,4.63,0.0,43,-inf,-inf,-inf,-inf,,9.67,17.0,8.91,7.51,17.66,8.91,1.721,-8.16,-9.46,-1.75,1.133,5.387,-0.05643,-0.06085,-0.01215,5e-05,5.668,-0.604,-0.6396,412,9.58,361,774,1.0,1.0,1.0,4.0,4.0,4.0,1.0,17.0,17.0,17.0,1.0,17.0,17.0,17.0,5.0,5.0,5.0,1.0,28.0,28.0,28.0,-0.7,-0.7,-0.7,-0.7,,,0.0,0.0,0.0,,,1524937391.0,1524937391.0,2.0,2.0,2.0,,,12.0,,,1.0,0.0,5.0,0.0,1.0,-inf,-inf,-inf,,0.0,0.0,0.0,14.0,-8.41,-8.41,-8.41,,,-0.05835,-0.05835,-0.05835,,,-0.5806,-0.6123,0.0,0.0,301.0,301.0,163.0,575.0,635.0,635.0,44.0,0.02326,-29.88,-1.379,-0.846,-1.43,0.024,24.03,0.998,-6.6,2.0,-11.0,0.0,0.0,0.0,0.0,,-inf,-inf,-inf,-16.56,-17.88,-10.16,-0.11475,-0.1192,-0.0705,-0.05835,-104.32878,0.00056,2016,8,41,0,1,0.21308,0.95349,1.0,5,143,39,209.0,1.0,233.0,1.0,N,43,,,0,0,0,0.0,0.0,0.0,A,43,,,,,705,32,367.0,3.0,80.0,3.0,M_ID_5634fd83e0,28,M_ID_17511eed62,3.0,M_ID_cea793c22f,2.0,232,4,9,0.82056,5.39535,5.0,-370,3.84299,-8.60465,-10.0,-29.16739,-0.73014,-0.14585,0.08738,-0.67831,-0.69887,177,310,368,133,191,234,58,101,43,2,5.0,39,1.0,4.0,,,2,5,39,9.0,4.0,,,33,32,37.0,4.0,16.0,3.0,1.0,,1.0,143.0,1.0,,,,,N,1.0,,,0.0,0.0,0.0,,0.0,0.0,A,1.0,,,,,528.0,1.0,,,,,M_ID_c84d28e906,1.0,,,,,5.0,5.0,5.0,,5.0,5.0,2.0,,2.0,2.0,-0.70033,-0.70033,-0.70033,,-0.70033,-0.70033,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,5.0,1.0,,,,,1.0,5.0,1.0,,,,,25.0,1.0,,,,,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,3.0,4.0,2.0,1.41421,12.0,0,0.0,0.09756,0.0,0.0,0.0,0.90244,1.0,0.0,0.0,4.63415,12,1,3.32984,12.02439,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,4.0,,4.39697,3.41667,2.10878,-2.31904,1.4985,-0.66932,0.08066,-0.70675,0.0188,-0.62109,0.15178,0.06594,0.11903,0.0,0
.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.70033,-0.70033,-0.70033,,-0.70033,-0.70033,-0.70033,,-0.70033,-0.70033,-0.70033,,0.0,0.0,0.0,
3,C_ID_186d6a6901,0.01071,0.01417,0.01028,2017-09-01,0.1425,3.0,541.0,2164.0,1623.0,0.0,0.00739,0.00555,0.0,0.03516,0.01172,0.01417,0.01028,0.00213,13,50,25,6,7.742,1,12,16,14.445,0,23,20,32.0,1,52,2.793,25,16.88,2,-50.12,0.8,-0.7407,-0.6514,0.03836,5.758,86.0,3.0,1.146,0.235,3.307,1519818280.0,1506442941.0,0,-5,-2.832,3.248,0.5576,12,11,11.984,0.01299,-8.77,1.0,0.1428,0.1559,3.064,1.064,77,-47.47444,-0.63299,0.8,-0.7409,0.05334,44.66,0.0,0.0,2.623,0.0,24.19,17.9,-7.805,-8.89,9.6,5.52,5.76,-0.05432,-0.0652,0.06665,0.00027,5.727,-0.608,-0.583,154,2.0,360,515,5.0,7.0,6.0,3.715,3.0,4.0,5.0,13.0,7.0,21.0,5.0,13.86,10.0,16.0,3.285,1.0,6.0,7.0,13.14,4.0,24.0,-4.656,-0.567,-0.7344,-0.665,0.00434,0.8086,6.0,1.0,1.0,0.0,0.0,1524049211.0,1520423706.0,2.0,1.0,1.714,0.238,-1.2295,12.0,0.0,0.0,0.4285,0.1428,3.572,0.857,7.0,-0.68127,-0.58165,-0.73414,0.00296,0.0,0.0,0.0,37.72,-7.98,-8.81,-6.8,0.6255,0.81,-0.05542,-0.0612,-0.04724,3e-05,0.9243,-0.5854,-0.6177,41.0,5.855,311.0,353.0,25.0,180.0,187.0,229.0,84.0,0.09091,-54.78,-1.316,0.2329,-1.475,0.0929,23.98,1.001,-1.118,2.0,-4.0,0.2988,92.0,2.146,4.0,0.06976,-0.595,-0.6133,0.05823,-15.78,-17.7,2.8,-0.10974,-0.1263,0.01941,-2.71615,-322.05475,0.00843,2017,9,77,1,1,0.0,1.0,1.0,7,17,52,-1.0,12.0,69.0,8.0,N,65,Y,12.0,84,-1,3,0.58897,1.09091,1.0,B,68,C,7.0,A,2.0,278,13,80.0,10.0,879.0,10.0,M_ID_00a6ca8a8a,9,M_ID_48257bb851,7.0,M_ID_309752ddea,4.0,389,2,10,1.55509,5.05195,5.0,-218,1.80206,-2.83117,-4.0,-49.49136,-0.7409,1.4456,0.26162,-0.64274,-0.70799,54,125,141,70,87,99,16,29,12,2,4.0,53,1.0,24.0,,,5,22,52,-1.0,12.0,9.0,10.0,37,24,27.0,13.0,29.0,12.0,1.0,0.0,2.0,17.0,6.0,-1.0,1.0,,,N,6.0,Y,1.0,5.0,-1.0,1.0,0.75593,0.71429,1.0,B,6.0,A,1.0,,,80.0,2.0,278.0,1.0,68.0,1.0,M_ID_aa13f8b4d9,1.0,M_ID_46534664f2,1.0,M_ID_a3196b9d3b,1.0,36.0,3.0,7.0,1.34519,5.14286,5.0,12.0,0.48795,1.71429,2.0,-4.65437,-0.73414,-0.56674,0.06588,-0.66491,-0.69161,1.0,9.0,24.0,8.0,23.0,40.0,15.0,32.0,17.0,2.0,4.0,6.0,1.0,1.0,,,2.0,22.0,6.0,-1.0,1.0,,,37.0,3.0,27.0
,1.0,8.0,1.0,,,,,,,,,,,,,,,,,12,0.15584,0.15584,0.0,0.0,0.68831,0.0,0.0,0.88312,0.09091,7.74026,12,1,3.9048,11.97403,1.0,0.0,0.0,0.0,0.85714,0.0,0.0,0.85714,0.0,3.71429,0.48795,1.87083,12.83333,9.70395,-8.24856,7.14059,-0.60499,0.16704,-0.73427,0.00907,-0.09734,0.76609,0.20551,0.27744,14.0,11.52389,1.04482,0.21564,0.33333,1.0328,2.0,1.09545,0.51891,0.48328,-0.66408,-0.66524,-0.66291,0.00165,-0.62401,-0.68127,-0.56674,0.08099,-0.66408,-0.66524,-0.66291,0.00165,0.83333,0.66667,1.0,0.2357
4,C_ID_cdbd2c0db2,0.00806,0.01417,0.01028,2017-11-01,-0.15975,4.0,480.0,480.0,1440.0,0.0,0.00208,0.00625,0.0,0.03251,0.01084,0.01417,0.00806,0.00309,17,66,26,4,5.406,1,12,22,13.05,0,23,17,21.78,1,52,3.24,30,16.1,1,-75.25,0.8,-0.746,-0.566,0.1039,3.268,182.0,12.0,1.368,3.598,5.418,1519850441.0,1510444800.0,0,-3,-1.286,1.054,-0.296,12,11,11.98,0.02222,-6.504,0.9624,0.3157,0.1128,3.443,1.053,133,-78.53185,-0.59047,0.3042,-0.74616,0.04965,7.79,0.0,0.0,0.0,0.0,0.6616,20.66,-6.773,-8.95,9.6,14.93,3.268,-0.04727,-0.06464,0.06665,0.00072,3.258,-0.6084,-0.599,108,0.812,360,469,10.0,36.0,17.0,3.555,3.0,4.0,14.0,14.72,5.0,23.0,8.0,13.36,9.0,17.0,3.277,0.0,6.0,22.0,14.586,1.0,31.0,-19.92,0.451,-0.7393,-0.5537,0.0501,2.893,36.0,2.0,1.028,0.02856,5.918,1524941425.0,1519991743.0,2.0,1.0,1.556,0.254,-0.2334,11.945,0.05396,-4.05,0.3333,0.05554,3.473,1.0,36.0,-0.55959,0.45089,-0.73939,0.0503,0.0,0.0,0.0,41.2,-6.605,-8.875,5.41,7.125,2.908,-0.04642,-0.0654,0.03757,0.00035,2.824,-0.5845,-0.607,57.0,1.583,301.0,358.0,11.0,119.0,121.0,178.0,169.0,0.27068,-95.2,-1.119,1.251,-1.485,0.2646,23.92,0.9976,0.2695,2.0,-2.0,0.1683,218.0,2.396,14.0,0.1978,-0.4365,-0.467,0.08936,-13.375,-17.83,15.016,-0.0937,-0.1301,0.10425,-60.03924,-835.65295,0.07185,2017,11,128,0,1,0.19093,0.96241,1.0,6,17,100,-1.0,15.0,69.0,8.0,N,118,Y,15.0,182,1,12,1.89686,1.36842,1.0,B,126,C,7.0,,,278,62,80.0,14.0,879.0,12.0,M_ID_cecefd9589,28,M_ID_48257bb851,14.0,M_ID_00a6ca8a8a,9.0,776,1,10,1.5963,5.83459,5.0,-171,1.0267,-1.28571,-1.0,-48.68766,-0.74616,7.19304,1.35209,-0.36607,-0.68981,27,51,82,23,54,81,30,57,26,3,4.0,107,1.0,25.0,5.0,1.0,6,22,100,-1.0,15.0,9.0,8.0,37,79,29.0,12.0,27.0,8.0,1.0,0.0,5.0,17.0,21.0,302.0,7.0,107.0,4.0,N,34.0,Y,2.0,35.0,-1.0,2.0,0.37691,0.97222,1.0,B,34.0,A,1.0,C,1.0,278.0,11.0,80.0,5.0,307.0,4.0,M_ID_8442c31b02,1.0,M_ID_07c4c547b5,1.0,M_ID_40e0130f8d,1.0,215.0,2.0,9.0,1.50211,5.97222,6.0,56.0,0.50395,1.55556,2.0,-19.92624,-0.73939,0.45089,0.22382,-0.55351,-0.60745,14.0,26.0,42.0,12.0,28.0,43.0
,16.0,31.0,15.0,3.0,4.0,25.0,3.0,7.0,1.0,4.0,5.0,22.0,21.0,7.0,7.0,4.0,4.0,37.0,17.0,19.0,4.0,25.0,3.0,5.0,3.0,0.6,0.0,0.0,0.0,0.4,0.0,0.0,0.4,0.6,1.6,2.0,1.0,0.54772,12.0,12,0.09375,0.07812,0.0,0.0,0.82031,0.00781,0.0,0.96875,0.03125,5.55469,12,1,5.04126,11.95312,2.0,0.05556,0.0,0.19444,0.69444,0.0,0.0,0.94444,0.02778,3.55556,0.50395,1.29099,32.0,9.4163,-17.26012,8.31636,-0.52549,0.13728,-0.73204,0.01627,2.25935,3.34428,0.56839,0.57826,36.0,9.12871,1.14162,0.15839,1.0,0.0,4.5,5.06623,0.68143,0.87902,-0.49277,-0.5611,-0.42443,0.09664,-0.35886,-0.5712,-0.17592,0.19928,-0.53472,-0.67117,-0.32676,0.15067,1.22059,0.94118,1.5,0.39515


In [26]:
# Columns removed from both frames before the merged data is written to disk.
FEATS_EXCLUDED = [
    'hist_purchase_date_max',
    'hist_purchase_date_min',
    'hist_card_id_size',
    'new_purchase_date_max',
    'new_purchase_date_min',
    'new_card_id_size',
]

In [28]:
# Strip every column named in FEATS_EXCLUDED from both frames, mutating them in place.
train.drop(columns=FEATS_EXCLUDED, inplace=True)
test.drop(columns=FEATS_EXCLUDED, inplace=True)

In [29]:
# Sanity check: (rows, columns) of the train frame followed by the test frame.
print(*(frame.shape for frame in (train, test)))

(201917, 444) (123623, 443)


In [30]:
# Persist the merged frames (without the index) to the working directory.
train.to_csv(path_or_buf='train_final.csv', index=False)
test.to_csv(path_or_buf='test_final.csv', index=False)

In [None]:
# Detach the label: `y` keeps the 'target' column and `train` no longer contains it.
y = train.pop('target')

In [None]:
# Cross-validation configuration shared by the training loops below.
stratified = False
num_folds = 5  # single source of truth for the number of folds in either branch

if stratified:
    # Imported here because the notebook's import cell only brings in KFold.
    from sklearn.model_selection import StratifiedKFold
    # NOTE(review): StratifiedKFold.split needs discrete class labels; the loops
    # below pass the continuous `y`, so a categorical label would have to be
    # supplied there before this flag is switched on — confirm before enabling.
    folds = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=SEED)
else:
    folds = KFold(n_splits=num_folds, shuffle=True, random_state=SEED)

# Create arrays and dataframes to store results
oof_preds = np.zeros(train.shape[0])   # one out-of-fold prediction per training row
sub_preds = np.zeros(test.shape[0])    # fold-averaged prediction per test row
feature_importance_df = pd.DataFrame() # per-fold feature importances

In [None]:
# Label-encode every object-dtype column (except the id/date columns) with one
# encoder fitted on train + test together, then build the model feature list.
object_columns = [
    col for col in train.select_dtypes(include='object').columns
    if col not in ['card_id', 'first_active_month']
]
for col in object_columns:
    print(col)
    le = preprocessing.LabelEncoder()
    # NOTE(review): the values are passed as plain Python lists on purpose; this
    # appears to rely on NumPy coercing mixed str/NaN lists to strings so that
    # missing values get their own label — confirm before switching to arrays.
    le.fit(list(train[col].values) + list(test[col].values))
    train[col] = le.transform(list(train[col].values))
    test[col] = le.transform(list(test[col].values))

# The narrowed FEATS_EXCLUDED no longer lists 'target' / 'outliers' (the notebook's
# first definition did), so exclude them explicitly: label columns, and anything
# presumably derived from the label, must never be fed to the model as features.
non_feature_columns = set(
    ['card_id', 'first_active_month', 'target', 'outliers'] + FEATS_EXCLUDED
)
feats = [c for c in train.columns if c not in non_feature_columns]

In [None]:
#categorical_features = ['h_most_common_category_1', 'h_second_most_common_category_1', 'h_most_common_category_3', 'h_second_most_common_category_3', 'h_third_most_common_category_3', 'h_most_common_merchant_id', 'h_second_most_common_merchant_id', 'h_third_most_common_merchant_id', 'n_most_common_category_1', 'n_second_most_common_category_1', 'n_most_common_category_3', 'n_second_most_common_category_3', 'n_third_most_common_category_3', 'n_most_common_merchant_id', 'n_second_most_common_merchant_id', 'n_third_most_common_merchant_id']

In [None]:
# rmse
def rmse(y_true, y_pred):
    """Return the root-mean-squared error between `y_true` and `y_pred`."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

In [None]:
# k-fold
# Start from clean accumulators so that re-running this cell never adds new test
# predictions on top of the ones left behind by a previous run.
oof_preds = np.zeros(train.shape[0])
sub_preds = np.zeros(test.shape[0])
fold_importances = []

# params optimized by optuna (identical for every fold, so built once)
params = {'num_leaves': 111,
          'min_data_in_leaf': 149,
          'objective': 'regression',
          'max_depth': 9,
          'n_jobs': -1,
          'learning_rate': 0.02,
          "boosting": "gbdt",
          "feature_fraction": 0.7522,
          "bagging_freq": 1,
          "bagging_fraction": 0.7083,
          "bagging_seed": SEED,
          "metric": 'rmse',
          "lambda_l1": 0.2634,
          "random_state": SEED,
          "verbosity": -1}

# Select the feature columns once instead of re-slicing the frames in every fold.
train_features = train[feats]
test_features = test[feats]

for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_features, y)):
    train_x, train_y = train_features.iloc[train_idx], y.iloc[train_idx]
    valid_x, valid_y = train_features.iloc[valid_idx], y.iloc[valid_idx]

    # set data structure
    lgb_train = lgb.Dataset(train_x,
                            label=train_y,
                            free_raw_data=False)
    lgb_test = lgb.Dataset(valid_x,
                           label=valid_y,
                           free_raw_data=False)

    # NOTE(review): `early_stopping_rounds` / `verbose_eval` are the keyword
    # arguments this notebook was written against; newer LightGBM releases
    # reportedly expect callbacks instead — confirm against the installed version.
    reg = lgb.train(
                    params,
                    lgb_train,
                    valid_sets=[lgb_train, lgb_test],
                    valid_names=['train', 'test'],
                    num_boost_round=10000,
                    early_stopping_rounds=200,
                    verbose_eval=200
                    )

    # Out-of-fold predictions for the held-out rows; test predictions are averaged
    # over the folds by adding 1/n_splits of each fold's prediction.
    oof_preds[valid_idx] = reg.predict(valid_x, num_iteration=reg.best_iteration)
    sub_preds += reg.predict(test_features, num_iteration=reg.best_iteration) / folds.n_splits

    fold_importances.append(pd.DataFrame({
        "feature": feats,
        "importance": np.log1p(reg.feature_importance(importance_type='gain', iteration=reg.best_iteration)),
        "fold": n_fold + 1,
    }))
    print('Fold %2d RMSE : %.6f' % (n_fold + 1, rmse(valid_y, oof_preds[valid_idx])))
    del reg, train_x, train_y, valid_x, valid_y
    gc.collect()

# Build the importance table once, after all folds, instead of growing it per fold.
feature_importance_df = pd.concat(fold_importances, axis=0)
del train_features, test_features
gc.collect()
print("Average RMSE = {}".format(rmse(y,oof_preds)))

In [None]:
# Disabled cell: everything below is one bare string literal, so nothing here is
# executed and `get_feature_importances` is NOT defined by this cell.
# The quoted text sketches a helper that optionally shuffles the target `y`,
# fits a 200-round LightGBM model on `data[feats]` and returns a frame with the
# per-feature gain and split importances.
# NOTE(review): the quoted comment mentions "RF mode" but the quoted params use
# "boosting": "gbdt" — reconcile if this cell is ever re-enabled.
"""
def get_feature_importances(data, shuffle):
    # Gather real features
    #train_features = [f for f in data if f not in ['target', 'object_id']]
    # Go over fold and keep track of CV score (train and valid) and feature importances
    
    # Shuffle target if required
    y_ = y.copy()

    if shuffle:
        y_ = y_.copy().sample(frac=1.0)
        y_ = pd.Series(y_)
    
    dtrain = lgb.Dataset(data[feats], y_, free_raw_data=False, silent=True)
    
    # Fit LightGBM in RF mode, yes it's quicker than sklearn RandomForest
    param = {'num_leaves': 111,
             'min_data_in_leaf': 149, 
             'objective':'regression',
             'max_depth': 9,
             'learning_rate': 0.02,
             "boosting": "gbdt",
             "feature_fraction": 0.7522,
             "bagging_freq": 1,
             "bagging_fraction": 0.7083 ,
             "bagging_seed": SEED,
             "metric": 'rmse',
             "lambda_l1": 0.2634,
             "random_state": 51,
             "verbosity": -1}
    clf = lgb.train(params=param, train_set=dtrain, num_boost_round=200)

    # Get feature importances
    imp_df = pd.DataFrame()
    imp_df["feature"] = list(feats)
    imp_df["importance_gain"] = clf.feature_importance(importance_type='gain')
    imp_df["importance_split"] = clf.feature_importance(importance_type='split')
    #imp_df['trn_score'] = roc_auc_score(pd.get_dummies(y), clf.predict_proba(data[train_features]))
    
    return imp_df
"""

In [None]:
# Disabled cell: a bare string literal, so `actual_imp_df` is never assigned here.
# The quoted call would compute importances on the real (unshuffled) target.
"""actual_imp_df = get_feature_importances(data=train, shuffle=False)"""

In [None]:
# Disabled cell: a bare string literal, so no null-importance runs are executed.
# The quoted code would call get_feature_importances with shuffle=True for
# `nb_runs` (80) iterations, stack the results into `null_imp_df` and print a
# single self-overwriting progress line.
"""
null_imp_df = pd.DataFrame()
nb_runs = 80
import time
start = time.time()
dsp = ''
for i in range(nb_runs):
    # Get current run importances
    imp_df = get_feature_importances(data=train, shuffle=True)
    imp_df['run'] = i + 1 
    # Concat the latest importances with the old ones
    null_imp_df = pd.concat([null_imp_df, imp_df], axis=0)
    # Erase previous message
    for l in range(len(dsp)):
        print('\b', end='', flush=True)
    # Display current run and time used
    spent = (time.time() - start) / 60
    dsp = 'Done with %4d of %4d (Spent %5.1f min)' % (i + 1, nb_runs, spent)
    print(dsp, end='', flush=True)
"""

In [None]:
# Disabled cell: a bare string literal, so `imp_columns` is NOT computed here.
# The quoted code would keep the features whose actual gain AND split importance
# both exceed a threshold derived from the shuffled-target runs.
# NOTE(review): the thresholds are named "*_2_sigma" but are computed as
# mean + std*1 (one standard deviation) — confirm which was intended.
"""
temp = null_imp_df.groupby('feature')['importance_gain','importance_split'].agg(['mean','std']).reset_index(drop=False)
temp.columns = ['feature','gain_mean','gain_std','split_mean','split_std']
temp['gain_2_sigma'] = temp['gain_mean'] + temp['gain_std']*1
temp['split_2_sigma'] = temp['split_mean'] + temp['split_std']*1
temp2 = actual_imp_df.merge(temp[['feature','gain_2_sigma','split_2_sigma']],how='left',on = 'feature')
#temp2[(temp2['importance_gain'] > temp2['gain_2_sigma']) & (temp2['importance_split'] > temp2['split_2_sigma'] )].head()
imp_columns = list(temp2[(temp2['importance_gain'] > temp2['gain_2_sigma']) \
      & (temp2['importance_split'] > temp2['split_2_sigma'] ) ]['feature'])
"""

In [None]:
# Hard-coded list of selected feature names (order preserved from the original literal).
imp_columns = [
    'feature_1', 'elapsed_time', 'days_feature1', 'days_feature2', 'days_feature3',
    'feature_sum', 'feature_mean', 'feature_min',
    'hist_subsector_id_nunique', 'hist_merchant_id_nunique', 'hist_merchant_category_id_nunique',
    'hist_month_nunique', 'hist_month_max',
    'hist_weekofyear_nunique', 'hist_weekofyear_min', 'hist_weekofyear_max',
    'hist_day_nunique', 'hist_installments_sum',
    'hist_month_lag_min', 'hist_month_lag_mean', 'hist_month_lag_var',
    'hist_month_diff_max', 'hist_month_diff_min', 'hist_month_diff_mean',
    'hist_authorized_flag_mean', 'hist_category_1_mean',
    'hist_price_sum', 'hist_price_var',
    'hist_Mothers_Day_2017_mean', 'hist_fathers_day_2017_mean',
    'hist_Children_day_2017_mean', 'hist_Valentine_Day_2017_mean',
    'hist_duration_mean', 'hist_duration_min', 'hist_duration_max',
    'hist_amount_month_ratio_min',
    'hist_purchase_date_diff', 'hist_purchase_date_average',
    'hist_purchase_date_uptonow', 'hist_purchase_date_uptomin',
    'new_merchant_id_nunique',
    'new_month_mean', 'new_month_max',
    'new_weekofyear_mean', 'new_weekofyear_min', 'new_weekofyear_max',
    'new_day_nunique', 'new_day_mean', 'new_day_min', 'new_day_max',
    'new_purchase_amount_sum', 'new_purchase_amount_max',
    'new_installments_mean',
    'new_month_lag_max', 'new_month_lag_min', 'new_month_lag_mean',
    'new_category_1_mean',
    'new_Christmas_Day_2017_mean', 'new_Children_day_2017_mean',
    'new_Black_Friday_2017_mean', 'new_Mothers_Day_2018_mean',
    'new_duration_mean', 'new_duration_min', 'new_duration_max',
    'new_amount_month_ratio_max', 'new_category_3_mean_mean',
    'new_purchase_date_diff', 'new_purchase_date_average',
    'new_purchase_date_uptonow', 'new_purchase_date_uptomin',
    'hist_last_buy', 'card_id_cnt_ratio', 'purchase_amount_total', 'month_diff_mean',
    'month_lag_max', 'month_lag_min', 'category_1_mean',
    'installments_total', 'installments_ratio', 'price_total', 'duration_min',
    'new_CLV', 'active_year',
    'h_af_std', 'h_af_mean',
    'h_most_common_category_1', 'h_num_second_most_common_category_1',
    'h_ins_sum', 'h_ins_mean',
    'h_num_most_common_merchant_id', 'h_num_second_most_common_merchant_id',
    'h_binned_purchase_amount_sum', 'h_binned_purchase_amount_min', 'h_binned_purchase_amount_mean',
    'h_month_lag_sum', 'h_month_lag_mean',
    'h_purchase_amount_sum', 'h_purchase_amount_mean', 'h_purchase_amount_median',
    'h_max_25_percentile_diff', 'h_75_percentile_min_diff', 'h_50_percentile_min_diff',
    'h_most_common_subsector_id',
    'n_num_most_common_category_1', 'n_ins_min', 'n_binned_purchase_amount_sum',
    'n_month_lag_mean',
    'n_purchase_amount_sum', 'n_purchase_amount_max', 'n_purchase_amount_std', 'n_purchase_amount_mean',
    'n_max_25_percentile_diff',
    'h_transactions_count', 'h_category_1_sum', 'h_category_1_mean', 'h_month_diff_mean',
    'a_category_1_sum', 'a_category_1_mean', 'a_category_3_B_mean',
    'a_purchase_month_max', 'a_purchase_month_std', 'a_month_diff_mean',
    'new_category_1_sum', 'new_category_3_B_mean',
    'new_purchase_month_mean', 'new_purchase_month_std',
    'month_lag_std',
    'purchase_amount_count_mean', 'purchase_amount_mean_mean', 'purchase_amount_min_mean',
    'purchase_amount_max_mean', 'purchase_amount_max_std',
    'installments_sum_mean', 'installments_sum_std', 'installments_max_mean',
    'category_1_purchase_amount_mean', 'category_1_purchase_amount_max',
    'installments_purchase_amount_mean', 'installments_purchase_amount_max',
    'city_id_purchase_amount_mean', 'city_id_purchase_amount_max',
]

In [None]:
# Selected features that are not already columns of train_df (order of imp_columns kept).
only_in_last = [
    feature_name
    for feature_name in imp_columns
    if feature_name not in used_features
]

In [None]:
# Left-join the extra selected columns from train_last onto train_df by card_id.
train = train_df.merge(
    train_last[['card_id', *only_in_last]],
    on='card_id',
    how='left',
)

In [None]:
# Left-join the extra selected columns from test_last onto test_df by card_id.
test = test_df.merge(
    test_last[['card_id', *only_in_last]],
    on='card_id',
    how='left',
)

In [None]:
# Sanity check: (rows, columns) of the re-merged train frame followed by the test frame.
print(*(frame.shape for frame in (train, test)))

In [None]:
# Detach the label again for the re-merged frame: `y` keeps 'target', `train` loses it.
y = train.pop('target')

In [None]:
# Preview the first five rows of the re-merged training frame.
train.head(n=5)

In [None]:
# Rebind imp_columns to every column of the current train frame except the id.
imp_columns = [
    column_name
    for column_name in train.columns
    if not column_name == 'card_id'
]

In [None]:
# Label-encode every object-dtype column (except the id/date columns) with one
# encoder fitted on train + test together, then build the model feature list.
object_columns = [
    col for col in train.select_dtypes(include='object').columns
    if col not in ['card_id', 'first_active_month']
]
for col in object_columns:
    print(col)
    le = preprocessing.LabelEncoder()
    # NOTE(review): the values are passed as plain Python lists on purpose; this
    # appears to rely on NumPy coercing mixed str/NaN lists to strings so that
    # missing values get their own label — confirm before switching to arrays.
    le.fit(list(train[col].values) + list(test[col].values))
    train[col] = le.transform(list(train[col].values))
    test[col] = le.transform(list(test[col].values))

# `train` is rebuilt from train_df, which still carries an 'outliers' column, and
# the narrowed FEATS_EXCLUDED no longer lists 'target' / 'outliers' (the notebook's
# first definition did). Exclude them explicitly: label columns, and anything
# presumably derived from the label, must never be fed to the model as features.
non_feature_columns = set(
    ['card_id', 'first_active_month', 'target', 'outliers'] + FEATS_EXCLUDED
)
feats = [c for c in train.columns if c not in non_feature_columns]

In [None]:
# k-fold
# Reset the accumulators before training this model. `sub_preds` is updated with
# `+=`, so without a reset the test predictions of an earlier training loop would
# be summed with this one's and the submission values would be inflated.
oof_preds = np.zeros(train.shape[0])
sub_preds = np.zeros(test.shape[0])
fold_importances = []

# params optimized by optuna (identical for every fold, so built once)
params = {'num_leaves': 111,
          'min_data_in_leaf': 149,
          'objective': 'regression',
          'max_depth': 9,
          'learning_rate': 0.02,
          "boosting": "gbdt",
          "feature_fraction": 0.7522,
          "bagging_freq": 1,
          "bagging_fraction": 0.7083,
          "bagging_seed": SEED,
          "metric": 'rmse',
          "lambda_l1": 0.2634,
          "random_state": SEED,
          "verbosity": -1}

# Select the feature columns once instead of re-slicing the frames in every fold.
train_features = train[feats]
test_features = test[feats]

for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_features, y)):
    train_x, train_y = train_features.iloc[train_idx], y.iloc[train_idx]
    valid_x, valid_y = train_features.iloc[valid_idx], y.iloc[valid_idx]

    # set data structure
    lgb_train = lgb.Dataset(train_x,
                            label=train_y,
                            free_raw_data=False)
    lgb_test = lgb.Dataset(valid_x,
                           label=valid_y,
                           free_raw_data=False)

    # NOTE(review): `early_stopping_rounds` / `verbose_eval` are the keyword
    # arguments this notebook was written against; newer LightGBM releases
    # reportedly expect callbacks instead — confirm against the installed version.
    reg = lgb.train(
                    params,
                    lgb_train,
                    valid_sets=[lgb_train, lgb_test],
                    valid_names=['train', 'test'],
                    num_boost_round=10000,
                    early_stopping_rounds=200,
                    verbose_eval=200
                    )

    # Out-of-fold predictions for the held-out rows; test predictions are averaged
    # over the folds by adding 1/n_splits of each fold's prediction.
    oof_preds[valid_idx] = reg.predict(valid_x, num_iteration=reg.best_iteration)
    sub_preds += reg.predict(test_features, num_iteration=reg.best_iteration) / folds.n_splits

    fold_importances.append(pd.DataFrame({
        "feature": feats,
        "importance": np.log1p(reg.feature_importance(importance_type='gain', iteration=reg.best_iteration)),
        "fold": n_fold + 1,
    }))
    print('Fold %2d RMSE : %.6f' % (n_fold + 1, rmse(valid_y, oof_preds[valid_idx])))
    del reg, train_x, train_y, valid_x, valid_y
    gc.collect()

# Importance table for THIS model only, built once after all folds.
feature_importance_df = pd.concat(fold_importances, axis=0)
del train_features, test_features
gc.collect()
print("Average RMSE = {}".format(rmse(y,oof_preds)))

In [None]:
# Store the accumulated test predictions (`sub_preds`, filled by the k-fold loop)
# as the 'target' column of the test frame so they can be exported.
test['target'] = sub_preds

In [None]:
# Write the id + predicted target pairs to the submission path (no index column).
submission_file_name = '../my_submissions/deneme2'
test.loc[:, ['card_id', 'target']].to_csv(path_or_buf=submission_file_name, index=False)