In [1]:
import pandas as pd
import numpy as np

from catboost import Pool, CatBoostRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

import warnings
import datetime
import time
import gc

import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
%matplotlib inline
import seaborn as sns
sns.set()

In [2]:
# Importation du dataset new_merchant_transactions
df_new_merchant = pd.read_csv("../data_sets/new_merchant_transactions.csv",parse_dates=['purchase_date'])
print(df_new_merchant.shape)
df_new_merchant.head(5)

(1963031, 14)


Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id
0,Y,C_ID_415bb3a509,107,N,1,B,307,M_ID_b0c793002c,1,-0.557574,2018-03-11 14:57:36,1.0,9,19
1,Y,C_ID_415bb3a509,140,N,1,B,307,M_ID_88920c89e8,1,-0.56958,2018-03-19 18:53:37,1.0,9,19
2,Y,C_ID_415bb3a509,330,N,1,B,507,M_ID_ad5237ef6b,2,-0.551037,2018-04-26 14:08:44,1.0,9,14
3,Y,C_ID_415bb3a509,-1,Y,1,B,661,M_ID_9e84cda3b1,1,-0.671925,2018-03-07 09:43:21,,-1,8
4,Y,C_ID_ef55cf8d4b,-1,Y,1,B,166,M_ID_3c86fa3831,1,-0.659904,2018-03-22 21:07:53,,-1,29


In [3]:
# Importation du dataset historical_transactions
df_histo = pd.read_csv("../data_sets/historical_transactions.csv",parse_dates=['purchase_date'])
print(df_histo.shape)
df_histo.head(5)

(29112361, 14)


Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id
0,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_e020e9b302,-8,-0.703331,2017-06-25 15:33:07,1.0,16,37
1,Y,C_ID_4e6213e9bc,88,N,0,A,367,M_ID_86ec983688,-7,-0.733128,2017-07-15 12:10:45,1.0,16,16
2,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_979ed661fc,-6,-0.720386,2017-08-09 22:04:29,1.0,16,37
3,Y,C_ID_4e6213e9bc,88,N,0,A,560,M_ID_e6d5ae8ea6,-5,-0.735352,2017-09-02 10:06:26,1.0,16,34
4,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_e020e9b302,-11,-0.722865,2017-03-10 01:14:19,1.0,16,37


In [4]:
# Importation du dataset train
df_train = pd.read_csv('../data_sets/train.csv')
# Importation du dataset test
df_test = pd.read_csv('../data_sets/test.csv')

In [5]:
# Remplacement des valeurs NaN
# Ces transformations sont pour le historical et le new merchant
for df in [df_histo,df_new_merchant]:
    df['category_2'].fillna(1.0,inplace=True)
    df['category_3'].fillna('A',inplace=True)
    df['merchant_id'].fillna('M_ID_00a6ca8a8a',inplace=True)

In [6]:
# Transformer 'purchase_date' en datetime, et modifier les valeurs de 'authorized_flag' et 'category_1' par 0 et 1
# Calcule du différentiel temporel entre aujourd'hui et la date d'achat du client en mois et lui ajouter le décalage associé
# Ces transformations sont pour le historical et le new merchant
for df in [df_histo,df_new_merchant]:
    df['purchase_date'] = pd.to_datetime(df['purchase_date'])
    df['authorized_flag'] = df['authorized_flag'].map({'Y':1, 'N':0})
    df['category_1'] = df['category_1'].map({'Y':1, 'N':0}) 
    #https://www.kaggle.com/c/elo-merchant-category-recommendation/discussion/73244
    df['month_diff'] = ((datetime.datetime.today() - df['purchase_date']).dt.days)//30
    df['month_diff'] += df['month_lag']

In [7]:
df_histo.head(5)

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id,month_diff
0,1,C_ID_4e6213e9bc,88,0,0,A,80,M_ID_e020e9b302,-8,-0.703331,2017-06-25 15:33:07,1.0,16,37,13
1,1,C_ID_4e6213e9bc,88,0,0,A,367,M_ID_86ec983688,-7,-0.733128,2017-07-15 12:10:45,1.0,16,16,13
2,1,C_ID_4e6213e9bc,88,0,0,A,80,M_ID_979ed661fc,-6,-0.720386,2017-08-09 22:04:29,1.0,16,37,13
3,1,C_ID_4e6213e9bc,88,0,0,A,560,M_ID_e6d5ae8ea6,-5,-0.735352,2017-09-02 10:06:26,1.0,16,34,13
4,1,C_ID_4e6213e9bc,88,0,0,A,80,M_ID_e020e9b302,-11,-0.722865,2017-03-10 01:14:19,1.0,16,37,13


In [8]:
# Fonction pour le renommage des nouvelles colonnes
def get_new_columns(name,aggs):
    return [name + '_' + k + '_' + agg for k in aggs.keys() for agg in aggs[k]]

In [9]:
# Calcule des nouvelles colonnes pour le historical_transactions
# Ces nouvelles colonnes seront ajoutées dans le train et le test
aggs = {}
for col in ['subsector_id','merchant_id','merchant_category_id']:
    aggs[col] = ['nunique']

aggs['purchase_amount'] = ['sum','max','min','mean','var']
aggs['installments'] = ['sum','max','min','mean','var']
aggs['purchase_date'] = ['max','min']
aggs['month_lag'] = ['max','min','mean','var']
aggs['month_diff'] = ['mean']
aggs['authorized_flag'] = ['sum', 'mean']
aggs['category_1'] = ['sum', 'mean']
aggs['card_id'] = ['size']

for col in ['category_2','category_3']:
    df_histo[col+'_mean'] = df_histo.groupby([col])['purchase_amount'].transform('mean')
    aggs[col+'_mean'] = ['mean']    

new_columns = get_new_columns('hist',aggs)
print (new_columns)
df_hist_trans_group = df_histo.groupby('card_id').agg(aggs)
df_hist_trans_group.columns = new_columns
df_hist_trans_group.reset_index(drop=False,inplace=True)
df_hist_trans_group['hist_purchase_date_diff'] = (df_hist_trans_group['hist_purchase_date_max'] - df_hist_trans_group['hist_purchase_date_min']).dt.days
df_hist_trans_group['hist_purchase_date_average'] = df_hist_trans_group['hist_purchase_date_diff']/df_hist_trans_group['hist_card_id_size']
df_hist_trans_group['hist_purchase_date_uptonow'] = (datetime.datetime.today() - df_hist_trans_group['hist_purchase_date_max']).dt.days
df_train = df_train.merge(df_hist_trans_group,on='card_id',how='left')
df_test = df_test.merge(df_hist_trans_group,on='card_id',how='left')
del df_hist_trans_group;gc.collect()

['hist_subsector_id_nunique', 'hist_merchant_id_nunique', 'hist_merchant_category_id_nunique', 'hist_purchase_amount_sum', 'hist_purchase_amount_max', 'hist_purchase_amount_min', 'hist_purchase_amount_mean', 'hist_purchase_amount_var', 'hist_installments_sum', 'hist_installments_max', 'hist_installments_min', 'hist_installments_mean', 'hist_installments_var', 'hist_purchase_date_max', 'hist_purchase_date_min', 'hist_month_lag_max', 'hist_month_lag_min', 'hist_month_lag_mean', 'hist_month_lag_var', 'hist_month_diff_mean', 'hist_authorized_flag_sum', 'hist_authorized_flag_mean', 'hist_category_1_sum', 'hist_category_1_mean', 'hist_card_id_size', 'hist_category_2_mean_mean', 'hist_category_3_mean_mean']


42

In [10]:
# Calcule des nouvelles colonnes pour le new_merchant_transactions
# Ces nouvelles colonnes seront ajoutées dans le train et le test
aggs = {}
for col in ['subsector_id','merchant_id','merchant_category_id']:
    aggs[col] = ['nunique']
aggs['purchase_amount'] = ['sum','max','min','mean','var']
aggs['installments'] = ['sum','max','min','mean','var']
aggs['purchase_date'] = ['max','min']
aggs['month_lag'] = ['max','min','mean','var']
aggs['month_diff'] = ['mean']
aggs['category_1'] = ['sum', 'mean']
aggs['card_id'] = ['size']

for col in ['category_2','category_3']:
    df_new_merchant[col+'_mean'] = df_new_merchant.groupby([col])['purchase_amount'].transform('mean')
    aggs[col+'_mean'] = ['mean']
    
new_columns = get_new_columns('new_merchant',aggs)
df_hist_trans_group = df_new_merchant.groupby('card_id').agg(aggs)
df_hist_trans_group.columns = new_columns
print (new_columns)
df_hist_trans_group.reset_index(drop=False,inplace=True)
df_hist_trans_group['new_merchant_purchase_date_diff'] = (df_hist_trans_group['new_merchant_purchase_date_max'] - df_hist_trans_group['new_merchant_purchase_date_min']).dt.days
df_hist_trans_group['new_merchant_purchase_date_average'] = df_hist_trans_group['new_merchant_purchase_date_diff']/df_hist_trans_group['new_merchant_card_id_size']
df_hist_trans_group['new_merchant_purchase_date_uptonow'] = (datetime.datetime.today() - df_hist_trans_group['new_merchant_purchase_date_max']).dt.days
df_train = df_train.merge(df_hist_trans_group,on='card_id',how='left')
df_test = df_test.merge(df_hist_trans_group,on='card_id',how='left')
del df_hist_trans_group;gc.collect()

['new_merchant_subsector_id_nunique', 'new_merchant_merchant_id_nunique', 'new_merchant_merchant_category_id_nunique', 'new_merchant_purchase_amount_sum', 'new_merchant_purchase_amount_max', 'new_merchant_purchase_amount_min', 'new_merchant_purchase_amount_mean', 'new_merchant_purchase_amount_var', 'new_merchant_installments_sum', 'new_merchant_installments_max', 'new_merchant_installments_min', 'new_merchant_installments_mean', 'new_merchant_installments_var', 'new_merchant_purchase_date_max', 'new_merchant_purchase_date_min', 'new_merchant_month_lag_max', 'new_merchant_month_lag_min', 'new_merchant_month_lag_mean', 'new_merchant_month_lag_var', 'new_merchant_month_diff_mean', 'new_merchant_category_1_sum', 'new_merchant_category_1_mean', 'new_merchant_card_id_size', 'new_merchant_category_2_mean_mean', 'new_merchant_category_3_mean_mean']


28

In [11]:
# Libérer de l'espace avec le garbage collector
del df_histo;gc.collect()
del df_new_merchant;gc.collect()
df_train.head(5)

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3,target,hist_subsector_id_nunique,hist_merchant_id_nunique,hist_merchant_category_id_nunique,hist_purchase_amount_sum,...,new_merchant_month_lag_var,new_merchant_month_diff_mean,new_merchant_category_1_sum,new_merchant_category_1_mean,new_merchant_card_id_size,new_merchant_category_2_mean_mean,new_merchant_category_3_mean_mean,new_merchant_purchase_date_diff,new_merchant_purchase_date_average,new_merchant_purchase_date_uptonow
0,2017-06,C_ID_92a2005557,5,2,1,-0.820283,21,94,41,-165.968739,...,0.26087,12.695652,0.0,0.0,23.0,-0.55016,-0.592993,54.0,2.347826,325.0
1,2017-01,C_ID_3d0044924f,4,1,0,0.392913,24,142,57,-210.006336,...,0.3,13.833333,0.0,0.0,6.0,-0.55016,-0.606486,56.0,9.333333,355.0
2,2016-08,C_ID_d639edf6cd,2,2,0,0.688056,7,13,8,-29.167391,...,,12.0,0.0,0.0,1.0,-0.549015,-0.592993,0.0,0.0,326.0
3,2017-09,C_ID_186d6a6901,4,3,0,0.142495,13,50,25,-49.491364,...,0.238095,13.0,1.0,0.142857,7.0,-0.556518,-0.604559,41.0,5.857143,336.0
4,2017-11,C_ID_cdbd2c0db2,1,3,0,-0.159749,17,66,26,-48.687656,...,0.253968,12.833333,2.0,0.055556,36.0,-0.555446,-0.588217,57.0,1.583333,326.0


In [12]:
# Calcule de la nouvelle colonne 'outliers' pour le train
df_train['outliers'] = 0
df_train.loc[df_train['target'] < -30, 'outliers'] = 1
df_train['outliers'].value_counts()

0    199710
1      2207
Name: outliers, dtype: int64

In [13]:
# Transformer 'first_active_month' en datetime, et calcule du temps passé entre aujourd'hui et la date du 1er achat
# Calcule du nombre de fois qu'une 'card_id' a été utilisé dans le historical et le new merchant
for df in [df_train,df_test]:
    df['first_active_month'] = pd.to_datetime(df['first_active_month'])
    df['elapsed_time'] = (datetime.datetime.today() - df['first_active_month']).dt.days
    
    df['card_id_total'] = df['new_merchant_card_id_size']+df['hist_card_id_size']
    df['purchase_amount_total'] = df['new_merchant_purchase_amount_sum']+df['hist_purchase_amount_sum']

# Calcule des nouveaux features qui correspondent au mean par outliers par rapport aux features
# Ces transformations sont pour le train et le test
for f in ['feature_1','feature_2','feature_3']:
    order_label = df_train.groupby([f])['outliers'].mean()
    print(f,"order_label",order_label)
    df_train[f] = df_train[f].map(order_label)
    df_test[f] = df_test[f].map(order_label)

feature_1 order_label feature_1
1    0.008058
2    0.010610
3    0.010479
4    0.010712
5    0.013145
Name: outliers, dtype: float64
feature_2 order_label feature_2
1    0.011385
2    0.008752
3    0.014166
Name: outliers, dtype: float64
feature_3 order_label feature_3
0    0.010283
1    0.011428
Name: outliers, dtype: float64


In [14]:
print(df_train.shape)
df_train.head(5)

(201917, 68)


Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3,target,hist_subsector_id_nunique,hist_merchant_id_nunique,hist_merchant_category_id_nunique,hist_purchase_amount_sum,...,new_merchant_card_id_size,new_merchant_category_2_mean_mean,new_merchant_category_3_mean_mean,new_merchant_purchase_date_diff,new_merchant_purchase_date_average,new_merchant_purchase_date_uptonow,outliers,elapsed_time,card_id_total,purchase_amount_total
0,2017-06-01,C_ID_92a2005557,0.013145,0.008752,0.011428,-0.820283,21,94,41,-165.968739,...,23.0,-0.55016,-0.592993,54.0,2.347826,325.0,0,657,283.0,-179.212942
1,2017-01-01,C_ID_3d0044924f,0.010712,0.011385,0.010283,0.392913,24,142,57,-210.006336,...,6.0,-0.55016,-0.606486,56.0,9.333333,355.0,0,808,356.0,-214.362071
2,2016-08-01,C_ID_d639edf6cd,0.01061,0.008752,0.010283,0.688056,7,13,8,-29.167391,...,1.0,-0.549015,-0.592993,0.0,0.0,326.0,0,961,44.0,-29.867717
3,2017-09-01,C_ID_186d6a6901,0.010712,0.014166,0.010283,0.142495,13,50,25,-49.491364,...,7.0,-0.556518,-0.604559,41.0,5.857143,336.0,0,565,84.0,-54.145736
4,2017-11-01,C_ID_cdbd2c0db2,0.008058,0.014166,0.010283,-0.159749,17,66,26,-48.687656,...,36.0,-0.555446,-0.588217,57.0,1.583333,326.0,0,504,169.0,-68.613893


In [15]:
# Retirer la colonnes target du train
df_train_columns = [c for c in df_train.columns if c not in ['card_id', 'first_active_month','target','outliers']]
print(df_train_columns)
target = df_train['target']
del df_train['target']

['feature_1', 'feature_2', 'feature_3', 'hist_subsector_id_nunique', 'hist_merchant_id_nunique', 'hist_merchant_category_id_nunique', 'hist_purchase_amount_sum', 'hist_purchase_amount_max', 'hist_purchase_amount_min', 'hist_purchase_amount_mean', 'hist_purchase_amount_var', 'hist_installments_sum', 'hist_installments_max', 'hist_installments_min', 'hist_installments_mean', 'hist_installments_var', 'hist_purchase_date_max', 'hist_purchase_date_min', 'hist_month_lag_max', 'hist_month_lag_min', 'hist_month_lag_mean', 'hist_month_lag_var', 'hist_month_diff_mean', 'hist_authorized_flag_sum', 'hist_authorized_flag_mean', 'hist_category_1_sum', 'hist_category_1_mean', 'hist_card_id_size', 'hist_category_2_mean_mean', 'hist_category_3_mean_mean', 'hist_purchase_date_diff', 'hist_purchase_date_average', 'hist_purchase_date_uptonow', 'new_merchant_subsector_id_nunique', 'new_merchant_merchant_id_nunique', 'new_merchant_merchant_category_id_nunique', 'new_merchant_purchase_amount_sum', 'new_merch

In [16]:
print(df_test.shape)
df_test.head(5)

(123623, 66)


Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3,hist_subsector_id_nunique,hist_merchant_id_nunique,hist_merchant_category_id_nunique,hist_purchase_amount_sum,hist_purchase_amount_max,...,new_merchant_category_1_mean,new_merchant_card_id_size,new_merchant_category_2_mean_mean,new_merchant_category_3_mean_mean,new_merchant_purchase_date_diff,new_merchant_purchase_date_average,new_merchant_purchase_date_uptonow,elapsed_time,card_id_total,purchase_amount_total
0,2017-04-01,C_ID_0ab67a22ab,0.010479,0.014166,0.011428,12,24,16,-40.733733,0.235676,...,0.0,3.0,-0.55016,-0.391755,25.0,8.333333,385.0,718.0,71.0,-42.510888
1,2017-01-01,C_ID_130fd0cbdd,0.01061,0.014166,0.010283,12,27,16,-49.136513,0.318817,...,0.222222,9.0,-0.55593,-0.534909,48.0,5.333333,334.0,808.0,87.0,-55.081212
2,2017-08-01,C_ID_b709037bc5,0.013145,0.011385,0.011428,6,9,8,4.52884,2.525866,...,0.5,2.0,-0.549587,-0.284389,11.0,5.5,372.0,596.0,15.0,4.708978
3,2017-12-01,C_ID_d27d835a9f,0.01061,0.011385,0.010283,11,23,18,-13.690715,0.087965,...,0.1,10.0,-0.550046,-0.284389,44.0,4.4,337.0,474.0,36.0,-19.434389
4,2015-12-01,C_ID_2b5e3df5c2,0.013145,0.011385,0.011428,15,47,31,25.139384,15.782255,...,0.0,6.0,-0.556457,-0.496872,39.0,6.5,342.0,1205.0,116.0,37.204381


In [17]:
df_train.columns.difference(df_test.columns)

Index(['outliers'], dtype='object')

In [18]:
# Mettre le 'card_id' comme index pour le train
df_train = df_train.set_index("card_id")
df_train.head(5)

Unnamed: 0_level_0,first_active_month,feature_1,feature_2,feature_3,hist_subsector_id_nunique,hist_merchant_id_nunique,hist_merchant_category_id_nunique,hist_purchase_amount_sum,hist_purchase_amount_max,hist_purchase_amount_min,...,new_merchant_card_id_size,new_merchant_category_2_mean_mean,new_merchant_category_3_mean_mean,new_merchant_purchase_date_diff,new_merchant_purchase_date_average,new_merchant_purchase_date_uptonow,outliers,elapsed_time,card_id_total,purchase_amount_total
card_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C_ID_92a2005557,2017-06-01,0.013145,0.008752,0.011428,21,94,41,-165.968739,2.258395,-0.739395,...,23.0,-0.55016,-0.592993,54.0,2.347826,325.0,0,657,283.0,-179.212942
C_ID_3d0044924f,2017-01-01,0.010712,0.011385,0.010283,24,142,57,-210.006336,4.630299,-0.7424,...,6.0,-0.55016,-0.606486,56.0,9.333333,355.0,0,808,356.0,-214.362071
C_ID_d639edf6cd,2016-08-01,0.01061,0.008752,0.010283,7,13,8,-29.167391,-0.145847,-0.730138,...,1.0,-0.549015,-0.592993,0.0,0.0,326.0,0,961,44.0,-29.867717
C_ID_186d6a6901,2017-09-01,0.010712,0.014166,0.010283,13,50,25,-49.491364,1.445596,-0.740897,...,7.0,-0.556518,-0.604559,41.0,5.857143,336.0,0,565,84.0,-54.145736
C_ID_cdbd2c0db2,2017-11-01,0.008058,0.014166,0.010283,17,66,26,-48.687656,7.193041,-0.746156,...,36.0,-0.555446,-0.588217,57.0,1.583333,326.0,0,504,169.0,-68.613893


In [19]:
# Mettre le 'card_id' comme index pour le test
df_test = df_test.set_index("card_id")
df_test.head(5)

Unnamed: 0_level_0,first_active_month,feature_1,feature_2,feature_3,hist_subsector_id_nunique,hist_merchant_id_nunique,hist_merchant_category_id_nunique,hist_purchase_amount_sum,hist_purchase_amount_max,hist_purchase_amount_min,...,new_merchant_category_1_mean,new_merchant_card_id_size,new_merchant_category_2_mean_mean,new_merchant_category_3_mean_mean,new_merchant_purchase_date_diff,new_merchant_purchase_date_average,new_merchant_purchase_date_uptonow,elapsed_time,card_id_total,purchase_amount_total
card_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C_ID_0ab67a22ab,2017-04-01,0.010479,0.014166,0.011428,12,24,16,-40.733733,0.235676,-0.743902,...,0.0,3.0,-0.55016,-0.391755,25.0,8.333333,385.0,718.0,71.0,-42.510888
C_ID_130fd0cbdd,2017-01-01,0.01061,0.014166,0.010283,12,27,16,-49.136513,0.318817,-0.731881,...,0.222222,9.0,-0.55593,-0.534909,48.0,5.333333,334.0,808.0,87.0,-55.081212
C_ID_b709037bc5,2017-08-01,0.013145,0.011385,0.011428,6,9,8,4.52884,2.525866,-0.536537,...,0.5,2.0,-0.549587,-0.284389,11.0,5.5,372.0,596.0,15.0,4.708978
C_ID_d27d835a9f,2017-12-01,0.01061,0.011385,0.010283,11,23,18,-13.690715,0.087965,-0.731881,...,0.1,10.0,-0.550046,-0.284389,44.0,4.4,337.0,474.0,36.0,-19.434389
C_ID_2b5e3df5c2,2015-12-01,0.013145,0.011385,0.011428,15,47,31,25.139384,15.782255,-0.746758,...,0.0,6.0,-0.556457,-0.496872,39.0,6.5,342.0,1205.0,116.0,37.204381


## Nettoyage des données

In [20]:
# Retirer les colonnes inutiles du train
listOfOuts = ["new_merchant_purchase_date_max","new_merchant_purchase_date_min",
              "new_merchant_merchant_category_id_nunique",
              "new_merchant_merchant_id_nunique","hist_authorized_flag_mean","hist_authorized_flag_sum",
              "hist_purchase_date_max","hist_purchase_date_min","hist_merchant_id_nunique",
              "hist_merchant_category_id_nunique"]

df_train = df_train.drop(listOfOuts, axis=1)
df_train = df_train.drop("outliers", axis=1)
df_train.head(5)

Unnamed: 0_level_0,first_active_month,feature_1,feature_2,feature_3,hist_subsector_id_nunique,hist_purchase_amount_sum,hist_purchase_amount_max,hist_purchase_amount_min,hist_purchase_amount_mean,hist_purchase_amount_var,...,new_merchant_category_1_mean,new_merchant_card_id_size,new_merchant_category_2_mean_mean,new_merchant_category_3_mean_mean,new_merchant_purchase_date_diff,new_merchant_purchase_date_average,new_merchant_purchase_date_uptonow,elapsed_time,card_id_total,purchase_amount_total
card_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C_ID_92a2005557,2017-06-01,0.013145,0.008752,0.011428,21,-165.968739,2.258395,-0.739395,-0.638341,0.045003,...,0.0,23.0,-0.55016,-0.592993,54.0,2.347826,325.0,657,283.0,-179.212942
C_ID_3d0044924f,2017-01-01,0.010712,0.011385,0.010283,24,-210.006336,4.630299,-0.7424,-0.600018,0.1482,...,0.0,6.0,-0.55016,-0.606486,56.0,9.333333,355.0,808,356.0,-214.362071
C_ID_d639edf6cd,2016-08-01,0.01061,0.008752,0.010283,7,-29.167391,-0.145847,-0.730138,-0.678311,0.007635,...,0.0,1.0,-0.549015,-0.592993,0.0,0.0,326.0,961,44.0,-29.867717
C_ID_186d6a6901,2017-09-01,0.010712,0.014166,0.010283,13,-49.491364,1.445596,-0.740897,-0.642745,0.068447,...,0.142857,7.0,-0.556518,-0.604559,41.0,5.857143,336.0,565,84.0,-54.145736
C_ID_cdbd2c0db2,2017-11-01,0.008058,0.014166,0.010283,17,-48.687656,7.193041,-0.746156,-0.366073,1.82816,...,0.055556,36.0,-0.555446,-0.588217,57.0,1.583333,326.0,504,169.0,-68.613893


In [21]:
df_train.shape

(201917, 55)

In [22]:
# Affichage entier des colonnes
pd.set_option('display.max_columns', 100)
df_train.head(1)

Unnamed: 0_level_0,first_active_month,feature_1,feature_2,feature_3,hist_subsector_id_nunique,hist_purchase_amount_sum,hist_purchase_amount_max,hist_purchase_amount_min,hist_purchase_amount_mean,hist_purchase_amount_var,hist_installments_sum,hist_installments_max,hist_installments_min,hist_installments_mean,hist_installments_var,hist_month_lag_max,hist_month_lag_min,hist_month_lag_mean,hist_month_lag_var,hist_month_diff_mean,hist_category_1_sum,hist_category_1_mean,hist_card_id_size,hist_category_2_mean_mean,hist_category_3_mean_mean,hist_purchase_date_diff,hist_purchase_date_average,hist_purchase_date_uptonow,new_merchant_subsector_id_nunique,new_merchant_purchase_amount_sum,new_merchant_purchase_amount_max,new_merchant_purchase_amount_min,new_merchant_purchase_amount_mean,new_merchant_purchase_amount_var,new_merchant_installments_sum,new_merchant_installments_max,new_merchant_installments_min,new_merchant_installments_mean,new_merchant_installments_var,new_merchant_month_lag_max,new_merchant_month_lag_min,new_merchant_month_lag_mean,new_merchant_month_lag_var,new_merchant_month_diff_mean,new_merchant_category_1_sum,new_merchant_category_1_mean,new_merchant_card_id_size,new_merchant_category_2_mean_mean,new_merchant_category_3_mean_mean,new_merchant_purchase_date_diff,new_merchant_purchase_date_average,new_merchant_purchase_date_uptonow,elapsed_time,card_id_total,purchase_amount_total
card_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1
C_ID_92a2005557,2017-06-01,0.013145,0.008752,0.011428,21,-165.968739,2.258395,-0.739395,-0.638341,0.045003,4,1,0,0.015385,0.015206,0,-8,-3.911538,5.748901,12.826923,0,0.0,260,0.072502,0.346719,242,0.930769,388,10.0,-13.244202,-0.296112,-0.724368,-0.575835,0.018445,0.0,0.0,0.0,0.0,0.0,2.0,1.0,1.478261,0.26087,12.695652,0.0,0.0,23.0,-0.55016,-0.592993,54.0,2.347826,325.0,657,283.0,-179.212942


In [23]:
df_train.columns

Index(['first_active_month', 'feature_1', 'feature_2', 'feature_3',
       'hist_subsector_id_nunique', 'hist_purchase_amount_sum',
       'hist_purchase_amount_max', 'hist_purchase_amount_min',
       'hist_purchase_amount_mean', 'hist_purchase_amount_var',
       'hist_installments_sum', 'hist_installments_max',
       'hist_installments_min', 'hist_installments_mean',
       'hist_installments_var', 'hist_month_lag_max', 'hist_month_lag_min',
       'hist_month_lag_mean', 'hist_month_lag_var', 'hist_month_diff_mean',
       'hist_category_1_sum', 'hist_category_1_mean', 'hist_card_id_size',
       'hist_category_2_mean_mean', 'hist_category_3_mean_mean',
       'hist_purchase_date_diff', 'hist_purchase_date_average',
       'hist_purchase_date_uptonow', 'new_merchant_subsector_id_nunique',
       'new_merchant_purchase_amount_sum', 'new_merchant_purchase_amount_max',
       'new_merchant_purchase_amount_min', 'new_merchant_purchase_amount_mean',
       'new_merchant_purchase_amount_

In [24]:
# Retrouver l'index d'une colonne
#df_train.columns.get_loc("new_merchant_purchase_date_max")

In [25]:
# Retirer les colonnes inutiles du test
listOfOuts = ["new_merchant_purchase_date_max","new_merchant_purchase_date_min",
              "new_merchant_merchant_category_id_nunique",
              "new_merchant_merchant_id_nunique","hist_authorized_flag_mean","hist_authorized_flag_sum",
              "hist_purchase_date_max","hist_purchase_date_min","hist_merchant_id_nunique",
              "hist_merchant_category_id_nunique"]



df_test = df_test.drop(listOfOuts, axis=1)
df_test.head(3)

Unnamed: 0_level_0,first_active_month,feature_1,feature_2,feature_3,hist_subsector_id_nunique,hist_purchase_amount_sum,hist_purchase_amount_max,hist_purchase_amount_min,hist_purchase_amount_mean,hist_purchase_amount_var,hist_installments_sum,hist_installments_max,hist_installments_min,hist_installments_mean,hist_installments_var,hist_month_lag_max,hist_month_lag_min,hist_month_lag_mean,hist_month_lag_var,hist_month_diff_mean,hist_category_1_sum,hist_category_1_mean,hist_card_id_size,hist_category_2_mean_mean,hist_category_3_mean_mean,hist_purchase_date_diff,hist_purchase_date_average,hist_purchase_date_uptonow,new_merchant_subsector_id_nunique,new_merchant_purchase_amount_sum,new_merchant_purchase_amount_max,new_merchant_purchase_amount_min,new_merchant_purchase_amount_mean,new_merchant_purchase_amount_var,new_merchant_installments_sum,new_merchant_installments_max,new_merchant_installments_min,new_merchant_installments_mean,new_merchant_installments_var,new_merchant_month_lag_max,new_merchant_month_lag_min,new_merchant_month_lag_mean,new_merchant_month_lag_var,new_merchant_month_diff_mean,new_merchant_category_1_sum,new_merchant_category_1_mean,new_merchant_card_id_size,new_merchant_category_2_mean_mean,new_merchant_category_3_mean_mean,new_merchant_purchase_date_diff,new_merchant_purchase_date_average,new_merchant_purchase_date_uptonow,elapsed_time,card_id_total,purchase_amount_total
card_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1
C_ID_0ab67a22ab,2017-04-01,0.010479,0.014166,0.011428,12,-40.733733,0.235676,-0.743902,-0.599025,0.036967,141,12,1,2.073529,4.248244,0,-8,-3.632353,6.026997,14.838235,23,0.338235,68,0.074568,-0.239369,268,3.941176,446,3.0,-1.777156,-0.383266,-0.722114,-0.592385,0.033432,5.0,3.0,1.0,1.666667,1.333333,2.0,2.0,2.0,0.0,14.666667,0.0,0.0,3.0,-0.55016,-0.391755,25.0,8.333333,385.0,718.0,71.0,-42.510888
C_ID_130fd0cbdd,2017-01-01,0.01061,0.014166,0.010283,12,-49.136513,0.318817,-0.731881,-0.629955,0.024025,83,4,1,1.064103,0.164669,0,-13,-10.410256,4.686647,12.987179,2,0.025641,78,-0.148613,-0.391464,401,5.141026,395,6.0,-5.944698,-0.506484,-0.740897,-0.660522,0.005062,11.0,3.0,1.0,1.222222,0.444444,2.0,1.0,1.444444,0.277778,13.0,2.0,0.222222,9.0,-0.55593,-0.534909,48.0,5.333333,334.0,808.0,87.0,-55.081212
C_ID_b709037bc5,2017-08-01,0.013145,0.011385,0.011428,6,4.52884,2.525866,-0.536537,0.348372,0.821827,44,10,-1,3.384615,13.589744,0,-6,-2.076923,3.076923,13.0,1,0.076923,13,-0.076914,0.125002,161,12.384615,411,2.0,0.180138,0.904506,-0.724368,0.090069,1.326615,11.0,10.0,1.0,5.5,40.5,1.0,1.0,1.0,0.0,13.0,1.0,0.5,2.0,-0.549587,-0.284389,11.0,5.5,372.0,596.0,15.0,4.708978


In [26]:
df_test.shape

(123623, 55)

In [27]:
# Vérification des valeurs manquantes pour le train
df_train.isnull().values.any()

True

In [28]:
null_columns=df_train.columns[df_train.isnull().any()]
df_train[null_columns].isnull().sum()

new_merchant_subsector_id_nunique     21931
new_merchant_purchase_amount_sum      21931
new_merchant_purchase_amount_max      21931
new_merchant_purchase_amount_min      21931
new_merchant_purchase_amount_mean     21931
new_merchant_purchase_amount_var      48718
new_merchant_installments_sum         21931
new_merchant_installments_max         21931
new_merchant_installments_min         21931
new_merchant_installments_mean        21931
new_merchant_installments_var         48718
new_merchant_month_lag_max            21931
new_merchant_month_lag_min            21931
new_merchant_month_lag_mean           21931
new_merchant_month_lag_var            48718
new_merchant_month_diff_mean          21931
new_merchant_category_1_sum           21931
new_merchant_category_1_mean          21931
new_merchant_card_id_size             21931
new_merchant_category_2_mean_mean     21931
new_merchant_category_3_mean_mean     21931
new_merchant_purchase_date_diff       21931
new_merchant_purchase_date_avera

In [29]:
# Remplacer les valeur manquantes par 0
df_train = df_train.fillna(0)
df_train.isnull().values.any()

False

In [30]:
df_train.shape

(201917, 55)

In [31]:
# Verification des valeurs manquantes pour le test
df_test.isnull().values.any()

True

In [32]:
null_columns=df_test.columns[df_test.isnull().any()]
df_test[null_columns].isnull().sum()

first_active_month                        1
new_merchant_subsector_id_nunique     13608
new_merchant_purchase_amount_sum      13608
new_merchant_purchase_amount_max      13608
new_merchant_purchase_amount_min      13608
new_merchant_purchase_amount_mean     13608
new_merchant_purchase_amount_var      29831
new_merchant_installments_sum         13608
new_merchant_installments_max         13608
new_merchant_installments_min         13608
new_merchant_installments_mean        13608
new_merchant_installments_var         29831
new_merchant_month_lag_max            13608
new_merchant_month_lag_min            13608
new_merchant_month_lag_mean           13608
new_merchant_month_lag_var            29831
new_merchant_month_diff_mean          13608
new_merchant_category_1_sum           13608
new_merchant_category_1_mean          13608
new_merchant_card_id_size             13608
new_merchant_category_2_mean_mean     13608
new_merchant_category_3_mean_mean     13608
new_merchant_purchase_date_diff 

In [33]:
# Ajouter une date pour pallier le manque
df_test["first_active_month"] = df_test["first_active_month"].fillna("2018-02-01")

In [34]:
null_columns=df_test.columns[df_test.isnull().any()]
df_test[null_columns].isnull().sum()

new_merchant_subsector_id_nunique     13608
new_merchant_purchase_amount_sum      13608
new_merchant_purchase_amount_max      13608
new_merchant_purchase_amount_min      13608
new_merchant_purchase_amount_mean     13608
new_merchant_purchase_amount_var      29831
new_merchant_installments_sum         13608
new_merchant_installments_max         13608
new_merchant_installments_min         13608
new_merchant_installments_mean        13608
new_merchant_installments_var         29831
new_merchant_month_lag_max            13608
new_merchant_month_lag_min            13608
new_merchant_month_lag_mean           13608
new_merchant_month_lag_var            29831
new_merchant_month_diff_mean          13608
new_merchant_category_1_sum           13608
new_merchant_category_1_mean          13608
new_merchant_card_id_size             13608
new_merchant_category_2_mean_mean     13608
new_merchant_category_3_mean_mean     13608
new_merchant_purchase_date_diff       13608
new_merchant_purchase_date_avera

In [35]:
# Remplacer les valeur manquantes par 0
df_test = df_test.fillna(0)
df_test.isnull().values.any()

False

In [36]:
df_test.shape

(123623, 55)

In [37]:
# Transformer le 'first_active_month' en string pour l'utiliser avec notre model
df_train["first_active_month"] = df_train["first_active_month"].astype(str)

In [39]:
df_train.isnull().values.any()

False

In [40]:
df_train.dtypes

first_active_month                     object
feature_1                             float64
feature_2                             float64
feature_3                             float64
hist_subsector_id_nunique               int64
hist_purchase_amount_sum              float64
hist_purchase_amount_max              float64
hist_purchase_amount_min              float64
hist_purchase_amount_mean             float64
hist_purchase_amount_var              float64
hist_installments_sum                   int64
hist_installments_max                   int64
hist_installments_min                   int64
hist_installments_mean                float64
hist_installments_var                 float64
hist_month_lag_max                      int64
hist_month_lag_min                      int64
hist_month_lag_mean                   float64
hist_month_lag_var                    float64
hist_month_diff_mean                  float64
hist_category_1_sum                     int64
hist_category_1_mean              

# Notre modèle : CatBoostRegressor

In [41]:
data = df_train
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=789)

In [42]:
# Initialiser le Pool
train_pool = Pool(X_train, 
                  y_train, 
                  cat_features=[0,4,28])
test_pool = Pool(X_test, 
                 cat_features= [0,4,28])

In [43]:
# Spécifier les parametres du trainning
model = CatBoostRegressor(iterations=20, 
                          depth=7, 
                          learning_rate=0.3, 
                          loss_function='RMSE')

In [44]:
model.fit(train_pool)

0:	learn: 3.8356766	total: 250ms	remaining: 4.76s
1:	learn: 3.7894551	total: 355ms	remaining: 3.19s
2:	learn: 3.7615212	total: 449ms	remaining: 2.54s
3:	learn: 3.7440886	total: 559ms	remaining: 2.23s
4:	learn: 3.7319246	total: 664ms	remaining: 1.99s
5:	learn: 3.7241104	total: 760ms	remaining: 1.77s
6:	learn: 3.7204115	total: 870ms	remaining: 1.62s
7:	learn: 3.7144419	total: 971ms	remaining: 1.46s
8:	learn: 3.7105946	total: 1.08s	remaining: 1.32s
9:	learn: 3.7077311	total: 1.21s	remaining: 1.21s
10:	learn: 3.7013104	total: 1.33s	remaining: 1.09s
11:	learn: 3.6988460	total: 1.43s	remaining: 953ms
12:	learn: 3.6971689	total: 1.55s	remaining: 835ms
13:	learn: 3.6950839	total: 1.65s	remaining: 706ms
14:	learn: 3.6924113	total: 1.75s	remaining: 585ms
15:	learn: 3.6913901	total: 1.88s	remaining: 470ms
16:	learn: 3.6852347	total: 1.99s	remaining: 351ms
17:	learn: 3.6821008	total: 2.11s	remaining: 234ms
18:	learn: 3.6805280	total: 2.23s	remaining: 118ms
19:	learn: 3.6758118	total: 2.36s	remaini

<catboost.core.CatBoostRegressor at 0x254e56dc400>

In [45]:
preds = model.predict(test_pool)
print(preds)

[-0.46460011  0.35042705  0.33927911 ... -0.6433819   0.01117008
 -0.20801942]


In [46]:
print("score train :", model.score(X_train, y_train))
print("score test :", model.score(X_test, y_test))

score train : 3.675811855696966
score test : 3.5588789730252652


In [47]:
model_pred_train = model.predict(X_train)
model_pred_test = model.predict(X_test)

print("mse train:", mean_squared_error(model_pred_train, y_train))
print("mse test:", mean_squared_error(model_pred_test, y_test))

mse train: 13.511592798482374
mse test: 12.665619544641368


# Prédictions pour le  Test

In [48]:
df_test["first_active_month"] = df_test["first_active_month"].astype(str)

In [49]:
predictions = model.predict(df_test)

In [50]:
df_test = df_test.reset_index()

In [51]:
df_test.head(3)

Unnamed: 0,card_id,first_active_month,feature_1,feature_2,feature_3,hist_subsector_id_nunique,hist_purchase_amount_sum,hist_purchase_amount_max,hist_purchase_amount_min,hist_purchase_amount_mean,hist_purchase_amount_var,hist_installments_sum,hist_installments_max,hist_installments_min,hist_installments_mean,hist_installments_var,hist_month_lag_max,hist_month_lag_min,hist_month_lag_mean,hist_month_lag_var,hist_month_diff_mean,hist_category_1_sum,hist_category_1_mean,hist_card_id_size,hist_category_2_mean_mean,hist_category_3_mean_mean,hist_purchase_date_diff,hist_purchase_date_average,hist_purchase_date_uptonow,new_merchant_subsector_id_nunique,new_merchant_purchase_amount_sum,new_merchant_purchase_amount_max,new_merchant_purchase_amount_min,new_merchant_purchase_amount_mean,new_merchant_purchase_amount_var,new_merchant_installments_sum,new_merchant_installments_max,new_merchant_installments_min,new_merchant_installments_mean,new_merchant_installments_var,new_merchant_month_lag_max,new_merchant_month_lag_min,new_merchant_month_lag_mean,new_merchant_month_lag_var,new_merchant_month_diff_mean,new_merchant_category_1_sum,new_merchant_category_1_mean,new_merchant_card_id_size,new_merchant_category_2_mean_mean,new_merchant_category_3_mean_mean,new_merchant_purchase_date_diff,new_merchant_purchase_date_average,new_merchant_purchase_date_uptonow,elapsed_time,card_id_total,purchase_amount_total
0,C_ID_0ab67a22ab,2017-04-01 00:00:00,0.010479,0.014166,0.011428,12,-40.733733,0.235676,-0.743902,-0.599025,0.036967,141,12,1,2.073529,4.248244,0,-8,-3.632353,6.026997,14.838235,23,0.338235,68,0.074568,-0.239369,268,3.941176,446,3.0,-1.777156,-0.383266,-0.722114,-0.592385,0.033432,5.0,3.0,1.0,1.666667,1.333333,2.0,2.0,2.0,0.0,14.666667,0.0,0.0,3.0,-0.55016,-0.391755,25.0,8.333333,385.0,718.0,71.0,-42.510888
1,C_ID_130fd0cbdd,2017-01-01 00:00:00,0.01061,0.014166,0.010283,12,-49.136513,0.318817,-0.731881,-0.629955,0.024025,83,4,1,1.064103,0.164669,0,-13,-10.410256,4.686647,12.987179,2,0.025641,78,-0.148613,-0.391464,401,5.141026,395,6.0,-5.944698,-0.506484,-0.740897,-0.660522,0.005062,11.0,3.0,1.0,1.222222,0.444444,2.0,1.0,1.444444,0.277778,13.0,2.0,0.222222,9.0,-0.55593,-0.534909,48.0,5.333333,334.0,808.0,87.0,-55.081212
2,C_ID_b709037bc5,2017-08-01 00:00:00,0.013145,0.011385,0.011428,6,4.52884,2.525866,-0.536537,0.348372,0.821827,44,10,-1,3.384615,13.589744,0,-6,-2.076923,3.076923,13.0,1,0.076923,13,-0.076914,0.125002,161,12.384615,411,2.0,0.180138,0.904506,-0.724368,0.090069,1.326615,11.0,10.0,1.0,5.5,40.5,1.0,1.0,1.0,0.0,13.0,1.0,0.5,2.0,-0.549587,-0.284389,11.0,5.5,372.0,596.0,15.0,4.708978


In [52]:
sub_df = pd.DataFrame({"card_id":df_test["card_id"].values})
sub_df["target"] = predictions

sub_df.to_csv("submission_6.csv", index=False)

In [53]:
sub_df.head(5)

Unnamed: 0,card_id,target
0,C_ID_0ab67a22ab,-3.125384
1,C_ID_130fd0cbdd,-0.414614
2,C_ID_b709037bc5,-0.526047
3,C_ID_d27d835a9f,-0.484135
4,C_ID_2b5e3df5c2,-1.217794


In [54]:
# la difference avec la submission 4 (learning_rate=1),est le 5(learning_rate=0.3)