## Compare Preprocessing Results with [Kernel](https://www.kaggle.com/sudalairajkumar/when-less-is-more)

In [1]:
import os
# On Windows only: put the MinGW-w64 runtime directory at the front of PATH.
# NOTE(review): presumably needed so a MinGW-built xgboost can find its runtime
# DLLs when imported below — confirm against the local install.
if os.name == 'nt':
    mingw_path = 'C:\\Program Files\\mingw-w64\\x86_64-8.1.0-posix-seh-rt_v6-rev0\\mingw64\\bin'
    # Handle the one realistic failure (PATH not set) explicitly instead of
    # hiding every possible error behind a bare `except: pass`.
    existing_path = os.environ.get('PATH')
    if existing_path:
        os.environ['PATH'] = mingw_path + ';' + existing_path
    else:
        os.environ['PATH'] = mingw_path
    
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import tqdm
import gc
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder

# Register tqdm with pandas so the progress_apply / progress_map variants are available.
tqdm.tqdm.pandas()

%matplotlib inline

In [2]:
# Load the preprocessed training table stored under key 'train_ver3'.
train = pd.read_hdf('../input/train_ver3.hdf', key='train_ver3')

In [3]:
# Load the preprocessed test table stored under key 'test_ver3'.
test = pd.read_hdf('../input/test_ver3.hdf', key='test_ver3')

In [4]:
# Feature columns taken from each monthly snapshot. 'ncodpers' comes first
# because it doubles as the join key between snapshots; the order of this list
# fixes the column order of the feature matrices built later.
cat_cols = [
    'ncodpers',
    'canal_entrada', 'conyuemp', 'ind_actividad_cliente', 'ind_empleado',
    'ind_nuevo', 'indext', 'indfall', 'indrel', 'indrel_1mes', 'indresi',
    'pais_residencia', 'segmento', 'sexo', 'tipodom', 'tiprel_1mes',
    'age', 'antiguedad', 'renta',
]

# Product indicator columns. The position of a name in this list is the class
# index used as the model label, so the order must not change.
target_cols = [
    'ind_cco_fin_ult1', 'ind_cder_fin_ult1', 'ind_cno_fin_ult1',
    'ind_ctju_fin_ult1', 'ind_ctma_fin_ult1', 'ind_ctop_fin_ult1',
    'ind_ctpp_fin_ult1', 'ind_deco_fin_ult1', 'ind_dela_fin_ult1',
    'ind_deme_fin_ult1', 'ind_ecue_fin_ult1', 'ind_fond_fin_ult1',
    'ind_hip_fin_ult1', 'ind_nom_pens_ult1', 'ind_nomina_ult1',
    'ind_plan_fin_ult1', 'ind_pres_fin_ult1', 'ind_reca_fin_ult1',
    'ind_recibo_ult1', 'ind_tjcr_fin_ult1', 'ind_valo_fin_ult1',
    'ind_viv_fin_ult1',
]

## 2015-05-28 and 2015-06-28

In [5]:
# Build the training set from two consecutive snapshots: one row per
# (customer, newly added product) in 2015-06-28, with the customer features of
# 2015-06-28 and the products held in 2015-05-28 as inputs.
train1 = train.loc[train.fecha_dato=='2015-05-28']
train2 = train.loc[train.fecha_dato=='2015-06-28']

# products in 2015-06-28, indexed by ncodpers
target = train2.loc[:, ['ncodpers']+target_cols].copy()
target.set_index('ncodpers', inplace=True, drop=False)
# a dataframe containing the ncodpers only (as both index and single column)
target_ncodpers = pd.DataFrame(target.ncodpers)
# drop ncodpers from target
target.drop('ncodpers', axis=1, inplace=True)

# products in 2015-05-28
prev_target = train1.loc[:, ['ncodpers']+target_cols].copy()
prev_target.set_index('ncodpers', inplace=True, drop=True)
# left-join onto target_ncodpers so that prev_target has one row for every
# customer of 2015-06-28; customers with no 2015-05-28 row get 0 for all products
prev_target = target_ncodpers.join(prev_target, how='left')
prev_target.fillna(0.0, inplace=True)
prev_target.drop('ncodpers', axis=1, inplace=True)

# new products in 2015-06-28: held now but not in the previous month
# (products that were dropped give -1 and are clipped to 0)
target = target.subtract(prev_target)
target[target<0] = 0

# train set of 2015-06-28 includes customer features and products in 2015-05-28
x_vars = train2[cat_cols].copy()
# two resets: the first makes the index 0..n-1, the second turns that index
# into a column which records the original row order of the samples
x_vars.reset_index(inplace=True, drop=True)
x_vars.reset_index(inplace=True, drop=False)
x_vars_cols = x_vars.columns.tolist()
x_vars_cols[0] = 'sample_order'
x_vars.columns = x_vars_cols
x_vars.set_index('ncodpers', drop=True, inplace=True)
x_vars = x_vars.join(prev_target)

# get samples for each new product

# join target to x_vars; the previous-month columns keep their names and the
# new-product columns receive the '_t' suffix
x_vars_new = x_vars.join(target, rsuffix='_t')

# set ncodpers as one column
x_vars_new.reset_index(inplace=True)
x_vars.reset_index(inplace=True)

# melt: every '<product>_t' column becomes a (variable, value) row
x_vars_new = x_vars_new.melt(id_vars=x_vars.columns)
# mapping from suffixed target column name to class index
target_cols_mapping = {c+'_t': n for (n, c) in enumerate(target_cols)}
# replace column name by index. Assign the result back to the frame: an
# in-place replace on the Series returned by attribute access is a chained
# modification that does not reach the frame under pandas Copy-on-Write.
x_vars_new['variable'] = x_vars_new['variable'].map(target_cols_mapping)
# reorder rows
x_vars_new.sort_values(['sample_order', 'variable'], inplace=True)
# keep new products
x_vars_new = x_vars_new[x_vars_new.value>0]
# drop the helper columns (not in place: x_vars_new is a filtered selection)
x_vars_new = x_vars_new.drop(['sample_order', 'value'], axis=1)
# renumber rows; samples stay in the order of the original data set
x_vars_new = x_vars_new.reset_index(drop=True)

# variables: everything except the trailing class-index column
x_vars = x_vars_new.iloc[:, :-1].copy()
# target: ncodpers (first column) and class index (last column)
target = x_vars_new.iloc[:, [0, -1]].copy()

## 2016-05-28 and 2016-06-28

In [6]:
# Build the test features: customer features of 2016-06-28 (from the test
# table) plus the products each customer held in 2016-05-28 (from the train table).
train1 = train.loc[train.fecha_dato=='2016-05-28']
train2 = test.loc[test.fecha_dato=='2016-06-28']

# products in 2016-05-28, indexed by ncodpers
prev_target = train1.loc[:, ['ncodpers']+target_cols].copy()
prev_target.set_index('ncodpers', inplace=True, drop=True)

# test set of 2016-06-28 includes customer features and products in 2016-05-28
x_vars_test = train2[cat_cols].copy()
x_vars_test.set_index('ncodpers', drop=False, inplace=True)
# Join directly on the test customers. Going through target_ncodpers (the
# 2015-06-28 customer list built for the training set) would leave every test
# customer outside that list with NaN instead of their 2016-05-28 products.
x_vars_test = x_vars_test.join(prev_target)
# customers with no 2016-05-28 row held no products: fill with 0, as in training
x_vars_test[target_cols] = x_vars_test[target_cols].fillna(0.0)

## Train model

In [58]:
# Booster configuration: one softmax class per entry of target_cols.
param = dict(
    objective='multi:softprob',
    eta=0.05,
    max_depth=8,
    silent=0,
    num_class=len(target_cols),
    eval_metric='mlogloss',
    min_child_weight=1,
    subsample=0.7,
    colsample_bytree=0.7,
    seed=0,
)
num_rounds = 50

# Column 1 of `target` is the class index; column 0 is ncodpers.
train_labels = target.values[:, 1]
dtrain = xgb.DMatrix(x_vars.values, label=train_labels)
# Report the training-set metric after every boosting round.
watchlist = [(dtrain, 'train')]
model = xgb.train(param, dtrain, num_boost_round=num_rounds, evals=watchlist, verbose_eval=True)

[0]	train-mlogloss:2.91645
[1]	train-mlogloss:2.78563
[2]	train-mlogloss:2.6751
[3]	train-mlogloss:2.58366
[4]	train-mlogloss:2.49453
[5]	train-mlogloss:2.41242
[6]	train-mlogloss:2.34612
[7]	train-mlogloss:2.28669
[8]	train-mlogloss:2.22937
[9]	train-mlogloss:2.17384
[10]	train-mlogloss:2.12307
[11]	train-mlogloss:2.08031
[12]	train-mlogloss:2.04
[13]	train-mlogloss:1.99893
[14]	train-mlogloss:1.95945
[15]	train-mlogloss:1.92307
[16]	train-mlogloss:1.88998
[17]	train-mlogloss:1.85772
[18]	train-mlogloss:1.82859
[19]	train-mlogloss:1.80279
[20]	train-mlogloss:1.77791
[21]	train-mlogloss:1.75428
[22]	train-mlogloss:1.72977
[23]	train-mlogloss:1.70668
[24]	train-mlogloss:1.68579
[25]	train-mlogloss:1.66468
[26]	train-mlogloss:1.6446
[27]	train-mlogloss:1.62652
[28]	train-mlogloss:1.61053
[29]	train-mlogloss:1.59396
[30]	train-mlogloss:1.57689
[31]	train-mlogloss:1.56132
[32]	train-mlogloss:1.54642
[33]	train-mlogloss:1.5324
[34]	train-mlogloss:1.5179
[35]	train-mlogloss:1.5043
[36]	train

Prediction from my model

In [59]:
# Score the test rows, then keep the 7 highest-scoring class indices per row,
# best first (ascending argsort, reversed along the class axis).
class_scores = model.predict(xgb.DMatrix(x_vars_test.values))
preds = np.argsort(class_scores, axis=1)[:, ::-1][:, :7]

Prediction from kernel model

In [60]:
# Load the array saved from the kernel model (presumably its raw per-class
# predictions — verify) and reduce it to the top-7 class indices per row.
kernel_scores = np.load('preds_raw.npy')
preds_r = np.argsort(kernel_scores, axis=1)[:, ::-1][:, :7]

Difference between the two models

Note that the difference is not zero, even though the same random seed was used for training both models.

In [61]:
# Element-wise difference of the ranked class indices; an entry is 0 exactly
# where both models put the same class at the same rank.
a = pd.DataFrame(preds - preds_r)

Write out prediction results from my model

In [45]:
# Customer ids, in the same row order as the predictions.
test_id = x_vars_test['ncodpers'].values

# Turn each row of class indices into a space-separated list of product names.
final_preds = []
for pred in preds:
    final_preds.append(' '.join(target_cols[k] for k in pred))

# Write the submission as a gzip-compressed CSV without the row index.
out_df = pd.DataFrame({'ncodpers': test_id, 'added_products': final_preds})
out_df.to_csv('eda_4_6.csv.gz', compression='gzip', index=False)