## Compare Preprocessing Results with [Kernel](https://www.kaggle.com/sudalairajkumar/when-less-is-more) with RAM-Limited Data

In [1]:
import os
if os.name=='nt':
    try:
        mingw_path = 'C:\\Program Files\\mingw-w64\\x86_64-8.1.0-posix-seh-rt_v6-rev0\\mingw64\\bin'
        os.environ['PATH'] = mingw_path + ';' + os.environ['PATH']
    except:
        pass
    
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import tqdm
import gc
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder

tqdm.tqdm.pandas()

%matplotlib inline

In [2]:
cat_cols = ['ncodpers',
 'canal_entrada',
 'conyuemp',
 'ind_actividad_cliente',
 'ind_empleado',
 'ind_nuevo',
 'indext',
 'indfall',
 'indrel',
 'indrel_1mes',
 'indresi',
 'pais_residencia',
 'segmento',
 'sexo',
 'tipodom',
 'tiprel_1mes',
 'age',
 'antiguedad',
 'renta']

target_cols = ['ind_cco_fin_ult1',
 'ind_cder_fin_ult1',
 'ind_cno_fin_ult1',
 'ind_ctju_fin_ult1',
 'ind_ctma_fin_ult1',
 'ind_ctop_fin_ult1',
 'ind_ctpp_fin_ult1',
 'ind_deco_fin_ult1',
 'ind_dela_fin_ult1',
 'ind_deme_fin_ult1',
 'ind_ecue_fin_ult1',
 'ind_fond_fin_ult1',
 'ind_hip_fin_ult1',
 'ind_nom_pens_ult1',
 'ind_nomina_ult1',
 'ind_plan_fin_ult1',
 'ind_pres_fin_ult1',
 'ind_reca_fin_ult1',
 'ind_recibo_ult1',
 'ind_tjcr_fin_ult1',
 'ind_valo_fin_ult1',
 'ind_viv_fin_ult1']

## 2015-05-28 and 2015-06-28

In [3]:
train1 = pd.read_hdf('../input/data_month_2015-05-28.hdf', 'data_month')
train2 = pd.read_hdf('../input/data_month_2015-06-28.hdf', 'data_month')

# products in 2015-06-28
target = train2.loc[:, ['ncodpers']+target_cols].copy()
target.set_index('ncodpers', inplace=True, drop=False)
# a dataframe containing the ncodpers only
target_ncodpers = pd.DataFrame(target.ncodpers)
# drop ncodpers from target
target.drop('ncodpers', axis=1, inplace=True)

# products in 2015-05-28
prev_target = train1.loc[:, ['ncodpers']+target_cols].copy()
prev_target.set_index('ncodpers', inplace=True, drop=True)
# join to target_ncodpers, so that prev_target includes all customers in 2015-05-28
prev_target = target_ncodpers.join(prev_target, how='left')
prev_target.fillna(0.0, inplace=True)
prev_target.drop('ncodpers', axis=1, inplace=True)

# new products in 2015-06-28
target = target.subtract(prev_target)
target[target<0] = 0

# train set of 2015-06-28 includes customer features and products in 2015-05-28
x_vars = train2[cat_cols].copy()
x_vars.reset_index(inplace=True, drop=True)
x_vars.reset_index(inplace=True, drop=False)
x_vars_cols = x_vars.columns.tolist()
x_vars_cols[0] = 'sample_order'
x_vars.columns = x_vars_cols
x_vars.set_index('ncodpers', drop=True, inplace=True)
x_vars = x_vars.join(prev_target)

# get samples for each new product

# join target to x_vars
x_vars_new = x_vars.join(target, rsuffix='_t')

# set ncodpers as one column
x_vars_new.reset_index(inplace=True)
x_vars.reset_index(inplace=True)

# melt
x_vars_new = x_vars_new.melt(id_vars=x_vars.columns)
# mapping from target_cols to index
target_cols_mapping = {c+'_t': n for (n, c) in enumerate(target_cols)}
# replace column name by index
x_vars_new.variable.replace(target_cols_mapping, inplace=True)
# reorder rows
x_vars_new.sort_values(['sample_order', 'variable'], inplace=True)
# keep new products
x_vars_new = x_vars_new[x_vars_new.value>0]
# drop value
x_vars_new.drop(['sample_order', 'value'], inplace=True, axis=1)
# keep the order of samples as in the original data set
x_vars_new.reset_index(drop=True, inplace=True)

# variables
x_vars = x_vars_new.iloc[:, :-1].copy()
# target
target = x_vars_new.iloc[:, [0, -1]].copy()

## 2016-05-28 and 2016-06-28

In [4]:
train1 = pd.read_hdf('../input/data_month_2016-05-28.hdf', 'data_month')
train2 = pd.read_hdf('../input/data_month_2016-06-28.hdf', 'data_month')

# customer id in 2016-06-28
target_ncodpers = train2.loc[:, ['ncodpers']].copy()
target_ncodpers.set_index('ncodpers', inplace=True, drop=False)

# products in 2016-05-28
prev_target = train1.loc[:, ['ncodpers']+target_cols].copy()
prev_target.set_index('ncodpers', inplace=True, drop=True)

# join prev_target to target_ncodpers
prev_target = target_ncodpers.join(prev_target, how='left')
prev_target.fillna(0.0, inplace=True)
prev_target.drop('ncodpers', axis=1, inplace=True)

# train set of 2016-06-28 includes customer features and products in 2016-05-28
x_vars_test = train2[cat_cols].copy()
x_vars_test.set_index('ncodpers', drop=False, inplace=True)
x_vars_test = x_vars_test.join(prev_target)

In [5]:
train1 = pd.read_hdf('../input/data_month_2016-05-28.hdf', 'data_month')
train2 = pd.read_hdf('../input/data_month_2016-06-28.hdf', 'data_month')

In [6]:
target_ncodpers = train2.loc[:, ['ncodpers']].copy()
target_ncodpers.set_index('ncodpers', inplace=True, drop=False)

In [7]:
# products in 2016-05-28
prev_target = train1.loc[:, ['ncodpers']+target_cols].copy()
prev_target.set_index('ncodpers', inplace=True, drop=True)

# join prev_target to target_ncodpers
prev_target = target_ncodpers.join(prev_target, how='left')
prev_target.fillna(0.0, inplace=True)
prev_target.drop('ncodpers', axis=1, inplace=True)

In [8]:
# train set of 2016-06-28 includes customer features and products in 2016-05-28
x_vars_test = train2[cat_cols].copy()
x_vars_test.set_index('ncodpers', drop=False, inplace=True)
x_vars_test = x_vars_test.join(prev_target)

In [9]:
x_vars_test.head()

Unnamed: 0_level_0,ncodpers,canal_entrada,conyuemp,ind_actividad_cliente,ind_empleado,ind_nuevo,indext,indfall,indrel,indrel_1mes,...,ind_hip_fin_ult1,ind_nom_pens_ult1,ind_nomina_ult1,ind_plan_fin_ult1,ind_pres_fin_ult1,ind_reca_fin_ult1,ind_recibo_ult1,ind_tjcr_fin_ult1,ind_valo_fin_ult1,ind_viv_fin_ult1
ncodpers,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
15889,15889,5.0,2,1.0,3,0.0,2,2,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
1170544,1170544,5.0,0,0.0,1,0.0,2,2,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1170545,1170545,1.0,0,1.0,1,0.0,2,2,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1170547,1170547,1.0,0,0.0,1,0.0,2,2,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1170548,1170548,1.0,0,0.0,1,0.0,2,2,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Train model

In [10]:
x_vars.head()

Unnamed: 0,ncodpers,canal_entrada,conyuemp,ind_actividad_cliente,ind_empleado,ind_nuevo,indext,indfall,indrel,indrel_1mes,...,ind_hip_fin_ult1,ind_nom_pens_ult1,ind_nomina_ult1,ind_plan_fin_ult1,ind_pres_fin_ult1,ind_reca_fin_ult1,ind_recibo_ult1,ind_tjcr_fin_ult1,ind_valo_fin_ult1,ind_viv_fin_ult1
0,1063111,1.0,0,1.0,1,0.0,2,2,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1063111,1.0,0,1.0,1,0.0,2,2,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1063273,4.0,0,1.0,1,0.0,2,2,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,1063273,4.0,0,1.0,1,0.0,2,2,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,1063250,4.0,0,1.0,1,0.0,2,2,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [11]:
param = {'objective': 'multi:softprob', 
         'eta': 0.05, 
         'max_depth': 8, 
         'silent': 1, 
         'num_class': len(target_cols),
         'eval_metric': 'mlogloss',
         'min_child_weight': 1,
         'subsample': 0.7,
         'colsample_bytree': 0.7,
         'seed': 0}
num_rounds = 50

dtrain = xgb.DMatrix(x_vars.values[:, 1:], target.values[:, 1])
model = xgb.train(param, dtrain, num_rounds, evals=[(dtrain, 'train')], verbose_eval=True)

[0]	train-mlogloss:2.94041
[1]	train-mlogloss:2.79691
[2]	train-mlogloss:2.6867
[3]	train-mlogloss:2.59454
[4]	train-mlogloss:2.50696
[5]	train-mlogloss:2.42464
[6]	train-mlogloss:2.35786
[7]	train-mlogloss:2.29334
[8]	train-mlogloss:2.24309
[9]	train-mlogloss:2.18779
[10]	train-mlogloss:2.13427
[11]	train-mlogloss:2.08658
[12]	train-mlogloss:2.04404
[13]	train-mlogloss:2.00543
[14]	train-mlogloss:1.96945
[15]	train-mlogloss:1.93285
[16]	train-mlogloss:1.89895
[17]	train-mlogloss:1.87128
[18]	train-mlogloss:1.84025
[19]	train-mlogloss:1.81159
[20]	train-mlogloss:1.78408
[21]	train-mlogloss:1.75943
[22]	train-mlogloss:1.73534
[23]	train-mlogloss:1.71198
[24]	train-mlogloss:1.69173
[25]	train-mlogloss:1.67087
[26]	train-mlogloss:1.65222
[27]	train-mlogloss:1.63484
[28]	train-mlogloss:1.61706
[29]	train-mlogloss:1.59944
[30]	train-mlogloss:1.58254
[31]	train-mlogloss:1.56774
[32]	train-mlogloss:1.55323
[33]	train-mlogloss:1.53804
[34]	train-mlogloss:1.52404
[35]	train-mlogloss:1.51142
[36

Prediction from my model

In [12]:
preds = model.predict(xgb.DMatrix(x_vars_test.values[:, 1:]))
preds = np.argsort(preds, axis=1)
preds = np.fliplr(preds)[:, :7]

Write out prediction results from my model

In [13]:
test_id = x_vars_test.loc[:, 'ncodpers'].values
final_preds = [' '.join([target_cols[k] for k in pred]) for pred in preds]

out_df = pd.DataFrame({'ncodpers': test_id, 'added_products': final_preds})
out_df.to_csv('eda_4_11.csv.gz', compression='gzip', index=False)

In [14]:
train_X = np.load('train_X.npy')

In [15]:
x_vars.shape

(45679, 41)

In [16]:
train_X.shape

(45679, 41)

In [19]:
u = x_vars.values

In [20]:
a = u-train_X

In [21]:
np.abs(a).max()

5.000000000000143e-05

In [22]:
train_y = np.load('train_y.npy')
u = target.values

In [23]:
a = train_y-u

ValueError: operands could not be broadcast together with shapes (45679,) (45679,2) 

In [None]:
np.abs(a).max()

In [None]:
test_X = np.load('test_X.npy')
u = x_vars_test.fillna(0.0).values[:, 1:]

In [None]:
a = test_X-u

In [None]:
np.abs(a).max()

In [None]:
x_vars_test.isnull().sum()