## Compare Preprocessing Results with [Kernel](https://www.kaggle.com/sudalairajkumar/when-less-is-more)

In [1]:
import os
if os.name=='nt':
    try:
        mingw_path = 'C:\\Program Files\\mingw-w64\\x86_64-8.1.0-posix-seh-rt_v6-rev0\\mingw64\\bin'
        os.environ['PATH'] = mingw_path + ';' + os.environ['PATH']
    except:
        pass
    
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import tqdm
import gc
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder

tqdm.tqdm.pandas()

%matplotlib inline

In [2]:
train = pd.read_hdf('../input/train_ver3.hdf', 'train_ver3')

In [3]:
test = pd.read_hdf('../input/test_ver3.hdf', 'test_ver3')

In [4]:
cat_cols = ['ncodpers',
 'canal_entrada',
 'conyuemp',
 'ind_actividad_cliente',
 'ind_empleado',
 'ind_nuevo',
 'indext',
 'indfall',
 'indrel',
 'indrel_1mes',
 'indresi',
 'pais_residencia',
 'segmento',
 'sexo',
 'tipodom',
 'tiprel_1mes',
 'age',
 'antiguedad',
 'renta']

target_cols = ['ind_cco_fin_ult1',
 'ind_cder_fin_ult1',
 'ind_cno_fin_ult1',
 'ind_ctju_fin_ult1',
 'ind_ctma_fin_ult1',
 'ind_ctop_fin_ult1',
 'ind_ctpp_fin_ult1',
 'ind_deco_fin_ult1',
 'ind_dela_fin_ult1',
 'ind_deme_fin_ult1',
 'ind_ecue_fin_ult1',
 'ind_fond_fin_ult1',
 'ind_hip_fin_ult1',
 'ind_nom_pens_ult1',
 'ind_nomina_ult1',
 'ind_plan_fin_ult1',
 'ind_pres_fin_ult1',
 'ind_reca_fin_ult1',
 'ind_recibo_ult1',
 'ind_tjcr_fin_ult1',
 'ind_valo_fin_ult1',
 'ind_viv_fin_ult1']

## 2015-05-28 and 2015-06-28

In [5]:
train1 = train.loc[train.fecha_dato=='2015-05-28']
train2 = train.loc[train.fecha_dato=='2015-06-28']

# products in 2015-06-28
target = train2.loc[:, ['ncodpers']+target_cols].copy()
target.set_index('ncodpers', inplace=True, drop=False)
# a dataframe containing the ncodpers only
target_ncodpers = pd.DataFrame(target.ncodpers)
# drop ncodpers from target
target.drop('ncodpers', axis=1, inplace=True)

# products in 2015-05-28
prev_target = train1.loc[:, ['ncodpers']+target_cols].copy()
prev_target.set_index('ncodpers', inplace=True, drop=True)
# join to target_ncodpers, so that prev_target includes all customers in 2015-05-28
prev_target = target_ncodpers.join(prev_target, how='left')
prev_target.fillna(0.0, inplace=True)
prev_target.drop('ncodpers', axis=1, inplace=True)

# new products in 2015-06-28
target = target.subtract(prev_target)
target[target<0] = 0

# train set of 2015-06-28 includes customer features and products in 2015-05-28
x_vars = train2[cat_cols].copy()
x_vars.reset_index(inplace=True, drop=True)
x_vars.reset_index(inplace=True, drop=False)
x_vars_cols = x_vars.columns.tolist()
x_vars_cols[0] = 'sample_order'
x_vars.columns = x_vars_cols
x_vars.set_index('ncodpers', drop=True, inplace=True)
x_vars = x_vars.join(prev_target)

# get samples for each new product

# join target to x_vars
x_vars_new = x_vars.join(target, rsuffix='_t')

# set ncodpers as one column
x_vars_new.reset_index(inplace=True)
x_vars.reset_index(inplace=True)

# melt
x_vars_new = x_vars_new.melt(id_vars=x_vars.columns)
# mapping from target_cols to index
target_cols_mapping = {c+'_t': n for (n, c) in enumerate(target_cols)}
# replace column name by index
x_vars_new.variable.replace(target_cols_mapping, inplace=True)
# reorder rows
x_vars_new.sort_values(['sample_order', 'variable'], inplace=True)
# keep new products
x_vars_new = x_vars_new[x_vars_new.value>0]
# drop value
x_vars_new.drop(['sample_order', 'value'], inplace=True, axis=1)
# keep the order of samples as in the original data set
x_vars_new.reset_index(drop=True, inplace=True)

# variables
x_vars = x_vars_new.iloc[:, :-1].copy()
# target
target = x_vars_new.iloc[:, [0, -1]].copy()

## 2016-05-28 and 2016-06-28

In [6]:
train1 = train.loc[train.fecha_dato=='2016-05-28']
train2 = test.loc[test.fecha_dato=='2016-06-28']

# products in 2016-05-28
prev_target = train1.loc[:, ['ncodpers']+target_cols].copy()
prev_target.set_index('ncodpers', inplace=True, drop=True)

prev_target = target_ncodpers.join(prev_target, how='left')
prev_target.fillna(0.0, inplace=True)
prev_target.drop('ncodpers', axis=1, inplace=True)

# train set of 2016-06-28 includes customer features and products in 2016-05-28
x_vars_test = train2[cat_cols].copy()
x_vars_test.set_index('ncodpers', drop=False, inplace=True)
x_vars_test = x_vars_test.join(prev_target)

### Compare with kernel results

In [7]:
train_X = np.load('train_X.npy')

train_y = np.load('train_y.npy')

test_X = np.load('test_X.npy')

#### Compare train_X and x_vars

In [8]:
train_X = pd.DataFrame(train_X, index=x_vars.index, columns=x_vars.columns)

train_X.sort_values('ncodpers', inplace=True)
train_X.reset_index(drop=True, inplace=True)

x_vars.sort_values('ncodpers', inplace=True)
x_vars.reset_index(drop=True, inplace=True)

In [9]:
a = train_X.subtract(x_vars)

In [10]:
np.abs(a).max().max()

5.000000000000143e-05

#### Compare train_y and target

In [11]:
train_y = np.concatenate((x_vars_new.iloc[:, 0].values.reshape((train_y.shape[0], 1)), train_y.reshape((train_y.shape[0], 1))), axis=1)

In [12]:
train_y = pd.DataFrame(train_y, index=target.index, columns=target.columns)

train_y.sort_values('ncodpers', inplace=True)
train_y.reset_index(drop=True, inplace=True)
train_y.set_index('ncodpers', inplace=True)

target.sort_values('ncodpers', inplace=True)
target.reset_index(drop=True, inplace=True)
target.set_index('ncodpers', inplace=True)

In [13]:
a = train_y.subtract(target)

In [14]:
np.abs(a).max()

variable    0
dtype: int64

#### Compare test_X and x_vars_test

In [15]:
test_X = pd.DataFrame(test_X, index=x_vars_test.index, columns=x_vars_test.columns)

test_X.sort_values('ncodpers', inplace=True)
test_X.reset_index(drop=True, inplace=True)

x_vars_test.sort_values('ncodpers', inplace=True)
x_vars_test.reset_index(drop=True, inplace=True)

Defaulting to column, but this will raise an ambiguity error in a future version
  This is separate from the ipykernel package so we can avoid doing imports until
Defaulting to column, but this will raise an ambiguity error in a future version
  


In [16]:
a = test_X.subtract(x_vars_test)

In [17]:
np.abs(a).max().max()

5.000000000000143e-05