In [1]:
import os
# On Windows, make sure the MinGW-w64 runtime directory is the first PATH entry.
# NOTE(review): presumably needed so a compiled dependency (e.g. xgboost) can
# locate the MinGW DLLs — confirm against the local setup.
if os.name=='nt':
    mingw_path = 'C:\\Program Files\\mingw-w64\\x86_64-8.1.0-posix-seh-rt_v6-rev0\\mingw64\\bin'
    # Read PATH with a default instead of wrapping the update in a bare
    # `except: pass`: an unset PATH was the only realistic failure here, and the
    # old handler would also have hidden KeyboardInterrupt / SystemExit.
    current_path = os.environ.get('PATH', '')
    already_first = (
        current_path == mingw_path
        or current_path.startswith(mingw_path + os.pathsep)
    )
    if not already_first:
        # Skipping the prepend when the directory already leads PATH keeps
        # repeated runs of this cell from growing PATH indefinitely.
        os.environ['PATH'] = (
            (mingw_path + os.pathsep + current_path) if current_path else mingw_path
        )
    
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import tqdm
import gc
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder

# Register tqdm's progress-bar integration with pandas.
tqdm.tqdm.pandas()

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
# Load the training table stored under the 'train_ver3' key of the HDF file.
train = pd.read_hdf('../input/train_ver3.hdf', key='train_ver3')

In [3]:
# Load the test table stored under the 'test_ver3' key of the HDF file.
test = pd.read_hdf('../input/test_ver3.hdf', key='test_ver3')

In [4]:
# Customer-level columns used as model inputs. 'ncodpers' comes first because
# later cells use it as the join key between monthly snapshots.
# NOTE(review): despite the name, the last three entries (age, antiguedad,
# renta) show fractional values in the outputs below, so they look like scaled
# numeric features rather than categorical codes — confirm.
cat_cols = [
    'ncodpers',
    'canal_entrada', 'conyuemp', 'ind_actividad_cliente', 'ind_empleado',
    'ind_nuevo', 'indext', 'indfall', 'indrel', 'indrel_1mes', 'indresi',
    'pais_residencia', 'segmento', 'sexo', 'tipodom', 'tiprel_1mes',
    'age', 'antiguedad', 'renta',
]

# Product-indicator columns, one per product. Order matters: a later cell maps
# each column to its position in this list to obtain the class index.
target_cols = [
    'ind_cco_fin_ult1', 'ind_cder_fin_ult1', 'ind_cno_fin_ult1',
    'ind_ctju_fin_ult1', 'ind_ctma_fin_ult1', 'ind_ctop_fin_ult1',
    'ind_ctpp_fin_ult1', 'ind_deco_fin_ult1', 'ind_dela_fin_ult1',
    'ind_deme_fin_ult1', 'ind_ecue_fin_ult1', 'ind_fond_fin_ult1',
    'ind_hip_fin_ult1', 'ind_nom_pens_ult1', 'ind_nomina_ult1',
    'ind_plan_fin_ult1', 'ind_pres_fin_ult1', 'ind_reca_fin_ult1',
    'ind_recibo_ult1', 'ind_tjcr_fin_ult1', 'ind_valo_fin_ult1',
    'ind_viv_fin_ult1',
]

In [5]:
# Narrow both tables to the modelling columns: customer features followed by
# the product flags for train, customer features only for test.
df_train = train.loc[:, [*cat_cols, *target_cols]]
df_test = test.loc[:, list(cat_cols)]

In [7]:
# Show the selected column order: the cat_cols entries followed by the target_cols entries.
df_train.columns

Index(['ncodpers', 'canal_entrada', 'conyuemp', 'ind_actividad_cliente',
       'ind_empleado', 'ind_nuevo', 'indext', 'indfall', 'indrel',
       'indrel_1mes', 'indresi', 'pais_residencia', 'segmento', 'sexo',
       'tipodom', 'tiprel_1mes', 'age', 'antiguedad', 'renta',
       'ind_cco_fin_ult1', 'ind_cder_fin_ult1', 'ind_cno_fin_ult1',
       'ind_ctju_fin_ult1', 'ind_ctma_fin_ult1', 'ind_ctop_fin_ult1',
       'ind_ctpp_fin_ult1', 'ind_deco_fin_ult1', 'ind_dela_fin_ult1',
       'ind_deme_fin_ult1', 'ind_ecue_fin_ult1', 'ind_fond_fin_ult1',
       'ind_hip_fin_ult1', 'ind_nom_pens_ult1', 'ind_nomina_ult1',
       'ind_plan_fin_ult1', 'ind_pres_fin_ult1', 'ind_reca_fin_ult1',
       'ind_recibo_ult1', 'ind_tjcr_fin_ult1', 'ind_valo_fin_ult1',
       'ind_viv_fin_ult1'],
      dtype='object')

## Build training samples from the 2015-05-28 and 2015-06-28 snapshots

In [67]:
# Monthly snapshots: train1 holds the 2015-05-28 rows, train2 the 2015-06-28 rows.
train1, train2 = (
    train.loc[train['fecha_dato'] == snapshot_date]
    for snapshot_date in ('2015-05-28', '2015-06-28')
)

# Products held in 2015-06-28, indexed by customer id. The id is kept as a
# column just long enough to build the one-column frame used for the join below.
target = train2.loc[:, ['ncodpers'] + target_cols].set_index('ncodpers', drop=False)
target_ncodpers = target[['ncodpers']]
target = target.drop('ncodpers', axis=1)

# Products held in 2015-05-28, re-aligned to the June customers. The left join
# keeps every June customer; those with no May row get 0.0 for every product.
prev_target = train1.loc[:, ['ncodpers'] + target_cols].set_index('ncodpers', drop=True)
prev_target = (
    target_ncodpers
    .join(prev_target, how='left')
    .fillna(0.0)
    .drop('ncodpers', axis=1)
)

# Newly added products in 2015-06-28: June minus May, with negative
# differences (products that were dropped) replaced by 0.
target = target - prev_target
target = target.mask(target < 0, 0)

# Feature table for the June customers: their June profile columns plus the
# product flags they held in May, indexed by customer id.
x_vars = train2[cat_cols].set_index('ncodpers', drop=True).join(prev_target)

# Attach the new-product indicators. They share column names with the May
# flags already in x_vars, so the indicator columns receive the '_t' suffix.
# ncodpers is then restored as the first column of both frames.
x_vars_new = x_vars.join(target, rsuffix='_t').reset_index()
x_vars = x_vars.reset_index()

# Melt to one row per (customer, product): every feature column is an id
# column, so only the '_t' indicator columns are unpivoted into
# 'variable' (column name) and 'value' (indicator).
x_vars_new = x_vars_new.melt(id_vars=list(x_vars.columns))

# Map each '<product>_t' column name to the product's position in target_cols.
target_cols_mapping = {c+'_t': n for (n, c) in enumerate(target_cols)}

# Assign the mapped column back explicitly. The previous
# `x_vars_new.variable.replace(..., inplace=True)` mutated a Series obtained by
# attribute access, which does not update the parent frame when pandas
# Copy-on-Write is active and would leave the string names in place.
x_vars_new['variable'] = x_vars_new['variable'].map(target_cols_mapping)

# Keep only the rows that represent a newly added product, then discard the
# indicator itself. Done without `inplace` so no filtered view is mutated.
x_vars_new = x_vars_new.loc[x_vars_new['value'] > 0].drop('value', axis=1)

# Model inputs: every column except the class index.
x_vars = x_vars_new.drop('variable', axis=1)
# Labels: customer id together with the class index of the added product.
target = x_vars_new.loc[:, ['ncodpers', 'variable']].copy()

### Compare with kernel results

In [68]:
# Load the saved arrays to compare against.
# NOTE(review): presumably these were produced by the reference kernel named
# in the heading above — confirm their provenance.
train_X, train_y, test_X = (
    np.load(array_file)
    for array_file in ('train_X.npy', 'train_y.npy', 'test_X.npy')
)

In [69]:
# Label the loaded array with x_vars' index and column names, then order both
# frames by customer id and renumber their rows so they can be compared
# position by position.
train_X = (
    pd.DataFrame(train_X, index=x_vars.index, columns=x_vars.columns)
    .sort_values('ncodpers')
    .reset_index(drop=True)
)

x_vars = x_vars.sort_values('ncodpers').reset_index(drop=True)

In [70]:
# Element-wise difference between the loaded features and the ones built here;
# a zero entry means the two frames agree at that position.
a = train_X - x_vars

In [86]:
# Column order of x_vars; train_X was labelled with these same column names.
x_vars.columns

Index(['ncodpers', 'canal_entrada', 'conyuemp', 'ind_actividad_cliente',
       'ind_empleado', 'ind_nuevo', 'indext', 'indfall', 'indrel',
       'indrel_1mes', 'indresi', 'pais_residencia', 'segmento', 'sexo',
       'tipodom', 'tiprel_1mes', 'age', 'antiguedad', 'renta',
       'ind_cco_fin_ult1', 'ind_cder_fin_ult1', 'ind_cno_fin_ult1',
       'ind_ctju_fin_ult1', 'ind_ctma_fin_ult1', 'ind_ctop_fin_ult1',
       'ind_ctpp_fin_ult1', 'ind_deco_fin_ult1', 'ind_dela_fin_ult1',
       'ind_deme_fin_ult1', 'ind_ecue_fin_ult1', 'ind_fond_fin_ult1',
       'ind_hip_fin_ult1', 'ind_nom_pens_ult1', 'ind_nomina_ult1',
       'ind_plan_fin_ult1', 'ind_pres_fin_ult1', 'ind_reca_fin_ult1',
       'ind_recibo_ult1', 'ind_tjcr_fin_ult1', 'ind_valo_fin_ult1',
       'ind_viv_fin_ult1'],
      dtype='object')

In [93]:
# Per-column sum of the differences; a non-zero total flags a column where the
# two frames disagree.
# NOTE(review): this is a signed sum, so opposite-signed differences can cancel
# out — a zero here does not by itself prove the column matches everywhere.
a.sum()

ncodpers                     0.000000
canal_entrada               72.000000
conyuemp                     0.000000
ind_actividad_cliente       16.000000
ind_empleado                 0.000000
ind_nuevo                   16.000000
indext                       0.000000
indfall                      0.000000
indrel                  -51184.000000
indrel_1mes                  0.000000
indresi                      0.000000
pais_residencia              8.000000
segmento                    16.000000
sexo                        16.000000
tipodom                      0.000000
tiprel_1mes                  0.000000
age                          0.012686
antiguedad                  -0.121875
renta                       -0.000007
ind_cco_fin_ult1             0.000000
ind_cder_fin_ult1            0.000000
ind_cno_fin_ult1             0.000000
ind_ctju_fin_ult1            0.000000
ind_ctma_fin_ult1            0.000000
ind_ctop_fin_ult1            0.000000
ind_ctpp_fin_ult1            0.000000
ind_deco_fin

In [97]:
# Values in the loaded array for row position 3278, one of the positions where
# the sexo difference is non-zero (see the `a.loc[a.sexo!=0]` output).
train_X.iloc[3278]

ncodpers                 98561.0000
canal_entrada                6.0000
conyuemp                     0.0000
ind_actividad_cliente        2.0000
ind_empleado                 0.0000
ind_nuevo                    2.0000
indext                       0.0000
indfall                      0.0000
indrel                       2.0000
indrel_1mes                  0.0000
indresi                      0.0000
pais_residencia              1.0000
segmento                     2.0000
sexo                         2.0000
tipodom                      0.0000
tiprel_1mes                  0.0000
age                          0.2857
antiguedad                   0.0000
renta                        0.0679
ind_cco_fin_ult1             0.0000
ind_cder_fin_ult1            0.0000
ind_cno_fin_ult1             1.0000
ind_ctju_fin_ult1            0.0000
ind_ctma_fin_ult1            0.0000
ind_ctop_fin_ult1            0.0000
ind_ctpp_fin_ult1            0.0000
ind_deco_fin_ult1            0.0000
ind_dela_fin_ult1           

In [98]:
# The same row position in the features built in this notebook, for a
# side-by-side comparison with train_X.iloc[3278].
x_vars.iloc[3278]

ncodpers                 98561.000000
canal_entrada                0.000000
conyuemp                     0.000000
ind_actividad_cliente        0.000000
ind_empleado                 0.000000
ind_nuevo                    0.000000
indext                       0.000000
indfall                      0.000000
indrel                       0.000000
indrel_1mes                  0.000000
indresi                      0.000000
pais_residencia              0.000000
segmento                     0.000000
sexo                         0.000000
tipodom                      0.000000
tiprel_1mes                  0.000000
age                          0.285714
antiguedad                   0.000000
renta                        0.067900
ind_cco_fin_ult1             0.000000
ind_cder_fin_ult1            0.000000
ind_cno_fin_ult1             1.000000
ind_ctju_fin_ult1            0.000000
ind_ctma_fin_ult1            0.000000
ind_ctop_fin_ult1            0.000000
ind_ctpp_fin_ult1            0.000000
ind_deco_fin

In [94]:
# Rows whose sexo value differs between the two frames, transposed so that
# each mismatching row is shown as a column.
a.loc[a.sexo!=0].T

Unnamed: 0,3278,17302,18433,18434,24542,28798,31228,34845
ncodpers,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
canal_entrada,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0
conyuemp,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ind_actividad_cliente,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
ind_empleado,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ind_nuevo,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
indext,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
indfall,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
indrel,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
indrel_1mes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [96]:
# Load the version-2 training CSV to inspect the source rows behind a mismatch.
# NOTE(review): the truncated warning in this cell's output looks like the tail
# of a pandas DtypeWarning (mixed column types) — confirm, and consider passing
# explicit dtypes. `df2` is a non-descriptive name, but the next cell uses it.
df2 = pd.read_csv('../input/train_ver2.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [99]:
# All rows in the CSV for customer 98561, the ncodpers found at row position
# 3278 of the compared frames.
df2.loc[df2.ncodpers==98561]

Unnamed: 0,fecha_dato,ncodpers,ind_empleado,pais_residencia,sexo,age,fecha_alta,ind_nuevo,antiguedad,indrel,...,ind_hip_fin_ult1,ind_plan_fin_ult1,ind_pres_fin_ult1,ind_reca_fin_ult1,ind_tjcr_fin_ult1,ind_valo_fin_ult1,ind_viv_fin_ult1,ind_nomina_ult1,ind_nom_pens_ult1,ind_recibo_ult1
406225,2015-01-28,98561,,,,,,,,,...,0,0,0,1,0,0,0,,,0
846812,2015-02-28,98561,,,,,,,,,...,0,0,0,1,0,0,0,,,0
1487575,2015-03-28,98561,,,,,,,,,...,0,0,0,1,0,0,0,,,0
2102731,2015-04-28,98561,,,,,,,,,...,0,0,0,1,0,0,0,,,0
2923233,2015-05-28,98561,,,,,,,,,...,0,0,0,1,0,0,0,,,0
3554541,2015-06-28,98561,,,,,,,,,...,0,0,0,1,0,0,0,,,0
