## Preprocess Output of eda_4_7 and Save

In [1]:
import os
# On Windows, put the MinGW-w64 bin directory at the front of PATH.
# NOTE(review): presumably required so a later import can find its runtime DLLs — confirm.
if os.name == 'nt':
    mingw_path = 'C:\\Program Files\\mingw-w64\\x86_64-8.1.0-posix-seh-rt_v6-rev0\\mingw64\\bin'
    # .get() covers a missing PATH variable, the only failure the former bare
    # `except: pass` could have been hiding.
    current_path = os.environ.get('PATH', '')
    # Skip the update when the directory is already listed, so re-running the
    # cell does not keep growing PATH.
    if mingw_path not in current_path.split(os.pathsep):
        os.environ['PATH'] = os.pathsep.join(p for p in (mingw_path, current_path) if p)
    
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import tqdm
import gc
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder

# Register tqdm's pandas integration (adds the progress_* variants of apply).
tqdm.tqdm.pandas()

%matplotlib inline

In [2]:
# Integer encodings for the categorical columns, keyed by column name.
# Each inner dict maps a raw value to its code; the key -99 holds the code
# assigned to missing values (later cells read it as mapping_dict[col][-99]).
# NOTE(review): 'segmento' gives '01 - TOP' and the missing marker (-99) the
# same code 2, and several tables are keyed on strings ('0', '1', '1.0')
# rather than numbers — confirm both are intended.
mapping_dict = {
'ind_empleado'  : {-99:0, 'N':1, 'B':2, 'F':3, 'A':4, 'S':5},
'sexo'          : {'V':0, 'H':1, -99:2},
'ind_nuevo'     : {'0':0, '1':1, -99:2},
'indrel'        : {'1':0, '99':1, -99:2},
'indrel_1mes'   : {-99:0, '1.0':1, '1':1, '2.0':2, '2':2, '3.0':3, '3':3, '4.0':4, '4':4, 'P':5},
'tiprel_1mes'   : {-99:0, 'I':1, 'A':2, 'P':3, 'R':4, 'N':5},
'indresi'       : {-99:0, 'S':1, 'N':2},
'indext'        : {-99:0, 'S':1, 'N':2},
'conyuemp'      : {-99:0, 'S':1, 'N':2},
'indfall'       : {-99:0, 'S':1, 'N':2},
'tipodom'       : {-99:0, '1':1},
'ind_actividad_cliente' : {'0':0, '1':1, -99:2},
'segmento'      : {'02 - PARTICULARES':0, '03 - UNIVERSITARIO':1, '01 - TOP':2, -99:2},
'pais_residencia' : {'LV': 102, 'BE': 12, 'BG': 50, 'BA': 61, 'BM': 117, 'BO': 62, 'JP': 82, 'JM': 116, 'BR': 17, 'BY': 64, 'BZ': 113, 'RU': 43, 'RS': 89, 'RO': 41, 'GW': 99, 'GT': 44, 'GR': 39, 'GQ': 73, 'GE': 78, 'GB': 9, 'GA': 45, 'GN': 98, 'GM': 110, 'GI': 96, 'GH': 88, 'OM': 100, 'HR': 67, 'HU': 106, 'HK': 34, 'HN': 22, 'AD': 35, 'PR': 40, 'PT': 26, 'PY': 51, 'PA': 60, 'PE': 20, 'PK': 84, 'PH': 91, 'PL': 30, 'EE': 52, 'EG': 74, 'ZA': 75, 'EC': 19, 'AL': 25, 'VN': 90, 'ET': 54, 'ZW': 114, 'ES': 0, 'MD': 68, 'UY': 77, 'MM': 94, 'ML': 104, 'US': 15, 'MT': 118, 'MR': 48, 'UA': 49, 'MX': 16, 'IL': 42, 'FR': 8, 'MA': 38, 'FI': 23, 'NI': 33, 'NL': 7, 'NO': 46, 'NG': 83, 'NZ': 93, 'CI': 57, 'CH': 3, 'CO': 21, 'CN': 28, 'CM': 55, 'CL': 4, 'CA': 2, 'CG': 101, 'CF': 109, 'CD': 112, 'CZ': 36, 'CR': 32, 'CU': 72, 'KE': 65, 'KH': 95, 'SV': 53, 'SK': 69, 'KR': 87, 'KW': 92, 'SN': 47, 'SL': 97, 'KZ': 111, 'SA': 56, 'SG': 66, 'SE': 24, 'DO': 11, 'DJ': 115, 'DK': 76, 'DE': 10, 'DZ': 80, 'MK': 105, -99: 1, 'LB': 81, 'TW': 29, 'TR': 70, 'TN': 85, 'LT': 103, 'LU': 59, 'TH': 79, 'TG': 86, 'LY': 108, 'AE': 37, 'VE': 14, 'IS': 107, 'IT': 18, 'AO': 71, 'AR': 13, 'AU': 63, 'AT': 6, 'IN': 31, 'IE': 5, 'QA': 58, 'MZ': 27},
'canal_entrada' : {'013': 49, 'KHP': 160, 'KHQ': 157, 'KHR': 161, 'KHS': 162, 'KHK': 10, 'KHL': 0, 'KHM': 12, 'KHN': 21, 'KHO': 13, 'KHA': 22, 'KHC': 9, 'KHD': 2, 'KHE': 1, 'KHF': 19, '025': 159, 'KAC': 57, 'KAB': 28, 'KAA': 39, 'KAG': 26, 'KAF': 23, 'KAE': 30, 'KAD': 16, 'KAK': 51, 'KAJ': 41, 'KAI': 35, 'KAH': 31, 'KAO': 94, 'KAN': 110, 'KAM': 107, 'KAL': 74, 'KAS': 70, 'KAR': 32, 'KAQ': 37, 'KAP': 46, 'KAW': 76, 'KAV': 139, 'KAU': 142, 'KAT': 5, 'KAZ': 7, 'KAY': 54, 'KBJ': 133, 'KBH': 90, 'KBN': 122, 'KBO': 64, 'KBL': 88, 'KBM': 135, 'KBB': 131, 'KBF': 102, 'KBG': 17, 'KBD': 109, 'KBE': 119, 'KBZ': 67, 'KBX': 116, 'KBY': 111, 'KBR': 101, 'KBS': 118, 'KBP': 121, 'KBQ': 62, 'KBV': 100, 'KBW': 114, 'KBU': 55, 'KCE': 86, 'KCD': 85, 'KCG': 59, 'KCF': 105, 'KCA': 73, 'KCC': 29, 'KCB': 78, 'KCM': 82, 'KCL': 53, 'KCO': 104, 'KCN': 81, 'KCI': 65, 'KCH': 84, 'KCK': 52, 'KCJ': 156, 'KCU': 115, 'KCT': 112, 'KCV': 106, 'KCQ': 154, 'KCP': 129, 'KCS': 77, 'KCR': 153, 'KCX': 120, 'RED': 8, 'KDL': 158, 'KDM': 130, 'KDN': 151, 'KDO': 60, 'KDH': 14, 'KDI': 150, 'KDD': 113, 'KDE': 47, 'KDF': 127, 'KDG': 126, 'KDA': 63, 'KDB': 117, 'KDC': 75, 'KDX': 69, 'KDY': 61, 'KDZ': 99, 'KDT': 58, 'KDU': 79, 'KDV': 91, 'KDW': 132, 'KDP': 103, 'KDQ': 80, 'KDR': 56, 'KDS': 124, 'K00': 50, 'KEO': 96, 'KEN': 137, 'KEM': 155, 'KEL': 125, 'KEK': 145, 'KEJ': 95, 'KEI': 97, 'KEH': 15, 'KEG': 136, 'KEF': 128, 'KEE': 152, 'KED': 143, 'KEC': 66, 'KEB': 123, 'KEA': 89, 'KEZ': 108, 'KEY': 93, 'KEW': 98, 'KEV': 87, 'KEU': 72, 'KES': 68, 'KEQ': 138, -99: 6, 'KFV': 48, 'KFT': 92, 'KFU': 36, 'KFR': 144, 'KFS': 38, 'KFP': 40, 'KFF': 45, 'KFG': 27, 'KFD': 25, 'KFE': 148, 'KFB': 146, 'KFC': 4, 'KFA': 3, 'KFN': 42, 'KFL': 34, 'KFM': 141, 'KFJ': 33, 'KFK': 20, 'KFH': 140, 'KFI': 134, '007': 71, '004': 83, 'KGU': 149, 'KGW': 147, 'KGV': 43, 'KGY': 44, 'KGX': 24, 'KGC': 18, 'KGN': 11}
}
# Categorical column names in alphabetical order, with the 'ncodpers' column
# placed first.
cat_cols = ['ncodpers'] + sorted(mapping_dict)

# Target column names: the first two entries of the full list
# (ind_ahor_fin_ult1, ind_aval_fin_ult1) are dropped and the remainder is kept
# in alphabetical order.
target_cols = sorted([
    'ind_ahor_fin_ult1', 'ind_aval_fin_ult1',
    'ind_cco_fin_ult1', 'ind_cder_fin_ult1', 'ind_cno_fin_ult1',
    'ind_ctju_fin_ult1', 'ind_ctma_fin_ult1', 'ind_ctop_fin_ult1',
    'ind_ctpp_fin_ult1', 'ind_deco_fin_ult1', 'ind_deme_fin_ult1',
    'ind_dela_fin_ult1', 'ind_ecue_fin_ult1', 'ind_fond_fin_ult1',
    'ind_hip_fin_ult1', 'ind_plan_fin_ult1', 'ind_pres_fin_ult1',
    'ind_reca_fin_ult1', 'ind_tjcr_fin_ult1', 'ind_valo_fin_ult1',
    'ind_viv_fin_ult1', 'ind_nomina_ult1', 'ind_nom_pens_ult1',
    'ind_recibo_ult1',
][2:])

In [3]:
# Load the frame stored under the 'train_test' key of the input HDF5 file.
df2 = pd.read_hdf('../input/train_test.hdf', key='train_test')

canal_entrada

In [4]:
# Encode canal_entrada with its lookup table and assign the result back.
# replace(..., inplace=True) on `df2.canal_entrada` acts on a temporary Series
# and leaves df2 unchanged under pandas Copy-on-Write.
# Missing values stay NaN at this point and appear in the counts below.
df2['canal_entrada'] = df2['canal_entrada'].replace(mapping_dict['canal_entrada'])
df2['canal_entrada'].value_counts(dropna=False)

 1.0      4306935
 5.0      3474042
 4.0      3299057
 157.0     666008
 3.0       435322
 10.0      258002
 12.0      217308
NaN        188207
 21.0      134739
 2.0       124281
 70.0       91805
 8.0        81935
 26.0       78958
 54.0       71791
 39.0       70951
 28.0       66395
 30.0       54057
 29.0       52427
 67.0       49455
 0.0        48076
 25.0       47412
 35.0       40127
 93.0       37419
 76.0       36481
 32.0       34822
 7.0        34325
 23.0       32472
 71.0       31260
 49.0       28798
 65.0       28323
           ...   
 83.0         226
 154.0        210
 151.0        210
 153.0        206
 14.0         204
 104.0        192
 152.0        186
 129.0        168
 50.0         162
 119.0        156
 138.0        156
 112.0        114
 146.0        114
 139.0        113
 116.0        108
 121.0         90
 155.0         72
 48.0          72
 120.0         72
 122.0         66
 22.0          54
 18.0          30
 149.0         30
 162.0         27
 11.0     

In [5]:
# Give missing canal_entrada values the code stored under the -99 key.
# Assigned back because fillna(..., inplace=True) on `df2.canal_entrada` does
# not update df2 under pandas Copy-on-Write.
df2['canal_entrada'] = df2['canal_entrada'].fillna(mapping_dict['canal_entrada'][-99])

pais_residencia

In [6]:
# Work on a copy of the lookup so that NaN can share the code stored under -99
# without adding a NaN key to the shared mapping_dict.
country_codes = dict(mapping_dict['pais_residencia'])
country_codes[np.nan] = country_codes[-99]
# Assign back: replace(..., inplace=True) on `df2.pais_residencia` does not
# update df2 under pandas Copy-on-Write.
df2['pais_residencia'] = df2['pais_residencia'].replace(country_codes)
df2['pais_residencia'].value_counts(dropna=False)

0      14479329
1         27734
8          5472
13         5123
10         4906
9          4891
15         3882
21         3737
18         3126
41         3106
16         2729
17         2492
14         2467
19         2299
3          2119
12         1619
62         1607
51         1517
26         1509
4          1048
20          954
43          814
7           804
72          802
24          640
30          637
28          596
77          540
49          522
6           505
         ...   
101          36
90           36
89           36
54           36
100          24
110          18
109          18
111          18
108          18
107          18
112          18
104          18
113          18
81           18
97           18
102          18
85           18
96           18
95           18
94           18
25           18
92           18
88           18
78           18
86           18
116          13
114          12
115          12
117           7
118           3
Name: pais_residencia, L

segmento

In [7]:
# Frequencies of the raw segmento values, NaN included.
df2['segmento'].value_counts(dropna=False)

02 - PARTICULARES     8505598
03 - UNIVERSITARIO    5281607
01 - TOP               598103
NaN                    191616
Name: segmento, dtype: int64

In [8]:
# Encode segmento; NaN receives code 2, the same code as '01 - TOP'.
# Assigned back because replace(..., inplace=True) on `df2.segmento` does not
# update df2 under pandas Copy-on-Write.
df2['segmento'] = df2['segmento'].replace(
    {'02 - PARTICULARES': 0, '03 - UNIVERSITARIO': 1, '01 - TOP': 2, np.nan: 2}
)
df2['segmento'].value_counts(dropna=False)

0    8505598
1    5281607
2     789719
Name: segmento, dtype: int64

ind_actividad_cliente

In [9]:
# Frequencies of the raw ind_actividad_cliente values, NaN included.
df2['ind_actividad_cliente'].value_counts(dropna=False)

 0.0    7918666
 1.0    6630524
NaN       27734
Name: ind_actividad_cliente, dtype: int64

In [10]:
# 0 and 1 already are their own codes, so only missing values need a code: the
# one stored under the -99 key (2).
# Assigned back because an inplace call on `df2.ind_actividad_cliente` does not
# update df2 under pandas Copy-on-Write.
df2['ind_actividad_cliente'] = df2['ind_actividad_cliente'].fillna(
    mapping_dict['ind_actividad_cliente'][-99]
)
df2['ind_actividad_cliente'].value_counts(dropna=False)

0.0    7918666
1.0    6630524
2.0      27734
Name: ind_actividad_cliente, dtype: int64

tipodom

In [11]:
# Frequencies of the raw tipodom values, NaN included.
df2['tipodom'].value_counts(dropna=False)

 1.0    14549189
NaN        27735
Name: tipodom, dtype: int64

In [12]:
# The value 1 keeps its code, so only missing values need one: the code stored
# under the -99 key (0).
# Assigned back because an inplace call on `df2.tipodom` does not update df2
# under pandas Copy-on-Write.
df2['tipodom'] = df2['tipodom'].fillna(mapping_dict['tipodom'][-99])
df2['tipodom'].value_counts(dropna=False)

1.0    14549189
0.0       27735
Name: tipodom, dtype: int64

indfall

In [13]:
# Frequencies of the raw indfall values, NaN included.
df2['indfall'].value_counts(dropna=False)

N      14512028
S         37162
NaN       27734
Name: indfall, dtype: int64

In [14]:
# Encode indfall (NaN -> 0, 'S' -> 1, 'N' -> 2) and assign the result back.
# replace(..., inplace=True) on `df2.indfall` does not update df2 under pandas
# Copy-on-Write.
df2['indfall'] = df2['indfall'].replace({np.nan: 0, 'S': 1, 'N': 2})
df2['indfall'].value_counts(dropna=False)

2    14512028
1       37162
0       27734
Name: indfall, dtype: int64

conyuemp

In [15]:
# Frequencies of the raw conyuemp values, NaN included.
df2['conyuemp'].value_counts(dropna=False)

NaN    14575012
N          1894
S            18
Name: conyuemp, dtype: int64

In [16]:
# Encode conyuemp (NaN -> 0, 'S' -> 1, 'N' -> 2) and assign the result back.
# replace(..., inplace=True) on `df2.conyuemp` does not update df2 under pandas
# Copy-on-Write.
df2['conyuemp'] = df2['conyuemp'].replace({np.nan: 0, 'S': 1, 'N': 2})
df2['conyuemp'].value_counts(dropna=False)

0    14575012
2        1894
1          18
Name: conyuemp, dtype: int64

indext

In [17]:
# Frequencies of the raw indext values, NaN included.
df2['indext'].value_counts(dropna=False)

N      13858387
S        690803
NaN       27734
Name: indext, dtype: int64

In [18]:
# Encode indext (NaN -> 0, 'S' -> 1, 'N' -> 2) and assign the result back.
# replace(..., inplace=True) on `df2.indext` does not update df2 under pandas
# Copy-on-Write.
df2['indext'] = df2['indext'].replace({np.nan: 0, 'S': 1, 'N': 2})
df2['indext'].value_counts(dropna=False)

2    13858387
1      690803
0       27734
Name: indext, dtype: int64

indresi

In [19]:
# Frequencies of the raw indresi values, NaN included.
df2['indresi'].value_counts(dropna=False)

S      14479331
N         69859
NaN       27734
Name: indresi, dtype: int64

In [20]:
# Encode indresi (NaN -> 0, 'S' -> 1, 'N' -> 2) and assign the result back.
# replace(..., inplace=True) on `df2.indresi` does not update df2 under pandas
# Copy-on-Write.
df2['indresi'] = df2['indresi'].replace({np.nan: 0, 'S': 1, 'N': 2})
df2['indresi'].value_counts(dropna=False)

1    14479331
2       69859
0       27734
Name: indresi, dtype: int64

tiprel_1mes

In [21]:
# Frequencies of the raw tiprel_1mes values, NaN included.
df2['tiprel_1mes'].value_counts(dropna=False)

I      7840818
A      6580745
NaN     149804
P         4683
R          870
N            4
Name: tiprel_1mes, dtype: int64

In [22]:
# Encode tiprel_1mes with its lookup table, then give missing values the code
# stored under the -99 key (0).
# Assigned back because inplace calls on `df2.tiprel_1mes` do not update df2
# under pandas Copy-on-Write.
tiprel_codes = mapping_dict['tiprel_1mes']
df2['tiprel_1mes'] = df2['tiprel_1mes'].replace(tiprel_codes).fillna(tiprel_codes[-99])
df2['tiprel_1mes'].value_counts(dropna=False)

1.0    7840818
2.0    6580745
0.0     149804
3.0       4683
4.0        870
5.0          4
Name: tiprel_1mes, dtype: int64

indrel_1mes

In [23]:
# Map 'P' to 5 and missing values to the code stored under the -99 key (0).
# Assigned back because inplace calls on `df2.indrel_1mes` do not update df2
# under pandas Copy-on-Write.
indrel_1mes_codes = mapping_dict['indrel_1mes']
df2['indrel_1mes'] = (
    df2['indrel_1mes']
    .replace('P', indrel_1mes_codes['P'])
    .fillna(indrel_1mes_codes[-99])
)

In [24]:
# Convert indrel_1mes to numbers; entries that cannot be parsed become NaN.
df2['indrel_1mes'] = pd.to_numeric(df2['indrel_1mes'], errors='coerce')

In [25]:
# Frequencies of indrel_1mes after the numeric conversion, NaN included.
df2['indrel_1mes'].value_counts(dropna=False)

1.0    14420246
0.0      149804
3.0        4377
2.0        1317
5.0         874
4.0         306
Name: indrel_1mes, dtype: int64

indrel

In [26]:
# Frequencies of the raw indrel values, NaN included.
df2['indrel'].value_counts(dropna=False)

 1.0     14522714
NaN         27734
 99.0       26476
Name: indrel, dtype: int64

In [27]:
# Encode indrel (1 -> 0, 99 -> 1, NaN -> 2) and assign the result back.
# replace(..., inplace=True) on `df2.indrel` does not update df2 under pandas
# Copy-on-Write.
df2['indrel'] = df2['indrel'].replace({np.nan: 2, 1: 0, 99: 1})
df2['indrel'].value_counts(dropna=False)

0.0    14522714
2.0       27734
1.0       26476
Name: indrel, dtype: int64

ind_nuevo

In [28]:
# Frequencies of the raw ind_nuevo values, NaN included.
df2['ind_nuevo'].value_counts(dropna=False)

 0.0    13712094
 1.0      837096
NaN        27734
Name: ind_nuevo, dtype: int64

In [29]:
# Give missing ind_nuevo values the code stored under the -99 key (2).
# Assigned back because fillna(..., inplace=True) on `df2.ind_nuevo` does not
# update df2 under pandas Copy-on-Write.
df2['ind_nuevo'] = df2['ind_nuevo'].fillna(mapping_dict['ind_nuevo'][-99])
df2['ind_nuevo'].value_counts(dropna=False)

0.0    13712094
1.0      837096
2.0       27734
Name: ind_nuevo, dtype: int64

sexo

In [30]:
# Frequencies of the raw sexo values, NaN included.
df2['sexo'].value_counts(dropna=False)

V      7928767
H      6620348
NaN      27809
Name: sexo, dtype: int64

In [31]:
# Give missing sexo values the code stored under the -99 key (2), then encode
# 'V' and 'H' with the lookup table.
# Assigned back because inplace calls on `df2.sexo` do not update df2 under
# pandas Copy-on-Write.
sexo_codes = mapping_dict['sexo']
df2['sexo'] = df2['sexo'].fillna(sexo_codes[-99]).replace(sexo_codes)
df2['sexo'].value_counts(dropna=False)

0    7928767
1    6620348
2      27809
Name: sexo, dtype: int64

ind_empleado

In [32]:
# Frequencies of the raw ind_empleado values, NaN included.
df2['ind_empleado'].value_counts(dropna=False)

N      14540073
NaN       27734
B          3784
F          2675
A          2640
S            18
Name: ind_empleado, dtype: int64

In [33]:
# Give missing ind_empleado values the code stored under the -99 key (0), then
# encode the letter categories with the lookup table.
# Assigned back because inplace calls on `df2.ind_empleado` do not update df2
# under pandas Copy-on-Write.
empleado_codes = mapping_dict['ind_empleado']
df2['ind_empleado'] = df2['ind_empleado'].fillna(empleado_codes[-99]).replace(empleado_codes)
df2['ind_empleado'].value_counts(dropna=False)

1    14540073
0       27734
2        3784
3        2675
4        2640
5          18
Name: ind_empleado, dtype: int64

In [34]:
def _clip_and_scale(values, lower, upper, fill_value):
    """Return `values` as floats rescaled linearly so [lower, upper] maps to [0, 1].

    Entries that cannot be parsed as numbers become NaN and, together with
    entries that were already missing, are replaced by `fill_value`. The result
    is clipped to [lower, upper] and then mapped so that `lower` -> 0.0 and
    `upper` -> 1.0.
    """
    numeric = pd.to_numeric(values, errors='coerce').fillna(fill_value)
    return (numeric.clip(lower=lower, upper=upper) - lower) / (upper - lower)


# Each column is assigned back to df2: fillna(..., inplace=True) on
# `df2.<column>` does not update df2 under pandas Copy-on-Write.
# The module-level constants (including the range_* values) are kept under
# their original names in case later cells read them.

# age: missing -> 40, window [20, 90]
mean_age = 40.
min_age = 20.
max_age = 90.
range_age = max_age - min_age
df2['age'] = _clip_and_scale(df2['age'], min_age, max_age, mean_age)

# antiguedad: missing -> 0, window [0, 256]
min_value = 0.
max_value = 256.
range_value = max_value - min_value
missing_value = 0.
df2['antiguedad'] = _clip_and_scale(df2['antiguedad'], min_value, max_value, missing_value)

# renta: missing -> 101850, window [0, 1500000]
min_value = 0.
max_value = 1500000.
range_value = max_value - min_value
missing_value = 101850.
df2['renta'] = _clip_and_scale(df2['renta'], min_value, max_value, missing_value)

In [38]:
# Save the converted frame under the 'train_test' key.
# `key` is passed by keyword: positional use is deprecated since pandas 2.1 and
# keyword-only from pandas 3.0.
df2.to_hdf('../input/train_test_features_converted.hdf', key='train_test')