In [2]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
plt.style.use('ggplot')

path = '../data/input'
train_file = 'train_ver2.csv'
test_file = 'test_ver2.csv'

In [35]:
train = pd.read_csv(os.path.join(path, train_file), dtype={"sexo": str,
                                                           "ind_nuevo": str,
                                                           "indext": str
                                                            },
                    skipinitialspace=True)

In [36]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13647309 entries, 0 to 13647308
Data columns (total 48 columns):
fecha_dato               object
ncodpers                 int64
ind_empleado             object
pais_residencia          object
sexo                     object
age                      float64
fecha_alta               object
ind_nuevo                object
antiguedad               float64
indrel                   float64
ult_fec_cli_1t           object
indrel_1mes              object
tiprel_1mes              object
indresi                  object
indext                   object
conyuemp                 object
canal_entrada            object
indfall                  object
tipodom                  float64
cod_prov                 float64
nomprov                  object
ind_actividad_cliente    float64
renta                    float64
segmento                 object
ind_ahor_fin_ult1        int64
ind_aval_fin_ult1        int64
ind_cco_fin_ult1         int64
ind_cder_fin_ult1 

In [37]:
print '{} rows, {} columns'.format(train.shape[0], train.shape[1])
print '-'*30
for col, tp, cnt in zip(train.columns, train.dtypes, train.isnull().sum(axis=0)):
    print '{:22s}\t\t{}\t\t{}'.format(col, tp, cnt)

13647309 rows, 48 columns
------------------------------
fecha_dato            		object		0
ncodpers              		int64		0
ind_empleado          		object		27734
pais_residencia       		object		27734
sexo                  		object		27804
age                   		float64		27734
fecha_alta            		object		27734
ind_nuevo             		object		27734
antiguedad            		float64		27734
indrel                		float64		27734
ult_fec_cli_1t        		object		13622516
indrel_1mes           		object		149781
tiprel_1mes           		object		149781
indresi               		object		27734
indext                		object		27734
conyuemp              		object		13645501
canal_entrada         		object		186126
indfall               		object		27734
tipodom               		float64		27735
cod_prov              		float64		93591
nomprov               		object		93591
ind_actividad_cliente 		float64		27734
renta                 		float64		2794375
segmento              		object		189368
ind_ahor_fin_ult1    

In [38]:
train['indrel_1mes'].unique()

array([1.0, nan, 3.0, 2.0, '1.0', '1', '3', '3.0', '2.0', '4.0', 'P', '4',
       4.0, '2'], dtype=object)

In [39]:
def clean_indrel_lmes(arr):
    arr = arr.fillna(-1)
    for i, item in enumerate(arr):
        if item == 'P':
            arr[i] = 5
    arr = pd.to_numeric(arr)
    arr = arr.astype(int)
    return arr
train['indrel_1mes'] = clean_indrel_lmes(train['indrel_1mes'])

In [40]:
train['indrel_1mes'].unique()

array([ 1, -1,  3,  2,  4,  5])

In [41]:
train['ind_empleado'].unique()

array(['N', nan, 'A', 'B', 'F', 'S'], dtype=object)

In [43]:
def clean_ind_empleado(arr):
    mapping_ind_empleado = {'N':1, 'B':2, 'F':3, 'A':4, 'S':5}
    arr = arr.apply(lambda x: mapping_ind_empleado[x] if x in mapping_ind_empleado else -1)
    return arr
train['ind_empleado'] = clean_ind_empleado(train['ind_empleado'])
train['ind_empleado'].unique()

array([-1])

In [44]:
train['pais_residencia'].unique()

array(['ES', nan, 'CA', 'CH', 'CL', 'IE', 'AT', 'NL', 'FR', 'GB', 'DE',
       'DO', 'BE', 'AR', 'VE', 'US', 'MX', 'BR', 'IT', 'EC', 'PE', 'CO',
       'HN', 'FI', 'SE', 'AL', 'PT', 'MZ', 'CN', 'TW', 'PL', 'IN', 'CR',
       'NI', 'HK', 'AD', 'CZ', 'AE', 'MA', 'GR', 'PR', 'RO', 'IL', 'RU',
       'GT', 'GA', 'NO', 'SN', 'MR', 'UA', 'BG', 'PY', 'EE', 'SV', 'ET',
       'CM', 'SA', 'CI', 'QA', 'LU', 'PA', 'BA', 'BO', 'AU', 'BY', 'KE',
       'SG', 'HR', 'MD', 'SK', 'TR', 'AO', 'CU', 'GQ', 'EG', 'ZA', 'DK',
       'UY', 'GE', 'TH', 'DZ', 'LB', 'JP', 'NG', 'PK', 'TN', 'TG', 'KR',
       'GH', 'RS', 'VN', 'PH', 'KW', 'NZ', 'MM', 'KH', 'GI', 'SL', 'GN',
       'GW', 'OM', 'CG', 'LV', 'LT', 'ML', 'MK', 'HU', 'IS', 'LY', 'CF',
       'GM', 'KZ', 'CD', 'BZ', 'ZW', 'DJ', 'JM', 'BM', 'MT'], dtype=object)

In [45]:
def clean_pais(arr):
    mapping_pais = {'LV': 102, 'BE': 12, 'BG': 50, 'BA': 61, 'BM': 117, 'BO': 62,
                    'JP': 82, 'JM': 116, 'BR': 17, 'BY': 64, 'BZ': 113, 'RU': 43,
                    'RS': 89, 'RO': 41, 'GW': 99, 'GT': 44, 'GR': 39, 'GQ': 73,
                    'GE': 78, 'GB': 9, 'GA': 45, 'GN': 98, 'GM': 110, 'GI': 96,
                    'GH': 88, 'OM': 100, 'HR': 67, 'HU': 106, 'HK': 34, 'HN': 22,
                    'AD': 35, 'PR': 40, 'PT': 26, 'PY': 51, 'PA': 60, 'PE': 20,
                    'PK': 84, 'PH': 91, 'PL': 30, 'EE': 52, 'EG': 74, 'ZA': 75,
                    'EC': 19, 'AL': 25, 'VN': 90, 'ET': 54, 'ZW': 114, 'ES': 1,
                    'MD': 68, 'UY': 77, 'MM': 94, 'ML': 104, 'US': 15, 'MT': 118,
                    'MR': 48, 'UA': 49, 'MX': 16, 'IL': 42, 'FR': 8, 'MA': 38,
                    'FI': 23, 'NI': 33, 'NL': 7, 'NO': 46, 'NG': 83, 'NZ': 93,
                    'CI': 57, 'CH': 3, 'CO': 21, 'CN': 28, 'CM': 55, 'CL': 4,
                    'CA': 2, 'CG': 101, 'CF': 109, 'CD': 112, 'CZ': 36, 'CR': 32,
                    'CU': 72, 'KE': 65, 'KH': 95, 'SV': 53, 'SK': 69, 'KR': 87,
                    'KW': 92, 'SN': 47, 'SL': 97, 'KZ': 111, 'SA': 56, 'SG': 66,
                    'SE': 24, 'DO': 11, 'DJ': 115, 'DK': 76, 'DE': 10, 'DZ': 80,
                    'LT': 103, 'LU': 59, 'TH': 79, 'TG': 86, 'LY': 108, 'AE': 37,
                    'VE': 14, 'IS': 107, 'IT': 18, 'AO': 71, 'AR': 13, 'AU': 63,
                    'AT': 6, 'IN': 31, 'IE': 5, 'QA': 58, 'MZ': 27, 'MK': 105,
                    'LB': 81, 'TW': 29, 'TR': 70, 'TN': 85
                    }
    arr = arr.apply(lambda x: mapping_pais[x] if x in mapping_pais else -1)
    return arr
train['pais_residencia'] = clean_pais(train['pais_residencia'])
train['pais_residencia'].unique()

array([  1,  -1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
       104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
       117, 118])

In [46]:
train['sexo'].unique()

array(['H', 'V', nan], dtype=object)

In [47]:
def clean_sexo(arr):
    mapping_dict = {'H': 1, 'V':2}
    arr = arr.apply(lambda x: mapping_dict.get(x, -1))
    return arr
train['sexo'] = clean_sexo(train['sexo'])
train['sexo'].unique()

array([ 1,  2, -1])

In [49]:
train['age'].unique()

array([  35.,   23.,   22.,   24.,   65.,   28.,   25.,   26.,   53.,
         27.,   32.,   37.,   31.,   39.,   63.,   33.,   55.,   42.,
         58.,   38.,   50.,   30.,   45.,   44.,   36.,   29.,   60.,
         57.,   67.,   47.,   nan,   34.,   48.,   46.,   54.,   84.,
         15.,   12.,    8.,    6.,   83.,   40.,   77.,   69.,   52.,
         59.,   43.,   10.,    9.,   49.,   41.,   51.,   78.,   16.,
         11.,   73.,   62.,   66.,   17.,   68.,   82.,   95.,   96.,
         56.,   61.,   79.,   72.,   14.,   19.,   13.,   86.,   64.,
         20.,   89.,   71.,    7.,   70.,   74.,   21.,   18.,   75.,
          4.,   80.,   81.,    5.,   76.,   92.,   93.,   85.,   91.,
         87.,   90.,   94.,   99.,   98.,   88.,   97.,  100.,  101.,
        106.,  103.,    3.,    2.,  102.,  104.,  111.,  107.,  109.,
        105.,  112.,  115.,  110.,  116.,  108.,  113.,  126.,  117.,
        163.,  127.,  114.,  164.])

In [52]:
train['age'].isnull().sum()

0

In [53]:
train['age'] = train['age'].fillna(-1)
train['age'] = train['age'].astype(int)
train['age'].isnull().sum()

0

In [54]:
print 'min age: ', train['age'].min()
print 'max age: ', train['age'].max()

min age:  -1
max age:  164


In [61]:
train['antiguedad'] = train['antiguedad'].fillna(-1)
train['antiguedad'].unique()

array([  6.00000000e+00,   3.50000000e+01,   3.40000000e+01,
        -1.00000000e+00,   3.30000000e+01,   3.10000000e+01,
         2.10000000e+01,   1.60000000e+01,   2.70000000e+01,
         9.00000000e+00,   2.20000000e+01,   1.30000000e+01,
         2.90000000e+01,   8.00000000e+00,   1.10000000e+01,
         1.00000000e+01,   2.80000000e+01,   2.40000000e+01,
         7.00000000e+00,   2.50000000e+01,   1.40000000e+01,
         2.60000000e+01,   1.20000000e+01,   2.30000000e+01,
         1.00000000e+00,   1.80000000e+01,   4.00000000e+00,
         3.00000000e+00,   1.70000000e+01,   3.20000000e+01,
         3.00000000e+01,   2.00000000e+01,   1.50000000e+01,
         1.90000000e+01,   1.57000000e+02,   3.60000000e+01,
         5.00000000e+00,   4.00000000e+01,   3.80000000e+01,
         3.70000000e+01,   3.90000000e+01,   0.00000000e+00,
         2.00000000e+00,   4.70000000e+01,   4.40000000e+01,
         4.20000000e+01,   4.60000000e+01,   4.50000000e+01,
         4.30000000e+01,

In [62]:
train['ind_nuevo'].unique()

array(['0', nan, '1'], dtype=object)

In [64]:
train['ind_nuevo'] = train['ind_nuevo'].fillna(-1)
train['ind_nuevo'] = train['ind_nuevo'].astype(int)
train['ind_nuevo'].unique()

array([ 0, -1,  1])

In [66]:
train['indrel'].unique()

array([  1.,  nan,  99.])

In [67]:
train['indrel'] = train['indrel'].fillna(-1)
train['indrel'] = train['indrel'].astype(int)
train['indrel'].unique()

array([ 1, -1, 99])

In [68]:
train['ult_fec_cli_1t'].unique()

array([nan, '2015-07-02', '2015-07-23', '2015-07-06', '2015-07-30',
       '2015-07-20', '2015-07-08', '2015-07-22', '2015-07-17',
       '2015-07-09', '2015-07-03', '2015-07-29', '2015-07-13',
       '2015-07-21', '2015-07-27', '2015-07-14', '2015-07-01',
       '2015-07-24', '2015-07-15', '2015-07-16', '2015-07-28',
       '2015-07-07', '2015-07-10', '2015-08-21', '2015-08-19',
       '2015-08-25', '2015-08-14', '2015-08-24', '2015-08-17',
       '2015-08-18', '2015-08-10', '2015-08-13', '2015-08-27',
       '2015-08-03', '2015-08-06', '2015-08-20', '2015-08-26',
       '2015-08-28', '2015-08-05', '2015-08-11', '2015-08-07',
       '2015-08-04', '2015-08-12', '2015-09-17', '2015-09-01',
       '2015-09-18', '2015-09-03', '2015-09-02', '2015-09-14',
       '2015-09-16', '2015-09-29', '2015-09-28', '2015-09-09',
       '2015-09-22', '2015-09-08', '2015-09-11', '2015-09-21',
       '2015-09-04', '2015-09-25', '2015-09-07', '2015-09-10',
       '2015-09-23', '2015-09-24', '2015-09-15', '

In [69]:
train['tiprel_1mes'].unique()

array(['A', 'I', nan, 'P', 'R', 'N'], dtype=object)

In [70]:
mapping_triprel = {'I':1, 'A':2, 'P':3, 'R':4, 'N':5}
train['tiprel_1mes'] = train['tiprel_1mes'].apply(lambda x: mapping_triprel.get(x, -1))
train['tiprel_1mes'].unique()

array([ 2,  1, -1,  3,  4,  5])

In [71]:
train['indresi'].unique()

array(['S', nan, 'N'], dtype=object)

In [75]:
mapping_SN = {'S': 1, 'N':2}
train['indresi'] = train['indresi'].apply(lambda x: mapping_SN.get(x, -1))
train['indresi'].unique()

array([-1])

In [73]:
train['indext'].unique()

array(['N', 'S', nan], dtype=object)

In [76]:
train['indext'] = train['indext'].apply(lambda x: mapping_SN.get(x, -1))
train['indext'].unique()

array([ 2,  1, -1])

In [78]:
train['conyuemp'].unique()

array([nan, 'N', 'S'], dtype=object)

In [79]:
train['conyuemp'] = train['conyuemp'].apply(lambda x: mapping_SN.get(x, -1))
train['conyuemp'].unique()

array([-1,  2,  1])

In [80]:
train['canal_entrada'].unique()

array(['KHL', 'KHE', 'KHD', 'KFA', 'KFC', 'KAT', nan, 'KAZ', 'RED', 'KHC',
       'KHK', 'KGN', 'KHM', 'KHO', 'KDH', 'KEH', 'KAD', 'KBG', 'KGC',
       'KHF', 'KFK', 'KHN', 'KHA', 'KAF', 'KGX', 'KFD', 'KAG', 'KFG',
       'KAB', 'KCC', 'KAE', 'KAH', 'KAR', 'KFJ', 'KFL', 'KAI', 'KFU',
       'KAQ', 'KFS', 'KAA', 'KFP', 'KAJ', 'KFN', 'KGV', 'KGY', 'KFF',
       'KAP', 'KDE', 'KFV', '013', 'K00', 'KAK', 'KCK', 'KCL', 'KAY',
       'KBU', 'KDR', 'KAC', 'KDT', 'KCG', 'KDO', 'KDY', 'KBQ', 'KDA',
       'KBO', 'KCI', 'KEC', 'KBZ', 'KES', 'KDX', 'KAS', '007', 'KEU',
       'KCA', 'KAL', 'KDC', 'KAW', 'KCS', 'KCB', 'KDU', 'KDQ', 'KCN',
       'KCM', '004', 'KCH', 'KCD', 'KCE', 'KEV', 'KBL', 'KEA', 'KBH',
       'KDV', 'KFT', 'KEY', 'KAO', 'KEJ', 'KEO', 'KEI', 'KEW', 'KDZ',
       'KBV', 'KBR', 'KBF', 'KDP', 'KCO', 'KCF', 'KCV', 'KAM', 'KEZ',
       'KBD', 'KAN', 'KBY', 'KCT', 'KDD', 'KBW', 'KCU', 'KBX', 'KDB',
       'KBS', 'KBE', 'KCX', 'KBP', 'KBN', 'KEB', 'KDS', 'KEL', 'KDG',
       'KDF', '

In [81]:
mapping_canal_entrada = {'013': 49, 'KHP': 160, 'KHQ': 157, 'KHR': 161,
                         'KHS': 162, 'KHK': 10, 'KHL': 6, 'KHM': 12,
                         'KHN': 21, 'KHO': 13, 'KHA': 22, 'KHC': 9,
                         'KHD': 2, 'KHE': 1, 'KHF': 19, '025': 159,
                         'KAC': 57, 'KAB': 28, 'KAA': 39, 'KAG': 26,
                         'KAF': 23, 'KAE': 30, 'KAD': 16, 'KAK': 51,
                         'KAJ': 41, 'KAI': 35, 'KAH': 31, 'KAO': 94,
                         'KAN': 110, 'KAM': 107, 'KAL': 74, 'KAS': 70,
                         'KAR': 32, 'KAQ': 37, 'KAP': 46, 'KAW': 76,
                         'KAV': 139, 'KAU': 142, 'KAT': 5, 'KAZ': 7,
                         'KAY': 54, 'KBJ': 133, 'KBH': 90, 'KBN': 122,
                         'KBO': 64, 'KBL': 88, 'KBM': 135, 'KBB': 131,
                         'KBF': 102, 'KBG': 17, 'KBD': 109, 'KBE': 119,
                         'KBZ': 67, 'KBX': 116, 'KBY': 111, 'KBR': 101,
                         'KBS': 118, 'KBP': 121, 'KBQ': 62, 'KBV': 100,
                         'KBW': 114, 'KBU': 55, 'KCE': 86, 'KCD': 85,
                         'KCG': 59, 'KCF': 105, 'KCA': 73, 'KCC': 29,
                         'KCB': 78, 'KCM': 82, 'KCL': 53, 'KCO': 104,
                         'KCN': 81, 'KCI': 65, 'KCH': 84, 'KCK': 52,
                         'KCJ': 156, 'KCU': 115, 'KCT': 112, 'KCV': 106,
                         'KCQ': 154, 'KCP': 129, 'KCS': 77, 'KCR': 153,
                         'KCX': 120, 'RED': 8, 'KDL': 158, 'KDM': 130,
                         'KDN': 151, 'KDO': 60, 'KDH': 14, 'KDI': 150,
                         'KDD': 113, 'KDE': 47, 'KDF': 127, 'KDG': 126,
                         'KDA': 63, 'KDB': 117, 'KDC': 75, 'KDX': 69,
                         'KDY': 61, 'KDZ': 99, 'KDT': 58, 'KDU': 79,
                         'KDV': 91, 'KDW': 132, 'KDP': 103, 'KDQ': 80,
                         'KDR': 56, 'KDS': 124, 'K00': 50, 'KEO': 96,
                         'KEN': 137, 'KEM': 155, 'KEL': 125, 'KEK': 145,
                         'KEJ': 95, 'KEI': 97, 'KEH': 15, 'KEG': 136,
                         'KEF': 128, 'KEE': 152, 'KED': 143, 'KEC': 66,
                         'KEB': 123, 'KEA': 89, 'KEZ': 108, 'KEY': 93,
                         'KEW': 98, 'KEV': 87, 'KEU': 72, 'KES': 68,
                         'KEQ': 138, 'KFV': 48, 'KFT': 92, 'KFU': 36,
                         'KFR': 144, 'KFS': 38, 'KFP': 40, 'KFF': 45,
                         'KFG': 27, 'KFD': 25, 'KFE': 148, 'KFB': 146,
                         'KFC': 4, 'KFA': 3, 'KFN': 42, 'KFL': 34,
                         'KFM': 141, 'KFJ': 33, 'KFK': 20, 'KFH': 140,
                         'KFI': 134, '007': 71, '004': 83, 'KGU': 149,
                         'KGW': 147, 'KGV': 43, 'KGY': 44, 'KGX': 24,
                         'KGC': 18, 'KGN': 11}

train['canal_entrada'] = train['canal_entrada'].apply(lambda x: mapping_canal_entrada.get(x, -1))
train['canal_entrada'].unique()

array([  6,   1,   2,   3,   4,   5,  -1,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
       104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
       117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
       130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
       143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
       156, 157, 158, 159, 160, 161, 162])

In [83]:
train['indfall'].unique()

array(['N', nan, 'S'], dtype=object)

In [84]:
train['indfall'] = train['indfall'].apply(lambda x: mapping_SN.get(x, -1))
train['indfall'].unique()

array([ 2, -1,  1])

In [85]:
train['tipodom'].unique()

array([  1.,  nan])

In [87]:
train['tipodom'] = train['tipodom'].fillna(-1)
train['tipodom'] = train['tipodom'].astype(int)
train['tipodom'].unique()

array([ 1, -1])

In [89]:
train['cod_prov'].unique()

array([ 29.,  13.,  50.,  45.,  24.,  20.,  10.,  17.,  49.,   8.,  37.,
         9.,  22.,  31.,   5.,  40.,  27.,  25.,  28.,   3.,  42.,  41.,
        39.,   7.,  47.,  36.,  46.,  44.,  15.,  32.,  23.,  16.,  48.,
        12.,  26.,   2.,   6.,  30.,  11.,  nan,   4.,  19.,  34.,  35.,
        14.,  21.,  18.,  33.,  38.,  52.,  43.,   1.,  51.])

In [90]:
train['cod_prov'] = train['cod_prov'].fillna(-1)
train['cod_prov'] = train['cod_prov'].astype(int)
train['cod_prov'].unique()

array([29, 13, 50, 45, 24, 20, 10, 17, 49,  8, 37,  9, 22, 31,  5, 40, 27,
       25, 28,  3, 42, 41, 39,  7, 47, 36, 46, 44, 15, 32, 23, 16, 48, 12,
       26,  2,  6, 30, 11, -1,  4, 19, 34, 35, 14, 21, 18, 33, 38, 52, 43,
        1, 51])

In [91]:
train['ind_actividad_cliente'].unique()

array([  1.,   0.,  nan])

In [92]:
train['ind_actividad_cliente'] = train['ind_actividad_cliente'].fillna(-1)
train['ind_actividad_cliente'] = train['ind_actividad_cliente'].astype(int)
train['ind_actividad_cliente'].unique()

array([ 1,  0, -1])

In [96]:
train['renta'].isnull().sum()

2794375

In [97]:
train['renta'] = train['renta'].fillna(-1)
train['renta'].isnull().sum()

0

In [98]:
train['segmento'].unique()

array(['02 - PARTICULARES', '03 - UNIVERSITARIO', nan, '01 - TOP'], dtype=object)

In [104]:
mapping_segmento = {'01 - TOP': 1, '02 - PARTICULARES': 2, '03 - UNIVERSITARIO':3}
train['segmento'] = train['segmento'].apply(lambda x: mapping_segmento.get(x, -1))
train['segmento'].unique()

array([ 2,  3, -1,  1])

In [105]:
train['nomprov'].unique()

array(['MALAGA', 'CIUDAD REAL', 'ZARAGOZA', 'TOLEDO', 'LEON', 'GIPUZKOA',
       'CACERES', 'GIRONA', 'ZAMORA', 'BARCELONA', 'SALAMANCA', 'BURGOS',
       'HUESCA', 'NAVARRA', 'AVILA', 'SEGOVIA', 'LUGO', 'LERIDA', 'MADRID',
       'ALICANTE', 'SORIA', 'SEVILLA', 'CANTABRIA', 'BALEARS, ILLES',
       'VALLADOLID', 'PONTEVEDRA', 'VALENCIA', 'TERUEL',
       'CORU\xc3\x91A, A', 'OURENSE', 'JAEN', 'CUENCA', 'BIZKAIA',
       'CASTELLON', 'RIOJA, LA', 'ALBACETE', 'BADAJOZ', 'MURCIA', 'CADIZ',
       nan, 'ALMERIA', 'GUADALAJARA', 'PALENCIA', 'PALMAS, LAS', 'CORDOBA',
       'HUELVA', 'GRANADA', 'ASTURIAS', 'SANTA CRUZ DE TENERIFE',
       'MELILLA', 'TARRAGONA', 'ALAVA', 'CEUTA'], dtype=object)

In [107]:
train['ind_nomina_ult1'] = train['ind_nomina_ult1'].fillna(0)
train['ind_nom_pens_ult1'] = train['ind_nom_pens_ult1'].fillna(0)

In [108]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13647309 entries, 0 to 13647308
Data columns (total 48 columns):
fecha_dato               object
ncodpers                 int64
ind_empleado             int64
pais_residencia          int64
sexo                     int64
age                      int64
fecha_alta               object
ind_nuevo                int64
antiguedad               float64
indrel                   int64
ult_fec_cli_1t           object
indrel_1mes              int64
tiprel_1mes              int64
indresi                  int64
indext                   int64
conyuemp                 int64
canal_entrada            int64
indfall                  int64
tipodom                  int64
cod_prov                 int64
nomprov                  object
ind_actividad_cliente    int64
renta                    float64
segmento                 int64
ind_ahor_fin_ult1        int64
ind_aval_fin_ult1        int64
ind_cco_fin_ult1         int64
ind_cder_fin_ult1        int64
ind_cno_f