<!-- filename = 'train_ver2.csv'

chunksize = 10 ** 3

for chunk in pd.read_csv(filename, chunksize=chunksize):
    print(chunk) -->

In [2]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from pyarrow import csv
import pyarrow as pa
import pyarrow.parquet as pq

import warnings
# Silence every warning for the rest of the session: no category or module
# filter is given, so this applies to all warnings, including deprecations.
warnings.filterwarnings(action='ignore')


In [None]:
# Load the test split from its parquet file via the pyarrow engine.
test = pd.read_parquet(
    path='test_pq.parquet',
    engine='pyarrow',
)

In [4]:
# Show the column labels of the test frame.
test.columns

Index(['fecha_dato', 'ncodpers', 'ind_empleado', 'pais_residencia', 'sexo',
       'age', 'fecha_alta', 'ind_nuevo', 'antiguedad', 'indrel',
       'ult_fec_cli_1t', 'indrel_1mes', 'tiprel_1mes', 'indresi', 'indext',
       'conyuemp', 'canal_entrada', 'indfall', 'tipodom', 'cod_prov',
       'nomprov', 'ind_actividad_cliente', 'renta', 'segmento'],
      dtype='object')

In [5]:
# Read only the 24 columns listed below from the train parquet file; they are
# the same labels that `test.columns` reports.
selected_columns = [
    'fecha_dato', 'ncodpers', 'ind_empleado', 'pais_residencia', 'sexo',
    'age', 'fecha_alta', 'ind_nuevo', 'antiguedad', 'indrel',
    'ult_fec_cli_1t', 'indrel_1mes', 'tiprel_1mes', 'indresi', 'indext',
    'conyuemp', 'canal_entrada', 'indfall', 'tipodom', 'cod_prov',
    'nomprov', 'ind_actividad_cliente', 'renta', 'segmento',
]
train = pd.read_parquet(
    'train_pq.parquet',
    engine='pyarrow',
    columns=selected_columns,
)

In [6]:
# Keep both frames in one list so shared steps can loop over them.
# The list stores references to the current `train`/`test` objects, not copies;
# rebinding `train` or `test` later does not update this list.
train_and_test = [train, test]

In [7]:
# Print the column/dtype summary of the train frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13647309 entries, 0 to 13647308
Data columns (total 24 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   fecha_dato             object 
 1   ncodpers               int64  
 2   ind_empleado           object 
 3   pais_residencia        object 
 4   sexo                   object 
 5   age                    object 
 6   fecha_alta             object 
 7   ind_nuevo              float64
 8   antiguedad             object 
 9   indrel                 float64
 10  ult_fec_cli_1t         object 
 11  indrel_1mes            object 
 12  tiprel_1mes            object 
 13  indresi                object 
 14  indext                 object 
 15  conyuemp               object 
 16  canal_entrada          object 
 17  indfall                object 
 18  tipodom                float64
 19  cod_prov               float64
 20  nomprov                object 
 21  ind_actividad_cliente  float64
 22  renta           

In [None]:
# Print the column/dtype/non-null summary of the test frame.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 929615 entries, 0 to 929614
Data columns (total 24 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   fecha_dato             929615 non-null  object 
 1   ncodpers               929615 non-null  int64  
 2   ind_empleado           929615 non-null  object 
 3   pais_residencia        929615 non-null  object 
 4   sexo                   929615 non-null  object 
 5   age                    929615 non-null  int64  
 6   fecha_alta             929615 non-null  object 
 7   ind_nuevo              929615 non-null  int64  
 8   antiguedad             929615 non-null  int64  
 9   indrel                 929615 non-null  int64  
 10  ult_fec_cli_1t         1683 non-null    object 
 11  indrel_1mes            929592 non-null  float64
 12  tiprel_1mes            929615 non-null  object 
 13  indresi                929615 non-null  object 
 14  indext                 929615 non-nu

In [None]:
# Print the (rows, columns) shape of train followed by test on one line.
print(*(frame.shape for frame in (train, test)))

(13647309, 48) (929615, 24)


---
# Preprocessing

In [4]:
# dato, alta
# Parse both date columns from "%Y-%m-%d" text to datetimes, writing the result
# back onto each frame held in `train_and_test`.
for dataset in train_and_test:
    for date_col in ("fecha_dato", "fecha_alta"):
        dataset[date_col] = pd.to_datetime(dataset[date_col], format="%Y-%m-%d")

In [5]:
# Show the distinct values of train's fecha_alta column.
train['fecha_alta'].unique()

array(['2015-01-12T00:00:00.000000000', '2012-08-10T00:00:00.000000000',
                                 'NaT', ...,
       '2016-05-25T00:00:00.000000000', '2016-05-01T00:00:00.000000000',
       '2016-05-15T00:00:00.000000000'], dtype='datetime64[ns]')

---

In [6]:
# ncodpers
# Remove the `ncodpers` column from both frames. `drop` returns a new frame and
# the names are rebound, so `train_and_test` is rebuilt right after: otherwise
# the list would keep pointing at (and keeping alive) the pre-drop frames, and
# any later loop over it would silently work on stale data.
train = train.drop(columns='ncodpers')
test = test.drop(columns='ncodpers')
train_and_test = [train, test]

---

In [7]:
# ind_empleado
# Relabel empty-string entries in the train frame as 'P'.
# NOTE(review): the test frame is not treated here — confirm it contains no ''.
train['ind_empleado'] = train['ind_empleado'].replace({'': 'P'})

In [8]:
# Show the distinct ind_empleado labels in train.
train['ind_empleado'].unique()

array(['N', 'P', 'A', 'B', 'F', 'S'], dtype=object)

---

In [9]:
# pais_residencia memory saving
# Collect the distinct raw labels (in order of first appearance) and display them;
# `pais` is used later to build the label -> integer lookup.
pais = train.loc[:, 'pais_residencia'].unique()
pais

array(['ES', '', 'CA', 'CH', 'CL', 'IE', 'AT', 'NL', 'FR', 'GB', 'DE',
       'DO', 'BE', 'AR', 'VE', 'US', 'MX', 'BR', 'IT', 'EC', 'PE', 'CO',
       'HN', 'FI', 'SE', 'AL', 'PT', 'MZ', 'CN', 'TW', 'PL', 'IN', 'CR',
       'NI', 'HK', 'AD', 'CZ', 'AE', 'MA', 'GR', 'PR', 'RO', 'IL', 'RU',
       'GT', 'GA', 'NO', 'SN', 'MR', 'UA', 'BG', 'PY', 'EE', 'SV', 'ET',
       'CM', 'SA', 'CI', 'QA', 'LU', 'PA', 'BA', 'BO', 'AU', 'BY', 'KE',
       'SG', 'HR', 'MD', 'SK', 'TR', 'AO', 'CU', 'GQ', 'EG', 'ZA', 'DK',
       'UY', 'GE', 'TH', 'DZ', 'LB', 'JP', 'NG', 'PK', 'TN', 'TG', 'KR',
       'GH', 'RS', 'VN', 'PH', 'KW', 'NZ', 'MM', 'KH', 'GI', 'SL', 'GN',
       'GW', 'OM', 'CG', 'LV', 'LT', 'ML', 'MK', 'HU', 'IS', 'LY', 'CF',
       'GM', 'KZ', 'CD', 'BZ', 'ZW', 'DJ', 'JM', 'BM', 'MT'], dtype=object)

In [10]:
# Swap the empty-string label for the placeholder 'PP' in train, then list the
# distinct labels that remain.
train['pais_residencia'] = train['pais_residencia'].replace(to_replace='', value='PP')
train.loc[:, 'pais_residencia'].unique()

array(['ES', 'PP', 'CA', 'CH', 'CL', 'IE', 'AT', 'NL', 'FR', 'GB', 'DE',
       'DO', 'BE', 'AR', 'VE', 'US', 'MX', 'BR', 'IT', 'EC', 'PE', 'CO',
       'HN', 'FI', 'SE', 'AL', 'PT', 'MZ', 'CN', 'TW', 'PL', 'IN', 'CR',
       'NI', 'HK', 'AD', 'CZ', 'AE', 'MA', 'GR', 'PR', 'RO', 'IL', 'RU',
       'GT', 'GA', 'NO', 'SN', 'MR', 'UA', 'BG', 'PY', 'EE', 'SV', 'ET',
       'CM', 'SA', 'CI', 'QA', 'LU', 'PA', 'BA', 'BO', 'AU', 'BY', 'KE',
       'SG', 'HR', 'MD', 'SK', 'TR', 'AO', 'CU', 'GQ', 'EG', 'ZA', 'DK',
       'UY', 'GE', 'TH', 'DZ', 'LB', 'JP', 'NG', 'PK', 'TN', 'TG', 'KR',
       'GH', 'RS', 'VN', 'PH', 'KW', 'NZ', 'MM', 'KH', 'GI', 'SL', 'GN',
       'GW', 'OM', 'CG', 'LV', 'LT', 'ML', 'MK', 'HU', 'IS', 'LY', 'CF',
       'GM', 'KZ', 'CD', 'BZ', 'ZW', 'DJ', 'JM', 'BM', 'MT'], dtype=object)

In [11]:
# Number each label by its position in `pais`, then move the id that belonged
# to the empty-string label over to the 'PP' placeholder key.
pais_dict = {label: position for position, label in enumerate(pais)}
pais_dict['PP'] = pais_dict.pop('')
pais_dict

{'ES': 0,
 'CA': 2,
 'CH': 3,
 'CL': 4,
 'IE': 5,
 'AT': 6,
 'NL': 7,
 'FR': 8,
 'GB': 9,
 'DE': 10,
 'DO': 11,
 'BE': 12,
 'AR': 13,
 'VE': 14,
 'US': 15,
 'MX': 16,
 'BR': 17,
 'IT': 18,
 'EC': 19,
 'PE': 20,
 'CO': 21,
 'HN': 22,
 'FI': 23,
 'SE': 24,
 'AL': 25,
 'PT': 26,
 'MZ': 27,
 'CN': 28,
 'TW': 29,
 'PL': 30,
 'IN': 31,
 'CR': 32,
 'NI': 33,
 'HK': 34,
 'AD': 35,
 'CZ': 36,
 'AE': 37,
 'MA': 38,
 'GR': 39,
 'PR': 40,
 'RO': 41,
 'IL': 42,
 'RU': 43,
 'GT': 44,
 'GA': 45,
 'NO': 46,
 'SN': 47,
 'MR': 48,
 'UA': 49,
 'BG': 50,
 'PY': 51,
 'EE': 52,
 'SV': 53,
 'ET': 54,
 'CM': 55,
 'SA': 56,
 'CI': 57,
 'QA': 58,
 'LU': 59,
 'PA': 60,
 'BA': 61,
 'BO': 62,
 'AU': 63,
 'BY': 64,
 'KE': 65,
 'SG': 66,
 'HR': 67,
 'MD': 68,
 'SK': 69,
 'TR': 70,
 'AO': 71,
 'CU': 72,
 'GQ': 73,
 'EG': 74,
 'ZA': 75,
 'DK': 76,
 'UY': 77,
 'GE': 78,
 'TH': 79,
 'DZ': 80,
 'LB': 81,
 'JP': 82,
 'NG': 83,
 'PK': 84,
 'TN': 85,
 'TG': 86,
 'KR': 87,
 'GH': 88,
 'RS': 89,
 'VN': 90,
 'PH': 91,
 'KW': 9

In [12]:
def _encode_pais(labels):
    """Translate `pais_residencia` labels into their int8 ids from `pais_dict`.

    Empty-string labels are first rewritten to the 'PP' placeholder, so a frame
    that has not been through that replacement yet (e.g. `test`) is encoded the
    same way as `train`.

    Parameters
    ----------
    labels : pandas.Series
        Column of string labels to encode.

    Returns
    -------
    pandas.Series
        The ids as ``np.int8``.

    Raises
    ------
    KeyError
        If any label has no entry in `pais_dict`; the message lists them.
    """
    normalized = labels.replace('', 'PP')
    # Dict-based map is vectorised; labels missing from the dict come back as NaN.
    encoded = normalized.map(pais_dict)
    unmapped = encoded.isna()
    if unmapped.any():
        unknown = normalized[unmapped].unique().tolist()
        raise KeyError(f"pais_residencia values missing from pais_dict: {unknown}")
    return encoded.astype(np.int8)


train['pais_residencia'] = _encode_pais(train['pais_residencia'])
test['pais_residencia'] = _encode_pais(test['pais_residencia'])

In [13]:
# Show the distinct encoded ids now stored in train's pais_residencia column.
train['pais_residencia'].unique()

array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
       104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
       117, 118], dtype=int8)

---

In [15]:
# Write both frames to parquet (test first, then train) without the index.
# NOTE(review): the file suffixes differ (_pre_1 vs _pre_2) — confirm intended.
for frame, target in (
    (test, '../Data/test_pre_1.parquet'),
    (train, '../Data/train_pre_2.parquet'),
):
    frame.to_parquet(target, engine='pyarrow', index=False)