In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from pyarrow import csv
import pyarrow as pa
import pyarrow.parquet as pq

import warnings
# Silence every warning for the rest of the session (no category/module filter is given).
# NOTE(review): this also hides pandas deprecation and dtype warnings - confirm that is intended.
warnings.filterwarnings(action='ignore')

In [2]:
# Load the stage-1 preprocessed test split from parquet.
test = pd.read_parquet(
    path='../Data/test_pre_1.parquet',
    engine='pyarrow',
)

In [3]:
# Load the stage-1 preprocessed train split from parquet.
train = pd.read_parquet(
    path='../Data/train_pre_1.parquet',
    engine='pyarrow',
)

In [4]:
# Bundle both splits in one list (holds references, not copies).
# NOTE(review): not used in the cells visible here - presumably meant for looping over both frames.
train_and_test = [train, test]

In [5]:
# Print the test frame's schema: column names, non-null counts and dtypes.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 929615 entries, 0 to 929614
Data columns (total 23 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   fecha_dato             929615 non-null  datetime64[ns]
 1   ind_empleado           929615 non-null  object        
 2   pais_residencia        929615 non-null  int8          
 3   sexo                   929615 non-null  object        
 4   age                    929615 non-null  int64         
 5   fecha_alta             929615 non-null  datetime64[ns]
 6   ind_nuevo              929615 non-null  int64         
 7   antiguedad             929615 non-null  int64         
 8   indrel                 929615 non-null  int64         
 9   ult_fec_cli_1t         1683 non-null    object        
 10  indrel_1mes            929592 non-null  float64       
 11  tiprel_1mes            929615 non-null  object        
 12  indresi                929615 non-null  obje

In [6]:
# Print the train frame's schema (column names and dtypes) to compare against test.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13647309 entries, 0 to 13647308
Data columns (total 23 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   fecha_dato             datetime64[ns]
 1   ind_empleado           object        
 2   pais_residencia        int8          
 3   sexo                   object        
 4   age                    object        
 5   fecha_alta             datetime64[ns]
 6   ind_nuevo              float64       
 7   antiguedad             object        
 8   indrel                 float64       
 9   ult_fec_cli_1t         object        
 10  indrel_1mes            object        
 11  tiprel_1mes            object        
 12  indresi                object        
 13  indext                 object        
 14  conyuemp               object        
 15  canal_entrada          object        
 16  indfall                object        
 17  tipodom                float64       
 18  cod_prov            

# 결측치 체크

In [7]:
# List the distinct values of sexo to spot placeholder entries that are not real nulls.
train['sexo'].unique()

array(['H', 'V', ''], dtype=object)

In [8]:
# Count nulls per column.
# NOTE: isnull() only counts real nulls; placeholder strings (such as the '' seen in sexo)
# are not included in these counts.
train.isnull().sum() # ult_fec_cli_1t and renta have far too many missing values.

fecha_dato                      0
ind_empleado                    0
pais_residencia                 0
sexo                            0
age                             0
fecha_alta                  27734
ind_nuevo                   27734
antiguedad                      0
indrel                      27734
ult_fec_cli_1t           13622516
indrel_1mes                     0
tiprel_1mes                     0
indresi                         0
indext                          0
conyuemp                        0
canal_entrada                   0
indfall                         0
tipodom                     27735
cod_prov                    93591
nomprov                         0
ind_actividad_cliente       27734
renta                     2794375
segmento                        0
dtype: int64

# WhiteSpace 처리

info 에서 type 이 object 인 col 들은 뭘까? 이런 고민을 했었는데 문제가 대략적으로 해결됐다. <br> 공백(whitespace)을 가진 값들이 있는데, 이 공백은 꼭 처리를 해주고 가야 한다.
처리하지 않으면 결측치를 대체하거나 value 들을 조정할 때 아주 골머리를 앓게 된다.

<br>type 이 object 인 col 들을 모아 strip 을 실행하면 된다.

In [9]:
# antiguedad

# Look closely: the values are padded with leading spaces ('      6', not '6'),
# so the whitespace has to be stripped before the column can be cleaned.
train['antiguedad'].unique()

array(['      6', '     35', '     34', '     NA', '     33', '     31',
       '     21', '     16', '     27', '      9', '     22', '     13',
       '     29', '      8', '     11', '     10', '     28', '     24',
       '      7', '     25', '     14', '     26', '     12', '     23',
       '      1', '     18', '      4', '      3', '     17', '     32',
       '     30', '     20', '     15', '     19', '    157', '     36',
       '      5', '     40', '     38', '     37', '     39', '      0',
       '      2', '     47', '     44', '     42', '     46', '     45',
       '     43', '     41', '     57', '     48', '     52', '     49',
       '     50', '     56', '     58', '     51', '     55', '     54',
       '     53', '     59', '     62', '     61', '     60', '     63',
       '    139', '    165', '    118', '    164', '    142', '     94',
       '    159', '    143', '    105', '    151', '    162', '    137',
       '    150', '    128', '    122', '    156', 

In [10]:
# Strip the padding whitespace from every antiguedad value.
# The vectorised .str accessor leaves missing values as NaN instead of raising
# AttributeError (as a plain x.strip() would on a non-string), so the later
# fillna step can still handle them.
train['antiguedad'] = train['antiguedad'].str.strip()

---


In [None]:
# Disabled (kept for reference only): cast antiguedad to str.
#train['antiguedad'] = train['antiguedad'].astype(str)

In [11]:
# Treat both real nulls and the literal 'NA' placeholder as '0'.
train['antiguedad'] = (
    train['antiguedad']
    .fillna('0')          # real missing values
    .replace('NA', '0')   # 'NA' text placeholder
)

In [15]:
# Replace the -999999 sentinel (outlier) in antiguedad with '0' in both splits.
train['antiguedad'] = train['antiguedad'].replace('-999999', '0')
# test['antiguedad'] is loaded as int64, so matching against the string
# '-999999' would never hit. Cast to str first so the sentinel is replaced
# no matter whether the str-conversion cell has already run.
test['antiguedad'] = test['antiguedad'].astype(str).replace('-999999', '0')

In [13]:
# Convert every test antiguedad value to its string form so it matches train's representation.
test['antiguedad'] = test['antiguedad'].map(str)

In [17]:
# Re-check the distinct values after cleaning: padding and the 'NA' placeholder should be gone.
train['antiguedad'].unique()

array(['6', '35', '34', '0', '33', '31', '21', '16', '27', '9', '22',
       '13', '29', '8', '11', '10', '28', '24', '7', '25', '14', '26',
       '12', '23', '1', '18', '4', '3', '17', '32', '30', '20', '15',
       '19', '157', '36', '5', '40', '38', '37', '39', '2', '47', '44',
       '42', '46', '45', '43', '41', '57', '48', '52', '49', '50', '56',
       '58', '51', '55', '54', '53', '59', '62', '61', '60', '63', '139',
       '165', '118', '164', '142', '94', '159', '143', '105', '151',
       '162', '137', '150', '128', '122', '156', '119', '160', '79', '95',
       '132', '161', '98', '127', '72', '155', '108', '163', '102', '148',
       '115', '146', '107', '81', '216', '135', '92', '121', '198', '134',
       '93', '140', '110', '120', '147', '116', '64', '77', '85', '99',
       '78', '100', '113', '154', '166', '133', '124', '141', '66', '117',
       '86', '193', '80', '144', '87', '126', '158', '101', '235', '88',
       '145', '103', '149', '109', '131', '97', '68', '8

In [18]:
# Persist the cleaned test split as the stage-2 parquet file (index not written).
test.to_parquet(
    path='../Data/test_pre_2.parquet',
    engine='pyarrow',
    index=False,
)

In [19]:
# Persist the cleaned train split as the stage-2 parquet file (index not written).
train.to_parquet(
    path='../Data/train_pre_2.parquet',
    engine='pyarrow',
    index=False,
)