In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from pyarrow import csv
import pyarrow as pa
import pyarrow.parquet as pq

import warnings
# Silence every warning category for the rest of the session. No category or
# module filter is given, so library deprecation warnings are hidden as well.
warnings.filterwarnings(action='ignore')

In [2]:
# Load the test frame from the `test_pre_2` parquet file via the pyarrow engine.
test = pd.read_parquet(
    path='../Data/test_pre_2.parquet',
    engine='pyarrow',
)

In [3]:
# Load the train frame from the `train_pre_2` parquet file via the pyarrow engine.
train = pd.read_parquet(
    path='../Data/train_pre_2.parquet',
    engine='pyarrow',
)

In [4]:
# Collect both frames in one list (presumably so a transform can be looped over
# both splits).
# NOTE(review): the list holds references to the objects bound right now; any
# later cell that rebinds `train` / `test` (e.g. `train = train.drop(...)`)
# does not update this list, which then still points at the older frames.
train_and_test = [train, test]

In [5]:
# Inspect the column dtypes and non-null counts of the test frame.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 929615 entries, 0 to 929614
Data columns (total 23 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   fecha_dato             929615 non-null  datetime64[ns]
 1   ind_empleado           929615 non-null  object        
 2   pais_residencia        929615 non-null  int8          
 3   sexo                   929615 non-null  object        
 4   age                    929615 non-null  int64         
 5   fecha_alta             929615 non-null  datetime64[ns]
 6   ind_nuevo              929615 non-null  int64         
 7   antiguedad             929615 non-null  object        
 8   indrel                 929615 non-null  int64         
 9   ult_fec_cli_1t         1683 non-null    object        
 10  indrel_1mes            929592 non-null  float64       
 11  tiprel_1mes            929615 non-null  object        
 12  indresi                929615 non-null  obje

In [6]:
# Inspect the column dtypes of the train frame.
# NOTE(review): the recorded output lists no non-null counts, presumably because
# the frame is larger than pandas' `display.max_info_rows`; `show_counts=True`
# would force them — confirm against the installed pandas version.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13647309 entries, 0 to 13647308
Data columns (total 23 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   fecha_dato             datetime64[ns]
 1   ind_empleado           object        
 2   pais_residencia        int8          
 3   sexo                   object        
 4   age                    object        
 5   fecha_alta             datetime64[ns]
 6   ind_nuevo              float64       
 7   antiguedad             object        
 8   indrel                 float64       
 9   ult_fec_cli_1t         object        
 10  indrel_1mes            object        
 11  tiprel_1mes            object        
 12  indresi                object        
 13  indext                 object        
 14  conyuemp               object        
 15  canal_entrada          object        
 16  indfall                object        
 17  tipodom                float64       
 18  cod_prov            

---
# indrel

1. train 과 test 의 indrel feature 타입 불일치 (train: float64, test: int64) → int8 로 통일
2. NaN 은 train 에만 존재 (27,734건) → 최빈값인 1로 대체

In [7]:
# List the distinct `indrel` values present in train (NaN is included if any).
train['indrel'].unique()

array([ 1., nan, 99.])

In [8]:
# List the distinct `indrel` values present in test, for comparison with train.
test['indrel'].unique()

array([ 1, 99], dtype=int64)

In [9]:
# Count the missing `indrel` entries in train.
train['indrel'].isna().sum()

27734

In [10]:
# Count the missing `indrel` entries in test.
test['indrel'].isna().sum()

0

In [11]:
# Frequency of each non-null `indrel` value in train; used to pick the mode for
# imputation.
train['indrel'].value_counts()

1.0     13594782
99.0       24793
Name: indrel, dtype: int64

In [12]:
# Impute missing `indrel` with 1, the most frequent value in train, in a single
# step. Filling with a temporary sentinel (3) and remapping it afterwards costs
# an extra pass over the column and would also rewrite any genuine 3 values.
train['indrel'] = train['indrel'].fillna(1)

In [13]:
# Remap any 3 in `indrel` to 1, the most frequent value.
# Author's note (translated): 3 was going to be kept as a separate code for
# missing values, but the test set has no NaN, so a train-only category is not
# useful for training; the mode (1) is used instead.
train['indrel'] = train['indrel'].replace(3,1)

In [14]:
# train['indrel'] is float64 while test['indrel'] is int64; cast to int8 so both
# splits share one dtype. The values seen in this column (1 and 99) fit in int8.
train['indrel'] = train['indrel'].astype('int8')

In [15]:
# Cast test['indrel'] from int64 to int8 so it matches the train column's dtype.
test['indrel'] = test['indrel'].astype('int8')

# ult_fec_cli_1t

train, test 모두 null 값이 대부분이다. train 셋은 null 비율이 약 99.8% (13,622,516 / 13,647,309), test 셋도 약 99.8% (927,932 / 929,615) 이다.
일단 드랍한다.

In [16]:
# Count the missing `ult_fec_cli_1t` entries in train.
train['ult_fec_cli_1t'].isna().sum()

13622516

In [17]:
# Count the missing `ult_fec_cli_1t` entries in test.
test['ult_fec_cli_1t'].isna().sum()

927932

In [18]:
# `ult_fec_cli_1t` is almost entirely null in both frames, so it is removed.
features_drop = ['ult_fec_cli_1t']

# `columns=` is the keyword form of `drop(labels, axis=1)`. Each call returns a
# new frame, which is bound back to the same name.
train = train.drop(columns=features_drop)
test = test.drop(columns=features_drop)

In [21]:
# Re-inspect the test frame after the steps above: `indrel` should now be int8
# and `ult_fec_cli_1t` should no longer be listed.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 929615 entries, 0 to 929614
Data columns (total 22 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   fecha_dato             929615 non-null  datetime64[ns]
 1   ind_empleado           929615 non-null  object        
 2   pais_residencia        929615 non-null  int8          
 3   sexo                   929615 non-null  object        
 4   age                    929615 non-null  int64         
 5   fecha_alta             929615 non-null  datetime64[ns]
 6   ind_nuevo              929615 non-null  int64         
 7   antiguedad             929615 non-null  object        
 8   indrel                 929615 non-null  int8          
 9   indrel_1mes            929592 non-null  float64       
 10  tiprel_1mes            929615 non-null  object        
 11  indresi                929615 non-null  object        
 12  indext                 929615 non-null  obje

In [22]:
# Persist both frames as the `*_pre_3` parquet files; the index is not written.
test.to_parquet(
    '../Data/test_pre_3.parquet',
    engine='pyarrow',
    index=False,
)
train.to_parquet(
    '../Data/train_pre_3.parquet',
    engine='pyarrow',
    index=False,
)