# Objetivos

- Realizar o pré-processamento dos dados de TREINO e VALIDAÇÃO.

- As variáveis categóricas já estão dicotomizadas. A princípio, somente a aplicação do logarítmo será feita. No notebook de Exploracao_inicial, foi visto que a variável transformada por esta função se aproxima de uma normal e com diferentes parâmetros de centralidade e variação.

# Pacotes e funções

In [1]:
import pandas as pd
import numpy as np
import pickle

In [2]:
%run /home/hugo/Documents/Git_GitHub/Estudo_Fraude_CC/vFraude_CC/99.Funcoes_auxiliares/Funcoes.py

In [3]:
def Aplica_Log(dataframe, variavel):
    # dataframe é um pandas dataframe
    # variavel é uma string com o nome da variável que se deseja aplicar o log

    return(np.log(dataframe[variavel]))

# Leitura dos dados de treino e validação

In [4]:
treino = pd.read_parquet('../1.Base_de_dados/treino.parquet')
val = pd.read_parquet('../1.Base_de_dados/validacao.parquet')

In [8]:
# repeat_retailer * online_order (RROO) = se a transação foi feita online E a transação ocorreu em lojas que já 
# haviam sido visitadas anteriormente então será 1, caso contrário será zero
treino['RROO'] = treino['repeat_retailer'] * treino['online_order']
treino['RRUC'] = treino['repeat_retailer'] * treino['used_chip']

# repeat_retailer * used_chip (RRUC) = se a transação foi feita online E a transação ocorreu com o uso do chip do cartão de crédito (provávelmente, em local física);
val['RRUC'] = val['repeat_retailer'] * val['used_chip']
val['RROO'] = val['repeat_retailer'] * val['online_order']

In [9]:
treino.info()

<class 'pandas.core.frame.DataFrame'>
Index: 783000 entries, 353955 to 748258
Data columns (total 10 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   distance_from_home              783000 non-null  float64
 1   distance_from_last_transaction  783000 non-null  float64
 2   ratio_to_median_purchase_price  783000 non-null  float64
 3   repeat_retailer                 783000 non-null  int64  
 4   used_chip                       783000 non-null  int64  
 5   used_pin_number                 783000 non-null  int64  
 6   online_order                    783000 non-null  int64  
 7   fraud                           783000 non-null  int64  
 8   RROO                            783000 non-null  int64  
 9   RRUC                            783000 non-null  int64  
dtypes: float64(3), int64(7)
memory usage: 65.7 MB


## Aplica logarítmo nas variáveis numéricas

In [10]:
treino['log_DFH'] = Aplica_Log(treino, 'distance_from_home')
treino['log_DFLT'] = Aplica_Log(treino, 'distance_from_last_transaction')
treino['log_RTMPP'] = Aplica_Log(treino, 'ratio_to_median_purchase_price')
treino.head()

Unnamed: 0_level_0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud,RROO,RRUC,log_DFH,log_DFLT,log_RTMPP
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
353955,5.689623,0.3472,1.105109,1,0,0,1,0,1,0,1.738644,-1.057854,0.099944
4582,58.380209,0.124193,0.334676,1,0,0,1,0,1,0,4.066977,-2.085922,-1.094593
995260,21.729568,2.809812,0.352684,1,0,0,0,0,0,0,3.078674,1.033118,-1.042184
713182,49.862629,1.040213,0.943036,1,0,0,1,0,1,0,3.909272,0.039426,-0.058651
309053,13.585701,3.446598,1.715601,1,0,0,0,0,0,0,2.609018,1.237388,0.539764


In [11]:
val['log_DFH'] = Aplica_Log(val, 'distance_from_home')
val['log_DFLT'] = Aplica_Log(val, 'distance_from_last_transaction')
val['log_RTMPP'] = Aplica_Log(val, 'ratio_to_median_purchase_price')
val.head()

Unnamed: 0_level_0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud,RRUC,RROO,log_DFH,log_DFLT,log_RTMPP
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
310970,58.42688,0.958514,0.26922,1,0,0,1,0,0,1,4.067776,-0.042371,-1.312228
888993,6.989511,26.921132,2.569289,1,0,0,1,0,0,1,1.944411,3.292912,0.943629
318920,52.611446,1.684453,14.136228,1,0,0,1,1,0,1,3.962934,0.521441,2.648741
478495,23.463881,4.677475,0.925283,1,0,0,1,0,0,1,3.155462,1.542758,-0.077656
463796,2.183154,0.214157,0.1117,1,0,0,1,0,0,1,0.780771,-1.541047,-2.191934


## Padronização

In [12]:
numericas = ['log_DFH', 'log_DFLT', 'log_RTMPP']
df_num_treino = Padronizacao(treino, numericas, tipo='padro', nome_sclr = 'scaler_SC_v1.pkl')[0]
df_num_treino.columns = ['log_DFH_padro', 'log_DFLT_padro', 'log_RTMPP_padro']
df_num_treino.head()

O StandardScaler será salvo no caminho: /home/hugo/Documents/Git_GitHub/Estudo_Fraude_CC/vFraude_CC/3.Pre_processamento/scaler_SC_v1.pkl


Unnamed: 0,log_DFH_padro,log_DFLT_padro,log_RTMPP_padro
0,-0.400708,-0.586039,0.092383
1,1.261517,-1.157395,-0.995176
2,0.555955,0.576032,-0.947461
3,1.14893,0.023781,-0.052008
4,0.220662,0.689556,0.492815


In [13]:
with open('/home/hugo/Documents/Git_GitHub/Estudo_Fraude_CC/vFraude_CC/3.Pre_processamento/scaler_SC_v1.pkl','rb') as f:
    sc_v1 = pickle.load(f)

In [14]:
df_num_val = pd.DataFrame(sc_v1.transform(val[numericas]), columns=sc_v1.get_feature_names_out())
df_num_val.columns = ['log_DFH_padro', 'log_DFLT_padro', 'log_RTMPP_padro']
df_num_val.head()

Unnamed: 0,log_DFH_padro,log_DFLT_padro,log_RTMPP_padro
0,1.262088,-0.021678,-1.19332
1,-0.253809,1.831927,0.860512
2,1.187239,0.291664,2.412922
3,0.610776,0.859268,-0.069311
4,-1.084546,-0.854577,-1.994244


## Une todas as transformações

In [15]:
treino = pd.concat([treino.reset_index(drop=True),df_num_treino], axis=1)
treino.head()

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud,RROO,RRUC,log_DFH,log_DFLT,log_RTMPP,log_DFH_padro,log_DFLT_padro,log_RTMPP_padro
0,5.689623,0.3472,1.105109,1,0,0,1,0,1,0,1.738644,-1.057854,0.099944,-0.400708,-0.586039,0.092383
1,58.380209,0.124193,0.334676,1,0,0,1,0,1,0,4.066977,-2.085922,-1.094593,1.261517,-1.157395,-0.995176
2,21.729568,2.809812,0.352684,1,0,0,0,0,0,0,3.078674,1.033118,-1.042184,0.555955,0.576032,-0.947461
3,49.862629,1.040213,0.943036,1,0,0,1,0,1,0,3.909272,0.039426,-0.058651,1.14893,0.023781,-0.052008
4,13.585701,3.446598,1.715601,1,0,0,0,0,0,0,2.609018,1.237388,0.539764,0.220662,0.689556,0.492815


In [16]:
pd.concat([val.reset_index(drop=True),df_num_val], axis=1)

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud,RRUC,RROO,log_DFH,log_DFLT,log_RTMPP,log_DFH_padro,log_DFLT_padro,log_RTMPP_padro
0,58.426880,0.958514,0.269220,1,0,0,1,0,0,1,4.067776,-0.042371,-1.312228,1.262088,-0.021678,-1.193320
1,6.989511,26.921132,2.569289,1,0,0,1,0,0,1,1.944411,3.292912,0.943629,-0.253809,1.831927,0.860512
2,52.611446,1.684453,14.136228,1,0,0,1,1,0,1,3.962934,0.521441,2.648741,1.187239,0.291664,2.412922
3,23.463881,4.677475,0.925283,1,0,0,1,0,0,1,3.155462,1.542758,-0.077656,0.610776,0.859268,-0.069311
4,2.183154,0.214157,0.111700,1,0,0,1,0,0,1,0.780771,-1.541047,-2.191934,-1.084546,-0.854577,-1.994244
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
116995,3.284154,2.413704,0.747849,1,1,0,1,0,1,1,1.189109,0.881163,-0.290554,-0.793028,0.491582,-0.263144
116996,3.238161,1.696026,1.312267,1,0,0,0,0,0,0,1.175005,0.528288,0.271756,-0.803097,0.295470,0.248809
116997,10.294053,10.678856,0.852263,1,0,0,1,0,0,1,2.331566,2.368266,-0.159860,0.022586,1.318049,-0.144153
116998,17.223394,4.106925,0.811681,1,0,0,1,0,0,1,2.846269,1.412675,-0.208648,0.390038,0.786973,-0.188572


In [17]:
val = pd.concat([val.reset_index(drop=True),df_num_val], axis=1)
val.head()

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud,RRUC,RROO,log_DFH,log_DFLT,log_RTMPP,log_DFH_padro,log_DFLT_padro,log_RTMPP_padro
0,58.42688,0.958514,0.26922,1,0,0,1,0,0,1,4.067776,-0.042371,-1.312228,1.262088,-0.021678,-1.19332
1,6.989511,26.921132,2.569289,1,0,0,1,0,0,1,1.944411,3.292912,0.943629,-0.253809,1.831927,0.860512
2,52.611446,1.684453,14.136228,1,0,0,1,1,0,1,3.962934,0.521441,2.648741,1.187239,0.291664,2.412922
3,23.463881,4.677475,0.925283,1,0,0,1,0,0,1,3.155462,1.542758,-0.077656,0.610776,0.859268,-0.069311
4,2.183154,0.214157,0.1117,1,0,0,1,0,0,1,0.780771,-1.541047,-2.191934,-1.084546,-0.854577,-1.994244


# Salva as bases pré-processadas

In [18]:
treino.to_parquet('treino_pp.parquet', engine='fastparquet')
val.to_parquet('validacao_pp.parquet', engine='fastparquet')