In [1]:

import pandas as pd
import numpy as np
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
import os

RAW_DATA_PATH = "../data/raw/creditcard.csv"
PROCESSED_DATA_PATH = "../data/processed"

# Garante que a pasta processed existe
os.makedirs(PROCESSED_DATA_PATH, exist_ok=True)

In [2]:

df = pd.read_csv(RAW_DATA_PATH)

# Inicializando o Scaler Robusto
rob_scaler = RobustScaler()

# Transformando e criando novas colunas
df['scaled_amount'] = rob_scaler.fit_transform(df['Amount'].values.reshape(-1,1))
df['scaled_time'] = rob_scaler.fit_transform(df['Time'].values.reshape(-1,1))

# Removendo as originais que não servem mais
df.drop(['Time','Amount'], axis=1, inplace=True)

# Reordenando para deixar as escalas no começo
scaled_amount = df['scaled_amount']
scaled_time = df['scaled_time']

df.drop(['scaled_amount', 'scaled_time'], axis=1, inplace=True)
df.insert(0, 'scaled_amount', scaled_amount)
df.insert(1, 'scaled_time', scaled_time)

print("Dados escalados com sucesso!")
display(df.head())

Dados escalados com sucesso!


Unnamed: 0,scaled_amount,scaled_time,V1,V2,V3,V4,V5,V6,V7,V8,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Class
0,1.783274,-0.994983,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,...,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,0
1,-0.269825,-0.994983,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,...,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,0
2,4.983721,-0.994972,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,...,0.52498,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,0
3,1.418291,-0.994972,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,...,-0.208038,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,0
4,0.670579,-0.99496,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,...,0.408542,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,0


In [4]:
#Split e Exportação

# Separando X e y
X = df.drop('Class', axis=1)
y = df['Class']

# Divisão Estratificada (Mantém a proporção de fraudes em ambos)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2, 
    random_state=42, 
    stratify=y
)

# Juntando de volta para salvar em CSV
train_df = pd.concat([X_train, y_train], axis=1)
test_df = pd.concat([X_test, y_test], axis=1)

# Salvando na pasta processed
train_df.to_csv(f"{PROCESSED_DATA_PATH}/train.csv", index=False)
test_df.to_csv(f"{PROCESSED_DATA_PATH}/test.csv", index=False)

print(f"Arquivos salvos em: {PROCESSED_DATA_PATH}")
print(f"Treino shape: {train_df.shape}")
print(f"Teste shape: {test_df.shape}")

Arquivos salvos em: ../data/processed
Treino shape: (227845, 31)
Teste shape: (56962, 31)
