In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from imblearn.over_sampling import SMOTE # Adding this for smote [not compatible with sklearn version] (changing to compatible version in requirements.txt)
from sklearn.metrics import precision_recall_curve

# Load the preprocessed dataset into `df`; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="./data/preprocessed_data.csv")

In [2]:
# Columns excluded from the feature matrix: the target itself, the month
# field (used further down to split the data by time) and x1 / x2.
cols_to_drop = ["fraud_bool", "month", "x1", "x2"]

# Target vector first, then the feature matrix without the excluded columns.
y = df["fraud_bool"]
X = df.drop(cols_to_drop, axis="columns")

In [3]:
# Collect the names of the categorical (object-dtype) feature columns.
cat_cols = X.select_dtypes(include=["object"]).columns

# One-hot encode only those columns; the first level of each column is
# dropped and acts as the implicit baseline.
X = pd.get_dummies(data=X, columns=cat_cols, drop_first=True)

In [4]:
# Temporal split: months 0-5 are used for training, months 6-7 for testing.
df_train = df[df["month"] <= 5]
df_test = df[df["month"] >= 6]

X_train = df_train.drop(columns=cols_to_drop)
X_test = df_test.drop(columns=cols_to_drop)

y_train = df_train["fraud_bool"]
y_test = df_test["fraud_bool"]

# Fix the category levels of every categorical column from the TRAINING split
# only, and apply the same levels to both splits. Encoding each split
# independently with drop_first=True is unsafe: the level that gets dropped is
# the first one *present in that split*, so if the test months lack the first
# training level, a different level is dropped in test and its rows end up
# silently encoded as the baseline after column alignment.
train_levels = {
    col: pd.CategoricalDtype(sorted(X_train[col].dropna().unique()))
    for col in X_train.select_dtypes(include="object").columns
}
X_train = X_train.astype(train_levels)
# Levels that only occur in the test split become NaN here and therefore
# all-zero dummy rows, i.e. no column is created for categories unseen in train.
X_test = X_test.astype(train_levels)

# One-hot encoding (applied to both splits, now with identical levels and
# therefore an identical dropped baseline per column).
X_train = pd.get_dummies(X_train, drop_first=True)
X_test = pd.get_dummies(X_test, drop_first=True)

# Safety net: force the test columns to match the training columns exactly
# (same set, same order); any column missing in test is filled with 0.
X_train, X_test = X_train.align(X_test, join="left", axis=1, fill_value=0)


In [5]:
# Learn the scaling statistics from the training features only, then apply
# the very same transformation to both splits so no test information is used
# when fitting.
scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [6]:
# Persist the scaled feature matrices and the label vectors as .npy files.
for npy_path, array in (
    ("./data/X_train_scaled.npy", X_train_scaled),
    ("./data/X_test_scaled.npy", X_test_scaled),
    ("./data/y_train.npy", y_train.to_numpy()),
    ("./data/y_test.npy", y_test.to_numpy()),
):
    np.save(npy_path, array)


In [7]:
# Oversample the minority (fraud) class of the training split with SMOTE.
# Settings tried so far:
#   [1] sampling_strategy="auto" -> raise the number of frauds to an acceptable
#       level (frauds are only about 1% of the training rows before resampling)
#   [2] sampling_strategy=0.3    -> current setting, to give the model more
#       fraud examples to train on
smote = SMOTE(sampling_strategy=0.3, random_state=42)
resampled = smote.fit_resample(X_train_scaled, y_train)
X_train_smote, y_train_smote = resampled

# Class counts before and after resampling (index 0 = non-fraud, 1 = fraud).
print(f"Distribución original: {np.bincount(y_train)}")
print(f"Distribución después de SMOTE: {np.bincount(y_train_smote)}")

# Store the resampled training set alongside the other prepared arrays.
np.save("./data/X_train_smote.npy", X_train_smote)
np.save("./data/y_train_smote.npy", y_train_smote)

Distribución original: [749950   8378]
Distribución después de SMOTE: [749950 224985]
