In [50]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import StandardScaler, RobustScaler
from imblearn.combine import SMOTETomek

In [51]:
#cargar los datos desde csv
df = pd.read_csv("../data/fraudDataset.csv")
df.head()

Unnamed: 0,a,b,c,d,e,f,g,h,j,k,l,m,n,o,p,fecha,monto,score,fraude
0,4,0.6812,50084.12,50.0,0.0,20.0,AR,1,cat_d26ab52,0.365475,2479.0,952.0,1,,Y,2020-03-20 09:28:19,57.63,100,0
1,4,0.6694,66005.49,0.0,0.0,2.0,AR,1,cat_ea962fb,0.612728,2603.0,105.0,1,Y,Y,2020-03-09 13:58:28,40.19,25,0
2,4,0.4718,7059.05,4.0,0.463488,92.0,BR,25,cat_4c2544e,0.651835,2153.0,249.0,1,Y,Y,2020-04-08 12:25:55,5.77,23,0
3,4,0.726,10043.1,24.0,0.046845,43.0,BR,43,cat_1b59ee3,0.692728,4845.0,141.0,1,N,Y,2020-03-14 11:46:13,40.89,23,0
4,4,0.7758,16584.42,2.0,0.154616,54.0,BR,0,cat_9bacaa5,0.201354,2856.0,18.0,1,Y,N,2020-03-23 14:17:13,18.98,71,0


## Limpieza de Datos

Tratamiento de datos faltantes (nan)

In [52]:
# Imputar con la mediana
df_processed = df.copy()
df_processed['b'].fillna(df_processed['b'].median(), inplace=True) #por tener distribución sesgada
df_processed['c'].fillna(df_processed['c'].median(), inplace=True) #altamente sesgada con outliers
df_processed['d'].fillna(df_processed['d'].median(), inplace=True) #sesgada
df_processed['f'].fillna(df_processed['f'].median(), inplace=True) #sesgada
df_processed['l'].fillna(df_processed['l'].median(), inplace=True) #sesgada
df_processed['m'].fillna(df_processed['m'].median(), inplace=True) #sesgada

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_processed['b'].fillna(df_processed['b'].median(), inplace=True) #por tener distribución sesgada
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_processed['c'].fillna(df_processed['c'].median(), inplace=True) #altamente sesgada con outliers
The behavior will change in pandas

In [53]:
df_processed['o'].fillna('UN', inplace=True) #tiene muchos valores nulos luego llenamos con UNKNOWN
df_processed['g'].fillna(df_processed['g'].mode()[0], inplace=True) #categórica luego llenamos con la moda

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_processed['o'].fillna('UN', inplace=True) #tiene muchos valores nulos luego llenamos con UNKNOWN
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_processed['g'].fillna(df_processed['g'].mode()[0], inplace=True) #categórica luego llenamos con la moda


Dividir la fecha en valores numéricos para el tratamiento en el modelo

In [54]:
# convertir fecha a date_time y extraer características
df_processed['fecha'] = pd.to_datetime(df_processed['fecha'])
df_processed['dia'] = df_processed['fecha'].dt.day
df_processed['mes'] = df_processed['fecha'].dt.month
df_processed['hora'] = df_processed['fecha'].dt.hour

In [55]:


# Normalización estándar
scaler = StandardScaler()
df_processed[['normalized_monto', 'score', 'b', 'k']] = scaler.fit_transform(df_processed[['monto', 'score', 'b', 'k']])

# Transformación logarítmica para distribuciones sesgadas con valores positivos
df_processed['c'] = np.log1p(df_processed['c'])
df_processed['m'] = np.log1p(df_processed['m'])

# Escalado robusto por la presencia de outliers y valores negativos
robust_scaler = RobustScaler()
df_processed[['l','f']] = robust_scaler.fit_transform(df_processed[['l','f']])

# One-Hot Encoding para variables categóricas
df_processed = pd.get_dummies(df_processed, columns=['a', 'g', 'o', 'p'], drop_first=True)

Normalizaciones y escalado

Verificar coeficiente de Cramer para ver correlación con la variable objetivo de la j que tiene muchas categorías.

In [56]:
import pandas as pd
import scipy.stats as stats

# Crear una tabla de contingencia entre 'j' y 'fraude'
contingency_table = pd.crosstab(df_processed['j'], df_processed['fraude'])

# Calcular el estadístico de Chi-Cuadrado y el coeficiente de Cramér's V
chi2, p, dof, expected = stats.chi2_contingency(contingency_table)
n = contingency_table.sum().sum()
phi2 = chi2 / n
r, k = contingency_table.shape
cramers_v = np.sqrt(phi2 / min((k-1), (r-1)))

print(f"Coeficiente de Cramér's V: {cramers_v:.4f}")

Coeficiente de Cramér's V: 0.2777


Se decide por baja relación dada por el coeficiente de Cramér, quitar la columna j

In [57]:
df_processed.drop(columns=['j','fecha'], inplace=True)


In [58]:
df_processed.to_csv("../data/df_processed.csv", index=False)

In [59]:
# Separar las características (X) y la variable objetivo (y)
X = df_processed.drop(columns=['fraude'])
y = df_processed['fraude']

# Aplicar SMOTE + Tomek Links
smt = SMOTETomek(random_state=42)
X_balanced, y_balanced = smt.fit_resample(X, y)

# Crear un nuevo DataFrame balanceado
df_processed_balanced = pd.DataFrame(X_balanced, columns=X.columns)
df_processed_balanced['fraude'] = y_balanced

# Verificar el balance de clases
print(df_processed_balanced['fraude'].value_counts())

fraude
0    142359
1    142359
Name: count, dtype: int64


In [60]:
df_processed_balanced.to_csv("../data/df_processed_balanced.csv", index=False)