In [1]:
import pandas as pd 
import numpy as np 

In [2]:
# Seed NumPy's global RNG so the synthetic data is identical on every run.
np.random.seed(42)

# Number of customer records to generate.
n = 500

# Draw the columns one at a time. The global RNG is consumed sequentially, so
# the order of these draws must stay fixed or the generated values change.
columnas = {"cliente_id": range(1, n + 1)}
columnas["edad"] = np.random.randint(low=18, high=70, size=n)
columnas["ingreso_mensual"] = np.random.normal(loc=800000, scale=250000, size=n)
columnas["monto_credito"] = np.random.normal(loc=3000000, scale=1200000, size=n)
columnas["score_crediticio"] = np.random.randint(low=300, high=850, size=n)
columnas["tipo_cliente"] = np.random.choice(["Nuevo", "Recurrente", "Premium"], size=n)
columnas["tiene_mora"] = np.random.choice([0, 1], size=n, p=[0.8, 0.2])

# Assemble the synthetic customer dataset (one row per customer).
df = pd.DataFrame(columnas)

In [3]:
# Preview the first rows of the generated dataset.
df.head()

Unnamed: 0,cliente_id,edad,ingreso_mensual,monto_credito,score_crediticio,tipo_cliente,tiene_mora
0,1,56,1330539.0,1828952.0,680,Recurrente,0
1,2,69,1058116.0,4264370.0,357,Premium,0
2,3,46,420157.5,1860721.0,558,Premium,0
3,4,32,678941.5,6158858.0,641,Nuevo,0
4,5,60,1116728.0,3591981.0,429,Premium,1


In [4]:
# Summarise each column's dtype and non-null count.
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   cliente_id        500 non-null    int64  
 1   edad              500 non-null    int32  
 2   ingreso_mensual   500 non-null    float64
 3   monto_credito     500 non-null    float64
 4   score_crediticio  500 non-null    int32  
 5   tipo_cliente      500 non-null    str    
 6   tiene_mora        500 non-null    int64  
dtypes: float64(2), int32(2), int64(2), str(1)
memory usage: 23.6 KB


In [5]:
# score_crediticio is generated as an integer column, and integer dtypes cannot
# hold NaN. Cast it to float explicitly so the assignment below does not depend
# on pandas implicitly upcasting the column during a .loc write.
df["score_crediticio"] = df["score_crediticio"].astype("float64")

# Blank out ingreso_mensual in 20 distinct, randomly chosen rows.
df.loc[np.random.choice(df.index, 20, replace=False), "ingreso_mensual"] = np.nan

# Blank out score_crediticio in 15 distinct, randomly chosen rows.
df.loc[np.random.choice(df.index, 15, replace=False), "score_crediticio"] = np.nan

In [6]:
# Count the missing values in each column.
df.isnull().sum()

cliente_id           0
edad                 0
ingreso_mensual     20
monto_credito        0
score_crediticio    15
tipo_cliente         0
tiene_mora           0
dtype: int64

In [7]:
# Choose 5 distinct random rows and overwrite ingreso_mensual with an extreme value.
filas_ingreso = np.random.choice(df.index, size=5, replace=False)
df.loc[filas_ingreso, "ingreso_mensual"] = 50000000

# Choose 5 distinct random rows and overwrite monto_credito with an extreme value.
filas_monto = np.random.choice(df.index, size=5, replace=False)
df.loc[filas_monto, "monto_credito"] = 20000000

In [8]:
# Summary statistics for the numeric columns; the injected outliers show up in max.
df.describe()

Unnamed: 0,cliente_id,edad,ingreso_mensual,monto_credito,score_crediticio,tiene_mora
count,500.0,500.0,480.0,500.0,485.0,500.0
mean,250.5,44.22,1299420.0,3321858.0,579.731959,0.228
std,144.481833,15.036082,5007910.0,2053354.0,159.202436,0.419963
min,1.0,18.0,-10316.84,-475506.5,306.0,0.0
25%,125.75,32.0,617142.6,2333677.0,441.0,0.0
50%,250.5,45.0,792897.5,3156644.0,569.0,0.0
75%,375.25,57.0,957166.6,3953365.0,719.0,0.0
max,500.0,69.0,50000000.0,20000000.0,849.0,1.0


In [9]:
# Write the dataset to a CSV file, leaving out the row index.
df.to_csv("dataset_fintech_sucio.csv", index=False)