In [2]:
import numpy as np
import pandas as pd

# GENERAL CONFIGURATION
# The legacy global NumPy RNG is seeded once; every draw below consumes from
# that single stream, so the ORDER of the random calls determines the dataset.
np.random.seed(42)
n = 5000  # number of customers

# ASSIGN HIDDEN SEGMENTS
# Each customer gets a latent risk segment with fixed mixing proportions.
segmentos = np.random.choice(
    ["bajo_riesgo", "medio_riesgo", "alto_riesgo"],
    size=n,
    p=[0.40, 0.35, 0.25],
)

# Customer tenure is drawn for all rows at once, before any per-row draws.
antiguedad = np.random.uniform(0, 60, n)

# GENERATE VARIABLES PER SEGMENT
# Distribution parameters for each segment:
#   score / ingresos / frecuencia_pago -> normal(mean, std)
#   uso_credito                        -> beta(a, b)
#   historial_impagos                  -> poisson(lam)
#   saldo_actual                       -> gamma(shape, scale)
#   app                                -> (levels, probabilities), or None when
#                                         the level is fixed to "bajo" and no
#                                         random draw is made.
PERFILES = {
    "bajo_riesgo": {
        "score": (780, 40),
        "ingresos": (15000, 3000),
        "uso_credito": (2, 10),
        "historial_impagos": 0.1,
        "frecuencia_pago": (6, 2),
        "saldo_actual": (2, 1500),
        "app": (["alto", "medio"], [0.7, 0.3]),
    },
    "medio_riesgo": {
        "score": (620, 60),
        "ingresos": (9000, 2500),
        "uso_credito": (3, 5),
        "historial_impagos": 0.5,
        "frecuencia_pago": (4, 2),
        "saldo_actual": (2, 2000),
        "app": (["medio", "bajo"], [0.6, 0.4]),
    },
    "alto_riesgo": {
        "score": (500, 80),
        "ingresos": (6000, 2000),
        "uso_credito": (5, 3),
        "historial_impagos": 1.2,
        "frecuencia_pago": (2, 2),
        "saldo_actual": (2, 3000),
        "app": None,
    },
}

# Pre-allocated per-customer feature arrays, filled row by row below.
score = np.zeros(n)
ingresos = np.zeros(n)
uso_credito = np.zeros(n)
historial_impagos = np.zeros(n)
frecuencia_pago = np.zeros(n)
saldo_actual = np.zeros(n)
comportamiento_app = []

for fila, etiqueta in enumerate(segmentos):
    # Any label that is not low/medium risk is treated as high risk.
    perfil = PERFILES.get(etiqueta, PERFILES["alto_riesgo"])

    # Draw order per row is fixed: score, income, credit usage, past
    # defaults, payment frequency, balance, then (optionally) app behaviour.
    score[fila] = np.random.normal(*perfil["score"])
    ingresos[fila] = np.random.normal(*perfil["ingresos"])
    uso_credito[fila] = np.random.beta(*perfil["uso_credito"])
    historial_impagos[fila] = np.random.poisson(perfil["historial_impagos"])
    frecuencia_pago[fila] = np.random.normal(*perfil["frecuencia_pago"])
    saldo_actual[fila] = np.random.gamma(*perfil["saldo_actual"])

    if perfil["app"] is None:
        comportamiento_app.append("bajo")
    else:
        niveles, pesos = perfil["app"]
        comportamiento_app.append(np.random.choice(niveles, p=pesos))

# Corrections so values stay in valid ranges
score = score.clip(300, 900)
ingresos = ingresos.clip(1500, None)  # lower bound only
frecuencia_pago = frecuencia_pago.clip(0, 12)


# GENERATE PROBABILITY OF DEFAULT

# Simulated logistic function: each term raises the log-odds for a lower
# score, lower income, higher credit usage and more past defaults.
aporte_score = 0.003 * (650 - score)
aporte_ingresos = 0.00020 * (8000 - ingresos)
aporte_uso = 1.2 * uso_credito
aporte_impagos = 0.7 * historial_impagos

# Summed left to right in the same order as the terms are defined above.
logit = -6 + aporte_score + aporte_ingresos + aporte_uso + aporte_impagos

prob_impago = 1 / (1 + np.exp(-logit))

# Final binary target: one Bernoulli draw per customer.
impago = np.random.binomial(1, prob_impago)


# BUILD FINAL DATASET

columnas = {
    "ID_cliente": [f"C{k:05d}" for k in range(n)],
    "segmento_oculto": segmentos,
    "score_crediticio": score.round(2),
    "ingresos_mensuales": ingresos.round(2),
    "uso_credito": uso_credito.round(3),
    "historial_impagos": historial_impagos.astype(int),
    "frecuencia_pago": frecuencia_pago.round(2),
    "saldo_actual": saldo_actual.round(2),
    "antiguedad_cliente": antiguedad.round(1),
    "comportamiento_app": comportamiento_app,
    "prob_impago": prob_impago.round(4),
    "impago": impago,
}
df = pd.DataFrame(columnas)

# Persist the synthetic dataset (without the positional index column).
df.to_csv("clientes_sinteticos.csv", index=False)

df.head(15)


Unnamed: 0,ID_cliente,segmento_oculto,score_crediticio,ingresos_mensuales,uso_credito,historial_impagos,frecuencia_pago,saldo_actual,antiguedad_cliente,comportamiento_app,prob_impago,impago
0,C00000,bajo_riesgo,720.49,11624.44,0.256,0,8.51,4079.53,23.6,medio,0.0013,0
1,C00001,alto_riesgo,524.4,6326.57,0.905,0,1.45,3430.02,28.4,bajo,0.0147,0
2,C00002,medio_riesgo,638.04,10918.98,0.33,0,2.19,3713.53,51.3,medio,0.0021,0
3,C00003,medio_riesgo,638.82,5288.49,0.315,2,5.82,3107.14,20.4,medio,0.0254,0
4,C00004,bajo_riesgo,672.03,11463.99,0.109,0,3.11,3464.19,52.2,alto,0.0013,0
5,C00005,bajo_riesgo,824.84,17416.72,0.252,0,4.28,1435.79,5.3,alto,0.0003,0
6,C00006,bajo_riesgo,782.4,15972.53,0.322,0,4.74,7199.02,46.6,alto,0.0005,0
7,C00007,alto_riesgo,455.27,5079.39,0.776,2,2.37,5175.53,50.9,bajo,0.0759,0
8,C00008,medio_riesgo,560.73,10807.59,0.602,0,2.17,3287.72,10.9,medio,0.0038,0
9,C00009,medio_riesgo,572.09,9553.92,0.462,1,4.64,7037.78,25.8,bajo,0.008,0
