In [10]:
import numpy as np
import pandas as pd

# GENERAL CONFIGURATION
n = 5000  # number of synthetic customers to generate
SEED = 42  # fixed seed so that Restart & Run All reproduces the same dataset

# Seed NumPy's legacy global generator. Every np.random.* call in this
# notebook draws from it, so this single line makes the simulation repeatable.
np.random.seed(SEED)


# ASSIGN HIDDEN SEGMENTS
# Each customer receives a latent risk segment, drawn independently with
# probabilities 0.40 / 0.35 / 0.25 (low / medium / high risk).
segmentos = np.random.choice(
    ["bajo_riesgo", "medio_riesgo", "alto_riesgo"],
    size=n,
    p=[0.40, 0.35, 0.25],
)

# GENERATE VARIABLES PER SEGMENT

# Sampler arguments for each segment, keyed by feature:
#   score, ingresos, frecuencia_pago -> np.random.normal(mean, std)
#   uso_credito                      -> np.random.beta(a, b)
#   historial_impagos                -> np.random.poisson(lam)
#   saldo_actual                     -> np.random.gamma(shape, scale)
#   comportamiento_app               -> np.random.choice(levels, p=probabilities)
parametros_segmento = {
    "bajo_riesgo": {
        "score": (780, 40),
        "ingresos": (15000, 3000),
        "uso_credito": (2, 10),
        "historial_impagos": 0.1,
        "frecuencia_pago": (6, 2),
        "saldo_actual": (2, 1500),
        "comportamiento_app": (["alto", "medio"], [0.7, 0.3]),
    },
    "medio_riesgo": {
        "score": (620, 60),
        "ingresos": (9000, 2500),
        "uso_credito": (3, 5),
        "historial_impagos": 0.5,
        "frecuencia_pago": (4, 2),
        "saldo_actual": (2, 2000),
        "comportamiento_app": (["medio", "bajo"], [0.6, 0.4]),
    },
    "alto_riesgo": {
        "score": (500, 80),
        "ingresos": (6000, 2000),
        "uso_credito": (5, 3),
        "historial_impagos": 1.2,
        "frecuencia_pago": (2, 2),
        "saldo_actual": (2, 3000),
        # High-risk customers always show "bajo" app engagement.
        "comportamiento_app": (["bajo"], [1.0]),
    },
}

# Output arrays, one value per customer; filled segment by segment below.
score = np.zeros(n)
ingresos = np.zeros(n)
uso_credito = np.zeros(n)
historial_impagos = np.zeros(n)
frecuencia_pago = np.zeros(n)
saldo_actual = np.zeros(n)
# Customer tenure does not depend on the segment: uniform on [0, 60).
antiguedad = np.random.uniform(0, 60, n)
app_por_cliente = np.empty(n, dtype=object)

# Boolean membership mask per segment.
etiquetas = np.asarray(segmentos)
es_bajo = etiquetas == "bajo_riesgo"
es_medio = etiquetas == "medio_riesgo"
mascaras = {
    "bajo_riesgo": es_bajo,
    "medio_riesgo": es_medio,
    # Catch-all: anything that is neither low nor medium risk is treated
    # as high risk.
    "alto_riesgo": ~(es_bajo | es_medio),
}

# One vectorized draw per feature and segment instead of one scalar draw per
# customer: same distributions, far fewer Python-level RNG calls.
for nombre, mascara in mascaras.items():
    params = parametros_segmento[nombre]
    k = int(mascara.sum())  # number of customers in this segment

    score[mascara] = np.random.normal(*params["score"], size=k)
    ingresos[mascara] = np.random.normal(*params["ingresos"], size=k)
    uso_credito[mascara] = np.random.beta(*params["uso_credito"], size=k)
    historial_impagos[mascara] = np.random.poisson(params["historial_impagos"], size=k)
    frecuencia_pago[mascara] = np.random.normal(*params["frecuencia_pago"], size=k)
    saldo_actual[mascara] = np.random.gamma(*params["saldo_actual"], size=k)

    niveles, probs = params["comportamiento_app"]
    app_por_cliente[mascara] = np.random.choice(niveles, size=k, p=probs)

# Exposed as a plain Python list of strings, one entry per customer.
comportamiento_app = app_por_cliente.tolist()

# Keep the simulated features inside valid ranges.
score = score.clip(min=300, max=900)                   # bounded to [300, 900]
ingresos = ingresos.clip(min=1500)                     # floor of 1500, no ceiling
frecuencia_pago = frecuencia_pago.clip(min=0, max=12)  # bounded to [0, 12]


# GENERATE DEFAULT PROBABILITY

# Simulated logistic model: an intercept plus one additive term per driver.
# The linear predictor rises as score and income fall below their reference
# points (650 and 8000) and as credit usage and past defaults increase.
intercepto = -6
efecto_score = 0.003 * (650 - score)
efecto_ingresos = 0.00020 * (8000 - ingresos)
efecto_uso = 1.2 * uso_credito
efecto_historial = 0.7 * historial_impagos

logit = intercepto + efecto_score + efecto_ingresos + efecto_uso + efecto_historial

# Sigmoid maps the linear predictor to a probability in (0, 1).
denominador = 1 + np.exp(-logit)
prob_impago = 1 / denominador

# Final binary target: one Bernoulli trial per customer with its own probability.
impago = np.random.binomial(1, prob_impago)


# BUILD FINAL DATASET

# One identifier per customer: "C" followed by the zero-padded row number.
ids_cliente = ["C" + str(i).zfill(5) for i in range(n)]

# Column name -> values; continuous features are rounded for readability and
# the count of past defaults is stored as an integer.
columnas = {
    "ID_cliente": ids_cliente,
    "segmento_oculto": segmentos,
    "score_crediticio": score.round(2),
    "ingresos_mensuales": ingresos.round(2),
    "uso_credito": uso_credito.round(3),
    "historial_impagos": historial_impagos.astype(int),
    "frecuencia_pago": frecuencia_pago.round(2),
    "saldo_actual": saldo_actual.round(2),
    "antiguedad_cliente": antiguedad.round(1),
    "comportamiento_app": comportamiento_app,
    "prob_impago": prob_impago.round(4),
    "impago": impago,
}
df = pd.DataFrame(columnas)


# Persist the synthetic customers next to the notebook, without the row index.
df.to_csv("clientes_sinteticos.csv", index=False)

df


Unnamed: 0,ID_cliente,segmento_oculto,score_crediticio,ingresos_mensuales,uso_credito,historial_impagos,frecuencia_pago,saldo_actual,antiguedad_cliente,comportamiento_app,prob_impago,impago
0,C00000,bajo_riesgo,741.58,9255.15,0.341,0,8.29,6613.70,50.7,alto,0.0022,0
1,C00001,bajo_riesgo,705.61,13997.66,0.272,0,7.44,785.34,53.8,alto,0.0009,0
2,C00002,medio_riesgo,661.67,9846.17,0.380,1,4.16,2240.38,20.9,bajo,0.0052,0
3,C00003,bajo_riesgo,876.87,12534.40,0.125,0,5.75,1826.90,54.0,alto,0.0006,0
4,C00004,bajo_riesgo,782.47,7414.67,0.189,0,8.69,481.42,23.6,medio,0.0023,0
...,...,...,...,...,...,...,...,...,...,...,...,...
4995,C04995,bajo_riesgo,797.26,11092.12,0.357,0,6.32,3788.02,19.6,alto,0.0013,0
4996,C04996,bajo_riesgo,788.67,14192.04,0.157,0,7.19,3332.34,24.6,medio,0.0006,0
4997,C04997,bajo_riesgo,742.68,11440.65,0.144,0,5.30,1325.41,55.1,medio,0.0011,0
4998,C04998,medio_riesgo,693.56,7733.05,0.138,1,2.84,732.79,25.1,bajo,0.0054,0


In [11]:
# Class balance check: number of customers for each value of the default flag.
conteo_por_clase = df.groupby("impago")["impago"].count()
conteo_por_clase

impago
0    4940
1      60
Name: impago, dtype: int64