In [12]:
import numpy as np
import pandas as pd

# GENERAL CONFIGURATION
SEED = 42  # fixed seed so a fresh "Restart & Run All" regenerates the same dataset
rng = np.random.default_rng(SEED)  # local generator: no dependence on global np.random state

n = 5000  # number of clients


# ASSIGN HIDDEN SEGMENTS
segmentos = rng.choice(
    ["bajo_riesgo", "medio_riesgo", "alto_riesgo"],
    size=n,
    p=[0.40, 0.35, 0.25]
)

# GENERATE VARIABLES PER SEGMENT

# Distribution parameters of every feature, keyed by hidden segment:
#   score / ingresos / frecuencia_pago -> normal(mean, std)
#   uso_credito                        -> beta(a, b)
#   historial_impagos                  -> poisson(lam)
#   saldo_actual                       -> gamma(shape, scale)
#   comportamiento_app                 -> categorical (levels, probabilities)
parametros_segmento = {
    "bajo_riesgo": {
        "score": (780, 40),
        "ingresos": (15000, 3000),
        "uso_credito": (2, 10),
        "historial_impagos": 0.1,
        "frecuencia_pago": (6, 2),
        "saldo_actual": (2, 1500),
        "comportamiento_app": (["alto", "medio"], [0.7, 0.3]),
    },
    "medio_riesgo": {
        "score": (620, 60),
        "ingresos": (9000, 2500),
        "uso_credito": (3, 5),
        "historial_impagos": 0.5,
        "frecuencia_pago": (4, 2),
        "saldo_actual": (2, 2000),
        "comportamiento_app": (["medio", "bajo"], [0.6, 0.4]),
    },
    "alto_riesgo": {
        "score": (500, 80),
        "ingresos": (6000, 2000),
        "uso_credito": (5, 3),
        "historial_impagos": 1.2,
        "frecuencia_pago": (2, 2),
        "saldo_actual": (2, 3000),
        "comportamiento_app": (["bajo"], [1.0]),  # high-risk clients are always "bajo"
    },
}

# Every client must belong to a configured segment, otherwise its features
# would silently stay at the zero placeholders allocated below.
assert np.isin(segmentos, list(parametros_segmento)).all(), "unknown segment label"

score = np.zeros(n)
ingresos = np.zeros(n)
uso_credito = np.zeros(n)
historial_impagos = np.zeros(n)
frecuencia_pago = np.zeros(n)
saldo_actual = np.zeros(n)
antiguedad = rng.uniform(0, 60, n)  # drawn independently of the segment

comportamiento_app = np.empty(n, dtype=object)

# One vectorised draw per (segment, feature) instead of one scalar draw per client.
for seg, params in parametros_segmento.items():
    mascara = segmentos == seg
    k = int(mascara.sum())  # number of clients in this segment

    score[mascara] = rng.normal(*params["score"], size=k)
    ingresos[mascara] = rng.normal(*params["ingresos"], size=k)
    uso_credito[mascara] = rng.beta(*params["uso_credito"], size=k)
    historial_impagos[mascara] = rng.poisson(params["historial_impagos"], size=k)
    frecuencia_pago[mascara] = rng.normal(*params["frecuencia_pago"], size=k)
    saldo_actual[mascara] = rng.gamma(*params["saldo_actual"], size=k)

    niveles, pesos = params["comportamiento_app"]
    comportamiento_app[mascara] = rng.choice(niveles, size=k, p=pesos)

# Keep the historical type of this variable: a plain Python list of strings.
comportamiento_app = comportamiento_app.tolist()

# Clamp to valid ranges (the normal draws are unbounded)
score = np.clip(score, 300, 900)
ingresos = np.clip(ingresos, 1500, None)
frecuencia_pago = np.clip(frecuencia_pago, 0, 12)


# GENERATE DEFAULT PROBABILITY

# Simulated logistic model: risk rises with a low score, low income,
# high credit utilisation and past defaults.
# NOTE(review): with the -6 intercept the overall default rate is only around
# 1-2 %, which makes the target heavily imbalanced -- confirm this is intended.
logit = (
    -6
    + 0.003 * (650 - score)
    + 0.00020 * (8000 - ingresos)
    + 1.2 * uso_credito
    + 0.7 * historial_impagos
)

prob_impago = 1 / (1 + np.exp(-logit))

# Final binary target: one Bernoulli draw per client
impago = rng.binomial(1, prob_impago)


# BUILD FINAL DATASET

df = pd.DataFrame({
    "ID_cliente": [f"C{i:05d}" for i in range(n)],
    "segmento_oculto": segmentos,
    "score_crediticio": np.round(score, 2),
    "ingresos_mensuales": np.round(ingresos, 2),
    "uso_credito": np.round(uso_credito, 3),
    "historial_impagos": historial_impagos.astype(int),
    "frecuencia_pago": np.round(frecuencia_pago, 2),
    "saldo_actual": np.round(saldo_actual, 2),
    "antiguedad_cliente": np.round(antiguedad, 1),
    "comportamiento_app": comportamiento_app,
    "prob_impago": np.round(prob_impago, 4),
    "impago": impago
})


df.to_csv("clientes_sinteticos.csv", index=False)

df


Unnamed: 0,ID_cliente,segmento_oculto,score_crediticio,ingresos_mensuales,uso_credito,historial_impagos,frecuencia_pago,saldo_actual,antiguedad_cliente,comportamiento_app,prob_impago,impago
0,C00000,bajo_riesgo,802.59,14813.81,0.214,0,8.31,4254.65,49.6,alto,0.0005,0
1,C00001,medio_riesgo,638.67,9237.82,0.443,2,5.26,10084.21,11.7,bajo,0.0136,0
2,C00002,bajo_riesgo,751.76,15392.89,0.022,0,9.49,808.37,21.5,alto,0.0004,0
3,C00003,bajo_riesgo,749.43,13244.53,0.202,0,3.93,3160.19,8.1,alto,0.0008,0
4,C00004,alto_riesgo,707.97,7969.93,0.579,2,4.05,7330.93,34.9,bajo,0.0167,0
...,...,...,...,...,...,...,...,...,...,...,...,...
4995,C04995,medio_riesgo,661.17,9300.50,0.407,0,6.35,10252.52,29.5,medio,0.0030,0
4996,C04996,bajo_riesgo,795.24,20834.23,0.127,0,7.34,438.65,37.1,alto,0.0001,0
4997,C04997,bajo_riesgo,801.99,16534.01,0.076,0,4.29,1140.02,38.1,alto,0.0003,0
4998,C04998,alto_riesgo,416.68,9568.92,0.815,0,0.71,6777.44,52.8,bajo,0.0096,0


In [13]:
# Number of clients in each class of the binary `impago` target.
df["impago"].groupby(df["impago"]).count()

impago
0    4933
1      67
Name: impago, dtype: int64