In [1]:
import pandas as pd

# Raw sales records, one inner list per sale. The field order matches the
# `columns` list defined right after this literal: ID_venta, "Producto ",
# FechaVenta, Region, Unidades, Precio_unitario, Total_Venta.
# NOTE(review): several records look dirty on purpose (presumably so the CSV can
# be used for a data-cleaning exercise -- confirm before "fixing" any of them):
#   - the product "Gorra " carries a trailing space wherever it appears;
#   - rows 1008 and 1009 are identical apart from the ID;
#   - rows 1010 and 1020 have Unidades=None together with Total_Venta=0;
#   - row 1011 has Precio_unitario=None while Total_Venta is populated.
data = [
    [1001, "Camiseta Deportiva", "2024-01-15", "Norte", 12, 35000, 420000],
    [1002, "Zapatos Running", "2024-01-17", "Sur", 8, 120000, 960000],
    [1003, "Pantalon Casual", "2024-01-20", "Este", 15, 80000, 1200000],
    [1004, "Camiseta Deportiva", "2024-02-01", "Oeste", 10, 35000, 350000],
    [1005, "Zapatos Running", "2024-02-02", "Norte", 5, 120000, 600000],
    [1006, "Pantalon Casual", "2024-02-04", "Sur", 20, 80000, 1600000],
    [1007, "Gorra ", "2024-02-10", "Este", 25, 25000, 625000],
    [1008, "Buzo Deportivo", "2024-02-12", "Oeste", 10, 95000, 950000],
    [1009, "Buzo Deportivo", "2024-02-12", "Oeste", 10, 95000, 950000],
    [1010, "Pantalon Casual", "2024-03-01", "Norte", None, 80000, 0],
    [1011, "Zapatos Running", "2024-03-05", "Sur", 9, None, 1080000],
    [1012, "Camiseta Deportiva", "2024-03-06", "Este", 10, 35000, 350000],
    [1013, "Gorra ", "2024-03-07", "Oeste", 30, 25000, 750000],
    [1014, "Buzo Deportivo", "2024-03-10", "Sur", 8, 95000, 760000],
    [1015, "Camiseta Deportiva", "2024-04-02", "Norte", 15, 35000, 525000],
    [1016, "Zapatos Running", "2024-04-04", "Oeste", 4, 120000, 480000],
    [1017, "Pantalon Casual", "2024-04-06", "Este", 10, 80000, 800000],
    [1018, "Gorra ", "2024-04-10", "Sur", 12, 25000, 300000],
    [1019, "Buzo Deportivo", "2024-04-12", "Norte", 7, 95000, 665000],
    [1020, "Zapatos Running", "2024-04-15", "Oeste", None, 120000, 0],
    [1021, "Camiseta Deportiva", "2024-05-01", "Este", 10, 35000, 350000],
    [1022, "Buzo Deportivo", "2024-05-03", "Sur", 10, 95000, 950000],
    [1023, "Gorra ", "2024-05-05", "Oeste", 20, 25000, 500000],
    [1024, "Pantalon Casual", "2024-05-08", "Norte", 15, 80000, 1200000],
    [1025, "Zapatos Running", "2024-05-09", "Sur", 8, 120000, 960000],
    [1026, "Camiseta Deportiva", "2024-05-10", "Oeste", 12, 35000, 420000],
    [1027, "Gorra ", "2024-06-01", "Este", 10, 25000, 250000],
    [1028, "Buzo Deportivo", "2024-06-02", "Norte", 11, 95000, 1045000],
    [1029, "Pantalon Casual", "2024-06-04", "Sur", 10, 80000, 800000],
    [1030, "Camiseta Deportiva", "2024-06-06", "Este", 14, 35000, 490000],
]

# Column headers, listed in the same order as the fields of each record in `data`.
# NOTE(review): "Producto " ends with a space -- presumably deliberate, confirm.
columns = [
    "ID_venta",
    "Producto ",
    "FechaVenta",
    "Region",
    "Unidades",
    "Precio_unitario",
    "Total_Venta",
]

# Assemble the records into a DataFrame under those headers.
df = pd.DataFrame(data=data, columns=columns)

# Write the frame to CSV; index=False keeps the row index out of the file.
file_path = "ventas_tienda.csv"
df.to_csv(path_or_buf=file_path, index=False)

In [2]:
from __future__ import annotations
import argparse
from dataclasses import dataclass
from typing import Tuple
import numpy as np
import pandas as pd


@dataclass
class Config:
  """Settings for one synthetic-dataset generation run."""

  # Number of base rows to generate (before any duplicated rows are appended).
  rows: int = 220
  # Seed handed to the NumPy random generator.
  seed: int = 42
  # Path of the CSV file the generated dataset is written to.
  out: str = "clientes_banco.csv"


def _rng(seed: int) -> np.random.Generator:
  return np.random.default_rng(seed)


def _generate_base_features(n: int, rng: np.random.Generator) -> dict:
  regions = ["Norte", "Sur", "Este", "Oeste"]
  employment = ["Empleado", "Independiente", "Desempleado"]
  education = ["Secundaria", "Tecnico", "Profesional", "Posgrado"]
  marital = ["Soltero", "Casado", "Divorciado", "Union libre"]
  purposes = ["Consumo", "Vehiculo", "Vivienda", "Educacion"]
  channels = ["Web", "App", "Oficina"]

  customer_id = np.arange(10001, 10001 + n)
  age = rng.integers(18, 71, size=n)
  income_monthly = rng.normal(3_500_000, 1_500_000, size=n).clip(800_000, 18_000_000).round(-3)
  loan_amount = rng.normal(20_000_000, 8_000_000, size=n).clip(3_000_000, 80_000_000).round(-3)
  loan_term_months = rng.integers(6, 85, size=n)
  interest_rate = rng.normal(0.20, 0.06, size=n).clip(0.08, 0.45).round(3)  # tasa efectiva anual
  credit_score = rng.normal(620, 80, size=n).clip(300, 850).round().astype(int)
  late_payments = rng.poisson(0.8, size=n)
  has_mortgage = rng.choice(["Si", "No"], size=n, p=[0.35, 0.65])
  employment_status = rng.choice(employment, size=n, p=[0.7, 0.25, 0.05])
  education_level = rng.choice(education, size=n, p=[0.25, 0.35, 0.3, 0.1])
  marital_status = rng.choice(marital, size=n)
  region = rng.choice(regions, size=n)
  loan_purpose = rng.choice(purposes, size=n, p=[0.5, 0.2, 0.25, 0.05])
  device_channel = rng.choice(channels, size=n, p=[0.5, 0.4, 0.1])
  current_account_balance = (income_monthly * rng.normal(0.3, 0.15, size=n)).clip(0, None).round(-3)

  # Fechas distribuidas en 2024
  dates = pd.to_datetime("2024-01-01") + pd.to_timedelta(rng.integers(0, 365, size=n), unit="D")
  application_date = dates.strftime("%Y-%m-%d")

  return {
    "customer_id": customer_id,
    "application_date": application_date,
    "age": age,
    "income_monthly": income_monthly.astype(int),
    "loan_amount": loan_amount.astype(int),
    "loan_term_months": loan_term_months,
    "interest_rate": interest_rate,
    "credit_score": credit_score,
    "num_late_payments_last_year": late_payments,
    "has_mortgage": has_mortgage,
    "employment_status": employment_status,
    "education_level": education_level,
    "marital_status": marital_status,
    "region": region,
    "loan_purpose": loan_purpose,
    "device_channel": device_channel,
    "current_account_balance": current_account_balance.astype(int),
  }


def _generate_target(df: pd.DataFrame, rng: np.random.Generator) -> np.ndarray:
  """
  Probabilidad de default basada en una combinación lineal de rasgos
  y pasada por función logística.
  """
  score = (
      -2.5
      + 0.00000006 * (df["loan_amount"] - 20_000_000)
      + 0.9 * (df["interest_rate"] - 0.20)
      - 0.0045 * (df["credit_score"] - 620)
      + 0.18 * df["num_late_payments_last_year"]
      - 0.00000005 * (df["income_monthly"] - 3_500_000)
      + 0.25 * (df["employment_status"] == "Desempleado").astype(float)
      + 0.12 * (df["has_mortgage"] == "No").astype(float)
  )
  prob_default = 1 / (1 + np.exp(-score))
  return rng.binomial(1, prob_default)


def _inject_quality_issues(df: pd.DataFrame, rng: np.random.Generator) -> pd.DataFrame:
  n = len(df)
  # Faltantes (~4–6% en columnas seleccionadas)
  for col, p in [("income_monthly", 0.06), ("credit_score", 0.06), ("employment_status", 0.04), ("education_level", 0.04)]:
    mask = rng.random(n) < p
    df.loc[mask, col] = np.nan

  # Outliers numéricos
  df.loc[rng.choice(df.index, 2, replace=False), "income_monthly"] = [25_000_000, 30_000_000]
  df.loc[rng.choice(df.index, 2, replace=False), "loan_amount"] = [90_000_000, 100_000_000]

  # Inconsistencias categóricas (para limpieza)
  ix = rng.choice(df.index, 3, replace=False)
  df.loc[ix[0], "employment_status"] = "empleado"  # lowercase
  df.loc[ix[1], "region"] = "Norte "               # espacio al final
  df.loc[ix[2], "loan_purpose"] = "consumo"        # lowercase

  # Duplicados intencionales
  dup_rows = df.sample(2, random_state=7)
  df = pd.concat([df, dup_rows], ignore_index=True)

  # Mezclar filas
  df = df.sample(frac=1, random_state=13).reset_index(drop=True)
  return df


def build_dataset(cfg: Config) -> Tuple[pd.DataFrame, dict]:
  """Generate the synthetic dataset described by ``cfg`` plus a short summary.

  Args:
    cfg: Run settings; ``cfg.seed`` seeds the generator and ``cfg.rows`` is
      the number of base rows requested.

  Returns:
    ``(df, summary)`` where ``df`` is the final frame (with injected quality
    issues) and ``summary`` holds the row count, the mean of the ``default``
    column, per-column missing counts and up to four ``customer_id`` values
    taken from duplicated rows.
  """
  generator = _rng(cfg.seed)
  frame = pd.DataFrame(_generate_base_features(cfg.rows, generator))

  # The target is sampled before quality issues are injected, so it is
  # computed from the clean feature values.
  frame["default"] = _generate_target(frame, generator).astype(int)

  # Deliberate data-quality problems.
  frame = _inject_quality_issues(frame, generator)

  # Quick summary for console display.
  missing_counts = {}
  for column in frame.columns:
    missing_counts[column] = int(frame[column].isna().sum())
  duplicated_rows = frame[frame.duplicated(keep=False)]

  summary = {
    "rows": int(len(frame)),
    "positive_rate_default": float(frame["default"].mean()),
    "missing_counts": missing_counts,
    "duplicates_example_ids": duplicated_rows["customer_id"].head(4).tolist(),
  }
  return frame, summary

# Build the dataset with the settings below and write it to `cfg.out` as CSV
# (index=False keeps the row index out of the file).
cfg = Config(
  rows=220,
  seed=42,
  out="clientes_banco.csv",
)
df, summary = build_dataset(cfg)
df.to_csv(path_or_buf=cfg.out, index=False)
