In [None]:
from pathlib import Path
import pandas as pd
import numpy as np


df = pd.read_csv("data/terminales_con_sin_transacciones_mensual.csv")

# 2) Tipos y parsing básico
df["periodo"] = pd.to_datetime(df["periodo"] + "-01", format="%Y-%m-%d").dt.to_period(
    "M"
)

# 3) Sanity checks rápidos
print("filas:", len(df))
print("rango periodo:", df["periodo"].min(), "→", df["periodo"].max())

df_duplicados = df[
    df.duplicated(
        subset=["periodo", "rut_comercio", "codigo_local", "numero_terminal"],
        keep=False,
    )
]
dups = len(df_duplicados)
print("duplicados clave periodo×rut×local×terminal:", dups)

# 4) Totales mínimos por fila para adquirirencia
qtrx_cols = [
    "qtrx_visa",
    "qtrx_mastercard",
    "qtrx_amex",
    "qtrx_casas_comerciales",
    "qtrx_vale_electronico",
    "qtrx_ripley",
    "qtrx_hites",
    "qtrx_adquriencia_general",
]
monto_cols = [
    "monto_visa",
    "monto_mastercard",
    "monto_amex",
    "monto_casas_comerciales",
    "monto_vale_electronico",
    "monto_ripley",
    "monto_hites",
    "monto_adquriencia_general",
]

for c in qtrx_cols:
    df[c] = pd.to_numeric(df[c], errors="coerce").fillna(0)
for c in monto_cols:
    df[c] = pd.to_numeric(df[c], errors="coerce").fillna(0.0)

df["qtrx_total"] = df[qtrx_cols].sum(axis=1)
df["monto_clp"] = df[monto_cols].sum(axis=1)

# 5) Vista rápida
df


  df = pd.read_csv("data/terminales_con_sin_transacciones_mensual.csv")


filas: 2408152
rango periodo: 2024-01 → 2024-12
duplicados clave periodo×rut×local×terminal: 2043
duplicados clave periodo×rut×local×terminal: 2043


Unnamed: 0,periodo,rut_comercio,codigo_local,numero_terminal,estado_terminal,tecnologia_instalar,fecha_instalacion,fecha_baja,pos_con_trx,modelo_equipo,...,monto_vale_electronico,qtrx_ripley,monto_ripley,qtrx_hites,monto_hites,qtrx_adquriencia_general,monto_adquriencia_general,margen_bruto_adquirencia,qtrx_total,monto_clp
0,2024-01,12489233-3,37,8,BAJA,POS GPRS,20080218.0,20201004.0,0,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2024-01,6605632-5,160,152,BAJA,POS GPRS,20080430.0,20201004.0,0,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2024-01,11911027-0,311,303,BAJA,POS GPRS,20080605.0,20201004.0,0,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2024-01,9972060-3,379,371,BAJA,POS ETHERNET,20080624.0,20201004.0,0,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2024-01,6302656-5,561,553,BAJA,POS GPRS,20080709.0,20201004.0,0,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2408147,2024-12,15747504-5,177834,1775670,BAJA,POS GPRS,20201223.0,20210927.0,0,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2408148,2024-12,15164443-0,177835,1775680,BAJA,POS MOVIL,20201219.0,20230817.0,0,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2408149,2024-12,14491388-4,176496,1758570,BAJA,POS MOVIL,20201201.0,20241004.0,0,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2408150,2024-12,8750803-K,176500,1758610,BAJA,POS MOVIL,20201126.0,20221024.0,0,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
df["fecha_baja"] = pd.to_numeric(df["fecha_baja"], errors="coerce")
mask = (df["fecha_baja"] < 20240101.0) & (df["fecha_baja"].notnull())
df = df[~mask].copy()
df

Unnamed: 0,periodo,rut_comercio,codigo_local,numero_terminal,estado_terminal,tecnologia_instalar,fecha_instalacion,fecha_baja,pos_con_trx,modelo_equipo,...,monto_vale_electronico,qtrx_ripley,monto_ripley,qtrx_hites,monto_hites,qtrx_adquriencia_general,monto_adquriencia_general,margen_bruto_adquirencia,qtrx_total,monto_clp
5,2024-01,8123393-4,639,631,BAJA_POR_PERDIDA,POS GPRS,20080729.0,,0,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14,2024-01,13453225-4,372,364,HABILITADO,POS GPRS,20080623.0,,1,,...,0.0,0.0,0.0,0.0,0.0,3326.0,14914534.0,0.0,6652.0,29829068.0
23,2024-01,7101190-9,560,552,HABILITADO,POS GPRS,20080708.0,,1,,...,0.0,0.0,0.0,0.0,0.0,502.0,2421660.0,0.0,1004.0,4843320.0
24,2024-01,12665285-2,593,585,BAJA_POR_PERDIDA,POS GPRS,20080717.0,,0,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25,2024-01,6402350-0,711,703,HABILITADO,POS GPRS,20080822.0,,1,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2408142,2024-12,77142345-0,174510,1758530,CON TRANSACCIONES,BOTON_WEB,,,0,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2408144,2024-12,15801551-K,177822,1775510,BAJA,POS MOVIL,20201221.0,20240415.0,0,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2408146,2024-12,9052627-8,130089,1775630,BAJA,POS MOVIL,20201220.0,20240705.0,0,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2408149,2024-12,14491388-4,176496,1758570,BAJA,POS MOVIL,20201201.0,20241004.0,0,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Drop duplicated rows, keeping the first occurrence
key_cols = ["periodo", "rut_comercio", "codigo_local", "numero_terminal"]

print(f"Rows before dropping duplicates: {len(df)}")
df = df.drop_duplicates(subset=key_cols, keep="first")
print(f"Rows after dropping duplicates: {len(df)}")
df.reset_index(drop=True, inplace=True)

Rows before dropping duplicates: 1399968
Rows after dropping duplicates: 1398340
Rows after dropping duplicates: 1398340


In [21]:
df.to_csv("base_con_sin_trx_cleaned.csv", index=False)

In [None]:
# Asegurar columnas totales antes de agrupar
monto_cols = [
    "monto_visa",
    "monto_mastercard",
    "monto_amex",
    "monto_casas_comerciales",
    "monto_vale_electronico",
    "monto_ripley",
    "monto_hites",
    "monto_adquriencia_general",
]
qtrx_cols = [
    "qtrx_visa",
    "qtrx_mastercard",
    "qtrx_amex",
    "qtrx_casas_comerciales",
    "qtrx_vale_electronico",
    "qtrx_ripley",
    "qtrx_hites",
    "qtrx_adquriencia_general",
]
if "monto_clp" not in df.columns:
    for col in monto_cols:
        df[col] = pd.to_numeric(df[col], errors="coerce").fillna(0.0)
    df["monto_clp"] = df[monto_cols].sum(axis=1)
if "qtrx_total" not in df.columns:
    for col in qtrx_cols:
        df[col] = pd.to_numeric(df[col], errors="coerce").fillna(0)
    df["qtrx_total"] = df[qtrx_cols].sum(axis=1)
# Agregación comercio×mes
cm = df.groupby(["periodo", "rut_comercio"], as_index=False).agg(
    qtrx_total=("qtrx_total", "sum"),
    monto_clp=("monto_clp", "sum"),
    n_local=("codigo_local", "nunique"),
    n_terminal=("numero_terminal", "nunique"),
    n_tecnologias=("tecnologia_instalar", "nunique"),
    fecha_baja=("fecha_baja", "min"),
    monto_visa=("monto_visa", "sum"),
    monto_mastercard=("monto_mastercard", "sum"),
    monto_amex=("monto_amex", "sum"),
    rubro=("vertical", "first"),
    estado_terminal=("estado_terminal", "count"),
    comuna=("comuna_suc", "first"),
    region_suc=("region_suc", "first"),
    qtrx_adquriencia_general=("qtrx_adquriencia_general", "sum"),
)

# Métricas rápidas
cm["ticket_promedio"] = np.where(
    cm["qtrx_total"] > 0, cm["monto_clp"] / cm["qtrx_total"], np.nan
)
# cm["activo_mes"] = (cm["qtrx_total"] > 0).astype(int)
cm = cm[cm["monto_clp"] > 0]


cm["mix_cards"] = np.where(
    cm["monto_clp"] > 0, cm["monto_visa"] / cm["monto_clp"], np.nan
)

cm

Unnamed: 0,periodo,rut_comercio,qtrx_total,monto_clp,n_local,n_terminal,n_tecnologias,fecha_baja,monto_visa,monto_mastercard,monto_amex,rubro,estado_terminal,comuna,region_suc,qtrx_adquriencia_general,ticket_promedio,mix_cards
1,2024-01,10000001-6,2296.0,10207680.0,1,1,1,,4690490,413350,0,C. BARRIO,1,TEMUCO,IX REGION,1148.0,4445.853659,0.459506
5,2024-01,10000539-5,8.0,640000.0,1,1,1,,320000,0,0,RETAIL,1,CALDERA,III REGION,4.0,80000.000000,0.500000
8,2024-01,10001448-3,602.0,2849392.0,1,1,1,,1202046,222650,0,C. BARRIO,1,SAN FERNANDO,VI REGION,301.0,4733.209302,0.421861
10,2024-01,10002450-0,56.0,2624420.0,1,1,1,,1192710,119500,0,RETAIL,1,LONCOCHE,IX REGION,28.0,46864.642857,0.454466
15,2024-01,10004726-8,10.0,33322.0,2,2,2,,16661,0,0,C. BARRIO,2,PELARCO,VII REGION,5.0,3332.200000,0.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
754433,2024-12,9964268-8,4392.0,18515320.0,1,1,1,,8330220,927440,0,C. BARRIO,1,TALCA,VII REGION,2196.0,4215.692168,0.449910
754439,2024-12,9971562-6,1572.0,7970200.0,1,1,1,,2833730,1151370,0,ALIMENTACION,1,PUDAHUEL,METROPOLITANA,786.0,5070.101781,0.355541
754454,2024-12,9983607-5,2.0,32180.0,1,1,1,,16090,0,0,C. BARRIO,1,ROMERAL,VII REGION,1.0,16090.000000,0.500000
754464,2024-12,9994855-8,3584.0,34920820.0,1,3,1,20240418.0,12844260,4616150,0,C. BARRIO,3,SAN JOSE DE MAIPO,METROPOLITANA,1792.0,9743.532366,0.367811


In [None]:
cm.to_csv("agregacion_comercio_mes.csv", index=False)

In [None]:
# QA corto
print("tx df:", int(df["qtrx_total"].sum()), "tx cm:", int(cm["qtrx_total"].sum()))
print(
    "monto df:",
    round(df["monto_clp"].sum(), 2),
    "monto cm:",
    round(cm["monto_clp"].sum(), 2),
)
print("rango cm:", cm["periodo"].min(), "→", cm["periodo"].max())


In [None]:
# Guardar
# out = Path("data/processed"); out.mkdir(parents=True, exist_ok=True)
# cm_save = cm.copy()
# cm_save["periodo"] = cm_save["periodo"].astype(str)  # "YYYY-MM"
# cm_save.to_parquet(out/"comercio_mes.parquet", index=False)
# cm_save.to_csv(out/"comercio_mes_sample.csv", index=False)


In [None]:
df["fecha_instalacion"] = pd.to_datetime(df["fecha_instalacion"], errors="coerce")
df["fecha_baja"] = pd.to_datetime(df["fecha_baja"], errors="coerce")


In [None]:
df.drop(columns=["modelo_equipo"], inplace=True)

In [None]:
print(f"estado_terminal ({df['estado_terminal'].nunique()} valores únicos):")
print(df["estado_terminal"].unique())
print(f"\nValue counts:")
print(df["estado_terminal"].value_counts(dropna=False))

print("\n" + "=" * 50 + "\n")

print(f"tecnologia_instalar ({df['tecnologia_instalar'].nunique()} valores únicos):")
print(df["tecnologia_instalar"].unique())
print(f"\nValue counts:")
print(df["tecnologia_instalar"].value_counts(dropna=False))

print("\n" + "=" * 50 + "\n")

print(f"modelo_equipo ({df['modelo_equipo'].nunique()} valores únicos):")
print(df["modelo_equipo"].unique())
print(f"\nValue counts:")
print(df["modelo_equipo"].value_counts(dropna=False))

In [None]:
# 2) Flag de activo por fila:
#    Activo si NO contiene patrones de baja en estado y NO tiene fecha_baja
est = df["estado_terminal"].astype("string").str.upper().fillna("")
df["is_baja_estado"] = est.str.contains(r"BAJA|DESINSTAL|RETIR|INACT", regex=True)
df["is_activo"] = (~df["is_baja_estado"]) & (df["fecha_baja"].isna())

# 3) Agregación comercio×mes con columnas solicitadas
cm = df.groupby(["periodo", "rut_comercio"], as_index=False).agg(
    qtrx_total=("qtrx_total", "sum"),
    monto_clp=("monto_clp", "sum"),
    n_local=("codigo_local", "nunique"),
    n_terminal=("numero_terminal", "nunique"),
    # Distintos valores de tecnología
    tecnologia_instalar=(
        "tecnologia_instalar",
        lambda s: " | ".join(
            sorted(
                pd.Series(s, dtype="string")
                .dropna()
                .astype(str)
                .str.strip()
                .replace("", pd.NA)
                .dropna()
                .unique()
            )
        ),
    ),
    # Estados distintos
    estado_terminal=(
        "estado_terminal",
        lambda s: " | ".join(
            sorted(
                pd.Series(s, dtype="string")
                .dropna()
                .astype(str)
                .str.upper()
                .str.strip()
                .replace("", pd.NA)
                .dropna()
                .unique()
            )
        ),
    ),
    # Primera instalación y todas las bajas distintas
    fecha_instalacion=("fecha_instalacion", "min"),
    fecha_baja=(
        "fecha_baja",
        lambda s: " | ".join(
            sorted(pd.Series(s.dropna().dt.strftime("%Y-%m-%d")).unique())
        ),
    ),
    # ¿Quedan terminales activas?
    tiene_activos=("is_activo", "max"),
)

# 4) Métricas rápidas y formato de salida
cm["ticket_promedio"] = np.where(
    cm["qtrx_total"] > 0, cm["monto_clp"] / cm["qtrx_total"], np.nan
)
cm["activo_mes"] = (cm["qtrx_total"] > 0).astype(int)

# Formatos de columnas
cm["periodo"] = cm["periodo"].astype(str)  # "YYYY-MM"
cm["fecha_instalacion"] = cm["fecha_instalacion"].dt.strftime("%Y-%m-%d")

cm.head()


In [None]:
# Agrupar por rut_comercio y obtener las tecnologías únicas por comercio
tec_por_comercio = (
    df.groupby("rut_comercio")["tecnologia_instalar"]
    .apply(
        lambda s: sorted(
            pd.Series(s, dtype="string")
            .dropna()
            .astype(str)
            .str.strip()
            .replace("", pd.NA)
            .dropna()
            .unique()
        )
    )
    .reset_index()
)
tec_por_comercio["n_tecnologias"] = tec_por_comercio["tecnologia_instalar"].apply(len)

print("Distribución de cantidad de tecnologías distintas por comercio:")
print(tec_por_comercio["n_tecnologias"].value_counts().sort_index())

# Mostrar ejemplos de comercios con más de una tecnología
print("\nEjemplo de comercios con más de una tecnología:")
print(tec_por_comercio[tec_por_comercio["n_tecnologias"] > 1].head())

# Analizar combinaciones más comunes de tecnologías
from collections import Counter

tec_combos = tec_por_comercio["tecnologia_instalar"].apply(lambda x: tuple(sorted(x)))
combo_counts = Counter(tec_combos)
print("\nCombinaciones más comunes de tecnologías por comercio:")
for combo, count in combo_counts.most_common(10):
    print(f"{combo}: {count} comercios")

In [None]:
# 1. Combinaciones de tecnologías de tamaño 2 o más
from collections import Counter
import matplotlib.pyplot as plt

tec_combos = tec_por_comercio["tecnologia_instalar"].apply(lambda x: tuple(sorted(x)))
combo_counts = Counter([combo for combo in tec_combos if len(combo) >= 2])

print("Combinaciones más comunes de dos o más tecnologías por comercio:")
for combo, count in combo_counts.most_common(10):
    print(f"{combo}: {count} comercios")

# 2. Comercios distintos por tecnología individual y periodo
tec_periodo = (
    df.dropna(subset=["tecnologia_instalar"])
    .assign(
        tecnologia_instalar=lambda d: d["tecnologia_instalar"].astype(str).str.strip()
    )
    .loc[lambda d: d["tecnologia_instalar"] != ""]
    .groupby(["periodo", "tecnologia_instalar"])["rut_comercio"]
    .nunique()
    .reset_index(name="n_comercios")
)

print("\nComercios distintos por tecnología y periodo")
tecc = tec_periodo.sort_values(by="n_comercios", ascending=False)
tecc

# 3. Evolución de las dos combinaciones más comunes a lo largo de los 12 periodos


In [None]:
# Obtener las dos combinaciones más comunes de tamaño >=2
top2_combos = [combo for combo, _ in combo_counts.most_common(2)]

# Para cada periodo, contar cuántos comercios tienen exactamente cada combinación
combo_evol = []
for periodo, grupo in df.groupby("periodo"):
    tec_por_comercio_p = grupo.groupby("rut_comercio")["tecnologia_instalar"].apply(
        lambda s: tuple(
            sorted(
                pd.Series(s, dtype="string")
                .dropna()
                .astype(str)
                .str.strip()
                .replace("", pd.NA)
                .dropna()
                .unique()
            )
        )
    )
    for combo in top2_combos:
        count = (tec_por_comercio_p == combo).sum()
        combo_evol.append(
            {"periodo": str(periodo), "combo": " | ".join(combo), "n_comercios": count}
        )

combo_evol_df = pd.DataFrame(combo_evol)

# Graficar
plt.figure(figsize=(10, 5))
for combo in combo_evol_df["combo"].unique():
    data = combo_evol_df[combo_evol_df["combo"] == combo]
    plt.plot(data["periodo"], data["n_comercios"], marker="o", label=combo)
plt.xticks(rotation=45)
plt.xlabel("Periodo")
plt.ylabel("N° comercios con combinación")
plt.title("Evolución de las dos combinaciones de tecnologías más comunes")
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
# Relación entre tecnología y volumen/monto de transacciones por comercio
# 1. Para cada tecnología y periodo, calcular suma y promedio de transacciones y monto por comercio
tec_stats = (
    df.dropna(subset=["tecnologia_instalar"])
    .assign(
        tecnologia_instalar=lambda d: d["tecnologia_instalar"].astype(str).str.strip()
    )
    .loc[lambda d: d["tecnologia_instalar"] != ""]
    .groupby(["periodo", "tecnologia_instalar"])
    .agg(
        n_comercios=("rut_comercio", "nunique"),
        total_qtrx=("qtrx_total", "sum"),
        total_monto=("monto_clp", "sum"),
        avg_qtrx=("qtrx_total", "mean"),
        avg_monto=("monto_clp", "mean"),
        avg_ticket=(
            lambda x: np.where(
                x["qtrx_total"] > 0, x["monto_clp"] / x["qtrx_total"], np.nan
            ).mean()
        ),
    )
    .reset_index()
)

print("Resumen por tecnología y periodo:")
display(tec_stats.head(10))

# 2. Comparar tecnologías en el último periodo disponible
top_period = tec_stats["periodo"].max()
print(f"\nComparativa de tecnologías en el periodo más reciente: {top_period}")
display(
    tec_stats[tec_stats["periodo"] == top_period].sort_values(
        "total_qtrx", ascending=False
    )
)

# 3. (Opcional) Analizar ticket promedio y número de terminales por tecnología
tec_terminals = (
    df.dropna(subset=["tecnologia_instalar"])
    .assign(
        tecnologia_instalar=lambda d: d["tecnologia_instalar"].astype(str).str.strip()
    )
    .loc[lambda d: d["tecnologia_instalar"] != ""]
    .groupby(["tecnologia_instalar"])
    .agg(
        n_comercios=("rut_comercio", "nunique"),
        n_terminales=("numero_terminal", "nunique"),
        avg_ticket=(
            lambda x: np.where(
                x["qtrx_total"] > 0, x["monto_clp"] / x["qtrx_total"], np.nan
            ).mean()
        ),
    )
    .reset_index()
)
print("\nResumen por tecnología (todas las fechas):")
display(tec_terminals.sort_values("n_comercios", ascending=False))


## Margin uplift preparation

Preparar tablas mensuales por comercio y consolidar supuestos de precios/costos para analizar margen en segmentos SMB.

In [None]:
from pathlib import Path

import numpy as np
import pandas as pd

DATA_DIR = Path("data")
RAW_TERMINAL_FILE = DATA_DIR / "terminales_con_sin_transacciones_mensual.csv"
EXCLUSION_FILE = DATA_DIR / "RUT_por_excluir_de_pricing.xlsx"
PRICING_FILE = DATA_DIR / "precios_actuales_klap.xlsx"
COMPETITOR_FILE = DATA_DIR / "precios_Competidores.xlsx"
BRAND_COST_FILE = DATA_DIR / "costos_marca_25_1.xlsx"
INTERCHANGE_FILE = DATA_DIR / "Tasa_Intercambio_Chile_Visa_y_Mastercard.csv"
PROCESSED_DIR = DATA_DIR / "processed"
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

In [None]:
terminal_numeric_cols = [
    "qtrx_visa",
    "monto_visa",
    "qtrx_mastercard",
    "monto_mastercard",
    "qtrx_amex",
    "monto_amex",
    "qtrx_casas_comerciales",
    "monto_casas_comerciales",
    "qtrx_vale_electronico",
    "monto_vale_electronico",
    "qtrx_ripley",
    "monto_ripley",
    "qtrx_hites",
    "monto_hites",
    "qtrx_adquriencia_general",
    "monto_adquriencia_general",
]
terminal_dtypes = {
    "periodo": "string",
    "rut_comercio": "string",
    "codigo_local": "Int64",
    "numero_terminal": "Int64",
    "fecha_instalacion": "string",
    "fecha_baja": "string",
    "estado_terminal": "string",
    "tecnologia_instalar": "string",
    "modelo_equipo": "string",
    "vertical": "string",
    "adquirencia": "string",
    "nombre_giro": "string",
    "giro": "string",
    "comuna_suc": "string",
    "region_suc": "string",
    "razon_social": "string",
}

terminal_df = pd.read_csv(
    RAW_TERMINAL_FILE,
    dtype=terminal_dtypes,
    usecols=list(terminal_dtypes.keys()) + terminal_numeric_cols,
)
terminal_df["periodo"] = pd.to_datetime(
    terminal_df["periodo"] + "-01", format="%Y-%m-%d", errors="coerce"
).dt.to_period("M")
terminal_df["fecha_instalacion"] = pd.to_datetime(
    terminal_df["fecha_instalacion"], errors="coerce"
)
terminal_df["fecha_baja"] = pd.to_datetime(terminal_df["fecha_baja"], errors="coerce")
for col in terminal_numeric_cols:
    terminal_df[col] = pd.to_numeric(terminal_df[col], errors="coerce").fillna(0.0)

brand_tags = [
    "visa",
    "mastercard",
    "amex",
    "casas_comerciales",
    "vale_electronico",
    "ripley",
    "hites",
    "adquriencia_general",
]
qtrx_cols = [f"qtrx_{b}" for b in brand_tags]
monto_cols = [f"monto_{b}" for b in brand_tags]
terminal_df["qtrx_total"] = terminal_df[qtrx_cols].sum(axis=1)
terminal_df["monto_total"] = terminal_df[monto_cols].sum(axis=1)
terminal_df["activo_terminal_mes"] = (terminal_df["qtrx_total"] > 0).astype("int8")

agg_map = {col: "sum" for col in qtrx_cols + monto_cols + ["qtrx_total", "monto_total"]}
agg_map.update(
    {
        "codigo_local": "nunique",
        "numero_terminal": "nunique",
        "activo_terminal_mes": "sum",
    }
)
merchant_month = (
    terminal_df.groupby(["periodo", "rut_comercio"], as_index=False)
    .agg(agg_map)
    .rename(
        columns={
            "codigo_local": "n_locales",
            "numero_terminal": "n_terminales",
            "activo_terminal_mes": "terminales_activos",
        }
    )
)
merchant_month["ticket_promedio"] = np.where(
    merchant_month["qtrx_total"] > 0,
    merchant_month["monto_total"] / merchant_month["qtrx_total"],
    np.nan,
)
merchant_month["periodo"] = merchant_month["periodo"].astype(str)

exclusion_ruts = (
    pd.read_excel(EXCLUSION_FILE, sheet_name=0)["rut_comercio"]
    .dropna()
    .astype(str)
    .str.strip()
    .unique()
)
merchant_month["excluir_pricing"] = merchant_month["rut_comercio"].isin(
    set(exclusion_ruts)
)

segment_rules = [
    ("Estándar", 0, 8_000_000),
    ("PRO", 8_000_000, 30_000_000),
    ("PRO Max", 30_000_000, 75_000_000),
]
merchant_month["segmento_actual"] = "Enterprise"
volumen = merchant_month["monto_total"]
for segmento, lower, upper in segment_rules:
    merchant_month.loc[(volumen >= lower) & (volumen < upper), "segmento_actual"] = (
        segmento
    )
merchant_month.loc[volumen == 0, "segmento_actual"] = "Sin ventas"

merchant_month_path = PROCESSED_DIR / "merchant_month_pricing.parquet"
merchant_month.to_parquet(merchant_month_path, index=False)
merchant_month.head()

Unnamed: 0,periodo,rut_comercio,qtrx_visa,qtrx_mastercard,qtrx_amex,qtrx_casas_comerciales,qtrx_vale_electronico,qtrx_ripley,qtrx_hites,qtrx_adquriencia_general,...,monto_hites,monto_adquriencia_general,qtrx_total,monto_total,n_locales,n_terminales,terminales_activos,ticket_promedio,excluir_pricing,segmento_actual
0,2024-01,06360905-6,0,0,0,0,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1,1,0,,False,Sin ventas
1,2024-01,06805120-7,0,0,0,0,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1,1,0,,False,Sin ventas
2,2024-01,07269094-K,0,0,0,0,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1,1,0,,False,Sin ventas
3,2024-01,07289970-9,0,0,0,0,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1,1,0,,False,Sin ventas
4,2024-01,09161640-8,0,0,0,0,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1,1,0,,False,Sin ventas


In [4]:
terminal_df

Unnamed: 0,periodo,rut_comercio,codigo_local,numero_terminal,estado_terminal,tecnologia_instalar,fecha_instalacion,fecha_baja,modelo_equipo,vertical,...,monto_vale_electronico,qtrx_ripley,monto_ripley,qtrx_hites,monto_hites,qtrx_adquriencia_general,monto_adquriencia_general,qtrx_total,monto_total,activo_terminal_mes
0,2024-01,12489233-3,37,8,BAJA,POS GPRS,2008-02-18,2020-10-04,,C. BARRIO,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,2024-01,6605632-5,160,152,BAJA,POS GPRS,2008-04-30,2020-10-04,,C. BARRIO,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,2024-01,11911027-0,311,303,BAJA,POS GPRS,2008-06-05,2020-10-04,,C. BARRIO,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,2024-01,9972060-3,379,371,BAJA,POS ETHERNET,2008-06-24,2020-10-04,,C. BARRIO,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,2024-01,6302656-5,561,553,BAJA,POS GPRS,2008-07-09,2020-10-04,,ALIMENTACION,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2408147,2024-12,15747504-5,177834,1775670,BAJA,POS GPRS,2020-12-23,2021-09-27,,RETAIL,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2408148,2024-12,15164443-0,177835,1775680,BAJA,POS MOVIL,2020-12-19,2023-08-17,,RETAIL,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2408149,2024-12,14491388-4,176496,1758570,BAJA,POS MOVIL,2020-12-01,2024-10-04,,RETAIL,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2408150,2024-12,8750803-K,176500,1758610,BAJA,POS MOVIL,2020-11-26,2022-10-24,,RETAIL,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [None]:
merchant_summary = (
    merchant_month.groupby("rut_comercio")
    .agg(
        {
            "periodo": "nunique",
            "monto_total": "sum",
            "qtrx_total": "sum",
            "segmento_actual": lambda s: s.value_counts().idxmax(),
            "excluir_pricing": "any",
        }
    )
    .rename(
        columns={
            "periodo": "meses_reportados",
            "monto_total": "monto_total_anual",
            "qtrx_total": "qtrx_total_anual",
            "segmento_actual": "segmento_mas_frecuente",
            "excluir_pricing": "excluir_pricing_flag",
        }
    )
    .reset_index()
)
merchant_summary["volumen_mensual_promedio"] = merchant_summary[
    "monto_total_anual"
] / merchant_summary["meses_reportados"].clip(lower=1)
merchant_summary["share_meses_activos"] = np.where(
    merchant_summary["meses_reportados"] > 0,
    merchant_summary["monto_total_anual"] > 0,
    np.nan,
)
merchant_summary_path = PROCESSED_DIR / "merchant_summary_pricing.parquet"
merchant_summary.to_parquet(merchant_summary_path, index=False)
merchant_summary.head()

### Pricing inputs and assumptions

Cargar grilla de Klap vigente, referencias de competidores, costos de marca e intercambios para derivar escenarios de margen blended.

In [None]:
pricing_grid = pd.read_excel(PRICING_FILE)
competitor_prices = pd.read_excel(COMPETITOR_FILE)
brand_costs = pd.read_excel(BRAND_COST_FILE)
interchange_caps = pd.read_csv(INTERCHANGE_FILE)

pricing_grid.head()

In [None]:
pricing_grid["Variable_pct"] = pricing_grid["Variable %"] / 100
segment_media = pricing_grid.pivot_table(
    index="Segmento", columns="Medio", values=["Variable_pct", "Fijo CLP (aprox)"]
)

cp_interchange = interchange_caps[interchange_caps["Canal"] == "CP"]
interchange_median = {
    card_type: (
        cp_interchange[cp_interchange["Tipo de tarjeta"] == card_type]["TI %"].median()
        / 100
    )
    for card_type in ["Crédito", "Débito", "Prepago"]
}
interchange_median

In [None]:
brand_cost_pct = (
    brand_costs.groupby("Marca")["Total costos de marca %"].mean().to_dict()
)
brand_cost_pct = {k.lower(): v for k, v in brand_cost_pct.items()}
brand_cost_pct

In [None]:
mix_scenarios = {
    "mix_credit_heavy": {"Crédito": 0.6, "Débito": 0.3, "Prepago": 0.1},
    "mix_debit_heavy": {"Crédito": 0.3, "Débito": 0.6, "Prepago": 0.1},
    "mix_credit_only": {"Crédito": 1.0, "Débito": 0.0, "Prepago": 0.0},
}

brand_cols = [
    col
    for col in merchant_month.columns
    if col.startswith("monto_")
    and col not in {"monto_total", "monto_adquriencia_general"}
]
scenario_frames = []
for scenario_name, mix in mix_scenarios.items():
    scenario_df = merchant_month.copy()
    eff_var = {}
    eff_fix = {}
    for segment in segment_media.index:
        var = 0.0
        fijo = 0.0
        for medio, share in mix.items():
            if share == 0:
                continue
            var += share * segment_media.loc[segment, ("Variable_pct", medio)]
            fijo += share * segment_media.loc[segment, ("Fijo CLP (aprox)", medio)]
        eff_var[segment] = var
        eff_fix[segment] = fijo
    scenario_df["pricing_var_pct"] = scenario_df["segmento_actual"].map(eff_var)
    scenario_df["pricing_fijo_clp"] = scenario_df["segmento_actual"].map(eff_fix)
    scenario_df["interchange_pct"] = sum(
        mix[m] * interchange_median.get(m, 0) for m in mix
    )
    brand_cost_amount = np.zeros(len(scenario_df))
    for col in brand_cols:
        brand = col.replace("monto_", "")
        brand_cost_amount += scenario_df[col].to_numpy() * brand_cost_pct.get(
            brand, 0.0
        )
    scenario_df["brand_cost_amount"] = brand_cost_amount
    scenario_df["pricing_revenue"] = scenario_df["monto_total"] * scenario_df[
        "pricing_var_pct"
    ].fillna(0) + scenario_df["qtrx_total"] * scenario_df["pricing_fijo_clp"].fillna(0)
    scenario_df["interchange_cost"] = (
        scenario_df["monto_total"] * scenario_df["interchange_pct"]
    )
    scenario_df["gross_margin"] = (
        scenario_df["pricing_revenue"]
        - scenario_df["interchange_cost"]
        - scenario_df["brand_cost_amount"]
    )
    scenario_df["gross_margin_pct"] = np.where(
        scenario_df["monto_total"] > 0,
        scenario_df["gross_margin"] / scenario_df["monto_total"],
        np.nan,
    )
    scenario_df["scenario"] = scenario_name
    scenario_frames.append(scenario_df)

margin_scenarios = pd.concat(scenario_frames, ignore_index=True)
margin_scenarios_path = PROCESSED_DIR / "merchant_margin_scenarios.parquet"
margin_scenarios.to_parquet(margin_scenarios_path, index=False)
margin_scenarios.head()

In [None]:
smb_segments = ["Estándar", "PRO", "PRO Max"]
margin_summary = (
    margin_scenarios[
        (~margin_scenarios["excluir_pricing"])
        & (margin_scenarios["segmento_actual"].isin(smb_segments))
        & (margin_scenarios["monto_total"] > 0)
    ]
    .groupby(["segmento_actual", "scenario"])
    .agg({"rut_comercio": "nunique", "monto_total": "sum", "gross_margin": "sum"})
    .rename(columns={"rut_comercio": "merchants"})
)
margin_summary["margin_pct"] = (
    margin_summary["gross_margin"] / margin_summary["monto_total"]
)
margin_summary