In [2]:
import pandas as pd, numpy as np
from pathlib import Path

# Location of the segment export (adjust the relative path if needed).
P = Path("../../new_segments/retail_behavior_segments.csv")
# "utf-8-sig" strips a leading byte-order mark when the file has one.
df = pd.read_csv(P, encoding="utf-8-sig")

# Normalise headers just in case: trimmed, lower-case, spaces -> underscores.
df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_", regex=False)

# Segment order used throughout; labels outside this list become NaN in the cast.
orig_order = ["Retail-Low", "Retail-Mid", "Retail-High", "Retail-Whale"]
df["segment_label"] = df["segment_label"].astype(
    pd.CategoricalDtype(categories=orig_order, ordered=True)
)

# Features para evaluación de separabilidad (mismo espíritu que el cluster original)
def _num(s): return pd.to_numeric(s, errors="coerce")
# Raw input columns the engineered features below are derived from.
Xcols = ["intensity_blend","freq_blend","median_amount_out"]

# log1p of the numeric-coerced columns; entries that fail coercion stay NaN here.
df["log_intensity"] = np.log1p(_num(df["intensity_blend"]))
df["log_ticket_out"] = np.log1p(_num(df["median_amount_out"]))
# Frequency is used as-is, with missing/unparseable values set to 0.
df["freq"] = _num(df["freq_blend"]).fillna(0)

# Working set without whales. A missing label (NaN) also satisfies
# `!= "Retail-Whale"`, so unlabeled rows are excluded explicitly; otherwise they
# would produce NaN entries in any label array built from this frame.
non_wh = df[df["segment_label"].notna() & (df["segment_label"] != "Retail-Whale")].copy()


In [3]:
from scipy import stats

def cohens_d(a, b):
    """Cohen's d of ``a`` versus ``b`` using the pooled sample standard deviation.

    Parameters
    ----------
    a, b : array-like
        Samples as a list, tuple, NumPy array or pandas Series. Entries are
        coerced to numeric; values that cannot be parsed, and missing values,
        are dropped before computing.

    Returns
    -------
    float
        ``(mean(a) - mean(b)) / pooled_sd``. NaN when either sample has fewer
        than two usable values or the pooled standard deviation is not positive.
    """
    # Wrapping in a Series lets plain lists/arrays through: pd.to_numeric returns
    # an ndarray for those, which has no .dropna().
    a = pd.to_numeric(pd.Series(a), errors="coerce").dropna()
    b = pd.to_numeric(pd.Series(b), errors="coerce").dropna()
    n1, n2 = len(a), len(b)
    if n1 < 2 or n2 < 2:
        return np.nan
    m1, m2 = a.mean(), b.mean()
    s1, s2 = a.std(ddof=1), b.std(ddof=1)
    # Pooled SD: sample variances weighted by their degrees of freedom.
    sp = np.sqrt(((n1 - 1) * s1**2 + (n2 - 1) * s2**2) / (n1 + n2 - 2))
    return (m1 - m2) / sp if sp > 0 else np.nan

# Low vs Mid samples for the pairwise effect size.
low = non_wh[non_wh["segment_label"]=="Retail-Low"]
mid  = non_wh[non_wh["segment_label"]=="Retail-Mid"]

# Segments compared in the one-way ANOVA (whales excluded).
anova_segments = ["Retail-Low", "Retail-Mid", "Retail-High"]

rows = []
for col, label in [("log_intensity","Intensidad (log)"),
                   ("freq","Frecuencia"),
                   ("log_ticket_out","Ticket OUT (log)")]:
    d = cohens_d(low[col], mid[col])
    # Pick each group by label instead of grouping on the categorical key:
    # the result no longer depends on groupby's `observed` default, and the
    # three groups always match the column name below.
    samples = [non_wh.loc[non_wh["segment_label"] == seg, col].dropna()
               for seg in anova_segments]
    f, p = stats.f_oneway(*samples)
    rows.append({"feature": label, "cohens_d_low_vs_mid": d, "anova_F(Low/Mid/High)": f, "anova_p": p})

sep_tbl = pd.DataFrame(rows)
sep_tbl


  f, p = stats.f_oneway(*[g[col].dropna() for _, g in non_wh.groupby("segment_label") if _.startswith("Retail-") and _!="Retail-Whale"])


Unnamed: 0,feature,cohens_d_low_vs_mid,anova_F(Low/Mid/High),anova_p
0,Intensidad (log),-3.403979,18507.740963,0.0
1,Frecuencia,-0.761318,979.277031,0.0
2,Ticket OUT (log),-1.324877,3057.615607,0.0


In [4]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, davies_bouldin_score

# Feature matrix: NaN -> 0, standardised, then softly weighted per column
# (weights apply to log_intensity, freq, log_ticket_out in that order).
use = non_wh[["log_intensity","freq","log_ticket_out"]].fillna(0).to_numpy()
scaler = StandardScaler().fit(use)
Xs = scaler.transform(use) * np.array([1.0, 0.5, 0.1])


def _separation_scores(X, labels):
    """Return ``(silhouette, davies_bouldin)`` for a labeling of ``X``.

    Both metrics are guarded the same way: with fewer than two distinct labels
    the pair ``(nan, nan)`` is returned instead of letting either metric raise.
    """
    if len(np.unique(labels)) < 2:
        return np.nan, np.nan
    return silhouette_score(X, labels), davies_bouldin_score(X, labels)


# "Current" labels (Low/Mid/High), whales excluded.
y_now = non_wh["segment_label"].map({"Retail-Low":0,"Retail-Mid":1,"Retail-High":2}).to_numpy()
sil_now, dbi_now = _separation_scores(Xs, y_now)

# "Merged" labels (Core = Low + Mid, High).
y_merged = non_wh["segment_label"].map({"Retail-Low":0,"Retail-Mid":0,"Retail-High":1}).to_numpy()
sil_mrg, dbi_mrg = _separation_scores(Xs, y_merged)

pd.DataFrame([{"setting":"Actual (Low/Mid/High)","silhouette":sil_now,"davies_b":dbi_now},
              {"setting":"Fusionado (Core/High)","silhouette":sil_mrg,"davies_b":dbi_mrg}])


Unnamed: 0,setting,silhouette,davies_b
0,Actual (Low/Mid/High),0.409346,0.806084
1,Fusionado (Core/High),0.393649,0.849276


In [5]:
from sklearn.cluster import KMeans

def eval_kmeans_k(k, X=None):
    """Fit KMeans with ``k`` clusters and summarise the resulting partition.

    Parameters
    ----------
    k : int
        Number of clusters.
    X : array-like, optional
        Feature matrix to cluster. Defaults to the notebook-level ``Xs``.

    Returns
    -------
    tuple
        ``(silhouette, davies_bouldin, share)`` where ``share`` maps each
        cluster id to its percentage of rows, rounded to two decimals.
        Both scores are NaN when the fit yields fewer than two distinct labels.
    """
    if X is None:
        X = Xs
    km = KMeans(n_clusters=k, random_state=42, n_init=10).fit(X)
    labs = km.labels_
    # Guard both metrics together; previously only the silhouette was guarded,
    # so a single-cluster result raised inside davies_bouldin_score.
    if len(np.unique(labs)) > 1:
        sil = silhouette_score(X, labs)
        dbi = davies_bouldin_score(X, labs)
    else:
        sil = dbi = np.nan
    share = (pd.Series(labs).value_counts(normalize=True)*100).round(2).to_dict()
    return sil, dbi, share

# Evaluate the two candidate partitions (k=3 first, then k=2).
sil_k3, dbi_k3, share_k3 = eval_kmeans_k(3)
sil_k2, dbi_k2, share_k2 = eval_kmeans_k(2)

# One row per setting; the last expression is rendered as the cell output.
summary_cols = ("k", "silhouette", "davies_b", "cluster_share_%")
pd.DataFrame([
    dict(zip(summary_cols, ("3 (≈Low/Mid/High)", sil_k3, dbi_k3, share_k3))),
    dict(zip(summary_cols, ("2 (≈Core/High)", sil_k2, dbi_k2, share_k2))),
])


Unnamed: 0,k,silhouette,davies_b,cluster_share_%
0,3 (≈Low/Mid/High),0.409346,0.806084,"{1: 45.86, 2: 39.27, 0: 14.88}"
1,2 (≈Core/High),0.482804,0.755566,"{0: 67.27, 1: 32.73}"


In [6]:
# Total distinct clients, computed once instead of once per group.
n_clients_total = df["customer_id"].nunique()

# Before: Low / Mid / High / Whale
# observed=False is stated explicitly so every declared category is reported
# regardless of the installed pandas default.
prof_before = (df.groupby("segment_label", dropna=False, observed=False)
                 .agg(clients=("customer_id","nunique"),
                      share_clients=("customer_id", lambda s: 100*s.nunique()/n_clients_total),
                      med_monthly_out=("median_monthly_out","median"),
                      med_monthly_in=("median_monthly_in","median"),
                      med_intensity=("intensity_blend","median"),
                      med_freq=("freq_blend","median"))
                 .round(2))

# After: Core (= Low + Mid) / High / Whale
merged_order = ["Retail-Core","Retail-High","Retail-Whale"]
merge_map = {"Retail-Low":"Retail-Core","Retail-Mid":"Retail-Core",
             "Retail-High":"Retail-High","Retail-Whale":"Retail-Whale"}
df2 = df.copy()
# Map plain object values rather than calling .replace on the categorical:
# merging categories through replace is deprecated. Missing labels stay NaN.
df2["segment_merged"] = pd.Categorical(
    df2["segment_label"].astype(object).map(merge_map),
    categories=merged_order, ordered=True,
)

# df2 is a copy of df, so the same client total applies.
prof_after = (df2.groupby("segment_merged", dropna=False, observed=False)
                .agg(clients=("customer_id","nunique"),
                     share_clients=("customer_id", lambda s: 100*s.nunique()/n_clients_total),
                     med_monthly_out=("median_monthly_out","median"),
                     med_monthly_in=("median_monthly_in","median"),
                     med_intensity=("intensity_blend","median"),
                     med_freq=("freq_blend","median"))
                .reindex(merged_order).round(2))

prof_before, prof_after


  prof_before = (df.groupby("segment_label", dropna=False)
  df2["segment_merged"] = df2["segment_label"].replace({"Retail-Low":"Retail-Core","Retail-Mid":"Retail-Core"})
  prof_after = (df2.groupby("segment_merged", dropna=False)


(               clients  share_clients  med_monthly_out  med_monthly_in  \
 segment_label                                                            
 Retail-Low        1377          14.73              0.0         58691.0   
 Retail-Mid        4245          45.40         721717.0          3521.0   
 Retail-High       3635          38.87        8500000.0       1975000.0   
 Retail-Whale        94           1.01      285229178.0     103119773.0   
 
                med_intensity  med_freq  
 segment_label                           
 Retail-Low           36000.0      0.25  
 Retail-Mid         1100000.0      1.00  
 Retail-High       10467650.0      1.67  
 Retail-Whale     322977477.5      2.12  ,
                 clients  share_clients  med_monthly_out  med_monthly_in  \
 segment_merged                                                            
 Retail-Core        5622          60.12     3.228388e+05         50000.0   
 Retail-High        3635          38.87     8.500000e+06       1975

In [7]:
# === Perfil de segmentos: Antes vs Después (formateado) ===
import pandas as pd
import numpy as np

# --- Helpers de formato ---
def fmt_amount(x):
    """Format ``x`` as a whole number with thousands separators.

    Values that cannot be converted with ``float()`` are returned unchanged.
    """
    try:
        number = float(x)
    except Exception:
        return x
    return format(number, ",.0f")

def fmt_freq(x):
    """Format ``x`` with two decimals and thousands separators.

    Values that cannot be converted with ``float()`` are returned unchanged.
    """
    try:
        number = float(x)
    except Exception:
        return x
    return format(number, ",.2f")

def fmt_pct(x):
    """Format ``x`` with two decimals followed by a percent sign.

    The value is not rescaled (60.12 -> "60.12%"). Values that cannot be
    converted with ``float()`` are returned unchanged.
    """
    try:
        number = float(x)
    except Exception:
        return x
    return "{:.2f}%".format(number)

# --- Columns this cell relies on; stop early if any is absent ---
needed = {
    "customer_id", "segment_label",
    "median_monthly_out", "median_monthly_in",
    "tx_per_active_month_out", "tx_per_active_month_in",
}
missing = needed.difference(df.columns)
assert not missing, f"Faltan columnas en df: {missing}"

# --- Before: Low / Mid / High / Whale ---
order_before = ["Retail-Low","Retail-Mid","Retail-High","Retail-Whale"]
df["segment_label"] = pd.Categorical(df["segment_label"], categories=order_before, ordered=True)

# Total distinct clients, computed once instead of once per group.
n_clients_before = df["customer_id"].nunique()

prof_before_raw = (
    # observed=False is stated explicitly so every declared category is
    # reported regardless of the installed pandas default.
    df.groupby("segment_label", dropna=False, observed=False)
      .agg(
          clients=("customer_id","nunique"),
          share_clients=("customer_id", lambda s: 100*s.nunique()/n_clients_before),
          # OUT
          med_monthly_out=("median_monthly_out","median"),
          mean_monthly_out=("median_monthly_out","mean"),
          med_tx_month_out=("tx_per_active_month_out","median"),
          mean_tx_month_out=("tx_per_active_month_out","mean"),
          # IN
          med_monthly_in=("median_monthly_in","median"),
          mean_monthly_in=("median_monthly_in","mean"),
          med_tx_month_in=("tx_per_active_month_in","median"),
          mean_tx_month_in=("tx_per_active_month_in","mean"),
      )
      # Fixes the row order and drops the NaN-label group, if any.
      .reindex(order_before)
)

# --- After: Core (= Low + Mid) / High / Whale ---
order_after = ["Retail-Core","Retail-High","Retail-Whale"]
merge_map_after = {"Retail-Low":"Retail-Core","Retail-Mid":"Retail-Core",
                   "Retail-High":"Retail-High","Retail-Whale":"Retail-Whale"}

df2 = df.copy()
# Map plain object values rather than calling .replace on the categorical:
# merging categories through replace is deprecated. Missing labels stay NaN.
df2["segment_merged"] = pd.Categorical(
    df2["segment_label"].astype(object).map(merge_map_after),
    categories=order_after, ordered=True,
)

# Total distinct clients, computed once instead of once per group.
n_clients_after = df2["customer_id"].nunique()

prof_after_raw = (
    # observed=False is stated explicitly so every merged category is
    # reported regardless of the installed pandas default.
    df2.groupby("segment_merged", dropna=False, observed=False)
       .agg(
           clients=("customer_id","nunique"),
           share_clients=("customer_id", lambda s: 100*s.nunique()/n_clients_after),
           # OUT
           med_monthly_out=("median_monthly_out","median"),
           mean_monthly_out=("median_monthly_out","mean"),
           med_tx_month_out=("tx_per_active_month_out","median"),
           mean_tx_month_out=("tx_per_active_month_out","mean"),
           # IN
           med_monthly_in=("median_monthly_in","median"),
           mean_monthly_in=("median_monthly_in","mean"),
           med_tx_month_in=("tx_per_active_month_in","median"),
           mean_tx_month_in=("tx_per_active_month_in","mean"),
       )
       # Fixes the row order and drops the NaN-label group, if any.
       .reindex(order_after)
)

# --- Float copies of both profiles (every column cast to float; no rounding happens here) ---
prof_before_num = prof_before_raw.astype(float)
prof_after_num = prof_after_raw.astype(float)

# --- Column groups that drive the display formatting ---
# Monetary columns.
amount_cols = [
    "med_monthly_out",
    "mean_monthly_out",
    "med_monthly_in",
    "mean_monthly_in",
]
# Transactions-per-active-month columns.
freq_cols = [
    "med_tx_month_out",
    "mean_tx_month_out",
    "med_tx_month_in",
    "mean_tx_month_in",
]

def format_table(df_in):
    """Return a display-ready copy of a segment profile table.

    ``clients`` is cast to a nullable integer, ``share_clients`` is formatted
    with ``fmt_pct``, the columns in ``amount_cols`` with ``fmt_amount`` and
    those in ``freq_cols`` with ``fmt_freq``. ``df_in`` is not modified.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Profile table containing ``clients``, ``share_clients`` and every
        column named in ``amount_cols`` and ``freq_cols``.

    Returns
    -------
    pandas.DataFrame
        Formatted copy with columns ordered as counts, OUT block, IN block.
    """
    df_fmt = df_in.copy()
    # Client counts are whole numbers; after the upstream float cast they would
    # otherwise render and export as "5622.0". Int64 keeps missing rows as <NA>.
    df_fmt["clients"] = df_fmt["clients"].round().astype("Int64")
    df_fmt["share_clients"] = df_fmt["share_clients"].map(fmt_pct)
    for c in amount_cols:
        df_fmt[c] = df_fmt[c].map(fmt_amount)
    for c in freq_cols:
        df_fmt[c] = df_fmt[c].map(fmt_freq)
    # Reorder columns by block.
    return df_fmt[
        ["clients","share_clients",
         "med_monthly_out","mean_monthly_out","med_tx_month_out","mean_tx_month_out",
         "med_monthly_in","mean_monthly_in","med_tx_month_in","mean_tx_month_in"]
    ]

# --- Render both profiles with the display formatters ---
prof_before = format_table(prof_before_num)
prof_after = format_table(prof_after_num)

# --- (Optional) one table with both scenarios, tagged by a leading "Escenario" column ---
# rename_axis returns a new frame, so the index names of prof_before / prof_after
# are left untouched.
prof_before_idx = prof_before.rename_axis("segment")
prof_before_idx.insert(0, "Escenario", "Antes (Low/Mid/High/Whale)")

prof_after_idx = prof_after.rename_axis("segment")
prof_after_idx.insert(0, "Escenario", "Después (Core/High/Whale)")

combined = pd.concat([prof_before_idx, prof_after_idx])

# Only the merged ("after") profile is shown inline.
print("\n=== Perfil DESPUÉS (Core/High/Whale) ===")
display(prof_after)

# --- Export a single combined table to CSV ---
import csv  # supplies the quoting constant passed to to_csv

combined.to_csv(
    "../../data/retail_segment_profiles_before_after.csv",
    sep=";",                  # handy for ES/LatAm regional settings
    index=True,               # keep the index (segment)
    encoding="utf-8-sig",     # Excel-friendly
    quoting=csv.QUOTE_MINIMAL,
)
print("CSV exportado: ../../data/retail_segment_profiles_before_after.csv")




=== Perfil DESPUÉS (Core/High/Whale) ===


  df.groupby("segment_label", dropna=False)
  df2["segment_merged"] = df2["segment_label"].replace({"Retail-Low":"Retail-Core","Retail-Mid":"Retail-Core"})
  df2.groupby("segment_merged", dropna=False)


Unnamed: 0_level_0,clients,share_clients,med_monthly_out,mean_monthly_out,med_tx_month_out,mean_tx_month_out,med_monthly_in,mean_monthly_in,med_tx_month_in,mean_tx_month_in
segment_merged,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Retail-Core,5622.0,60.12%,322839,788583,1.0,0.88,50000,1491375,1.0,0.78
Retail-High,3635.0,38.87%,8500000,18478288,1.5,1.68,1975000,22589029,1.0,0.97
Retail-Whale,94.0,1.01%,285229178,373094015,2.0,2.09,103119773,305936784,1.0,1.37


CSV exportado: ../../data/retail_segment_profiles_before_after.csv
