Integramos las bases 01, 02 y 03

In [1]:
import pandas as pd


In [2]:
df_perfil = pd.read_csv("df_perfil_clientes_final.csv")
df_experiment = pd.read_csv("df_experiment_final.csv")
df_web = pd.read_csv("df_web_events_final.csv")


In [3]:
df_perfil.shape



(70609, 9)

In [4]:
df_experiment.shape


(70609, 2)

In [5]:
df_web.shape

(755405, 5)

In [6]:
df_web_agg = (
    df_web
    .groupby("client_id")
    .agg(
        web_events=("process_step", "count"),
        web_visits=("visit_id", "nunique"),
        web_steps=("process_step", "nunique"),
        web_confirms=("process_step", lambda x: (x == "confirm").sum()),
    )
    .reset_index()
)


In [7]:
df_web_agg.shape


(120157, 5)

In [8]:
df_web_agg["client_id"].duplicated().sum()


np.int64(0)

In [9]:
df_web_agg.isnull().sum()


client_id       0
web_events      0
web_visits      0
web_steps       0
web_confirms    0
dtype: int64

In [10]:
df_base = df_perfil.merge(
    df_experiment,
    on="client_id",
    how="left"
)


In [11]:
df_base.shape
df_base["experiment_group"].value_counts(dropna=False)


experiment_group
Test       26968
Control    23532
NaN        20109
Name: count, dtype: int64

In [12]:
df_final = df_base.merge(
    df_web_agg,
    on="client_id",
    how="left"
)


In [13]:
df_final.shape
df_final.isnull().mean().sort_values(ascending=False)


experiment_group    0.284794
age_group           0.005509
balance             0.000198
num_accounts        0.000198
balance_log         0.000198
tenure_months       0.000198
logins_6m           0.000198
calls_6m            0.000198
gender              0.000000
client_id           0.000000
web_events          0.000000
web_visits          0.000000
web_steps           0.000000
web_confirms        0.000000
dtype: float64

In [14]:
df_final.shape
df_final.info()
df_final.isnull().mean().sort_values(ascending=False)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70609 entries, 0 to 70608
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   client_id         70609 non-null  int64  
 1   age_group         70220 non-null  float64
 2   gender            70609 non-null  object 
 3   tenure_months     70595 non-null  float64
 4   num_accounts      70595 non-null  float64
 5   balance           70595 non-null  float64
 6   balance_log       70595 non-null  float64
 7   logins_6m         70595 non-null  float64
 8   calls_6m          70595 non-null  float64
 9   experiment_group  50500 non-null  object 
 10  web_events        70609 non-null  int64  
 11  web_visits        70609 non-null  int64  
 12  web_steps         70609 non-null  int64  
 13  web_confirms      70609 non-null  int64  
dtypes: float64(7), int64(5), object(2)
memory usage: 7.5+ MB


experiment_group    0.284794
age_group           0.005509
balance             0.000198
num_accounts        0.000198
balance_log         0.000198
tenure_months       0.000198
logins_6m           0.000198
calls_6m            0.000198
gender              0.000000
client_id           0.000000
web_events          0.000000
web_visits          0.000000
web_steps           0.000000
web_confirms        0.000000
dtype: float64

In [15]:
df_ab = df_final[df_final["experiment_group"].notna()].copy()


In [16]:
df_final["experiment_group"] = df_final["experiment_group"].fillna("No_experiment")


In [17]:
# ¿Clientes con actividad web?
(df_final[["web_events", "web_visits", "web_confirms"]] > 0).mean()

# ¿Confirmaciones ≤ visitas?
(df_final["web_confirms"] <= df_final["web_visits"]).mean()


np.float64(0.9473721480264555)

In [18]:
df_final.to_csv("df_model_ready.csv", index=False)

df_ab.to_csv("df_ab_test.csv", index=False)


## Dataset final integrado

Se generan dos datasets:

- df_model_ready.csv  
  Dataset completo de clientes, incluyendo usuarios fuera del experimento.
  Útil para análisis descriptivo y modelado.

- df_ab_test.csv  
  Subconjunto con usuarios asignados a Test y Control.
  Usado para análisis A/B y estimación causal.

Los merges se realizan siempre como LEFT sobre client_id.
