In [3]:
# =========================
# [CELDA 1] Setup + Paths + Config
# =========================
import os
from pathlib import Path
import numpy as np
import pandas as pd

# Reproducibilidad (para sampling/plots si lo usas)
SEED = 42
np.random.seed(SEED)

# --- Proyecto ---
PROJECT_DIR = Path("/workspace/TFM_education_ai_analytics")

# --- Entradas base (ya generadas por tus pipelines) ---
PROCESSED_DIR = PROJECT_DIR / "data/2_processed"          # students/interactions/assessments por split
FEATURES_DIR  = PROJECT_DIR / "data/3_features"           # engineered_features.csv + target.csv por split (FE1)
EMB_DIR       = PROJECT_DIR / "data/4_embeddings"         # segmentation_gmm_ae.csv por split

# --- Salidas para el Transformer (nuevo) ---
OUT_DIR = PROJECT_DIR / "data/6_transformer_features"
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Splits
SPLITS = ["training", "validation", "test"]

# Ventanas de semanas (como en el paper, adaptado a tu caso)
WEEKS_LIST = [12, 16, 20, 24]

print("‚úÖ Paths OK")
print("PROJECT_DIR:", PROJECT_DIR)
print("OUT_DIR:", OUT_DIR)
print("WEEKS_LIST:", WEEKS_LIST)


‚úÖ Paths OK
PROJECT_DIR: /workspace/TFM_education_ai_analytics
OUT_DIR: /workspace/TFM_education_ai_analytics/data/6_transformer_features
WEEKS_LIST: [12, 16, 20, 24]


In [5]:
# =========================
# [CELDA 2] Sanity check + carga r√°pida (TRAIN)
# =========================
from pathlib import Path

def check_exists(path: Path, label: str):
    if not path.exists():
        raise FileNotFoundError(f"‚ùå No existe {label}: {path}")
    return True

# --- Archivos esperados por split ---
expected = {
    "processed_students": lambda split: PROCESSED_DIR / split / "students.csv",
    "processed_interactions": lambda split: PROCESSED_DIR / split / "interactions.csv",
    "processed_assessments": lambda split: PROCESSED_DIR / split / "assessments.csv",
    "target": lambda split: FEATURES_DIR / split / "target.csv",
    "engineered": lambda split: FEATURES_DIR / split / "engineered_features.csv",
"segmentation": lambda split: (PROJECT_DIR / "data/5_students_segmented") / split / "segmentation_gmm_ae.csv",
}

# 1) Comprobar existencia
for split in SPLITS:
    print(f"\nüîé Checking split: {split}")
    for name, fn in expected.items():
        p = fn(split)
        check_exists(p, f"{name} ({split})")
        print(f"  ‚úÖ {name}: {p.name}")

# 2) Cargar TRAIN para inspecci√≥n r√°pida
train_students = pd.read_csv(expected["processed_students"]("training"))
train_interactions = pd.read_csv(expected["processed_interactions"]("training"))
train_target = pd.read_csv(expected["target"]("training"), index_col=0)
train_seg = pd.read_csv(expected["segmentation"]("training"), index_col=0)

print("\nüìå TRAIN shapes:")
print("students:", train_students.shape)
print("interactions:", train_interactions.shape)
print("target:", train_target.shape)
print("segmentation:", train_seg.shape)

print("\nüìå TRAIN columns (heads):")
display(train_students.head(3))
display(train_interactions.head(3))
display(train_target.head(3))
display(train_seg.head(3))



üîé Checking split: training
  ‚úÖ processed_students: students.csv
  ‚úÖ processed_interactions: interactions.csv
  ‚úÖ processed_assessments: assessments.csv
  ‚úÖ target: target.csv
  ‚úÖ engineered: engineered_features.csv
  ‚úÖ segmentation: segmentation_gmm_ae.csv

üîé Checking split: validation
  ‚úÖ processed_students: students.csv
  ‚úÖ processed_interactions: interactions.csv
  ‚úÖ processed_assessments: assessments.csv
  ‚úÖ target: target.csv
  ‚úÖ engineered: engineered_features.csv
  ‚úÖ segmentation: segmentation_gmm_ae.csv

üîé Checking split: test
  ‚úÖ processed_students: students.csv
  ‚úÖ processed_interactions: interactions.csv
  ‚úÖ processed_assessments: assessments.csv
  ‚úÖ target: target.csv
  ‚úÖ engineered: engineered_features.csv
  ‚úÖ segmentation: segmentation_gmm_ae.csv

üìå TRAIN shapes:
students: (22785, 15)
interactions: (7474712, 9)
target: (22785, 1)
segmentation: (22785, 12)

üìå TRAIN columns (heads):


Unnamed: 0,code_module,code_presentation,id_student,gender,region,highest_education,imd_band,age_band,num_of_prev_attempts,studied_credits,disability,final_result,date_registration,date_unregistration,module_presentation_length
0,AAA,2013J,11391,M,East Anglian Region,HE Qualification,90-100%,55<=,0,240,N,Pass,-159.0,,268
1,AAA,2013J,28400,F,Scotland,HE Qualification,20-30%,35-55,0,60,N,Pass,-53.0,,268
2,AAA,2013J,32885,F,West Midlands Region,Lower Than A Level,50-60%,0-35,0,60,N,Pass,-176.0,,268


Unnamed: 0,code_module,code_presentation,id_student,id_site,date,sum_click,activity_type,week_from,week_to
0,AAA,2013J,28400,546652,-10,4,forumng,,
1,AAA,2013J,28400,546652,-10,1,forumng,,
2,AAA,2013J,28400,546652,-10,1,forumng,,


Unnamed: 0_level_0,final_result
unique_id,Unnamed: 1_level_1
11391_AAA_2013J,2
28400_AAA_2013J,2
32885_AAA_2013J,2


Unnamed: 0_level_0,cluster_id,cluster_label,cluster_name,p_cluster_0,p_cluster_1,p_cluster_2,p_cluster_3,p_cluster_4,p_cluster_5,confidence,entropy,entropy_norm
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
100064_FFF_2013J,2,STRATEGIC_HIGH_PERFORMER,Alto rendimiento (estrat√©gico),1.514016e-35,0.0,1.0,1.1761e-41,8.944613e-15,0.0,1.0,2.470691e-13,1.378919e-13
100561_DDD_2014J,0,CONSISTENT_GOOD,Consistentes (buen nivel),0.6840016,6e-45,8.037468e-15,3.43644e-15,0.3159992,1.197326e-17,0.684002,0.6238165,0.3481586
100621_CCC_2014B,4,STANDARD_PROFILE,Perfil est√°ndar,0.001462395,4.322117e-37,2.6649400000000002e-17,4.784217e-16,0.9972535,0.001286102,0.997254,0.02084927,0.0116362


In [7]:
# =========================
# [CELDA 3] Crear unique_id + alinear TRAIN (students/target/segmentation)
# =========================
import numpy as np
import pandas as pd

def add_unique_id(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    if "unique_id" not in df.columns:
        df["unique_id"] = (
            df["id_student"].astype(str)
            + "_"
            + df["code_module"].astype(str)
            + "_"
            + df["code_presentation"].astype(str)
        )
    return df

# 1) unique_id en students e interactions (por consistencia)
train_students = add_unique_id(train_students)
train_interactions = add_unique_id(train_interactions)

# 2) Pasar target/segmentation a index string
train_target.index = train_target.index.astype(str)
train_seg.index = train_seg.index.astype(str)

# 3) Index base desde students (es la "verdad" de cohort)
students_idx = train_students["unique_id"].astype(str).unique()
students_idx = pd.Index(students_idx, name="unique_id")

# 4) Intersecci√≥n de todo lo que necesitamos (para no tener NaNs raros)
common_idx = students_idx.intersection(train_target.index).intersection(train_seg.index)

print("üìå Alineaci√≥n TRAIN:")
print("unique_id en students:", len(students_idx))
print("target:", len(train_target))
print("segmentation:", len(train_seg))
print("‚úÖ common_idx:", len(common_idx))

# 5) Filtrar todo a common_idx
train_students_al = train_students.set_index("unique_id").loc[common_idx].copy()
train_target_al = train_target.loc[common_idx].copy()
train_seg_al = train_seg.loc[common_idx].copy()

# 6) Checks r√°pidos
assert train_students_al.index.equals(train_target_al.index)
assert train_students_al.index.equals(train_seg_al.index)

print("\n‚úÖ TRAIN listo para construir ventanas semanales")
print("students_al:", train_students_al.shape)
print("target_al:", train_target_al.shape)
print("seg_al:", train_seg_al.shape)

display(train_students_al.head(2))
display(train_target_al.head(2))
display(train_seg_al.head(2))


üìå Alineaci√≥n TRAIN:
unique_id en students: 22785
target: 22785
segmentation: 22785
‚úÖ common_idx: 22785

‚úÖ TRAIN listo para construir ventanas semanales
students_al: (22785, 15)
target_al: (22785, 1)
seg_al: (22785, 12)


Unnamed: 0_level_0,code_module,code_presentation,id_student,gender,region,highest_education,imd_band,age_band,num_of_prev_attempts,studied_credits,disability,final_result,date_registration,date_unregistration,module_presentation_length
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
11391_AAA_2013J,AAA,2013J,11391,M,East Anglian Region,HE Qualification,90-100%,55<=,0,240,N,Pass,-159.0,,268
28400_AAA_2013J,AAA,2013J,28400,F,Scotland,HE Qualification,20-30%,35-55,0,60,N,Pass,-53.0,,268


Unnamed: 0_level_0,final_result
unique_id,Unnamed: 1_level_1
11391_AAA_2013J,2
28400_AAA_2013J,2


Unnamed: 0_level_0,cluster_id,cluster_label,cluster_name,p_cluster_0,p_cluster_1,p_cluster_2,p_cluster_3,p_cluster_4,p_cluster_5,confidence,entropy,entropy_norm
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
11391_AAA_2013J,4,STANDARD_PROFILE,Perfil est√°ndar,0.002708,1.413943e-30,8.118378e-17,4.360323e-14,0.997291,2.576666e-20,0.997291,0.018712,0.010444
28400_AAA_2013J,4,STANDARD_PROFILE,Perfil est√°ndar,0.001128,0.0,1.017301e-11,4.614538e-10,0.998871,1.09245e-19,0.998871,0.008782,0.004901


In [13]:
# =========================
# [CELDA 5] Prestart + secuencia semanal + robust scaling (log1p + clip) + normalizaci√≥n por curso (fit TRAIN)
# =========================
import numpy as np
import pandas as pd

EPS = 1e-6

def _ensure_numeric(df: pd.DataFrame, cols: list[str]) -> pd.DataFrame:
    out = df.copy()
    for c in cols:
        if c in out.columns:
            out[c] = pd.to_numeric(out[c], errors="coerce").fillna(0.0)
    return out

def add_prestart_features(
    df_interactions: pd.DataFrame,
    index_uids: pd.Index,
    clicks_col: str = "sum_click",
    day_col: str = "date",
) -> pd.DataFrame:
    df = df_interactions.copy()
    df = add_unique_id(df)

    df[clicks_col] = pd.to_numeric(df[clicks_col], errors="coerce").fillna(0).astype(float)
    df[day_col] = pd.to_numeric(df[day_col], errors="coerce").astype(float)

    pre = df[df[day_col] < 0].copy()

    out = pd.DataFrame(index=index_uids)
    if pre.empty:
        out["prestart_clicks_total"] = 0.0
        out["prestart_active_days"] = 0
        out["prestart_active_weeks"] = 0
        out["prestart_earliest_day"] = 0.0
        out["investigated_platform"] = 0
        return out

    pre["week"] = np.floor(pre[day_col] / 7.0).astype(int)

    agg = pre.groupby("unique_id").agg(
        prestart_clicks_total=(clicks_col, "sum"),
        prestart_active_days=(day_col, lambda s: int(pd.Series(s.dropna().astype(int)).nunique())),
        prestart_active_weeks=("week", lambda s: int(pd.Series(s.dropna().astype(int)).nunique())),
        prestart_earliest_day=(day_col, "min"),
    )

    agg = agg.reindex(index=index_uids, fill_value=0.0)
    agg["prestart_active_days"] = agg["prestart_active_days"].astype(int)
    agg["prestart_active_weeks"] = agg["prestart_active_weeks"].astype(int)
    agg["prestart_clicks_total"] = agg["prestart_clicks_total"].astype(float)
    agg["prestart_earliest_day"] = agg["prestart_earliest_day"].astype(float)

    agg["investigated_platform"] = (agg["prestart_clicks_total"] > 0).astype(int)
    return agg


def build_weekly_sequence_features(
    df_interactions: pd.DataFrame,
    index_uids: pd.Index,
    weeks: int,
    activity_col: str = "activity_type",
    clicks_col: str = "sum_click",
    day_col: str = "date",
) -> pd.DataFrame:
    df = df_interactions.copy()
    df = add_unique_id(df)

    df[clicks_col] = pd.to_numeric(df[clicks_col], errors="coerce").fillna(0).astype(float)
    df[day_col] = pd.to_numeric(df[day_col], errors="coerce").astype(float)

    df["week"] = np.floor(df[day_col] / 7.0).astype(int)
    df = df[(df["week"] >= 0) & (df["week"] < weeks)].copy()

    g = df.groupby(["unique_id", "week", activity_col])[clicks_col].sum().reset_index()

    pivot = g.pivot_table(
        index="unique_id",
        columns=["week", activity_col],
        values=clicks_col,
        aggfunc="sum",
        fill_value=0.0,
    )
    pivot.columns = [f"clicks_{str(act).lower()}_w{int(w):02d}" for (w, act) in pivot.columns]
    pivot = pivot.sort_index(axis=1)

    weekly_total = df.groupby(["unique_id", "week"])[clicks_col].sum().unstack(fill_value=0.0)
    for w in range(weeks):
        if w not in weekly_total.columns:
            weekly_total[w] = 0.0
    weekly_total = weekly_total[sorted(weekly_total.columns)]
    weekly_total.columns = [f"total_clicks_w{int(w):02d}" for w in weekly_total.columns]

    active_weeks = (weekly_total.values > 0).sum(axis=1)
    active_weeks = pd.Series(active_weeks, index=weekly_total.index, name="active_weeks_uptoW")

    total_sum = weekly_total.sum(axis=1).astype(float)
    early_weeks = list(range(min(4, weeks)))
    early_sum = weekly_total[[f"total_clicks_w{w:02d}" for w in early_weeks]].sum(axis=1).astype(float)
    early_ratio = (early_sum / (total_sum + EPS)).fillna(0.0).rename("early_ratio_uptoW")

    late_weeks = list(range(max(0, weeks - 4), weeks))
    late_sum = weekly_total[[f"total_clicks_w{w:02d}" for w in late_weeks]].sum(axis=1).astype(float)
    late_ratio = (late_sum / (total_sum + EPS)).fillna(0.0).rename("late_ratio_uptoW")

    out = pivot.join(weekly_total, how="outer").join(active_weeks).join(early_ratio).join(late_ratio).fillna(0.0)
    out = out.reindex(index=index_uids, fill_value=0.0)
    out.index.name = "unique_id"
    return out


def clip_and_log1p(df: pd.DataFrame, p_clip: float = 0.995) -> pd.DataFrame:
    """
    1) Clip por percentil (solo sobre columnas de clicks)
    2) log1p
    """
    out = df.copy()
    click_cols = [c for c in out.columns if c.startswith("clicks_") or c.startswith("total_clicks_")]
    if not click_cols:
        return out

    # clip global (m√°s simple y robusto); evita outliers 500000
    vals = out[click_cols].values.astype(float).ravel()
    clip_val = float(np.quantile(vals, p_clip))
    if clip_val <= 0:
        clip_val = 0.0

    out[click_cols] = np.clip(out[click_cols].astype(float), 0.0, clip_val)
    out[click_cols] = np.log1p(out[click_cols].astype(float))
    return out

def fit_course_stats(
    X_seq: pd.DataFrame,
    course_series: pd.Series,
) -> dict:
    """
    Fit medias/std por curso SOLO para columnas de clicks.
    """
    X_seq = X_seq.copy()
    course_series = course_series.reindex(X_seq.index)
    course_series = course_series.fillna("UNKNOWN").astype(str)

    click_cols = [c for c in X_seq.columns if c.startswith("clicks_") or c.startswith("total_clicks_")]
    stats = {}

    for course, idxs in course_series.groupby(course_series).groups.items():
        block = X_seq.loc[idxs, click_cols]
        mu = block.mean(axis=0)
        sd = block.std(axis=0).replace(0, 1.0)
        stats[str(course)] = {c: {"mean": float(mu[c]), "std": float(sd[c])} for c in click_cols}

    return stats


def apply_course_norm(
    X_seq: pd.DataFrame,
    course_series: pd.Series,
    course_stats: dict,
) -> pd.DataFrame:
    """
    Aplica z-score por curso SOLO a columnas de clicks.
    Ratios (0..1) y features prestart no se normalizan por curso.
    """
    out = X_seq.copy()
    course_series = course_series.reindex(out.index)
    course_series = course_series.fillna("UNKNOWN").astype(str)

    click_cols = [c for c in out.columns if c.startswith("clicks_") or c.startswith("total_clicks_")]
    if not click_cols:
        return out

    # fallback global
    global_mu = out[click_cols].mean(axis=0)
    global_sd = out[click_cols].std(axis=0).replace(0, 1.0)

    for course, idxs in course_series.groupby(course_series).groups.items():
        ckey = str(course)
        if ckey in course_stats:
            mu = pd.Series({col: course_stats[ckey][col]["mean"] for col in click_cols})
            sd = pd.Series({col: course_stats[ckey][col]["std"] for col in click_cols}).replace(0, 1.0)
        else:
            mu, sd = global_mu, global_sd

        out.loc[idxs, click_cols] = (out.loc[idxs, click_cols] - mu) / sd

    out[click_cols] = out[click_cols].replace([np.inf, -np.inf], np.nan).fillna(0.0)
    return out




def build_transformer_block_for_split(
    split_students_al: pd.DataFrame,     # index=unique_id, con code_module/code_presentation
    split_interactions: pd.DataFrame,    # raw interactions del split
    weeks: int,
    course_stats: dict | None = None,    # si None => fit en este split (solo TRAIN)
    p_clip: float = 0.995,
):
    """
    Devuelve:
      - X_seq_norm: secuencia semanal (0..W-1) + features agregadas + prestart, ya normalizado
      - course_stats (si se fittea en este split)
    """
    # course key (module_presentation)
    course = (
        split_students_al["code_module"].astype(str)
        + "_"
        + split_students_al["code_presentation"].astype(str)
    )

    # 1) secuencia + agregados
    X_seq = build_weekly_sequence_features(split_interactions, split_students_al.index, weeks=weeks)

    # 2) prestart
    X_pre = add_prestart_features(split_interactions, split_students_al.index)

    # ratio prestart vs actividad en ventana (sobre TOTAL semanal)
    total_uptoW = X_seq[[c for c in X_seq.columns if c.startswith("total_clicks_w")]].sum(axis=1).astype(float)
    X_pre["prestart_ratio_vs_uptoW"] = (X_pre["prestart_clicks_total"] / (total_uptoW + EPS)).fillna(0.0)

    # 3) unir
    X = pd.concat([X_seq, X_pre], axis=1)

    # 4) robust anti-outliers en clicks (clip + log1p)
    X = clip_and_log1p(X, p_clip=p_clip)

    # 5) normalizaci√≥n por curso (fit solo en TRAIN)
    if course_stats is None:
        course_stats = fit_course_stats(X, course)
    X = apply_course_norm(X, course, course_stats)

    return X, course_stats


# --------- EJEMPLO (TRAIN, W=12) ---------
W = 12
X_train_w12, course_stats_train = build_transformer_block_for_split(
    split_students_al=train_students_al,
    split_interactions=train_interactions,
    weeks=W,
    course_stats=None,      # FIT aqu√≠ (solo TRAIN)
    p_clip=0.995,
)

print("‚úÖ X_train_w12 (seq+prestart+norm):", X_train_w12.shape)
display(X_train_w12.head(3))

# Guardamos stats en memoria para aplicar en val/test luego
COURSE_STATS = course_stats_train


‚úÖ X_train_w12 (seq+prestart+norm): (22785, 236)


Unnamed: 0_level_0,clicks_dataplus_w00,clicks_dataplus_w01,clicks_dataplus_w02,clicks_dataplus_w03,clicks_dataplus_w04,clicks_dataplus_w05,clicks_dataplus_w06,clicks_dataplus_w07,clicks_dataplus_w08,clicks_dataplus_w09,...,total_clicks_w11,active_weeks_uptoW,early_ratio_uptoW,late_ratio_uptoW,prestart_clicks_total,prestart_active_days,prestart_active_weeks,prestart_earliest_day,investigated_platform,prestart_ratio_vs_uptoW
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11391_AAA_2013J,-0.144014,-0.132509,-0.061898,-0.129121,-0.177694,-0.218672,-0.140828,-0.113607,-0.104414,-0.087706,...,1.167762,10.0,0.622177,0.11499,98.0,1,1,-5.0,1,0.201232
28400_AAA_2013J,-0.144014,-0.132509,-0.061898,-0.129121,-0.177694,-0.218672,-0.140828,-0.113607,-0.104414,-0.087706,...,0.551847,11.0,0.612431,0.170018,215.0,7,2,-10.0,1,0.393053
32885_AAA_2013J,-0.144014,-0.132509,-0.061898,-0.129121,-0.177694,-0.218672,-0.140828,-0.113607,-0.104414,-0.087706,...,1.436596,10.0,0.713911,0.19685,295.0,8,2,-10.0,1,0.774278


In [14]:
# =========================
# [CELDA 6] Guardar TRAIN (w12) + checks de coherencia
# =========================
import os
import numpy as np
import pandas as pd

W = 12  # cambia aqu√≠ si est√°s probando otra ventana

# --- 1) Construir X_train_w12 usando COURSE_STATS ya fit (de la celda 5) ---
X_train_w12, COURSE_STATS = build_transformer_block_for_split(
    split_students_al=train_students_al,
    split_interactions=train_interactions,
    weeks=W,
    course_stats=None,   # FIT en TRAIN
    p_clip=0.995,
)

y_train = train_target_al.loc[X_train_w12.index].copy()

# --- 2) Checks r√°pidos ---
print("‚úÖ Shapes")
print("X_train:", X_train_w12.shape)
print("y_train:", y_train.shape)

# NaNs / inf
n_nan = int(np.isnan(X_train_w12.values).sum())
n_inf = int(np.isinf(X_train_w12.values).sum())
print("\n‚úÖ NaN/Inf checks")
print("NaNs:", n_nan, "| Infs:", n_inf)

# Ratios en [0,1] (tolerancia num√©rica)
ratio_cols = [c for c in X_train_w12.columns if c.endswith("_ratio_uptoW") or c.endswith("_ratio_vs_uptoW")]
ratio_bad = {}
for c in ratio_cols:
    mn, mx = float(X_train_w12[c].min()), float(X_train_w12[c].max())
    if mn < -1e-6 or mx > 1 + 1e-6:
        ratio_bad[c] = (mn, mx)

print("\n‚úÖ Ratio columns:", ratio_cols)
print("Ratio fuera de [0,1]?", "NO" if not ratio_bad else f"S√ç -> {ratio_bad}")

# Investigated platform binaria
if "investigated_platform" in X_train_w12.columns:
    uniq = sorted(X_train_w12["investigated_platform"].unique().tolist())
    print("\n‚úÖ investigated_platform unique:", uniq)

# --- 3) Guardado ---
out_split_dir = OUT_DIR / "training"
out_split_dir.mkdir(parents=True, exist_ok=True)

X_path = out_split_dir / f"w{W:02d}_X_seq.csv"
y_path = out_split_dir / f"w{W:02d}_y.csv"

X_train_w12.to_csv(X_path)
y_train.to_csv(y_path)

print("\nüíæ Saved")
print("X:", X_path)
print("y:", y_path)

# Vista r√°pida
display(X_train_w12.head(3))
display(y_train.head(3))


‚úÖ Shapes
X_train: (22785, 236)
y_train: (22785, 1)

‚úÖ NaN/Inf checks
NaNs: 0 | Infs: 0

‚úÖ Ratio columns: ['early_ratio_uptoW', 'late_ratio_uptoW', 'prestart_ratio_vs_uptoW']
Ratio fuera de [0,1]? S√ç -> {'prestart_ratio_vs_uptoW': (0.0, 588000000.0)}

‚úÖ investigated_platform unique: [0, 1]

üíæ Saved
X: /workspace/TFM_education_ai_analytics/data/6_transformer_features/training/w12_X_seq.csv
y: /workspace/TFM_education_ai_analytics/data/6_transformer_features/training/w12_y.csv


Unnamed: 0_level_0,clicks_dataplus_w00,clicks_dataplus_w01,clicks_dataplus_w02,clicks_dataplus_w03,clicks_dataplus_w04,clicks_dataplus_w05,clicks_dataplus_w06,clicks_dataplus_w07,clicks_dataplus_w08,clicks_dataplus_w09,...,total_clicks_w11,active_weeks_uptoW,early_ratio_uptoW,late_ratio_uptoW,prestart_clicks_total,prestart_active_days,prestart_active_weeks,prestart_earliest_day,investigated_platform,prestart_ratio_vs_uptoW
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11391_AAA_2013J,-0.144014,-0.132509,-0.061898,-0.129121,-0.177694,-0.218672,-0.140828,-0.113607,-0.104414,-0.087706,...,1.167762,10.0,0.622177,0.11499,98.0,1,1,-5.0,1,0.201232
28400_AAA_2013J,-0.144014,-0.132509,-0.061898,-0.129121,-0.177694,-0.218672,-0.140828,-0.113607,-0.104414,-0.087706,...,0.551847,11.0,0.612431,0.170018,215.0,7,2,-10.0,1,0.393053
32885_AAA_2013J,-0.144014,-0.132509,-0.061898,-0.129121,-0.177694,-0.218672,-0.140828,-0.113607,-0.104414,-0.087706,...,1.436596,10.0,0.713911,0.19685,295.0,8,2,-10.0,1,0.774278


Unnamed: 0_level_0,final_result
unique_id,Unnamed: 1_level_1
11391_AAA_2013J,2
28400_AAA_2013J,2
32885_AAA_2013J,2
