In [None]:
# Carga target + day0 + segmentaciones de training por semana (W_WINDOWS desde config.py)
from pathlib import Path
import pandas as pd

from educational_ai_analytics.config import W_WINDOWS  # array de semanas

# Root folders of the training artefacts read by this cell.
BASE_FEAT = Path("/workspace/TFM_education_ai_analytics/data/3_features/training")
BASE_SEG = Path("/workspace/TFM_education_ai_analytics/data/5_students_segmented/training")

# 1) Target and day-0 static features; the first CSV column becomes the index.
target, day0 = (
    pd.read_csv(BASE_FEAT / csv_name, index_col=0)
    for csv_name in ("target.csv", "day0_static_features.csv")
)

# 2) One segmentation table per window, keyed by the integer week.
#    Columns get a "w<week>_" prefix so the tables can be joined side by side.
seg_by_w = {
    int(w): pd.read_csv(
        BASE_SEG / f"students_segmented_uptW{int(w)}.csv", index_col=0
    ).add_prefix(f"w{int(w)}_")
    for w in sorted(W_WINDOWS)
}

# 3) Base dataset: only the ids present in both day0 and target.
common_idx = target.index.intersection(day0.index)
df_base = day0.loc[common_idx].join(target.loc[common_idx], how="inner")

# 4) Left-join every weekly segmentation onto the base table
#    (rows of df_base are kept even when a segmentation lacks that id).
df_all = df_base.copy()
for seg in seg_by_w.values():
    df_all = df_all.join(seg, how="left")

# Shape summary of everything that was loaded and combined.
print("W_WINDOWS:", W_WINDOWS)
print("base:", df_base.shape)
for w in sorted(seg_by_w):
    print(f"seg W{w}:", seg_by_w[w].shape)
print("all:", df_all.shape)

df_all.head(3)


W_WINDOWS: [12, 18, 24]
base: (22785, 16)
seg W12: (22785, 11)
seg W18: (22785, 11)
seg W24: (22785, 11)
all: (22785, 49)


Unnamed: 0_level_0,imd_band,age_band,highest_education,num_of_prev_attempts,studied_credits,region_encoded,prestart_clicks_total,prestart_active_days,prestart_active_weeks,prestart_earliest_day,...,w24_cluster_label,w24_cluster_name,w24_p_cluster_0,w24_p_cluster_1,w24_p_cluster_2,w24_p_cluster_3,w24_p_cluster_4,w24_confidence,w24_entropy,w24_entropy_norm
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11391_AAA_2013J,1.356077,2.315221,1.230868,0.0,2.963451,-1.582024,0.329978,-1.123904,-0.914229,0.960497,...,CONSISTENT_GOOD,Consistentes (buen nivel),0.000255,0.0,5.814712e-15,0.9997444,1.113072e-15,0.999744,0.002366,0.00147
28400_AAA_2013J,-1.446584,0.682052,1.230868,0.0,-0.52672,-0.011035,0.766746,0.937025,0.605604,-0.569965,...,METHODICAL_EXPLORER,Exploradores metódicos,0.999356,0.0,0.0006437378,8.682577e-10,9.131973e-20,0.999356,0.005375,0.003339
32885_AAA_2013J,-0.245444,-0.951118,-1.413218,0.0,-0.52672,1.298122,0.943143,1.280514,0.605604,-0.569965,...,METHODICAL_EXPLORER,Exploradores metódicos,0.999998,0.0,5.651113e-17,3.850084e-15,1.619867e-06,0.999998,2.3e-05,1.5e-05


In [39]:
from pathlib import Path
import pandas as pd
import numpy as np
from educational_ai_analytics.config import W_WINDOWS

BASE_PROCESSED = Path("/workspace/TFM_education_ai_analytics/data/2_processed/training")
BASE_TARGET = Path("/workspace/TFM_education_ai_analytics/data/3_features/training/target.csv")

# Load the raw interactions and the target table (sorted by id for a stable row order).
interactions_df = pd.read_csv(BASE_PROCESSED / "interactions.csv", index_col=None)
target_full = pd.read_csv(BASE_TARGET, index_col=0).sort_index()

# The click-count column may be called either "sum_click" or "clicks".
click_col = "sum_click" if "sum_click" in interactions_df.columns else "clicks"
if click_col not in interactions_df.columns:
    raise ValueError("No se encontró ni 'sum_click' ni 'clicks' en interactions.csv")

# Normalise activity names and build unique_id = "<id_student>_<code_module>_<code_presentation>".
interactions_df["activity_type"] = interactions_df["activity_type"].astype(str).str.strip().str.lower()
interactions_df["unique_id"] = (
    interactions_df["id_student"].astype(str)
    .str.cat(interactions_df["code_module"].astype(str), sep="_")
    .str.cat(interactions_df["code_presentation"].astype(str), sep="_")
)

# 0-based week index. Non-numeric/missing dates become -9999, i.e. a negative
# week, so they are removed by the ">= 0" filter below.
interactions_df["week"] = (
    pd.to_numeric(interactions_df["date"], errors="coerce").fillna(-9999) // 7
).astype(int)

# Keep only interactions from week 0 onwards.
interactions_base = interactions_df[interactions_df["week"] >= 0].copy()

# Global activity vocabulary, so every window has the same feature dimension F.
activities_global = sorted(interactions_base["activity_type"].unique().tolist())

# Clicks per (student, week, activity). This does not depend on the window:
# "week" is a grouping key, so filtering the aggregated table by week gives the
# same rows as aggregating the filtered interactions. Compute it once.
clicks_by_week = (
    interactions_base.groupby(["unique_id", "week", "activity_type"], as_index=False)[click_col]
    .sum()
    .rename(columns={click_col: "sum_click"})
)

# Per-window containers, keyed "upto_<W>".
window_keys = [f"upto_{w}" for w in sorted(W_WINDOWS)]
sequences = dict.fromkeys(window_keys)
masks = dict.fromkeys(window_keys)
labels = dict.fromkeys(window_keys)
ids = dict.fromkeys(window_keys)

for upto_week in sorted(W_WINDOWS):
    # Weeks 0 .. upto_week-1 only.
    g = clicks_by_week[clicks_by_week["week"] < upto_week]

    # Full (week, activity) grid so every student gets the same column layout,
    # ordered week-major / activity-minor (required by the reshape below).
    weeks = list(range(upto_week))
    full_cols = pd.MultiIndex.from_product(
        [weeks, activities_global], names=["week", "activity_type"]
    )

    wide = (
        g.pivot_table(
            index="unique_id",
            columns=["week", "activity_type"],
            values="sum_click",
            aggfunc="sum",
            fill_value=0,
        )
        .reindex(columns=full_cols, fill_value=0)
        .sort_index()
    )

    # Row order follows the (sorted) target index for traceability.
    # NOTE: students with no interaction inside the window are absent from
    # `wide` and therefore dropped here, so N differs between windows.
    common_ids = target_full.index.intersection(wide.index)
    wide_w = wide.loc[common_ids]
    target_w = target_full.loc[common_ids]

    # (N, W*F) -> (N, W, F); a week is "observed" when it has at least one click.
    X_seq = wide_w.values.reshape(len(wide_w), upto_week, len(activities_global)).astype(np.float32)
    mask = (X_seq.sum(axis=2) > 0).astype(np.int32)
    y = target_w["final_result"].astype(np.int64).values

    # Duplicate ids in the target would make `.loc` return extra rows and
    # silently misalign X and y; fail loudly instead.
    assert len(X_seq) == len(mask) == len(y) == len(common_ids), (
        f"Misaligned arrays for upto_{upto_week}: "
        f"X={len(X_seq)}, mask={len(mask)}, y={len(y)}, ids={len(common_ids)}"
    )

    key = f"upto_{upto_week}"
    sequences[key] = X_seq
    masks[key] = mask
    labels[key] = y
    ids[key] = common_ids

    print(f"======== WEEK {upto_week} ========")
    print("X_seq:", X_seq.shape)
    print("mask :", mask.shape)
    print("y    :", y.shape)
    print("n_activities:", len(activities_global))
    print("classes:", np.unique(y, return_counts=True))
    print("================================")

# Example of consistent consumption: pick one window and read all four pieces with the same key.
W_KEY = f"upto_{sorted(W_WINDOWS)[0]}"
X = sequences[W_KEY]
M = masks[W_KEY]
y = labels[W_KEY]
uid = ids[W_KEY]

print("\nW_KEY:", W_KEY, "| X:", X.shape, "| M:", M.shape, "| y:", y.shape, "| ids:", len(uid))


X_seq: (19816, 12, 20)
mask : (19816, 12)
y    : (19816,)
n_activities: 20
classes: (array([0, 1, 2, 3]), array([4532, 4576, 8599, 2109]))
X_seq: (19859, 18, 20)
mask : (19859, 18)
y    : (19859,)
n_activities: 20
classes: (array([0, 1, 2, 3]), array([4541, 4596, 8611, 2111]))
X_seq: (19874, 24, 20)
mask : (19874, 24)
y    : (19874,)
n_activities: 20
classes: (array([0, 1, 2, 3]), array([4545, 4605, 8613, 2111]))

W_KEY: upto_12 | X: (19816, 12, 20) | M: (19816, 12) | y: (19816,) | ids: 19816


In [40]:
import tensorflow as tf
from tensorflow.keras import layers


class PositionalEncoding(layers.Layer):
    """Adds a fixed (non-trainable) sinusoidal positional encoding to its input.

    The table is precomputed once for `max_len` positions with shape
    (1, max_len, d_model). The sine terms occupy the first half of the feature
    axis and the cosine terms the second half (concatenated, not interleaved).

    Args:
        d_model: Size of the feature axis of the input.
        max_len: Largest sequence length the layer can encode.
        **kwargs: Standard Keras layer arguments (e.g. `name`, `dtype`).
    """

    def __init__(self, d_model: int, max_len: int = 512, **kwargs):
        super().__init__(**kwargs)
        self.d_model = d_model
        self.max_len = max_len

        pos = tf.range(max_len, dtype=tf.float32)[:, tf.newaxis]   # (L, 1)
        i = tf.range(d_model, dtype=tf.float32)[tf.newaxis, :]     # (1, D)

        # Each pair of feature indices (2k, 2k+1) shares the same frequency.
        angle_rates = 1.0 / tf.pow(10000.0, (2.0 * tf.floor(i / 2.0)) / tf.cast(d_model, tf.float32))
        angle_rads = pos * angle_rates

        sin_part = tf.sin(angle_rads[:, 0::2])
        cos_part = tf.cos(angle_rads[:, 1::2])

        pe = tf.concat([sin_part, cos_part], axis=-1)              # (L, D)
        self.pe = pe[tf.newaxis, ...]                              # (1, L, D)

    def call(self, x):
        """Return `x` plus the encoding for its first `seq_len` positions.

        Args:
            x: Tensor whose axis 1 is the sequence axis and whose last axis has
                size `d_model` (the table is sliced on axis 1 and broadcast
                over axis 0).

        Raises:
            ValueError: If the statically known sequence length exceeds `max_len`.
        """
        # Slicing past max_len would silently return a shorter table and fail
        # later with an opaque broadcasting error, so check up front when the
        # length is statically known.
        static_len = x.shape[1]
        if static_len is not None and static_len > self.max_len:
            raise ValueError(
                f"Sequence length {static_len} exceeds max_len={self.max_len} "
                "of PositionalEncoding"
            )

        seq_len = tf.shape(x)[1]
        # Cast so the addition also works when x is not float32 (e.g. mixed precision).
        return x + tf.cast(self.pe[:, :seq_len, :], x.dtype)


class TransformerEncoderBlock(layers.Layer):
    """Transformer encoder block: self-attention followed by a feed-forward net.

    Each sub-layer is wrapped as `LayerNorm(x + Dropout(sublayer(x)))`, i.e.
    layer normalisation is applied after the residual addition.

    Args:
        d_model: Feature size of the block's input and output.
        num_heads: Number of attention heads; each head uses
            `d_model // num_heads` key dimensions.
        ff_dim: Hidden width of the feed-forward network.
        dropout: Dropout rate used in attention, the feed-forward net and on
            both sub-layer outputs.
    """

    def __init__(self, d_model: int, num_heads: int, ff_dim: int, dropout: float):
        super().__init__()

        # --- Self-attention sub-layer ---
        per_head_dim = d_model // num_heads
        self.mha = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=per_head_dim, dropout=dropout
        )
        self.dropout1 = layers.Dropout(dropout)
        self.norm1 = layers.LayerNormalization(epsilon=1e-6)
        self.add1 = layers.Add()

        # --- Position-wise feed-forward sub-layer: d_model -> ff_dim -> d_model ---
        feed_forward_layers = [
            layers.Dense(ff_dim, activation="relu"),
            layers.Dropout(dropout),
            layers.Dense(d_model),
        ]
        self.ffn = tf.keras.Sequential(feed_forward_layers)
        self.dropout2 = layers.Dropout(dropout)
        self.norm2 = layers.LayerNormalization(epsilon=1e-6)
        self.add2 = layers.Add()

    def call(self, x, training=False, attention_mask=None):
        """Apply the block to `x`.

        Args:
            x: Input tensor, used as query, key and value of the attention.
            training: Forwarded to the attention, feed-forward and dropout layers.
            attention_mask: Optional mask forwarded unchanged to the attention layer.

        Returns:
            The output of the second layer normalisation.
        """
        # Residual branch 1: self-attention.
        attended = self.mha(
            query=x, key=x, value=x, attention_mask=attention_mask, training=training
        )
        attended = self.dropout1(attended, training=training)
        hidden = self.norm1(self.add1([x, attended]))

        # Residual branch 2: feed-forward network.
        transformed = self.dropout2(self.ffn(hidden, training=training), training=training)
        return self.norm2(self.add2([hidden, transformed]))


class GLULayer(layers.Layer):
    """Gated linear unit: h(X) = (XW) * sigmoid(XV).

    A single dense projection to `2 * d_model` features produces both factors;
    the first half is the value and the second half is the gate.
    """

    def __init__(self, d_model: int):
        super().__init__()
        self.proj = layers.Dense(2 * d_model)

    def call(self, x):
        """Project `x`, split the last axis in two and gate one half with the other."""
        value, gate = tf.split(self.proj(x), 2, axis=-1)
        return value * tf.math.sigmoid(gate)


class GLUTransformerClassifier(tf.keras.Model):
    """Sequence classifier: projection -> positional encoding -> encoders -> GLU -> pooled head.

    Args:
        latent_d: Model width; must be divisible by `num_heads`.
        num_heads: Attention heads in every encoder block.
        ff_dim: Hidden width of each encoder's feed-forward network.
        dropout: Dropout rate used on the input, in the encoders and in the head.
        num_classes: Number of softmax outputs.
        num_layers: Number of stacked encoder blocks.
        max_len: Number of positions precomputed by the positional encoding.

    Raises:
        ValueError: If `latent_d` is not divisible by `num_heads`.
    """

    def __init__(
        self,
        latent_d: int,
        num_heads: int,
        ff_dim: int,
        dropout: float,
        num_classes: int,
        num_layers: int = 2,
        max_len: int = 512,
    ):
        super().__init__()
        if latent_d % num_heads != 0:
            raise ValueError("latent_d debe ser divisible por num_heads")

        # Input stage: per-timestep projection to latent_d, positions, dropout.
        self.input_proj = layers.Dense(latent_d)
        self.pos_encoding = PositionalEncoding(d_model=latent_d, max_len=max_len)
        self.input_dropout = layers.Dropout(dropout)

        # Stack of identical encoder blocks.
        encoder_kwargs = dict(
            d_model=latent_d, num_heads=num_heads, ff_dim=ff_dim, dropout=dropout
        )
        self.encoders = [TransformerEncoderBlock(**encoder_kwargs) for _ in range(num_layers)]

        # Gating and normalisation applied before temporal pooling.
        self.glu = GLULayer(d_model=latent_d)
        self.norm_out = layers.LayerNormalization(epsilon=1e-6)

        # Classification head producing class probabilities.
        self.head = tf.keras.Sequential(
            [
                layers.Dense(latent_d, activation="relu"),
                layers.Dropout(dropout),
                layers.Dense(num_classes, activation="softmax"),
            ]
        )

    def call(self, inputs, training=False, mask=None):
        """Compute class probabilities.

        Args:
            inputs: Either the sequence tensor alone, or a `(X_seq, seq_mask)`
                tuple/list, which allows `model.fit(x=(X_seq, seq_mask), ...)`.
            training: Forwarded to the dropout layers and encoder blocks.
            mask: Accepted for Keras API compatibility; not used by this method.

        Returns:
            The output of the softmax head, one row per sample.
        """
        if isinstance(inputs, (tuple, list)):
            x, seq_mask = inputs
        else:
            x, seq_mask = inputs, None
        has_mask = seq_mask is not None

        # (B, W, F) -> (B, W, D), then positions and dropout.
        x = self.input_dropout(
            self.pos_encoding(self.input_proj(x)), training=training
        )

        # Attention mask of shape (B, 1, W), broadcast over the query axis.
        attn_mask = tf.cast(seq_mask[:, tf.newaxis, :], tf.bool) if has_mask else None

        for encoder in self.encoders:
            x = encoder(x, training=training, attention_mask=attn_mask)

        x = self.norm_out(self.glu(x))

        # Without a mask: plain mean over the time axis.
        if not has_mask:
            return self.head(tf.reduce_mean(x, axis=1))

        # With a mask: mean over unmasked timesteps only; the epsilon keeps the
        # division finite when a row has no unmasked timestep.
        step_weights = tf.cast(seq_mask, x.dtype)[:, :, tf.newaxis]  # (B, W, 1)
        weighted_sum = tf.reduce_sum(x * step_weights, axis=1)
        n_valid = tf.reduce_sum(step_weights, axis=1)
        return self.head(weighted_sum / (n_valid + 1e-8))


In [None]:
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight

# ---------- Reproducibility ----------
# Seeds Python's `random`, NumPy and TensorFlow so weight initialisation,
# dropout and batch shuffling repeat across runs. (Bit-exact GPU determinism
# may additionally need tf.config.experimental.enable_op_determinism().)
SEED = 42
tf.keras.utils.set_random_seed(SEED)

# ---------- Window selection ----------
W_KEY = "upto_12"  # change to: upto_18 / upto_24
X = sequences[W_KEY].astype(np.float32)   # (N, W, F)
M = masks[W_KEY].astype(np.int32)         # (N, W)
y = labels[W_KEY].astype(np.int64)        # (N,)

# ---------- Train/validation split (stratified on the label) ----------
X_tr, X_va, M_tr, M_va, y_tr, y_va = train_test_split(
    X, M, y,
    test_size=0.2,
    random_state=SEED,
    stratify=y
)

# ---------- Class weights (multiclass, computed on the training split only) ----------
classes = np.unique(y_tr)
cw = compute_class_weight(class_weight="balanced", classes=classes, y=y_tr)
class_weight = {int(c): float(w) for c, w in zip(classes, cw)}
print("class_weight:", class_weight)

# ---------- Model ----------
model = GLUTransformerClassifier(
    latent_d=128,      # paper-inspired
    num_heads=4,       # 128 % 4 == 0 (latent_d must be divisible by num_heads)
    ff_dim=512,
    dropout=0.1,
    num_classes=4,     # multiclass
    num_layers=2,
    max_len=128
)

optimizer = tf.keras.optimizers.AdamW(
    learning_rate=1e-4,
    weight_decay=1e-4
)

# The model head already applies softmax, so the loss receives probabilities.
model.compile(
    optimizer=optimizer,
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)

callbacks = [
    # Stop after 6 epochs without val_loss improvement and restore the best weights.
    tf.keras.callbacks.EarlyStopping(
        monitor="val_loss",
        patience=6,
        restore_best_weights=True
    ),
    # Halve the learning rate after 3 epochs without val_loss improvement.
    tf.keras.callbacks.ReduceLROnPlateau(
        monitor="val_loss",
        factor=0.5,
        patience=3,
        min_lr=1e-6
    )
]

# ---------- Training ----------
history = model.fit(
    x=(X_tr, M_tr),                  # (X_seq, seq_mask)
    y=y_tr,
    validation_data=((X_va, M_va), y_va),
    epochs=40,
    batch_size=128,
    class_weight=class_weight,
    callbacks=callbacks,
    verbose=1
)

# ---------- Quick evaluation ----------
# The restored weights are those of the best val_loss epoch, so val_acc here
# can differ from the best val_accuracy seen during training.
val_loss, val_acc = model.evaluate((X_va, M_va), y_va, verbose=0)
print(f"val_loss={val_loss:.4f} | val_acc={val_acc:.4f}")
print("best_val_acc:", max(history.history["val_accuracy"]))


class_weight: {0: 1.0932413793103448, 1: 1.0824911226440863, 2: 0.5761011774967292, 3: 2.3491404860699467}
Epoch 1/40
[1m121/124[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 10ms/step - accuracy: 0.3290 - loss: 1.3427

In [42]:
# Accuracy curves recorded by model.fit, one value per epoch.
train_acc_curve = history.history["accuracy"]
val_acc_curve = history.history["val_accuracy"]

# Train/validation accuracy of the last recorded epoch.
print("train_acc_last:", train_acc_curve[-1])
print("val_acc_last  :", val_acc_curve[-1])

# Highest validation accuracy reached in any epoch.
print("val_acc_best  :", max(val_acc_curve))

# Explicit evaluation of the current model weights on the validation split.
val_loss, val_acc = model.evaluate((X_va, M_va), y_va, verbose=0)
print("val_loss_eval :", val_loss)
print("val_acc_eval  :", val_acc)


train_acc_last: 0.4547060430049896
val_acc_last  : 0.425580233335495
val_acc_best  : 0.425580233335495
val_loss_eval : 1.1786150932312012
val_acc_eval  : 0.42078709602355957


In [43]:
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, confusion_matrix, f1_score, balanced_accuracy_score

# ========= 0) Reproducibility =========
# Seeds Python's `random`, NumPy and TensorFlow so weight initialisation,
# dropout and batch shuffling repeat across runs. (Bit-exact GPU determinism
# may additionally need tf.config.experimental.enable_op_determinism().)
SEED = 42
tf.keras.utils.set_random_seed(SEED)

# ========= 1) Multiclass data =========
W_KEY = "upto_12"  # change to upto_18 / upto_24 to compare
X = sequences[W_KEY]          # (N, W, F)
M = masks[W_KEY]              # (N, W)
y = labels[W_KEY].astype(int) # 0,1,2,3

# Stratified 80/20 split; X, M and y are split with the same row permutation.
X_tr, X_va, M_tr, M_va, y_tr, y_va = train_test_split(
    X, M, y, test_size=0.2, random_state=SEED, stratify=y
)

# Class weights (training split only) to avoid collapsing onto "Pass".
classes = np.unique(y_tr)
cw = compute_class_weight(class_weight="balanced", classes=classes, y=y_tr)
class_weight = {int(c): float(w) for c, w in zip(classes, cw)}
print("class_weight:", class_weight)

# ========= 2) Model (paper-inspired, multiclass) =========
model = GLUTransformerClassifier(
    latent_d=512,      # "paper-like" hidden size
    num_heads=8,       # 512 % 8 = 0
    ff_dim=512,        # can be raised to 1024/2048 later
    dropout=0.1,
    num_classes=4,     # multiclass
    num_layers=2,      # 2 encoders
    max_len=128
)

optimizer = tf.keras.optimizers.AdamW(
    learning_rate=3e-4,
    weight_decay=1e-4
)

# The model head already applies softmax, so the loss receives probabilities.
model.compile(
    optimizer=optimizer,
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)

callbacks = [
    # Stop after 6 epochs without val_loss improvement and restore the best weights.
    tf.keras.callbacks.EarlyStopping(
        monitor="val_loss", patience=6, restore_best_weights=True
    ),
    # Halve the learning rate after 3 epochs without val_loss improvement.
    tf.keras.callbacks.ReduceLROnPlateau(
        monitor="val_loss", factor=0.5, patience=3, min_lr=1e-6
    )
]

history = model.fit(
    x=(X_tr, M_tr),
    y=y_tr,
    validation_data=((X_va, M_va), y_va),
    epochs=40,
    batch_size=128,
    class_weight=class_weight,
    callbacks=callbacks,
    verbose=1
)

# ========= 3) Multiclass evaluation =========
val_loss, val_acc = model.evaluate((X_va, M_va), y_va, verbose=0)
y_proba = model.predict((X_va, M_va), verbose=0)
y_pred = np.argmax(y_proba, axis=1)  # most probable class per sample

# Accuracy alone hides per-class behaviour on imbalanced labels, so also
# report macro/weighted F1 and balanced accuracy.
macro_f1 = f1_score(y_va, y_pred, average="macro")
weighted_f1 = f1_score(y_va, y_pred, average="weighted")
bal_acc = balanced_accuracy_score(y_va, y_pred)

print(f"val_loss={val_loss:.4f} val_acc={val_acc:.4f}")
print(f"macro_f1={macro_f1:.4f} weighted_f1={weighted_f1:.4f} balanced_acc={bal_acc:.4f}")
print(classification_report(y_va, y_pred, digits=4))
print("Confusion matrix:\n", confusion_matrix(y_va, y_pred))


class_weight: {0: 1.0932413793103448, 1: 1.0824911226440863, 2: 0.5761011774967292, 3: 2.3491404860699467}
Epoch 1/40


KeyboardInterrupt: 