In [220]:
import pandas as pd
import numpy as np

# STATE-TRANSITION MATRIX FOR ENGAGEMENT

# NOTE(review): absolute, machine-specific path; the notebook only runs where this file exists.
df = pd.read_csv("/Users/joshchaidez/Desktop/Globant_Motivation_Prediction/data/data_globant.csv")

# Hold out 20% of the people (unique "Name" values, not rows) for testing, so
# every person's records land entirely in either the training or the test set.
TEST_FRACTION = 0.2
unique_names = df["Name"].unique()
np.random.seed(1111)  # fixed seed so the same people are held out on every run
test_names = np.random.choice(
    unique_names,
    size=int(TEST_FRACTION * len(unique_names)),
    replace=False,
)
is_test_row = df["Name"].isin(test_names)
train_df = df[~is_test_row].reset_index(drop=True)
test_df = df[is_test_row].reset_index(drop=True)

# Discretize the engagement values into states and build the transition matrices
def compute_transition_matrices(
    df,
    numero_estados=10,
    threshold_counts=0,
    threshold_prob=0.05,
    value_col="Engagement",
    id_col="Name",
    sort_cols=None,
    value_range=(0.0, 5.0),
):
    """Build state-transition count and probability matrices from per-id series.

    Values in ``value_col`` are cut into ``numero_estados`` equal-width bins
    spanning ``value_range``. Each bin is labelled by its right edge. Rows are
    sorted by ``sort_cols`` and, within each ``id_col`` group, every pair of
    consecutive non-missing states is counted as one transition.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; it is not modified (the work is done on a copy).
    numero_estados : int
        Number of bins (states). Must be at least 1.
    threshold_counts : int or None
        If positive, cells of the returned count matrix below this value are
        set to 0.
    threshold_prob : float or None
        If positive, probabilities below this value are set to 0 and each row
        is renormalized to sum to 1.
    value_col : str
        Column holding the values to discretize.
    id_col : str
        Column identifying the independent sequences.
    sort_cols : list of str or None
        Columns to sort by before counting. When None, ``id_col`` followed by
        whichever of "Year", "Month", "Day" exist in ``df``.
    value_range : tuple of (float, float)
        Lower and upper bound of the binning range. Values outside the range
        get no state and are dropped from the sequences.

    Returns
    -------
    (transition_counts, transition_prob, bins, labels)
        Two DataFrames indexed (rows = origin, columns = destination) by the
        bin labels, the array of bin edges, and the array of labels. Rows
        without any transition are NaN in ``transition_prob``.

    Raises
    ------
    ValueError
        If ``numero_estados`` is smaller than 1.
    """
    if numero_estados < 1:
        raise ValueError("numero_estados must be at least 1")

    if sort_cols is None:
        sort_cols = [id_col] + [c for c in ("Year", "Month", "Day") if c in df.columns]

    # Bin edges and labels: each interval is labelled by its right edge (e.g. 0.5, 1.0, ...)
    low, high = value_range
    bins = np.linspace(low, high, numero_estados + 1)
    labels = bins[1:]

    # Work on a copy so the caller's frame is not mutated
    tmp = df.copy()
    tmp["Engagement_bin"] = pd.cut(tmp[value_col], bins=bins, labels=labels, include_lowest=True)
    tmp = tmp.sort_values(by=sort_cols)

    # Count transitions on the integer category codes (position of the bin)
    # instead of doing one float-keyed .loc increment per transition.
    counts = np.zeros((numero_estados, numero_estados), dtype=int)
    for _, group in tmp.groupby(id_col):
        codes = group["Engagement_bin"].cat.codes.to_numpy()
        codes = codes[codes >= 0]  # code -1 marks a missing / out-of-range value
        # np.add.at accumulates correctly when the same (from, to) pair repeats
        np.add.at(counts, (codes[:-1], codes[1:]), 1)
    transition_counts = pd.DataFrame(counts, index=labels, columns=labels)

    # Row-normalize; rows with no observations become NaN instead of dividing by zero
    transition_prob = transition_counts.div(transition_counts.sum(axis=1).replace(0, np.nan), axis=0)

    # Count threshold.
    # NOTE(review): this runs after the probabilities were computed, so it only
    # changes the returned counts, never the probabilities; confirm that is intended.
    if threshold_counts is not None and threshold_counts > 0:
        transition_counts = transition_counts.where(transition_counts >= threshold_counts, 0)

    # Probability threshold, then renormalize each row
    if threshold_prob is not None and threshold_prob > 0:
        transition_prob = transition_prob.where(transition_prob >= threshold_prob, 0)
        transition_prob = transition_prob.div(transition_prob.sum(axis=1).replace(0, np.nan), axis=0)

    return transition_counts, transition_prob, bins, labels


In [221]:
# Fit the 10-state and 5-state transition matrices on the training data.
# NOTE(review): sort_cols leaves out "Year", whereas the evaluation helpers
# called with sort_keys=None also sort by "Year" when that column exists;
# confirm the data never spans more than one year.
shared_fit_kwargs = dict(
    threshold_counts=0,
    threshold_prob=0.05,
    value_col="Engagement",
    id_col="Name",
    sort_cols=["Name", "Month", "Day"],
)

transition_counts10, transition_probs10, bins10, labels10 = compute_transition_matrices(
    train_df, numero_estados=10, **shared_fit_kwargs
)
transition_counts5, transition_probs5, bins5, labels5 = compute_transition_matrices(
    train_df, numero_estados=5, **shared_fit_kwargs
)

# Discretize the held-out data with exactly the edges/labels used for training
for bin_col, bin_edges, bin_labels in (
    ("Engagement_bin_10", bins10, labels10),
    ("Engagement_bin_5", bins5, labels5),
):
    test_df[bin_col] = pd.cut(
        test_df["Engagement"],
        bins=bin_edges,
        labels=bin_labels,
        include_lowest=True,
    )

transition_probs10, transition_probs5

(          0.5       1.0  1.5       2.0       2.5       3.0       3.5  \
 0.5  0.380952  0.000000  0.0  0.071429  0.159524  0.157143  0.104762   
 1.0  0.000000  0.440367  0.0  0.000000  0.000000  0.440367  0.000000   
 1.5  0.000000  0.000000  0.4  0.000000  0.000000  0.000000  0.133333   
 2.0  0.075650  0.000000  0.0  0.695035  0.158392  0.070922  0.000000   
 2.5  0.000000  0.000000  0.0  0.065236  0.648069  0.200000  0.086695   
 3.0  0.000000  0.000000  0.0  0.000000  0.133086  0.593849  0.273065   
 3.5  0.000000  0.000000  0.0  0.000000  0.000000  0.297530  0.495166   
 4.0  0.000000  0.000000  0.0  0.000000  0.000000  0.066066  0.319069   
 4.5  0.000000  0.000000  0.0  0.000000  0.000000  0.000000  0.107394   
 5.0  0.000000  0.000000  0.0  0.000000  0.000000  0.000000  0.000000   
 
           4.0       4.5       5.0  
 0.5  0.126190  0.000000  0.000000  
 1.0  0.119266  0.000000  0.000000  
 1.5  0.133333  0.200000  0.133333  
 2.0  0.000000  0.000000  0.000000  
 2.5  0.00

In [222]:
# Probar las matrices de transición con los datos de testeo
def test_transition_matrix(
    test_df,
    transition_prob,
    state_col="Engagement_bin",
    id_col="Name",
    sort_keys=None,
):
    if sort_keys is None:
        default_sort = [id_col]
        for c in ("Year", "Month", "Day"):
            if c in test_df.columns:
                default_sort.append(c)
        sort_keys = default_sort

    correct = 0
    total = 0

    tmp = test_df.copy()
    tmp = tmp.sort_values(by=sort_keys)

    for _, group in tmp.groupby(id_col):
        states = group[state_col].dropna().astype(float).values
        for s1, s2 in zip(states[:-1], states[1:]):
            if s1 in transition_prob.index and s2 in transition_prob.columns:
                predicted_probs = transition_prob.loc[s1]
                predicted_state = predicted_probs.idxmax()
                if predicted_state == s2:
                    correct += 1
                total += 1

    accuracy = correct / total if total > 0 else 0
    return accuracy

In [223]:
# Accuracy of the 10-state and 5-state matrices on the held-out people
accuracy10 = test_transition_matrix(
    test_df,
    transition_probs10,
    state_col="Engagement_bin_10",
    id_col="Name",
    sort_keys=None,
)
accuracy5 = test_transition_matrix(
    test_df,
    transition_probs5,
    state_col="Engagement_bin_5",
    id_col="Name",
    sort_keys=None,
)

accuracy10, accuracy5

(0.5427675772503359, 0.7227944469323779)

In [224]:
# Log-likelihood helpers for evaluating the 10- and 5-state matrices on the test data
def _infer_sort_keys(df, id_col, sort_keys):
    """
    Devuelve la lista de columnas para ordenar, usando la misma lógica
    que test_transition_matrix si sort_keys es None.
    """
    if sort_keys is not None:
        return sort_keys
    
    default_sort = [id_col]
    for c in ("Year", "Month", "Day"):
        if c in df.columns:
            default_sort.append(c)
    return default_sort


def compute_log_likelihood(
    test_df,
    transition_prob,
    state_col="Engagement_bin",
    id_col="Name",
    sort_keys=None,
    eps=1e-12,
):
    """Score observed transitions under a transition-probability matrix.

    Rows are sorted with the keys from ``_infer_sort_keys``; within each
    ``id_col`` group every pair of consecutive non-missing states is one
    transition. A transition gets probability ``eps`` when either state is
    absent from the matrix or when the stored probability is zero, negative
    or NaN.

    Returns
    -------
    tuple of (float, float, float)
        Total log-likelihood, mean log-likelihood per transition, and
        perplexity ``exp(-mean)``. All three are NaN when there are no
        transitions.
    """
    ordered = test_df.sort_values(by=_infer_sort_keys(test_df, id_col, sort_keys))
    origin_states = transition_prob.index
    target_states = transition_prob.columns

    logL_total = 0.0
    n_transitions = 0

    for _, person_rows in ordered.groupby(id_col):
        sequence = person_rows[state_col].dropna().astype(float).values
        # zip() yields nothing for sequences shorter than two states
        for prev_state, next_state in zip(sequence[:-1], sequence[1:]):
            prob = eps  # fallback for unknown states and unusable probabilities
            if prev_state in origin_states and next_state in target_states:
                stored = transition_prob.loc[prev_state, next_state]
                # "> 0" is False for zero, negative and NaN entries alike
                if stored is not None and stored > 0:
                    prob = stored

            logL_total += np.log(prob)
            n_transitions += 1

    if n_transitions == 0:
        return np.nan, np.nan, np.nan

    logL_avg = logL_total / n_transitions
    return float(logL_total), float(logL_avg), float(np.exp(-logL_avg))


In [225]:
# Test-set log-likelihood (total, mean per transition, perplexity) for 10 and 5 states
logL10 = compute_log_likelihood(
    test_df,
    transition_probs10,
    state_col="Engagement_bin_10",
    id_col="Name",
    sort_keys=None,
)
logL5 = compute_log_likelihood(
    test_df,
    transition_probs5,
    state_col="Engagement_bin_5",
    id_col="Name",
    sort_keys=None,
)

logL10, logL5

((-6225.241013854957, -2.787837444628283, 16.245849237811758),
 (-3071.174255031168, -1.3753579288093005, 3.9564926122013704))