In [11]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.metrics import Precision, Recall
import numpy as np

class F1Score(tf.keras.metrics.Metric):
    def __init__(self, name='f1_score', **kwargs):
        super().__init__(name=name, **kwargs)
        self.precision = Precision()
        self.recall = Recall()

    def update_state(self, y_true, y_pred, sample_weight=None):
        self.precision.update_state(y_true, y_pred, sample_weight)
        self.recall.update_state(y_true, y_pred, sample_weight)

    def result(self):
        p = self.precision.result()
        r = self.recall.result()
        return 2 * ((p * r) / (p + r + 1e-6))  # Evita división por cero

    def reset_states(self):
        self.precision.reset_states()
        self.recall.reset_states()


def create_progress_evaluator_model():
    """
    Crea un modelo para evaluar el progreso del usuario y predecir
    la probabilidad de recordar una palabra.
    """
    # Definir el modelo
    model = Sequential([
    # Capa de entrada - 3 características
    Dense(16, activation='relu', input_shape=(3,)),
    Dropout(0.2), # Prevenir overfitting
    # Capa oculta
    Dense(8, activation='relu'),
    Dropout(0.1),
    # Capa de salida - probabilidad de recordar la palabra
    Dense(1, activation='sigmoid')
    ])
    # Compilar el modelo
    model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=["F1Score",'accuracy']
    )
    return model
# Crear el modelo
progress_model = create_progress_evaluator_model()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [12]:
'''
def generate_synthetic_training_data(num_samples=1000):
    """
    Genera datos sintéticos para el entrenamiento inicial del modelo.
    """
    X = np.zeros((num_samples, 7))
    y = np.zeros((num_samples, 1))
    for i in range(num_samples):
    # Generar características aleatorias
    times_reviewed = np.random.randint(1, 10)
    times_correct = np.random.randint(0, times_reviewed + 1)
    times_incorrect = times_reviewed - times_correct
    days_since_last = np.random.randint(0, 30)
    avg_response_time = np.random.uniform(1.0, 10.0)
    word_difficulty = np.random.randint(1, 6)
    word_level_idx = np.random.randint(0, 4) # 0:A1, 1:A2, 2:B1, 3:B2
    # Probabilidad "real" (sintética) basada en reglas lógicas
    p_remember = 0.9 * (times_correct / max(1, times_reviewed)) - \
    0.05 * days_since_last - \
    0.1 * word_difficulty + \
    0.2 * (1.0 - min(1.0, avg_response_time / 10.0))
    # Ajustar a rango [0, 1]
    p_remember = max(0.01, min(0.99, p_remember))
    # Asignar valores
    X[i] = [times_reviewed, times_correct, times_incorrect,
    days_since_last, times_correct/max(1, times_reviewed),
    avg_response_time, word_difficulty]
    # El objetivo es 1 si se recuerda, 0 si no
    y[i] = 1 if np.random.random() < p_remember else 0
    return X, y
'''
import pandas as pd

df = pd.read_csv('duolingo.csv')

# Generar datos sintéticos
X_train, y_train = df.loc[:,("times_reviewed","times_correct","delta")],df.loc[:,"recall"] 
# Entrenamiento inicial
progress_model.fit(X_train, y_train, epochs=15, batch_size=32, validation_split=0.2)
# Guardar el modelo
progress_model.save('vocabulary_progress_model.keras')

Epoch 1/15
[1m95555/95555[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m132s[0m 1ms/step - F1Score: 0.9183 - accuracy: 0.8437 - loss: 0.5714 - val_F1Score: 0.9174 - val_accuracy: 0.8477 - val_loss: 0.3520
Epoch 2/15
[1m95555/95555[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m130s[0m 1ms/step - F1Score: 0.9187 - accuracy: 0.8499 - loss: 0.3429 - val_F1Score: 0.9174 - val_accuracy: 0.8475 - val_loss: 0.3394
Epoch 3/15
[1m95555/95555[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m130s[0m 1ms/step - F1Score: 0.9187 - accuracy: 0.8496 - loss: 0.3307 - val_F1Score: 0.9174 - val_accuracy: 0.8475 - val_loss: 0.3236
Epoch 4/15
[1m95555/95555[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m131s[0m 1ms/step - F1Score: 0.9186 - accuracy: 0.8495 - loss: 0.3241 - val_F1Score: 0.9174 - val_accuracy: 0.8475 - val_loss: 0.3067
Epoch 5/15
[1m95555/95555[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m131s[0m 1ms/step - F1Score: 0.9188 - accuracy: 0.8498 - loss: 0.3209 - val_F1Score: 0.9174 - 

In [28]:
# Estructura de datos de entrada

user_word_interaction = {
 "user_id": "user123",
 "word_id": "word456",
 "features": {
 "times_reviewed": 2, # Número de veces que ha visto la palabra
 "times_correct": 1, # Veces que respondió correctamente
 "delta": 24, # Días desde la última revisión
 }
}
#dias 24 48 72 96 120

In [29]:
'''
def determine_review_priority(user_words_data):
    """
    Determina qué palabras necesitan ser repasadas con mayor prioridad.
    Args:
    user_words_data: Lista de diccionarios con datos de interacción
    Returns:
    Lista de IDs de palabras ordenadas por prioridad de repaso
    """
    word_priorities = []
    for word_data in user_words_data:
    recall_prob = predict_recall_probability(word_data)
    # Calcular prioridad: palabras con baja probabilidad de ser recordadas
    # pero que han sido revisadas tienen alta prioridad
    priority_score = (1 - recall_prob) * (word_data["times_reviewed"] + 1)
    word_priorities.append({
    "word_id": word_data["word_id"],
    "recall_probability": recall_prob,
    "priority_score": priority_score
    })
    # Ordenar por prioridad (mayor primero)
    sorted_priorities = sorted(word_priorities,
    key=lambda x: x["priority_score"],
    reverse=True)
    return [item["word_id"] for item in sorted_priorities]

'''

def predict_recall_probability(user_word_data):
    """
    Predice la probabilidad de que un usuario recuerde una palabra
    basado en sus interacciones previas.
    Args:
    user_word_data: Dict con los datos de interacción usuario-palabra
    Returns:
    Probabilidad (0-1) de que el usuario recuerde la palabra
    """
    # Extraer características
    features = [
    user_word_data["times_reviewed"],
    user_word_data["times_correct"],
    user_word_data["delta"],
    ]
    # Convertir a array y dar forma adecuada
    features_array = np.array(features).reshape(1, -1)
    # Realizar predicción
    probability = progress_model.predict(features_array)[0][0]
    return float(probability)


predict_recall_probability(user_word_interaction["features"])



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step


0.5212194323539734

In [53]:
# Estructura de datos de entrada

user_word_interaction = {
 "user_id": "user123",
 "word_id": "word456",
 "features": {
 "times_reviewed": 7, # Número de veces que ha visto la palabra
 "times_correct": 6, # Veces que respondió correctamente
 "delta": 24, # Días desde la última revisión
 }
}
#dias 24 48 72 96 120

predict_recall_probability(user_word_interaction["features"])

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step


0.7212440371513367