# Clustering con Datos Reales
## Entrenamiento para uso futuro (actualmente sin datos reales)

In [None]:
import sys
sys.path.append('../scripts')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score

from database_connection import execute_query
import joblib
import json
from datetime import datetime

%matplotlib inline

## 1. Verificar Disponibilidad de Datos Reales

In [None]:
# Headline counts for every user outside the 1001-1100 id range
# (presumably the synthetic accounts - TODO confirm against the seeding script).
query_count = """
SELECT 
    COUNT(DISTINCT s.user_id) as total_users,
    COUNT(DISTINCT s.session_id) as total_sessions,
    COUNT(DISTINCT ua.activity_uuid) as total_activities,
    MIN(s.created_at) as first_session,
    MAX(s.created_at) as last_session
FROM session_service_test.sessions s
LEFT JOIN session_service_test.user_activities ua ON s.session_id = ua.session_id
WHERE s.user_id NOT BETWEEN 1001 AND 1100
"""

real_data_stats = execute_query(query_count)

print("ESTADÍSTICAS DE DATOS REALES:")
print("="*60)
if not real_data_stats:
    # Nothing came back from the database: every later cell is skipped.
    print("No hay datos reales disponibles todavía")
    CAN_TRAIN = False
else:
    stats = real_data_stats[0]

    # (caption, result column) pairs, printed in this order.
    summary_fields = (
        ("Total usuarios reales", 'total_users'),
        ("Total sesiones", 'total_sessions'),
        ("Total actividades", 'total_activities'),
        ("Primera sesión", 'first_session'),
        ("Última sesión", 'last_session'),
    )
    for caption, column in summary_fields:
        print(f"{caption}: {stats[column]}")

    # MIN_USERS gates training below; MIN_ACTIVITIES is only declared here.
    MIN_USERS = 50
    MIN_ACTIVITIES = 5

    n_real_users = stats['total_users']
    enough_users = n_real_users >= MIN_USERS
    if enough_users:
        print(f"\n✓ Suficientes usuarios para entrenar ({n_real_users} >= {MIN_USERS})")
    else:
        print(f"\n✗ Insuficientes usuarios ({n_real_users} < {MIN_USERS})")
        print(f"  Se necesitan al menos {MIN_USERS - n_real_users} usuarios más")
    CAN_TRAIN = True if enough_users else False

print("\n" + "="*60)

## 2. Cargar Datos Reales (si están disponibles)

In [None]:
if CAN_TRAIN:
    # Build one feature row per user from six per-user aggregates (CTEs),
    # left-joined onto user_activity_stats. Every CTE skips user ids
    # 1001-1100 (presumably the synthetic accounts - TODO confirm).
    #
    # Changes in distraction_stats compared with the earlier query:
    #   * minute_summaries is pre-aggregated per activity before the join.
    #     Joining the raw minute rows and then summing the activity duration
    #     counted each activity's duration once per minute row.
    #   * the duration is measured in seconds and divided by 3600.
    #     TIMESTAMPDIFF(HOUR, ...) truncates to whole hours, so any activity
    #     shorter than one hour contributed 0 and the per-hour rates
    #     collapsed to 0 through NULLIF/COALESCE.
    # NOTE(review): COALESCE(ua.completed_at, NOW()) still charges unfinished
    # activities with time up to "now"; confirm that is intended.
    #
    # The minimum-activities filter uses MIN_ACTIVITIES from the availability
    # cell instead of repeating the literal. int() keeps the interpolation
    # strictly numeric.
    query_real = f"""
    WITH user_activity_stats AS (
        SELECT
            s.user_id,
            COUNT(DISTINCT ua.activity_uuid) as total_activities,
            SUM(CASE WHEN ua.status = 'completed' THEN 1 ELSE 0 END) as completed_activities,
            SUM(CASE WHEN ua.status = 'abandoned' THEN 1 ELSE 0 END) as abandoned_activities,
            AVG(ua.pause_count) as avg_pause_count,
            AVG(TIMESTAMPDIFF(MINUTE, ua.started_at, ua.completed_at)) as avg_activity_duration_minutes
        FROM session_service_test.sessions s
        JOIN session_service_test.user_activities ua ON s.session_id = ua.session_id
        WHERE ua.started_at IS NOT NULL
        AND s.user_id NOT BETWEEN 1001 AND 1100
        GROUP BY s.user_id
    ),
    emotion_stats AS (
        SELECT
            s.user_id,
            AVG(CASE WHEN ms.predominant_emotion = 'Angry' THEN ms.emotion_confidence_avg ELSE 0 END) as avg_frustration,
            AVG(ms.looking_screen_percentage) as avg_visual_attention,
            AVG(ms.ear_avg) as avg_ear,
            AVG(CASE WHEN ms.engagement_level = 'high' THEN 3
                     WHEN ms.engagement_level = 'medium' THEN 2
                     ELSE 1 END) as avg_engagement_score
        FROM session_service_test.sessions s
        JOIN session_service_test.user_activities ua ON s.session_id = ua.session_id
        JOIN monitoring_service_test.minute_summaries ms ON ua.activity_uuid = ms.activity_uuid
        WHERE s.user_id NOT BETWEEN 1001 AND 1100
        GROUP BY s.user_id
    ),
    distraction_stats AS (
        SELECT
            s.user_id,
            SUM(per_activity.distraction_events) as total_distraction_events,
            SUM(per_activity.drowsiness_events) as total_drowsiness_events,
            SUM(TIMESTAMPDIFF(SECOND, ua.started_at, COALESCE(ua.completed_at, NOW()))) / 3600 as total_hours
        FROM session_service_test.sessions s
        JOIN session_service_test.user_activities ua ON s.session_id = ua.session_id
        JOIN (
            SELECT
                activity_uuid,
                SUM(distraction_count) as distraction_events,
                SUM(drowsiness_count) as drowsiness_events
            FROM monitoring_service_test.minute_summaries
            GROUP BY activity_uuid
        ) per_activity ON ua.activity_uuid = per_activity.activity_uuid
        WHERE s.user_id NOT BETWEEN 1001 AND 1100
        GROUP BY s.user_id
    ),
    intervention_stats AS (
        SELECT
            s.user_id,
            COUNT(i.packet_id) as total_interventions,
            SUM(CASE WHEN i.intervention_type = 'video_instruction' THEN 1 ELSE 0 END) as video_interventions,
            SUM(CASE WHEN i.intervention_type = 'text_instruction' THEN 1 ELSE 0 END) as text_interventions,
            SUM(CASE WHEN i.intervention_type = 'vibration_only' THEN 1 ELSE 0 END) as vibration_interventions
        FROM session_service_test.sessions s
        JOIN session_service_test.user_activities ua ON s.session_id = ua.session_id
        JOIN monitoring_service_test.interventions i ON ua.activity_uuid = i.activity_uuid
        WHERE s.user_id NOT BETWEEN 1001 AND 1100
        GROUP BY s.user_id
    ),
    activity_type_performance AS (
        SELECT
            s.user_id,
            SUM(CASE WHEN am.activity_type IN ('tracing', 'memory_game') AND ua.status = 'completed' THEN 1 ELSE 0 END) as easy_completed,
            SUM(CASE WHEN am.activity_type IN ('tracing', 'memory_game') THEN 1 ELSE 0 END) as easy_total,
            SUM(CASE WHEN am.activity_type IN ('fill_in_blank', 'reading_comprehension') AND ua.status = 'completed' THEN 1 ELSE 0 END) as hard_completed,
            SUM(CASE WHEN am.activity_type IN ('fill_in_blank', 'reading_comprehension') THEN 1 ELSE 0 END) as hard_total
        FROM session_service_test.sessions s
        JOIN session_service_test.user_activities ua ON s.session_id = ua.session_id
        JOIN session_service_test.activity_masters am ON ua.external_activity_id = am.external_activity_id
        WHERE s.user_id NOT BETWEEN 1001 AND 1100
        GROUP BY s.user_id
    ),
    visual_fatigue_stats AS (
        SELECT
            s.user_id,
            SUM(CASE WHEN ms.ear_avg < 0.25 THEN 1 ELSE 0 END) as low_ear_count,
            COUNT(*) as total_minutes
        FROM session_service_test.sessions s
        JOIN session_service_test.user_activities ua ON s.session_id = ua.session_id
        JOIN monitoring_service_test.minute_summaries ms ON ua.activity_uuid = ms.activity_uuid
        WHERE s.user_id NOT BETWEEN 1001 AND 1100
        GROUP BY s.user_id
    )
    SELECT
        uas.user_id,
        COALESCE(uas.completed_activities / NULLIF(uas.total_activities, 0), 0) as completion_rate,
        COALESCE(uas.abandoned_activities / NULLIF(uas.total_activities, 0), 0) as abandonment_rate,
        COALESCE(es.avg_frustration, 0) as avg_frustration,
        COALESCE(es.avg_visual_attention, 0) as avg_visual_attention,
        COALESCE(es.avg_engagement_score, 1) as avg_engagement_score,
        COALESCE(ds.total_distraction_events / NULLIF(ds.total_hours, 0), 0) as distraction_events_per_hour,
        COALESCE(ds.total_drowsiness_events / NULLIF(ds.total_hours, 0), 0) as drowsiness_events_per_hour,
        COALESCE(vfs.low_ear_count / NULLIF(vfs.total_minutes, 0), 0) as avg_visual_fatigue,
        COALESCE(uas.avg_pause_count, 0) as avg_pause_count,
        COALESCE(uas.avg_activity_duration_minutes, 0) as avg_activity_duration_minutes,
        COALESCE(is_total.total_interventions / NULLIF(uas.total_activities, 0), 0) as intervention_count_per_activity,
        COALESCE(is_total.video_interventions / NULLIF(is_total.total_interventions, 0), 0) as response_to_video,
        COALESCE(is_total.text_interventions / NULLIF(is_total.total_interventions, 0), 0) as response_to_text,
        COALESCE(is_total.vibration_interventions / NULLIF(is_total.total_interventions, 0), 0) as response_to_vibration,
        COALESCE(atp.easy_completed / NULLIF(atp.easy_total, 0), 0) as preference_easy_activities
    FROM user_activity_stats uas
    LEFT JOIN emotion_stats es ON uas.user_id = es.user_id
    LEFT JOIN distraction_stats ds ON uas.user_id = ds.user_id
    LEFT JOIN intervention_stats is_total ON uas.user_id = is_total.user_id
    LEFT JOIN activity_type_performance atp ON uas.user_id = atp.user_id
    LEFT JOIN visual_fatigue_stats vfs ON uas.user_id = vfs.user_id
    WHERE uas.total_activities >= {int(MIN_ACTIVITIES)}
    ORDER BY uas.user_id
    """

    real_data = execute_query(query_real)
    df_real = pd.DataFrame(real_data)

    if not df_real.empty:
        # The driver may return Decimal objects for AVG()/division results
        # (assumption - depends on execute_query). Coerce the feature columns
        # to float so the statistics, plots and scaler in later cells operate
        # on numeric dtypes.
        numeric_cols = [col for col in df_real.columns if col != 'user_id']
        df_real[numeric_cols] = df_real[numeric_cols].astype(float)

    print(f"Datos reales cargados: {df_real.shape[0]} usuarios")

    if df_real.shape[0] < MIN_USERS:
        # The availability cell counts every user with a session, while this
        # query keeps only users with enough activities. Re-check here so the
        # later cells do not try to cluster a handful of rows.
        print(f"\n✗ Solo {df_real.shape[0]} usuarios tienen al menos "
              f"{MIN_ACTIVITIES} actividades (se requieren {MIN_USERS})")
        print("  Entrenamiento deshabilitado hasta contar con más datos")
        CAN_TRAIN = False
    else:
        print(f"\nPrimeros registros:")
        display(df_real.head())
else:
    print("No hay suficientes datos reales para entrenar")
    print("Este notebook quedará como template para cuando estén disponibles")

## 3. Comparación: Datos Fake vs Datos Reales

In [None]:
if CAN_TRAIN:
    # Side-by-side summary statistics and histograms of the synthetic
    # feature table (CSV) against the real feature table (df_real).
    df_fake = pd.read_csv('../output/features_for_clustering.csv')

    # Compare only columns present in BOTH tables; indexing df_real with a
    # column that exists only in the CSV would raise KeyError.
    fake_features = [col for col in df_fake.columns if col != 'user_id']
    feature_cols = [col for col in fake_features if col in df_real.columns]
    skipped_cols = [col for col in fake_features if col not in df_real.columns]
    if skipped_cols:
        print(f"Columnas sin equivalente en datos reales (omitidas): {skipped_cols}")

    if feature_cols:
        # Force float so Decimal/object columns coming from the database
        # behave like the CSV columns.
        real_features = df_real[feature_cols].astype(float)
        fake_features_df = df_fake[feature_cols]

        comparison_stats = pd.DataFrame({
            'Feature': feature_cols,
            'Fake_Mean': fake_features_df.mean().values,
            'Real_Mean': real_features.mean().values,
            'Fake_Std': fake_features_df.std().values,
            'Real_Std': real_features.std().values
        })

        # A zero fake mean has no meaningful relative difference: report NaN
        # instead of +/-inf.
        fake_mean_nonzero = comparison_stats['Fake_Mean'].replace(0, np.nan)
        comparison_stats['Diff_%'] = ((comparison_stats['Real_Mean'] - comparison_stats['Fake_Mean']) /
                                       fake_mean_nonzero * 100).round(2)

        print("COMPARACIÓN FAKE vs REAL:")
        print("="*80)
        display(comparison_stats)

        # One panel per shared feature, three per row. With 9 features this
        # is the same 3x3 / 18x12 figure as before; more features add rows
        # instead of being silently dropped.
        n_cols = 3
        n_rows = -(-len(feature_cols) // n_cols)  # ceiling division
        fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 4 * n_rows), squeeze=False)
        axes = axes.flatten()

        for ax, feature in zip(axes, feature_cols):
            ax.hist(fake_features_df[feature], bins=20, alpha=0.5, label='Fake', color='blue')
            ax.hist(real_features[feature], bins=20, alpha=0.5, label='Real', color='red')
            ax.set_title(feature, fontsize=10)
            ax.legend()
            ax.grid(True, alpha=0.3)

        # Hide unused panels in the last row.
        for ax in axes[len(feature_cols):]:
            ax.set_visible(False)

        plt.tight_layout()
        plt.savefig('../output/fake_vs_real_comparison.png', dpi=300, bbox_inches='tight')
        plt.show()
    else:
        print("Comparación no disponible - no hay features en común entre fake y real")
else:
    print("Comparación no disponible - esperando datos reales")

## 4. Preprocesamiento de Datos Reales

In [None]:
if CAN_TRAIN:
    # Everything except the identifier is a feature. 'cluster' is excluded
    # too: the training cell adds that column to df_real, so re-running this
    # cell afterwards would otherwise feed the previous labels back in as a
    # feature.
    non_feature_cols = {'user_id', 'cluster'}
    feature_cols = [col for col in df_real.columns if col not in non_feature_cols]

    # Explicit float matrix so object/Decimal columns from the database
    # are converted before scaling.
    X_real = df_real[feature_cols].to_numpy(dtype=float)
    user_ids_real = df_real['user_id'].values

    # Standardise each feature to zero mean / unit variance; the fitted
    # scaler is saved later together with the model.
    scaler_real = StandardScaler()
    X_real_scaled = scaler_real.fit_transform(X_real)

    print(f"Datos normalizados: {X_real_scaled.shape}")
    print(f"\nEstadísticas después de normalización:")
    print(f"  Media: {X_real_scaled.mean(axis=0).round(3)}")
    print(f"  Desviación estándar: {X_real_scaled.std(axis=0).round(3)}")
else:
    print("Esperando datos reales para preprocesar")

## 5. Determinar K Óptimo con Datos Reales

In [None]:
if CAN_TRAIN:
    # Sweep K and record four quality metrics per value, then plot them and
    # suggest the K with the highest silhouette score.
    #
    # silhouette_score needs at most n_samples - 1 clusters, so the sweep is
    # capped to what the data can support (upper bound stays 8).
    n_samples = X_real_scaled.shape[0]
    max_k = min(8, n_samples - 1)
    if max_k < 2:
        raise ValueError(
            f"Se necesitan al menos 3 usuarios para evaluar K; hay {n_samples}"
        )
    K_range = range(2, max_k + 1)

    inertias_real = []
    silhouette_scores_real = []
    calinski_scores_real = []
    davies_bouldin_scores_real = []

    for k in K_range:
        kmeans = KMeans(n_clusters=k, random_state=42, n_init=20)
        labels = kmeans.fit_predict(X_real_scaled)

        inertias_real.append(kmeans.inertia_)
        silhouette_scores_real.append(silhouette_score(X_real_scaled, labels))
        calinski_scores_real.append(calinski_harabasz_score(X_real_scaled, labels))
        davies_bouldin_scores_real.append(davies_bouldin_score(X_real_scaled, labels))

    # (values, line format, y-axis label, title) for each of the 2x2 panels,
    # in row-major order.
    metric_panels = [
        (inertias_real, 'bo-', 'Inercia', 'Método del Codo (Datos Reales)'),
        (silhouette_scores_real, 'ro-', 'Silhouette Score', 'Silhouette Score (Datos Reales)'),
        (calinski_scores_real, 'go-', 'Calinski-Harabasz Score', 'Calinski-Harabasz (mayor es mejor)'),
        (davies_bouldin_scores_real, 'mo-', 'Davies-Bouldin Score', 'Davies-Bouldin (menor es mejor)'),
    ]

    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    for ax, (values, line_format, y_label, title) in zip(axes.flat, metric_panels):
        ax.plot(K_range, values, line_format, linewidth=2, markersize=8)
        ax.set_xlabel('K')
        ax.set_ylabel(y_label)
        ax.set_title(title)
        ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig('../output/real_data_elbow_metrics.png', dpi=300, bbox_inches='tight')
    plt.show()

    metrics_real_df = pd.DataFrame({
        'K': list(K_range),
        'Inercia': inertias_real,
        'Silhouette': silhouette_scores_real,
        'Calinski': calinski_scores_real,
        'Davies-Bouldin': davies_bouldin_scores_real
    })

    print("\nMétricas por K (Datos Reales):")
    display(metrics_real_df)

    # Look the winner up in K_range itself rather than adding a hard-coded
    # offset, so the suggestion stays right if the range is edited.
    # argmax returns the first maximum, matching list.index(max(...)).
    optimal_k_real = K_range[int(np.argmax(silhouette_scores_real))]
    print(f"\nK óptimo sugerido (Silhouette): {optimal_k_real}")
else:
    print("Esperando datos reales para determinar K óptimo")

## 6. Entrenar Modelo Final con Datos Reales

In [None]:
if not CAN_TRAIN:
    print("Esperando datos reales para entrenar modelo")
else:
    # K is pinned to 4 here, replacing whatever value the K-selection cell
    # suggested. The labelling and PCA cells below handle four clusters
    # (four label rules, four colours).
    optimal_k_real = 4

    kmeans_real = KMeans(n_clusters=optimal_k_real, random_state=42, n_init=30)
    cluster_labels_real = kmeans_real.fit_predict(X_real_scaled)

    # Attach the assignment to the feature table for the following cells.
    df_real['cluster'] = cluster_labels_real

    # Quality metrics of the final partition.
    silhouette_real = silhouette_score(X_real_scaled, cluster_labels_real)
    davies_bouldin_real = davies_bouldin_score(X_real_scaled, cluster_labels_real)
    calinski_real = calinski_harabasz_score(X_real_scaled, cluster_labels_real)

    print(f"MODELO ENTRENADO CON DATOS REALES (K={optimal_k_real})")
    print("="*60)
    print(f"\nMétricas:")
    for metric_line in (
        f"  - Silhouette Score: {silhouette_real:.3f}",
        f"  - Davies-Bouldin: {davies_bouldin_real:.3f}",
        f"  - Calinski-Harabasz: {calinski_real:.2f}",
        f"  - Inercia: {kmeans_real.inertia_:.2f}",
    ):
        print(metric_line)

    print(f"\nDistribución de clusters:")
    cluster_counts_real = pd.Series(cluster_labels_real).value_counts().sort_index()
    n_assigned = len(cluster_labels_real)
    for cluster_id, members in cluster_counts_real.items():
        share = (members / n_assigned) * 100
        print(f"  Cluster {cluster_id}: {members} usuarios ({share:.1f}%)")

## 7. Interpretación de Clusters Reales

In [None]:
if not CAN_TRAIN:
    print("Esperando datos reales para interpretación")
else:
    # Per-cluster mean of every feature (in original, unscaled units).
    cluster_stats_real = df_real.groupby('cluster')[feature_cols].mean()

    print("CARACTERÍSTICAS DE CLUSTERS (DATOS REALES):")
    print("="*80)
    display(cluster_stats_real.round(3))

    # Heatmap with features on rows and clusters on columns, colour scale
    # centred on the overall mean of the table.
    heatmap_center = cluster_stats_real.values.mean()
    plt.figure(figsize=(14, 8))
    sns.heatmap(cluster_stats_real.T, annot=True, fmt='.2f', cmap='RdYlGn',
                center=heatmap_center)
    plt.title('Centroides de Clusters Reales', fontsize=14, pad=20)
    plt.xlabel('Cluster')
    plt.ylabel('Feature')
    plt.tight_layout()
    plt.savefig('../output/real_clusters_heatmap.png', dpi=300, bbox_inches='tight')
    plt.show()

    def _archetype_for(centroid):
        """Return the archetype name for a cluster mean; first matching rule wins."""
        if centroid['completion_rate'] > 0.7 and centroid['avg_frustration'] < 0.45:
            return "Rápido Visual (Real)"
        if centroid['avg_frustration'] > 0.55 and centroid['completion_rate'] > 0.55:
            return "Lector Constante (Real)"
        if centroid['distraction_events_per_hour'] > 6:
            return "Disperso Visual (Real)"
        return "Fatigado Visual (Real)"

    # Nothing prevents two clusters from receiving the same archetype name.
    cluster_labels_real_map = {}
    n_assigned = len(cluster_labels_real)
    for cluster_id in range(optimal_k_real):
        centroid = cluster_stats_real.loc[cluster_id]
        archetype = _archetype_for(centroid)
        cluster_labels_real_map[cluster_id] = archetype

        members = (cluster_labels_real == cluster_id).sum()
        share = (members / n_assigned) * 100

        print(f"\nCluster {cluster_id}: {archetype}")
        print(f"  N = {members} ({share:.1f}%)")
        print(f"  Completion: {centroid['completion_rate']:.2%}")
        print(f"  Frustración: {centroid['avg_frustration']:.3f}")
        print(f"  Atención: {centroid['avg_visual_attention']:.1f}%")
        print(f"  Distracción/h: {centroid['distraction_events_per_hour']:.2f}")

## 8. Visualización PCA (Datos Reales)

In [None]:
if not CAN_TRAIN:
    print("Esperando datos reales para visualización")
else:
    # Project the scaled features onto their first two principal components.
    pca_real = PCA(n_components=2, random_state=42)
    X_pca_real = pca_real.fit_transform(X_real_scaled)

    variance_ratio = pca_real.explained_variance_ratio_
    pc1_share = variance_ratio[0]
    pc2_share = variance_ratio[1]

    print(f"Varianza explicada (Datos Reales):")
    print(f"  PC1: {pc1_share:.2%}")
    print(f"  PC2: {pc2_share:.2%}")
    print(f"  Total: {variance_ratio.sum():.2%}")

    plt.figure(figsize=(12, 8))
    # One colour per cluster id; the list holds exactly four entries.
    colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#FFA07A']

    for cluster_id in range(optimal_k_real):
        in_cluster = cluster_labels_real == cluster_id
        points = X_pca_real[in_cluster]
        plt.scatter(points[:, 0], points[:, 1],
                    c=colors[cluster_id],
                    label=cluster_labels_real_map[cluster_id],
                    alpha=0.6, s=100, edgecolors='black', linewidth=0.5)

    # Centroids are in scaled feature space, so they go through the same
    # fitted PCA before being drawn on top of the points.
    centroids_pca_real = pca_real.transform(kmeans_real.cluster_centers_)
    plt.scatter(centroids_pca_real[:, 0], centroids_pca_real[:, 1],
                c='black', marker='X', s=300, edgecolors='yellow', linewidth=2,
                label='Centroides', zorder=5)

    plt.xlabel(f'PC1 ({pc1_share:.1%})')
    plt.ylabel(f'PC2 ({pc2_share:.1%})')
    plt.title('Clusters con Datos Reales (PCA)', fontsize=14, pad=20)
    plt.legend(loc='best')
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig('../output/real_clusters_pca.png', dpi=300, bbox_inches='tight')
    plt.show()

## 9. Guardar Modelo con Datos Reales

In [None]:
if not CAN_TRAIN:
    # Nothing to persist yet: report the requirements and the current count.
    for waiting_line in (
        "\nMODELO EN ESPERA",
        "="*60,
        "Este notebook está listo para ejecutarse cuando haya suficientes datos reales.",
        "\nRequisitos mínimos:",
        "  - Al menos 50 usuarios reales",
        "  - Al menos 5 actividades por usuario",
    ):
        print(waiting_line)
    print(f"\nEstado actual: {real_data_stats[0]['total_users'] if real_data_stats else 0} usuarios reales")
else:
    # Fitted estimator and scaler.
    joblib.dump(kmeans_real, '../models/user_type_classifier_REAL.pkl')
    joblib.dump(scaler_real, '../models/scaler_REAL.pkl')

    # Cluster id -> archetype name; ids become strings to serve as JSON keys.
    labels_real_export = {
        str(cluster_id): archetype
        for cluster_id, archetype in cluster_labels_real_map.items()
    }
    with open('../models/cluster_labels_REAL.json', 'w') as labels_file:
        labels_file.write(json.dumps(labels_real_export, indent=2))

    # Metrics are cast to built-in float/int before being written as JSON.
    cluster_distribution = {
        str(cluster_id): int(members)
        for cluster_id, members in cluster_counts_real.items()
    }
    model_metadata_real = {
        'n_clusters': optimal_k_real,
        'features': feature_cols,
        'silhouette_score': float(silhouette_real),
        'davies_bouldin_score': float(davies_bouldin_real),
        'calinski_harabasz_score': float(calinski_real),
        'inertia': float(kmeans_real.inertia_),
        'cluster_distribution': cluster_distribution,
        'trained_on': datetime.now().isoformat(),
        'data_source': 'REAL',
        'total_users': len(df_real)
    }

    with open('../models/model_metadata_REAL.json', 'w') as metadata_file:
        metadata_file.write(json.dumps(model_metadata_real, indent=2))

    # Feature table including the assigned cluster of every user.
    df_real.to_csv('../output/real_users_with_clusters.csv', index=False)

    print("MODELO CON DATOS REALES GUARDADO:")
    print("="*60)
    for saved_artifact in (
        "models/user_type_classifier_REAL.pkl",
        "models/scaler_REAL.pkl",
        "models/cluster_labels_REAL.json",
        "models/model_metadata_REAL.json",
        "output/real_users_with_clusters.csv",
    ):
        print(f"  ✓ {saved_artifact}")

    print("\n" + "="*60)
    print("SIGUIENTE PASO:")
    for next_step in (
        "  1. Comparar rendimiento fake vs real",
        "  2. Si el modelo real es mejor, reemplazar el fake en producción",
        "  3. Actualizar predict_user_type.py para usar modelo REAL",
    ):
        print(next_step)
    print("="*60)

## 10. Pipeline Automatizado de Reentrenamiento

In [None]:
def automated_retraining_pipeline():
    """
    Gate an automatic retraining run on data availability and model quality.

    Meant to be invoked by a cron job or scheduler. Steps 2, 3 and 5 (load
    data, train, save) are still placeholders that only print a banner: no
    model is trained or written by this function yet.

    The quality check in step 4 compares the module-level ``silhouette_real``
    (set by the training cell of this notebook) with the score stored in
    ``../models/model_metadata_REAL.json``.

    Returns:
        bool: False when fewer than 50 users qualify, or when the current
        silhouette score does not beat the stored one; True otherwise.

    Raises:
        NameError: if the metadata file exists but ``silhouette_real`` has
            not been defined in the notebook namespace.
        KeyError: if the metadata file has no ``silhouette_score`` entry.
    """
    print("INICIANDO PIPELINE DE REENTRENAMIENTO AUTOMÁTICO")
    print("="*60)

    # 1. Check data availability.
    # The query returns one row per qualifying user (GROUP BY + HAVING), so
    # the number of rows - not the value inside each row - is the user count.
    query_count = """
    SELECT COUNT(DISTINCT s.user_id) as total_users
    FROM session_service_test.sessions s
    JOIN session_service_test.user_activities ua ON s.session_id = ua.session_id
    WHERE s.user_id NOT BETWEEN 1001 AND 1100
    AND ua.status IN ('completed', 'abandoned')
    GROUP BY s.user_id
    HAVING COUNT(DISTINCT ua.activity_uuid) >= 5
    """

    real_data_stats = execute_query(query_count)
    n_users = len(real_data_stats) if real_data_stats else 0

    print(f"Usuarios disponibles: {n_users}")

    min_users = 50
    if n_users < min_users:
        print(f"Insuficientes datos. Se necesitan al menos 50 usuarios con 5+ actividades.")
        print(f"Faltan: {min_users - n_users} usuarios")
        return False

    # 2. Load and prepare data
    print("\nCargando datos reales...")
    # TODO: reuse query_real from step 2 of the notebook

    # 3. Train model
    print("\nEntrenando modelo...")
    # TODO: reuse the training code from step 6 of the notebook

    # 4. Validate improvement over the previous model
    print("\nValidando mejora...")
    try:
        # Only the file read sits inside the try; the context manager closes
        # the handle (the bare open() used before was never closed).
        with open('../models/model_metadata_REAL.json', encoding='utf-8') as metadata_file:
            old_metadata = json.load(metadata_file)
    except FileNotFoundError:
        # No stored metadata means there is nothing to compare against.
        print("Primer entrenamiento con datos reales")
    else:
        old_silhouette = old_metadata['silhouette_score']

        if silhouette_real > old_silhouette:
            print(f"✓ Mejora detectada: {old_silhouette:.3f} → {silhouette_real:.3f}")
        else:
            print(f"✗ No hay mejora: {old_silhouette:.3f} → {silhouette_real:.3f}")
            print("  Manteniendo modelo anterior")
            return False

    # 5. Save model
    print("\nGuardando modelo...")
    # TODO: reuse the persistence code from step 9 of the notebook

    print("\n" + "="*60)
    print("REENTRENAMIENTO COMPLETADO EXITOSAMENTE")
    print("="*60)

    return True

# Tell the reader the pipeline function now exists and how to invoke it.
for usage_hint in (
    "Pipeline de reentrenamiento automático definido",
    "Para ejecutar manualmente: automated_retraining_pipeline()",
    "Para automatizar: configurar cron job que ejecute este notebook",
):
    print(usage_hint)