In [1]:
"""
=====================================================================
NOTEBOOK 3 : MODÈLE AVANCÉ + MÉTRIQUES 
Projet : Système de Recommandation MovieLens sur Amazon SageMaker
Auteur : Gninninmaguignon Silué
Date : Octobre 2025
=====================================================================
"""

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error, mean_absolute_error
import json
import os
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Opening banner for this notebook run.
banner_rule = "=" * 70
print(banner_rule)
print(" ENTRAÎNEMENT DU MODÈLE AVANCÉ + MÉTRIQUES")
print(banner_rule)

# Pick the compute device: CUDA when PyTorch reports one, otherwise CPU.
cuda_available = torch.cuda.is_available()
device = torch.device('cuda') if cuda_available else torch.device('cpu')
print(f"\n Device: {device}")
if device.type != 'cuda':
    print("   ⚠️  Mode CPU (normal pour SageMaker notebook)")
else:
    print(f"   GPU: {torch.cuda.get_device_name(0)}")

# ============================================
# PARTIE 1 : CHARGEMENT DES DONNÉES
# ============================================

print("\n" + "=" * 70)
print(" CHARGEMENT DES DONNÉES PRÉPARÉES")
print("=" * 70)

train_df = pd.read_csv("../data/processed/train_compact.csv")
test_df = pd.read_csv("../data/processed/test_compact.csv")

print(f" Train: {train_df.shape}")
print(f" Test:  {test_df.shape}")

# Embedding table sizes. The 'user' / 'item' values are used directly as
# embedding indices by the model, so each table needs max(id) + 1 rows.
# The maximum is taken over BOTH splits: an id that only occurs in the test
# split would otherwise index past the end of the table at evaluation time.
# The values are cast to plain Python ints so that no NumPy scalar ends up in
# objects that are serialised later (checkpoint, report).
# NOTE(review): assumes ids are non-negative and zero-based — confirm against
# the preprocessing notebook.
n_users = int(max(train_df['user'].max(), test_df['user'].max())) + 1
n_items = int(max(train_df['item'].max(), test_df['item'].max())) + 1
print(f"\n Dataset info:")
print(f"   Users: {n_users}")
print(f"   Items: {n_items}")
print(f"   Features: {train_df.shape[1] - 1} (sans rating)")

# ============================================
# PART 2: PYTORCH DATASET
# ============================================

print("\n" + "=" * 70)
print(" CRÉATION DU DATASET PYTORCH")
print("=" * 70)

class MovieLensDataset(Dataset):
    """Tensor-backed view of a ratings DataFrame.

    Each sample is the tuple ``(user, item, features, rating)`` where
    ``user`` / ``item`` are int64 ids, ``features`` is the float32 vector built
    from ``feature_cols`` and ``rating`` is the float32 target.
    """

    def __init__(self, df, feature_cols):
        # Every column is converted to a tensor once, up front, so that
        # __getitem__ is nothing more than tensor indexing.
        def as_tensor(values, dtype):
            return torch.tensor(values, dtype=dtype)

        self.user = as_tensor(df['user'].values, torch.long)
        self.item = as_tensor(df['item'].values, torch.long)
        # Side features: the columns listed in feature_cols, in that order.
        self.features = as_tensor(df[feature_cols].values, torch.float32)
        self.rating = as_tensor(df['rating'].values, torch.float32)

    def __len__(self):
        # One sample per rating row.
        return self.rating.shape[0]

    def __getitem__(self, idx):
        sample = (
            self.user[idx],
            self.item[idx],
            self.features[idx],
            self.rating[idx],
        )
        return sample

# Side-feature columns: everything except the two id columns and the target.
# NOTE(review): any column derived from the target itself would leak the label
# into the model inputs (names such as 'rating_diff_user_avg' or
# 'rating_diff_item_avg' would be suspicious) — confirm how the CSV columns
# were computed before trusting the reported metrics.
feature_cols = [
    name for name in train_df.columns
    if name not in ('user', 'item', 'rating')
]

print(f" {len(feature_cols)} features additionnelles:")
for position, name in enumerate(feature_cols, start=1):
    print(f"   {position:2d}. {name}")

# Wrap both splits in the PyTorch dataset.
train_dataset = MovieLensDataset(train_df, feature_cols)
test_dataset = MovieLensDataset(test_df, feature_cols)

# Batch iterators: training batches are reshuffled every epoch, the test
# order is kept fixed.
batch_size = 256
loader_options = {'batch_size': batch_size, 'num_workers': 2}
train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_options)

print(f"\n DataLoaders créés (batch_size={batch_size})")
print(f"   Train batches: {len(train_loader)}")
print(f"   Test batches:  {len(test_loader)}")

# ============================================
# PART 3: HYBRID NEURAL RECOMMENDER MODEL
# ============================================

print("\n" + "=" * 70)
print("  ARCHITECTURE DU MODÈLE")
print("=" * 70)

class HybridRecommenderNet(nn.Module):
    """Hybrid rating predictor: user/item embeddings combined with side features.

    The user and item ids are embedded and batch-normalised, the side-feature
    vector goes through a small dense block, and the three parts are
    concatenated and fed to an MLP that outputs one value per sample.

    Args:
        n_users: Number of rows in the user embedding table.
        n_items: Number of rows in the item embedding table.
        n_features: Width of the side-feature vector.
        embedding_dim: Size of each user/item embedding.
        hidden_dims: Output widths of the successive hidden layers of the MLP
            (any iterable of ints; the default is an immutable tuple).
    """

    def __init__(self, n_users, n_items, n_features,
                 embedding_dim=128, hidden_dims=(256, 128, 64)):
        super().__init__()

        # Width of the dense projection applied to the side features.
        feature_dim = 64

        # Id embeddings, each followed by its own batch normalisation.
        self.user_embedding = nn.Embedding(n_users, embedding_dim)
        self.item_embedding = nn.Embedding(n_items, embedding_dim)
        self.user_bn = nn.BatchNorm1d(embedding_dim)
        self.item_bn = nn.BatchNorm1d(embedding_dim)

        # Dense block for the side features.
        self.feature_fc = nn.Sequential(
            nn.Linear(n_features, feature_dim),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.BatchNorm1d(feature_dim),
        )

        # Main MLP over [user embedding | item embedding | feature projection].
        layers = []
        input_dim = embedding_dim * 2 + feature_dim
        for hidden_dim in hidden_dims:
            layers.extend([
                nn.Linear(input_dim, hidden_dim),
                nn.ReLU(),
                nn.Dropout(0.3),
                nn.BatchNorm1d(hidden_dim),
            ])
            input_dim = hidden_dim

        # Output layer: one predicted value per sample.
        layers.append(nn.Linear(input_dim, 1))
        self.fc_layers = nn.Sequential(*layers)

        self._init_weights()

    def _init_weights(self):
        """Xavier-uniform weights and zero bias for Linear layers; N(0, 0.01) for embeddings."""
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.Embedding):
                nn.init.normal_(m.weight, mean=0, std=0.01)

    def forward(self, user, item, features):
        """Return one prediction per sample as a 1-D tensor of shape (batch,).

        Args:
            user: Integer tensor of user ids, shape (batch,).
            item: Integer tensor of item ids, shape (batch,).
            features: Float tensor of side features, shape (batch, n_features).
        """
        user_emb = self.user_bn(self.user_embedding(user))
        item_emb = self.item_bn(self.item_embedding(item))
        feat_emb = self.feature_fc(features)

        x = torch.cat([user_emb, item_emb, feat_emb], dim=1)
        output = self.fc_layers(x)

        # Drop only the trailing size-1 dimension. A bare squeeze() would also
        # remove the batch dimension when batch == 1, yielding a 0-d tensor
        # that breaks downstream code expecting one value per sample.
        return output.squeeze(-1)

# Build the model. The hyper-parameters are named once so that the summary
# printed below always matches what was actually passed to the constructor.
n_features = len(feature_cols)
embedding_dim = 128
hidden_dims = [256, 128, 64]
model = HybridRecommenderNet(
    n_users=n_users,
    n_items=n_items,
    n_features=n_features,
    embedding_dim=embedding_dim,
    hidden_dims=hidden_dims,
).to(device)

n_parameters = sum(p.numel() for p in model.parameters())
print(" Modèle créé:")
print(f"   Embeddings: {embedding_dim}D")
print(f"   Hidden layers: {hidden_dims}")
print(f"   Total parameters: {n_parameters:,}")

# ============================================
# PART 4: METRIC FUNCTIONS
# ============================================

print("\n" + "=" * 70)
print(" DÉFINITION DES MÉTRIQUES")
print("=" * 70)

def calculate_rmse(predictions, targets):
    """Return the square root of the mean squared error between targets and predictions."""
    mse = mean_squared_error(targets, predictions)
    return np.sqrt(mse)

def calculate_mae(predictions, targets):
    """Return the mean absolute error between targets and predictions."""
    mae = mean_absolute_error(targets, predictions)
    return mae

def calculate_precision_at_k(model, user_item_matrix, k=10):
    """Estimate Precision@K over the first (at most) 100 users.

    For each sampled user, every item is scored by the model, the ``k``
    highest-scoring items are taken as recommendations, and the precision is
    the fraction of those ``k`` items that the user rated >= 4 in
    ``user_item_matrix``. Users without any rating >= 4 are skipped.

    Relies on the module-level ``n_users``, ``n_items``, ``n_features`` and
    ``device``.

    Args:
        model: Called as ``model(user_ids, item_ids, features)``; switched to
            eval mode by this function.
        user_item_matrix: ``user_item_matrix[user_id]`` must give that user's
            rating per item, with 0 meaning "not rated".
            NOTE(review): assumed to be a dense 2-D NumPy array — confirm
            against the caller.
        k: Number of recommendations per user; must be >= 1.

    Returns:
        Mean precision over the evaluated users, or 0.0 if none qualified.

    Raises:
        ValueError: If ``k`` is not a positive number.
    """
    if k <= 0:
        raise ValueError(f"k must be >= 1, got {k}")

    model.eval()
    precisions = []

    # These inputs are the same for every user, so they are built once.
    item_tensor = torch.arange(n_items, dtype=torch.long, device=device)
    # No per-(user, item) side features are available here; an all-zero
    # placeholder is used (demo simplification), so the ranking reflects the
    # embeddings only.
    zero_features = torch.zeros(n_items, n_features, device=device)

    with torch.no_grad():
        for user_id in range(min(100, n_users)):  # fixed sample of users
            # Relevant items: rated >= 4 (which also implies "rated").
            relevant_items = np.flatnonzero(user_item_matrix[user_id] >= 4)
            if relevant_items.size == 0:
                # Nothing to hit for this user; skip before running the model.
                continue

            user_tensor = torch.full(
                (n_items,), user_id, dtype=torch.long, device=device
            )
            scores = model(user_tensor, item_tensor, zero_features).cpu().numpy()

            # Indices of the k highest scores (argsort is ascending).
            top_k_items = np.argsort(scores)[-k:]

            # top_k_items holds distinct indices, so counting memberships
            # equals the size of the intersection with the relevant set.
            hits = int(np.isin(top_k_items, relevant_items).sum())
            precisions.append(hits / k)

    return float(np.mean(precisions)) if precisions else 0.0

def calculate_hit_rate(predictions, targets, threshold=4.0):
    """Share of samples where prediction and target fall on the same side of ``threshold``.

    Both arrays are binarised with ``>= threshold``; the result is the mean
    agreement between the two binarised arrays.
    """
    predicted_positive = (predictions >= threshold).astype(int)
    actual_positive = (targets >= threshold).astype(int)
    return np.equal(predicted_positive, actual_positive).mean()

print(" Métriques définies:")
print("   1. RMSE (Root Mean Square Error)")
print("   2. MAE (Mean Absolute Error)")
print("   3. Precision@K (K=10)")
print("   4. Hit Rate (threshold=4.0)")

# ============================================
# PART 5: TRAINING CONFIGURATION
# ============================================

print("\n" + "=" * 70)
print(" CONFIGURATION DE L'ENTRAÎNEMENT")
print("=" * 70)

criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)
# Halve the learning rate when the monitored value stops improving
# (mode='min', patience=3).
# `verbose` is deliberately not passed: it is deprecated in recent PyTorch
# releases. The current learning rate can be read at any time from
# optimizer.param_groups[0]['lr'].
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='min', factor=0.5, patience=3
)

print(" Configuration:")
print(f"   Loss: MSE")
print(f"   Optimizer: Adam (lr=0.001, weight_decay=1e-5)")
print(f"   Scheduler: ReduceLROnPlateau")

# ============================================
# PARTIE 6 : BOUCLE D'ENTRAÎNEMENT
# ============================================

def train_epoch(model, loader, criterion, optimizer, device):
    """Run one optimisation pass over ``loader`` and return the mean loss.

    Args:
        model: Called as ``model(users, items, features)``.
        loader: Iterable of ``(users, items, features, ratings)`` batches.
        criterion: Loss called as ``criterion(predictions, ratings)``; assumed
            to return the mean over the batch (as ``nn.MSELoss()`` does).
        optimizer: Optimizer over the model's parameters.
        device: Device the batch tensors are moved to.

    Returns:
        The per-sample mean loss over the epoch. Each batch loss is weighted
        by its batch size so that a smaller final batch is not over-counted.

    Raises:
        ZeroDivisionError: If ``loader`` yields no samples.
    """
    model.train()
    loss_sum = 0.0
    n_samples = 0

    for users, items, features, ratings in loader:
        users = users.to(device)
        items = items.to(device)
        features = features.to(device)
        ratings = ratings.to(device)

        optimizer.zero_grad()
        predictions = model(users, items, features)
        loss = criterion(predictions, ratings)
        loss.backward()

        # Cap the global gradient norm before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)

        optimizer.step()

        batch_len = ratings.size(0)
        loss_sum += loss.item() * batch_len
        n_samples += batch_len

    return loss_sum / n_samples

def evaluate(model, loader, criterion, device):
    """Evaluate ``model`` on ``loader`` without gradient tracking.

    Args:
        model: Called as ``model(users, items, features)``; switched to eval
            mode by this function.
        loader: Iterable of ``(users, items, features, ratings)`` batches.
        criterion: Loss called as ``criterion(predictions, ratings)``; assumed
            to return the mean over the batch (as ``nn.MSELoss()`` does).
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(avg_loss, rmse, mae, hit_rate, all_predictions, all_targets)``
        where ``avg_loss`` is the per-sample mean loss (batch losses weighted
        by batch size) and the last two items are 1-D NumPy arrays covering
        every sample in loader order.

    Raises:
        ZeroDivisionError: If ``loader`` yields no samples.
    """
    model.eval()
    loss_sum = 0.0
    n_samples = 0
    prediction_batches = []
    target_batches = []

    with torch.no_grad():
        for users, items, features, ratings in loader:
            users = users.to(device)
            items = items.to(device)
            features = features.to(device)
            ratings = ratings.to(device)

            predictions = model(users, items, features)
            loss = criterion(predictions, ratings)

            batch_len = ratings.size(0)
            loss_sum += loss.item() * batch_len
            n_samples += batch_len

            # reshape(-1) keeps every batch 1-D, including a single-sample
            # batch for which the model may hand back a 0-d tensor.
            prediction_batches.append(predictions.reshape(-1).cpu())
            target_batches.append(ratings.reshape(-1).cpu())

    avg_loss = loss_sum / n_samples
    # One concatenation per epoch instead of growing a Python list scalar by
    # scalar.
    all_predictions = torch.cat(prediction_batches).numpy()
    all_targets = torch.cat(target_batches).numpy()

    # Metrics over the whole evaluation set.
    rmse = calculate_rmse(all_predictions, all_targets)
    mae = calculate_mae(all_predictions, all_targets)
    hit_rate = calculate_hit_rate(all_predictions, all_targets)

    return avg_loss, rmse, mae, hit_rate, all_predictions, all_targets

# ============================================
# PARTIE 7 : ENTRAÎNEMENT
# ============================================

print("\n" + "=" * 70)
print(" DÉBUT DE L'ENTRAÎNEMENT")
print("=" * 70)

n_epochs = 15
best_rmse = float('inf')
history = {
    'train_loss': [], 'test_loss': [],
    'test_rmse': [], 'test_mae': [], 'test_hit_rate': []
}

# Create the checkpoint directory up front; torch.save does not create
# missing parent directories.
checkpoint_path = '../models/saved_models/best_model.pth'
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

print(f"\n Entraînement sur {n_epochs} epochs\n")

# NOTE(review): the test split is used both to drive the LR scheduler and to
# pick the "best" checkpoint, so the reported best RMSE is an optimistic
# estimate — consider a separate validation split.
for epoch in range(n_epochs):
    # Train for one epoch, then evaluate on the test loader.
    train_loss = train_epoch(model, train_loader, criterion, optimizer, device)
    test_loss, rmse, mae, hit_rate, _, _ = evaluate(
        model, test_loader, criterion, device
    )

    # Record the epoch's metrics.
    history['train_loss'].append(train_loss)
    history['test_loss'].append(test_loss)
    history['test_rmse'].append(rmse)
    history['test_mae'].append(mae)
    history['test_hit_rate'].append(hit_rate)

    # Step the scheduler on the test loss and detect a learning-rate change
    # by comparing the optimizer's LR before and after.
    lr_before = optimizer.param_groups[0]['lr']
    scheduler.step(test_loss)
    lr_after = optimizer.param_groups[0]['lr']

    print(f"Epoch [{epoch+1:2d}/{n_epochs}] | "
          f"Train Loss: {train_loss:.4f} | "
          f"Test Loss: {test_loss:.4f} | "
          f"RMSE: {rmse:.4f} | "
          f"MAE: {mae:.4f} | "
          f"Hit Rate: {hit_rate:.3f}")
    if lr_after != lr_before:
        print(f"    Learning rate: {lr_before:.2e} -> {lr_after:.2e}")

    # Keep the checkpoint with the lowest RMSE seen so far.
    if rmse < best_rmse:
        best_rmse = rmse
        # Scalars are stored as plain Python numbers so the checkpoint holds
        # no NumPy objects, which restricted loaders (for example
        # torch.load(..., weights_only=True)) may refuse to unpickle.
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'rmse': float(rmse),
            'mae': float(mae),
            'n_users': int(n_users),
            'n_items': int(n_items),
            'n_features': n_features,
            'feature_cols': feature_cols
        }, checkpoint_path)
        print(f"    Meilleur modèle sauvegardé (RMSE: {rmse:.4f})")

print("\n" + "=" * 70)
print(" ENTRAÎNEMENT TERMINÉ")
print("=" * 70)
print(f"\n Meilleur RMSE: {best_rmse:.4f}")

# ============================================
# PARTIE 8 : VISUALISATION DES RÉSULTATS
# ============================================

print("\n" + "=" * 70)
print(" GÉNÉRATION DES GRAPHIQUES")
print("=" * 70)

# savefig does not create missing directories, so make sure the target exists.
plots_dir = '../outputs/plots'
os.makedirs(plots_dir, exist_ok=True)

fig, axes = plt.subplots(2, 2, figsize=(14, 10))
fig.suptitle("Résultats d'Entraînement - Modèle Hybrid",
             fontsize=16, fontweight='bold')

# The x axis follows what was actually recorded, so the curves still plot if
# the history holds fewer (or more) entries than n_epochs.
epochs_range = range(1, len(history['train_loss']) + 1)

# Panel 1: train and test loss on the same axes.
ax = axes[0, 0]
ax.plot(epochs_range, history['train_loss'],
        marker='o', label='Train Loss', linewidth=2)
ax.plot(epochs_range, history['test_loss'],
        marker='s', label='Test Loss', linewidth=2)
ax.set_title('Evolution des Losses', fontweight='bold')
ax.set_xlabel('Epoch')
ax.set_ylabel('MSE Loss')
ax.legend()
ax.grid(True, alpha=0.3)

# Panels 2-4: one test metric each, all drawn the same way.
single_metric_panels = [
    (axes[0, 1], 'test_rmse', 'o', 'coral', 'Evolution du RMSE', 'RMSE'),
    (axes[1, 0], 'test_mae', 's', 'mediumseagreen', 'Evolution du MAE', 'MAE'),
    (axes[1, 1], 'test_hit_rate', '^', 'mediumpurple',
     'Evolution du Hit Rate', 'Hit Rate'),
]
for ax, history_key, marker, color, title, y_label in single_metric_panels:
    ax.plot(epochs_range, history[history_key],
            marker=marker, color=color, linewidth=2)
    ax.set_title(title, fontweight='bold')
    ax.set_xlabel('Epoch')
    ax.set_ylabel(y_label)
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig(f'{plots_dir}/04_training_metrics.png',
            dpi=150, bbox_inches='tight')
print(" Graphique sauvegardé: outputs/plots/04_training_metrics.png")
plt.close()

# Error analysis. This evaluates the model object as it currently is in
# memory; no saved checkpoint is reloaded in this section.
_, _, _, _, final_preds, final_targets = evaluate(
    model, test_loader, criterion, device
)

errors = final_preds - final_targets

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Left: histogram of signed errors with a reference line at zero.
axes[0].hist(errors, bins=50, color='steelblue', edgecolor='black', alpha=0.7)
axes[0].axvline(0, color='red', linestyle='--', linewidth=2, label='Erreur = 0')
axes[0].set_title('Distribution des Erreurs de Prédiction', fontweight='bold')
axes[0].set_xlabel('Erreur (Prédiction - Réel)')
axes[0].set_ylabel('Fréquence')
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# Right: predicted vs. actual rating, with the y = x diagonal drawn from
# 1 to 5 as the perfect-prediction reference.
axes[1].scatter(final_targets, final_preds, alpha=0.3, s=10)
axes[1].plot([1, 5], [1, 5], 'r--', linewidth=2, label='Prédiction parfaite')
axes[1].set_title('Prédictions vs Ratings Réels', fontweight='bold')
axes[1].set_xlabel('Rating Réel')
axes[1].set_ylabel('Rating Prédit')
axes[1].legend()
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig(f'{plots_dir}/05_prediction_analysis.png',
            dpi=150, bbox_inches='tight')
print(" Graphique sauvegardé: outputs/plots/05_prediction_analysis.png")
plt.close()

# ============================================
# PARTIE 9 : RAPPORT FINAL
# ============================================

print("\n" + "=" * 70)
print(" GÉNÉRATION DU RAPPORT FINAL")
print("=" * 70)

# Read the architecture back from the model itself, and the initial learning
# rate from the optimizer's constructor defaults, so the report cannot drift
# from what was actually trained.
report_hidden_dims = [
    layer.out_features
    for layer in model.fc_layers
    if isinstance(layer, nn.Linear)
][:-1]  # the last Linear is the 1-unit output layer, not a hidden layer

final_metrics = {
    'date': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    'model': 'HybridRecommenderNet',
    'architecture': {
        'embedding_dim': model.user_embedding.embedding_dim,
        'hidden_dims': report_hidden_dims,
        'total_parameters': sum(p.numel() for p in model.parameters()),
        'n_features': n_features
    },
    'training': {
        'epochs': n_epochs,
        'batch_size': batch_size,
        'optimizer': 'Adam',
        'learning_rate': optimizer.defaults['lr'],
        'scheduler': 'ReduceLROnPlateau'
    },
    # float(...) turns NumPy scalars into plain floats, which json can encode.
    'final_metrics': {
        'best_rmse': float(best_rmse),
        'final_rmse': float(history['test_rmse'][-1]),
        'final_mae': float(history['test_mae'][-1]),
        'final_hit_rate': float(history['test_hit_rate'][-1]),
        'train_loss': float(history['train_loss'][-1]),
        'test_loss': float(history['test_loss'][-1])
    },
    'history': {
        'test_rmse': [float(x) for x in history['test_rmse']],
        'test_mae': [float(x) for x in history['test_mae']],
        'test_hit_rate': [float(x) for x in history['test_hit_rate']]
    }
}

# open(..., 'w') does not create missing directories.
os.makedirs('../outputs/metrics', exist_ok=True)
with open('../outputs/metrics/training_report.json', 'w', encoding='utf-8') as f:
    json.dump(final_metrics, f, indent=2)
print(" Rapport JSON sauvegardé: outputs/metrics/training_report.json")

# ============================================
# RÉSUMÉ FINAL
# ============================================

# Closing summary: headline metrics, then where the artefacts were written.
summary_rule = "=" * 70
print("\n" + summary_rule)
print(" RÉSULTATS FINAUX")
print(summary_rule)

last_mae = history['test_mae'][-1]
last_hit_rate = history['test_hit_rate'][-1]
for summary_line in (
    "\n Performances du modèle:",
    f"    Meilleur RMSE: {best_rmse:.4f}",
    f"    MAE final:     {last_mae:.4f}",
    f"    Hit Rate:      {last_hit_rate:.3f}",
    "\n Fichiers sauvegardés:",
    "    Modèle: models/saved_models/best_model.pth",
    "    Graphiques: outputs/plots/",
    "    Métriques: outputs/metrics/training_report.json",
    "\n PROCHAINE ÉTAPE: Système de Recommandation Top-K",
    summary_rule,
):
    print(summary_line)

 ENTRAÎNEMENT DU MODÈLE AVANCÉ + MÉTRIQUES

 Device: cpu
   ⚠️  Mode CPU (normal pour SageMaker notebook)

 CHARGEMENT DES DONNÉES PRÉPARÉES
 Train: (80000, 22)
 Test:  (20000, 22)

 Dataset info:
   Users: 943
   Items: 1682
   Features: 21 (sans rating)

 CRÉATION DU DATASET PYTORCH
 19 features additionnelles:
    1. age_normalized
    2. gender_M
    3. gender_F
    4. occupation_encoded
    5. user_rating_count
    6. user_avg_rating
    7. user_rating_std
    8. num_genres
    9. item_rating_count
   10. item_avg_rating
   11. item_rating_std
   12. item_popularity_log
   13. year
   14. month
   15. day_of_week
   16. hour
   17. genre_match_score
   18. rating_diff_user_avg
   19. rating_diff_item_avg

 DataLoaders créés (batch_size=256)
   Train batches: 313
   Test batches:  79

  ARCHITECTURE DU MODÈLE
 Modèle créé:
   Embeddings: 128D
   Hidden layers: [256, 128, 64]
   Total parameters: 462,209

 DÉFINITION DES MÉTRIQUES
 Métriques définies:
   1. RMSE (Root Mean Square Er