In [5]:
"""
=====================================================================
NOTEBOOK 4 : SYSTÈME DE RECOMMANDATION TOP-K
Projet : Système de Recommandation MovieLens sur Amazon SageMaker
Auteur : Gninninmaguignon Silué
Date : Octobre 2025
=====================================================================
"""

import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import pickle
import json
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Banner announcing what this notebook does.
print("=" * 70)
print(" SYSTÈME DE RECOMMANDATION TOP-K")
print("=" * 70)

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"\n  Device: {device}")

# ============================================
# PART 1: LOADING THE MODEL AND THE DATA
# ============================================

print("\n" + "=" * 70)
print(" CHARGEMENT DU MODÈLE ENTRAÎNÉ")
print("=" * 70)

# Define the architecture (it must match the trained model)
class HybridRecommenderNet(nn.Module):
    """Hybrid rating predictor combining ID embeddings with side features.

    A user embedding and an item embedding (each batch-normalised) are
    concatenated with a 64-dimensional projection of the side features and
    fed through a stack of ``Linear -> ReLU -> Dropout -> BatchNorm1d``
    blocks, ending in a single-unit linear layer.

    Sub-module names and their order are part of the checkpoint format:
    ``load_state_dict`` matches parameters by these names, so they must not
    be renamed or reordered.

    Args:
        n_users: Size of the user embedding table.
        n_items: Size of the item embedding table.
        n_features: Width of the side-feature vector given to ``forward``.
        embedding_dim: Dimension of both embedding tables.
        hidden_dims: Widths of the hidden layers, in order. Any iterable of
            ints is accepted; the default is a tuple so that the default
            value can never be mutated between instances.
    """

    def __init__(self, n_users, n_items, n_features,
                 embedding_dim=128, hidden_dims=(256, 128, 64)):
        super().__init__()

        self.user_embedding = nn.Embedding(n_users, embedding_dim)
        self.item_embedding = nn.Embedding(n_items, embedding_dim)
        self.user_bn = nn.BatchNorm1d(embedding_dim)
        self.item_bn = nn.BatchNorm1d(embedding_dim)

        # Side-feature branch: projects the raw features to 64 dimensions.
        self.feature_fc = nn.Sequential(
            nn.Linear(n_features, 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.BatchNorm1d(64)
        )

        # The MLP input is [user embedding | item embedding | feature projection].
        input_dim = embedding_dim * 2 + 64
        layers = []
        for hidden_dim in hidden_dims:
            layers.extend([
                nn.Linear(input_dim, hidden_dim),
                nn.ReLU(),
                nn.Dropout(0.3),
                nn.BatchNorm1d(hidden_dim)
            ])
            input_dim = hidden_dim

        layers.append(nn.Linear(input_dim, 1))
        self.fc_layers = nn.Sequential(*layers)

    def forward(self, user, item, features):
        """Predict one score per (user, item, features) row.

        Args:
            user: Long tensor of user indices, shape ``(batch,)``.
            item: Long tensor of item indices, shape ``(batch,)``.
            features: Float tensor of side features, shape
                ``(batch, n_features)``.

        Returns:
            Float tensor of shape ``(batch,)``.
        """
        user_emb = self.user_bn(self.user_embedding(user))
        item_emb = self.item_bn(self.item_embedding(item))
        feat_emb = self.feature_fc(features)
        x = torch.cat([user_emb, item_emb, feat_emb], dim=1)
        output = self.fc_layers(x)
        # Drop only the trailing unit dimension. A bare squeeze() would also
        # remove the batch dimension when batch == 1 and return a 0-d tensor.
        return output.squeeze(-1)

# Load the checkpoint
# NOTE(review): weights_only=False lets torch.load unpickle arbitrary Python
# objects, which can execute code — only load checkpoints from a trusted source.
checkpoint = torch.load('../models/saved_models/best_model.pth', 
                        map_location=device,weights_only=False)

# Model dimensions are read from the checkpoint rather than recomputed here.
n_users = checkpoint['n_users']
n_items = checkpoint['n_items']
n_features = checkpoint['n_features']

print(f" Modèle entraîné chargé")
print(f"   Users: {n_users}")
print(f"   Items: {n_items}")
print(f"   Features: {n_features}")
print(f"   Best RMSE: {checkpoint['rmse']:.4f}")

# Build the model and load the weights
model = HybridRecommenderNet(n_users, n_items, n_features).to(device)
model.load_state_dict(checkpoint['model_state_dict'])
# Switch to evaluation mode before any prediction is made.
model.eval()
print(" Poids chargés, modèle en mode évaluation")

# Load the encoders
# NOTE(review): pickle.load can execute arbitrary code; these files must be
# trusted artefacts. The objects are used below through .classes_,
# .transform() and .inverse_transform().
with open('../models/encoders/user_encoder.pkl', 'rb') as f:
    user_encoder = pickle.load(f)
with open('../models/encoders/item_encoder.pkl', 'rb') as f:
    item_encoder = pickle.load(f)
print(" Encoders chargés")

# Load the metadata
movies_meta = pd.read_csv("../data/processed/movies_metadata.csv")
users_meta = pd.read_csv("../data/processed/users_metadata.csv")
# NOTE(review): despite its name, data_full is read from train_features.csv —
# presumably only the training split; confirm this is intended.
data_full = pd.read_csv("../data/processed/train_features.csv")
print(" Métadonnées chargées")

# ============================================
# PART 2: RECOMMENDATION FUNCTIONS
# ============================================

print("\n" + "=" * 70)
print(" CRÉATION DES FONCTIONS DE RECOMMANDATION")
print("=" * 70)

def get_user_profile(user_id_original):
    """Build a summary profile for one user.

    Demographics are read from the module-level ``users_meta`` frame and the
    rating history from the module-level ``data_full`` frame.

    Args:
        user_id_original: User id as it appears in the ``user_id`` columns
            (the raw id, not the encoded index).

    Returns:
        ``None`` when the user is absent from ``users_meta``; otherwise a dict
        with keys ``user_id``, ``age``, ``gender``, ``occupation``,
        ``n_ratings``, ``avg_rating`` and ``rated_items``. ``avg_rating`` is
        ``0.0`` when the user has no ratings, matching the fallback used by
        ``get_movie_info``.
    """
    user_rows = users_meta[users_meta['user_id'] == user_id_original]
    if len(user_rows) == 0:
        return None

    user_info = user_rows.iloc[0]

    # Movies already rated by this user
    user_ratings = data_full[data_full['user_id'] == user_id_original]
    n_ratings = len(user_ratings)

    # The mean of an empty column is NaN, which is not valid JSON and is
    # awkward to format; fall back to 0.0 instead.
    avg_rating = float(user_ratings['rating'].mean()) if n_ratings > 0 else 0.0

    return {
        'user_id': int(user_id_original),
        'age': int(user_info['age']),
        'gender': user_info['gender'],
        'occupation': user_info['occupation'],
        'n_ratings': n_ratings,
        'avg_rating': avg_rating,
        'rated_items': user_ratings['item_id'].tolist()
    }

def get_movie_info(item_id):
    """Look up a movie's title, genres and rating statistics.

    Reads the module-level ``movies_meta`` frame (one row per movie with one
    0/1 column per genre) and the module-level ``data_full`` ratings frame.

    Args:
        item_id: Movie id as stored in the ``item_id`` columns.

    Returns:
        ``None`` if the movie is not in ``movies_meta``; otherwise a dict with
        keys ``item_id``, ``title``, ``genres`` (names of the genre columns
        equal to 1), ``n_ratings`` and ``avg_rating`` (0 when unrated).
    """
    matches = movies_meta[movies_meta['item_id'] == item_id]
    if matches.shape[0] == 0:
        return None
    record = matches.iloc[0]

    # Genre flag columns, in the order they are reported.
    genre_names = (
        'Action', 'Adventure', 'Animation', 'Children', 'Comedy',
        'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir',
        'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',
        'Thriller', 'War', 'Western',
    )
    flagged_genres = []
    for name in genre_names:
        if record[name] == 1:
            flagged_genres.append(name)

    # Rating statistics for this movie
    ratings_for_item = data_full[data_full['item_id'] == item_id]
    rating_count = len(ratings_for_item)
    if rating_count > 0:
        mean_rating = float(ratings_for_item['rating'].mean())
    else:
        mean_rating = 0

    summary = {}
    summary['item_id'] = int(item_id)
    summary['title'] = record['title']
    summary['genres'] = flagged_genres
    summary['n_ratings'] = rating_count
    summary['avg_rating'] = mean_rating
    return summary

def prepare_features_for_prediction(user_id_encoded, n_items, data_full, user_id_original):
    """Build the side-feature matrix used to score every item for one user.

    Every row of the result is identical: the column-wise mean of the user's
    rows in ``data_full`` over ``checkpoint['feature_cols']`` (a deliberate
    simplification for the demo, so item-specific features are not used).
    Users without any row get an all-zero matrix of width ``n_features``.

    Args:
        user_id_encoded: Encoded user index (accepted but not used here).
        n_items: Number of rows to produce, one per candidate item.
        data_full: Frame holding a ``user_id`` column and the feature columns.
        user_id_original: Raw user id used to filter ``data_full``.

    Returns:
        ``torch.float32`` tensor of shape ``(n_items, n_feature_columns)``
        placed on the module-level ``device``.
    """
    history = data_full[data_full['user_id'] == user_id_original]

    if history.shape[0] > 0:
        # Average the user's feature rows, then repeat that vector per item.
        columns = checkpoint['feature_cols']
        mean_vector = history[columns].mean().values
        matrix = np.repeat(mean_vector[np.newaxis, :], n_items, axis=0)
    else:
        # Default features for a user without history
        matrix = np.zeros((n_items, n_features))

    as_tensor = torch.tensor(matrix, dtype=torch.float32)
    return as_tensor.to(device)

def recommend_top_k(user_id_original, top_k=10, exclude_rated=True):
    """Recommend the top-K movies for a user.

    Scores every item known to ``item_encoder`` with the module-level
    ``model`` and returns the best ones, enriched with movie metadata.

    Args:
        user_id_original: Raw (un-encoded) user id.
        top_k: Maximum number of recommendations. Values <= 0 yield ``[]``.
        exclude_rated: When true, drop items the user already rated in
            ``data_full``.

    Returns:
        ``None`` if the user is unknown to ``user_encoder``; otherwise a list
        of dicts as returned by ``get_movie_info`` with an extra
        ``predicted_rating`` key, sorted by decreasing score. The list can be
        shorter than ``top_k`` when a selected item has no metadata.
    """
    # Make sure the user exists
    if user_id_original not in user_encoder.classes_:
        return None

    # head() with a negative count means "all but the last k"; treat any
    # non-positive request as "nothing to recommend" instead.
    if top_k <= 0:
        return []

    # Encode the user id
    user_id_encoded = user_encoder.transform([user_id_original])[0]

    # One (user, item) pair per catalogue item, built directly on the device.
    user_tensor = torch.full((n_items,), int(user_id_encoded),
                             dtype=torch.long, device=device)
    item_tensor = torch.arange(n_items, dtype=torch.long, device=device)

    # Features
    features_tensor = prepare_features_for_prediction(
        user_id_encoded, n_items, data_full, user_id_original
    )

    # Prediction
    with torch.no_grad():
        predictions = model(user_tensor, item_tensor, features_tensor)
    # atleast_1d keeps a one-item catalogue usable if the model returns a scalar.
    predictions = np.atleast_1d(predictions.cpu().numpy())

    # Map encoded item indices back to the original ids.
    item_ids_original = item_encoder.inverse_transform(np.arange(n_items))
    recommendations_df = pd.DataFrame({
        'item_id': item_ids_original,
        'predicted_rating': predictions
    })

    # Exclude movies that were already rated
    if exclude_rated:
        rated_items = data_full.loc[data_full['user_id'] == user_id_original, 'item_id']
        recommendations_df = recommendations_df[~recommendations_df['item_id'].isin(rated_items)]

    # Sort by decreasing score and keep the top K
    top_recommendations = recommendations_df.sort_values(
        'predicted_rating', ascending=False
    ).head(top_k)

    # Attach movie metadata. Iterating over the columns (rather than
    # iterrows) keeps item ids in their own dtype instead of upcasting to float.
    recommendations = []
    for item_id, score in zip(top_recommendations['item_id'].tolist(),
                              top_recommendations['predicted_rating'].tolist()):
        movie_info = get_movie_info(item_id)
        if movie_info:
            movie_info['predicted_rating'] = float(score)
            recommendations.append(movie_info)

    return recommendations

print(" Fonctions de recommandation créées:")
print("   - get_user_profile()")
print("   - get_movie_info()")
print("   - recommend_top_k()")

# ============================================
# PART 3: RECOMMENDATION SMOKE TESTS
# ============================================

print("\n" + "=" * 70)
print(" TESTS DE RECOMMANDATION")
print("=" * 70)

# Pick a few users at random. A seeded generator keeps the selection (and so
# the printed output) identical across "Restart Kernel -> Run All".
TEST_USERS_SEED = 42
N_TEST_USERS = 3
known_user_ids = data_full['user_id'].unique()
test_users = np.random.default_rng(TEST_USERS_SEED).choice(
    known_user_ids,
    size=min(N_TEST_USERS, len(known_user_ids)),  # never ask for more users than exist
    replace=False,
)

for user_id in test_users:
    print(f"\n{'='*70}")
    print(f"👤 UTILISATEUR {user_id}")
    print(f"{'='*70}")

    # User profile (None when the user has no row in the user metadata)
    profile = get_user_profile(user_id)
    if profile is None:
        print(f"\n Profil indisponible pour l'utilisateur {user_id}")
    else:
        print(f"\n Profil:")
        print(f"   Âge: {profile['age']} ans")
        print(f"   Genre: {profile['gender']}")
        print(f"   Profession: {profile['occupation']}")
        print(f"   Nombre de ratings: {profile['n_ratings']}")
        print(f"   Rating moyen: {profile['avg_rating']:.2f}⭐")

    # A few movies the user already rated, best first
    user_ratings = data_full[data_full['user_id'] == user_id].sort_values(
        'rating', ascending=False
    ).head(5)

    print(f"\n Films appréciés (top 5):")
    for _, row in user_ratings.iterrows():
        movie = get_movie_info(row['item_id'])
        if movie:
            print(f"   - {movie['title']:45s} ({row['rating']}⭐)")

    # Recommendations (None when the user is unknown to the encoder)
    recommendations = recommend_top_k(user_id, top_k=10) or []

    print(f"\n TOP 10 RECOMMANDATIONS:")
    for i, rec in enumerate(recommendations, 1):
        genres_str = ', '.join(rec['genres'][:3])  # At most 3 genres
        print(f"   {i:2d}. {rec['title']:40s} | "
              f"Score: {rec['predicted_rating']:.2f}⭐ | "
              f"Genres: {genres_str}")

# ============================================
# PART 4: ANALYSIS OF THE RECOMMENDATIONS
# ============================================

from collections import Counter

print("\n" + "=" * 70)
print(" ANALYSE DES RECOMMANDATIONS")
print("=" * 70)

# Measure how diverse the recommendations are over a sample of users.
# The sample is seeded for reproducibility and capped at the number of users
# available, because sampling without replacement raises otherwise.
SAMPLE_USERS_SEED = 42
N_SAMPLE_USERS = 50
candidate_user_ids = data_full['user_id'].unique()
sample_users = np.random.default_rng(SAMPLE_USERS_SEED).choice(
    candidate_user_ids,
    size=min(N_SAMPLE_USERS, len(candidate_user_ids)),
    replace=False,
)
all_recommended_items = []
all_genres = []

for user_id in sample_users:
    recs = recommend_top_k(user_id, top_k=10)
    if recs:
        for rec in recs:
            all_recommended_items.append(rec['item_id'])
            all_genres.extend(rec['genres'])

# Diversity = share of distinct movies among all recommendations made.
n_recs_total = len(all_recommended_items)
n_recs_unique = len(set(all_recommended_items))
# Guard the ratio: with no recommendation at all the division would fail.
diversity_pct = n_recs_unique / n_recs_total * 100 if n_recs_total else 0.0

print(f"\n Statistiques sur {len(sample_users)} utilisateurs:")
print(f"   Films recommandés uniques: {n_recs_unique}")
print(f"   Films recommandés (total): {n_recs_total}")
print(f"   Diversité: {diversity_pct:.1f}%")

# Most frequently recommended genres
genre_counts = Counter(all_genres)

print(f"\n Top 10 genres recommandés:")
for genre, count in genre_counts.most_common(10):
    print(f"   {genre:15s}: {count:3d} fois")

# ============================================
# PART 5: VISUALISATION
# ============================================

print("\n" + "=" * 70)
print(" VISUALISATION DES RECOMMANDATIONS")
print("=" * 70)

# One figure with two side-by-side panels.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# 1. Distribution of the predicted scores
# Only the 50 best-scored movies of the first test user are plotted, and
# already-rated movies are kept (exclude_rated=False).
sample_user = test_users[0]
recs = recommend_top_k(sample_user, top_k=50, exclude_rated=False)
# NOTE(review): recommend_top_k returns None for a user unknown to the
# encoder, which would make this comprehension fail — assumes the user is known.
scores = [r['predicted_rating'] for r in recs]

axes[0].hist(scores, bins=20, color='steelblue', edgecolor='black', alpha=0.7)
axes[0].set_title(f'Distribution des Scores Prédits\n(Utilisateur {sample_user})', 
                  fontweight='bold')
axes[0].set_xlabel('Score Prédit')
axes[0].set_ylabel('Fréquence')
axes[0].grid(True, alpha=0.3)

# 2. Recommended genres
top_genres = genre_counts.most_common(10)
# Split the (genre, count) pairs into two parallel tuples; this unpacking
# raises ValueError if genre_counts is empty.
genres, counts = zip(*top_genres)

axes[1].barh(range(len(genres)), counts, color='coral', edgecolor='black')
axes[1].set_yticks(range(len(genres)))
axes[1].set_yticklabels(genres)
# NOTE(review): the "50 utilisateurs" in the title is hard-coded, not derived
# from len(sample_users).
axes[1].set_title('Top 10 Genres Recommandés\n(50 utilisateurs)', 
                  fontweight='bold')
axes[1].set_xlabel('Nombre de recommandations')
axes[1].grid(True, alpha=0.3, axis='x')

plt.tight_layout()
# NOTE(review): presumably ../outputs/plots/ already exists; it is not created here.
plt.savefig('../outputs/plots/06_recommendation_analysis.png', 
            dpi=150, bbox_inches='tight')
print(" Graphique sauvegardé: outputs/plots/06_recommendation_analysis.png")
# Close the figure after saving; it is written to disk rather than shown inline.
plt.close()

# ============================================
# PART 6: SAVING THE RECOMMENDATION REPORT
# ============================================

print("\n" + "=" * 70)
print(" SAUVEGARDE DU SYSTÈME DE RECOMMANDATION")
print("=" * 70)

# Build the recommendation report
# NOTE(review): diversity_score divides by len(all_recommended_items) and
# fails with ZeroDivisionError if no recommendation was collected.
rec_report = {
    'date': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    'model': 'HybridRecommenderNet',
    'best_rmse': float(checkpoint['rmse']),
    'recommendation_stats': {
        'n_users_tested': len(sample_users),
        'unique_items_recommended': len(set(all_recommended_items)),
        'diversity_score': float(len(set(all_recommended_items))/len(all_recommended_items)),
        'top_genres': dict(genre_counts.most_common(10))
    },
    'sample_recommendations': []
}

# Add example recommendations (top 5 for each of the first three test users)
# NOTE(review): both get_user_profile and recommend_top_k can return None;
# this loop assumes they do not for the sampled users.
for user_id in test_users[:3]:
    profile = get_user_profile(user_id)
    recs = recommend_top_k(user_id, top_k=5)
    
    rec_report['sample_recommendations'].append({
        'user_id': int(user_id),
        'user_profile': {
            'age': profile['age'],
            'gender': profile['gender'],
            'occupation': profile['occupation']
        },
        'recommendations': [
            {
                'title': r['title'],
                'predicted_rating': r['predicted_rating'],
                'genres': r['genres']
            } for r in recs
        ]
    })

# Write the report as indented JSON.
# NOTE(review): presumably ../outputs/metrics/ already exists; it is not created here.
with open('../outputs/metrics/recommendation_report.json', 'w') as f:
    json.dump(rec_report, f, indent=2)
print(" Rapport sauvegardé: outputs/metrics/recommendation_report.json")

# ============================================
# PART 7: INTERACTIVE DEMO FUNCTION
# ============================================

print("\n" + "=" * 70)
print(" FONCTION DE DÉMO INTERACTIVE")
print("=" * 70)

def demo_recommendation(user_id, top_k=10):
    """Print a user's profile, favourite movies and top-K recommendations.

    Usage:
        demo_recommendation(196, top_k=10)

    Args:
        user_id: Raw user id as stored in ``data_full['user_id']``.
        top_k: Number of recommendations to display.

    Returns:
        The list returned by ``recommend_top_k``, or ``None`` when the user
        has no row in ``data_full`` or cannot be scored by the model.
    """
    print("=" * 70)
    print(f" SYSTÈME DE RECOMMANDATION - UTILISATEUR {user_id}")
    print("=" * 70)

    # Make sure the user exists
    if user_id not in data_full['user_id'].unique():
        print(f" Utilisateur {user_id} introuvable")
        return None

    # Profile (get_user_profile returns None when the metadata row is missing)
    profile = get_user_profile(user_id)
    if profile is None:
        print(f"\n Profil indisponible pour l'utilisateur {user_id}")
    else:
        # Translate the known gender codes; show any other value unchanged
        # instead of silently labelling it 'Femme'.
        gender_labels = {'M': 'Homme', 'F': 'Femme'}
        gender = gender_labels.get(profile['gender'], profile['gender'])
        print(f"\n PROFIL:")
        print(f"   • Âge: {profile['age']} ans")
        print(f"   • Genre: {gender}")
        print(f"   • Profession: {profile['occupation']}")
        print(f"   • Activité: {profile['n_ratings']} films notés")
        print(f"   • Note moyenne: {profile['avg_rating']:.2f}⭐")

    # Favourite movies: the user's five highest ratings
    user_ratings = data_full[data_full['user_id'] == user_id].sort_values(
        'rating', ascending=False
    ).head(5)

    print(f"\n❤️  FILMS PRÉFÉRÉS:")
    for idx, (_, row) in enumerate(user_ratings.iterrows(), 1):
        movie = get_movie_info(row['item_id'])
        if movie:
            print(f"   {idx}. {movie['title']} ({row['rating']}⭐)")

    # Recommendations (recommend_top_k returns None for a user the encoder
    # does not know, even if that user has rows in data_full)
    recommendations = recommend_top_k(user_id, top_k=top_k)
    if recommendations is None:
        print(f"\n Aucune recommandation disponible pour l'utilisateur {user_id}")
        print("=" * 70)
        return None

    print(f"\n🎯 TOP {top_k} RECOMMANDATIONS:")
    print(f"   {'#':<3} {'Titre':<42} {'Score':<8} {'Genres'}")
    print(f"   {'-'*70}")

    for i, rec in enumerate(recommendations, 1):
        genres_str = ', '.join(rec['genres'][:3])
        # Titles are cut to 40 characters so the columns stay aligned.
        print(f"   {i:<3} {rec['title'][:40]:<42} "
              f"{rec['predicted_rating']:.2f}⭐   {genres_str}")

    print("=" * 70)

    return recommendations

# Announce the demo function (nothing is written to disk here; the lines
# below only print usage hints).
print(" Fonction demo_recommendation() créée")
print("\n Usage:")
print("   recommendations = demo_recommendation(196, top_k=10)")

# ============================================
# FINAL SUMMARY
# ============================================

print("\n" + "=" * 70)
print(" SYSTÈME DE RECOMMANDATION OPÉRATIONNEL")
print("=" * 70)

print("\n FONCTIONNALITÉS:")
print("    Recommandations personnalisées Top-K")
print("    Profils utilisateurs détaillés")
print("    Exclusion des films déjà notés")
print("    Analyse de diversité")
print("    Fonction de démo interactive")

# Metrics come from the loaded checkpoint; diversity is recomputed from the
# recommendations collected during the analysis step.
# NOTE(review): assumes the checkpoint stores a 'mae' entry — only 'rmse' is
# read earlier in this cell; TODO confirm. The diversity ratio also divides
# by len(all_recommended_items), which must be non-zero.
print("\n PERFORMANCES:")
print(f"   • RMSE: {checkpoint['rmse']:.4f}")
print(f"   • MAE: {checkpoint['mae']:.4f}")
print(f"   • Diversité: {len(set(all_recommended_items))/len(all_recommended_items)*100:.1f}%")

print("\n FICHIERS SAUVEGARDÉS:")
print("    Modèle: models/saved_models/best_model.pth")
print("    Rapport: outputs/metrics/recommendation_report.json")
print("    Graphiques: outputs/plots/")

print("\n TESTEZ LE SYSTÈME:")
print("   demo_recommendation(196, top_k=10)")
print("   demo_recommendation(42, top_k=15)")

print("\n PROCHAINE ÉTAPE: Interface Web et Déploiement SageMaker")
print("=" * 70)

 SYSTÈME DE RECOMMANDATION TOP-K

  Device: cpu

 CHARGEMENT DU MODÈLE ENTRAÎNÉ
 Modèle entraîné chargé
   Users: 943
   Items: 1682
   Features: 19
   Best RMSE: 0.6247
 Poids chargés, modèle en mode évaluation
 Encoders chargés
 Métadonnées chargées

 CRÉATION DES FONCTIONS DE RECOMMANDATION
 Fonctions de recommandation créées:
   - get_user_profile()
   - get_movie_info()
   - recommend_top_k()

 TESTS DE RECOMMANDATION

👤 UTILISATEUR 610

 Profil:
   Âge: 22 ans
   Genre: M
   Profession: student
   Nombre de ratings: 75
   Rating moyen: 3.73⭐

 Films appréciés (top 5):
   - Jumanji (1995)                                (5⭐)
   - To Kill a Mockingbird (1962)                  (5⭐)
   - Casablanca (1942)                             (5⭐)
   - Silence of the Lambs, The (1991)              (5⭐)
   - North by Northwest (1959)                     (5⭐)

 TOP 10 RECOMMANDATIONS:
    1. Pather Panchali (1955)                   | Score: 4.48⭐ | Genres: Drama
    2. Golden Earrings (1947)     

In [6]:
# %pip (rather than !pip) installs into the environment of the running kernel.
# streamlit is pinned to the version this notebook was last run with.
%pip install streamlit==1.50.0 plotly

1992.78s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


Collecting streamlit
  Downloading streamlit-1.50.0-py3-none-any.whl.metadata (9.5 kB)
Collecting toml<2,>=0.10.1 (from streamlit)
  Downloading toml-0.10.2-py2.py3-none-any.whl.metadata (7.1 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.50.0-py3-none-any.whl (10.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m96.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m123.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading toml-0.10.2-py2.py3-none-any.whl (16 kB)
Installing collected packages: toml, pydeck, streamlit
Successfully installed pydeck-0.9.1 streamlit-1.50.0 toml-0.10.2
