# Evaluator Module
The Evaluator module creates evaluation reports.

Reports contain evaluation metrics depending on models specified in the evaluation config.

In [28]:
# reloads modules automatically before entering the execution of code
%load_ext autoreload
%autoreload 2

# third parties imports
import numpy as np 
import pandas as pd


# local imports
from configs import EvalConfig
from constants import Constant as C
from loaders import export_evaluation_report
from loaders import load_ratings
from surprise.model_selection import train_test_split, KFold, cross_validate, LeaveOneOut
from surprise import Dataset, SVD, Reader, accuracy
from loaders import load_ratings
from loaders import load_items
from configs import EvalConfig
from collections import defaultdict
import time
from models import *


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [29]:
ratings_dataset = load_ratings(surprise_format=True)

In [30]:
algo = SVD()

# 1. Model validation functions
Validation functions are a way to perform crossvalidation on recommender system models. 

In [31]:
def generate_split_predictions(algo, ratings_dataset, eval_config):
    """Generate predictions on a random test set specified in eval_config"""
    # -- implement the function generate_split_predictions --
    trainset, testset = train_test_split(ratings_dataset, test_size=EvalConfig.test_size)
    algo.fit(trainset)
    predictions = algo.test(testset)
    return predictions


def generate_loo_top_n(algo, ratings_dataset, eval_config):
    """Generate top-n recommendations for each user on a random Leave-one-out split (LOO)"""
    # -- implement the function generate_loo_top_n --
    loo = LeaveOneOut(n_splits=1, random_state=1)
    trainset, testset = next(loo.split(ratings_dataset))
    algo.fit(trainset)
    anti_test_set = trainset.build_anti_testset()
    predictions = algo.test(anti_test_set)
    anti_testset_top_n = get_top_n(predictions, n=eval_config.top_n_value)
    

    return anti_testset_top_n, testset


def generate_full_top_n(algo, ratings_dataset, eval_config):
    """Generate top-n recommendations for each user with full training set (LOO)"""
    # -- implement the function generate_full_top_n --
    full_train_set = ratings_dataset.build_full_trainset()
    algo.fit(full_train_set)
    anti_test_set = full_train_set.build_anti_testset()
    predictions = algo.test(anti_test_set)
    anti_testset_top_n = get_top_n(predictions, n=eval_config.top_n_value)
    return anti_testset_top_n


def precompute_information():
    """ Returns a dictionary that precomputes relevant information for evaluating in full mode
    
    Dictionary keys:
    - precomputed_dict["item_to_rank"] : contains a dictionary mapping movie ids to popularity rankings
    """
    precomputed_dict = {}
    df_ratings = load_ratings()

    popularity_counts = df_ratings[C.ITEM_ID_COL].value_counts()
    item_to_rank = {item_id: rank+1 for rank, item_id in enumerate(popularity_counts.index)}
    precomputed_dict["item_to_rank"] = item_to_rank

    return precomputed_dict       


def create_evaluation_report(eval_config, sp_ratings, precomputed_dict, available_metrics):
    """ Create a DataFrame evaluating various models on metrics specified in an evaluation config.  
    """
    evaluation_dict = {}
    for model_name, model, arguments in eval_config.models:
        print(f'Handling model {model_name}')
        algo = model(**arguments)
        evaluation_dict[model_name] = {}
        
        # Type 1 : split evaluations
        if len(eval_config.split_metrics) > 0:
            print('Training split predictions')
            predictions = generate_split_predictions(algo, sp_ratings, eval_config)
            for metric in eval_config.split_metrics:
                print(f'- computing metric {metric}')
                assert metric in available_metrics['split']
                evaluation_function, parameters =  available_metrics["split"][metric]
                evaluation_dict[model_name][metric] = evaluation_function(predictions, **parameters) 

        # Type 2 : loo evaluations
        if len(eval_config.loo_metrics) > 0:
            print('Training loo predictions')
            anti_testset_top_n, testset = generate_loo_top_n(algo, sp_ratings, eval_config)
            for metric in eval_config.loo_metrics:
                assert metric in available_metrics['loo']
                evaluation_function, parameters =  available_metrics["loo"][metric]
                evaluation_dict[model_name][metric] = evaluation_function(anti_testset_top_n, testset, **parameters)
        
        # Type 3 : full evaluations
        if len(eval_config.full_metrics) > 0:
            print('Training full predictions')
            anti_testset_top_n = generate_full_top_n(algo, sp_ratings, eval_config)
            for metric in eval_config.full_metrics:
                assert metric in available_metrics['full']
                evaluation_function, parameters =  available_metrics["full"][metric]
                evaluation_dict[model_name][metric] = evaluation_function(
                    anti_testset_top_n,
                    **precomputed_dict,
                    **parameters
                )
        
    return pd.DataFrame.from_dict(evaluation_dict).T

    info = precompute_information()
    print(info)

# 2. Evaluation metrics
Implement evaluation metrics for either rating predictions (split metrics) or for top-n recommendations (loo metric, full metric)

In [32]:
def get_hit_rate(anti_testset_top_n, testset):
    """Calcule le taux moyen de succès (hit rate) pour chaque utilisateur.

    Un "hit" signifie que le film omis du testset figure parmi les recommandations du top-n.
    """
    hits = 0
    total = 0

    # Parcours de chaque observation dans le testset
    for user, left_out_movie, _ in testset:
        # Recommandations top-n pour l'utilisateur courant
        user_top_n = anti_testset_top_n.get(user, [])

        # Liste des ID des films recommandés
        recommended_movies = [movie_id for movie_id, _ in user_top_n]

        # Incrémentation du nombre de hits si le film est dans les recommandations
        if left_out_movie in recommended_movies:
            hits += 1

        total += 1

    # Calcul final du taux de hit
    return hits / total if total else 0


def get_novelty(anti_testset_top_n, item_to_rank):
    """Compute the average novelty of the top-n recommendation over the users (full metric)
    
    The novelty is defined as the average ranking of the movies recommended
        item_to_rank (dict): {item_id: popularity_rank}
    """

    total_novelty = 0
    total_users = len(anti_testset_top_n)

    for user_id, recommendations in anti_testset_top_n.items():
        user_novelty = 0
        for item_id, _ in recommendations:
            rank = item_to_rank.get(item_id, 0)  # 0 or a high number if unknown
            user_novelty += rank
        total_novelty += user_novelty

    avg_novelty = total_novelty / total_users if total_users > 0 else 0
    return avg_novelty

# 3. Evaluation workflow
Load data, evaluate models and save the experimental outcomes

In [33]:
AVAILABLE_METRICS = {
    "split": {
        "MAE": (accuracy.mae, {'verbose': False}),
        # -- add new split metrics here --
    "RMSE" : (accuracy.rmse, {'verbose': False}),
    }
    # -- add new types of metrics here --
    ,"loo" : {"hit_rate" : (get_hit_rate, {})},
    "full" : {"novelty" : (get_novelty, {})}
}

sp_ratings = load_ratings(surprise_format=True)
algo = SVD()
test = generate_split_predictions(algo, sp_ratings, EvalConfig)

top_n_loo_top,test_set_loo = generate_loo_top_n(algo, sp_ratings, EvalConfig)
rows = []
for user_id, item_list in top_n_loo_top.items():
    for item_id, estimated_rating in item_list:
        rows.append((user_id, item_id, estimated_rating))

df_topn = pd.DataFrame(rows, columns=['user', 'item', 'estimated_rating'])
df_topn.to_csv("top_n_loo.csv", index=False)

top_n_full = generate_full_top_n(algo, sp_ratings, EvalConfig) 
rows = []
for user_id, item_list in top_n_full.items():
    for item_id, estimated_rating in item_list:
        rows.append((user_id, item_id, estimated_rating))
df_topn_full = pd.DataFrame(rows, columns=['user', 'item', 'estimated_rating'])

df_topn_full.to_csv("top_n_full.csv", index=False)
precomputed_dict = precompute_information()
evaluation_report = create_evaluation_report(EvalConfig, sp_ratings, precomputed_dict, AVAILABLE_METRICS)
display(evaluation_report)
export_evaluation_report(evaluation_report)



Handling model LinReg
Training split predictions
- computing metric MAE
- computing metric RMSE
Training loo predictions
Training full predictions


Unnamed: 0,MAE,RMSE,hit_rate,novelty
LinReg,0.7609,0.976292,0.00149,205708.457526


Evaluation report exported to: \Users\nicol\Documents\GitHub\Majeur-BA\RECOMMENDER-SYSTEM\mlsmm2156\evaluation\2025_05_26_21_14_39_report.csv



- Erreurs MAE et RMSE élevées → faible précision.
- hit_rate très bas (≈0.3 %) → recommandations peu pertinentes.
- Valeur de novelty anormalement haute → Elle peut indiquer un problème de mise à l’échelle ou de logique dans la fonction de calcul : il est possible que le modèle recommande quasi exclusivement des films non populaires

In [34]:
def evaluate_single_model(model_name, model_class, model_params, test_size=0.2):
    """
    Évalue un seul modèle et retourne le RMSE
    
    Args:
        model_name (str): Nom du modèle pour l'affichage
        model_class: Classe du modèle à évaluer
        model_params (dict): Paramètres du modèle
        test_size (float): Proportion des données à utiliser pour le test
    
    Returns:
        float: RMSE du modèle
    """
    # Charger les données
    df_ratings = load_ratings()
    reader = Reader(rating_scale=(0.5, 5.0))
    data = Dataset.load_from_df(df_ratings[['userId', 'movieId', 'rating']], reader)
    
    # Split train-test
    trainset, testset = train_test_split(data, test_size=test_size, random_state=42)
    
    # Initialiser et entraîner le modèle
    model = model_class(**model_params)
    model.fit(trainset)
    
    # Faire les prédictions
    predictions = model.test(testset)
    
    # Calculer le RMSE
    rmse = accuracy.rmse(predictions, verbose=False)
    
    print(f"Modèle: {model_name}")
    print(f"RMSE: {rmse:.4f}")
    
    return rmse


    # Exemple d'utilisation
rmse = evaluate_single_model(
    "CB_AllFeatures_RF",
    ContentBased,
    {"features_method": "all_features", "regressor_method": "random_forest"}
)

NotImplementedError: Feature method all_features not yet implemented