# Deep Evaluation of LLM-Based Recommender Systems (with Auto-Save Plots)
*Generated 2025-05-06 09:20 UTC*

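This notebook computes accuracy metrics (Precision@10, Recall@10, NDCG@10, AP@10, and reciprocal rank) and popularity-bias metrics for every LLM prompt, compares them across fairness settings and bias strategies, and automatically saves all plots to the `analysisresults/` folder using the `autosave_fig` context manager.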

In [None]:

from contextlib import contextmanager
from pathlib import Path
import matplotlib.pyplot as plt

@contextmanager
def autosave_fig(name, folder="analysisresults", dpi=300, fmt="png"):
    """Save the current matplotlib figure when the ``with`` block finishes.

    Usage::

        with autosave_fig("my_plot"):
            plt.plot(...)

    Parameters
    ----------
    name : str
        Output file name without extension.
    folder : str or Path
        Directory to write into; created (including parents) if missing.
    dpi : int
        Resolution forwarded to ``plt.savefig``.
    fmt : str
        File extension appended to ``name``.

    The file is written only when the body completes without raising.
    Whether or not it raised, the figure is shown and then closed, so
    calling this in a loop does not leave figures open behind it.
    """
    path = Path(folder)
    path.mkdir(parents=True, exist_ok=True)
    try:
        yield
        plt.tight_layout()
        plt.savefig(path / f"{name}.{fmt}", dpi=dpi)
    finally:
        plt.show()
        # Release the figure: outside the inline backend, show() does not
        # close it and every loop iteration would keep one more figure alive.
        plt.close()


In [None]:

import re

# Allowed values for the three option fields that follow the user id
# in a prompt_id of the form "<user>_<strategy>_<fairness>_<bias>".
STRATEGY_OPTS = ["random", "top-rated", "recent"]
FAIRNESS_OPTS = [
    "neutral",
    "gender_age_only",
    "occupation_only",
    "all_attributes",
]
BIAS_OPTS = [
    "baseline",
    "niche_genre",
    "exclude_popular",
    "indie_international",
    "temporal_diverse",
    "obscure_theme",
]


def _named_alternation(group, options):
    """Build ``_(?P<group>opt1|opt2|...)`` for one underscore-prefixed field."""
    return "_(?P<" + group + ">" + "|".join(options) + ")"


# Anchored pattern: numeric user id, then exactly one value from each list.
PROMPT_RE = re.compile(
    r"^(?P<user>\d+)"
    + _named_alternation("strategy", STRATEGY_OPTS)
    + _named_alternation("fairness", FAIRNESS_OPTS)
    + _named_alternation("bias", BIAS_OPTS)
    + "$"
)

def parse_prompt_id(pid: str):
    """Split a prompt id into its named fields.

    Returns the named groups of ``PROMPT_RE`` as a dict, with ``"user"``
    converted to ``int``. Raises ``ValueError`` when ``pid`` does not match.
    """
    match = PROMPT_RE.match(pid)
    if match is None:
        raise ValueError(f"Bad prompt_id: {pid}")
    # Re-assigning "user" keeps its original (first) position in the dict.
    return {**match.groupdict(), "user": int(match["user"])}


In [None]:

import pandas as pd, numpy as np, math, collections, itertools
import matplotlib.pyplot as plt, seaborn as sns
from rapidfuzz import process, fuzz
from tqdm.auto import tqdm

plt.rcParams['figure.figsize'] = (8,5)

# File paths
USER_DATA_PATH = '../Dataset/full_movies_data.csv'
RECS_PATH      = '../LlmCsvOutput/merged_prompt_results_top_rated.csv'

# Load data
user_df = pd.read_csv(USER_DATA_PATH)
rec_df  = pd.read_csv(RECS_PATH).rename(columns={'custom_id':'prompt_id'})

# Turn the comma-separated `movies` string into a list of stripped titles.
# A missing cell becomes an empty list instead of raising inside the list
# comprehension, and blank fragments (e.g. from a trailing comma) are dropped
# so they are not scored as recommendations.
# NOTE(review): a title that itself contains a comma is split into pieces
# here — confirm the export format never emits such titles.
rec_df['movies'] = (
    rec_df['movies']
    .fillna('')
    .astype(str)
    .str.split(',')
    .apply(lambda titles: [t.strip() for t in titles if t.strip()])
)

# Expand each prompt_id into user / strategy / fairness / bias columns.
# Building one DataFrame from the parsed dicts avoids creating a Series per row.
parsed_ids = pd.DataFrame(
    [parse_prompt_id(pid) for pid in rec_df['prompt_id']],
    index=rec_df.index,
)
rec_df = pd.concat([parsed_ids, rec_df], axis=1)

# Number of rows per title, ordered as value_counts returns them.
popularity_counts = user_df['Title'].value_counts()

# 1-based position of each title in popularity_counts (1 = first entry).
title_to_rank = dict(zip(popularity_counts.index, range(1, len(popularity_counts) + 1)))

# For every UserID, the set of titles that user rated above 2.
user_likes = (
    user_df[user_df['Rating'] > 2]
    .groupby('UserID')['Title']
    .apply(set)
    .to_dict()
)


In [None]:

def fuzzy_hits(recs, liked, cutoff=80):
    """Count the entries of ``recs`` that fuzzily match something in ``liked``.

    Each recommendation is looked up with ``process.extractOne`` using
    ``fuzz.token_sort_ratio`` and ``score_cutoff=cutoff``; it counts as a hit
    when that lookup returns a truthy result.
    """
    hit_count = 0
    for candidate in recs:
        best = process.extractOne(candidate, liked, scorer=fuzz.token_sort_ratio, score_cutoff=cutoff)
        if best:
            hit_count += 1
    return hit_count

def precision_at_k(recs, liked, k):
    """Fuzzy hits among the first ``k`` recommendations divided by ``k`` (0 when ``k`` is falsy)."""
    if not k:
        return 0
    return fuzzy_hits(recs[:k], liked) / k
def recall_at_k(recs, liked, k):
    """Fuzzy hits among the first ``k`` recommendations divided by ``len(liked)`` (0 when ``liked`` is empty)."""
    if not liked:
        return 0
    return fuzzy_hits(recs[:k], liked) / len(liked)

def ndcg_at_k(recs, liked, k, is_hit=None):
    """Normalised discounted cumulative gain over the first ``k`` recommendations.

    Parameters
    ----------
    recs : list
        Recommended titles, best first.
    liked : collection
        Titles counted as relevant.
    k : int
        Number of leading recommendations to score.
    is_hit : callable, optional
        ``is_hit(title) -> bool`` deciding whether a recommendation is
        relevant. Defaults to the fuzzy matcher ``fuzzy_hits`` that
        ``precision_at_k`` and ``recall_at_k`` use, so the ranking metrics and
        the hit-rate metrics agree on what a hit is. Pass
        ``lambda t: t in liked`` for exact matching.

    Returns
    -------
    float or int
        DCG divided by the ideal DCG for ``min(k, len(liked))`` hits, or 0
        when the ideal is 0.
    """
    if is_hit is None:
        is_hit = lambda title: fuzzy_hits([title], liked) > 0
    # Position 0 gets discount 1/log2(2) = 1, position 1 gets 1/log2(3), ...
    dcg = sum(1 / math.log2(pos + 2) for pos, title in enumerate(recs[:k]) if is_hit(title))
    ideal = sum(1 / math.log2(pos + 2) for pos in range(min(k, len(liked))))
    return dcg / ideal if ideal else 0

def apk(recs, liked, k, is_hit=None):
    """Average precision over the first ``k`` recommendations.

    Parameters
    ----------
    recs : list
        Recommended titles, best first.
    liked : collection
        Titles counted as relevant.
    k : int
        Number of leading recommendations to score.
    is_hit : callable, optional
        ``is_hit(title) -> bool`` deciding whether a recommendation is
        relevant. Defaults to the fuzzy matcher ``fuzzy_hits`` that
        ``precision_at_k`` and ``recall_at_k`` use. Pass
        ``lambda t: t in liked`` for exact matching.

    Returns
    -------
    float or int
        Sum of precision-at-rank over the hit positions divided by
        ``min(len(liked), k)``; 0 when that divisor is not positive
        (empty ``liked`` or ``k <= 0``).
    """
    denom = min(len(liked), k)
    if denom <= 0:
        # Covers empty `liked` and k == 0, which would otherwise divide by zero.
        return 0
    if is_hit is None:
        is_hit = lambda title: fuzzy_hits([title], liked) > 0
    score = hits = 0
    for rank, title in enumerate(recs[:k], 1):
        if is_hit(title):
            hits += 1
            score += hits / rank
    return score / denom

def rr(recs, liked, is_hit=None):
    """Reciprocal rank of the first relevant recommendation.

    Parameters
    ----------
    recs : list
        Recommended titles, best first.
    liked : collection
        Titles counted as relevant.
    is_hit : callable, optional
        ``is_hit(title) -> bool`` deciding whether a recommendation is
        relevant. Defaults to the fuzzy matcher ``fuzzy_hits`` that
        ``precision_at_k`` and ``recall_at_k`` use. Pass
        ``lambda t: t in liked`` for exact matching.

    Returns
    -------
    float or int
        ``1 / rank`` (1-based) of the first hit, or 0 when nothing matches.
    """
    if is_hit is None:
        is_hit = lambda title: fuzzy_hits([title], liked) > 0
    for rank, title in enumerate(recs, 1):
        if is_hit(title):
            return 1 / rank
    return 0

def avg_pop_rank(recs):
    """Mean ``title_to_rank`` value of ``recs``; NaN for an empty list.

    Titles missing from ``title_to_rank`` are given rank
    ``len(title_to_rank) + 1``.
    """
    if not recs:
        return np.nan
    fallback_rank = len(title_to_rank) + 1
    return np.mean([title_to_rank.get(title, fallback_rank) for title in recs])

def log_pop_diff(recs, hist):
    """Mean log-count of ``recs`` minus mean log-count of ``hist``.

    Counts come from ``popularity_counts``; a title that is not in it is
    treated as having count 1 (log 0). Returns 0 when either list is empty.
    """
    def _log_counts(titles):
        return [math.log(popularity_counts.get(title, 1)) for title in titles]

    rec_logs, hist_logs = _log_counts(recs), _log_counts(hist)
    if not (rec_logs and hist_logs):
        return 0
    return np.mean(rec_logs) - np.mean(hist_logs)

def is_long_tail(title, cutoff=None):
    """Return True when ``title`` ranks below ``cutoff`` in ``title_to_rank``.

    Parameters
    ----------
    title : str
        Title to look up.
    cutoff : int, optional
        Highest rank that is still *not* long tail. When omitted it is
        ``int(len(title_to_rank) * 0.8)``, computed at call time so it follows
        ``title_to_rank`` if the data cell is re-run (a default evaluated in
        the signature would be frozen at definition time).

    Titles missing from ``title_to_rank`` are treated as rank ``cutoff + 1``,
    i.e. always long tail.
    """
    if cutoff is None:
        cutoff = int(len(title_to_rank) * 0.8)
    return title_to_rank.get(title, cutoff + 1) > cutoff


In [None]:

# Per-user title history, filled lazily so user_df is filtered once per user.
user_hist_cache = {}
records = []

for row in tqdm(rec_df.itertuples(index=False), total=len(rec_df), desc="Evaluating"):
    uid = row.user
    if uid not in user_hist_cache:
        user_hist_cache[uid] = user_df.loc[user_df['UserID'] == uid, 'Title'].tolist()
    hist  = user_hist_cache[uid]
    liked = user_likes.get(uid, set())   # users with no liked titles get an empty set
    movies = row.movies

    # Share of recommended titles that are long tail, on a 0-100 scale so the
    # value matches the "_pct" column name and the "%" axis label it is plotted under.
    if movies:
        lt_pct = 100 * sum(is_long_tail(t) for t in movies) / len(movies)
    else:
        lt_pct = np.nan

    records.append({
        "prompt_id"     : row.prompt_id,
        "strategy"      : row.strategy,
        "fairness"      : row.fairness,
        "bias"          : row.bias,
        "precision@10"  : precision_at_k(movies, liked, 10),
        "recall@10"     : recall_at_k(movies, liked, 10),
        "ndcg@10"       : ndcg_at_k(movies, liked, 10),
        "ap@10"         : apk(movies, liked, 10),
        "rr"            : rr(movies, liked),
        "avg_pop_rank"  : avg_pop_rank(movies),
        "log_pop_diff"  : log_pop_diff(movies, hist),
        "lt_coverage_pct": lt_pct
    })

# One row of metrics per prompt.
metrics_df = pd.DataFrame(records)
metrics_df.head()


In [None]:

# (metrics_df column, axis/title label) for every chart to draw.
metrics_to_plot = [
    ("precision@10", "Precision@10"),
    ("recall@10", "Recall@10"),
    ("ndcg@10", "NDCG@10"),
    ("avg_pop_rank", "Average popularity rank"),
    ("log_pop_diff", "Log-popularity difference"),
    ("lt_coverage_pct", "Long-tail coverage %"),
]

# Left-to-right order of the bias strategies on the x axis.
bias_order = [
    "baseline",
    "niche_genre",
    "exclude_popular",
    "indie_international",
    "temporal_diverse",
    "obscure_theme",
]

# One line chart per metric: x = bias strategy, one line per fairness setting.
for col, label in metrics_to_plot:
    with autosave_fig(f"{col.replace('@','_at_')}_by_fairness_lines"):
        fig = plt.figure(figsize=(9, 5))
        ax = fig.add_subplot(111)

        # Mean of the metric for each (fairness, bias) pair, bias as columns.
        mean_by_group = metrics_df.groupby(['fairness', 'bias'])[col].mean()
        pivot = mean_by_group.unstack('bias').reindex(columns=bias_order)

        for fairness, yvals in pivot.iterrows():
            ax.plot(bias_order, yvals, marker='o', label=fairness)

        ax.set_title(f"{label} across bias strategies by fairness group")
        ax.set_xlabel("Bias strategy")
        ax.set_ylabel(label)
        plt.xticks(rotation=45)
        ax.grid(True, alpha=.3)
        ax.legend(title="Fairness setting")


Notebook complete. All charts auto-saved to `analysisresults/`. You may continue your analysis below.