In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from datetime import datetime
import os
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter
from collections import defaultdict
from tqdm import tqdm

from constants import ROOT_DIR

In [2]:
# Root data folder, resolved from the project-level ROOT_DIR imported from `constants`
DATA_DIR = ROOT_DIR / "data"
# Click-log folder (name suggests the unzipped raw click files — TODO confirm)
CLICKS_DIR = DATA_DIR / "unzip_clicks"
# Working folder: the pickled train/test splits and article embeddings are read from here
WORK_DIR = DATA_DIR / "workbase"

# Modeling

Pour la phase de modélisation, si l'on souhaite construire un profil utilisateur, on peut utiliser plusieurs approches :
- [ ] Intégrer le click ranking comme une note
- [ ] Prendre en compte comme catégorie : la catégorie de l'article, la catégorie des labels KMEANS


Pour la baseline, on pourra tester :
- [ ] Prédire par popularité des articles (article popularity), ainsi que par popularité pondérée par la récence et par la racine carrée de la récence


Pour la partie content base, on pourra tester :
- [ ] Cosine similarity sur les embeddings des articles
- [ ] Cosine similarity sur les embeddings des articles pondérés par le user profile
- [ ] Average des embeddings du jeu de train ou last embedding du jeu de train


Concernant les métriques, on pourra tester :
- [ ] Le Recall@5, l'Acc@5, F1@5, le NDCG@5, le MAP@5


Si on retient l'approche content-based, on envisagera de précalculer les recommandations dans l'API pour éviter de recalculer les embeddings à chaque fois.


Stratégie à tester pour la partie collaborative :
- [ ] ALS
- [ ] 2-tower model

Limitations


It is important to note that the primary limitation of Precision@k and Recall@k is that they focus solely on whether the items in the top k positions are relevant, without considering the order of these items within those k positions. These metrics thus do not measure the ranking quality of the results.

In [195]:
# Cut-off date separating the training clicks from the test clicks
SPLIT_DATE = pd.to_datetime("2017-10-10")

# Identifier / count columns stored as 32-bit integers once loaded
_INT32_COLUMNS = ("article_id", "category_id", "publisher_id", "words_count")


def _load_split(pickle_name):
    """Read one pickled split from WORK_DIR, ordered by click time, with int32 id columns."""
    clicks = pd.read_pickle(WORK_DIR / pickle_name)
    clicks = clicks.sort_values("click_timestamp", ascending=True)
    clicks = clicks.reset_index(drop=True)
    return clicks.astype(dict.fromkeys(_INT32_COLUMNS, "int32"))


def _articles_by_user(clicks):
    """Collect, for every user_id, the list of clicked article ids (in frame order)."""
    return clicks.groupby("user_id")["click_article_id"].agg(list).sort_index()


# Train / test click frames, both read from the "*_filtered_10" pickles
train_df = _load_split("train_filtered_10.pickle")
test_df = _load_split("test_filtered_10.pickle")
# Train rows followed by test rows, re-indexed from 0
traintest_df = pd.concat([train_df, test_df], axis=0, ignore_index=True)
# user_id -> list of clicked article ids, one Series per split
train = _articles_by_user(train_df)
test = _articles_by_user(test_df)

In [34]:
def compute_avg_click_pos(df, alpha=0.5):
    """Average click ranking per article, plus two weights derived from it.

    Args:
        df: DataFrame with at least the columns "article_id" and "click_ranking".
        alpha: decay rate used for the exponential weight.

    Returns:
        DataFrame with one row per article_id and the columns
        "avg_click_ranking" (mean of click_ranking),
        "ranking_weight" (1 / (avg + 1)) and
        "ranking_exp_weight" (exp(-alpha * avg)).
    """
    per_article = (
        df.groupby("article_id")
        .agg(avg_click_ranking=("click_ranking", "mean"))
        .reset_index()
    )
    mean_rank = per_article["avg_click_ranking"]
    # Hyperbolic weight: decreases as the average ranking grows
    per_article["ranking_weight"] = 1 / (mean_rank + 1)
    # Exponential weight: decay speed controlled by alpha
    per_article["ranking_exp_weight"] = np.exp(-alpha * mean_rank)
    return per_article

In [None]:
# Per-article average click ranking and derived weights, computed on the training clicks only
avg_click_pos = compute_avg_click_pos(train_df)
# Display the resulting frame
avg_click_pos

## Baseline model : popularity-based recommender

In [212]:
# One row per distinct click_article_id (first occurrence kept), ordered by
# article_popularity from highest to lowest, keeping only the two listed columns.
# NOTE(review): this is built from traintest_df, so articles and popularity values
# seen only in the test split are included — confirm this is intended for a
# baseline that is then scored on that same test split.
top_articles_df = (
    traintest_df.drop_duplicates(subset=["click_article_id"])
    .sort_values("article_popularity", ascending=False)
    .filter(["click_article_id", "article_popularity"])
)

In [214]:
# Mapping click_article_id -> article_popularity. When an article appears on several
# rows, the value from its last row wins (dict construction overwrites earlier keys).
# NOTE(review): built from train + test rows, like top_articles_df — confirm intended.
article_popularity_dict = dict(
    zip(traintest_df["click_article_id"], traintest_df["article_popularity"])
)

In [197]:
# Recommend to a user the most popular articles they have not read in the train set
def get_popularity_recommendations(train_data, top_articles_df, user_id, top_n=5):
    """Return the `top_n` most popular articles the user has not read yet.

    Args:
        train_data: Series indexed by user_id whose values are the lists of
            article ids read by each user.
        top_articles_df: DataFrame with a "click_article_id" column, already
            ordered from most to least popular.
        user_id: user to build recommendations for.
        top_n: number of articles to return.

    Returns:
        Series holding the first `top_n` values of "click_article_id" that are
        not in the user's reading history. A user absent from `train_data` is
        treated as having read nothing, so the global top `top_n` is returned.
    """
    try:
        read_articles = train_data.at[user_id]
    except KeyError:
        # Previously `read_articles` stayed unbound here and the filter below
        # crashed; an unknown user simply has an empty history.
        print(f"User ID {user_id} not found in the training data.")
        read_articles = []

    # Filter out the articles already read by the user
    unread = top_articles_df[~top_articles_df["click_article_id"].isin(read_articles)]
    # The frame is already sorted by popularity, so the head is the top N
    return unread["click_article_id"].head(top_n)

In [198]:
# Count, for every training user, how many popularity recommendations are hit in the test set
def compare_recommendations(train_data, test_data, top_articles_df, top_n=5):
    """Count hits between popularity recommendations and each user's first test clicks.

    Args:
        train_data: Series indexed by user_id with the list of articles read in train.
        test_data: Series indexed by user_id with the list of articles read in test.
        top_articles_df: popularity-ordered DataFrame passed through to
            `get_popularity_recommendations`.
        top_n: number of recommendations, and number of leading test clicks
            they are compared against.

    Returns:
        DataFrame with columns "user_id" and "hits", one row per user of
        `train_data` (empty, with the same columns, when there are no users).
    """
    results = []
    for user_id in train_data.index:
        recommended = get_popularity_recommendations(
            train_data, top_articles_df, user_id=user_id, top_n=top_n
        )
        try:
            # First top_n articles read by the user in the test set
            test_articles = test_data.at[user_id][:top_n]
        except KeyError:
            # Without this reset the previous user's test clicks were reused
            # (or the name was unbound for the very first user).
            print(f"User ID {user_id} not found in the test data.")
            test_articles = []

        # A hit is a recommended article that appears among those test clicks
        hits = len(set(recommended).intersection(test_articles))
        results.append((user_id, hits))

    # An empty list still yields a frame with the two expected columns
    return pd.DataFrame(results, columns=["user_id", "hits"])

In [199]:
def precision_recall_at_k(train_data, test_data, top_articles_df, top_n=5):
    """Per-user precision and recall of the popularity recommendations.

    For every user of `train_data`, the `top_n` popularity recommendations are
    compared with the first `top_n` articles the user clicked in `test_data`.
    Precision divides the hits by `top_n`; recall divides them by the number of
    distinct test articles considered. Users absent from `test_data` score 0.

    Returns:
        DataFrame with columns "user_id", "precision" and "recall".
    """
    rows = []
    for user_id in train_data.index:
        recommended = get_popularity_recommendations(
            train_data, top_articles_df, user_id, top_n
        )
        try:
            relevant = set(test_data.at[user_id][:top_n])
        except KeyError:
            relevant = set()

        n_hits = len(set(recommended) & relevant)

        if top_n > 0:
            precision = n_hits / top_n
        else:
            precision = 0

        if relevant:
            recall = n_hits / len(relevant)
        else:
            recall = 0

        rows.append((user_id, precision, recall))

    return pd.DataFrame(rows, columns=["user_id", "precision", "recall"])

In [200]:
# Hit counts of the popularity baseline for every training user (default top_n=5)
scores = compare_recommendations(train, test, top_articles_df)

In [201]:
# Per-user precision@5 and recall@5 of the popularity baseline, measured against the test clicks
precision_recall_scores = precision_recall_at_k(train, test, top_articles_df, top_n=5)

In [202]:
# Mean precision and mean recall over all training users (users without test clicks count as 0)
precision_recall_scores["precision"].mean(), precision_recall_scores["recall"].mean()

(np.float64(0.10144188327611575), np.float64(0.10482262547000162))

# Baseline model : co-occurrence-based recommender

Parfait ! Voici une version **complète en Python** d’un système de recommandation **non personnalisé**, basé sur les **co-occurrences d’articles**, **pondérées par la récence de lecture** (par rapport à une date de split).

---

## 🎯 Objectif

À partir d’un historique utilisateur (articles + dates de lecture), recommander les **5 articles les plus pertinents**, en pondérant la contribution de chaque article lu selon sa **récence**.

---

## 🧪 Données simulées

```python
import pandas as pd
import numpy as np

# ⚙️ Co-occurrences simulées
cooccurrences = pd.DataFrame({
    'article_1': ['a', 'a', 'b', 'b', 'c', 'd', 'e'],
    'article_2': ['x', 'y', 'x', 'z', 'y', 'w', 'z'],
    'count':     [5,   2,   3,   4,   1,   6,   2]
})

# 📚 Historique de lecture de l'utilisateur (avant la date de split)
history = pd.DataFrame({
    'article': ['a', 'b', 'c', 'd', 'e'],
    'read_date': pd.to_datetime([
        '2023-12-01', '2023-12-15', '2023-12-25', '2023-12-30', '2023-12-31'
    ])
})

split_date = pd.to_datetime('2024-01-01')
```

---

## ⚙️ Étapes de recommandation

### 1. Calcul des pondérations de récence

```python
# Plus l'article a été lu récemment, plus il est important
history['days_before_split'] = (split_date - history['read_date']).dt.days
history['recency_weight'] = np.exp(-0.1 * history['days_before_split'])  # base e⁻⁰.¹ᵈ
```

### 2. Extraire les co-occurrences liées aux articles lus

```python
# Garder les paires contenant un article lu
mask = cooccurrences['article_1'].isin(history['article']) | cooccurrences['article_2'].isin(history['article'])
related = cooccurrences[mask].copy()

# Identifier l'article lu (source) et la suggestion (cible)
def get_source_target(row):
    if row['article_1'] in history['article'].values:
        return row['article_1'], row['article_2']
    else:
        return row['article_2'], row['article_1']

related[['source', 'suggestion']] = related.apply(get_source_target, axis=1, result_type='expand')
```

### 3. Ajouter la pondération de récence à chaque ligne

```python
# Fusionner avec les pondérations des articles lus
related = related.merge(history[['article', 'recency_weight']], how='left', left_on='source', right_on='article')

# Score final = cooccurrence * pondération de récence
related['score'] = related['count'] * related['recency_weight']
```

### 4. Agrégation des scores et top 5

```python
# Aggrégation par suggestion (hors articles déjà lus)
final_scores = (
    related[~related['suggestion'].isin(history['article'])]
    .groupby('suggestion')['score']
    .sum()
    .sort_values(ascending=False)
)

# Top 5 recommandations
top5 = final_scores.head(5)

print("📌 Recommandations pondérées par récence :")
print(top5)
```

---

## ✅ Exemple de sortie possible

```text
📌 Recommandations pondérées par récence :
x    1.3
z    0.9
w    0.5
y    0.3
Name: score, dtype: float64
```

---

## 🧠 Ce que tu peux améliorer ensuite

* Ajouter un **filtrage par contenu** ou thématique.
* Penser à une normalisation des co-occurrences (TF-IDF-like).
* Ajouter un **paramètre de température** ou un facteur de **diversité**.
* Passer à **PySpark** si tu as des millions de lignes.

---

Souhaites-tu que je t'aide à encapsuler ce pipeline dans une classe ou fonction réutilisable ?


Calcul des co-occurrences par fréquence d'apparition des articles dans les sessions

In [5]:
from itertools import combinations


# One list of clicked articles per session.
# NOTE(review): `train_df_10` is not defined in any cell above; it presumably
# holds the same data as `train_df` (loaded from train_filtered_10.pickle) —
# confirm, otherwise this cell fails on a fresh kernel.
sessions = train_df_10.groupby("session_id")["click_article_id"].agg(list)

# Every unordered pair of clicks inside a session, with the two ids sorted so
# that (a, b) and (b, a) are counted together. Repeated clicks on the same
# article in a session produce (a, a) pairs.
pairs = [
    tuple(sorted(pair))
    for session in sessions
    for pair in combinations(session, 2)
]

# Number of occurrences of each pair, most frequent first
df = pd.DataFrame(pairs, columns=["article_1", "article_2"])
pair_counts = df.value_counts().reset_index(name="count")

print(pair_counts)


       article_1  article_2  count
0          64329     272143    677
1         199198     272143    649
2         160974     162655    517
3         198659     272143    505
4         160974     300470    484
...          ...        ...    ...
73644     107227     285524      1
73645     107227     285719      1
73646     107227     286321      1
73647     107233     156620      1
73648     107218     235990      1

[73649 rows x 3 columns]


In [5]:
# Preview: list of clicked articles per user.
# NOTE(review): `train_df_10` is not defined in any cell above — presumably the same data as `train_df`; confirm.
train_df_10.groupby("user_id")["click_article_id"].agg(list)

user_id
6         [202436, 288431, 160474, 59704, 162300, 166283...
17        [157861, 314770, 324823, 300473, 161907, 16862...
25        [128289, 129960, 235230, 271551, 298687, 59057...
26        [119592, 168868, 272660, 160974, 160974, 15661...
42        [145166, 157861, 75825, 107216, 107216, 313996...
                                ...                        
253111                                     [118391, 336223]
253370                                       [271261, 7863]
253511                                             [225019]
253567                                     [336223, 270229]
253663                                             [336223]
Name: click_article_id, Length: 10195, dtype: object

In [6]:
# One list of clicked articles per USER (not per session, despite the variable
# name): co-occurrence is counted over each user's whole training history.
# NOTE(review): `train_df_10` is not defined in any cell above — presumably the
# same data as `train_df`; confirm.
sessions = train_df_10.groupby("user_id")["click_article_id"].agg(list)

# Every unordered pair of clicks of a user, ids sorted so the order is ignored.
# Articles clicked more than once by a user produce (a, a) pairs.
pairs = [
    tuple(sorted(pair))
    for session in sessions
    for pair in combinations(session, 2)
]

# Number of occurrences of each pair, most frequent first. This overwrites the
# session-level `pair_counts` computed earlier.
df = pd.DataFrame(pairs, columns=["article_1", "article_2"])
pair_counts = df.value_counts().reset_index(name="count")

print(pair_counts)

        article_1  article_2  count
0          160974     162655   1600
1          160974     272143   1498
2          160974     160974   1478
3          123909     160974   1469
4          156560     160974   1368
...           ...        ...    ...
487903     362950     363026      1
487904     362950     363234      1
487905     363026     363234      1
487906     363026     363952      1
487907       2022      49204      1

[487908 rows x 3 columns]


In [7]:
def get_cooccurrence_recommendations(user_id, train_df, pair_counts, top_n=5):
    """Recommend the articles that co-occur most with a user's reading history.

    Args:
        user_id: user to build recommendations for.
        train_df: DataFrame with columns "user_id" and "click_article_id".
        pair_counts: DataFrame with columns "article_1", "article_2" and
            "count" (co-occurrence count of each pair).
        top_n: maximum number of article ids to return.

    Returns:
        List of at most `top_n` article ids, ordered by decreasing summed
        co-occurrence count with the user's history, excluding articles the
        user already read. Empty when the user has no history or when no pair
        involves any article of the history.
    """
    # Distinct articles read by the user
    history = train_df[train_df["user_id"] == user_id]["click_article_id"].unique()
    if len(history) == 0:
        print(f"No history found for user_id {user_id}")
        return []

    # Keep the pairs in which at least one side is in the user's history
    first_in_history = pair_counts["article_1"].isin(history)
    second_in_history = pair_counts["article_2"].isin(history)
    related_pairs = pair_counts[first_in_history | second_in_history].copy()

    # The candidate is the other side of the pair: article_2 when article_1 was
    # read, article_1 otherwise. Vectorised instead of a row-wise apply, which
    # was slow and returned a DataFrame (breaking the column assignment) when
    # no pair matched.
    related_pairs["candidate"] = related_pairs["article_2"].where(
        related_pairs["article_1"].isin(history), related_pairs["article_1"]
    )

    # Aggregate scores (sum of co-occurrences), best first
    recommendations = (
        related_pairs.groupby("candidate")["count"].sum().sort_values(ascending=False)
    )

    # Remove articles already read (pairs whose two sides are both in history)
    recommendations = recommendations[~recommendations.index.isin(history)]

    return recommendations.head(top_n).index.tolist()


In [8]:
# Precision@k and recall@k of the co-occurrence-based recommendations
def evaluate_cooccurrence_recommendations(train_df, test_df, pair_counts, top_n=5):
    """Score the co-occurrence recommendations of every training user.

    Args:
        train_df: DataFrame with columns "user_id" and "click_article_id".
        test_df: DataFrame with the same two columns, holding the test clicks.
        pair_counts: co-occurrence table passed to
            `get_cooccurrence_recommendations`.
        top_n: number of recommendations per user; also the precision denominator.

    Returns:
        DataFrame with columns "user_id", "precision" and "recall", one row per
        distinct user of `train_df`. Users without test clicks get recall 0.
    """
    # Group the test clicks once instead of scanning test_df for every user.
    # (The former `except KeyError` around a boolean mask could never trigger
    # for a missing user: an unmatched mask just yields an empty selection.)
    test_articles_by_user = defaultdict(set)
    for test_user, article in zip(test_df["user_id"], test_df["click_article_id"]):
        test_articles_by_user[test_user].add(article)

    results = []
    for user_id in train_df["user_id"].unique():
        recs = get_cooccurrence_recommendations(user_id, train_df, pair_counts, top_n)
        # .get avoids inserting empty entries for users absent from the test set
        test_articles = test_articles_by_user.get(user_id, set())
        hits = len(set(recs).intersection(test_articles))
        precision = hits / top_n if top_n > 0 else 0
        recall = hits / len(test_articles) if test_articles else 0
        results.append((user_id, precision, recall))
    return pd.DataFrame(results, columns=["user_id", "precision", "recall"])

In [21]:
# Mean precision@5 and recall@5 of the co-occurrence recommender.
# NOTE(review): this cell is identical to the next one but shows a different
# result; the execution counts (21 here, 9 below) suggest it was run with a
# different `pair_counts` (session-level vs user-level) — confirm which one.
# NOTE(review): `train_df_10` / `test_df_10` are not defined in any cell above.
cooccurrence_scores = evaluate_cooccurrence_recommendations(
    train_df_10, test_df_10, pair_counts, top_n=5
)
cooccurrence_scores["precision"].mean(), cooccurrence_scores["recall"].mean()

(np.float64(0.002491417361451692), np.float64(0.000967933102562286))

In [9]:
# Mean precision@5 and recall@5 of the co-occurrence recommender.
# NOTE(review): exact duplicate of the previous cell; the differing output
# presumably comes from a different `pair_counts` in the kernel at run time —
# keep a single cell per `pair_counts` variant, each with its own variable name.
# NOTE(review): `train_df_10` / `test_df_10` are not defined in any cell above.
cooccurrence_scores = evaluate_cooccurrence_recommendations(
    train_df_10, test_df_10, pair_counts, top_n=5
)
cooccurrence_scores["precision"].mean(), cooccurrence_scores["recall"].mean()

(np.float64(0.0008435507601765573), np.float64(0.00030655404848596346))

# Content based recommender system

In [None]:
# Load the pickled article embeddings and wrap them in a DataFrame.
# NOTE(review): the row index of embeddings_df is used below as the article id —
# presumably row i is the embedding of article i; confirm against the data source.
# NOTE(review): pd.read_pickle executes arbitrary code on load; only use trusted files.
embeddings_arr = pd.read_pickle(WORK_DIR / "articles_embeddings.pickle")
embeddings_df = pd.DataFrame(embeddings_arr)
# Distinct article ids clicked in train or test, in ascending order
valid_articles_ids = (
    traintest_df["click_article_id"].drop_duplicates().sort_values(ascending=True)
)
# Keep only the embedding rows whose index is one of those clicked articles
embeddings_filtered = embeddings_df.loc[
    embeddings_df.index.intersection(valid_articles_ids), :
]

In [203]:
# compute pca on embeddings_filtered
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Standardise every embedding dimension before the PCA
scaler = StandardScaler()
embeddings_scaled = scaler.fit_transform(embeddings_filtered)

# A float n_components keeps as many components as needed to explain 95% of the variance
pca = PCA(n_components=0.95, random_state=42)
embeddings_pca = pca.fit_transform(embeddings_scaled)
# Same article-id index as embeddings_filtered, so both frames are interchangeable downstream
embeddings_pca_df = pd.DataFrame(embeddings_pca, index=embeddings_filtered.index)

In [None]:
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity


def recommend_content_based(
    train_df, embeddings_filtered, SPLIT_DATE, alpha=0.1, beta=0.1, top_n=5
):
    """Content-based recommendations from a weighted user profile embedding.

    Each user's profile is the weighted mean of the embeddings of the articles
    they clicked, with weight exp(-alpha * days_before_split) *
    exp(-beta * (click_ranking - 1)). Candidates are ranked by cosine
    similarity to that profile, excluding articles already clicked.

    Args:
        train_df: DataFrame with columns "user_id", "click_article_id",
            "click_timestamp" and "click_ranking".
        embeddings_filtered: DataFrame indexed by article id, one embedding per row.
        SPLIT_DATE: reference date used to compute the age (in whole days) of a click.
        alpha: decay rate applied to the age of the click.
        beta: decay rate applied to the click ranking.
        top_n: number of articles to recommend per user.

    Returns:
        defaultdict(list) mapping user_id -> list of recommended article ids.
        Clicks on articles without an embedding are ignored when building the
        profile (as in recommend_content_based_avg); a user with no usable
        click gets an empty list.
    """
    recommendations = defaultdict(list)
    ref_date = pd.to_datetime(SPLIT_DATE)
    # Loop invariants, hoisted out of the per-user loop
    article_index = embeddings_filtered.index
    embedding_matrix = embeddings_filtered.values

    # groupby(sort=False) walks the users in order of first appearance, like
    # unique(), but splits the frame once instead of masking it per user.
    for user_id, user_df in train_df.groupby("user_id", sort=False):
        clicks = user_df.loc[user_df["click_article_id"].isin(article_index)]

        days_before_split = (
            ref_date - pd.to_datetime(clicks["click_timestamp"])
        ).dt.days.to_numpy()
        w_recency = np.exp(-alpha * days_before_split)
        w_position = np.exp(-beta * (clicks["click_ranking"].to_numpy() - 1))
        weights = w_recency * w_position
        total_weight = weights.sum()

        if total_weight == 0:
            recommendations[user_id] = []
            continue

        # Weighted mean of the clicked articles' embeddings
        clicked_embeddings = embeddings_filtered.loc[
            clicks["click_article_id"].to_numpy()
        ].values
        profile = (weights @ clicked_embeddings) / total_weight

        similarities = cosine_similarity(
            embedding_matrix, profile.reshape(1, -1)
        ).flatten()
        sim_df = pd.DataFrame({"article_id": article_index, "similarity": similarities})
        # Never recommend something the user already clicked
        read_articles = set(user_df["click_article_id"])
        sim_df = sim_df[~sim_df["article_id"].isin(read_articles)]
        top_recommendations = sim_df.nlargest(top_n, "similarity")
        recommendations[user_id] = list(top_recommendations["article_id"])

    return recommendations


In [166]:
# Content-based run on the raw embeddings (alpha=0.1, beta=0.1).
# The shared SPLIT_DATE constant replaces the duplicated literal date (same instant).
recommendations = recommend_content_based(
    train_df,
    embeddings_filtered,
    SPLIT_DATE,
    alpha=0.1,
    beta=0.1,
    top_n=5,
)

In [204]:
# Content-based run on the PCA-reduced embeddings (alpha=0.1, beta=0.1).
# The shared SPLIT_DATE constant replaces the duplicated literal date (same instant).
recommendations_pca = recommend_content_based(
    train_df,
    embeddings_pca_df,
    SPLIT_DATE,
    alpha=0.1,
    beta=0.1,
    top_n=5,
)

In [186]:
# Content-based run on the raw embeddings with stronger decays (alpha=0.2, beta=0.5).
# The shared SPLIT_DATE constant replaces the duplicated literal date (same instant).
recommendations_beta = recommend_content_based(
    train_df,
    embeddings_filtered,
    SPLIT_DATE,
    alpha=0.2,
    beta=0.5,
    top_n=5,
)

In [205]:
# Content-based run on the PCA-reduced embeddings with stronger decays (alpha=0.2, beta=0.5).
# The shared SPLIT_DATE constant replaces the duplicated literal date (same instant).
recommendations_beta_pca = recommend_content_based(
    train_df,
    embeddings_pca_df,
    SPLIT_DATE,
    alpha=0.2,
    beta=0.5,
    top_n=5,
)

In [187]:
def evaluate_user_recommendations(user_id, recommendations, test_df, top_n=5):
    """Precision, recall and F1 of one user's recommendations against the test set.

    Args:
        user_id: user to evaluate.
        recommendations: mapping user_id -> ordered list of recommended article ids.
        test_df: DataFrame with columns "user_id" and "click_article_id".
        top_n: only the first `top_n` recommendations are scored.

    Returns:
        Tuple (precision, recall, f1). Precision is divided by the number of
        recommendations actually scored, recall by the number of distinct test
        articles. (0.0, 0.0, 0.0) when the user has no test click or no
        recommendation.
    """
    is_user = test_df["user_id"] == user_id
    clicked = test_df.loc[is_user, "click_article_id"].values
    top_recs = recommendations.get(user_id, [])[:top_n]

    # Nothing to compare: every metric is zero
    if len(clicked) == 0 or len(top_recs) == 0:
        return 0.0, 0.0, 0.0

    relevant = set(clicked)
    n_hits = len(relevant.intersection(top_recs))

    precision = n_hits / len(top_recs)
    recall = n_hits / len(relevant)

    # Harmonic mean, defined as 0 when both components are 0
    if (precision + recall) > 0:
        f1 = 2 * precision * recall / (precision + recall)
    else:
        f1 = 0.0

    return precision, recall, f1


In [188]:
# Average precision, recall and F1 over every user of the test set
def evaluate_all_users(recommendations, test_df, top_n=5):
    """Average the per-user metrics over all distinct users of `test_df`.

    Args:
        recommendations: mapping user_id -> ordered list of recommended article ids.
        test_df: DataFrame with columns "user_id" and "click_article_id".
        top_n: number of recommendations scored per user.

    Returns:
        Tuple (avg_precision, avg_recall, avg_f1). Test users without any
        recommendation contribute zeros to the averages.
    """
    per_user_scores = [
        evaluate_user_recommendations(uid, recommendations, test_df, top_n)
        for uid in test_df["user_id"].unique()
    ]

    avg_precision = np.mean([score[0] for score in per_user_scores])
    avg_recall = np.mean([score[1] for score in per_user_scores])
    avg_f1 = np.mean([score[2] for score in per_user_scores])

    return avg_precision, avg_recall, avg_f1

In [189]:
# Score `recommendations` (raw embeddings, alpha=0.1, beta=0.1) against the test clicks
avg_precision, avg_recall, avg_f1 = evaluate_all_users(
    recommendations, test_df, top_n=5
)
print(
    f"Average Precision@5: {avg_precision:.4f}, Average Recall@5: {avg_recall:.4f}, Average F1@5: {avg_f1:.4f}"
)

Average Precision@5: 0.0142, Average Recall@5: 0.0058, Average F1@5: 0.0077


In [206]:
# Score `recommendations_pca` (PCA-reduced embeddings, alpha=0.1, beta=0.1) against the test clicks
avg_precision, avg_recall, avg_f1 = evaluate_all_users(
    recommendations_pca, test_df, top_n=5
)
print(
    f"Average Precision@5: {avg_precision:.4f}, Average Recall@5: {avg_recall:.4f}, Average F1@5: {avg_f1:.4f}"
)

Average Precision@5: 0.0147, Average Recall@5: 0.0061, Average F1@5: 0.0081


In [190]:
# Score `recommendations_beta` (raw embeddings, alpha=0.2, beta=0.5) against the test clicks
avg_precision, avg_recall, avg_f1 = evaluate_all_users(
    recommendations_beta, test_df, top_n=5
)
print(
    f"Average Precision@5: {avg_precision:.4f}, Average Recall@5: {avg_recall:.4f}, Average F1@5: {avg_f1:.4f}"
)

Average Precision@5: 0.0149, Average Recall@5: 0.0061, Average F1@5: 0.0082


In [207]:
# Score `recommendations_beta_pca` (PCA-reduced embeddings, alpha=0.2, beta=0.5) against the test clicks
avg_precision, avg_recall, avg_f1 = evaluate_all_users(
    recommendations_beta_pca, test_df, top_n=5
)
print(
    f"Average Precision@5: {avg_precision:.4f}, Average Recall@5: {avg_recall:.4f}, Average F1@5: {avg_f1:.4f}"
)

Average Precision@5: 0.0147, Average Recall@5: 0.0061, Average F1@5: 0.0082


In [183]:
def recommend_content_based_avg(train_df, embeddings_filtered, top_n=5):
    """
    Recommend articles based on the average embedding of articles read by each user.

    The profile is the unweighted mean of the embeddings of the distinct
    articles the user clicked; candidates are ranked by cosine similarity to
    that profile and already-read articles are excluded.

    Args:
        train_df: DataFrame with columns ['user_id', 'click_article_id']
        embeddings_filtered: DataFrame, index=article_id, values=embedding vectors
        top_n: int, number of recommendations

    Returns:
        dict[user_id, list of article_id]: top N recommendations per user
        (empty list when none of the user's articles has an embedding)
    """
    from sklearn.metrics.pairwise import cosine_similarity

    recommendations = {}
    # Loop invariants, hoisted out of the per-user loop
    article_index = embeddings_filtered.index
    embedding_matrix = embeddings_filtered.values

    # One pass over the frame instead of one boolean mask per user; sort=False
    # keeps the users in order of first appearance, like unique().
    clicks_by_user = train_df.groupby("user_id", sort=False)["click_article_id"]
    for user_id, user_articles in clicks_by_user:
        # Which embedding rows belong to articles the user has read; the same
        # mask selects the profile inputs and, negated, the candidates.
        is_read = article_index.isin(list(set(user_articles)))

        if not is_read.any():
            recommendations[user_id] = []
            continue

        # Compute average embedding
        avg_emb = embedding_matrix[is_read].mean(axis=0).reshape(1, -1)

        # Compute cosine similarity with all candidate articles
        similarities = cosine_similarity(embedding_matrix, avg_emb).flatten()

        # Exclude already read articles
        sim_df = pd.DataFrame(
            {
                "article_id": article_index[~is_read],
                "similarity": similarities[~is_read],
            }
        )
        top_recommendations = sim_df.nlargest(top_n, "similarity")
        recommendations[user_id] = list(top_recommendations["article_id"])

    return recommendations

In [None]:
# Recommendations from the unweighted average embedding of each user's read articles (raw embeddings)
recommendations_avg = recommend_content_based_avg(
    train_df, embeddings_filtered, top_n=5
)

In [185]:
# Score `recommendations_avg` (unweighted average-embedding profile) against the test clicks
avg_precision, avg_recall, avg_f1 = evaluate_all_users(
    recommendations_avg, test_df, top_n=5
)
print(
    f"Average Precision@5: {avg_precision:.4f}, Average Recall@5: {avg_recall:.4f}, Average F1@5: {avg_f1:.4f}"
)

Average Precision@5: 0.0055, Average Recall@5: 0.0057, Average F1@5: 0.0056


In [215]:
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity


def recommend_content_based(
    train_df,
    embeddings_filtered,
    SPLIT_DATE,
    alpha=0.2,
    beta=0.5,
    top_n=5,
    article_popularity=None,
    lambda_=0.7,
):
    """
    Content-based recommender with hybrid popularity.

    NOTE: this redefinition shadows the earlier `recommend_content_based`
    of this notebook; its default alpha/beta differ (0.2/0.5 here).

    Each user's profile is the weighted mean of the embeddings of the clicked
    articles, with weight exp(-alpha * days_before_split) *
    exp(-beta * (click_ranking - 1)). Candidates are scored by cosine
    similarity to the profile, optionally blended with min-max normalised
    popularity: lambda_ * similarity + (1 - lambda_) * popularity_norm.

    Args:
        train_df: DataFrame with columns "user_id", "click_article_id",
            "click_timestamp" and "click_ranking".
        embeddings_filtered: DataFrame indexed by article id, one embedding per row.
        SPLIT_DATE: reference date used to compute the age (in whole days) of a click.
        alpha: decay rate applied to the age of the click.
        beta: decay rate applied to the click ranking.
        top_n: number of articles to recommend per user.
        article_popularity: dict or pd.Series {article_id: popularity_score};
            when None, ranking uses similarity only.
        lambda_: float, weight for similarity (1-lambda_ for popularity)

    Returns:
        defaultdict(list) mapping user_id -> list of recommended article ids.
        Clicks on articles without an embedding are ignored when building the
        profile; a user with no usable click gets an empty list.
    """
    recommendations = defaultdict(list)
    ref_date = pd.to_datetime(SPLIT_DATE)
    # Loop invariants, hoisted out of the per-user loop
    article_index = embeddings_filtered.index
    embedding_matrix = embeddings_filtered.values

    # Min-max normalise the popularity once, if provided
    if article_popularity is not None:
        if isinstance(article_popularity, dict):
            pop_series = pd.Series(article_popularity)
        else:
            pop_series = article_popularity
        pop_min = pop_series.min()
        pop_max = pop_series.max()
        # The small epsilon avoids a division by zero when all scores are equal
        pop_norm = ((pop_series - pop_min) / (pop_max - pop_min + 1e-9)).rename(
            "popularity_norm"
        )
    else:
        pop_norm = None

    # groupby(sort=False) walks the users in order of first appearance, like
    # unique(), but splits the frame once instead of masking it per user.
    for user_id, user_df in train_df.groupby("user_id", sort=False):
        clicks = user_df.loc[user_df["click_article_id"].isin(article_index)]

        days_before_split = (
            ref_date - pd.to_datetime(clicks["click_timestamp"])
        ).dt.days.to_numpy()
        w_recency = np.exp(-alpha * days_before_split)
        w_position = np.exp(-beta * (clicks["click_ranking"].to_numpy() - 1))
        weights = w_recency * w_position
        total_weight = weights.sum()

        if total_weight == 0:
            recommendations[user_id] = []
            continue

        # Weighted mean of the clicked articles' embeddings
        clicked_embeddings = embeddings_filtered.loc[
            clicks["click_article_id"].to_numpy()
        ].values
        profile = (weights @ clicked_embeddings) / total_weight

        similarities = cosine_similarity(
            embedding_matrix, profile.reshape(1, -1)
        ).flatten()
        sim_df = pd.DataFrame({"article_id": article_index, "similarity": similarities})
        # Never recommend something the user already clicked
        read_articles = set(user_df["click_article_id"])
        sim_df = sim_df[~sim_df["article_id"].isin(read_articles)]

        if pop_norm is not None:
            # Articles without a popularity score count as popularity 0
            sim_df = sim_df.join(pop_norm, on="article_id")
            sim_df["popularity_norm"] = sim_df["popularity_norm"].fillna(0)
            sim_df["hybrid_score"] = (
                lambda_ * sim_df["similarity"]
                + (1 - lambda_) * sim_df["popularity_norm"]
            )
            top_recommendations = sim_df.nlargest(top_n, "hybrid_score")
        else:
            top_recommendations = sim_df.nlargest(top_n, "similarity")

        recommendations[user_id] = list(top_recommendations["article_id"])

    return recommendations

In [218]:
# Hybrid run: article_popularity must be a dict or a Series {article_id: score}.
# lambda_=0.3 puts 30% of the score on similarity and 70% on normalised popularity.
recommendations_with_pop = recommend_content_based(
    train_df,
    embeddings_filtered,
    SPLIT_DATE,
    alpha=0.2,
    beta=0.5,
    top_n=5,
    article_popularity=article_popularity_dict,
    lambda_=0.3,
)

In [217]:
# Score `recommendations_with_pop` (similarity blended with popularity) against the test clicks.
# NOTE(review): this cell's execution count (217) precedes that of the cell that
# builds `recommendations_with_pop` (218), so the printed score may come from an
# earlier run with different parameters — re-run in order to confirm.
avg_precision, avg_recall, avg_f1 = evaluate_all_users(
    recommendations_with_pop, test_df, top_n=5
)
print(
    f"Average Precision@5: {avg_precision:.4f}, Average Recall@5: {avg_recall:.4f}, Average F1@5: {avg_f1:.4f}"
)

Average Precision@5: 0.0643, Average Recall@5: 0.0263, Average F1@5: 0.0352


In [None]:
# Peek at the first training clicks to check the available columns
train_df.head(5)

In [222]:
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity


def recommend_content_based_position(
    train_df, embeddings_filtered, SPLIT_DATE, alpha=0.1, beta=0.1, top_n=5
):
    """Content-based recommendations where the position weight follows click order.

    Instead of the "click_ranking" column, each click is weighted by its rank
    in the user's own history counted from the end: the most recent click has
    position 1, the one before position 2, and so on. The full weight is
    exp(-alpha * days_before_split) * exp(-beta * (position - 1)).

    Args:
        train_df: DataFrame with columns "user_id", "click_article_id" and
            "click_timestamp".
        embeddings_filtered: DataFrame indexed by article id, one embedding per row.
        SPLIT_DATE: reference date used to compute the age (in whole days) of a click.
        alpha: decay rate applied to the age of the click.
        beta: decay rate applied to the position in the user's history.
        top_n: number of articles to recommend per user.

    Returns:
        defaultdict(list) mapping user_id -> list of recommended article ids.
        Clicks on articles without an embedding are ignored when building the
        profile (they still occupy a position); a user with no usable click
        gets an empty list.
    """
    recommendations = defaultdict(list)
    ref_date = pd.to_datetime(SPLIT_DATE)
    # Loop invariants, hoisted out of the per-user loop
    article_index = embeddings_filtered.index
    embedding_matrix = embeddings_filtered.values

    # groupby(sort=False) walks the users in order of first appearance, like
    # unique(), but splits the frame once instead of masking it per user.
    for user_id, user_df in train_df.groupby("user_id", sort=False):
        # The positional weight only makes sense on chronologically ordered
        # clicks; sort here rather than rely on the caller having done it.
        # A stable sort leaves an already ordered history untouched.
        history = user_df.sort_values("click_timestamp", kind="stable")
        n_clicks = len(history)
        # Inverted position: last click -> 1, first click -> n_clicks
        positions = n_clicks - np.arange(n_clicks)

        has_embedding = history["click_article_id"].isin(article_index).to_numpy()
        clicks = history.loc[has_embedding]

        days_before_split = (
            ref_date - pd.to_datetime(clicks["click_timestamp"])
        ).dt.days.to_numpy()
        w_recency = np.exp(-alpha * days_before_split)
        w_position = np.exp(-beta * (positions[has_embedding] - 1))
        weights = w_recency * w_position
        total_weight = weights.sum()

        if total_weight == 0:
            recommendations[user_id] = []
            continue

        # Weighted mean of the clicked articles' embeddings
        clicked_embeddings = embeddings_filtered.loc[
            clicks["click_article_id"].to_numpy()
        ].values
        profile = (weights @ clicked_embeddings) / total_weight

        similarities = cosine_similarity(
            embedding_matrix, profile.reshape(1, -1)
        ).flatten()
        sim_df = pd.DataFrame({"article_id": article_index, "similarity": similarities})
        # Never recommend something the user already clicked
        read_articles = set(user_df["click_article_id"])
        sim_df = sim_df[~sim_df["article_id"].isin(read_articles)]
        top_recommendations = sim_df.nlargest(top_n, "similarity")
        recommendations[user_id] = list(top_recommendations["article_id"])

    return recommendations


In [223]:
# Recommendations from the profile weighted by recency and by position in the user's click history
recommendations_with_position = recommend_content_based_position(
    train_df, embeddings_filtered, SPLIT_DATE, alpha=0.2, beta=0.5, top_n=5
)

# Score the position-weighted recommendations against the test clicks
avg_precision, avg_recall, avg_f1 = evaluate_all_users(
    recommendations_with_position, test_df, top_n=5
)
print(
    f"Average Precision@5: {avg_precision:.4f}, Average Recall@5: {avg_recall:.4f}, Average F1@5: {avg_f1:.4f}"
)

Average Precision@5: 0.0136, Average Recall@5: 0.0055, Average F1@5: 0.0075


In [224]:
# Display the training frame (pandas truncates the rendering) to check the category columns used next
train_df

Unnamed: 0,user_id,session_id,session_start,session_size,click_article_id,click_timestamp,click_environment,click_deviceGroup,click_os,click_country,...,click_ranking,first_click_timestamp,article_recence,popularity_weighted_by_recence_sqrt,popularity_weighted_by_recence,article_id,category_id,created_at_ts,publisher_id,words_count
0,59,1506826329267796,2017-10-01 02:52:09,2,234853,2017-10-01 03:00:00.026,4,3,2,1,...,1,2017-10-01 03:00:00.026,0,0.000000,0.000000,234853,375,2017-09-30 12:18:09,0,140
1,154,1506826793323891,2017-10-01 02:59:53,2,96663,2017-10-01 03:00:04.207,4,3,2,1,...,1,2017-10-01 03:00:04.207,0,0.000000,0.000000,96663,209,2017-09-30 16:13:45,0,206
2,59,1506826329267796,2017-10-01 02:52:09,2,234995,2017-10-01 03:00:30.026,4,3,2,1,...,2,2017-10-01 03:00:30.026,0,0.000000,0.000000,234995,375,2017-09-30 12:07:16,0,155
3,149,1506826780136886,2017-10-01 02:59:40,2,145166,2017-10-01 03:00:31.267,4,3,20,1,...,1,2017-10-01 03:00:31.267,0,0.000000,0.000000,145166,269,2017-09-30 15:27:52,0,180
4,154,1506826793323891,2017-10-01 02:59:53,2,108854,2017-10-01 03:00:34.207,4,3,2,1,...,2,2017-10-01 03:00:34.207,0,0.000000,0.000000,108854,230,2017-09-30 21:09:43,0,167
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
181875,11668,1507593499203413,2017-10-09 23:58:19,2,225019,2017-10-09 23:59:35.046,4,3,2,1,...,2,2017-10-09 00:00:12.195,0,0.000000,0.000000,225019,354,2017-10-09 16:25:28,0,190
181876,15068,1507592669124580,2017-10-09 23:44:29,4,338129,2017-10-09 23:59:41.302,4,3,2,1,...,4,2017-10-09 18:23:42.835,0,0.000000,0.000000,338129,437,2017-10-09 15:20:51,0,187
181877,3086,1507593391142309,2017-10-09 23:56:31,6,119193,2017-10-09 23:59:43.075,2,3,20,1,...,2,2017-10-07 21:56:52.677,2,0.004642,0.006565,119193,247,2017-10-09 16:17:03,0,296
181878,34646,1507592869316761,2017-10-09 23:47:49,7,161149,2017-10-09 23:59:49.251,4,3,20,1,...,4,2017-10-09 17:53:37.335,0,0.000000,0.000000,161149,281,2017-10-09 14:48:21,0,205


In [225]:
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity


def recommend_content_based_category(
    train_df,
    embeddings_filtered,
    SPLIT_DATE,
    alpha=0.23,
    beta=0.5,
    top_n=5,
    category_weight=2.0,
):
    """
    Recommend unseen articles to every user from a category-boosted embedding profile.

    A user's profile is the weighted mean of the embeddings of the articles they
    clicked. The weight of a click is the product of three factors:

    * recency: ``exp(-alpha * days)``, ``days`` being the whole number of days
      between the click and ``SPLIT_DATE``;
    * click position: ``exp(-beta * (click_ranking - 1))``;
    * category boost: ``category_weight`` when the clicked article belongs to the
      user's most frequent category, ``1.0`` otherwise.

    All articles are then ranked by cosine similarity to the profile, the articles
    the user already clicked are removed, and the ``top_n`` best are kept.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Click log with the columns ``user_id``, ``click_article_id``,
        ``click_timestamp`` and ``click_ranking``. ``category_id`` is optional;
        without it no category boost is applied.
    embeddings_filtered : pandas.DataFrame
        One row per article, indexed by article id (the index must be unique),
        one column per embedding dimension.
    SPLIT_DATE : pandas.Timestamp
        Reference date from which the age of each click is measured.
    alpha : float
        Decay rate of the recency weight.
    beta : float
        Decay rate of the click-position weight.
    top_n : int
        Maximum number of articles recommended per user.
    category_weight : float
        Multiplier applied to clicks in the user's most frequent category.

    Returns
    -------
    collections.defaultdict
        ``{user_id: [article_id, ...]}``, best match first. Users are inserted in
        order of first appearance in ``train_df``. A user whose total weight is
        zero (or undefined) gets an empty list.

    Raises
    ------
    KeyError
        If a clicked article has no row in ``embeddings_filtered``.
    """
    recommendations = defaultdict(list)
    has_category = "category_id" in train_df.columns

    # Loop-invariant work: L2-normalise the article matrix once instead of once per
    # user. With unit rows, cosine similarity reduces to a dot product.
    candidate_ids = embeddings_filtered.index
    article_matrix = embeddings_filtered.to_numpy(dtype=float)
    row_norms = np.linalg.norm(article_matrix, axis=1, keepdims=True)
    row_norms[row_norms == 0] = 1.0  # all-zero rows get similarity 0 rather than NaN
    unit_articles = article_matrix / row_norms

    # One pass over the frame; sort=False keeps users in order of first appearance.
    for user_id, user_df in train_df.groupby("user_id", sort=False):
        positions = candidate_ids.get_indexer(user_df["click_article_id"])
        unknown = positions < 0
        if unknown.any():
            missing = user_df["click_article_id"].to_numpy()[unknown].tolist()
            raise KeyError(f"No embedding for clicked article(s): {missing}")

        age_days = (SPLIT_DATE - pd.to_datetime(user_df["click_timestamp"])).dt.days
        weights = np.exp(-alpha * age_days.to_numpy(dtype=float)) * np.exp(
            -beta * (user_df["click_ranking"].to_numpy(dtype=float) - 1)
        )

        if has_category:
            top_categories = user_df["category_id"].mode()
            # mode() is empty when every category of the user is missing.
            if not top_categories.empty:
                in_top = (user_df["category_id"] == top_categories.iloc[0]).to_numpy()
                weights = np.where(in_top, weights * category_weight, weights)

        total_weight = weights.sum()
        if total_weight == 0 or np.isnan(total_weight):
            recommendations[user_id] = []
            continue

        profile = weights @ article_matrix[positions] / total_weight
        profile_norm = np.linalg.norm(profile)
        if profile_norm > 0:
            profile = profile / profile_norm
        similarities = unit_articles @ profile

        # Never recommend something the user has already read.
        unseen = np.ones(len(candidate_ids), dtype=bool)
        unseen[positions] = False
        ranked = pd.Series(similarities[unseen], index=candidate_ids[unseen]).nlargest(
            top_n
        )
        recommendations[user_id] = ranked.index.tolist()

    return recommendations

In [226]:
# Build the top-5 recommendations with the category-boosted content-based profile
recommendations_with_category = recommend_content_based_category(
    train_df=train_df,
    embeddings_filtered=embeddings_filtered,
    SPLIT_DATE=SPLIT_DATE,
    alpha=0.2,
    beta=0.5,
    top_n=5,
)

# Evaluate these recommendations against test_df and report precision / recall / F1 at 5
avg_precision, avg_recall, avg_f1 = evaluate_all_users(
    recommendations_with_category,
    test_df,
    top_n=5,
)
print(
    f"Average Precision@5: {avg_precision:.4f}, Average Recall@5: {avg_recall:.4f}, Average F1@5: {avg_f1:.4f}"
)

Average Precision@5: 0.0144, Average Recall@5: 0.0058, Average F1@5: 0.0078


# Collaborative filtering recommender system

In [228]:
from implicit.als import AlternatingLeastSquares
from scipy.sparse import csr_matrix
import numpy as np

# Encode raw ids as integer codes (article_id is used for consistency).
# Each column is converted to a categorical once and reused for codes and lookups.
user_categorical = train_df["user_id"].astype("category")
article_categorical = train_df["article_id"].astype("category")
user_codes = user_categorical.cat.codes
article_codes = article_categorical.cat.codes

# code -> raw id lookups, plus the reverse lookup (raw id -> code) for articles
user_id_map = dict(enumerate(user_categorical.cat.categories))
article_id_map = dict(enumerate(article_categorical.cat.categories))
article_id_invmap = {raw_id: code for code, raw_id in article_id_map.items()}

# Sparse (user, article) matrix with a value of 1 for every click row
train_sparse = csr_matrix((np.ones(len(train_df)), (user_codes, article_codes)))

# Train the ALS factorisation on the interaction matrix
als_model = AlternatingLeastSquares(
    factors=50,
    regularization=0.1,
    iterations=20,
    random_state=42,
)
als_model.fit(train_sparse)


# Get recommendations by article_id
def get_als_recommendations(
    article_id, model, top_n=5, id_to_index=None, index_to_id=None
):
    """
    Return the articles most similar to ``article_id`` according to a fitted model.

    Parameters
    ----------
    article_id
        Raw article id, as found in the training data.
    model
        Fitted model exposing ``similar_items(index, N=...)``.
    top_n : int
        Maximum number of similar articles to return.
    id_to_index : dict, optional
        Raw article id -> matrix column index. Defaults to the notebook-level
        ``article_id_invmap``.
    index_to_id : dict, optional
        Matrix column index -> raw article id. Defaults to the notebook-level
        ``article_id_map``.

    Returns
    -------
    tuple[list, list]
        ``(article_ids, scores)``, both ordered as returned by the model and never
        containing ``article_id`` itself. Two empty lists when ``article_id`` is
        unknown (a message is printed in that case).
    """
    if id_to_index is None:
        id_to_index = article_id_invmap
    if index_to_id is None:
        index_to_id = article_id_map

    if article_id not in id_to_index:
        print(f"Article ID {article_id} not found in the training data.")
        return [], []

    article_index = id_to_index[article_id]
    # Ask for one extra neighbour because the query item is usually its own best match.
    neighbours = model.similar_items(article_index, N=top_n + 1)

    # Recent versions of `implicit` return an (ids, scores) pair of arrays, older
    # ones a list of (id, score) pairs: normalise both to two parallel sequences.
    if isinstance(neighbours, tuple):
        indices, scores = neighbours
    elif len(neighbours):
        indices, scores = zip(*neighbours)
    else:
        indices, scores = (), ()

    similar_ids, similar_scores = [], []
    for index, score in zip(indices, scores):
        index = int(index)
        # Drop the query item wherever it is ranked instead of assuming it is first.
        if index == article_index:
            continue
        similar_ids.append(index_to_id[index])
        similar_scores.append(float(score))

    return similar_ids[:top_n], similar_scores[:top_n]


# Demo: look up the articles the ALS model considers closest to one article.
# 123456 is a placeholder; swap in an article_id that exists in the training data.
article_id = 123456
recommended_articles, scores = get_als_recommendations(
    article_id,
    model=als_model,
    top_n=5,
)
print(
    f"Recommended articles for article ID {article_id}: {recommended_articles} with scores {scores}"
)


  0%|          | 0/20 [00:00<?, ?it/s]

Article ID 123456 not found in the training data.
Recommended articles for article ID 123456: [] with scores []


In [110]:
# Display the per-user ALS precision and recall table (columns: user_id, precision, recall).
# NOTE(review): `als_precision_recall_scores` is only displayed here, not computed — it
# must already exist in the kernel. TODO confirm which cell builds it, otherwise this
# cell fails after a kernel restart.
als_precision_recall_scores

Unnamed: 0,user_id,precision,recall
0,6,0.0,0.0
1,17,0.0,0.0
2,25,0.0,0.0
3,26,0.0,0.0
4,42,0.0,0.0
...,...,...,...
11046,235192,0.0,0.0
11047,235200,0.0,0.0
11048,235221,0.0,0.0
11049,235588,0.0,0.0


In [111]:
from implicit.datasets.lastfm import get_lastfm

# Load the Last.fm dataset through `implicit.datasets.lastfm.get_lastfm`.
# NOTE(review): judging by the names, the three values are artist labels, user labels
# and an (artist, user) play-count sparse matrix — confirm against the implicit docs.
artists, users, artist_user_plays = get_lastfm()

In [129]:
# Shape of the `artists` object returned by get_lastfm().
artists.shape

(292385,)

In [130]:
# Shape of the `users` object returned by get_lastfm().
users.shape

(358868,)

In [115]:
from implicit.nearest_neighbours import bm25_weight

# Weight the matrix, both to reduce the impact of users that have played the same artist
# thousands of times and to reduce the weight given to popular items.
# NOTE(review): the result overwrites `artist_user_plays`, so running this cell a second
# time weights an already-weighted matrix; re-run the loading cell first.
artist_user_plays = bm25_weight(artist_user_plays, K1=100, B=0.8)

# Get the transpose, since most of the functions in implicit expect (user, item) sparse
# matrices instead of (item, user).
user_plays = artist_user_plays.T.tocsr()