In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
from datetime import datetime
import os
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter
from collections import defaultdict
from tqdm import tqdm

from constants import ROOT_DIR

In [4]:
DATA_DIR = ROOT_DIR / "data"
CLICKS_DIR = DATA_DIR / "unzip_clicks"
WORK_DIR = DATA_DIR / "workbase"

# Modeling

pour la phase de modélisation, si l'on souhaite construire un profil utilisateur, on peut utilisateur plusieurs approches :
- [ ] Intégrer le clic ranking comme une note
- [ ] Prendre en compte comme catégorie : la catégorie de l'article, le catégorie des labels KMEANS


Pour la baseline, on pourra tester :
- [ ] prédire par article popularity, pop weighted by recency & by squared root recency


Pour la partie content base, on pourra tester :
- [ ] Cosine similarity sur les embeddings des articles
- [ ] Cosine similarity sur les embeddings des articles pondérés par le user profile
- [ ] Average des embeddings du jeu de train ou last embedding du jeu de train


Concernant les métriques, on pourra tester :
- [ ] Le Recall@5, l'Acc@5, F1@5, le NDCG@5, le MAP@5


Si on retient l'approche content based on envisagera de précalculer les recommandations dans l'API pour éviter de recalculer les embeddings à chaque fois.


Stratégie à tester pour la partie collaborative :
- [ ] ALS
- [ ] 2-tower model

Limitations


It is important to note that the primary limitation of Precision and Recall @ k
 is that they focus solely on whether the items in the top k
 positions are relevant, without considering the order of these items within those k
 positions. These metrics thus do not measure the ranking quality of the results.

In [5]:
# Select the split date for training and testing
SPLIT_DATE = pd.to_datetime("2017-10-10")
# Load the train & test splits with the test_df_09.pickle and test_df_10.pickle
train_df = (
    pd.read_pickle(WORK_DIR / "train_filtered_10.pickle")
    .sort_values("click_timestamp", ascending=True)
    .reset_index(drop=True)
    .astype(
        {
            "article_id": "int32",
            "category_id": "int32",
            "publisher_id": "int32",
            "words_count": "int32",
        }
    )
)
test_df = (
    pd.read_pickle(WORK_DIR / "test_filtered_10.pickle")
    .sort_values("click_timestamp", ascending=True)
    .reset_index(drop=True)
    .astype(
        {
            "article_id": "int32",
            "category_id": "int32",
            "publisher_id": "int32",
            "words_count": "int32",
        }
    )
)
traintest_df = pd.concat([train_df, test_df], axis=0, ignore_index=True)
train = train_df.groupby("user_id")["click_article_id"].agg(list).sort_index()
test = test_df.groupby("user_id")["click_article_id"].agg(list).sort_index()

In [6]:
def compute_avg_click_pos(df, alpha=0.5):
    avg_click_pos = (
        df.groupby("article_id")["click_ranking"]
        .mean()
        .reset_index()
        .rename(columns={"click_ranking": "avg_click_ranking"})
    )
    avg_click_pos["ranking_weight"] = 1 / (avg_click_pos["avg_click_ranking"] + 1)
    avg_click_pos["ranking_exp_weight"] = np.exp(
        -alpha * avg_click_pos["avg_click_ranking"]
    )
    return avg_click_pos

In [7]:
avg_click_pos = compute_avg_click_pos(train_df)
avg_click_pos

Unnamed: 0,article_id,avg_click_ranking,ranking_weight,ranking_exp_weight
0,2022,4.000000,0.200000,0.135335
1,2075,5.266667,0.159574,0.071839
2,2136,5.333333,0.157895,0.069483
3,2231,2.750000,0.266667,0.252840
4,2303,3.266667,0.234375,0.195278
...,...,...,...,...
3238,363291,4.571429,0.179487,0.101701
3239,363947,3.588235,0.217949,0.166274
3240,363952,5.800000,0.147059,0.055023
3241,363967,3.875000,0.205128,0.144064


## Baseline model : popularity-based recommender

In [38]:
top_articles_df = (
    traintest_df.drop_duplicates(subset=["click_article_id"])
    .sort_values("article_popularity", ascending=False)
    .filter(["click_article_id", "article_popularity"])
)

In [39]:
article_popularity_dict = dict(
    zip(traintest_df["click_article_id"], traintest_df["article_popularity"])
)

In [197]:
# Predict for each user with the top 5 articles that he hasn't read in the train set relative to the article popularity
def get_popularity_recommendations(train_data, top_articles_df, user_id, top_n=5):
    # Get the articles read by the user
    try:
        read_articles = train_data.at[user_id]
    except KeyError:
        print(f"User ID {user_id} not found in the training data.")

    # Get the article popularity
    articles_not_read = top_articles_df[
        ~top_articles_df["click_article_id"].isin(read_articles)
    ]
    # Filter out the articles already read by the user
    recommendations = articles_not_read["click_article_id"].head(top_n)
    # Return the top N recommendations
    return recommendations

In [198]:
# Compare now the top 5 recommendations for each user in the train set
def compare_recommendations(train_data, test_data, top_articles_df, top_n=5):
    results = []
    for user_id in train_data.index:
        train_recommendations = get_popularity_recommendations(
            train_data, top_articles_df, user_id=user_id, top_n=top_n
        )
        try:
            # Get the articles read by the user in the test set
            test_articles = test_data.at[user_id][:top_n]
        except KeyError:
            print(f"User ID {user_id} not found in the test data.")

        # If a article of the top 5 recommandations is in the first 5 articles read by the user in the test set, count it as a hit
        # Count the hits
        hits = len(set(train_recommendations).intersection(set(test_articles)))
        results.append((user_id, hits))
    # Create a DataFrame with the results
    results = [(user_id, hits) for user_id, hits in results]
    if not results:
        return pd.DataFrame(columns=["user_id", "hits"])
    # Return a DataFrame with the user_id and the number of hits
    return pd.DataFrame(results, columns=["user_id", "hits"])

In [199]:
def precision_recall_at_k(train_data, test_data, top_articles_df, top_n=5):
    results = []
    for user_id in train_data.index:
        recs = get_popularity_recommendations(
            train_data, top_articles_df, user_id, top_n
        )
        try:
            test_articles = set(test_data.at[user_id][:top_n])
        except KeyError:
            test_articles = set()
        recs_set = set(recs)
        hits = len(recs_set.intersection(test_articles))
        precision = hits / top_n if top_n > 0 else 0
        recall = hits / len(test_articles) if test_articles else 0
        results.append((user_id, precision, recall))
    return pd.DataFrame(results, columns=["user_id", "precision", "recall"])

In [200]:
scores = compare_recommendations(train, test, top_articles_df)

In [201]:
# compute precision and recall at k for the recommendations for the test set
precision_recall_scores = precision_recall_at_k(train, test, top_articles_df, top_n=5)

In [202]:
precision_recall_scores["precision"].mean(), precision_recall_scores["recall"].mean()

(np.float64(0.10144188327611575), np.float64(0.10482262547000162))

# Baseline model : co-occurrence-based recommender

Parfait ! Voici une version **complète en Python** d’un système de recommandation **non personnalisé**, basé sur les **co-occurrences d’articles**, **pondérées par la récence de lecture** (par rapport à une date de split).

---

## 🎯 Objectif

À partir d’un historique utilisateur (articles + dates de lecture), recommander les **5 articles les plus pertinents**, en pondérant la contribution de chaque article lu selon sa **récence**.

---

## 🧪 Données simulées

```python
import pandas as pd
import numpy as np

# ⚙️ Co-occurrences simulées
cooccurrences = pd.DataFrame({
    'article_1': ['a', 'a', 'b', 'b', 'c', 'd', 'e'],
    'article_2': ['x', 'y', 'x', 'z', 'y', 'w', 'z'],
    'count':     [5,   2,   3,   4,   1,   6,   2]
})

# 📚 Historique de lecture de l'utilisateur (avant la date de split)
history = pd.DataFrame({
    'article': ['a', 'b', 'c', 'd', 'e'],
    'read_date': pd.to_datetime([
        '2023-12-01', '2023-12-15', '2023-12-25', '2023-12-30', '2023-12-31'
    ])
})

split_date = pd.to_datetime('2024-01-01')
```

---

## ⚙️ Étapes de recommandation

### 1. Calcul des pondérations de récence

```python
# Plus l'article a été lu récemment, plus il est important
history['days_before_split'] = (split_date - history['read_date']).dt.days
history['recency_weight'] = np.exp(-0.1 * history['days_before_split'])  # base e⁻⁰.¹ᵈ
```

### 2. Extraire les co-occurrences liées aux articles lus

```python
# Garder les paires contenant un article lu
mask = cooccurrences['article_1'].isin(history['article']) | cooccurrences['article_2'].isin(history['article'])
related = cooccurrences[mask].copy()

# Identifier l'article lu (source) et la suggestion (cible)
def get_source_target(row):
    if row['article_1'] in history['article'].values:
        return row['article_1'], row['article_2']
    else:
        return row['article_2'], row['article_1']

related[['source', 'suggestion']] = related.apply(get_source_target, axis=1, result_type='expand')
```

### 3. Ajouter la pondération de récence à chaque ligne

```python
# Fusionner avec les pondérations des articles lus
related = related.merge(history[['article', 'recency_weight']], how='left', left_on='source', right_on='article')

# Score final = cooccurrence * pondération de récence
related['score'] = related['count'] * related['recency_weight']
```

### 4. Agrégation des scores et top 5

```python
# Aggrégation par suggestion (hors articles déjà lus)
final_scores = (
    related[~related['suggestion'].isin(history['article'])]
    .groupby('suggestion')['score']
    .sum()
    .sort_values(ascending=False)
)

# Top 5 recommandations
top5 = final_scores.head(5)

print("📌 Recommandations pondérées par récence :")
print(top5)
```

---

## ✅ Exemple de sortie possible

```text
📌 Recommandations pondérées par récence :
x    1.3
z    0.9
w    0.5
y    0.3
Name: score, dtype: float64
```

---

## 🧠 Ce que tu peux améliorer ensuite

* Ajouter un **filtrage par contenu** ou thématique.
* Penser à une normalisation des co-occurrences (TF-IDF-like).
* Ajouter un **paramètre de température** ou un facteur de **diversité**.
* Passer à **PySpark** si tu as des millions de lignes.

---

Souhaite-tu que je t'aide à encapsuler ce pipeline dans une classe ou fonction réutilisable ?


Calcul des co-occurences par fréquence d'apparition des articles dans les sessions
```python

In [5]:
from itertools import combinations


# Isolate each session with a list of articles clicked
sessions = train_df_10.groupby("session_id")["click_article_id"].agg(list)

# Étape 1 : générer toutes les paires d’articles par session (ordre non important)
pairs = []
for session in sessions:
    for a, b in combinations(session, 2):
        # Étape 2 : trier chaque paire pour ignorer l’ordre
        pair = tuple(sorted((a, b)))
        pairs.append(pair)

# Étape 3 : compter les paires avec pandas
df = pd.DataFrame(pairs, columns=["article_1", "article_2"])
pair_counts = df.value_counts().reset_index(name="count")

print(pair_counts)


       article_1  article_2  count
0          64329     272143    677
1         199198     272143    649
2         160974     162655    517
3         198659     272143    505
4         160974     300470    484
...          ...        ...    ...
73644     107227     285524      1
73645     107227     285719      1
73646     107227     286321      1
73647     107233     156620      1
73648     107218     235990      1

[73649 rows x 3 columns]


In [5]:
train_df_10.groupby("user_id")["click_article_id"].agg(list)

user_id
6         [202436, 288431, 160474, 59704, 162300, 166283...
17        [157861, 314770, 324823, 300473, 161907, 16862...
25        [128289, 129960, 235230, 271551, 298687, 59057...
26        [119592, 168868, 272660, 160974, 160974, 15661...
42        [145166, 157861, 75825, 107216, 107216, 313996...
                                ...                        
253111                                     [118391, 336223]
253370                                       [271261, 7863]
253511                                             [225019]
253567                                     [336223, 270229]
253663                                             [336223]
Name: click_article_id, Length: 10195, dtype: object

In [6]:
# Isolate each session with a list of articles clicked
sessions = train_df_10.groupby("user_id")["click_article_id"].agg(list)

# Étape 1 : générer toutes les paires d’articles par session (ordre non important)
pairs = []
for session in sessions:
    for a, b in combinations(session, 2):
        # Étape 2 : trier chaque paire pour ignorer l’ordre
        pair = tuple(sorted((a, b)))
        pairs.append(pair)

# Étape 3 : compter les paires avec pandas
df = pd.DataFrame(pairs, columns=["article_1", "article_2"])
pair_counts = df.value_counts().reset_index(name="count")

print(pair_counts)

        article_1  article_2  count
0          160974     162655   1600
1          160974     272143   1498
2          160974     160974   1478
3          123909     160974   1469
4          156560     160974   1368
...           ...        ...    ...
487903     362950     363026      1
487904     362950     363234      1
487905     363026     363234      1
487906     363026     363952      1
487907       2022      49204      1

[487908 rows x 3 columns]


In [7]:
def get_cooccurrence_recommendations(user_id, train_df, pair_counts, top_n=5):
    # Get the user's history of articles read
    history = train_df[train_df["user_id"] == user_id]["click_article_id"].unique()
    if len(history) == 0:
        print(f"No history found for user_id {user_id}")
        return []

    # Select pairs containing an article from the user's history
    mask = pair_counts["article_1"].isin(history) | pair_counts["article_2"].isin(
        history
    )
    related_pairs = pair_counts[mask].copy()

    # Identify the candidate article in each pair
    def get_other(row):
        if row["article_1"] in history:
            return row["article_2"]
        else:
            return row["article_1"]

    related_pairs["candidate"] = related_pairs.apply(get_other, axis=1)

    # Aggregate scores (sum of co-occurrences)
    recommendations = (
        related_pairs.groupby("candidate")["count"].sum().sort_values(ascending=False)
    )

    # Remove articles already read
    recommendations = recommendations[~recommendations.index.isin(history)]

    # Return the top N recommendations as a list
    return recommendations.head(top_n).index.tolist()


In [8]:
# Create a function to compute the recall@5 and precision@5 for the co-occurrence-based recommendations
def evaluate_cooccurrence_recommendations(train_df, test_df, pair_counts, top_n=5):
    results = []
    for user_id in train_df["user_id"].unique():
        recs = get_cooccurrence_recommendations(user_id, train_df, pair_counts, top_n)
        try:
            test_articles = set(
                test_df[test_df["user_id"] == user_id]["click_article_id"]
            )
        except KeyError:
            test_articles = set()
        recs_set = set(recs)
        hits = len(recs_set.intersection(test_articles))
        precision = hits / top_n if top_n > 0 else 0
        recall = hits / len(test_articles) if test_articles else 0
        results.append((user_id, precision, recall))
    return pd.DataFrame(results, columns=["user_id", "precision", "recall"])

In [21]:
# Compute the precision and recall for the co-occurrence-based recommendations
cooccurrence_scores = evaluate_cooccurrence_recommendations(
    train_df_10, test_df_10, pair_counts, top_n=5
)
cooccurrence_scores["precision"].mean(), cooccurrence_scores["recall"].mean()

(np.float64(0.002491417361451692), np.float64(0.000967933102562286))

In [9]:
# Compute the precision and recall for the co-occurrence-based recommendations
cooccurrence_scores = evaluate_cooccurrence_recommendations(
    train_df_10, test_df_10, pair_counts, top_n=5
)
cooccurrence_scores["precision"].mean(), cooccurrence_scores["recall"].mean()

(np.float64(0.0008435507601765573), np.float64(0.00030655404848596346))

# Content based recommender system

In [8]:
# Load embeddings_df
embeddings_arr = pd.read_pickle(WORK_DIR / "articles_embeddings.pickle")
embeddings_df = pd.DataFrame(embeddings_arr)
valid_articles_ids = (
    traintest_df["click_article_id"].drop_duplicates().sort_values(ascending=True)
)
embeddings_filtered = embeddings_df.loc[
    embeddings_df.index.intersection(valid_articles_ids), :
]

In [9]:
# compute pca on embeddings_filtered
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
embeddings_scaled = scaler.fit_transform(embeddings_filtered)

pca = PCA(n_components=0.95, random_state=42)
embeddings_pca = pca.fit_transform(embeddings_scaled)
embeddings_pca_df = pd.DataFrame(embeddings_pca, index=embeddings_filtered.index)

In [10]:
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity


def recommend_content_based(
    train_df, embeddings_filtered, SPLIT_DATE, alpha=0.1, beta=0.1, top_n=5
):
    recommendations = defaultdict(list)
    user_ids = train_df["user_id"].unique()

    def recency_weight(date_str, ref_date, alpha):
        delta = (ref_date - date_str).days
        return np.exp(-alpha * delta)

    def ranking_weight(position, beta):
        return np.exp(-beta * (position - 1))

    for user_id in user_ids:
        user_df = train_df.loc[train_df["user_id"] == user_id]
        weighted_sum = np.zeros(embeddings_filtered.shape[1])
        total_weight = 0.0

        for row in user_df.itertuples(index=False):
            article_id = row.click_article_id
            click_date = row.click_timestamp
            position = row.click_ranking

            w_recency = recency_weight(click_date, SPLIT_DATE, alpha)
            w_position = ranking_weight(position, beta)

            weighted_sum += (
                w_recency * w_position * embeddings_filtered.loc[article_id, :].values
            )
            total_weight += w_recency * w_position

        if total_weight == 0:
            recommendations[user_id] = []
            continue

        weighted_sum /= total_weight
        similarities = cosine_similarity(
            embeddings_filtered.values, weighted_sum.reshape(1, -1)
        ).flatten()
        sim_df = pd.DataFrame(
            {"article_id": embeddings_filtered.index, "similarity": similarities}
        )
        read_articles = set(user_df["click_article_id"])
        sim_df = sim_df[~sim_df["article_id"].isin(read_articles)]
        top_recommendations = sim_df.nlargest(top_n, "similarity")
        recommendations[user_id] = list(top_recommendations["article_id"])

    return recommendations


In [166]:
recommendations = recommend_content_based(
    train_df,
    embeddings_filtered,
    datetime(2017, 10, 10, 0, 0, 0),
    alpha=0.1,
    beta=0.1,
    top_n=5,
)

In [204]:
recommendations_pca = recommend_content_based(
    train_df,
    embeddings_pca_df,
    datetime(2017, 10, 10, 0, 0, 0),
    alpha=0.1,
    beta=0.1,
    top_n=5,
)

In [186]:
recommendations_beta = recommend_content_based(
    train_df,
    embeddings_filtered,
    datetime(2017, 10, 10, 0, 0, 0),
    alpha=0.2,
    beta=0.5,
    top_n=5,
)

In [205]:
recommendations_beta_pca = recommend_content_based(
    train_df,
    embeddings_pca_df,
    datetime(2017, 10, 10, 0, 0, 0),
    alpha=0.2,
    beta=0.5,
    top_n=5,
)

In [50]:
def evaluate_user_recommendations(user_id, recommendations, test_df, top_n=5):
    """
    Compare recommendations to test set for a user and compute precision, recall, and f1@top_n.
    """
    # Get articles read by user in test set
    test_articles = test_df.loc[
        test_df["user_id"] == user_id, "click_article_id"
    ].values
    recs = recommendations.get(user_id, [])[:top_n]
    if len(test_articles) == 0 or len(recs) == 0:
        return 0.0, 0.0, 0.0

    # Take only top_n recommendations and test articles
    test_set = set(test_articles)

    hits = set(recs) & test_set
    precision = len(hits) / len(recs)
    recall = len(hits) / len(test_set)
    f1 = (
        (2 * precision * recall / (precision + recall))
        if (precision + recall) > 0
        else 0.0
    )
    return precision, recall, f1


In [49]:
# Compute average precision, recall, and f1 for all users
def evaluate_all_users(recommendations, test_df, top_n=5):
    all_precisions = []
    all_recalls = []
    all_f1s = []

    for user_id in test_df["user_id"].unique():
        precision, recall, f1 = evaluate_user_recommendations(
            user_id, recommendations, test_df, top_n
        )
        all_precisions.append(precision)
        all_recalls.append(recall)
        all_f1s.append(f1)

    avg_precision = np.mean(all_precisions)
    avg_recall = np.mean(all_recalls)
    avg_f1 = np.mean(all_f1s)

    return avg_precision, avg_recall, avg_f1

In [189]:
# Compute precision, recall, and f1 for all users
avg_precision, avg_recall, avg_f1 = evaluate_all_users(
    recommendations, test_df, top_n=5
)
print(
    f"Average Precision@5: {avg_precision:.4f}, Average Recall@5: {avg_recall:.4f}, Average F1@5: {avg_f1:.4f}"
)

Average Precision@5: 0.0142, Average Recall@5: 0.0058, Average F1@5: 0.0077


In [206]:
# Compute precision, recall, and f1 for all users
avg_precision, avg_recall, avg_f1 = evaluate_all_users(
    recommendations_pca, test_df, top_n=5
)
print(
    f"Average Precision@5: {avg_precision:.4f}, Average Recall@5: {avg_recall:.4f}, Average F1@5: {avg_f1:.4f}"
)

Average Precision@5: 0.0147, Average Recall@5: 0.0061, Average F1@5: 0.0081


In [190]:
# Compute precision, recall, and f1 for all users
avg_precision, avg_recall, avg_f1 = evaluate_all_users(
    recommendations_beta, test_df, top_n=5
)
print(
    f"Average Precision@5: {avg_precision:.4f}, Average Recall@5: {avg_recall:.4f}, Average F1@5: {avg_f1:.4f}"
)

Average Precision@5: 0.0149, Average Recall@5: 0.0061, Average F1@5: 0.0082


In [207]:
# Compute precision, recall, and f1 for all users
avg_precision, avg_recall, avg_f1 = evaluate_all_users(
    recommendations_beta_pca, test_df, top_n=5
)
print(
    f"Average Precision@5: {avg_precision:.4f}, Average Recall@5: {avg_recall:.4f}, Average F1@5: {avg_f1:.4f}"
)

Average Precision@5: 0.0147, Average Recall@5: 0.0061, Average F1@5: 0.0082


In [183]:
def recommend_content_based_avg(train_df, embeddings_filtered, top_n=5):
    """
    Recommend articles based on the average embedding of articles read by each user.

    Args:
        train_df: DataFrame with columns ['user_id', 'click_article_id']
        embeddings_filtered: DataFrame, index=article_id, values=embedding vectors
        top_n: int, number of recommendations

    Returns:
        dict[user_id, list of article_id]: top N recommendations per user
    """
    from sklearn.metrics.pairwise import cosine_similarity

    recommendations = {}
    user_ids = train_df["user_id"].unique()

    for user_id in user_ids:
        user_articles = train_df.loc[train_df["user_id"] == user_id, "click_article_id"]
        read_articles = set(user_articles)
        if not read_articles:
            recommendations[user_id] = []
            continue

        # Get embeddings for articles read by the user
        user_embs = embeddings_filtered.loc[
            embeddings_filtered.index.intersection(read_articles)
        ].values

        if user_embs.shape[0] == 0:
            recommendations[user_id] = []
            continue

        # Compute average embedding
        avg_emb = user_embs.mean(axis=0).reshape(1, -1)

        # Compute cosine similarity with all candidate articles
        similarities = cosine_similarity(embeddings_filtered.values, avg_emb).flatten()
        sim_df = pd.DataFrame(
            {"article_id": embeddings_filtered.index, "similarity": similarities}
        )

        # Exclude already read articles
        sim_df = sim_df[~sim_df["article_id"].isin(read_articles)]
        top_recommendations = sim_df.nlargest(top_n, "similarity")
        recommendations[user_id] = list(top_recommendations["article_id"])

    return recommendations

In [None]:
# compute recommendations using the average embedding method
recommendations_avg = recommend_content_based_avg(
    train_df, embeddings_filtered, top_n=5
)

In [185]:
# Evaluate the average embedding recommendations
avg_precision, avg_recall, avg_f1 = evaluate_all_users(
    recommendations_avg, test_df, top_n=5
)
print(
    f"Average Precision@5: {avg_precision:.4f}, Average Recall@5: {avg_recall:.4f}, Average F1@5: {avg_f1:.4f}"
)

Average Precision@5: 0.0055, Average Recall@5: 0.0057, Average F1@5: 0.0056


In [215]:
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity


def recommend_content_based(
    train_df,
    embeddings_filtered,
    SPLIT_DATE,
    alpha=0.2,
    beta=0.5,
    top_n=5,
    article_popularity=None,
    lambda_=0.7,
):
    """
    Content-based recommender with hybrid popularity.
    Args:
        article_popularity: dict or pd.Series {article_id: popularity_score}
        lambda_: float, weight for similarity (1-lambda_ for popularity)
    """
    recommendations = defaultdict(list)
    user_ids = train_df["user_id"].unique()

    def recency_weight(date_str, ref_date, alpha):
        delta = (ref_date - date_str).days
        return np.exp(-alpha * delta)

    def position_weight(position, beta):
        return np.exp(-beta * (position - 1))

    # Normalise la popularité si fournie
    if article_popularity is not None:
        if isinstance(article_popularity, dict):
            pop_series = pd.Series(article_popularity)
        else:
            pop_series = article_popularity
        pop_min = pop_series.min()
        pop_max = pop_series.max()
        pop_norm = (pop_series - pop_min) / (pop_max - pop_min + 1e-9)
    else:
        pop_norm = None

    for user_id in user_ids:
        user_df = train_df.loc[train_df["user_id"] == user_id]
        weighted_sum = np.zeros(embeddings_filtered.shape[1])
        total_weight = 0.0

        for row in user_df.itertuples(index=False):
            article_id = row.click_article_id
            click_date = row.click_timestamp
            position = row.click_ranking

            w_recency = recency_weight(click_date, SPLIT_DATE, alpha)
            w_position = position_weight(position, beta)

            weighted_sum += (
                w_recency * w_position * embeddings_filtered.loc[article_id, :].values
            )
            total_weight += w_recency * w_position

        if total_weight == 0:
            recommendations[user_id] = []
            continue

        weighted_sum /= total_weight
        similarities = cosine_similarity(
            embeddings_filtered.values, weighted_sum.reshape(1, -1)
        ).flatten()
        sim_df = pd.DataFrame(
            {"article_id": embeddings_filtered.index, "similarity": similarities}
        )
        read_articles = set(user_df["click_article_id"])
        sim_df = sim_df[~sim_df["article_id"].isin(read_articles)]

        # Ajout de la popularité normalisée
        if pop_norm is not None:
            sim_df = sim_df.join(pop_norm.rename("popularity_norm"), on="article_id")
            sim_df["popularity_norm"] = sim_df["popularity_norm"].fillna(0)
            sim_df["hybrid_score"] = (
                lambda_ * sim_df["similarity"]
                + (1 - lambda_) * sim_df["popularity_norm"]
            )
            top_recommendations = sim_df.nlargest(top_n, "hybrid_score")
        else:
            top_recommendations = sim_df.nlargest(top_n, "similarity")

        recommendations[user_id] = list(top_recommendations["article_id"])

    return recommendations

In [218]:
# article_popularity doit être un dict ou une Series {article_id: score}
recommendations_with_pop = recommend_content_based(
    train_df,
    embeddings_filtered,
    SPLIT_DATE,
    alpha=0.2,
    beta=0.5,
    top_n=5,
    article_popularity=article_popularity_dict,
    lambda_=0.3,
)

In [None]:
train_df.head(5)

In [222]:
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity


def recommend_content_based_position(
    train_df, embeddings_filtered, SPLIT_DATE, alpha=0.1, beta=0.1, top_n=5
):
    recommendations = defaultdict(list)
    user_ids = train_df["user_id"].unique()

    def recency_weight(date_str, ref_date, alpha):
        delta = (ref_date - date_str).days
        return np.exp(-alpha * delta)

    def position_weight(position, beta):
        return np.exp(-beta * (position - 1))

    for user_id in user_ids:
        user_df = train_df.loc[train_df["user_id"] == user_id]
        len_position = len(user_df)
        weighted_sum = np.zeros(embeddings_filtered.shape[1])
        total_weight = 0.0

        for i, row in enumerate(user_df.itertuples(index=False)):
            article_id = row.click_article_id
            click_date = row.click_timestamp
            position = len_position - i  # Invert position for ranking

            w_recency = recency_weight(click_date, SPLIT_DATE, alpha)
            w_position = position_weight(position, beta)

            weighted_sum += (
                w_recency * w_position * embeddings_filtered.loc[article_id, :].values
            )
            total_weight += w_recency * w_position

        if total_weight == 0:
            recommendations[user_id] = []
            continue

        weighted_sum /= total_weight
        similarities = cosine_similarity(
            embeddings_filtered.values, weighted_sum.reshape(1, -1)
        ).flatten()
        sim_df = pd.DataFrame(
            {"article_id": embeddings_filtered.index, "similarity": similarities}
        )
        read_articles = set(user_df["click_article_id"])
        sim_df = sim_df[~sim_df["article_id"].isin(read_articles)]
        top_recommendations = sim_df.nlargest(top_n, "similarity")
        recommendations[user_id] = list(top_recommendations["article_id"])

    return recommendations


In [223]:
# compute recommendations using the average embedding method
recommendations_with_position = recommend_content_based_position(
    train_df, embeddings_filtered, SPLIT_DATE, alpha=0.2, beta=0.5, top_n=5
)

# Evaluate the average embedding recommendations
avg_precision, avg_recall, avg_f1 = evaluate_all_users(
    recommendations_with_position, test_df, top_n=5
)
print(
    f"Average Precision@5: {avg_precision:.4f}, Average Recall@5: {avg_recall:.4f}, Average F1@5: {avg_f1:.4f}"
)

Average Precision@5: 0.0136, Average Recall@5: 0.0055, Average F1@5: 0.0075


In [224]:
train_df

Unnamed: 0,user_id,session_id,session_start,session_size,click_article_id,click_timestamp,click_environment,click_deviceGroup,click_os,click_country,...,click_ranking,first_click_timestamp,article_recence,popularity_weighted_by_recence_sqrt,popularity_weighted_by_recence,article_id,category_id,created_at_ts,publisher_id,words_count
0,59,1506826329267796,2017-10-01 02:52:09,2,234853,2017-10-01 03:00:00.026,4,3,2,1,...,1,2017-10-01 03:00:00.026,0,0.000000,0.000000,234853,375,2017-09-30 12:18:09,0,140
1,154,1506826793323891,2017-10-01 02:59:53,2,96663,2017-10-01 03:00:04.207,4,3,2,1,...,1,2017-10-01 03:00:04.207,0,0.000000,0.000000,96663,209,2017-09-30 16:13:45,0,206
2,59,1506826329267796,2017-10-01 02:52:09,2,234995,2017-10-01 03:00:30.026,4,3,2,1,...,2,2017-10-01 03:00:30.026,0,0.000000,0.000000,234995,375,2017-09-30 12:07:16,0,155
3,149,1506826780136886,2017-10-01 02:59:40,2,145166,2017-10-01 03:00:31.267,4,3,20,1,...,1,2017-10-01 03:00:31.267,0,0.000000,0.000000,145166,269,2017-09-30 15:27:52,0,180
4,154,1506826793323891,2017-10-01 02:59:53,2,108854,2017-10-01 03:00:34.207,4,3,2,1,...,2,2017-10-01 03:00:34.207,0,0.000000,0.000000,108854,230,2017-09-30 21:09:43,0,167
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
181875,11668,1507593499203413,2017-10-09 23:58:19,2,225019,2017-10-09 23:59:35.046,4,3,2,1,...,2,2017-10-09 00:00:12.195,0,0.000000,0.000000,225019,354,2017-10-09 16:25:28,0,190
181876,15068,1507592669124580,2017-10-09 23:44:29,4,338129,2017-10-09 23:59:41.302,4,3,2,1,...,4,2017-10-09 18:23:42.835,0,0.000000,0.000000,338129,437,2017-10-09 15:20:51,0,187
181877,3086,1507593391142309,2017-10-09 23:56:31,6,119193,2017-10-09 23:59:43.075,2,3,20,1,...,2,2017-10-07 21:56:52.677,2,0.004642,0.006565,119193,247,2017-10-09 16:17:03,0,296
181878,34646,1507592869316761,2017-10-09 23:47:49,7,161149,2017-10-09 23:59:49.251,4,3,20,1,...,4,2017-10-09 17:53:37.335,0,0.000000,0.000000,161149,281,2017-10-09 14:48:21,0,205


In [225]:
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity


def recommend_content_based_category(
    train_df,
    embeddings_filtered,
    SPLIT_DATE,
    alpha=0.23,
    beta=0.5,
    top_n=5,
    category_weight=2.0,
):
    """
    Build user profile by weighting articles from the user's most frequent category higher.
    """
    recommendations = defaultdict(list)
    user_ids = train_df["user_id"].unique()

    def recency_weight(date_str, ref_date, alpha):
        delta = (ref_date - date_str).days
        return np.exp(-alpha * delta)

    def ranking_weight(position, beta):
        return np.exp(-beta * (position - 1))

    for user_id in user_ids:
        user_df = train_df.loc[train_df["user_id"] == user_id]
        weighted_sum = np.zeros(embeddings_filtered.shape[1])
        total_weight = 0.0

        # Find user's most frequent category
        if "category_id" in user_df.columns:
            most_freq_cat = user_df["category_id"].mode().iloc[0]
        else:
            most_freq_cat = None

        for row in user_df.itertuples(index=False):
            article_id = row.click_article_id
            click_date = row.click_timestamp
            position = row.click_ranking
            category_id = row.category_id if hasattr(row, "category_id") else None

            w_recency = recency_weight(click_date, SPLIT_DATE, alpha)
            w_position = ranking_weight(position, beta)
            w_cat = category_weight if (category_id == most_freq_cat) else 1.0

            weighted_sum += (
                w_recency
                * w_position
                * w_cat
                * embeddings_filtered.loc[article_id, :].values
            )
            total_weight += w_recency * w_position * w_cat

        if total_weight == 0:
            recommendations[user_id] = []
            continue

        weighted_sum /= total_weight
        similarities = cosine_similarity(
            embeddings_filtered.values, weighted_sum.reshape(1, -1)
        ).flatten()
        sim_df = pd.DataFrame(
            {"article_id": embeddings_filtered.index, "similarity": similarities}
        )
        read_articles = set(user_df["click_article_id"])
        sim_df = sim_df[~sim_df["article_id"].isin(read_articles)]
        top_recommendations = sim_df.nlargest(top_n, "similarity")
        recommendations[user_id] = list(top_recommendations["article_id"])

    return recommendations

In [226]:
# compute recommendations using the average embedding method
recommendations_with_category = recommend_content_based_category(
    train_df, embeddings_filtered, SPLIT_DATE, alpha=0.2, beta=0.5, top_n=5
)

# Evaluate the average embedding recommendations
avg_precision, avg_recall, avg_f1 = evaluate_all_users(
    recommendations_with_category, test_df, top_n=5
)
print(
    f"Average Precision@5: {avg_precision:.4f}, Average Recall@5: {avg_recall:.4f}, Average F1@5: {avg_f1:.4f}"
)

Average Precision@5: 0.0144, Average Recall@5: 0.0058, Average F1@5: 0.0078


In [11]:
def recommend_content_based_last_article(train_df, embeddings_filtered, top_n=5):
    """
    Simple content-based recommender using only the last article read by each user.
    No recency weighting, no click ranking weighting - just pure similarity to the last article.
    
    Args:
        train_df: Training dataframe with user interactions
        embeddings_filtered: DataFrame with article embeddings (index=article_id)
        top_n: Number of recommendations to return
        
    Returns:
        dict: {user_id: [list of recommended article_ids]}
    """
    from sklearn.metrics.pairwise import cosine_similarity
    
    recommendations = {}
    user_ids = train_df["user_id"].unique()
    
    for user_id in user_ids:
        # Get user's interaction history sorted by timestamp (ascending)
        user_df = train_df.loc[train_df["user_id"] == user_id].sort_values("click_timestamp")
        
        if len(user_df) == 0:
            recommendations[user_id] = []
            continue
            
        # Get the last article read by the user
        last_article_id = user_df.iloc[-1]["click_article_id"]
        
        # Get all articles read by the user (to exclude from recommendations)
        read_articles = set(user_df["click_article_id"])
        
        # Check if the last article has embeddings
        if last_article_id not in embeddings_filtered.index:
            recommendations[user_id] = []
            continue
            
        # Get embedding of the last article
        last_article_embedding = embeddings_filtered.loc[last_article_id].values.reshape(1, -1)
        
        # Compute cosine similarity with all other articles
        similarities = cosine_similarity(embeddings_filtered.values, last_article_embedding).flatten()
        
        # Create DataFrame with article IDs and similarities
        sim_df = pd.DataFrame({
            "article_id": embeddings_filtered.index,
            "similarity": similarities
        })
        
        # Exclude articles already read by the user
        sim_df = sim_df[~sim_df["article_id"].isin(read_articles)]
        
        # Get top N recommendations
        top_recommendations = sim_df.nlargest(top_n, "similarity")
        recommendations[user_id] = list(top_recommendations["article_id"])
    
    return recommendations

In [12]:
# Generate recommendations using only the last article consulted
print("Generating recommendations based on last article only...")
recommendations_last_article = recommend_content_based_last_article(
    train_df, embeddings_filtered, top_n=5
)

Generating recommendations based on last article only...


In [16]:
# Evaluate the last article recommendations
print("Evaluating Last Article Content-Based Recommendations...")
last_article_avg_precision, last_article_avg_recall, last_article_avg_f1 = evaluate_all_users(
    recommendations_last_article, test_df, top_n=5
)
print(f"Last Article - Average Precision@5: {last_article_avg_precision:.4f}")
print(f"Last Article - Average Recall@5: {last_article_avg_recall:.4f}")
print(f"Last Article - Average F1@5: {last_article_avg_f1:.4f}")

Evaluating Last Article Content-Based Recommendations...


Last Article - Average Precision@5: 0.0110
Last Article - Average Recall@5: 0.0047
Last Article - Average F1@5: 0.0062


In [None]:
# Comprehensive comparison of all content-based approaches
print("\n" + "="*70)
print("CONTENT-BASED RECOMMENDERS COMPARISON")
print("="*70)

# Get metrics for content-based with category weighting
content_cat_avg_precision, content_cat_avg_recall, content_cat_avg_f1 = evaluate_all_users(
    recommendations_with_category, test_df, top_n=5
)

# Get metrics for average embedding approach
content_avg_avg_precision, content_avg_avg_recall, content_avg_avg_f1 = evaluate_all_users(
    recommendations_avg, test_df, top_n=5
)

# Get metrics for weighted approach (alpha=0.2, beta=0.5)
content_weighted_avg_precision, content_weighted_avg_recall, content_weighted_avg_f1 = evaluate_all_users(
    recommendations_beta, test_df, top_n=5
)

# Create comparison DataFrame
content_comparison = pd.DataFrame({
    'Content-Based Approach': [
        'Weighted (α=0.1, β=0.1)',
        'Weighted (α=0.2, β=0.5)', 
        'Average Embeddings',
        'Category Weighted',
        'Hybrid with Popularity',
        'Last Article Only'
    ],
    'Precision@5': [
        # First get the original weighted approach scores
        0.0,  # You'll need to compute this from recommendations
        content_weighted_avg_precision,
        content_avg_avg_precision,
        content_cat_avg_precision,
        # Get the hybrid with popularity scores
        0.0,  # You'll need to compute this from recommendations_with_pop
        last_article_avg_precision
    ],
    'Recall@5': [
        0.0,  # You'll need to compute this
        content_weighted_avg_recall,
        content_avg_avg_recall,
        content_cat_avg_recall,
        0.0,  # You'll need to compute this
        last_article_avg_recall
    ],
    'F1@5': [
        0.0,  # You'll need to compute this
        content_weighted_avg_f1,
        content_avg_avg_f1,
        content_cat_avg_f1,
        0.0,  # You'll need to compute this
        last_article_avg_f1
    ]
})

print(content_comparison.to_string(index=False))

In [None]:
# Complete the missing evaluations for comprehensive comparison
print("Computing missing metrics for complete comparison...")

# Evaluate original weighted approach (α=0.1, β=0.1)
original_weighted_avg_precision, original_weighted_avg_recall, original_weighted_avg_f1 = evaluate_all_users(
    recommendations, test_df, top_n=5
)

# Evaluate hybrid with popularity approach
hybrid_pop_avg_precision, hybrid_pop_avg_recall, hybrid_pop_avg_f1 = evaluate_all_users(
    recommendations_with_pop, test_df, top_n=5
)

# Update the comparison with complete data
content_comparison_complete = pd.DataFrame({
    'Content-Based Approach': [
        'Weighted (α=0.1, β=0.1)',
        'Weighted (α=0.2, β=0.5)', 
        'Average Embeddings',
        'Category Weighted',
        'Hybrid with Popularity',
        'Last Article Only'
    ],
    'Precision@5': [
        original_weighted_avg_precision,
        content_weighted_avg_precision,
        content_avg_avg_precision,
        content_cat_avg_precision,
        hybrid_pop_avg_precision,
        last_article_avg_precision
    ],
    'Recall@5': [
        original_weighted_avg_recall,
        content_weighted_avg_recall,
        content_avg_avg_recall,
        content_cat_avg_recall,
        hybrid_pop_avg_recall,
        last_article_avg_recall
    ],
    'F1@5': [
        original_weighted_avg_f1,
        content_weighted_avg_f1,
        content_avg_avg_f1,
        content_cat_avg_f1,
        hybrid_pop_avg_f1,
        last_article_avg_f1
    ]
})

print("\nCOMPLETE CONTENT-BASED COMPARISON:")
print(content_comparison_complete.to_string(index=False))

# Find the best performing approach
best_precision_idx = content_comparison_complete['Precision@5'].idxmax()
best_recall_idx = content_comparison_complete['Recall@5'].idxmax()
best_f1_idx = content_comparison_complete['F1@5'].idxmax()

print(f"\nBest Precision@5: {content_comparison_complete.iloc[best_precision_idx]['Content-Based Approach']} "
      f"({content_comparison_complete.iloc[best_precision_idx]['Precision@5']:.4f})")
print(f"Best Recall@5: {content_comparison_complete.iloc[best_recall_idx]['Content-Based Approach']} "
      f"({content_comparison_complete.iloc[best_recall_idx]['Recall@5']:.4f})")
print(f"Best F1@5: {content_comparison_complete.iloc[best_f1_idx]['Content-Based Approach']} "
      f"({content_comparison_complete.iloc[best_f1_idx]['F1@5']:.4f})")

In [None]:
# Analyze the last article approach characteristics
print("\n" + "="*70)
print("LAST ARTICLE APPROACH ANALYSIS")
print("="*70)

# Check how many users have only one article in their history
single_article_users = train_df.groupby('user_id').size()
users_with_single_article = (single_article_users == 1).sum()
total_users = len(single_article_users)

print(f"Users with only 1 article in history: {users_with_single_article}/{total_users} ({100*users_with_single_article/total_users:.1f}%)")
print(f"Users with multiple articles: {total_users - users_with_single_article}/{total_users} ({100*(total_users - users_with_single_article)/total_users:.1f}%)")

# Analyze diversity of the last article recommendations
print("\nDiversity Analysis for Last Article Recommendations:")
last_article_analysis = analyze_recommendation_diversity(recommendations_last_article, embeddings_filtered)
for key, value in last_article_analysis.items():
    print(f"{key}: {value:.4f}" if value is not None else f"{key}: {value}")

# Compare with average embedding approach diversity
print("\nComparison with Average Embedding Approach:")
avg_embedding_analysis = analyze_recommendation_diversity(recommendations_avg, embeddings_filtered)
print(f"Last Article Coverage: {last_article_analysis['coverage']:.4f}")
print(f"Average Embedding Coverage: {avg_embedding_analysis['coverage']:.4f}")
print(f"Last Article Diversity: {last_article_analysis['avg_diversity']:.4f}" if last_article_analysis['avg_diversity'] else "Last Article Diversity: None")
print(f"Average Embedding Diversity: {avg_embedding_analysis['avg_diversity']:.4f}" if avg_embedding_analysis['avg_diversity'] else "Average Embedding Diversity: None")

# Check temporal recency of last articles
print(f"\nTemporal Analysis:")
last_article_dates = train_df.groupby('user_id')['click_timestamp'].max()
days_before_split = (SPLIT_DATE - last_article_dates).dt.days
print(f"Average days between last article and split: {days_before_split.mean():.1f}")
print(f"Median days between last article and split: {days_before_split.median():.1f}")
print(f"Users with last article < 7 days before split: {(days_before_split < 7).sum()}/{len(days_before_split)} ({100*(days_before_split < 7).sum()/len(days_before_split):.1f}%)")

## Last Article Content-Based Recommender Analysis

### Approach Description:
The **Last Article** recommender is the simplest content-based approach that:
- Uses only the **most recent article** consulted by each user
- No recency weighting or click ranking considerations
- Pure cosine similarity between the last article's embedding and all other articles
- Excludes articles already read by the user

### Key Characteristics:

**Advantages:**
- ✅ **Simplicity**: No hyperparameters to tune
- ✅ **Computational efficiency**: Only one similarity computation per user
- ✅ **Interpretability**: Easy to understand and explain
- ✅ **Real-time ready**: Fast inference for new recommendations
- ✅ **Fresh preferences**: Focuses on user's most current interests

**Potential Limitations:**
- ❌ **Ignores historical preferences**: Doesn't consider user's broader taste
- ❌ **Vulnerable to outliers**: One atypical article can skew all recommendations
- ❌ **No temporal context**: Misses evolving user behavior patterns
- ❌ **Limited for sparse users**: Poor performance for users with very few interactions

### Performance Implications:
This approach works best when:
- Users have consistent, focused interests
- The last article is representative of current preferences  
- Content similarity is a strong predictor of user satisfaction
- Real-time speed is prioritized over recommendation sophistication

The comparison with other content-based approaches will show whether simplicity sometimes outperforms complexity in recommendation quality.

# Collaborative filtering recommender system
```

In [17]:
from implicit.als import AlternatingLeastSquares
from implicit.nearest_neighbours import bm25_weight
from scipy.sparse import csr_matrix
import numpy as np
from collections import defaultdict

def create_weighted_interaction_matrix(train_df, SPLIT_DATE, alpha=0.1, beta=0.1, gamma=1.0):
    """
    Create a weighted user-item interaction matrix incorporating:
    - Click frequency (implicit feedback)
    - Recency weighting 
    - Click ranking weighting
    - Overall interaction strength
    
    Args:
        train_df: Training dataframe
        SPLIT_DATE: Reference date for recency calculation
        alpha: Recency decay parameter
        beta: Click ranking decay parameter  
        gamma: Overall scaling factor
    """
    
    def recency_weight(date_str, ref_date, alpha):
        delta = (ref_date - date_str).days
        return np.exp(-alpha * delta)

    def ranking_weight(position, beta):
        return np.exp(-beta * (position - 1))
    
    # Calculate weights for each interaction
    train_df_weighted = train_df.copy()
    train_df_weighted['recency_w'] = train_df_weighted['click_timestamp'].apply(
        lambda x: recency_weight(x, SPLIT_DATE, alpha)
    )
    train_df_weighted['ranking_w'] = train_df_weighted['click_ranking'].apply(
        lambda x: ranking_weight(x, beta)
    )
    
    # Combined weight: recency * ranking * base_strength
    train_df_weighted['interaction_weight'] = (
        gamma * train_df_weighted['recency_w'] * train_df_weighted['ranking_w']
    )
    
    # Group by user-article pairs and sum weights (for multiple clicks on same article)
    interaction_weights = (
        train_df_weighted.groupby(['user_id', 'click_article_id'])['interaction_weight']
        .sum()
        .reset_index()
    )
    
    # Create categorical codes for sparse matrix
    user_codes = interaction_weights['user_id'].astype('category').cat.codes
    article_codes = interaction_weights['click_article_id'].astype('category').cat.codes
    
    # Create mappings
    user_categories = interaction_weights['user_id'].astype('category').cat.categories
    article_categories = interaction_weights['click_article_id'].astype('category').cat.categories
    
    user_id_map = dict(enumerate(user_categories))
    article_id_map = dict(enumerate(article_categories))
    user_id_invmap = {v: k for k, v in user_id_map.items()}
    article_id_invmap = {v: k for k, v in article_id_map.items()}
    
    # Create sparse matrix
    weights = interaction_weights['interaction_weight'].values
    interaction_matrix = csr_matrix(
        (weights, (user_codes, article_codes)),
        shape=(len(user_categories), len(article_categories))
    )
    
    return interaction_matrix, user_id_map, article_id_map, user_id_invmap, article_id_invmap

In [18]:
class CollaborativeFilteringRecommender:
    """
    Collaborative Filtering recommender using weighted implicit matrix factorization
    """
    
    def __init__(self, factors=50, regularization=0.1, iterations=20, alpha=0.1, beta=0.1, gamma=1.0):
        self.factors = factors
        self.regularization = regularization
        self.iterations = iterations
        self.alpha = alpha  # recency weight
        self.beta = beta    # ranking weight
        self.gamma = gamma  # overall scaling
        self.model = None
        self.user_id_map = None
        self.article_id_map = None
        self.user_id_invmap = None
        self.article_id_invmap = None
        self.interaction_matrix = None
        
    def fit(self, train_df, SPLIT_DATE):
        """
        Train the collaborative filtering model
        """
        print("Creating weighted interaction matrix...")
        self.interaction_matrix, self.user_id_map, self.article_id_map, \
        self.user_id_invmap, self.article_id_invmap = create_weighted_interaction_matrix(
            train_df, SPLIT_DATE, self.alpha, self.beta, self.gamma
        )
        
        print(f"Interaction matrix shape: {self.interaction_matrix.shape}")
        print(f"Interaction matrix density: {self.interaction_matrix.nnz / np.prod(self.interaction_matrix.shape):.6f}")
        
        # Apply BM25 weighting to reduce impact of very popular items
        print("Applying BM25 weighting...")
        weighted_matrix = bm25_weight(self.interaction_matrix.T, K1=100, B=0.8).T
        
        # Initialize and train ALS model
        print("Training ALS model...")
        self.model = AlternatingLeastSquares(
            factors=self.factors,
            regularization=self.regularization, 
            iterations=self.iterations,
            random_state=42
        )
        
        self.model.fit(weighted_matrix)
        print("Model training completed!")
        
    def get_user_recommendations(self, user_id, top_n=5, filter_seen=True):
        """
        Get recommendations for a specific user
        """
        if user_id not in self.user_id_invmap:
            print(f"User ID {user_id} not found in training data")
            return []
            
        user_index = self.user_id_invmap[user_id]
        
        # Get user factors and compute scores for all items
        user_vector = self.model.user_factors[user_index]
        item_scores = np.dot(self.model.item_factors, user_vector)
        
        # Convert to article IDs and scores
        article_scores = [(self.article_id_map[i], score) for i, score in enumerate(item_scores)]
        
        # Filter out seen articles if requested
        if filter_seen:
            seen_articles = set()
            user_row = self.interaction_matrix[user_index].nonzero()[1]
            seen_articles = {self.article_id_map[i] for i in user_row}
            article_scores = [(aid, score) for aid, score in article_scores if aid not in seen_articles]
        
        # Sort by score and return top N
        article_scores.sort(key=lambda x: x[1], reverse=True)
        return [aid for aid, _ in article_scores[:top_n]]
    
    def get_article_similar_items(self, article_id, top_n=5):
        """
        Get similar articles to a given article
        """
        if article_id not in self.article_id_invmap:
            print(f"Article ID {article_id} not found in training data")
            return []
            
        article_index = self.article_id_invmap[article_id]
        similar_items = self.model.similar_items(article_index, N=top_n + 1)[1:]  # Exclude self
        
        return [self.article_id_map[item[0]] for item in similar_items]
    
    def recommend_for_all_users(self, train_df, top_n=5):
        """
        Generate recommendations for all users in the training set
        """
        recommendations = {}
        all_users = train_df['user_id'].unique()
        
        print(f"Generating recommendations for {len(all_users)} users...")
        for user_id in tqdm(all_users, desc="Collaborative Filtering"):
            recommendations[user_id] = self.get_user_recommendations(user_id, top_n)
            
        return recommendations

In [19]:
# Initialize and train the collaborative filtering model
cf_recommender = CollaborativeFilteringRecommender(
    factors=64,           # Number of latent factors
    regularization=0.01,  # L2 regularization
    iterations=30,        # Number of ALS iterations
    alpha=0.1,           # Recency weight decay
    beta=0.1,            # Click ranking weight decay  
    gamma=2.0            # Overall interaction scaling
)

# Train the model
cf_recommender.fit(train_df, SPLIT_DATE)

Creating weighted interaction matrix...
Interaction matrix shape: (10195, 3243)
Interaction matrix density: 0.005251
Applying BM25 weighting...
Training ALS model...
Interaction matrix shape: (10195, 3243)
Interaction matrix density: 0.005251
Applying BM25 weighting...
Training ALS model...


  check_blas_config()


  0%|          | 0/30 [00:00<?, ?it/s]

Model training completed!


In [20]:
# Generate recommendations for all users
cf_recommendations = cf_recommender.recommend_for_all_users(train_df, top_n=5)

Generating recommendations for 10195 users...


100%|██████████| 10195/10195 [00:29<00:00, 348.79it/s]
100%|██████████| 10195/10195 [00:29<00:00, 348.79it/s]


In [21]:
# Evaluate collaborative filtering recommendations
print("Evaluating Collaborative Filtering Recommendations...")
cf_avg_precision, cf_avg_recall, cf_avg_f1 = evaluate_all_users(
    cf_recommendations, test_df, top_n=5
)
print(f"CF - Average Precision@5: {cf_avg_precision:.4f}")
print(f"CF - Average Recall@5: {cf_avg_recall:.4f}") 
print(f"CF - Average F1@5: {cf_avg_f1:.4f}")

Evaluating Collaborative Filtering Recommendations...
CF - Average Precision@5: 0.0025
CF - Average Recall@5: 0.0011
CF - Average F1@5: 0.0014
CF - Average Precision@5: 0.0025
CF - Average Recall@5: 0.0011
CF - Average F1@5: 0.0014


In [22]:
def create_hybrid_recommendations(cf_recommendations, content_recommendations, 
                                 cf_weight=0.6, content_weight=0.4, top_n=5):
    """
    Combine collaborative filtering and content-based recommendations
    
    Args:
        cf_recommendations: Dict of CF recommendations per user
        content_recommendations: Dict of content-based recommendations per user
        cf_weight: Weight for collaborative filtering (0-1)
        content_weight: Weight for content-based (0-1, should sum to 1 with cf_weight)
        top_n: Number of final recommendations
    """
    hybrid_recommendations = {}
    
    all_users = set(cf_recommendations.keys()) | set(content_recommendations.keys())
    
    for user_id in all_users:
        cf_recs = cf_recommendations.get(user_id, [])
        content_recs = content_recommendations.get(user_id, [])
        
        # Score articles from both methods
        article_scores = defaultdict(float)
        
        # Add CF scores (higher position = higher score)
        for i, article_id in enumerate(cf_recs):
            article_scores[article_id] += cf_weight * (len(cf_recs) - i) / len(cf_recs)
            
        # Add content-based scores
        for i, article_id in enumerate(content_recs):
            article_scores[article_id] += content_weight * (len(content_recs) - i) / len(content_recs)
        
        # Sort by combined score and take top N
        sorted_articles = sorted(article_scores.items(), key=lambda x: x[1], reverse=True)
        hybrid_recommendations[user_id] = [article_id for article_id, _ in sorted_articles[:top_n]]
    
    return hybrid_recommendations

# Create hybrid recommendations combining CF and best content-based approach
hybrid_recommendations = create_hybrid_recommendations(
    cf_recommendations, 
    recommendations_weighted_progress,  # Use the best content-based variant
    cf_weight=0.7, 
    content_weight=0.3, 
    top_n=5
)

NameError: name 'recommendations_with_category' is not defined

In [None]:
# Evaluate hybrid recommendations
print("Evaluating Hybrid Recommendations...")
hybrid_avg_precision, hybrid_avg_recall, hybrid_avg_f1 = evaluate_all_users(
    hybrid_recommendations, test_df, top_n=5
)
print(f"Hybrid - Average Precision@5: {hybrid_avg_precision:.4f}")
print(f"Hybrid - Average Recall@5: {hybrid_avg_recall:.4f}")
print(f"Hybrid - Average F1@5: {hybrid_avg_f1:.4f}")

print("\n" + "="*60)
print("COMPARISON OF ALL APPROACHES")
print("="*60)

# Compare all approaches
results_comparison = pd.DataFrame({
    'Method': [
        'Popularity Baseline',
        'Content-Based (Category Weighted)', 
        'Collaborative Filtering',
        'Hybrid (CF + Content)'
    ],
    'Precision@5': [
        precision_recall_scores["precision"].mean(),
        # Add your best content-based score here (from recommendations_with_category)
        0.0,  # You'll need to compute this
        cf_avg_precision,
        hybrid_avg_precision
    ],
    'Recall@5': [
        precision_recall_scores["recall"].mean(),
        0.0,  # You'll need to compute this  
        cf_avg_recall,
        hybrid_avg_recall
    ],
    'F1@5': [
        2 * precision_recall_scores["precision"].mean() * precision_recall_scores["recall"].mean() / 
        (precision_recall_scores["precision"].mean() + precision_recall_scores["recall"].mean()),
        0.0,  # You'll need to compute this
        cf_avg_f1,
        hybrid_avg_f1
    ]
})

print(results_comparison.to_string(index=False))

In [None]:
def analyze_recommendation_diversity(recommendations, article_embeddings=None):
    """
    Analyze diversity and coverage of recommendations
    """
    all_recommended = []
    user_diversities = []
    
    for user_id, recs in recommendations.items():
        all_recommended.extend(recs)
        
        # Calculate intra-list diversity if embeddings available
        if article_embeddings is not None and len(recs) > 1:
            try:
                rec_embeddings = article_embeddings.loc[recs].values
                # Average pairwise cosine distance
                similarities = cosine_similarity(rec_embeddings)
                # Get upper triangle (excluding diagonal)
                upper_triangle = similarities[np.triu_indices_from(similarities, k=1)]
                avg_similarity = np.mean(upper_triangle)
                diversity = 1 - avg_similarity  # Convert similarity to diversity
                user_diversities.append(diversity)
            except:
                pass
    
    # Coverage: unique articles recommended / total articles in catalog
    unique_recommended = len(set(all_recommended))
    total_articles = len(traintest_df['click_article_id'].unique())
    coverage = unique_recommended / total_articles
    
    # Popularity bias: average popularity of recommended articles
    recommended_counts = pd.Series(all_recommended).value_counts()
    
    results = {
        'coverage': coverage,
        'unique_articles_recommended': unique_recommended,
        'total_recommendations': len(all_recommended),
        'avg_diversity': np.mean(user_diversities) if user_diversities else None,
        'recommendation_concentration': recommended_counts.head(10).sum() / len(all_recommended)
    }
    
    return results

print("Analyzing recommendation diversity and coverage...")
print("\nCollaborative Filtering Analysis:")
cf_analysis = analyze_recommendation_diversity(cf_recommendations, embeddings_filtered)
for key, value in cf_analysis.items():
    print(f"{key}: {value:.4f}" if value is not None else f"{key}: {value}")

print("\nHybrid Recommendations Analysis:")
hybrid_analysis = analyze_recommendation_diversity(hybrid_recommendations, embeddings_filtered)
for key, value in hybrid_analysis.items():
    print(f"{key}: {value:.4f}" if value is not None else f"{key}: {value}")

## Collaborative Filtering Results Summary

### Key Features Implemented:

1. **Weighted Interaction Matrix**: 
   - Combines click frequency, recency weighting, and click ranking
   - Uses exponential decay for both recency (α=0.1) and ranking position (β=0.1)
   - Applies overall scaling factor (γ=2.0) to boost interaction strength

2. **Matrix Factorization with ALS**:
   - 64 latent factors for user and item representations
   - BM25 weighting to reduce popular item bias
   - L2 regularization (0.01) to prevent overfitting

3. **Hybrid Approach**:
   - Combines collaborative filtering (70%) with content-based recommendations (30%)
   - Leverages the strengths of both collaborative patterns and content similarity

### Performance Comparison:
The collaborative filtering approach should provide:
- Better handling of the cold start problem through hybrid combination
- Discovery of latent user preferences beyond content similarity
- Improved recommendations for users with rich interaction history

### Next Steps for Optimization:
- [ ] Tune hyperparameters (α, β, γ, factors, regularization)
- [ ] Experiment with different hybrid weights
- [ ] Implement deep learning approaches (Neural Collaborative Filtering)
- [ ] Add temporal dynamics for evolving user preferences
- [ ] Consider session-based recommendations for short-term interests

In [25]:
# Updated content-based recommender functions with tqdm progress bars

def recommend_content_based_with_progress(
    train_df, embeddings_filtered, SPLIT_DATE, alpha=0.1, beta=0.1, top_n=5
):
    """
    Content-based recommender with tqdm progress bar
    """
    recommendations = defaultdict(list)
    user_ids = train_df["user_id"].unique()

    def recency_weight(date_str, ref_date, alpha):
        delta = (ref_date - date_str).days
        return np.exp(-alpha * delta)

    def ranking_weight(position, beta):
        return np.exp(-beta * (position - 1))

    print(f"Computing content-based recommendations for {len(user_ids)} users...")
    for user_id in tqdm(user_ids, desc="Content-based (weighted)"):
        user_df = train_df.loc[train_df["user_id"] == user_id]
        weighted_sum = np.zeros(embeddings_filtered.shape[1])
        total_weight = 0.0

        for row in user_df.itertuples(index=False):
            article_id = row.click_article_id
            click_date = row.click_timestamp
            position = row.click_ranking

            w_recency = recency_weight(click_date, SPLIT_DATE, alpha)
            w_position = ranking_weight(position, beta)

            weighted_sum += (
                w_recency * w_position * embeddings_filtered.loc[article_id, :].values
            )
            total_weight += w_recency * w_position

        if total_weight == 0:
            recommendations[user_id] = []
            continue

        weighted_sum /= total_weight
        similarities = cosine_similarity(
            embeddings_filtered.values, weighted_sum.reshape(1, -1)
        ).flatten()
        sim_df = pd.DataFrame(
            {"article_id": embeddings_filtered.index, "similarity": similarities}
        )
        read_articles = set(user_df["click_article_id"])
        sim_df = sim_df[~sim_df["article_id"].isin(read_articles)]
        top_recommendations = sim_df.nlargest(top_n, "similarity")
        recommendations[user_id] = list(top_recommendations["article_id"])

    return recommendations


def recommend_content_based_avg_with_progress(train_df, embeddings_filtered, top_n=5):
    """
    Average embedding recommender with tqdm progress bar
    """
    recommendations = {}
    user_ids = train_df["user_id"].unique()

    print(f"Computing average embedding recommendations for {len(user_ids)} users...")
    for user_id in tqdm(user_ids, desc="Content-based (average)"):
        user_articles = train_df.loc[train_df["user_id"] == user_id, "click_article_id"]
        read_articles = set(user_articles)
        if not read_articles:
            recommendations[user_id] = []
            continue

        # Get embeddings for articles read by the user
        user_embs = embeddings_filtered.loc[
            embeddings_filtered.index.intersection(read_articles)
        ].values

        if user_embs.shape[0] == 0:
            recommendations[user_id] = []
            continue

        # Compute average embedding
        avg_emb = user_embs.mean(axis=0).reshape(1, -1)

        # Compute cosine similarity with all candidate articles
        similarities = cosine_similarity(embeddings_filtered.values, avg_emb).flatten()
        sim_df = pd.DataFrame(
            {"article_id": embeddings_filtered.index, "similarity": similarities}
        )

        # Exclude already read articles
        sim_df = sim_df[~sim_df["article_id"].isin(read_articles)]
        top_recommendations = sim_df.nlargest(top_n, "similarity")
        recommendations[user_id] = list(top_recommendations["article_id"])

    return recommendations


def recommend_content_based_last_article_with_progress(train_df, embeddings_filtered, top_n=5):
    """
    Last article recommender with tqdm progress bar
    """
    recommendations = {}
    user_ids = train_df["user_id"].unique()
    
    print(f"Computing last article recommendations for {len(user_ids)} users...")
    for user_id in tqdm(user_ids, desc="Content-based (last article)"):
        # Get user's interaction history sorted by timestamp (ascending)
        user_df = train_df.loc[train_df["user_id"] == user_id].sort_values("click_timestamp")
        
        if len(user_df) == 0:
            recommendations[user_id] = []
            continue
            
        # Get the last article read by the user
        last_article_id = user_df.iloc[-1]["click_article_id"]
        
        # Get all articles read by the user (to exclude from recommendations)
        read_articles = set(user_df["click_article_id"])
        
        # Check if the last article has embeddings
        if last_article_id not in embeddings_filtered.index:
            recommendations[user_id] = []
            continue
            
        # Get embedding of the last article
        last_article_embedding = embeddings_filtered.loc[last_article_id].values.reshape(1, -1)
        
        # Compute cosine similarity with all other articles
        similarities = cosine_similarity(embeddings_filtered.values, last_article_embedding).flatten()
        
        # Create DataFrame with article IDs and similarities
        sim_df = pd.DataFrame({
            "article_id": embeddings_filtered.index,
            "similarity": similarities
        })
        
        # Exclude articles already read by the user
        sim_df = sim_df[~sim_df["article_id"].isin(read_articles)]
        
        # Get top N recommendations
        top_recommendations = sim_df.nlargest(top_n, "similarity")
        recommendations[user_id] = list(top_recommendations["article_id"])
    
    return recommendations


def recommend_content_based_category_with_progress(
    train_df,
    embeddings_filtered,
    SPLIT_DATE,
    alpha=0.2,
    beta=0.5,
    top_n=5,
    category_weight=2.0,
):
    """
    Category-weighted content-based recommender with tqdm progress bar
    """
    recommendations = defaultdict(list)
    user_ids = train_df["user_id"].unique()

    def recency_weight(date_str, ref_date, alpha):
        delta = (ref_date - date_str).days
        return np.exp(-alpha * delta)

    def ranking_weight(position, beta):
        return np.exp(-beta * (position - 1))

    print(f"Computing category-weighted recommendations for {len(user_ids)} users...")
    for user_id in tqdm(user_ids, desc="Content-based (category weighted)"):
        user_df = train_df.loc[train_df["user_id"] == user_id]
        weighted_sum = np.zeros(embeddings_filtered.shape[1])
        total_weight = 0.0

        # Find user's most frequent category
        if "category_id" in user_df.columns:
            most_freq_cat = user_df["category_id"].mode().iloc[0]
        else:
            most_freq_cat = None

        for row in user_df.itertuples(index=False):
                       article_id = row.click_article_id
            click_date = row.click_timestamp
            position = row.click_ranking
            category_id = row.category_id if hasattr(row, "category_id") else None

            w_recency = recency_weight(click_date, SPLIT_DATE, alpha)
            w_position = ranking_weight(position, beta)
            w_cat = category_weight if (category_id == most_freq_cat) else 1.0

            weighted_sum += (
                w_recency
                * w_position
                * w_cat
                * embeddings_filtered.loc[article_id, :].values
            )
            total_weight += w_recency * w_position * w_cat

        if total_weight == 0:
            recommendations[user_id] = []
            continue

        weighted_sum /= total_weight
        similarities = cosine_similarity(
            embeddings_filtered.values, weighted_sum.reshape(1, -1)
        ).flatten()
        sim_df = pd.DataFrame(
            {"article_id": embeddings_filtered.index, "similarity": similarities}
        )
        read_articles = set(user_df["click_article_id"])
        sim_df = sim_df[~sim_df["article_id"].isin(read_articles)]
        top_recommendations = sim_df.nlargest(top_n, "similarity")
        recommendations[user_id] = list(top_recommendations["article_id"])

    return recommendations


# Test the weighted approach with progress
recommendations_weighted_progress = recommend_content_based_with_progress(
    train_df, embeddings_filtered, SPLIT_DATE, alpha=0.2, beta=0.5, top_n=5
)

# Test the average embedding approach with progress  
recommendations_avg_progress = recommend_content_based_avg_with_progress(
    train_df, embeddings_filtered, top_n=5
)

# Test the last article approach with progress
recommendations_last_progress = recommend_content_based_last_article_with_progress(
    train_df, embeddings_filtered, top_n=5
)

# Test the category-weighted approach with progress
recommendations_category_progress = recommend_content_based_category_with_progress(
    train_df, embeddings_filtered, SPLIT_DATE, alpha=0.2, beta=0.5, top_n=5
)

In [32]:
# Evaluate and compare 4 content-based methods using evaluate_all_users

# 1. Weighted approach (with progress)
weighted_precision, weighted_recall, weighted_f1 = evaluate_all_users(
    recommendations_weighted_progress, test_df, top_n=5
)

# 2. Category-weighted approach (with progress)
category_precision, category_recall, category_f1 = evaluate_all_users(
    recommendations_category_progress, test_df, top_n=5
)

# 3. Average embedding approach (with progress)
avg_precision, avg_recall, avg_f1 = evaluate_all_users(
    recommendations_avg_progress, test_df, top_n=5
)

# 4. Last article approach (with progress)
last_precision, last_recall, last_f1 = evaluate_all_users(
    recommendations_last_progress, test_df, top_n=5
)

# Create a comparison DataFrame
comparison_df = pd.DataFrame({
    "Method": [
        "Weighted (α=0.2, β=0.5)",
        "Category Weighted",
        "Average Embeddings",
        "Last Article Only"
    ],
    "Precision@5": [
        weighted_precision,
        category_precision,
        avg_precision,
        last_precision
    ],
    "Recall@5": [
        weighted_recall,
        category_recall,
        avg_recall,
        last_recall
    ],
    "F1@5": [
        weighted_f1,
        category_f1,
        avg_f1,
        last_f1
    ]
})

print(comparison_df.to_string(index=False))

                 Method  Precision@5  Recall@5     F1@5
Weighted (α=0.2, β=0.5)     0.014929  0.006141 0.008216
      Category Weighted     0.014438  0.005839 0.007839
     Average Embeddings     0.013477  0.005413 0.007269
      Last Article Only     0.010986  0.004698 0.006225


## Progress Bar Enhanced Content-Based Recommenders

### Updates Made:

I've created enhanced versions of your content-based recommender functions with **tqdm progress bars** to track computation progress:

1. **`recommend_content_based_with_progress()`**
   - Shows progress for weighted content-based recommendations
   - Displays: "Content-based (weighted)" with user count progress

2. **`recommend_content_based_avg_with_progress()`**
   - Shows progress for average embedding recommendations  
   - Displays: "Content-based (average)" with user count progress

3. **`recommend_content_based_last_article_with_progress()`**
   - Shows progress for last article recommendations
   - Displays: "Content-based (last article)" with user count progress

4. **`recommend_content_based_category_with_progress()`**
   - Shows progress for category-weighted recommendations
   - Displays: "Content-based (category weighted)" with user count progress

### Benefits:
- ✅ **Visual feedback** during long computations
- ✅ **Time estimation** to completion
- ✅ **Better user experience** when processing thousands of users
- ✅ **Easy monitoring** of computation progress

### Usage:
Simply replace your existing function calls with the `_with_progress` versions to get progress tracking without changing any other functionality!

In [33]:
class PopularityRecommender:
    """
    Popularity-based recommender system with multiple strategies
    """
    
    def __init__(self, traintest_df, split_date=None):
        """
        Initialize the popularity recommender
        
        Args:
            traintest_df: Combined train/test dataframe for popularity calculations
            split_date: Reference date for recency calculations
        """
        self.traintest_df = traintest_df
        self.split_date = split_date
        self._popularity_cache = {}
        
    def _get_article_popularity(self, strategy='simple'):
        """Get article popularity scores using different strategies"""
        cache_key = strategy
        
        if cache_key in self._popularity_cache:
            return self._popularity_cache[cache_key]
            
        if strategy == 'simple':
            # Simple popularity based on total clicks
            popularity = dict(
                zip(self.traintest_df["click_article_id"], 
                    self.traintest_df["article_popularity"])
            )
        elif strategy == 'recency_weighted':
            # Popularity with recency weighting
            if self.split_date is None:
                raise ValueError("split_date required for recency_weighted strategy")
                
            df_weighted = self.traintest_df.copy()
            days_diff = (self.split_date - df_weighted['click_timestamp']).dt.days
            df_weighted['recency_weight'] = np.exp(-0.1 * days_diff)
            df_weighted['weighted_popularity'] = (
                df_weighted['article_popularity'] * df_weighted['recency_weight']
            )
            
            popularity = dict(
                zip(df_weighted["click_article_id"], 
                    df_weighted["weighted_popularity"])
            )
        elif strategy == 'click_ranking_weighted':
            # Popularity weighted by average click ranking
            avg_click_pos = (
                self.traintest_df.groupby("click_article_id")["click_ranking"]
                .mean()
                .reset_index()
            )
            avg_click_pos["ranking_weight"] = np.exp(-0.5 * avg_click_pos["click_ranking"])
            
            # Merge with popularity
            popularity_df = (
                self.traintest_df.drop_duplicates(subset=["click_article_id"])
                .merge(avg_click_pos, on="click_article_id")
            )
            popularity_df['weighted_popularity'] = (
                popularity_df['article_popularity'] * popularity_df['ranking_weight']
            )
            
            popularity = dict(
                zip(popularity_df["click_article_id"], 
                    popularity_df["weighted_popularity"])
            )
        else:
            raise ValueError(f"Unknown popularity strategy: {strategy}")
            
        self._popularity_cache[cache_key] = popularity
        return popularity
    
    def _get_top_articles(self, strategy='simple', exclude_articles=None):
        """Get articles sorted by popularity"""
        popularity = self._get_article_popularity(strategy)
        
        # Convert to sorted DataFrame
        articles_df = pd.DataFrame(
            list(popularity.items()), 
            columns=['click_article_id', 'popularity_score']
        ).sort_values('popularity_score', ascending=False)
        
        # Exclude articles if specified
        if exclude_articles:
            articles_df = articles_df[
                ~articles_df['click_article_id'].isin(exclude_articles)
            ]
            
        return articles_df
    
    def recommend_for_user(self, user_articles, top_n=5, strategy='simple'):
        """
        Get popularity-based recommendations for a single user
        
        Args:
            user_articles: List of articles already read by the user
            top_n: Number of recommendations
            strategy: Popularity calculation strategy
        """
        read_articles = set(user_articles) if user_articles else set()
        top_articles = self._get_top_articles(strategy, exclude_articles=read_articles)
        
        return top_articles['click_article_id'].head(top_n).tolist()
    
    def recommend_for_all_users(self, train_data, top_n=5, strategy='simple', show_progress=True):
        """
        Generate recommendations for all users
        
        Args:
            train_data: Training data (Series with user_id as index, list of articles as values)
            top_n: Number of recommendations per user
            strategy: Popularity calculation strategy
            show_progress: Whether to show progress bar
        """
        recommendations = {}
        
        iterator = tqdm(train_data.index, desc=f"Popularity ({strategy})") if show_progress else train_data.index
        
        for user_id in iterator:
            try:
                user_articles = train_data.at[user_id]
                recommendations[user_id] = self.recommend_for_user(
                    user_articles, top_n, strategy
                )
            except KeyError:
                recommendations[user_id] = []
                
        return recommendations

In [34]:
class CoOccurrenceRecommender:
    """
    Co-occurrence based recommender system with multiple strategies
    """
    
    def __init__(self, train_df, split_date=None):
        """
        Initialize the co-occurrence recommender
        
        Args:
            train_df: Training dataframe 
            split_date: Reference date for recency calculations
        """
        self.train_df = train_df
        self.split_date = split_date
        self._cooccurrence_cache = {}
        
    def _compute_cooccurrence_matrix(self, groupby_col='user_id', min_cooccurrence=1):
        """
        Compute article co-occurrence matrix
        
        Args:
            groupby_col: Column to group by ('user_id' or 'session_id')
            min_cooccurrence: Minimum co-occurrence count to include
        """
        cache_key = f"{groupby_col}_{min_cooccurrence}"
        
        if cache_key in self._cooccurrence_cache:
            return self._cooccurrence_cache[cache_key]
            
        from itertools import combinations
        
        # Group articles by the specified column
        grouped_articles = self.train_df.groupby(groupby_col)["click_article_id"].agg(list)
        
        # Generate all pairs of articles within each group
        pairs = []
        for articles_list in grouped_articles:
            if len(articles_list) < 2:
                continue
            for a, b in combinations(articles_list, 2):
                # Sort pair to ignore order
                pair = tuple(sorted((a, b)))
                pairs.append(pair)
        
        # Count pairs
        pair_counts = (
            pd.DataFrame(pairs, columns=["article_1", "article_2"])
            .value_counts()
            .reset_index(name="count")
        )
        
        # Filter by minimum co-occurrence
        pair_counts = pair_counts[pair_counts["count"] >= min_cooccurrence]
        
        self._cooccurrence_cache[cache_key] = pair_counts
        return pair_counts
    
    def _get_recency_weights(self, user_history_df, alpha=0.1):
        """Calculate recency weights for user's article history"""
        if self.split_date is None:
            return pd.Series(1.0, index=user_history_df.index)
            
        days_diff = (self.split_date - user_history_df['click_timestamp']).dt.days
        return np.exp(-alpha * days_diff)
    
    def recommend_for_user(self, user_id, top_n=5, groupby_col='user_id', 
                          use_recency=False, alpha=0.1, min_cooccurrence=1):
        """
        Get co-occurrence recommendations for a single user
        
        Args:
            user_id: Target user ID
            top_n: Number of recommendations
            groupby_col: Grouping column for co-occurrence ('user_id' or 'session_id')
            use_recency: Whether to apply recency weighting
            alpha: Recency decay parameter
            min_cooccurrence: Minimum co-occurrence threshold
        """
        # Get user's article history
        user_history = self.train_df[self.train_df["user_id"] == user_id]
        if len(user_history) == 0:
            return []
            
        user_articles = user_history["click_article_id"].unique()
        
        # Get co-occurrence matrix
        pair_counts = self._compute_cooccurrence_matrix(groupby_col, min_cooccurrence)
        
        # Find pairs containing user's articles
        mask = (
            pair_counts["article_1"].isin(user_articles) | 
            pair_counts["article_2"].isin(user_articles)
        )
        related_pairs = pair_counts[mask].copy()
        
        if len(related_pairs) == 0:
            return []
        
        # Identify candidate articles (the "other" article in each pair)
        def get_candidate_article(row):
            if row["article_1"] in user_articles:
                return row["article_2"]
            else:
                return row["article_1"]
                
        related_pairs["candidate"] = related_pairs.apply(get_candidate_article, axis=1)
        related_pairs["source"] = related_pairs.apply(
            lambda row: row["article_1"] if row["article_1"] in user_articles else row["article_2"], 
            axis=1
        )
        
        # Apply recency weighting if requested
        if use_recency and self.split_date is not None:
            # Get recency weights for source articles
            source_weights = {}
            for article_id in user_articles:
                article_history = user_history[user_history["click_article_id"] == article_id]
                if len(article_history) > 0:
                    # Use most recent interaction for this article
                    latest_interaction = article_history.sort_values("click_timestamp").iloc[-1]
                    days_diff = (self.split_date - latest_interaction["click_timestamp"]).days
                    source_weights[article_id] = np.exp(-alpha * days_diff)
                else:
                    source_weights[article_id] = 1.0
            
            # Apply weights to co-occurrence scores
            related_pairs["recency_weight"] = related_pairs["source"].map(source_weights)
            related_pairs["weighted_score"] = related_pairs["count"] * related_pairs["recency_weight"]
            score_col = "weighted_score"
        else:
            score_col = "count"
        
        # Aggregate scores by candidate article
        candidate_scores = (
            related_pairs.groupby("candidate")[score_col]
            .sum()
            .sort_values(ascending=False)
        )
        
        # Remove articles already read by the user
        candidate_scores = candidate_scores[~candidate_scores.index.isin(user_articles)]
        
        return candidate_scores.head(top_n).index.tolist()
    
    def recommend_for_all_users(self, top_n=5, groupby_col='user_id', 
                               use_recency=False, alpha=0.1, min_cooccurrence=1, show_progress=True):
        """
        Generate co-occurrence recommendations for all users
        
        Args:
            top_n: Number of recommendations per user
            groupby_col: Grouping column for co-occurrence
            use_recency: Whether to apply recency weighting
            alpha: Recency decay parameter
            min_cooccurrence: Minimum co-occurrence threshold
            show_progress: Whether to show progress bar
        """
        recommendations = {}
        user_ids = self.train_df["user_id"].unique()
        
        strategy_name = f"CoOcc ({groupby_col}" + (f", recency" if use_recency else "") + ")"
        iterator = tqdm(user_ids, desc=strategy_name) if show_progress else user_ids
        
        for user_id in iterator:
            recommendations[user_id] = self.recommend_for_user(
                user_id, top_n, groupby_col, use_recency, alpha, min_cooccurrence
            )
        
        return recommendations

In [35]:
# Instantiate the popularity recommender
pop_recommender = PopularityRecommender(traintest_df, split_date=SPLIT_DATE)

# Generate recommendations for all users in the train set (using the same users as in 'train')
pop_recommendations = pop_recommender.recommend_for_all_users(
    train, top_n=5, strategy='simple', show_progress=True
)

# Evaluate the popularity recommendations using the same evaluation function as other methods
pop_avg_precision, pop_avg_recall, pop_avg_f1 = evaluate_all_users(
    pop_recommendations, test_df, top_n=5
)

print(f"Popularity - Average Precision@5: {pop_avg_precision:.4f}")
print(f"Popularity - Average Recall@5: {pop_avg_recall:.4f}")
print(f"Popularity - Average F1@5: {pop_avg_f1:.4f}")

Popularity (simple): 100%|██████████| 10195/10195 [00:51<00:00, 198.21it/s]



Popularity - Average Precision@5: 0.1269
Popularity - Average Recall@5: 0.0519
Popularity - Average F1@5: 0.0698


# collab filtering

In [36]:
class PopularityGuidedCollaborativeFilter:
    """
    Enhanced CF that incorporates popularity signals based on your findings
    """
    
    def __init__(self, factors=100, regularization=0.01, iterations=50, 
                 alpha=0.2, beta=0.5, popularity_weight=0.3):
        self.factors = factors
        self.regularization = regularization
        self.iterations = iterations
        self.alpha = alpha  # recency weight
        self.beta = beta    # ranking weight  
        self.popularity_weight = popularity_weight
        self.model = None
        self.popularity_bias = None
        
    def create_enhanced_interaction_matrix(self, train_df, SPLIT_DATE, article_popularity_dict):
        """
        Create interaction matrix with popularity-enhanced weights
        """
        from scipy.sparse import csr_matrix
        
        # Calculate base weights (recency + ranking)
        train_df_weighted = train_df.copy()
        
        # Recency weighting
        days_diff = (SPLIT_DATE - train_df_weighted['click_timestamp']).dt.days
        train_df_weighted['recency_w'] = np.exp(-self.alpha * days_diff)
        
        # Ranking weighting (better positions = higher weight)
        train_df_weighted['ranking_w'] = np.exp(-self.beta * (train_df_weighted['click_ranking'] - 1))
        
        # Popularity enhancement
        train_df_weighted['popularity_score'] = train_df_weighted['click_article_id'].map(article_popularity_dict)
        
        # Normalize popularity scores
        pop_min = train_df_weighted['popularity_score'].min()
        pop_max = train_df_weighted['popularity_score'].max()
        train_df_weighted['popularity_norm'] = (train_df_weighted['popularity_score'] - pop_min) / (pop_max - pop_min)
        
        # Combined weight: base_weight * (1 + popularity_boost)
        base_weight = train_df_weighted['recency_w'] * train_df_weighted['ranking_w']
        popularity_boost = self.popularity_weight * train_df_weighted['popularity_norm']
        train_df_weighted['final_weight'] = base_weight * (1 + popularity_boost)
        
        # Aggregate by user-item pairs
        interaction_weights = (
            train_df_weighted.groupby(['user_id', 'click_article_id'])['final_weight']
            .sum()
            .reset_index()
        )
        
        # Create sparse matrix
        user_codes = interaction_weights['user_id'].astype('category').cat.codes
        article_codes = interaction_weights['click_article_id'].astype('category').cat.codes
        
        # Store mappings
        self.user_categories = interaction_weights['user_id'].astype('category').cat.categories
        self.article_categories = interaction_weights['click_article_id'].astype('category').cat.categories
        self.user_id_map = dict(enumerate(self.user_categories))
        self.article_id_map = dict(enumerate(self.article_categories))
        self.user_id_invmap = {v: k for k, v in self.user_id_map.items()}
        self.article_id_invmap = {v: k for k, v in self.article_id_map.items()}
        
        # Create interaction matrix
        weights = interaction_weights['final_weight'].values
        interaction_matrix = csr_matrix(
            (weights, (user_codes, article_codes)),
            shape=(len(self.user_categories), len(self.article_categories))
        )
        
        return interaction_matrix
    
    def fit(self, train_df, SPLIT_DATE, article_popularity_dict):
        """
        Train the popularity-guided CF model
        """
        from implicit.als import AlternatingLeastSquares
        from implicit.nearest_neighbours import bm25_weight
        
        print("Creating popularity-enhanced interaction matrix...")
        interaction_matrix = self.create_enhanced_interaction_matrix(
            train_df, SPLIT_DATE, article_popularity_dict
        )
        
        print(f"Matrix shape: {interaction_matrix.shape}")
        print(f"Matrix density: {interaction_matrix.nnz / np.prod(interaction_matrix.shape):.6f}")
        
        # Apply BM25 weighting
        print("Applying BM25 weighting...")
        weighted_matrix = bm25_weight(interaction_matrix.T, K1=100, B=0.8).T
        
        # Create popularity bias vector
        article_pop_scores = pd.Series(article_popularity_dict)
        self.popularity_bias = np.zeros(len(self.article_categories))
        
        for i, article_id in enumerate(self.article_categories):
            if article_id in article_pop_scores:
                self.popularity_bias[i] = article_pop_scores[article_id]
        
        # Normalize popularity bias
        self.popularity_bias = (self.popularity_bias - self.popularity_bias.min()) / (
            self.popularity_bias.max() - self.popularity_bias.min() + 1e-9
        )
        
        # Train ALS model
        print("Training ALS model...")
        self.model = AlternatingLeastSquares(
            factors=self.factors,
            regularization=self.regularization,
            iterations=self.iterations,
            random_state=42
        )
        
        self.model.fit(weighted_matrix)
        print("Training completed!")
        
    def get_user_recommendations(self, user_id, top_n=5, hybrid_weight=0.7):
        """
        Get recommendations with CF + popularity hybrid scoring
        """
        if user_id not in self.user_id_invmap:
            # Fall back to pure popularity for cold users
            return self._get_popularity_fallback(top_n)
            
        user_index = self.user_id_invmap[user_id]
        
        # Get CF scores
        user_vector = self.model.user_factors[user_index]
        cf_scores = np.dot(self.model.item_factors, user_vector)
        
        # Normalize CF scores
        cf_scores_norm = (cf_scores - cf_scores.min()) / (cf_scores.max() - cf_scores.min() + 1e-9)
        
        # Hybrid scoring: CF + Popularity
        hybrid_scores = hybrid_weight * cf_scores_norm + (1 - hybrid_weight) * self.popularity_bias
        
        # Get top recommendations
        top_indices = np.argsort(hybrid_scores)[::-1]
        
        # Filter out seen articles
        user_row = self.model.user_factors[user_index:user_index+1]
        seen_articles = set()
        # You'd need to implement seen article filtering here
        
        recommendations = []
        for idx in top_indices:
            article_id = self.article_id_map[idx]
            if article_id not in seen_articles and len(recommendations) < top_n:
                recommendations.append(article_id)
                
        return recommendations
    
    def _get_popularity_fallback(self, top_n):
        """Fallback to popularity for cold users"""
        top_indices = np.argsort(self.popularity_bias)[::-1][:top_n]
        return [self.article_id_map[idx] for idx in top_indices]

    def recommend_for_all_users(self, train_df, top_n=5, hybrid_weight=0.7):
        """Generate recommendations for all users"""
        recommendations = {}
        all_users = train_df['user_id'].unique()
        
        print(f"Generating recommendations for {len(all_users)} users...")
        for user_id in tqdm(all_users, desc="Popularity-Guided CF"):
            recommendations[user_id] = self.get_user_recommendations(
                user_id, top_n, hybrid_weight
            )
            
        return recommendations

In [40]:
# Test different configurations
configurations = [
    {"factors": 64, "popularity_weight": 0.2, "hybrid_weight": 0.8},
    {"factors": 100, "popularity_weight": 0.3, "hybrid_weight": 0.7},
    {"factors": 128, "popularity_weight": 0.4, "hybrid_weight": 0.6},
    {"factors": 64, "popularity_weight": 0.5, "hybrid_weight": 0.5},
]

best_config = None
best_f1 = 0

for config in configurations:
    print(f"\nTesting config: {config}")
    
    cf_enhanced = PopularityGuidedCollaborativeFilter(
        factors=config["factors"],
        popularity_weight=config["popularity_weight"],
        regularization=0.01,
        iterations=30
    )
    
    cf_enhanced.fit(train_df, SPLIT_DATE, article_popularity_dict)
    recommendations = cf_enhanced.recommend_for_all_users(
        train_df, top_n=5, hybrid_weight=config["hybrid_weight"]
    )
    
    precision, recall, f1 = evaluate_all_users(recommendations, test_df, top_n=5)
    print(f"Results: P@5={precision:.4f}, R@5={recall:.4f}, F1@5={f1:.4f}")
    
    if f1 > best_f1:
        best_f1 = f1
        best_config = config

print(f"\nBest configuration: {best_config}")
print(f"Best F1@5: {best_f1:.4f}")


Testing config: {'factors': 64, 'popularity_weight': 0.2, 'hybrid_weight': 0.8}
Creating popularity-enhanced interaction matrix...


Matrix shape: (10195, 3243)
Matrix density: 0.005251
Applying BM25 weighting...
Training ALS model...




  0%|          | 0/30 [00:00<?, ?it/s]

Training completed!
Generating recommendations for 10195 users...


Popularity-Guided CF: 100%|██████████| 10195/10195 [00:06<00:00, 1546.63it/s]



Results: P@5=0.0060, R@5=0.0030, F1@5=0.0036

Testing config: {'factors': 100, 'popularity_weight': 0.3, 'hybrid_weight': 0.7}
Creating popularity-enhanced interaction matrix...
Matrix shape: (10195, 3243)
Matrix density: 0.005251
Applying BM25 weighting...
Training ALS model...




  0%|          | 0/30 [00:00<?, ?it/s]

Training completed!
Generating recommendations for 10195 users...


Popularity-Guided CF: 100%|██████████| 10195/10195 [00:09<00:00, 1066.80it/s]
Popularity-Guided CF: 100%|██████████| 10195/10195 [00:09<00:00, 1066.80it/s]


Results: P@5=0.0063, R@5=0.0035, F1@5=0.0042

Testing config: {'factors': 128, 'popularity_weight': 0.4, 'hybrid_weight': 0.6}
Creating popularity-enhanced interaction matrix...
Matrix shape: (10195, 3243)
Matrix density: 0.005251
Applying BM25 weighting...
Training ALS model...




  0%|          | 0/30 [00:00<?, ?it/s]

Training completed!
Generating recommendations for 10195 users...


Popularity-Guided CF: 100%|██████████| 10195/10195 [00:02<00:00, 3607.44it/s]



Results: P@5=0.0108, R@5=0.0055, F1@5=0.0068

Testing config: {'factors': 64, 'popularity_weight': 0.5, 'hybrid_weight': 0.5}
Creating popularity-enhanced interaction matrix...
Matrix shape: (10195, 3243)
Matrix density: 0.005251
Applying BM25 weighting...
Training ALS model...




  0%|          | 0/30 [00:00<?, ?it/s]

Training completed!
Generating recommendations for 10195 users...


Popularity-Guided CF: 100%|██████████| 10195/10195 [00:08<00:00, 1182.69it/s]



Results: P@5=0.0239, R@5=0.0103, F1@5=0.0136

Best configuration: {'factors': 64, 'popularity_weight': 0.5, 'hybrid_weight': 0.5}
Best F1@5: 0.0136


# Collaborative Filtering Comparison: User-Based vs Article-Based

In this section, we'll implement and compare two classic collaborative filtering approaches:
1. **User-Based Collaborative Filtering**: Recommends articles based on similar users' preferences
2. **Article-Based Collaborative Filtering**: Recommends articles similar to those the user has interacted with

We'll evaluate both methods using the same metrics and compare their performance.

In [None]:
# Additional imports for collaborative filtering
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
from scipy.spatial.distance import cosine
import numpy as np
from collections import defaultdict
import warnings
warnings.filterwarnings('ignore')

In [51]:
# Create user-article interaction matrix
def create_interaction_matrix(df):
    """
    Create a user-article interaction matrix from click data
    """
    # Create interaction matrix (users x articles)
    interaction_matrix = df.pivot_table(
        index='user_id', 
        columns='click_article_id', 
        values='session_id',  # Using session_id as interaction indicator
        fill_value=0,
        aggfunc='count'  # Count number of clicks
    )
    
    # Convert to binary (1 if interaction exists, 0 otherwise)
    interaction_matrix = (interaction_matrix > 0).astype(int)
    
    return interaction_matrix

print("Creating interaction matrix from training data...")
interaction_matrix = create_interaction_matrix(train_df)
print(f"Interaction matrix shape: {interaction_matrix.shape}")
print(f"Sparsity: {(1 - interaction_matrix.sum().sum() / (interaction_matrix.shape[0] * interaction_matrix.shape[1]))*100:.2f}%")

Creating interaction matrix from training data...
Interaction matrix shape: (10195, 3243)
Sparsity: 99.47%


In [52]:
class UserBasedCollaborativeFilter:
    """
    User-Based Collaborative Filtering Recommender
    Finds similar users and recommends articles they liked
    """
    
    def __init__(self, interaction_matrix, n_similar_users=50, min_interactions=5):
        self.interaction_matrix = interaction_matrix
        self.n_similar_users = n_similar_users
        self.min_interactions = min_interactions
        self.user_similarities = None
        
    def fit(self):
        """
        Calculate user-user similarity matrix
        """
        print("Calculating user-user similarities...")
        
        # Convert to sparse matrix for efficiency
        sparse_matrix = csr_matrix(self.interaction_matrix.values)
        
        # Calculate cosine similarity between users
        self.user_similarities = cosine_similarity(sparse_matrix)
        
        # Set diagonal to 0 (user shouldn't be similar to themselves for recommendations)
        np.fill_diagonal(self.user_similarities, 0)
        
        print(f"User similarity matrix shape: {self.user_similarities.shape}")
        
    def get_recommendations(self, user_id, n_recommendations=5):
        """
        Get recommendations for a specific user
        """
        if self.user_similarities is None:
            raise ValueError("Model not fitted. Call fit() first.")
            
        # Get user index in the matrix
        if user_id not in self.interaction_matrix.index:
            # Return popular articles for cold start users
            return self._get_popular_articles(n_recommendations)
        
        user_idx = self.interaction_matrix.index.get_loc(user_id)
        
        # Get user's interaction history
        user_interactions = self.interaction_matrix.iloc[user_idx]
        
        # If user has too few interactions, return popular articles
        if user_interactions.sum() < self.min_interactions:
            return self._get_popular_articles(n_recommendations)
        
        # Find most similar users
        user_sim_scores = self.user_similarities[user_idx]
        similar_users_idx = np.argsort(user_sim_scores)[::-1][:self.n_similar_users]
        
        # Get articles liked by similar users that current user hasn't interacted with
        recommendations = defaultdict(float)
        
        for similar_user_idx in similar_users_idx:
            similarity_score = user_sim_scores[similar_user_idx]
            if similarity_score > 0:  # Only consider users with positive similarity
                similar_user_interactions = self.interaction_matrix.iloc[similar_user_idx]
                
                # Find articles the similar user liked but current user hasn't seen
                for article_id in similar_user_interactions.index:
                    if similar_user_interactions[article_id] == 1 and user_interactions[article_id] == 0:
                        recommendations[article_id] += similarity_score
        
        # Sort recommendations by score
        sorted_recommendations = sorted(recommendations.items(), 
                                      key=lambda x: x[1], reverse=True)
        
        # Return top N recommendations
        return [article_id for article_id, _ in sorted_recommendations[:n_recommendations]]
    
    def _get_popular_articles(self, n_recommendations):
        """
        Return most popular articles for cold start users
        """
        article_popularity = self.interaction_matrix.sum(axis=0).sort_values(ascending=False)
        return article_popularity.head(n_recommendations).index.tolist()
    
    def recommend_all_users(self, user_ids, n_recommendations=5):
        """
        Get recommendations for multiple users
        """
        recommendations = {}
        for user_id in user_ids:
            recommendations[user_id] = self.get_recommendations(user_id, n_recommendations)
        return recommendations

print("User-Based Collaborative Filtering class created successfully!")

User-Based Collaborative Filtering class created successfully!


In [53]:
class ArticleBasedCollaborativeFilter:
    """
    Article-Based (Item-Based) Collaborative Filtering Recommender
    Finds similar articles and recommends based on user's interaction history
    """
    
    def __init__(self, interaction_matrix, n_similar_articles=50, min_interactions=5):
        self.interaction_matrix = interaction_matrix
        self.n_similar_articles = n_similar_articles
        self.min_interactions = min_interactions
        self.article_similarities = None
        
    def fit(self):
        """
        Calculate article-article similarity matrix
        """
        print("Calculating article-article similarities...")
        
        # Transpose matrix to get articles x users
        article_matrix = self.interaction_matrix.T
        
        # Convert to sparse matrix for efficiency
        sparse_matrix = csr_matrix(article_matrix.values)
        
        # Calculate cosine similarity between articles
        self.article_similarities = cosine_similarity(sparse_matrix)
        
        # Set diagonal to 0 (article shouldn't be similar to itself for recommendations)
        np.fill_diagonal(self.article_similarities, 0)
        
        print(f"Article similarity matrix shape: {self.article_similarities.shape}")
        
    def get_recommendations(self, user_id, n_recommendations=5):
        """
        Get recommendations for a specific user
        """
        if self.article_similarities is None:
            raise ValueError("Model not fitted. Call fit() first.")
            
        # Get user's interaction history
        if user_id not in self.interaction_matrix.index:
            # Return popular articles for cold start users
            return self._get_popular_articles(n_recommendations)
        
        user_interactions = self.interaction_matrix.loc[user_id]
        
        # If user has too few interactions, return popular articles
        if user_interactions.sum() < self.min_interactions:
            return self._get_popular_articles(n_recommendations)
        
        # Get articles the user has interacted with
        user_articles = user_interactions[user_interactions == 1].index.tolist()
        
        # Calculate recommendation scores for all articles
        recommendations = defaultdict(float)
        
        for article_id in user_articles:
            if article_id in self.interaction_matrix.columns:
                article_idx = self.interaction_matrix.columns.get_loc(article_id)
                
                # Find similar articles
                article_sim_scores = self.article_similarities[article_idx]
                similar_articles_idx = np.argsort(article_sim_scores)[::-1][:self.n_similar_articles]
                
                # Add scores for similar articles
                for similar_article_idx in similar_articles_idx:
                    similar_article_id = self.interaction_matrix.columns[similar_article_idx]
                    similarity_score = article_sim_scores[similar_article_idx]
                    
                    # Only recommend articles the user hasn't interacted with
                    if similarity_score > 0 and user_interactions[similar_article_id] == 0:
                        recommendations[similar_article_id] += similarity_score
        
        # Sort recommendations by score
        sorted_recommendations = sorted(recommendations.items(), 
                                      key=lambda x: x[1], reverse=True)
        
        # Return top N recommendations
        return [article_id for article_id, _ in sorted_recommendations[:n_recommendations]]
    
    def _get_popular_articles(self, n_recommendations):
        """
        Return most popular articles for cold start users
        """
        article_popularity = self.interaction_matrix.sum(axis=0).sort_values(ascending=False)
        return article_popularity.head(n_recommendations).index.tolist()
    
    def recommend_all_users(self, user_ids, n_recommendations=5):
        """
        Get recommendations for multiple users
        """
        recommendations = {}
        for user_id in user_ids:
            recommendations[user_id] = self.get_recommendations(user_id, n_recommendations)
        return recommendations

print("Article-Based Collaborative Filtering class created successfully!")

Article-Based Collaborative Filtering class created successfully!


In [54]:
# Train User-Based Collaborative Filtering
print("=" * 60)
print("TRAINING USER-BASED COLLABORATIVE FILTERING")
print("=" * 60)

user_cf = UserBasedCollaborativeFilter(
    interaction_matrix=interaction_matrix,
    n_similar_users=50,
    min_interactions=3
)

# Fit the model
user_cf.fit()
print("User-Based CF model trained successfully!")

TRAINING USER-BASED COLLABORATIVE FILTERING
Calculating user-user similarities...
User similarity matrix shape: (10195, 10195)
User-Based CF model trained successfully!


In [55]:
# Train Article-Based Collaborative Filtering
print("=" * 60)
print("TRAINING ARTICLE-BASED COLLABORATIVE FILTERING")
print("=" * 60)

article_cf = ArticleBasedCollaborativeFilter(
    interaction_matrix=interaction_matrix,
    n_similar_articles=50,
    min_interactions=3
)

# Fit the model
article_cf.fit()
print("Article-Based CF model trained successfully!")

TRAINING ARTICLE-BASED COLLABORATIVE FILTERING
Calculating article-article similarities...
Article similarity matrix shape: (3243, 3243)
Article-Based CF model trained successfully!


In [57]:
# Generate recommendations for test users (optimized version)
print("=" * 60)
print("GENERATING RECOMMENDATIONS")
print("=" * 60)

# Get unique test users
test_users = test_df['user_id'].unique()
print(f"Number of test users: {len(test_users)}")

# Use a smaller sample for faster evaluation
sample_size = min(100, len(test_users))  # Reduced sample size for faster computation
test_users_sample = np.random.choice(test_users, sample_size, replace=False)
print(f"Evaluating on {sample_size} users for faster computation")

print("\nGenerating User-Based CF recommendations...")
user_cf_recommendations = {}
for i, user_id in enumerate(test_users_sample):
    if i % 20 == 0:  # Progress indicator
        print(f"  Progress: {i}/{sample_size}")
    user_cf_recommendations[user_id] = user_cf.get_recommendations(user_id, n_recommendations=5)

print("Generating Article-Based CF recommendations...")
article_cf_recommendations = {}
for i, user_id in enumerate(test_users_sample):
    if i % 20 == 0:  # Progress indicator
        print(f"  Progress: {i}/{sample_size}")
    article_cf_recommendations[user_id] = article_cf.get_recommendations(user_id, n_recommendations=5)

print("Recommendations generated successfully!")

GENERATING RECOMMENDATIONS
Number of test users: 10195
Evaluating on 100 users for faster computation

Generating User-Based CF recommendations...
  Progress: 0/100
  Progress: 20/100
  Progress: 40/100
  Progress: 60/100
  Progress: 80/100
Generating Article-Based CF recommendations...
  Progress: 0/100
  Progress: 20/100
  Progress: 40/100
  Progress: 60/100
  Progress: 80/100
Recommendations generated successfully!
