# MovieLens-100K Top-N 实验（Colab版，独立于现有 `.py`）

本 notebook 是**全新实现**，用于在 Google Colab 上复现实验，不修改你项目中的 `backend/experiments/movielens_experiment.py`。

对比方法：
- CB（TF-IDF）
- Item-CF
- User-CF
- BERT-Enhanced（Item-CF + User-CF + BERTopic向量）

评估指标：Precision@5/10、Recall@5/10、Hit@5/10。

In [None]:
# Colab 依赖安装（首次运行）
!pip -q install numpy pandas scikit-learn matplotlib tqdm

In [None]:
import os
import json
import random
import zipfile
import urllib.request
from dataclasses import dataclass
from collections import defaultdict
from datetime import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

print('Imports OK')

In [None]:
@dataclass
class Config:
    """Experiment settings shared by the notebook cells below."""

    # Directory holding the extracted MovieLens-100K files (u.data, u.item).
    dataset_dir: str = './ml-100k'
    # Fraction of each user's ratings moved into the test split.
    test_ratio: float = 0.2
    # Seed handed to the per-user random splitter.
    seed: int = 42
    # Minimum test rating for an item to count as relevant during evaluation.
    positive_threshold: float = 4.0
    # Cut-offs K at which the Top-N metrics are reported.
    top_ks: tuple = (5, 10)

# Single configuration instance read by the later cells.
cfg = Config()
# Bare expression: echoes the config when it is the last line of a notebook cell.
cfg

In [None]:
def download_movielens_100k(dataset_dir='./ml-100k'):
    """Download and unpack MovieLens-100K into ``dataset_dir`` when missing.

    The dataset counts as present only when both ``u.data`` and ``u.item``
    exist inside ``dataset_dir``, so a directory left behind by an
    interrupted run is refreshed instead of being trusted.

    Args:
        dataset_dir: Directory that should contain the extracted files.

    Returns:
        None.

    Raises:
        FileNotFoundError: If the archive does not unpack to an ``ml-100k``
            folder.
        OSError: Propagated from the download, extraction or copy steps
            (``urllib.error.URLError`` is a subclass).
    """
    # Function-scope imports: the notebook's import cell does not provide them.
    import shutil
    import tempfile

    required = ('u.data', 'u.item')
    if all(os.path.isfile(os.path.join(dataset_dir, name)) for name in required):
        print('Dataset exists:', dataset_dir)
        return

    url = 'https://files.grouplens.org/datasets/movielens/ml-100k.zip'
    print('Downloading from', url)
    # Work in a scratch directory so the archive and any half-extracted tree
    # are cleaned up even when the download or extraction fails.
    with tempfile.TemporaryDirectory() as tmp_dir:
        zip_path = os.path.join(tmp_dir, 'ml-100k.zip')
        urllib.request.urlretrieve(url, zip_path)
        with zipfile.ZipFile(zip_path, 'r') as zf:
            zf.extractall(tmp_dir)

        # The archive is expected to unpack to an "ml-100k/" folder; copy its
        # contents to the directory the caller actually asked for rather than
        # assuming dataset_dir == './ml-100k'.
        extracted = os.path.join(tmp_dir, 'ml-100k')
        if not os.path.isdir(extracted):
            raise FileNotFoundError(
                "Archive did not contain the expected 'ml-100k' folder"
            )
        shutil.copytree(extracted, dataset_dir, dirs_exist_ok=True)
    print('Done')

# Make sure the dataset is available in the configured directory before loading.
download_movielens_100k(cfg.dataset_dir)

In [None]:
def load_movies(dataset_dir):
    """Parse ``u.item`` into a list of movie dictionaries.

    Each non-empty line is split on ``|``; field 0 is the movie id, field 1
    the title and fields 5-23 are nineteen 0/1 genre flags. Blank lines are
    skipped instead of raising ``ValueError``.

    Args:
        dataset_dir: Directory containing ``u.item``.

    Returns:
        A list of dicts with keys ``id`` (int), ``title`` (str), ``genres``
        (list of genre names whose flag is ``'1'``) and ``content`` (title,
        genre names and the fixed words ``movie film`` joined by spaces,
        used as the text for content-based features).
    """
    genre_names = [
        'unknown', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime',
        'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery',
        'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
    ]

    items = []
    with open(os.path.join(dataset_dir, 'u.item'), 'r', encoding='latin-1') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank or trailing empty lines.
                continue
            parts = line.split('|')
            movie_id = int(parts[0])
            title = parts[1]
            flags = parts[5:24]
            genres = [g for g, v in zip(genre_names, flags) if v == '1']
            content = f"{title} {' '.join(genres)} movie film"
            items.append({'id': movie_id, 'title': title, 'genres': genres, 'content': content})
    return items

def load_ratings(dataset_dir):
    """Parse ``u.data`` into a list of interaction dictionaries.

    Each non-empty line holds four tab-separated fields: user id, movie id,
    rating and a Unix timestamp. Blank lines are skipped instead of raising
    ``ValueError`` on unpacking.

    Args:
        dataset_dir: Directory containing ``u.data``.

    Returns:
        A list of dicts with keys ``user_id`` (int), ``poem_id`` (int, the
        movie id), ``rating`` (float) and ``created_at`` (naive ``datetime``
        built with ``datetime.fromtimestamp``, i.e. in local time).
    """
    out = []
    with open(os.path.join(dataset_dir, 'u.data'), 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank or trailing empty lines.
                continue
            uid, mid, rating, ts = line.split('\t')
            out.append({
                'user_id': int(uid),
                # The movie id is stored under 'poem_id', the key the
                # recommenders in this notebook read.
                'poem_id': int(mid),
                'rating': float(rating),
                'created_at': datetime.fromtimestamp(int(ts)),
            })
    return out

# Load movie metadata and the full rating log, then print basic dataset sizes.
movies = load_movies(cfg.dataset_dir)
ratings = load_ratings(cfg.dataset_dir)
print('movies=', len(movies), 'ratings=', len(ratings), 'users=', len(set(x['user_id'] for x in ratings)))

In [None]:
def split_by_user_random(interactions, test_ratio=0.2, seed=42):
    """Randomly split every user's interactions into train and test parts.

    Interactions are grouped by ``user_id`` (users keep their first-seen
    order), each group is shuffled with one shared seeded generator, and the
    first ``max(1, int(len(group) * test_ratio))`` shuffled records go to the
    test split; the remainder goes to the train split.

    Args:
        interactions: Iterable of dicts that carry a ``user_id`` key.
        test_ratio: Fraction of each user's records assigned to test.
        seed: Seed for the ``random.Random`` instance driving the shuffles.

    Returns:
        A ``(train, test)`` tuple of lists of interaction dicts.
    """
    shuffler = random.Random(seed)

    grouped = defaultdict(list)
    for record in interactions:
        grouped[record['user_id']].append(record)

    train_rows = []
    test_rows = []
    for records in grouped.values():
        # Shuffle a copy so the caller's per-user ordering is left untouched.
        shuffled = list(records)
        shuffler.shuffle(shuffled)
        cut = max(1, int(len(shuffled) * test_ratio))
        test_rows += shuffled[:cut]
        train_rows += shuffled[cut:]
    return train_rows, test_rows

def build_user_index(interactions):
    """Group interaction dicts by ``user_id``.

    Args:
        interactions: Iterable of dicts that carry a ``user_id`` key.

    Returns:
        A ``defaultdict(list)`` mapping each user id to its interactions in
        input order.
    """
    index = defaultdict(list)
    for record in interactions:
        bucket = index[record['user_id']]
        bucket.append(record)
    return index

# Split every user's ratings into train/test, then index both splits by user id.
train, test = split_by_user_random(ratings, cfg.test_ratio, cfg.seed)
user_train = build_user_index(train)
user_test = build_user_index(test)
# Bare expression: shows the split sizes when it is the last line of a notebook cell.
len(train), len(test)

In [None]:
class ContentBasedRecommender:
    """Content-based Top-N recommender built on TF-IDF item vectors.

    Items are vectorized once in ``fit``. ``recommend`` builds a user profile
    as a rating-weighted average of the vectors of the items the user
    interacted with and ranks all items by cosine similarity to it.
    """

    def __init__(self):
        self.vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1,2), max_features=8000)
        self.items = None          # list of item dicts passed to fit()
        self.item_vectors = None   # TF-IDF matrix, one row per item
        self.id_to_idx = {}        # item id -> row index in item_vectors

    def fit(self, items):
        """Vectorize the ``content`` text of every item.

        Args:
            items: List of dicts with at least ``id`` and ``content`` keys.
        """
        self.items = items
        self.id_to_idx = {x['id']: i for i, x in enumerate(items)}
        self.item_vectors = self.vectorizer.fit_transform([x['content'] for x in items])

    def recommend(self, user_interactions, exclude_ids, top_k=10):
        """Return the ``top_k`` items most similar to the user's profile.

        Args:
            user_interactions: Dicts with ``poem_id`` and optionally
                ``rating`` (a missing rating is treated as 3.0).
            exclude_ids: Container of item ids that must not be recommended.
            top_k: Maximum number of recommendations.

        Returns:
            A list of ``{'poem_id', 'score'}`` dicts sorted by descending
            score; empty when the model is unfitted or none of the user's
            items is known.
        """
        if self.item_vectors is None:
            return []
        rated = [x for x in user_interactions if x['poem_id'] in self.id_to_idx]
        if not rated:
            return []

        # Reuse the rows computed in fit() instead of re-running the
        # vectorizer on the same documents for every call.
        row_idx = [self.id_to_idx[x['poem_id']] for x in rated]
        vecs = self.item_vectors[row_idx].toarray()

        ratings = np.array([x.get('rating', 3.0) for x in rated])
        # Ratings at or below 2.5 get zero weight, so only liked items shape
        # the profile; fall back to a plain mean when nothing is liked.
        weights = np.clip(ratings - 2.5, 0, None)
        if weights.sum() > 0:
            profile = np.average(vecs, axis=0, weights=weights)
        else:
            profile = np.mean(vecs, axis=0)

        sims = cosine_similarity([profile], self.item_vectors)[0]
        recs = [
            {'poem_id': item['id'], 'score': float(score)}
            for item, score in zip(self.items, sims)
            if item['id'] not in exclude_ids
        ]
        recs.sort(key=lambda x: x['score'], reverse=True)
        return recs[:top_k]

class ItemBasedCF:
    """Item-based collaborative filtering with co-rated Pearson similarity.

    ``fit`` builds a dense user x item rating matrix (0 means "not rated")
    and an item x item similarity matrix in which each pair is compared only
    over the users who rated both items. ``recommend`` scores every unrated
    item as the similarity-weighted mean of the user's ratings, using only
    positively similar rated items.
    """

    def __init__(self):
        self.item_sim = None
        self.item_ids = None
        self.item_id_to_idx = {}
        self.idx_to_item_id = {}
        self.rating_matrix = None

    def fit(self, interactions, item_ids):
        """Build the rating matrix and the item-item similarity matrix.

        Args:
            interactions: Dicts with ``user_id``, ``poem_id`` and optionally
                ``rating`` (a missing rating is treated as 3.0).
            item_ids: Iterable of item ids; its order fixes the matrix columns.
        """
        user_rows = {
            uid: row
            for row, uid in enumerate(sorted({x['user_id'] for x in interactions}))
        }
        self.item_ids = list(item_ids)
        self.item_id_to_idx = {pid: col for col, pid in enumerate(self.item_ids)}
        self.idx_to_item_id = {col: pid for pid, col in self.item_id_to_idx.items()}

        matrix = np.zeros((len(user_rows), len(self.item_ids)))
        for record in interactions:
            col = self.item_id_to_idx.get(record['poem_id'])
            if col is None:
                # Interaction with an item outside item_ids: ignored.
                continue
            matrix[user_rows[record['user_id']], col] = record.get('rating', 3.0)
        self.rating_matrix = matrix

        n_items = matrix.shape[1]
        has_rating = matrix > 0
        # Identity start: every item is fully similar to itself and pairs
        # without co-raters stay at 0.
        sim = np.eye(n_items)
        for a in range(n_items):
            rated_a = has_rating[:, a]
            for b in range(a + 1, n_items):
                both = rated_a & has_rating[:, b]
                if not both.any():
                    continue
                va = matrix[both, a]
                vb = matrix[both, b]
                va = va - va.mean()
                vb = vb - vb.mean()
                # The epsilon keeps the ratio finite when a centred vector is all zeros.
                denom = np.sqrt((va**2).sum()) * np.sqrt((vb**2).sum()) + 1e-8
                value = float((va*vb).sum() / denom)
                sim[a, b] = value
                sim[b, a] = value
        self.item_sim = sim

    def recommend(self, user_interactions, exclude_ids, top_k=10):
        """Return up to ``top_k`` items ranked by predicted score.

        Args:
            user_interactions: The user's known interactions (``poem_id``
                plus optional ``rating``).
            exclude_ids: Container of item ids that must not be recommended.
            top_k: Maximum number of recommendations.

        Returns:
            A list of ``{'poem_id', 'score'}`` dicts sorted by descending
            score; empty when the model is unfitted or the user has no known
            rated item.
        """
        if self.item_sim is None:
            return []

        n_items = len(self.item_ids)
        profile = np.zeros(n_items)
        for record in user_interactions:
            col = self.item_id_to_idx.get(record['poem_id'])
            if col is not None:
                profile[col] = record.get('rating', 3.0)

        rated_cols = np.where(profile > 0)[0]
        if len(rated_cols) == 0:
            return []
        rated_values = profile[rated_cols]

        scores = np.zeros(n_items)
        for col in range(n_items):
            if profile[col] > 0:
                # Already rated by the user: score stays 0.
                continue
            sims_to_rated = self.item_sim[col, rated_cols]
            positive = sims_to_rated > 0
            if not positive.any():
                continue
            weights = sims_to_rated[positive]
            scores[col] = np.dot(weights, rated_values[positive]) / (np.abs(weights).sum() + 1e-8)

        recs = []
        for col, score in enumerate(scores):
            item_id = self.idx_to_item_id[col]
            if item_id in exclude_ids:
                continue
            recs.append({'poem_id': item_id, 'score': float(score)})
        recs.sort(key=lambda x: x['score'], reverse=True)
        return recs[:top_k]

class UserBasedCF:
    """User-based collaborative filtering with co-rated Pearson similarity.

    ``fit`` builds a dense user x item rating matrix (0 means "not rated")
    and a user x user similarity matrix in which each pair is compared only
    over the items both users rated. ``recommend`` predicts a score for each
    candidate item as the similarity-weighted mean rating given by the
    target user's most similar, positively correlated neighbours.
    """

    def __init__(self, k_neighbors=40):
        self.k_neighbors = k_neighbors   # max neighbours used per prediction
        self.R = None                    # user x item rating matrix
        self.user_map = {}               # user id -> row index
        self.item_map = {}               # item id -> column index
        self.idx_to_item = {}            # column index -> item id
        self.user_sim = None             # user x user similarity matrix

    def fit(self, interactions, item_ids):
        """Build the rating matrix and the user-user similarity matrix.

        Args:
            interactions: Dicts with ``user_id``, ``poem_id`` and optionally
                ``rating`` (a missing rating is treated as 3.0).
            item_ids: Sequence of item ids; its order fixes the matrix columns.
        """
        users = sorted(set(x['user_id'] for x in interactions))
        self.user_map = {u: i for i, u in enumerate(users)}
        self.item_map = {pid: i for i, pid in enumerate(item_ids)}
        self.idx_to_item = {i: pid for pid, i in self.item_map.items()}

        R = np.zeros((len(users), len(item_ids)))
        for x in interactions:
            u = self.user_map[x['user_id']]
            p = self.item_map.get(x['poem_id'])
            if p is not None:
                R[u, p] = x.get('rating', 3.0)
        self.R = R

        n = R.shape[0]
        # Computed once instead of re-deriving both masks for every pair.
        rated = R > 0
        sim = np.zeros((n, n))
        for i in range(n):
            sim[i, i] = 1.0
            for j in range(i + 1, n):
                m = rated[i] & rated[j]
                if m.sum() == 0:
                    s = 0.0
                else:
                    vi, vj = R[i, m], R[j, m]
                    vi, vj = vi - vi.mean(), vj - vj.mean()
                    # The epsilon keeps the ratio finite when a centred vector is all zeros.
                    s = float((vi*vj).sum() / (np.sqrt((vi**2).sum()) * np.sqrt((vj**2).sum()) + 1e-8))
                sim[i, j] = sim[j, i] = s
        self.user_sim = sim

    def recommend(self, user_interactions, exclude_ids, top_k=10):
        """Return up to ``top_k`` items ranked by predicted rating.

        The target user is identified by the ``user_id`` of the first
        interaction and must have been seen during ``fit``.

        Args:
            user_interactions: The user's known interactions; only the first
                element's ``user_id`` is read.
            exclude_ids: Container of item ids that must not be recommended.
            top_k: Maximum number of recommendations.

        Returns:
            A list of ``{'poem_id', 'score'}`` dicts sorted by descending
            score. Items that no selected neighbour rated are omitted. The
            list is empty when there are no interactions, the user is
            unknown, ``k_neighbors`` is not positive, or no neighbour has a
            positive similarity.
        """
        if not user_interactions:
            return []
        uid = user_interactions[0]['user_id']
        tidx = self.user_map.get(uid)
        if tidx is None:
            return []
        if self.k_neighbors <= 0:
            # Guard: the slice [-0:] below would select every user, not none.
            return []

        sims = self.user_sim[tidx].copy()
        sims[tidx] = -np.inf  # never pick the target user as their own neighbour
        neigh_idx = np.argsort(sims)[-self.k_neighbors:]
        neigh = [(i, sims[i]) for i in neigh_idx if sims[i] > 0]
        if not neigh:
            return []

        recs = []
        for i in range(self.R.shape[1]):
            pid = self.idx_to_item[i]
            if pid in exclude_ids:
                continue
            ws, ss = 0.0, 0.0
            for nidx, nsim in neigh:
                r = self.R[nidx, i]
                if r > 0:
                    ws += nsim * r
                    ss += abs(nsim)
            if ss > 0:
                recs.append({'poem_id': pid, 'score': float(ws / (ss + 1e-8))})

        recs.sort(key=lambda x: x['score'], reverse=True)
        return recs[:top_k]

In [None]:
# BERT-Enhanced: prefer the BERTopicEnhancedCF class from the project repository.
# If that source is not available in Colab, upload backend/core/bertopic_enhanced_cf.py and re-run this cell.

# Stays None when the import fails, so later code can test whether the class is available.
BERTopicEnhancedCF = None
try:
    from backend.core.bertopic_enhanced_cf import BERTopicEnhancedCF
except Exception as e:
    # Best-effort import: report the failure and carry on without this method.
    print('暂时无法 import BERTopicEnhancedCF:', e)

print('BERTopicEnhancedCF loaded =', BERTopicEnhancedCF is not None)

In [None]:
def evaluate_topn(method_name, model, user_train, user_test, ks=(5, 10), positive_threshold=4.0, all_train=None):
    """Compute mean Precision@K, Recall@K and Hit@K over evaluable users.

    A user is evaluated when they appear in both splits, have at least one
    training interaction and at least one test item rated at or above
    ``positive_threshold``. For each such user the model is asked for
    ``max(ks)`` recommendations, and every cut-off in ``ks`` is scored on
    the corresponding prefix of that list.

    Args:
        method_name: Display name of the method. The value
            ``'BERT-Enhanced'`` switches to the call form
            ``model.recommend(train_inter, all_train or [], top_k=...)``.
        model: Object exposing ``recommend``; for all other methods it is
            called as ``recommend(train_inter, exclude_ids=..., top_k=...)``.
        user_train: Mapping of user id to training interactions.
        user_test: Mapping of user id to test interactions.
        ks: Iterable of cut-off values K.
        positive_threshold: Minimum test rating that counts as relevant.
        all_train: Full list of training interactions, forwarded only on the
            ``'BERT-Enhanced'`` path.

    Returns:
        Dict with ``Precision@K``, ``Recall@K`` and ``Hit@K`` for each K, in
        the order of ``ks``. Every value is 0.0 when no user is evaluable;
        the dict is empty when ``ks`` is empty.
    """
    ks = tuple(ks)
    if not ks:
        # Nothing to report; also avoids max() failing on an empty sequence.
        return {}
    max_k = max(ks)  # loop-invariant: computed once rather than per user

    def _mean(values):
        # Average of a possibly empty list; 0.0 when empty.
        return float(np.mean(values)) if values else 0.0

    metrics = {k: {'precision': [], 'recall': [], 'hit': []} for k in ks}
    users = sorted(set(user_train.keys()) & set(user_test.keys()))

    for uid in users:
        train_inter = user_train[uid]
        test_inter = user_test[uid]

        relevant = {x['poem_id'] for x in test_inter if x['rating'] >= positive_threshold}
        if not train_inter or not relevant:
            continue

        # Items already seen in training must not be recommended again.
        exclude = {x['poem_id'] for x in train_inter}

        if method_name == 'BERT-Enhanced':
            # NOTE(review): no exclude set is passed on this path; presumably
            # the project class filters seen items itself -- verify.
            recs = model.recommend(train_inter, all_train or [], top_k=max_k)
        else:
            recs = model.recommend(train_inter, exclude_ids=exclude, top_k=max_k)

        ranked = [x['poem_id'] for x in recs]
        for k in ks:
            hits = len(set(ranked[:k]) & relevant)
            metrics[k]['precision'].append(hits / k)
            metrics[k]['recall'].append(hits / len(relevant))
            metrics[k]['hit'].append(1.0 if hits > 0 else 0.0)

    out = {}
    for k in ks:
        out[f'Precision@{k}'] = _mean(metrics[k]['precision'])
        out[f'Recall@{k}'] = _mean(metrics[k]['recall'])
        out[f'Hit@{k}'] = _mean(metrics[k]['hit'])
    return out

In [None]:
# Item ids in u.item order; this order fixes the columns of the CF rating matrices.
item_ids = [m['id'] for m in movies]

# Content-based baseline: fitted on movie metadata only.
cb = ContentBasedRecommender()
cb.fit(movies)

# Both collaborative-filtering baselines are fitted on the training split only.
item_cf = ItemBasedCF()
item_cf.fit(train, item_ids)

user_cf = UserBasedCF()
user_cf.fit(train, item_ids)

# Display name -> fitted model; the names become the 'method' column of the results.
methods = {
    'CB': cb,
    'Item-CF': item_cf,
    'User-CF': user_cf,
}

# The hybrid method is added only when its class was imported successfully.
if BERTopicEnhancedCF is not None:
    bert_enhanced = BERTopicEnhancedCF(item_cf_weight=0.5, user_cf_weight=0.3, topic_weight=0.2)
    bert_enhanced.fit(movies, train)
    methods['BERT-Enhanced'] = bert_enhanced
else:
    print('跳过 BERT-Enhanced（未成功导入类）')

# Evaluate every method on the same split and collect one result row per method.
rows = []
for name, model in methods.items():
    m = evaluate_topn(name, model, user_train, user_test, ks=cfg.top_ks, positive_threshold=cfg.positive_threshold, all_train=train)
    rows.append({'method': name, **m})

# NOTE(review): the sort column is hard-coded, so cfg.top_ks must contain 10.
df_results = pd.DataFrame(rows).sort_values('Precision@10', ascending=False).reset_index(drop=True)
# Bare expression: renders the results table when it is the last line of a notebook cell.
df_results

In [None]:
# Bar chart of Precision@10 per method, with each value printed above its bar.
fig, ax = plt.subplots(figsize=(8, 4.5))
precision_at_10 = df_results['Precision@10']
bars = ax.bar(df_results['method'], precision_at_10)
ax.set_title('MovieLens-100K Precision@10 Comparison')
ax.set_ylabel('Precision@10')
for bar, value in zip(bars, precision_at_10):
    # Centre the label horizontally on the bar and rest it on the bar's top edge.
    x_center = bar.get_x() + bar.get_width()/2
    ax.text(x_center, bar.get_height(), f'{value:.3f}', ha='center', va='bottom')
fig.tight_layout()
plt.show()

In [None]:
# Persist the results table, a JSON summary and the Precision@10 chart under ./outputs.
os.makedirs('./outputs', exist_ok=True)
csv_path = './outputs/movielens_results_colab.csv'
json_path = './outputs/movielens_results_colab.json'
png_path = './outputs/movielens_precision_colab.png'

df_results.to_csv(csv_path, index=False, encoding='utf-8')

# JSON summary: the configuration used, dataset/split sizes and one record per method.
payload = {
    'config': cfg.__dict__,
    'dataset_stats': {
        'n_users': len(set(x['user_id'] for x in ratings)),
        'n_items': len(movies),
        'n_ratings': len(ratings),
        'n_train': len(train),
        'n_test': len(test),
    },
    'results': df_results.to_dict(orient='records')
}
with open(json_path, 'w', encoding='utf-8') as f:
    # ensure_ascii=False writes non-ASCII text as-is instead of \u escapes.
    json.dump(payload, f, ensure_ascii=False, indent=2)

# Redraw the Precision@10 bar chart and write it to disk instead of displaying it.
plt.figure(figsize=(8, 4.5))
bars = plt.bar(df_results['method'], df_results['Precision@10'])
plt.title('MovieLens-100K Precision@10 Comparison')
plt.ylabel('Precision@10')
for b, v in zip(bars, df_results['Precision@10']):
    # Value label centred above each bar.
    plt.text(b.get_x() + b.get_width()/2, b.get_height(), f'{v:.3f}', ha='center', va='bottom')
plt.tight_layout()
plt.savefig(png_path, dpi=150)
# Close the figure so it is not also rendered inline.
plt.close()

print('Saved:', csv_path, json_path, png_path)

## Colab 使用说明

1. 直接顺序运行全部单元即可。
2. 若你要启用 `BERT-Enhanced`：请确保当前 runtime 可导入 `backend.core.bertopic_enhanced_cf`。
3. 如果你不想安装 BERTopic 依赖，也可以先只跑 3 个 baseline。
4. 结果文件在 `./outputs/` 下。