# MovieLens-100K 实验（Colab独立版，算法对齐系统实现）

这个 notebook **完全不依赖仓库文件导入**，可单独上传到 Colab 运行。  
同时实现与系统 `BERTopicEnhancedCF` 一致的核心逻辑：
- Item-CF：Pearson（共同评分项）
- User-CF：Pearson（共同评分项）
- Hybrid User Similarity：`alpha * rating_sim + (1-alpha) * topic_sim`
- Confidence Filter：最小相似度阈值 + 共同评分 shrinkage
- Enhanced Item Similarity：`item_cf_weight * norm(item_sim) + topic_weight * norm(topic_sim)`

对比方法：CB、Item-CF、User-CF、BERT-Enhanced。


In [None]:
# 1) 安装依赖（Colab）
!pip -q install numpy pandas scikit-learn matplotlib tqdm sentence-transformers


In [None]:
# 2) 导入依赖
import os
import json
import random
import zipfile
import urllib.request
from dataclasses import dataclass
from datetime import datetime
from collections import defaultdict, Counter

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

print('imports ok')


In [None]:
# 3) 配置
@dataclass
class Config:
    """Experiment settings read by the loading, splitting, evaluation and export cells."""
    # Directory that holds (or will receive) the extracted MovieLens-100K files.
    dataset_dir: str = './ml-100k'
    # Fraction of each user's ratings held out as the test part.
    test_ratio: float = 0.2
    # Seed handed to the per-user random split.
    seed: int = 42
    # Minimum test rating for an item to count as relevant during evaluation.
    positive_threshold: float = 4.0
    # Cut-offs K for which Precision/Recall/Hit are reported.
    top_ks: tuple = (5, 10)

# Single shared configuration instance used by the cells below.
cfg = Config()
# Bare expression: echoes the config as the notebook cell output.
cfg


In [None]:
# 4) 下载并加载 MovieLens-100K

def download_movielens(dataset_dir='./ml-100k'):
    """Download and unpack MovieLens-100K into ``dataset_dir`` unless it already exists.

    The archive is fetched and extracted inside a temporary staging directory
    next to the target, and only the finished ``ml-100k`` folder is moved to
    ``dataset_dir``. This way a custom ``dataset_dir`` is honoured, the zip is
    always cleaned up, and a failed download/extraction does not leave a
    half-populated dataset directory that a later run would mistake for a
    complete one.

    Args:
        dataset_dir: Target directory for the extracted dataset files.

    Returns:
        None.
    """
    # Function-scope imports keep this cell independent of the import cell.
    import shutil
    import tempfile

    if os.path.exists(dataset_dir):
        print('dataset exists:', dataset_dir)
        return

    url = 'https://files.grouplens.org/datasets/movielens/ml-100k.zip'
    parent = os.path.dirname(os.path.abspath(dataset_dir))
    os.makedirs(parent, exist_ok=True)

    print('downloading:', url)
    # Staging inside ``parent`` keeps the final move on the same filesystem.
    with tempfile.TemporaryDirectory(dir=parent) as staging:
        zip_path = os.path.join(staging, 'ml-100k.zip')
        urllib.request.urlretrieve(url, zip_path)
        with zipfile.ZipFile(zip_path, 'r') as zf:
            zf.extractall(staging)
        # The archive is expected to unpack into a top-level ``ml-100k`` folder
        # (the original code relied on the same layout).
        shutil.move(os.path.join(staging, 'ml-100k'), dataset_dir)
    print('download done')


def load_movies(dataset_dir):
    """Parse ``u.item`` into a list of movie dicts.

    Each record is pipe-separated: field 0 is the movie id, field 1 the title
    and fields 5..23 are the 19 one-hot genre flags. Blank lines are skipped
    so a stray trailing newline does not abort the load.

    Args:
        dataset_dir: Directory containing ``u.item``.

    Returns:
        A list of dicts with keys ``id`` (int), ``title`` (str), ``genres``
        (list of genre names whose flag is ``'1'``) and ``content`` (the text
        later fed to the content/embedding models).
    """
    genre_names = [
        'unknown', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime',
        'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery',
        'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
    ]
    movies = []
    # The file is opened as latin-1, matching the original loader.
    with open(os.path.join(dataset_dir, 'u.item'), 'r', encoding='latin-1') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            fields = line.split('|')
            title = fields[1]
            genres = [name for name, flag in zip(genre_names, fields[5:24]) if flag == '1']
            movies.append({
                'id': int(fields[0]),
                'title': title,
                'genres': genres,
                'content': f"{title} {' '.join(genres)} movie film",
            })
    return movies


def load_ratings(dataset_dir):
    """Parse ``u.data`` into a list of interaction dicts.

    Every non-blank line holds four tab-separated fields: user id, item id,
    rating and a Unix timestamp.

    Args:
        dataset_dir: Directory containing ``u.data``.

    Returns:
        A list of dicts with keys ``user_id`` (int), ``poem_id`` (int; the
        movie id, stored under the key the recommenders read), ``rating``
        (float) and ``created_at`` (naive ``datetime`` in local time, as
        produced by ``datetime.fromtimestamp``).
    """
    out = []
    with open(os.path.join(dataset_dir, 'u.data'), 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            # Explicit escape instead of a literal tab character, which
            # editors and formatters can silently turn into spaces.
            uid, mid, rating, ts = line.split('\t')
            out.append({
                'user_id': int(uid),
                'poem_id': int(mid),
                'rating': float(rating),
                'created_at': datetime.fromtimestamp(int(ts)),
            })
    return out


def split_by_user_random(interactions, test_ratio=0.2, seed=42):
    """Randomly hold out a share of every user's interactions.

    Interactions are grouped by ``user_id`` (groups keep first-seen order),
    each group is shuffled with one shared seeded generator, and the first
    ``max(1, int(len(group) * test_ratio))`` shuffled records go to the test
    part. A user with a single interaction therefore ends up test-only.

    Args:
        interactions: Iterable of dicts carrying a ``user_id`` key.
        test_ratio: Fraction of each user's records to hold out.
        seed: Seed for the ``random.Random`` instance.

    Returns:
        ``(train, test)`` as two lists of the original dict objects.
    """
    shuffler = random.Random(seed)

    grouped = {}
    for record in interactions:
        grouped.setdefault(record['user_id'], []).append(record)

    train_part, test_part = [], []
    for records in grouped.values():
        shuffled = list(records)
        shuffler.shuffle(shuffled)
        held_out = max(1, int(len(shuffled) * test_ratio))
        test_part += shuffled[:held_out]
        train_part += shuffled[held_out:]
    return train_part, test_part


def build_user_index(interactions):
    """Group interaction dicts by ``user_id``.

    Returns:
        A ``defaultdict(list)`` mapping each user id to that user's
        interactions in their original order.
    """
    index = defaultdict(list)
    for record in interactions:
        bucket = index[record['user_id']]
        bucket.append(record)
    return index


# Fetch the dataset if it is missing, then load item metadata and the rating log.
download_movielens(cfg.dataset_dir)
movies=load_movies(cfg.dataset_dir)
ratings=load_ratings(cfg.dataset_dir)
# Per-user random hold-out split, then per-user lookup tables for both parts.
train,test=split_by_user_random(ratings,cfg.test_ratio,cfg.seed)
user_train=build_user_index(train)
user_test=build_user_index(test)

# Quick size check of the loaded data and of the split.
print('movies:',len(movies),'ratings:',len(ratings),'users:',len(set(x['user_id'] for x in ratings)))
print('train:',len(train),'test:',len(test))


In [None]:
# 5) 三个baseline：CB / Item-CF / User-CF
class ContentBasedRecommender:
    """Content-based recommender over TF-IDF vectors of each item's ``content`` text.

    A user profile is the rating-weighted average of the TF-IDF vectors of the
    items the user interacted with; candidates are ranked by cosine similarity
    to that profile.
    """

    def __init__(self):
        self.vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_features=8000)
        self.items = None
        self.item_vectors = None
        self.id_to_idx = {}

    def fit(self, items):
        """Vectorize all items.

        Args:
            items: Sequence of dicts with an ``id`` key and an optional
                ``content`` key (missing content is treated as empty text).
        """
        self.items = items
        self.id_to_idx = {x['id']: i for i, x in enumerate(items)}
        self.item_vectors = self.vectorizer.fit_transform([x.get('content', '') for x in items])

    def recommend(self, user_interactions, exclude_ids=None, top_k=10):
        """Return the ``top_k`` items most similar to the user's profile.

        Args:
            user_interactions: Dicts with ``poem_id`` and optional ``rating``
                (defaults to 3.0); interactions with unknown items are ignored.
            exclude_ids: Item ids that must not be recommended.
            top_k: Maximum number of recommendations.

        Returns:
            A list of ``{'poem_id', 'score'}`` dicts sorted by descending
            score; empty when the model is not fitted or no interaction
            refers to a known item.
        """
        if self.item_vectors is None:
            return []
        exclude_ids = exclude_ids or set()
        rated = [x for x in user_interactions if x['poem_id'] in self.id_to_idx]
        if not rated:
            return []

        # Reuse the rows computed in ``fit`` instead of re-running the
        # vectorizer on the same documents for every call. This also keeps
        # ``recommend`` consistent with ``fit`` for items lacking ``content``.
        rows = [self.id_to_idx[x['poem_id']] for x in rated]
        vecs = self.item_vectors[rows].toarray()

        ratings = np.array([x.get('rating', 3.0) for x in rated], dtype=float)
        # Only ratings above 2.5 pull the profile towards an item.
        weights = np.clip(ratings - 2.5, 0, None)
        if weights.sum() > 0:
            profile = np.average(vecs, axis=0, weights=weights)
        else:
            # No rating above 2.5: fall back to an unweighted mean.
            profile = np.mean(vecs, axis=0)

        sims = cosine_similarity([profile], self.item_vectors)[0]
        recs = []
        for i, s in enumerate(sims):
            pid = self.items[i]['id']
            if pid in exclude_ids:
                continue
            recs.append({'poem_id': pid, 'score': float(s)})
        recs.sort(key=lambda x: x['score'], reverse=True)
        return recs[:top_k]


class ItemBasedCFRecommender:
    """Item-based collaborative filtering with Pearson similarity on co-rating users.

    A rating of 0 in the dense matrix means "not rated".
    """

    def __init__(self):
        self.item_similarity = None
        self.rating_matrix = None
        self.poem_id_to_idx = {}
        self.idx_to_poem_id = {}

    def fit(self, interactions, poem_ids):
        """Build the user x item rating matrix and the item-item similarity.

        Args:
            interactions: Dicts with ``user_id``, ``poem_id`` and optional
                ``rating`` (defaults to 3.0). Unknown items are ignored.
            poem_ids: Item ids defining the column order.
        """
        self.poem_id_to_idx = {pid: col for col, pid in enumerate(poem_ids)}
        self.idx_to_poem_id = {col: pid for pid, col in self.poem_id_to_idx.items()}

        user_rows = {
            uid: row
            for row, uid in enumerate(sorted({rec['user_id'] for rec in interactions}))
        }
        matrix = np.zeros((len(user_rows), len(poem_ids)))
        for rec in interactions:
            col = self.poem_id_to_idx.get(rec['poem_id'])
            if col is None:
                continue
            matrix[user_rows[rec['user_id']], col] = rec.get('rating', 3.0)

        self.rating_matrix = matrix
        self._compute_similarity()

    def _compute_similarity(self):
        """Fill ``item_similarity`` with pairwise Pearson correlations.

        For each item pair only users who rated both items are used, and the
        means are taken over those users. Pairs without a common user get 0.
        """
        ratings = self.rating_matrix
        n_items = ratings.shape[1]
        has_rating = ratings > 0
        sim = np.eye(n_items)
        for a in range(n_items):
            for b in range(a + 1, n_items):
                both = has_rating[:, a] & has_rating[:, b]
                if not both.any():
                    continue
                col_a = ratings[both, a]
                col_b = ratings[both, b]
                col_a = col_a - col_a.mean()
                col_b = col_b - col_b.mean()
                # The small constant avoids division by zero for constant columns.
                denom = np.sqrt((col_a**2).sum()) * np.sqrt((col_b**2).sum()) + 1e-8
                value = float((col_a * col_b).sum() / denom)
                sim[a, b] = value
                sim[b, a] = value
        self.item_similarity = sim

    def recommend(self, user_interactions, exclude_ids=None, top_k=10):
        """Score items by a similarity-weighted average of the user's ratings.

        Only positively correlated rated items contribute. Items the user
        rated, and items without any positive neighbour, keep a score of 0
        but are still returned unless listed in ``exclude_ids``.

        Returns:
            Up to ``top_k`` ``{'poem_id', 'score'}`` dicts, best first; empty
            when the user has no rating on a known item.
        """
        exclude_ids = exclude_ids or set()
        n_items = len(self.poem_id_to_idx)

        profile = np.zeros(n_items)
        for rec in user_interactions:
            col = self.poem_id_to_idx.get(rec['poem_id'])
            if col is not None:
                profile[col] = rec.get('rating', 3.0)

        rated_cols = np.flatnonzero(profile > 0)
        if rated_cols.size == 0:
            return []
        rated_values = profile[rated_cols]

        scores = np.zeros(n_items)
        for col in range(n_items):
            if profile[col] > 0:
                continue
            sims = self.item_similarity[col, rated_cols]
            positive = sims > 0
            if positive.any():
                weighted = np.dot(sims[positive], rated_values[positive])
                scores[col] = weighted / (np.abs(sims[positive]).sum() + 1e-8)

        candidates = []
        for col, value in enumerate(scores):
            pid = self.idx_to_poem_id[col]
            if pid in exclude_ids:
                continue
            candidates.append({'poem_id': pid, 'score': float(value)})
        candidates.sort(key=lambda rec: rec['score'], reverse=True)
        return candidates[:top_k]


class UserBasedCFRecommender:
    """User-based collaborative filtering with Pearson similarity on co-rated items.

    A rating of 0 in the dense matrix means "not rated".
    """

    def __init__(self, k_neighbors=40):
        self.k_neighbors = k_neighbors
        self.rating_matrix = None
        self.user_id_to_idx = {}
        self.item_id_to_idx = {}
        self.idx_to_item_id = {}
        self.user_similarity = None

    def fit(self, interactions, item_ids):
        """Build the user x item rating matrix and the user-user similarity.

        Args:
            interactions: Dicts with ``user_id``, ``poem_id`` and optional
                ``rating`` (defaults to 3.0). Unknown items are ignored.
            item_ids: Item ids defining the column order.
        """
        known_users = sorted({rec['user_id'] for rec in interactions})
        self.user_id_to_idx = {uid: row for row, uid in enumerate(known_users)}
        self.item_id_to_idx = {pid: col for col, pid in enumerate(item_ids)}
        self.idx_to_item_id = {col: pid for pid, col in self.item_id_to_idx.items()}

        matrix = np.zeros((len(known_users), len(item_ids)))
        for rec in interactions:
            col = self.item_id_to_idx.get(rec['poem_id'])
            if col is None:
                continue
            matrix[self.user_id_to_idx[rec['user_id']], col] = rec.get('rating', 3.0)

        self.rating_matrix = matrix
        self._compute_user_similarity()

    def _compute_user_similarity(self):
        """Fill ``user_similarity`` with pairwise Pearson correlations.

        For each user pair only items rated by both are used, and the means
        are taken over those items. Pairs without a common item get 0.
        """
        ratings = self.rating_matrix
        n_users = ratings.shape[0]
        has_rating = ratings > 0
        sim = np.eye(n_users)
        for a in range(n_users):
            for b in range(a + 1, n_users):
                both = has_rating[a] & has_rating[b]
                if not both.any():
                    continue
                row_a = ratings[a, both]
                row_b = ratings[b, both]
                row_a = row_a - row_a.mean()
                row_b = row_b - row_b.mean()
                # The small constant avoids division by zero for constant rows.
                denom = np.sqrt((row_a**2).sum()) * np.sqrt((row_b**2).sum()) + 1e-8
                value = float((row_a * row_b).sum() / denom)
                sim[a, b] = value
                sim[b, a] = value
        self.user_similarity = sim

    def recommend(self, user_interactions, exclude_ids=None, top_k=10):
        """Score items by the similarity-weighted mean rating of the nearest users.

        The target user is taken from the first interaction's ``user_id`` and
        must have been seen in ``fit``. Only the ``k_neighbors`` most similar
        users with positive similarity vote; items none of them rated are
        left out of the result.

        Returns:
            Up to ``top_k`` ``{'poem_id', 'score'}`` dicts, best first; empty
            for an empty history, an unknown user or no positive neighbour.
        """
        exclude_ids = exclude_ids or set()
        if not user_interactions:
            return []
        target_row = self.user_id_to_idx.get(user_interactions[0]['user_id'])
        if target_row is None:
            return []

        sims = self.user_similarity[target_row].copy()
        # Never pick the target user as their own neighbour.
        sims[target_row] = -np.inf
        nearest = np.argsort(sims)[-self.k_neighbors:]
        neighbours = [(row, sims[row]) for row in nearest if sims[row] > 0]
        if not neighbours:
            return []

        ranked = []
        for col in range(self.rating_matrix.shape[1]):
            pid = self.idx_to_item_id[col]
            if pid in exclude_ids:
                continue
            weighted_sum = 0.0
            weight_total = 0.0
            for row, weight in neighbours:
                rating = self.rating_matrix[row, col]
                if rating > 0:
                    weighted_sum += weight * rating
                    weight_total += abs(weight)
            if weight_total > 0:
                ranked.append({'poem_id': pid, 'score': float(weighted_sum / (weight_total + 1e-8))})
        ranked.sort(key=lambda rec: rec['score'], reverse=True)
        return ranked[:top_k]


In [None]:
# 6) BERT-Enhanced（独立实现，逻辑对齐系统版）
class BERTopicVectorizer:
    """Produce per-item topic/semantic vectors from SentenceTransformer embeddings."""

    def __init__(self, model_name='paraphrase-multilingual-MiniLM-L12-v2'):
        # The encoder is created in ``fit``; nothing is loaded at construction.
        self.model_name = model_name
        self.encoder = None
        self.topic_matrix = None
        self.poem_ids = []

    def fit(self, poems):
        """Encode every item's ``content`` text (empty string when missing).

        After the call ``poem_ids`` holds the item ids in input order and
        ``topic_matrix`` holds whatever ``encoder.encode`` returned for the
        texts in the same order.
        """
        self.poem_ids = [poem['id'] for poem in poems]
        texts = []
        for poem in poems:
            texts.append(poem.get('content', ''))
        self.encoder = SentenceTransformer(self.model_name)
        self.topic_matrix = self.encoder.encode(texts, show_progress_bar=True)


class BERTopicEnhancedCFStandalone:
    """Standalone hybrid recommender following the flow of the system's BERTopicEnhancedCF.

    ``fit`` builds a dense rating matrix (0 = not rated), Pearson item/user
    similarities, embedding-based topic vectors, a hybrid user similarity and
    an enhanced item similarity. ``recommend`` scores unseen items with the
    enhanced item similarity and blends in a user-CF prediction.
    """

    def __init__(self, item_cf_weight=0.5, user_cf_weight=0.3, topic_weight=0.2):
        self.item_cf_weight = item_cf_weight
        self.user_cf_weight = user_cf_weight
        self.topic_weight = topic_weight

        self.poems = None
        self.interactions = None
        self.poem_ids = []
        self.poem_id_map = {}

        self.user_id_map = {}
        self.rating_matrix = None
        self.item_similarity = None
        self.user_similarity = None
        self.hybrid_similarity = None
        self.enhanced_similarity = None

        self.k_neighbors = 30
        self.min_similarity = 0.1
        self.min_common_ratings = 3
        self.hybrid_alpha = 0.6

        # Cache for the confidence-filtered hybrid similarity (see
        # ``_filtered_hybrid_similarity``).
        self._filtered_hybrid = None
        self._filtered_hybrid_state = None

        self.bertopic = BERTopicVectorizer()

    def fit(self, poems, interactions):
        """Train all similarity structures.

        Args:
            poems: Item dicts with ``id`` and optional ``content``.
            interactions: Dicts with ``user_id``, ``poem_id`` and optional
                ``rating`` (defaults to 3.0).
        """
        self.poems = poems
        self.interactions = interactions
        self.poem_ids = [p['id'] for p in poems]
        self.poem_id_map = {pid: i for i, pid in enumerate(self.poem_ids)}

        self._build_rating_matrix(interactions)
        self._compute_item_similarity()
        self._compute_user_similarity()

        # The topic matrix must exist before the hybrid/enhanced similarities
        # are computed, otherwise both silently fall back to rating-only data.
        self.bertopic.fit(poems)
        self._compute_hybrid_user_similarity()
        self._compute_enhanced_similarity()

        self._filtered_hybrid = None
        self._filtered_hybrid_state = None

    def _build_rating_matrix(self, interactions):
        """Create the users x items matrix; interactions on unknown items are ignored."""
        users = sorted(set(i['user_id'] for i in interactions))
        self.user_id_map = {uid: i for i, uid in enumerate(users)}
        R = np.zeros((len(users), len(self.poem_ids)))
        for inter in interactions:
            u = self.user_id_map[inter['user_id']]
            p = self.poem_id_map.get(inter['poem_id'])
            if p is not None:
                R[u, p] = inter.get('rating', 3.0)
        self.rating_matrix = R

    @staticmethod
    def _pairwise_pearson(vectors):
        """Return Pearson correlations between all row pairs of ``vectors``.

        For each pair only positions where both rows are > 0 are used and the
        means are taken over those positions. Pairs without a common position
        get 0; the diagonal is 1.
        """
        n = vectors.shape[0]
        observed = vectors > 0
        sim = np.zeros((n, n))
        for i in range(n):
            sim[i, i] = 1.0
            for j in range(i + 1, n):
                both = observed[i] & observed[j]
                if not both.any():
                    continue
                vi = vectors[i, both]
                vj = vectors[j, both]
                vi = vi - vi.mean()
                vj = vj - vj.mean()
                # The small constant avoids division by zero for constant vectors.
                s = float((vi * vj).sum() / (np.sqrt((vi**2).sum()) * np.sqrt((vj**2).sum()) + 1e-8))
                sim[i, j] = sim[j, i] = s
        return sim

    def _compute_item_similarity(self):
        """Item-item Pearson similarity over co-rating users."""
        self.item_similarity = self._pairwise_pearson(np.ascontiguousarray(self.rating_matrix.T))

    def _compute_user_similarity(self):
        """User-user Pearson similarity over co-rated items."""
        self.user_similarity = self._pairwise_pearson(self.rating_matrix)

    @staticmethod
    def _min_max_normalize(matrix):
        """Rescale ``matrix`` to [0, 1]; a (near-)constant matrix becomes all zeros."""
        mn, mx = matrix.min(), matrix.max()
        if mx - mn < 1e-8:
            return np.zeros_like(matrix)
        return (matrix - mn) / (mx - mn)

    def _compute_hybrid_user_similarity(self):
        """Blend rating similarity with topic-profile similarity.

        ``hybrid = alpha * rating_sim + (1 - alpha) * topic_sim``, where a
        user's topic profile is the rating-weighted sum of the topic vectors
        of the items they rated. Without a topic matrix the rating similarity
        is used unchanged.
        """
        rating_sim = self.user_similarity.copy()
        if self.bertopic is not None and self.bertopic.topic_matrix is not None:
            n_users = self.rating_matrix.shape[0]
            topic_dim = self.bertopic.topic_matrix.shape[1]
            user_topic = np.zeros((n_users, topic_dim))
            for u in range(n_users):
                rated = self.rating_matrix[u] > 0
                if rated.sum() > 0:
                    ratings = self.rating_matrix[u, rated]
                    vecs = self.bertopic.topic_matrix[rated]
                    w = ratings / (ratings.sum() + 1e-8)
                    user_topic[u] = np.sum(vecs * w[:, None], axis=0)
            topic_sim = cosine_similarity(user_topic)
            self.hybrid_similarity = self.hybrid_alpha * rating_sim + (1 - self.hybrid_alpha) * topic_sim
        else:
            self.hybrid_similarity = rating_sim

    def _compute_enhanced_similarity(self):
        """Combine min-max normalized item-CF and topic similarities.

        ``enhanced = item_cf_weight * norm(item_sim) + topic_weight * norm(topic_sim)``.
        If the two matrices differ in size, both are cropped to the smaller one.
        """
        item_sim = self.item_similarity
        n_items = len(self.poem_ids)
        if self.bertopic is not None and self.bertopic.topic_matrix is not None:
            topic_sim = cosine_similarity(self.bertopic.topic_matrix)
        else:
            topic_sim = np.zeros((n_items, n_items))
            np.fill_diagonal(topic_sim, 1.0)

        if item_sim.shape != topic_sim.shape:
            m = min(item_sim.shape[0], topic_sim.shape[0])
            item_sim = item_sim[:m, :m]
            topic_sim = topic_sim[:m, :m]
            n_items = m

        item_norm = self._min_max_normalize(item_sim[:n_items, :n_items])
        topic_norm = self._min_max_normalize(topic_sim[:n_items, :n_items])
        self.enhanced_similarity = self.item_cf_weight * item_norm + self.topic_weight * topic_norm

    def _apply_confidence_filter(self, sim_matrix):
        """Shrink low-support similarities and zero out weak ones.

        For every user pair ``i != j`` with ``common`` co-rated items:
        if ``common < min_common_ratings`` the similarity is multiplied by
        ``common / (common + 10)``; afterwards any pair whose absolute
        similarity (judged on the upper-triangle entry) is below
        ``min_similarity`` is set to 0 in both directions. The diagonal and
        the input matrix are left untouched.

        Vectorized: the co-rating counts come from one matrix product instead
        of a Python loop over all user pairs.
        """
        rated = (self.rating_matrix > 0).astype(float)
        # Counts are small integers, so the float product is exact.
        common = rated @ rated.T
        shrink = np.where(common < self.min_common_ratings, common / (common + 10), 1.0)
        filtered = sim_matrix * shrink
        weak = np.triu(np.abs(filtered) < self.min_similarity, k=1)
        filtered[weak | weak.T] = 0
        np.fill_diagonal(filtered, np.diag(sim_matrix))
        return filtered

    def _filtered_hybrid_similarity(self):
        """Return the confidence-filtered hybrid similarity, computing it at most once.

        The filtered matrix does not depend on the target user, so it is
        cached and only rebuilt when the hybrid matrix, the rating matrix or
        one of the two thresholds has been replaced.
        """
        state = getattr(self, '_filtered_hybrid_state', None)
        cached = getattr(self, '_filtered_hybrid', None)
        fresh = (
            cached is not None
            and state is not None
            and state[0] is self.hybrid_similarity
            and state[1] is self.rating_matrix
            and state[2] == self.min_similarity
            and state[3] == self.min_common_ratings
        )
        if not fresh:
            cached = self._apply_confidence_filter(self.hybrid_similarity)
            self._filtered_hybrid = cached
            self._filtered_hybrid_state = (
                self.hybrid_similarity,
                self.rating_matrix,
                self.min_similarity,
                self.min_common_ratings,
            )
        return cached

    def _get_top_k_neighbors(self, target_idx, sim_matrix, k=None):
        """Return ``(index, similarity)`` pairs of the ``k`` most similar rows with similarity > 0."""
        if k is None:
            k = self.k_neighbors
        sims = sim_matrix[target_idx].copy()
        # Never pick the target as its own neighbour.
        sims[target_idx] = -np.inf
        idx = np.argsort(sims)[-k:]
        vals = sims[idx]
        m = vals > 0
        return list(zip(idx[m], vals[m]))

    def _get_user_cf_scores(self, user_interactions, exclude_ids):
        """Predict ratings for unseen items from the nearest users.

        Prediction: ``target_mean + sum(sim * (r - neighbour_mean)) / sum(|sim|)``
        over neighbours that rated the item, with means taken from the
        training rating matrix. The target user is read from the first
        interaction's ``user_id``.

        Returns:
            ``{item_id: predicted_rating}``; empty when the user is unknown,
            has no usable history or has no positive neighbour.
        """
        if not user_interactions or self.hybrid_similarity is None:
            return {}
        uid = user_interactions[0]['user_id']
        target_idx = self.user_id_map.get(uid)
        if target_idx is None:
            return {}

        target_rated = {
            x['poem_id']: x.get('rating', 3.0)
            for x in user_interactions
            if x['poem_id'] in self.poem_id_map
        }
        if not target_rated:
            return {}

        neighbors = self._get_top_k_neighbors(target_idx, self._filtered_hybrid_similarity())
        if not neighbors:
            return {}

        # Without a training rating for the target there is no mean to offset
        # from, so no item can receive a score.
        target_mask = self.rating_matrix[target_idx] > 0
        if target_mask.sum() == 0:
            return {}
        target_mean = self.rating_matrix[target_idx, target_mask].mean()

        # Neighbour means are item-independent: compute them once. Neighbours
        # without any rating can never contribute and are dropped.
        prepared = []
        for nidx, sim in neighbors:
            neigh_rated = self.rating_matrix[nidx] > 0
            if neigh_rated.sum() > 0:
                prepared.append((nidx, sim, self.rating_matrix[nidx, neigh_rated].mean()))

        scores = {}
        for item_idx, item_id in enumerate(self.poem_ids):
            if item_id in exclude_ids or item_id in target_rated:
                continue
            ws, ss = 0.0, 0.0
            for nidx, sim, neigh_mean in prepared:
                nr = self.rating_matrix[nidx, item_idx]
                if nr > 0:
                    ws += sim * (nr - neigh_mean)
                    ss += abs(sim)
            if ss > 0:
                scores[item_id] = target_mean + ws / ss
        return scores

    def recommend(self, user_interactions, all_interactions=None, top_k=10):
        """Recommend up to ``top_k`` unseen items for one user.

        Args:
            user_interactions: The user's history (``user_id``, ``poem_id``,
                optional ``rating``); these items are always excluded.
            all_interactions: Accepted for interface compatibility; not read.
            top_k: Maximum number of recommendations.

        Returns:
            ``{'poem_id', 'score'}`` dicts with positive score, best first.
            Falls back to rating-sum popularity when the model is not fitted
            or the history contains no known item.
        """
        if self.enhanced_similarity is None:
            return self._popular_fallback(top_k)
        exclude_ids = set(x['poem_id'] for x in user_interactions)

        user_ratings = np.zeros(len(self.poem_ids))
        for x in user_interactions:
            idx = self.poem_id_map.get(x['poem_id'])
            if idx is not None:
                user_ratings[idx] = x.get('rating', 3.0)

        rated = np.where(user_ratings > 0)[0]
        if len(rated) == 0:
            return self._popular_fallback(top_k, exclude_ids)

        rr = user_ratings[rated]
        item_scores = np.zeros(len(self.poem_ids))
        for i in range(len(self.poem_ids)):
            if user_ratings[i] > 0:
                continue
            neigh = self.enhanced_similarity[i, rated]
            m = neigh > 0
            if m.sum() > 0:
                item_scores[i] = np.dot(neigh[m], rr[m]) / (np.abs(neigh[m]).sum() + 1e-8)

        user_cf_scores = self._get_user_cf_scores(user_interactions, exclude_ids)
        # Loop-invariant: the normalizer is the same for every item.
        max_ucf = max(user_cf_scores.values()) if user_cf_scores else 1

        # NOTE(review): ``item_score`` is a similarity-weighted mean of the
        # user's ratings, while ``u_norm`` is divided by the largest user-CF
        # prediction, so the two blended terms are on different scales. Kept
        # unchanged to stay aligned with the system implementation -- confirm
        # whether this is intended.
        results = []
        for i, item_score in enumerate(item_scores):
            pid = self.poem_ids[i]
            if pid in exclude_ids:
                continue
            score = item_score
            if pid in user_cf_scores and self.user_cf_weight > 0:
                u_norm = user_cf_scores[pid] / max_ucf if max_ucf > 0 else 0
                score = (1 - self.user_cf_weight) * score + self.user_cf_weight * u_norm
            if score > 0:
                results.append({'poem_id': pid, 'score': float(score)})
        results.sort(key=lambda x: x['score'], reverse=True)
        return results[:top_k]

    def _popular_fallback(self, top_k, exclude_ids=None):
        """Return the ``top_k`` items with the largest summed training rating."""
        exclude_ids = exclude_ids or set()
        c = Counter()
        for x in self.interactions:
            if x['poem_id'] not in exclude_ids:
                c[x['poem_id']] += x.get('rating', 3.0)
        return [{'poem_id': pid, 'score': float(s)} for pid, s in c.most_common(top_k)]


In [None]:
# 7) 训练四个方法
# Column order shared by the two CF baselines.
item_ids=[m['id'] for m in movies]

# The content model is fitted on item text only; the CF models on the training split.
cb=ContentBasedRecommender(); cb.fit(movies)
item_cf=ItemBasedCFRecommender(); item_cf.fit(train, item_ids)
user_cf=UserBasedCFRecommender(); user_cf.fit(train, item_ids)
# The hybrid model takes both the items (for embeddings) and the training split.
bert_enhanced=BERTopicEnhancedCFStandalone(item_cf_weight=0.5, user_cf_weight=0.3, topic_weight=0.2)
bert_enhanced.fit(movies, train)

# Display name -> fitted model; the evaluation cell dispatches on these names.
methods={
    'CB': cb,
    'Item-CF': item_cf,
    'User-CF': user_cf,
    'BERT-Enhanced': bert_enhanced,
}
print('models ready')


In [None]:
# 8) Top-N 评估（P@K / R@K / Hit@K）
def evaluate_topn(method_name, model, user_train, user_test, ks=(5,10), positive_threshold=4.0, all_train=None):
    """Compute mean Precision@K, Recall@K and Hit@K over evaluable users.

    A user is evaluated when they have training interactions and at least one
    test item rated ``>= positive_threshold``. Precision divides by ``k`` even
    when the model returns fewer than ``k`` items.

    Args:
        method_name: ``'BERT-Enhanced'`` selects the call form
            ``recommend(history, all_train, top_k=...)``; any other name uses
            ``recommend(history, exclude_ids=..., top_k=...)``.
        model: Object exposing the matching ``recommend`` method.
        user_train: Mapping user id -> training interactions.
        user_test: Mapping user id -> test interactions.
        ks: Cut-offs to report.
        positive_threshold: Minimum test rating that counts as relevant.
        all_train: Passed through to the ``'BERT-Enhanced'`` call form.

    Returns:
        Dict with ``Precision@k``, ``Recall@k`` and ``Hit@k`` for every ``k``
        (0.0 when no user was evaluable).
    """
    collected = {k: {'precision': [], 'recall': [], 'hit': []} for k in ks}

    for uid in sorted(set(user_train.keys()) & set(user_test.keys())):
        history = user_train[uid]
        relevant = {
            x['poem_id']
            for x in user_test[uid]
            if x.get('rating', 0) >= positive_threshold
        }
        if not (history and relevant):
            continue

        seen = {x['poem_id'] for x in history}
        depth = max(ks)
        if method_name == 'BERT-Enhanced':
            recs = model.recommend(history, all_train, top_k=depth)
        else:
            recs = model.recommend(history, exclude_ids=seen, top_k=depth)

        ranked_ids = [rec['poem_id'] for rec in recs]
        for k in ks:
            n_hits = len(relevant.intersection(ranked_ids[:k]))
            bucket = collected[k]
            bucket['precision'].append(n_hits / k)
            bucket['recall'].append(n_hits / len(relevant))
            bucket['hit'].append(1.0 if n_hits > 0 else 0.0)

    def average(values):
        # An empty list means no user was evaluable for this cut-off.
        return float(np.mean(values)) if values else 0.0

    summary = {}
    for k in ks:
        for label, field in (('Precision', 'precision'), ('Recall', 'recall'), ('Hit', 'hit')):
            summary[f'{label}@{k}'] = average(collected[k][field])
    return summary

# Evaluate every trained method on the same split with the same thresholds.
rows=[]
for name,model in methods.items():
    m=evaluate_topn(name, model, user_train, user_test, ks=cfg.top_ks, positive_threshold=cfg.positive_threshold, all_train=train)
    rows.append({'method':name, **m})

# Rank by precision at the largest configured cut-off instead of a hard-coded
# 'Precision@10', which raises KeyError when cfg.top_ks does not contain 10.
sort_metric=f'Precision@{max(cfg.top_ks)}'
df_results=pd.DataFrame(rows).sort_values(sort_metric, ascending=False).reset_index(drop=True)
df_results


In [None]:
# 9) 可视化 + 导出结果
# Output locations are prepared first so the figure can be saved before it is shown.
os.makedirs('./outputs', exist_ok=True)
csv_path='./outputs/movielens_results_colab.csv'
json_path='./outputs/movielens_results_colab.json'
png_path='./outputs/movielens_precision_colab.png'

# Plot precision at the largest configured cut-off ('Precision@10' with the
# default config) rather than a hard-coded column name.
plot_metric=f'Precision@{max(cfg.top_ks)}'

# Build the bar chart once: save it to disk, then display it. Saving has to
# happen before plt.show(), because the figure may be released once shown.
plt.figure(figsize=(8,4.5))
bars=plt.bar(df_results['method'], df_results[plot_metric])
plt.title(f'MovieLens-100K {plot_metric} Comparison')
plt.ylabel(plot_metric)
for b,v in zip(bars, df_results[plot_metric]):
    # Value label centred above each bar.
    plt.text(b.get_x()+b.get_width()/2, b.get_height(), f'{v:.3f}', ha='center', va='bottom')
plt.tight_layout()
plt.savefig(png_path,dpi=150)
plt.show()
plt.close()

df_results.to_csv(csv_path, index=False, encoding='utf-8')

# JSON bundle: configuration, dataset/split sizes and the per-method metrics.
payload={
    'config': cfg.__dict__,
    'dataset_stats': {
        'n_users': len(set(x['user_id'] for x in ratings)),
        'n_items': len(movies),
        'n_ratings': len(ratings),
        'n_train': len(train),
        'n_test': len(test),
    },
    'results': df_results.to_dict(orient='records')
}
with open(json_path,'w',encoding='utf-8') as f:
    json.dump(payload,f,ensure_ascii=False,indent=2)

print('saved:', csv_path, json_path, png_path)


## 说明
- 该 notebook 不依赖 `backend/` 目录导入，可直接独立运行。  
- 其中 `BERTopicEnhancedCFStandalone` 的关键流程和融合逻辑与系统实现保持一致。  
- 若想进一步“逐行一致”，可把系统类定义直接复制到本 notebook 对应单元替换。
