In [7]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import os
import warnings
warnings.filterwarnings('ignore')
from tqdm import tqdm
import pickle

In [2]:
os.chdir('f:/NTU Learn/DATA MINING/DMproject')
os.getcwd()

'f:\\NTU Learn\\DATA MINING\\DMproject'

In [3]:
# 加载示例数据 (假设 movielens 数据已经加载为 DataFrame)
ratings = pd.read_csv('data/ml-20m/ratings.csv')

# 创建用户-电影评分矩阵 (用户为行，电影为列)
print('building user-item matrix...')
user_item_matrix = ratings.pivot(index='userId', columns='movieId', values='rating').fillna(0)

building user-item matrix...


ValueError: negative dimensions are not allowed

In [4]:
user_item_matrix.shape

(138493, 26744)

## UserCF

In [4]:
def calculate_user_similarity_batch(user_item_matrix, batch_size):
    users = user_item_matrix.index
    num_users = len(users)
    
    user_similarity_dict = {}
    
    with tqdm(total=num_users // batch_size, desc="Calculating user similarities", unit="batch") as pbar:
        for start in range(0, num_users, batch_size):
            end = min(start + batch_size, num_users)
            batch_users = user_item_matrix.iloc[start:end]
            
            similarity_matrix = cosine_similarity(batch_users, user_item_matrix)
            
            for idx, user_id in enumerate(batch_users.index):
                
                user_similarity_dict[user_id] = {
                    other_user_id: similarity
                    for other_user_id, similarity in zip(user_item_matrix.index, similarity_matrix[idx])
                }
            
            pbar.update(1)
    
    return user_similarity_dict


batch_size = 100

user_similarity = calculate_user_similarity_batch(user_item_matrix, batch_size)

# 输出前5个用户的相似度
for user_id, similarities in list(user_similarity.items())[:5]:
    print(f"User {user_id} similarities: {similarities[:5]}")  # 仅展示前5个相似用户的相似度

Calculating user similarities:   1%|          | 13/1384 [08:40<15:14:11, 40.01s/batch]


KeyboardInterrupt: 

In [None]:
# 基于用户相似度推荐函数
def recommend_items(user_similarity, user_movie_matrix, user_id, top_n=5):
    user_index = user_movie_matrix.index.get_loc(user_id)
    
    # 获取用户的相似度向量
    user_similarities = user_similarity[user_id]
    
    # 计算加权评分 (相似用户的评分加权求和)
    user_ratings = user_movie_matrix.values
    weighted_ratings = np.dot(user_similarities, user_ratings) / np.array([np.abs(user_similarities).sum()])
    
    # 排序并获取用户没有评分过的电影
    user_watched = user_movie_matrix.loc[user_id] > 0
    recommendations = np.argsort(weighted_ratings[user_watched == False])[::-1][:top_n]
    
    # 返回推荐的电影ID
    return user_movie_matrix.columns[recommendations]

# 为某个用户推荐电影 (例如 userId 1)
recommendations = recommend_items(user_similarity, user_item_matrix, user_id=1, top_n=5)
print(f"Top 5 recommendations for user 1: {recommendations}")

## ItemCF

In [None]:
def calculate_item_similarity_batch(user_item_matrix, batch_size):
    items = user_item_matrix.columns
    num_items = len(items)
    
    item_similarity_dict = {}
    
    with tqdm(total=num_items // batch_size, desc="Calculating item similarities", unit="batch") as pbar:
        for start in range(0, num_items, batch_size):
            end = min(start + batch_size, num_items)
            item_user_matrix = user_item_matrix.T
            batch_items = item_user_matrix.iloc[start:end]
            
            similarity_matrix = cosine_similarity(batch_items, item_user_matrix)
            
            for idx, user_id in enumerate(batch_items.index):
                
                item_similarity_dict[user_id] = {
                    other_user_id: similarity
                    for other_user_id, similarity in zip(item_user_matrix.index, similarity_matrix[idx])
                }
            
            pbar.update(1)
    
    return item_similarity_dict


batch_size = 1000

user_similarity = calculate_user_similarity_batch(user_item_matrix, batch_size)

# 输出前5个用户的相似度
for user_id, similarities in list(user_similarity.items())[:5]:
    print(f"User {user_id} similarities: {similarities[:5]}")  # 仅展示前5个相似用户的相似度