In [1]:
import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse import coo_matrix
import time
from itertools import combinations
import collections
import math
import random

np.random.seed(seed=17)
# 读取.npy文件
file_path = '/home/s3963616/ADM/user_movie_rating.npy'
data = np.load(file_path)

# 打印数据的形状
print("数据形状:", data.shape)
print(data)


data[:, 2] = 1

user_ids = data[:, 0]
movie_ids = data[:, 1]
ratings = data[:, 2]

user_indices = np.array(user_ids) - 1
movie_indices = np.array(movie_ids) - 1

# Create sparse matrix using CSR format
user_movie_matrix = csr_matrix((np.ones_like(user_ids), (user_indices, movie_indices)))

# # 获取用户和电影的最大ID
# max_user_id = np.max(user_ids)
# max_movie_id = np.max(movie_ids)

# # 获取用户和电影的最大ID
# max_user_id = np.max(user_ids)
# max_movie_id = np.max(movie_ids)

# # 创建一个稀疏矩阵（电影 x 用户）
# user_movie_sparse = coo_matrix((ratings, (user_ids - 1, movie_ids - 1)), shape=(max_user_id, max_movie_id))

# # 将稀疏矩阵转换为密集矩阵
# user_movie_matrix = user_movie_sparse.tocsr()

# 显示矩阵的一部分进行检查
print(user_movie_matrix, user_movie_matrix.shape)

# 检查用户1对电影30的评分
user_id_to_check = 1
movie_id_to_check = 157

# 由于矩阵索引是零基的，我们需要减去1
rating_check = user_movie_matrix[user_id_to_check - 1, movie_id_to_check - 1]

print(f"用户 {user_id_to_check} 对电影 {movie_id_to_check} 的评分是：{rating_check}")

数据形状: (65225506, 3)
[[     1     30      3]
 [     1    157      3]
 [     1    173      4]
 ...
 [103703  17622      2]
 [103703  17627      4]
 [103703  17764      4]]
  (0, 29)	1
  (0, 156)	1
  (0, 172)	1
  (0, 174)	1
  (0, 190)	1
  (0, 196)	1
  (0, 240)	1
  (0, 294)	1
  (0, 298)	1
  (0, 328)	1
  (0, 360)	1
  (0, 444)	1
  (0, 456)	1
  (0, 467)	1
  (0, 493)	1
  (0, 500)	1
  (0, 527)	1
  (0, 563)	1
  (0, 579)	1
  (0, 657)	1
  (0, 704)	1
  (0, 705)	1
  (0, 722)	1
  (0, 787)	1
  (0, 824)	1
  :	:
  (103702, 16551)	1
  (103702, 16603)	1
  (103702, 16667)	1
  (103702, 16706)	1
  (103702, 16739)	1
  (103702, 16764)	1
  (103702, 16829)	1
  (103702, 16881)	1
  (103702, 16921)	1
  (103702, 16953)	1
  (103702, 17052)	1
  (103702, 17087)	1
  (103702, 17148)	1
  (103702, 17153)	1
  (103702, 17156)	1
  (103702, 17250)	1
  (103702, 17307)	1
  (103702, 17329)	1
  (103702, 17345)	1
  (103702, 17423)	1
  (103702, 17478)	1
  (103702, 17620)	1
  (103702, 17621)	1
  (103702, 17626)	1
  (103702, 17763)	1 

In [2]:
num_hyperplanes = 100

# 创建随机投影矩阵
num_users = user_movie_matrix.shape[1]  # 获取用户数量
projection_matrix = np.random.randn(num_users, num_hyperplanes)

# 应用投影矩阵
# 注意：由于 user_movie_sparse 是 CSR 格式，这个矩阵乘法会非常高效
projected_matrix = user_movie_matrix.dot(projection_matrix)

# 生成哈希签名
user_hash_signatures = np.where(projected_matrix > 0, 1, 0)

# 显示哈希签名的一部分进行检查
print(user_hash_signatures, user_hash_signatures.shape)

[[1 1 0 ... 1 1 0]
 [0 1 0 ... 0 0 0]
 [0 1 0 ... 1 1 0]
 ...
 [0 1 0 ... 1 1 1]
 [0 1 0 ... 0 0 0]
 [0 1 0 ... 1 0 0]] (103703, 100)


In [4]:
b = 6
r = 15
num_buckets = 40000

# Initialize hash bucket
buckets = {i: [] for i in range(num_buckets)}
for user_id, signature in enumerate(user_hash_signatures):
    for band in range(b):
        start_row = band * r
        end_row = start_row + r
        band_signature = tuple(signature[start_row:end_row])
        # Same signature has same hash value 
        hash_value = hash(band_signature) % num_buckets
        # Add user id into bucket
        buckets[hash_value].append(user_id)


def calculate_adjusted_cosine_similarity(p1, p2):
    # 计算点积
    dot_product = np.dot(p1, p2)
    # 计算两个向量的欧几里得范数
    norm_p1 = np.linalg.norm(p1)
    norm_p2 = np.linalg.norm(p2)
    # 计算余弦相似度
    similarity = dot_product / (norm_p1 * norm_p2)
    # 限制余弦相似度的范围以防止计算错误
    similarity = np.clip(similarity, -1, 1)
    # 将余弦值转换为角度
    angle_in_degrees = np.arccos(similarity) * (180 / np.pi)
    # 根据教科书方法调整余弦相似度
    adjusted_similarity = 1 - angle_in_degrees / 180
    return adjusted_similarity

num_movies = np.max(data[:, 1])

def original_rating_vector(user_id, data, num_movies):
    user_ratings = data[data[:, 0] == user_id]
    movie_indices = user_ratings[:, 1].astype(int) - 1  # 电影ID从1开始，转换为从0开始的索引
    rating_vector = np.bincount(movie_indices, weights=user_ratings[:, 2], minlength=num_movies)
    return rating_vector

        
print(f"总共有 {len(buckets)} 个桶")

already_compared = set()
bucket_counter = 0
similar_users = []
time_limit = 28 * 60 

# 开始计时
start_time = time.time()

for i, (bucket, users) in enumerate(buckets.items()):
    if i >= 20:  # 只打印前20个桶
        break
    print(f"桶 {bucket}: 用户数量 {len(users)}, 用户ID列表 {users}")
    

for bucket in buckets.values():
    for user1, user2 in combinations(bucket, 2):
        if user1 == user2:
            continue
        if time.time() - start_time > time_limit:
            print("已达到时间限制，停止计算。")
            break
        if (user1, user2) not in already_compared and (user2, user1) not in already_compared:
            similarity = calculate_adjusted_cosine_similarity(user_hash_signatures[user1, :], user_hash_signatures[user2, :])
            already_compared.add((user1, user2))
            if similarity > 0.73:
                # ori_user1 = original_rating_vector(user1+1, data, num_movies)
                # ori_user2 = original_rating_vector(user2+1, data, num_movies)
                ori_user1 = user_movie_matrix.getrow(user1).toarray().ravel()
                ori_user2 = user_movie_matrix.getrow(user2).toarray().ravel()
                real_similarity = calculate_adjusted_cosine_similarity(ori_user1, ori_user2)
                if real_similarity > 0.73:
                    similar_users.append(tuple(sorted([user1, user2])))
                    print(f"用户 {user1} 和用户 {user2} 的余弦相似度为: {similarity}")
    if time.time() - start_time > time_limit:
        break
    
unique_similar_users = list(set(similar_users))
print(len(unique_similar_users))

with open('/home/s3963616/ADM/similar_users_2.txt', 'w') as file:
    for pair in unique_similar_users:
        file.write(f"{pair[0]}, {pair[1]}\n")

总共有 40000 个桶
桶 0: 用户数量 0, 用户ID列表 []
桶 1: 用户数量 0, 用户ID列表 []
桶 2: 用户数量 8, 用户ID列表 [40541, 72671, 74703, 82332, 85540, 89817, 95021, 98141]
桶 3: 用户数量 4, 用户ID列表 [48449, 49009, 60303, 94942]
桶 4: 用户数量 0, 用户ID列表 []
桶 5: 用户数量 0, 用户ID列表 []
桶 6: 用户数量 419, 用户ID列表 [52, 140, 730, 886, 1174, 1542, 1651, 1882, 1978, 2283, 2683, 3159, 3455, 3737, 3847, 3858, 3931, 4164, 4270, 4655, 4886, 4944, 5473, 6347, 6351, 6433, 6767, 7056, 7426, 8227, 8317, 8368, 8581, 8751, 8769, 8894, 9555, 9589, 9737, 9812, 9947, 10208, 10654, 10861, 11061, 11980, 12345, 12578, 12848, 13650, 14039, 14500, 14542, 14807, 15344, 15706, 15803, 15808, 16324, 16512, 16552, 16725, 16732, 17167, 17528, 17775, 18297, 18384, 18744, 18782, 18794, 19132, 19183, 19415, 19469, 19565, 20237, 20392, 20619, 20868, 21645, 21663, 21808, 22112, 22561, 22724, 22777, 23034, 23573, 23644, 23672, 24916, 25314, 25642, 26108, 26157, 26217, 26239, 26645, 26653, 26896, 27226, 27509, 27572, 27721, 27815, 27937, 28052, 28095, 28114, 28258, 28297, 28665, 2