<h1 style="color:darkblue">Problem 1</h1>

In [4]:
# 导入必要的库
import pandas as pd
import numpy as np
from scipy.spatial.distance import cosine

ratings_path = 'u.data'  
users_path = 'u.user'    
items_path = 'u.item'    

# 读取评分数据
ratings_cols = ['user_id', 'item_id', 'rating', 'timestamp']
ratings = pd.read_csv(ratings_path, sep='\t', names=ratings_cols)

# 创建用户-电影评分矩阵
utility_matrix = pd.pivot_table(ratings, values='rating', index='user_id', columns='item_id')
print(utility_matrix)
# 计算中心化的用户评分
# 从每个用户的评分中减去该用户的平均评分
user_means = utility_matrix.mean(axis=1)
centered_utility_matrix = utility_matrix.subtract(user_means, axis=0)

# 填充缺失值为0（用户未评分的电影）
centered_utility_matrix = centered_utility_matrix.fillna(0)

# 计算用户1与所有其他用户的余弦相似度
user_1 = centered_utility_matrix.loc[1]
similarities = {}

for user_id in centered_utility_matrix.index:
    if user_id != 1:  # 不需要计算用户1与自己的相似度
        user_i = centered_utility_matrix.loc[user_id]
        # 注意：使用1-cosine是因为scipy的cosine计算的是余弦距离而不是余弦相似度
        similarity = 1 - cosine(user_1, user_i)
        similarities[user_id] = similarity

# 找出与用户1最相似的10个用户
most_similar_users = sorted(similarities.items(), key=lambda x: x[1], reverse=True)[:10]
print("The 10 users most similar to User 1：")
for user, similarity in most_similar_users:
    print(f"User ID: {user}, Similarity: {similarity:.4f}")

# 获取这10个最相似用户对电影508的评分
similar_user_ids = [user for user, _ in most_similar_users]
similar_users_ratings = []
similar_users_ratings_with_id = []
similar_users_ratings_with_details = []

for user_id, similarity in most_similar_users:
    if 508 in utility_matrix.columns and not pd.isna(utility_matrix.loc[user_id, 508]):
        rating = utility_matrix.loc[user_id, 508]
        similar_users_ratings.append(rating)
        similar_users_ratings_with_id.append((user_id, rating))
        similar_users_ratings_with_details.append((user_id, rating, similarity))

# 展示相似用户对电影508的评分详情
print("\nRating details of similar users for movie 508：")
if similar_users_ratings_with_id:
    for user_id, rating in similar_users_ratings_with_id:
        print(f"User ID: {user_id}, Rating for Movie 508: {rating}, Average User Rating: {user_means[user_id]:.2f}")
else:
    print("No similar users have rated movie 508")

# 计算对电影508的简单平均预期评分
if similar_users_ratings:
    expected_rating = sum(similar_users_ratings) / len(similar_users_ratings)
    print(f"\nBased on the simple average prediction score, User 1's expected rating for Movie 508: {expected_rating:.4f}")
    print(f"Number of similar users participating in the prediction: {len(similar_users_ratings)}")
else:
    print("\nNo similar users have rated movie 508, so predictions cannot be made.")

# 计算对电影508的相似度加权预期评分
if similar_users_ratings_with_details:
    weighted_sum = 0
    similarity_sum = 0
    
    print("\nWeighted details of similar user ratings:")
    for user_id, rating, similarity in similar_users_ratings_with_details:
        weighted_rating = rating * similarity
        weighted_sum += weighted_rating
        similarity_sum += similarity
        print(f"User ID: {user_id}, Rating: {rating}, Similarity: {similarity:.4f}, Weighted Scoring: {weighted_rating:.4f}")
    
    if similarity_sum > 0:  # 避免除以零
        weighted_expected_rating = weighted_sum / similarity_sum
        print(f"\nPredicted score based on similarity weighting, User 1's expected rating for movie 508: {weighted_expected_rating:.4f}")
    else:
        print("\nThe total similarity is zero, making it impossible to calculate the weighted expected score.")
else:
    print("\nThere are no similar users who have rated movie 508, so it cannot be predicted.")

# 查看用户1是否已经对电影508进行了评分
if 508 in utility_matrix.columns and not pd.isna(utility_matrix.loc[1, 508]):
    actual_rating = utility_matrix.loc[1, 508]
    print(f"User 1's actual rating for movie 508: {actual_rating}")
else:
    print("User 1 has not yet rated movie 508.")

# 比较两种评分方法
if similar_users_ratings and similar_users_ratings_with_details and similarity_sum > 0:
    print("\nComparison of Scoring Methods：")
    print(f"Simple average rating: {expected_rating:.4f}")
    print(f"Similarity Weighted Scoring: {weighted_expected_rating:.4f}")
    print(f"Difference: {abs(expected_rating - weighted_expected_rating):.4f}")

item_id  1     2     3     4     5     6     7     8     9     10    ...  \
user_id                                                              ...   
1         5.0   3.0   4.0   3.0   3.0   5.0   4.0   1.0   5.0   3.0  ...   
2         4.0   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   2.0  ...   
3         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
4         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
5         4.0   3.0   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
...       ...   ...   ...   ...   ...   ...   ...   ...   ...   ...  ...   
939       NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   5.0   NaN  ...   
940       NaN   NaN   NaN   2.0   NaN   NaN   4.0   5.0   3.0   NaN  ...   
941       5.0   NaN   NaN   NaN   NaN   NaN   4.0   NaN   NaN   NaN  ...   
942       NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
943       NaN   5.0   NaN   NaN   NaN   NaN   NaN   NaN   3.0   NaN  ...   

item_id  16

<h1 style="color:darkblue">Problem 2</h1>

In [8]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# 加载数据
ratings_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv('u.data', sep='\t', names=ratings_cols, encoding='latin-1')

movies_cols = ['movie_id', 'title', 'release_date', 'video_release_date', 'imdb_url']
movies = pd.read_csv('u.item', sep='|', names=movies_cols, usecols=range(5), encoding='latin-1')

# 创建用户-电影评分矩阵
user_item_matrix = ratings.pivot_table(index='user_id', columns='movie_id', values='rating')

# 中心化处理（按用户均值）
user_means = user_item_matrix.mean(axis=1)
centered_matrix = user_item_matrix.sub(user_means, axis=0)

# 获取用户200和用户15的评分档案
user_200_profile = centered_matrix.loc[200].dropna()
user_15_profile = centered_matrix.loc[15].dropna()

# 获取电影95的评分
movie_95_ratings = centered_matrix[95].dropna()

# 找出共同评分的电影
common_movies_200 = user_200_profile.index.intersection(movie_95_ratings.index)
common_movies_15 = user_15_profile.index.intersection(movie_95_ratings.index)

# 相似性计算函数
def calculate_similarity(user_profile, movie_vector, common_items):
    user_vector = user_profile[common_items].values.reshape(1, -1)
    movie_vector_values = movie_vector[common_items].values.reshape(1, -1)
    
    cosine_sim = cosine_similarity(user_vector, movie_vector_values)[0][0]
    cosine_dist = 1 - cosine_sim
    return cosine_sim, cosine_dist

# 计算用户200和用户15与电影95的相似性
cosine_sim_200, cosine_dist_200 = calculate_similarity(user_200_profile, centered_matrix[95], common_movies_200)
cosine_sim_15, cosine_dist_15 = calculate_similarity(user_15_profile, centered_matrix[95], common_movies_15)

# 输出结果
print("User 200:")
print(f"Cosine Similarity: {cosine_sim_200:.4f}")
print(f"Cosine Distance: {cosine_dist_200:.4f}")

print("\nUser 15:")
print(f"Cosine Similarity: {cosine_sim_15:.4f}")
print(f"Cosine Distance: {cosine_dist_15:.4f}")

# 确定推荐系统应该向哪位用户推荐电影95
if cosine_sim_200 > cosine_sim_15:
    print("\nThe recommendation system should recommend movie 95 to user 200 because it has a higher cosine similarity.")
else:
    print("\nThe recommendation system should suggest movie 95 to user 15, as the cosine similarity is higher.")

User 200:
Cosine Similarity: 0.0890
Cosine Distance: 0.9110

User 15:
Cosine Similarity: 0.1305
Cosine Distance: 0.8695

The recommendation system should suggest movie 95 to user 15, as the cosine similarity is higher.
