<h1 style="color:darkblue">Problem 1</h1>

In [4]:
# 导入必要的库
import pandas as pd
import numpy as np
from scipy.spatial.distance import cosine

# Locations of the MovieLens 100k files, relative to the working directory.
ratings_path = 'u.data'
users_path = 'u.user'
items_path = 'u.item'

# Prediction target and neighbourhood size (previously hard-coded in every
# statement below); the printed output is unchanged for these values.
TARGET_USER = 1
TARGET_MOVIE = 508
NUM_NEIGHBORS = 10

# Load the ratings: tab-separated, no header row.
ratings_cols = ['user_id', 'item_id', 'rating', 'timestamp']
ratings = pd.read_csv(ratings_path, sep='\t', names=ratings_cols)

# Build the user x movie utility matrix; cells without a rating are NaN.
utility_matrix = pd.pivot_table(ratings, values='rating', index='user_id', columns='item_id')
print(utility_matrix)

# Centre the ratings: subtract each user's own mean rating from their ratings.
user_means = utility_matrix.mean(axis=1)
centered_utility_matrix = utility_matrix.subtract(user_means, axis=0)

# Fill missing values (movies the user did not rate) with 0.
centered_utility_matrix = centered_utility_matrix.fillna(0)

# Cosine similarity between the target user and every other user.
user_1 = centered_utility_matrix.loc[TARGET_USER]
similarities = {}

for user_id in centered_utility_matrix.index:
    if user_id != TARGET_USER:  # no need to compare the target user with itself
        user_i = centered_utility_matrix.loc[user_id]
        # scipy's `cosine` is the cosine *distance*, hence 1 - distance.
        similarity = 1 - cosine(user_1, user_i)
        # A centred vector that is all zeros (every rating equal to the user's
        # mean) has zero norm, which can make the result NaN. NaN keys make
        # `sorted` return an unreliable order, so treat "undefined" as
        # "no similarity".
        if np.isnan(similarity):
            similarity = 0.0
        similarities[user_id] = similarity

# Pick the NUM_NEIGHBORS users most similar to the target user.
most_similar_users = sorted(similarities.items(), key=lambda x: x[1], reverse=True)[:NUM_NEIGHBORS]
print(f"The {NUM_NEIGHBORS} users most similar to User {TARGET_USER}：")
for user, similarity in most_similar_users:
    print(f"User ID: {user}, Similarity: {similarity:.4f}")

# Collect the ratings those neighbours gave to the target movie.
similar_user_ids = [user for user, _ in most_similar_users]
similar_users_ratings = []
similar_users_ratings_with_id = []
similar_users_ratings_with_details = []

# The column check does not depend on the neighbour, so evaluate it once.
movie_in_matrix = TARGET_MOVIE in utility_matrix.columns

for user_id, similarity in most_similar_users:
    if movie_in_matrix and not pd.isna(utility_matrix.loc[user_id, TARGET_MOVIE]):
        rating = utility_matrix.loc[user_id, TARGET_MOVIE]
        similar_users_ratings.append(rating)
        similar_users_ratings_with_id.append((user_id, rating))
        similar_users_ratings_with_details.append((user_id, rating, similarity))

# Show each neighbour's rating for the target movie.
print(f"\nRating details of similar users for movie {TARGET_MOVIE}：")
if similar_users_ratings_with_id:
    for user_id, rating in similar_users_ratings_with_id:
        print(f"User ID: {user_id}, Rating for Movie {TARGET_MOVIE}: {rating}, Average User Rating: {user_means[user_id]:.2f}")
else:
    print(f"No similar users have rated movie {TARGET_MOVIE}")

# Prediction 1: plain average of the neighbours' ratings.
if similar_users_ratings:
    expected_rating = sum(similar_users_ratings) / len(similar_users_ratings)
    print(f"\nBased on the simple average prediction score, User {TARGET_USER}'s expected rating for Movie {TARGET_MOVIE}: {expected_rating:.4f}")
    print(f"Number of similar users participating in the prediction: {len(similar_users_ratings)}")
else:
    print(f"\nNo similar users have rated movie {TARGET_MOVIE}, so predictions cannot be made.")

# Prediction 2: average of the neighbours' ratings weighted by similarity.
if similar_users_ratings_with_details:
    weighted_sum = 0
    similarity_sum = 0

    print("\nWeighted details of similar user ratings:")
    for user_id, rating, similarity in similar_users_ratings_with_details:
        weighted_rating = rating * similarity
        weighted_sum += weighted_rating
        similarity_sum += similarity
        print(f"User ID: {user_id}, Rating: {rating}, Similarity: {similarity:.4f}, Weighted Scoring: {weighted_rating:.4f}")

    if similarity_sum > 0:  # avoid dividing by zero
        weighted_expected_rating = weighted_sum / similarity_sum
        print(f"\nPredicted score based on similarity weighting, User {TARGET_USER}'s expected rating for movie {TARGET_MOVIE}: {weighted_expected_rating:.4f}")
    else:
        print("\nThe total similarity is zero, making it impossible to calculate the weighted expected score.")
else:
    print(f"\nThere are no similar users who have rated movie {TARGET_MOVIE}, so it cannot be predicted.")

# Check whether the target user has already rated the target movie.
if movie_in_matrix and not pd.isna(utility_matrix.loc[TARGET_USER, TARGET_MOVIE]):
    actual_rating = utility_matrix.loc[TARGET_USER, TARGET_MOVIE]
    print(f"User {TARGET_USER}'s actual rating for movie {TARGET_MOVIE}: {actual_rating}")
else:
    print(f"User {TARGET_USER} has not yet rated movie {TARGET_MOVIE}.")

# Compare the two predictions. `similarity_sum` only exists when the detail
# list is non-empty; the short-circuiting `and` keeps it from being read
# otherwise.
if similar_users_ratings and similar_users_ratings_with_details and similarity_sum > 0:
    print("\nComparison of Scoring Methods：")
    print(f"Simple average rating: {expected_rating:.4f}")
    print(f"Similarity Weighted Scoring: {weighted_expected_rating:.4f}")
    print(f"Difference: {abs(expected_rating - weighted_expected_rating):.4f}")

item_id  1     2     3     4     5     6     7     8     9     10    ...  \
user_id                                                              ...   
1         5.0   3.0   4.0   3.0   3.0   5.0   4.0   1.0   5.0   3.0  ...   
2         4.0   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   2.0  ...   
3         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
4         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
5         4.0   3.0   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
...       ...   ...   ...   ...   ...   ...   ...   ...   ...   ...  ...   
939       NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   5.0   NaN  ...   
940       NaN   NaN   NaN   2.0   NaN   NaN   4.0   5.0   3.0   NaN  ...   
941       5.0   NaN   NaN   NaN   NaN   NaN   4.0   NaN   NaN   NaN  ...   
942       NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
943       NaN   5.0   NaN   NaN   NaN   NaN   NaN   NaN   3.0   NaN  ...   

item_id  16

<h1 style="color:darkblue">Problem 2</h1>

In [10]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import zipfile
import os

def load_movielens_data():
    """Load the MovieLens 100k ratings and, if available, the movie titles.

    The function first tries to extract ``ml-100k.zip`` (in the current
    working directory) into ``ml-100k/``. Extraction is best-effort: if the
    archive is missing or unreadable, the data is assumed to be extracted
    already. The files are then looked up in ``ml-100k/`` and, failing that,
    in ``ml-100k/ml-100k/`` (the layout produced when the archive wraps its
    files in a top-level ``ml-100k/`` folder).

    Returns:
        tuple: ``(ratings, movies)`` where ``ratings`` is a DataFrame with the
        columns ``user_id``, ``item_id``, ``rating`` and ``timestamp``, and
        ``movies`` is a DataFrame with ``item_id`` and ``title``, or ``None``
        when the movie file could not be read.

    Raises:
        FileNotFoundError: if ``u.data`` cannot be found in either location.
    """
    print("Loading MovieLens 100k dataset...")

    # Best-effort extraction. Only the expected failure modes are handled so
    # that KeyboardInterrupt and programming errors are no longer swallowed.
    try:
        with zipfile.ZipFile('ml-100k.zip', 'r') as zip_ref:
            zip_ref.extractall('ml-100k/')
    except FileNotFoundError:
        print("Note: ml-100k.zip file not found, assuming data is already extracted")
    except (zipfile.BadZipFile, OSError) as exc:
        print(f"Note: could not extract ml-100k.zip ({exc}), assuming data is already extracted")

    # An archive whose members live under a top-level 'ml-100k/' folder ends
    # up in 'ml-100k/ml-100k/' after the extraction above; accept both layouts.
    data_dir = 'ml-100k'
    nested_dir = os.path.join(data_dir, 'ml-100k')
    if (not os.path.exists(os.path.join(data_dir, 'u.data'))
            and os.path.exists(os.path.join(nested_dir, 'u.data'))):
        data_dir = nested_dir

    # Ratings: tab-separated, no header row.
    ratings_cols = ['user_id', 'item_id', 'rating', 'timestamp']
    ratings = pd.read_csv(os.path.join(data_dir, 'u.data'), sep='\t', names=ratings_cols)

    # Movie metadata is optional; it is only used to display movie titles.
    try:
        movies_cols = ['item_id', 'title', 'release_date', 'video_release_date', 'imdb_url'] + [f'genre_{i}' for i in range(19)]
        movies = pd.read_csv(os.path.join(data_dir, 'u.item'), sep='|', names=movies_cols, encoding='latin1')
        movies = movies[['item_id', 'title']]
    except Exception:
        movies = None
        print("Unable to load movie information file")

    return ratings, movies

def create_user_item_matrix(ratings):
    """Pivot the long ratings table into a user x item matrix.

    Rows are ``user_id`` values, columns are ``item_id`` values and cells hold
    the ``rating``. User/item pairs without a rating are filled with 0.
    """
    print("Creating user-item rating matrix...")
    wide_ratings = ratings.pivot(values='rating', columns='item_id', index='user_id')
    # 0 marks "not rated".
    return wide_ratings.fillna(0)

def center_user_ratings(user_item_matrix):
    """Subtract each user's mean rating from the ratings that user gave.

    Only cells greater than 0 count as ratings: they define the user's mean
    and are the only cells that get centred. All other cells are copied
    through unchanged. The input matrix is not modified.

    The work is done with whole-frame operations rather than a Python loop
    with one ``.loc`` read/write per user/item pair.

    Args:
        user_item_matrix: DataFrame indexed by user id with one column per
            item, where 0 means "not rated".

    Returns:
        tuple: ``(centered_matrix, user_means)`` where ``centered_matrix`` has
        the same index and columns as the input, and ``user_means`` maps every
        user id to that user's mean rating (0 for users with no ratings).
    """
    print("Centering user ratings (subtracting user mean ratings)...")

    # True where the user actually rated the item.
    rated_mask = user_item_matrix > 0

    # Row-wise mean over rated cells only; users with no ratings get 0.
    mean_per_user = user_item_matrix.where(rated_mask).mean(axis=1).fillna(0)

    # Centre the rated cells and keep every other cell exactly as it was.
    centered_matrix = user_item_matrix.sub(mean_per_user, axis=0).where(rated_mask, user_item_matrix)

    user_means = mean_per_user.to_dict()

    print(f"User mean ratings calculated for {len(user_means)} users")

    # Show a few user means as a sample: users 200 and 15 when both exist,
    # otherwise the first five users.
    example_users = [200, 15] if all(uid in user_means for uid in [200, 15]) else list(user_means.keys())[:5]
    print("Sample user mean ratings:")
    for user_id in example_users[:5]:
        if user_id in user_means:
            print(f"  User {user_id}: mean rating = {user_means[user_id]:.2f}")

    return centered_matrix, user_means

def check_user_rating(ratings, user_id, item_id):
    """Report and return the rating ``user_id`` gave to ``item_id``.

    Returns the first matching value of the ``rating`` column, or ``None``
    when the ratings table has no row for that user/item pair.
    """
    is_user = ratings['user_id'] == user_id
    is_item = ratings['item_id'] == item_id
    matching_rows = ratings[is_user & is_item]

    if matching_rows.empty:
        print(f"User {user_id} has NOT rated movie {item_id}")
        return None

    rating_value = matching_rows['rating'].iloc[0]
    print(f"User {user_id} has rated movie {item_id} with score: {rating_value}")
    return rating_value

def calculate_user_similarity(centered_matrix, target_users):
    """Rank all other users by cosine similarity to each target user.

    Similarities are computed on the centred rating matrix for every pair of
    users; the rows belonging to the target users are then extracted.

    Args:
        centered_matrix: DataFrame of centred ratings, one row per user.
        target_users: iterable of user ids to compute neighbours for.

    Returns:
        dict: user id -> Series of similarities to every *other* user, sorted
        from most to least similar. A target user that is not in the matrix
        index maps to an empty float Series.
    """
    print("Calculating user similarities using centered ratings...")

    # Pairwise cosine similarity between all users (on the centred data).
    similarity_matrix = cosine_similarity(centered_matrix)
    similarity_df = pd.DataFrame(similarity_matrix,
                                index=centered_matrix.index,
                                columns=centered_matrix.index)

    results = {}
    for user_id in target_users:
        if user_id in similarity_df.index:
            # Similarity of this user to everyone, most similar first.
            user_similarities = similarity_df.loc[user_id].sort_values(ascending=False)
            # Drop the user's similarity to itself.
            user_similarities = user_similarities[user_similarities.index != user_id]
            results[user_id] = user_similarities

            # Show the three most similar users.
            print(f"Top 3 most similar users to User {user_id}:")
            for neighbor_id, similarity in user_similarities.head(3).items():
                print(f"  User {neighbor_id}: similarity = {similarity:.4f}")
        else:
            print(f"Warning: User {user_id} does not exist in the dataset")
            # An explicit dtype keeps the placeholder consistent with the
            # float Series returned above; a dtype-less pd.Series() is
            # deprecated in older pandas and object-typed in pandas 2.x.
            results[user_id] = pd.Series(dtype=float)

    return results

def find_neighbors(user_similarities, k=10):
    """Keep the first ``k`` entries of each user's similarity ranking.

    Args:
        user_similarities: dict of user id -> Series of similarities indexed
            by neighbour id. The Series is taken as already ordered from most
            to least similar; no re-sorting is done here.
        k: number of neighbours to keep per user.

    Returns:
        dict: user id -> Series with at most ``k`` neighbours. Users with an
        empty ranking map to an empty float Series.
    """
    neighbors = {}
    for user_id, similarities in user_similarities.items():
        if len(similarities) > 0:
            top_neighbors = similarities.head(k)
            neighbors[user_id] = top_neighbors
            print(f"\nTop {k} most similar neighbors for User {user_id}:")
            for neighbor_id, similarity in top_neighbors.items():
                print(f"  User {neighbor_id}: similarity = {similarity:.4f}")
        else:
            # An explicit dtype avoids the dtype-less pd.Series() constructor
            # (deprecated in older pandas, object dtype in pandas 2.x) and
            # matches the float Series stored for users with neighbours.
            neighbors[user_id] = pd.Series(dtype=float)

    return neighbors

def calculate_neighbors_avg_rating(user_item_matrix, neighbors, item_id):
    """Average the raw ratings that the given neighbours gave to ``item_id``.

    A neighbour only contributes when it is present in the matrix index and
    its cell for ``item_id`` is greater than 0 (0 means "not rated").

    Returns:
        tuple: ``(average_rating, number_of_contributing_neighbours)``, or
        ``(None, 0)`` when no neighbour rated the item.
    """
    contributions = []  # (neighbour id, raw rating, similarity)

    if item_id in user_item_matrix.columns:
        for peer_id, peer_similarity in neighbors.items():
            if peer_id not in user_item_matrix.index:
                continue
            peer_rating = user_item_matrix.loc[peer_id, item_id]
            if peer_rating > 0:  # this neighbour really rated the movie
                contributions.append((peer_id, peer_rating, peer_similarity))

    if not contributions:
        print(f"No neighbors have rated Movie {item_id}")
        return None, 0

    ratings_list = [peer_rating for _, peer_rating, _ in contributions]
    avg_rating = np.mean(ratings_list)

    print(f"\nNeighbors' ratings for Movie {item_id}:")
    for peer_id, peer_rating, peer_similarity in contributions:
        print(f"  User {peer_id}: rating = {peer_rating}, similarity = {peer_similarity:.4f}")
    print(f"Average rating from {len(ratings_list)} neighbors: {avg_rating:.4f}")

    return avg_rating, len(ratings_list)

def predict_rating_centered(user_item_matrix, user_id, item_id, neighbors_similarities, user_means):
    """Predict ``user_id``'s rating for ``item_id`` from its neighbours.

    The prediction is the user's own mean rating plus the similarity-weighted
    average of the neighbours' centred ratings (raw rating minus that
    neighbour's mean), normalised by the sum of absolute similarities.

    Returns:
        The predicted rating; the neighbours' plain average when the absolute
        similarities sum to 0; or ``None`` when the user has no neighbours or
        none of them rated the item.
    """
    if user_id not in neighbors_similarities:
        return None
    peer_similarities = neighbors_similarities[user_id]
    if len(peer_similarities) == 0:
        return None

    # Plain average of the neighbours' raw ratings (also reports the details).
    avg_rating, neighbor_count = calculate_neighbors_avg_rating(user_item_matrix, peer_similarities, item_id)
    if neighbor_count == 0:
        return None

    print(f"\nPredicting rating for User {user_id} on Movie {item_id} using centered weighted average:")

    user_mean = user_means.get(user_id, 0)
    print(f"User {user_id} mean rating: {user_mean:.2f}")

    # Accumulate the similarity-weighted centred ratings.
    weighted_sum = 0
    similarity_sum = 0
    item_is_known = item_id in user_item_matrix.columns
    for peer_id, peer_similarity in peer_similarities.items():
        if not (item_is_known and peer_id in user_item_matrix.index):
            continue
        peer_rating = user_item_matrix.loc[peer_id, item_id]
        if not peer_rating > 0:  # this neighbour did not rate the movie
            continue
        peer_mean = user_means.get(peer_id, 0)
        weighted_sum += peer_similarity * (peer_rating - peer_mean)
        similarity_sum += abs(peer_similarity)

    if not similarity_sum > 0:
        return avg_rating

    # Prediction = user's mean rating + weighted centred rating.
    predicted_rating = user_mean + (weighted_sum / similarity_sum)
    print(f"Centered weighted prediction: {user_mean:.2f} + {(weighted_sum / similarity_sum):.2f} = {predicted_rating:.4f}")
    print(f"Simple average from neighbors: {avg_rating:.4f}")
    return predicted_rating

def recommend_movie(predictions, movie_id, movies_df=None):
    """Print the predicted ratings and pick the user with the highest one.

    Args:
        predictions: dict of user id -> predicted rating (``None`` entries
            are ignored).
        movie_id: id of the movie being recommended.
        movies_df: optional DataFrame with ``item_id`` and ``title`` columns,
            used only to print the movie title.

    Returns:
        tuple: ``(user_id, predicted_rating)`` of the best candidate, or
        ``None`` when no user has a prediction.
    """
    print(f"\n=== Recommendation Results for Movie {movie_id} ===")

    if movies_df is not None:
        matching_titles = movies_df[movies_df['item_id'] == movie_id]['title']
        if len(matching_titles) > 0:
            print(f"Movie Title: {matching_titles.iloc[0]}")

    # Keep only the users for which a prediction exists.
    valid_predictions = {}
    for candidate, score in predictions.items():
        if score is not None:
            valid_predictions[candidate] = score

    if len(valid_predictions) == 0:
        print("Unable to generate predicted ratings for any user")
        return None

    print("\nPredicted Ratings:")
    for candidate in valid_predictions:
        print(f"User {candidate}: {valid_predictions[candidate]:.4f}")

    # The user with the highest predicted rating wins (first one on ties).
    top_user = max(valid_predictions, key=valid_predictions.get)
    best_user = (top_user, valid_predictions[top_user])
    print(f"\nRecommendation Result: Suggest Movie {movie_id} to User {best_user[0]}")
    print(f"Predicted Rating: {best_user[1]:.4f}")

    return best_user

def analyze_target_users_ratings(ratings, target_users, target_movie):
    """Look up each target user's actual rating for the target movie.

    Returns a dict of user id -> the value returned by ``check_user_rating``
    (the rating, or ``None`` when the user has not rated the movie).
    """
    print(f"\n=== Analysis: Do Users {target_users} Have Ratings for Movie {target_movie}? ===")

    return {
        candidate: check_user_rating(ratings, candidate, target_movie)
        for candidate in target_users
    }

def main():
    """Run the centred-rating recommendation pipeline end to end.

    Loads the data, builds and centres the user-item matrix, finds the 10
    nearest neighbours of users 200 and 15, predicts their rating for movie
    95, recommends the movie to the user with the higher prediction, and then
    prints the actual ratings and some summary statistics. Everything is
    reported via ``print``; nothing is returned.
    """
    print("=== MovieLens 100k Recommendation System with Centered Ratings ===\n")
    
    # 1. Load the data
    ratings, movies = load_movielens_data()
    print(f"Dataset contains {len(ratings)} rating records")
    print(f"Number of users: {ratings['user_id'].nunique()}")
    print(f"Number of movies: {ratings['item_id'].nunique()}")
    
    # 2. Build the user-item matrix (0 = not rated)
    user_item_matrix = create_user_item_matrix(ratings)
    print(f"User-item matrix shape: {user_item_matrix.shape}")
    
    # 3. Centre each user's ratings around that user's mean
    centered_matrix, user_means = center_user_ratings(user_item_matrix)
    print(f"Centered matrix shape: {centered_matrix.shape}")
    
    # 4. Target users and target movie
    target_users = [200, 15]
    target_movie = 95
    
    # Keep only the target users that are present in the matrix
    existing_users = [user for user in target_users if user in user_item_matrix.index]
    if len(existing_users) != len(target_users):
        missing_users = set(target_users) - set(existing_users)
        print(f"Warning: The following users do not exist in the dataset: {missing_users}")
    
    # 5. User similarities computed on the centred data
    user_similarities = calculate_user_similarity(centered_matrix, existing_users)
    
    # 6. Select the nearest neighbours
    neighbors = find_neighbors(user_similarities, k=10)
    
    # 7. Predict ratings with the centred weighted average
    predictions = {}
    for user_id in existing_users:
        predicted_rating = predict_rating_centered(user_item_matrix, user_id, target_movie, neighbors, user_means)
        predictions[user_id] = predicted_rating
    
    # 8. Produce the recommendation
    recommendation = recommend_movie(predictions, target_movie, movies)
    
    # 9. Show the actual ratings and compare them with the predictions
    #    (the lookup uses target_users, the comparison loop existing_users)
    print(f"\n=== Actual vs Predicted Ratings Comparison ===")
    user_ratings = analyze_target_users_ratings(ratings, target_users, target_movie)
    
    for user_id in existing_users:
        actual_rating = user_ratings.get(user_id)
        predicted_rating = predictions.get(user_id)
        user_mean = user_means.get(user_id, 0)
        
        if actual_rating is not None and predicted_rating is not None:
            error = abs(actual_rating - predicted_rating)
            centered_actual = actual_rating - user_mean
            print(f"User {user_id}: Actual = {actual_rating}, Predicted = {predicted_rating:.4f}, Error = {error:.4f}")
            print(f"  User mean = {user_mean:.2f}, Centered actual = {centered_actual:.2f}")
        elif actual_rating is not None:
            print(f"User {user_id}: Has actual rating = {actual_rating}, but no prediction was made")
        elif predicted_rating is not None:
            print(f"User {user_id}: No actual rating, Predicted = {predicted_rating:.4f}")
    
    # 10. Extra statistics per target user (cells > 0 count as ratings)
    print(f"\n=== Additional Statistics ===")
    for user_id in existing_users:
        if user_id in user_item_matrix.index:
            user_ratings_data = user_item_matrix.loc[user_id]
            rated_count = (user_ratings_data > 0).sum()
            avg_rating = user_ratings_data[user_ratings_data > 0].mean() if rated_count > 0 else 0
            user_mean = user_means.get(user_id, 0)
            print(f"User {user_id}: Number of rated movies = {rated_count}, Average rating = {avg_rating:.2f}, Mean used for centering = {user_mean:.2f}")
    
    # Rating statistics for the target movie
    if target_movie in user_item_matrix.columns:
        movie_ratings_data = user_item_matrix[target_movie]
        rated_users = (movie_ratings_data > 0).sum()
        avg_movie_rating = movie_ratings_data[movie_ratings_data > 0].mean() if rated_users > 0 else 0
        print(f"Movie {target_movie}: Number of users who rated = {rated_users}, Average rating = {avg_movie_rating:.2f}")
    
    # 11. Final recommendation decision
    print(f"\n=== Final Recommendation Decision ===")
    if recommendation:
        recommended_user = recommendation[0]
        predicted_score = recommendation[1]
        print(f"The recommendation system suggests promoting Movie {target_movie} to User {recommended_user}")
        print(f"Reason: User {recommended_user} has the highest predicted interest score of {predicted_score:.4f}")
        
        # Compare with the actual rating, if there is one
        actual_rating = user_ratings.get(recommended_user)
        if actual_rating is not None:
            print(f"Note: User {recommended_user} has actually rated this movie with {actual_rating}")
            print(f"Prediction vs Reality: Predicted = {predicted_score:.4f}, Actual = {actual_rating}")

# Run the pipeline when this cell/module is executed directly.
if __name__ == "__main__":
    main()

=== MovieLens 100k Recommendation System with Centered Ratings ===

Loading MovieLens 100k dataset...
Dataset contains 100000 rating records
Number of users: 943
Number of movies: 1682
Creating user-item rating matrix...
User-item matrix shape: (943, 1682)
Centering user ratings (subtracting user mean ratings)...
User mean ratings calculated for 943 users
Sample user mean ratings:
  User 200: mean rating = 4.03
  User 15: mean rating = 2.88
Centered matrix shape: (943, 1682)
Calculating user similarities using centered ratings...
Top 3 most similar users to User 200:
  User 332: similarity = 0.2604
  User 882: similarity = 0.2506
  User 881: similarity = 0.2341
Top 3 most similar users to User 15:
  User 794: similarity = 0.2543
  User 637: similarity = 0.2169
  User 26: similarity = 0.2156

Top 10 most similar neighbors for User 200:
  User 332: similarity = 0.2604
  User 882: similarity = 0.2506
  User 881: similarity = 0.2341
  User 178: similarity = 0.2289
  User 109: similarity = 