## Description:
这是一个使用协同过滤算法的小案例，实现了博客中的用户评分示例。任务是预测某个用户对某个物品的打分，具体可以参考给出的博客链接。

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

import warnings 
warnings.filterwarnings('ignore')

In [None]:
# Toy data set: for simplicity only five users and five items are defined.
def loadData():
    """Return the toy rating data as two mirrored nested dictionaries.

    Returns:
        A tuple ``(items, users)``:

        * ``items`` maps item name -> {user name -> rating}.
        * ``users`` maps user name -> {item name -> rating}.

        Both dictionaries describe the same ratings from the two opposite
        directions. Alice has no rating for '物品5'.
    """
    # Item-major view. The '物品5' row must use the same user names and
    # ratings as the `users` table below; it previously had its keys shifted
    # by one user ("用户2".."用户5"), which contradicted `users` and referred
    # to a user that does not exist.
    items = {'物品1': {"Alice": 5, "用户1": 3, "用户2": 4, "用户3": 3, "用户4": 1},
           '物品2': {"Alice": 3, "用户1": 1, "用户2": 3, "用户3": 3, "用户4": 5},
           '物品3': {"Alice": 4, "用户1": 2, "用户2": 4, "用户3": 1, "用户4": 5},
           '物品4': {"Alice": 4, "用户1": 3, "用户2": 3, "用户3": 5, "用户4": 2},
           '物品5': {"用户1": 3, "用户2": 5, "用户3": 4, "用户4": 1}
          }
    # User-major view of the same ratings.
    users = {"Alice": {'物品1': 5, '物品2': 3, '物品3': 4, '物品4': 4},
           "用户1": {'物品1': 3, '物品2': 1, '物品3': 2, '物品4': 3, '物品5': 3},
           "用户2": {'物品1': 4, '物品2': 3, '物品3': 4, '物品4': 3, '物品5': 5},
           "用户3": {'物品1': 3, '物品2': 3, '物品3': 1, '物品4': 5, '物品5': 4},
           "用户4": {'物品1': 1, '物品2': 5, '物品3': 5, '物品4': 2, '物品5': 1}
          }
    return items, users

In [None]:
# Load both views of the rating data.
items, users = loadData()

# pd.DataFrame() turns the outer dict keys into columns, so transpose to get
# one row per item (item_df) and one row per user (user_df).
item_df = pd.DataFrame(items).transpose()
user_df = pd.DataFrame(users).transpose()

In [None]:
"""计算用户相似性矩阵"""
# Build the user-user similarity matrix: the Pearson correlation of two users'
# ratings over the items both of them rated.
# Axis labels are taken from `users` so the matrix always matches the data.
user_ids = list(users)
similarity_matrix = pd.DataFrame(
    np.zeros((len(user_ids), len(user_ids))), index=user_ids, columns=user_ids
)

for userID in user_ids:
    for otheruserId in user_ids:
        if userID == otheruserId:
            continue  # self-similarity is not computed; the diagonal stays 0

        # Collect the ratings of both users, restricted to co-rated items.
        vec_user = []
        vec_otheruser = []
        for itemRatings in items.values():  # maps user -> rating for one item
            if userID in itemRatings and otheruserId in itemRatings:
                vec_user.append(itemRatings[userID])
                vec_otheruser.append(itemRatings[otheruserId])

        # Assign with a single .loc call. The chained form
        # `similarity_matrix[col][row] = value` writes into a temporary object
        # under pandas Copy-on-Write and would leave the matrix unchanged.
        # Column = userID, row = otheruserId, as in the chained form.
        similarity_matrix.loc[otheruserId, userID] = np.corrcoef(
            np.array(vec_user), np.array(vec_otheruser)
        )[0][1]

In [None]:
# Show the user-user similarity matrix as the cell output.
similarity_matrix

In [None]:
"""计算前n个相似的用户"""
# Number of nearest neighbours to keep.
n = 2
# Rank the other users by their similarity to Alice and keep the top n.
# Alice's own entry is dropped first: the diagonal of the matrix is 0, so she
# could otherwise be selected as her own neighbour whenever fewer than n users
# have a positive similarity, and she has no rating for the target item.
similarity_users = (
    similarity_matrix["Alice"]
    .drop("Alice")
    .sort_values(ascending=False)
    .iloc[:n]
    .index.tolist()
)

In [None]:
"""计算最终得分"""
# User-based prediction of Alice's rating for 物品5:
#   Alice's mean rating
#   + sum(sim * (neighbour's rating for 物品5 - neighbour's mean)) / sum(sim)
base_score = np.mean(list(users["Alice"].values()))   # Alice's own mean rating
weighted_scores = 0.
corr_values_sum = 0.
for user in similarity_users:
    corr_value = similarity_matrix.loc[user, "Alice"]        # similarity between Alice and this neighbour
    mean_user_score = np.mean(list(users[user].values()))    # the neighbour's mean rating
    weighted_scores += corr_value * (users[user]["物品5"] - mean_user_score)   # similarity-weighted deviation
    corr_values_sum += corr_value
final_scores = base_score + weighted_scores / corr_values_sum
print('用户Alice对物品5的打分: ', final_scores)
# Write the prediction with one .loc[row, column] call. The chained form
# `user_df.loc["Alice"]["物品5"] = ...` assigns into an intermediate Series and
# is not guaranteed to update user_df (it never does under Copy-on-Write).
user_df.loc["Alice", "物品5"] = final_scores
user_df

In [None]:
"""计算物品的相似矩阵"""
# Build the item-item similarity matrix: the Pearson correlation of two items'
# ratings over the users who rated both of them.
# Axis labels are taken from `items` so the matrix always matches the data.
item_ids = list(items)
similarity_matrix = pd.DataFrame(
    np.zeros((len(item_ids), len(item_ids))), index=item_ids, columns=item_ids
)

for itemId in item_ids:
    for otheritemId in item_ids:
        if itemId == otheritemId:
            continue  # self-similarity is not computed; the diagonal stays 0

        # Collect the ratings of both items, restricted to users who rated both.
        vec_item = []
        vec_otheritem = []
        for userRatings in users.values():  # maps item -> rating for one user
            if itemId in userRatings and otheritemId in userRatings:
                vec_item.append(userRatings[itemId])
                vec_otheritem.append(userRatings[otheritemId])

        # Assign with a single .loc call. The chained form
        # `similarity_matrix[col][row] = value` writes into a temporary object
        # under pandas Copy-on-Write and would leave the matrix unchanged.
        # Column = itemId, row = otheritemId, as in the chained form.
        similarity_matrix.loc[otheritemId, itemId] = np.corrcoef(
            np.array(vec_item), np.array(vec_otheritem)
        )[0][1]

In [None]:
# Show the item-item similarity matrix as the cell output.
similarity_matrix

In [None]:
"""得到与物品5相似的前n个物品"""
# Number of nearest neighbours to keep.
n = 2
# Rank the other items by their similarity to 物品5 and keep the top n.
# 物品5's own entry is dropped first: the diagonal of the matrix is 0, so the
# item could otherwise be selected as its own neighbour whenever fewer than n
# items have a positive similarity, and Alice has no rating for it.
similarity_items = (
    similarity_matrix['物品5']
    .drop('物品5')
    .sort_values(ascending=False)
    .iloc[:n]
    .index.tolist()
)
similarity_items

In [None]:
"""计算最终得分"""
# Item-based prediction of Alice's rating for 物品5:
#   mean rating of 物品5
#   + sum(sim * (Alice's rating for neighbour item - that item's mean)) / sum(sim)
base_score = np.mean(list(items['物品5'].values()))   # mean rating of the target item
weighted_scores = 0.
corr_values_sum = 0.
for item in similarity_items:
    corr_value = similarity_matrix.loc[item, '物品5']        # similarity between 物品5 and this neighbour item
    mean_item_score = np.mean(list(items[item].values()))    # the neighbour item's mean rating
    weighted_scores += corr_value * (users["Alice"][item] - mean_item_score)   # similarity-weighted deviation
    corr_values_sum += corr_value
final_scores = base_score + weighted_scores / corr_values_sum
print('用户Alice对物品5的打分: ', final_scores)
# Write the prediction with one .loc[row, column] call. The chained form
# `user_df.loc["Alice"]['物品5'] = ...` assigns into an intermediate Series and
# is not guaranteed to update user_df (it never does under Copy-on-Write).
user_df.loc["Alice", '物品5'] = final_scores
user_df