In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import pearsonr

In [2]:
# Define the toy data set (the rating table). A dict of dicts is used because real
# rating data is very sparse; a table as dense as this one is rare in practice.
def loadData():
    """Return the toy ratings as ``{user: {item: rating}}``.

    Alice has no rating for item5, so that key is absent from her entry.
    """
    item_names = ['item1', 'item2', 'item3', 'item4', 'item5']
    scores_by_user = {
        'Alice': [5, 3, 4, 4],
        'user1': [3, 1, 2, 3, 3],
        'user2': [4, 3, 4, 3, 5],
        'user3': [3, 3, 1, 5, 4],
        'user4': [1, 5, 5, 2, 1],
    }
    # zip stops at the shorter sequence, which leaves item5 out for Alice.
    return {user: dict(zip(item_names, scores)) for user, scores in scores_by_user.items()}
# The dict is keyed by user, so the frame is transposed to get one row per user
# and one column per item; Alice's missing item5 shows up as NaN.
ratings = pd.DataFrame(loadData()).transpose()
ratings

Unnamed: 0,item1,item2,item3,item4,item5
Alice,5.0,3.0,4.0,4.0,
user1,3.0,1.0,2.0,3.0,3.0
user2,4.0,3.0,4.0,3.0,5.0
user3,3.0,3.0,1.0,5.0,4.0
user4,1.0,5.0,5.0,2.0,1.0


### 计算Alice与其他用户的相似度（这里使用皮尔逊相关系数）

In [28]:
# One rating vector per user, restricted to the columns up to and including item4
# (Alice has no item5 rating, so item5 is left out for everyone).
Alice, user1, user2, user3, user4 = (
    ratings.loc[name, :'item4']
    for name in ('Alice', 'user1', 'user2', 'user3', 'user4')
)

In [4]:
def pearsonrSim(x, y):
    """Pearson similarity.

    Returns the first element of ``scipy.stats.pearsonr(x, y)``, i.e. the
    correlation coefficient; the p-value is discarded.
    """
    result = pearsonr(x, y)
    return result[0]

In [30]:
# Pairwise cosine similarity of Alice's and user1's rating vectors (a 2x2 matrix).
alice_user1_vectors = [Alice, user1]
cosine_similarity(alice_user1_vectors)

array([[1.       , 0.9753213],
       [0.9753213, 1.       ]])

In [5]:
# Pearson similarity between Alice and each of the other four users.
alice_similarities = tuple(
    pearsonrSim(Alice, other) for other in (user1, user2, user3, user4)
)
(Alice_user1_similarity, Alice_user2_similarity,
 Alice_user3_similarity, Alice_user4_similarity) = alice_similarities
alice_similarities

(0.8528028654224415, 0.7071067811865475, 0.0, -0.7921180343813393)

In [6]:
# Fill Alice's missing item5 rating with a hand-entered predicted value
# (it matches the 4.87 printed by the UserCF prediction further down).
alice_item5_prediction = 4.87
ratings.loc['Alice', 'item5'] = alice_item5_prediction

In [7]:
# Show the rating table again, now with Alice's item5 value filled in.
ratings

Unnamed: 0,item1,item2,item3,item4,item5
Alice,5.0,3.0,4.0,4.0,4.87
user1,3.0,1.0,2.0,3.0,3.0
user2,4.0,3.0,4.0,3.0,5.0
user3,3.0,3.0,1.0,5.0,4.0
user4,1.0,5.0,5.0,2.0,1.0


# UserCF 代码实现

In [8]:
# Define the toy data set (the rating table). A dict of dicts is used because real
# rating data is very sparse; a table as dense as this one is rare in practice.
def loadData():
    """Return ``(items, users)``: the same ratings keyed by item and by user.

    ``users`` maps user id -> {item: rating}; ``items`` maps item -> {user id: rating}.
    User 1 has no rating for item 'E'.
    """
    users = {
        1: {'A': 5, 'B': 3, 'C': 4, 'D': 4},
        2: {'A': 3, 'B': 1, 'C': 2, 'D': 3, 'E': 3},
        3: {'A': 4, 'B': 3, 'C': 4, 'D': 3, 'E': 5},
        4: {'A': 3, 'B': 3, 'C': 1, 'D': 5, 'E': 4},
        5: {'A': 1, 'B': 5, 'C': 5, 'D': 2, 'E': 1},
    }
    # The item view is the user view with the two key levels swapped.
    items = {}
    for user_id, rated in users.items():
        for item_id, score in rated.items():
            items.setdefault(item_id, {})[user_id] = score
    return items, users

# Load both views of the ratings and turn each into a frame with one row per
# outer key (items for item_df, users for user_df).
items, users = loadData()
item_df = pd.DataFrame(items).transpose()
user_df = pd.DataFrame(users).transpose()


In [9]:
# Show the item-by-user rating table (rows: items, columns: user ids).
item_df

Unnamed: 0,1,2,3,4,5
A,5.0,3.0,4.0,3.0,1.0
B,3.0,1.0,3.0,3.0,5.0
C,4.0,2.0,4.0,1.0,5.0
D,4.0,3.0,3.0,5.0,2.0
E,,3.0,5.0,4.0,1.0


In [10]:
# Show the user-by-item rating table (rows: user ids, columns: items).
user_df

Unnamed: 0,A,B,C,D,E
1,5.0,3.0,4.0,4.0,
2,3.0,1.0,2.0,3.0,3.0
3,4.0,3.0,4.0,3.0,5.0
4,3.0,3.0,1.0,5.0,4.0
5,1.0,5.0,5.0,2.0,1.0


In [11]:
"""计算用户相似性矩阵"""
similarity_matrix = pd.DataFrame(-1 * np.ones((len(users), len(users))), index=[1, 2, 3, 4, 5], columns=[1, 2, 3, 4, 5])

for userx in users:
    for usery in users:
        userxVec=[]
        useryVec=[]
        if userx == usery:
            continue
        else:
            userx_history = users[userx].keys()
            usery_history = users[usery].keys()
            intersection = set(userx_history).intersection(usery_history) # 用户x和用户y行为历史的交集，否则有nan无法计算相似性
            for i in intersection:
                userxVec.append(users[userx][i])
                useryVec.append(users[usery][i])
            similarity_matrix[userx][usery]=np.corrcoef(np.array(userxVec),np.array(useryVec))[0][1]

In [12]:
# Show the user-user similarity matrix (diagonal entries are the -1 placeholder).
similarity_matrix

Unnamed: 0,1,2,3,4,5
1,-1.0,0.852803,0.707107,0.0,-0.792118
2,0.852803,-1.0,0.467707,0.489956,-0.900149
3,0.707107,0.467707,-1.0,-0.161165,-0.466569
4,0.0,0.489956,-0.161165,-1.0,-0.641503
5,-0.792118,-0.900149,-0.466569,-0.641503,-1.0


In [13]:
"""计算前n个相似的用户"""
n = 2
similar_users = dict()
for user in users:
    similar_users[user] = similarity_matrix[user].sort_values(ascending=False)[:n].index.tolist()

In [14]:
# Show the top-n most similar users found for each user.
similar_users

{1: [2, 3], 2: [1, 4], 3: [1, 2], 4: [2, 1], 5: [3, 4]}

In [15]:
"""计算最后得分,用户1对物品E的预测评分"""
user_mean_rating = dict()
for user in users:
    user_mean = np.mean([value for value in users[user].values()])
    user_mean_rating[user] = user_mean

weighted_scores = 0.
corr_values_sum = 0.
for user in similar_users[1]:
    weighted_scores += similarity_matrix[1][user]
    corr_values_sum += similarity_matrix[1][user] * (users[user]['E'] - user_mean_rating[user])

predict = user_mean_rating[1] + corr_values_sum/weighted_scores
print(f'用户1对物品E的预测评分为 {predict:.2f} ')
    

用户1对物品E的预测评分为 4.87 


In [16]:
# Write the rounded prediction with one .loc[row, column] call. The chained form
# user_df.loc[1]['E'] = ... assigns into a temporary row object and is not
# guaranteed to update user_df itself.
user_df.loc[1, 'E'] = predict.round(2)
user_df

Unnamed: 0,A,B,C,D,E
1,5.0,3.0,4.0,4.0,4.87
2,3.0,1.0,2.0,3.0,3.0
3,4.0,3.0,4.0,3.0,5.0
4,3.0,3.0,1.0,5.0,4.0
5,1.0,5.0,5.0,2.0,1.0


### 计算物品5与其他物品的相似度（这里使用皮尔逊相关系数）

In [17]:
# Define the toy data set (the rating table). A dict of dicts is used because real
# rating data is very sparse; a table as dense as this one is rare in practice.
def loadData():
    """Return the toy ratings as ``{user: {item: rating}}``.

    Alice has no rating for item5, so that key is absent from her entry.
    """
    item_names = ['item1', 'item2', 'item3', 'item4', 'item5']
    scores_by_user = {
        'Alice': [5, 3, 4, 4],
        'user1': [3, 1, 2, 3, 3],
        'user2': [4, 3, 4, 3, 5],
        'user3': [3, 3, 1, 5, 4],
        'user4': [1, 5, 5, 2, 1],
    }
    # zip stops at the shorter sequence, which leaves item5 out for Alice.
    return {user: dict(zip(item_names, scores)) for user, scores in scores_by_user.items()}
# Rebuild the rating table from the raw dict (one row per user, one column per item);
# Alice's item5 is NaN again in this fresh copy.
ratings = pd.DataFrame(loadData()).transpose()
ratings

Unnamed: 0,item1,item2,item3,item4,item5
Alice,5.0,3.0,4.0,4.0,
user1,3.0,1.0,2.0,3.0,3.0
user2,4.0,3.0,4.0,3.0,5.0
user3,3.0,3.0,1.0,5.0,4.0
user4,1.0,5.0,5.0,2.0,1.0


In [18]:
# One rating list per item, taken from the rows user1 onward
# (Alice's row is excluded; she has no item5 rating).
item5, item4, item3, item2, item1 = (
    ratings.loc['user1':, column].values.tolist()
    for column in ('item5', 'item4', 'item3', 'item2', 'item1')
)

In [19]:
def pearsonrSim(x, y):
    """Pearson similarity.

    Returns the first element of ``scipy.stats.pearsonr(x, y)``, i.e. the
    correlation coefficient; the p-value is discarded.
    """
    result = pearsonr(x, y)
    return result[0]

In [20]:
# Pearson similarity between item5 and each of the other four items.
item5_similarities = [pearsonrSim(item5, other) for other in (item1, item2, item3, item4)]
item51_similarity, item52_similarity, item53_similarity, item54_similarity = item5_similarities
[similarity.round(2) for similarity in item5_similarities]

[0.97, -0.48, -0.43, 0.58]

# ItemCF代码实现

In [21]:
# Define the toy data set (the rating table). A dict of dicts is used because real
# rating data is very sparse; a table as dense as this one is rare in practice.
def loadData():
    """Return ``(items, users)``: the same ratings keyed by item and by user.

    ``users`` maps user id -> {item: rating}; ``items`` maps item -> {user id: rating}.
    User 1 has no rating for item 'E'.
    """
    users = {
        1: {'A': 5, 'B': 3, 'C': 4, 'D': 4},
        2: {'A': 3, 'B': 1, 'C': 2, 'D': 3, 'E': 3},
        3: {'A': 4, 'B': 3, 'C': 4, 'D': 3, 'E': 5},
        4: {'A': 3, 'B': 3, 'C': 1, 'D': 5, 'E': 4},
        5: {'A': 1, 'B': 5, 'C': 5, 'D': 2, 'E': 1},
    }
    # The item view is the user view with the two key levels swapped.
    items = {}
    for user_id, rated in users.items():
        for item_id, score in rated.items():
            items.setdefault(item_id, {})[user_id] = score
    return items, users

# Reload both views of the ratings and rebuild the two frames, one row per
# outer key (items for item_df, users for user_df).
items, users = loadData()
item_df = pd.DataFrame(items).transpose()
user_df = pd.DataFrame(users).transpose()


In [22]:
"""计算物品的相似矩阵"""
similarity_matrix = pd.DataFrame(-1 * np.ones((len(items), len(items))), index=['A', 'B', 'C', 'D', 'E'], columns=['A', 'B', 'C', 'D', 'E'])

# 遍历每条物品-用户评分数据
for itemx in items:
    for itemy in items:
        itemxVec = []
        itemyVec = []
        if itemx == itemy:
            continue
        else:
            itemx_history = set(items[itemx].keys())
            itemy_history = set(items[itemy].keys())
            intersection = itemx_history.intersection(itemy_history)  # 求交集，同时对两个物品都打分的用户，才有意义
            for i in intersection:
                itemxVec.append(items[itemx][i])
                itemyVec.append(items[itemy][i])
            similarity_matrix[itemx][itemy] = pearsonrSim(itemxVec,itemyVec).round(2)
            # similarity_matrix[itemx][itemy] = np.corrcoef(np.array(itemxVec),np.array(itemyVec))[0][1] 两种计算方式等价

In [23]:
# Show the item-item similarity matrix (diagonal entries are the -1 placeholder).
similarity_matrix

Unnamed: 0,A,B,C,D,E
A,-1.0,-0.48,-0.12,0.53,0.97
B,-0.48,-1.0,0.65,-0.31,-0.48
C,-0.12,0.65,-1.0,-0.72,-0.43
D,0.53,-0.31,-0.72,-1.0,0.58
E,0.97,-0.48,-0.43,0.58,-1.0


In [24]:
"""计算前n个相似的物品"""
n = 2
similar_items = dict()
for item in items:
    similar_items[item] = similarity_matrix[item].sort_values(ascending=False)[:n].index.tolist()

In [25]:
# Show the top-n most similar items found for item E.
similar_items['E']

['A', 'D']

In [26]:
"""计算最后得分,用户1对物品E的预测评分"""

# 计算物品平均打分情况
item_ratings_mean = dict()
for item,rating in items.items():
    item_ratings_mean[item] = np.mean([value for value in rating.values()])

weighted_scores = 0.
corr_values_sum = 0.

for item in similar_items['E']:
    weighted_scores += similarity_matrix['E'][item]
    corr_values_sum += similarity_matrix['E'][item] * (users[1][item] -  item_ratings_mean[item])

predict = item_ratings_mean['E'] + corr_values_sum/weighted_scores
print(f'用户1对物品E的预测得分为 {predict:.2f}')

用户1对物品E的预测得分为 4.60
