# 用户物品相似度计算

In [1]:
import pandas as pd

# Row labels (users) and column labels (items) of the purchase matrix.
users = ["User1", "User2", "User3", "User4", "User5"]
items = ["Item A", "Item B", "Item C", "Item D", "Item E"]

# User purchase records: one row per user, one column per item
# (1 marks a recorded purchase, 0 marks none).
datasets = [
    [1, 0, 1, 1, 0],
    [1, 0, 0, 1, 1],
    [1, 0, 1, 0, 0],
    [0, 1, 0, 1, 1],
    [1, 1, 1, 0, 1],
]

# Wrap the raw matrix in a labelled DataFrame and display it.
df = pd.DataFrame(data=datasets, index=users, columns=items)
df

Unnamed: 0,Item A,Item B,Item C,Item D,Item E
User1,1,0,1,1,0
User2,1,0,0,1,1
User3,1,0,1,0,0
User4,0,1,0,1,1
User5,1,1,1,0,1


In [2]:
from sklearn.metrics.pairwise import pairwise_distances

# User-user similarity: every row of df is one user's purchase vector, and
# similarity = 1 - Jaccard distance between two rows.
# Jaccard is a boolean metric, so the 0/1 matrix is cast to bool up front;
# handing over integers makes scikit-learn convert the data itself and emit
# a DataConversionWarning.
user_similar = 1 - pairwise_distances(df.values.astype(bool), metric='jaccard')
# Re-attach the user labels on both axes so the matrix can be looked up by name.
user_similar = pd.DataFrame(user_similar, columns=users, index=users)
user_similar



Unnamed: 0,User1,User2,User3,User4,User5
User1,1.0,0.5,0.666667,0.2,0.4
User2,0.5,1.0,0.25,0.5,0.4
User3,0.666667,0.25,1.0,0.0,0.5
User4,0.2,0.5,0.0,1.0,0.4
User5,0.4,0.4,0.5,0.4,1.0


In [3]:
# Item-item similarity: transposing df makes every row one item's vector of
# buyers, and similarity = 1 - Jaccard distance between two such rows.
# Jaccard is a boolean metric, so the 0/1 matrix is cast to bool up front;
# handing over integers makes scikit-learn convert the data itself and emit
# a DataConversionWarning.
item_similar = 1 - pairwise_distances(df.T.values.astype(bool), metric='jaccard')
# Re-attach the item labels on both axes so the matrix can be looked up by name.
item_similar = pd.DataFrame(item_similar, columns=items, index=items)
item_similar



Unnamed: 0,Item A,Item B,Item C,Item D,Item E
Item A,1.0,0.2,0.75,0.4,0.4
Item B,0.2,1.0,0.25,0.25,0.666667
Item C,0.75,0.25,1.0,0.2,0.2
Item D,0.4,0.25,0.2,1.0,0.5
Item E,0.4,0.666667,0.2,0.5,1.0


# 基于用户的协同过滤 UserCF

In [4]:
# For every user, pick the 2 most similar other users.
# Per user: take that user's row of the similarity matrix, drop the user's own
# entry (self-similarity), rank the rest from most to least similar and keep
# the labels of the first two.
# NOTE(review): equal similarity scores are ordered however sort_values orders
# ties by default - confirm if a specific tie-break is required.
topN_users = {
    user: (
        user_similar.loc[user]
        .drop(labels=[user])
        .sort_values(ascending=False)
        .index[:2]
        .tolist()
    )
    for user in user_similar.index
}

In [5]:
# Display the mapping user -> list of that user's 2 most similar users.
topN_users

{'User1': ['User3', 'User2'],
 'User2': ['User4', 'User1'],
 'User3': ['User1', 'User5'],
 'User4': ['User2', 'User5'],
 'User5': ['User3', 'User4']}

In [6]:
# Display the same mapping as (user, similar_users) pairs.
topN_users.items()

dict_items([('User1', ['User3', 'User2']), ('User2', ['User4', 'User1']), ('User3', ['User1', 'User5']), ('User4', ['User2', 'User5']), ('User5', ['User3', 'User4'])])

In [7]:
# A single user's row is a Series indexed by item name; display that index.
df.loc['User1'].index

Index(['Item A', 'Item B', 'Item C', 'Item D', 'Item E'], dtype='object')

In [13]:
import numpy as np


def _interacted_items(row):
    """Return the set of labels in *row* whose value is non-zero and not missing.

    Selects the same labels as ``row.replace(0, np.nan).dropna().index`` but
    with a plain boolean mask instead of a NaN round-trip.
    """
    return set(row[row.notna() & (row != 0)].index)


# Build the recommendations from each user's top-N similar users.
# Every user's interacted-item set is computed once here instead of being
# rebuilt for each (user, similar user) pair inside the loop.
interacted = {user: _interacted_items(df.loc[user]) for user in df.index}

re_results = {}
for user, sim_users in topN_users.items():
    # Candidate items: everything any of the similar users interacted with.
    candidates = set().union(*(interacted[sim_user] for sim_user in sim_users))
    # Filter out the items the user has already interacted with.
    re_results[user] = candidates - interacted[user]

In [14]:
# Display the mapping user -> set of recommended items.
re_results

{'User1': {'Item E'},
 'User2': {'Item B', 'Item C'},
 'User3': {'Item B', 'Item D', 'Item E'},
 'User4': {'Item A', 'Item C'},
 'User5': {'Item D'}}