In [1]:
users = ['user1','user2','user3','user4','user5']

items = ['itemA','itemB','itemC','itemD','itemE']

# 用户购买记录
datasets = [
    [1,0,1,1,0],
    [1,0,0,1,1],
    [1,0,1,0,0],
    [0,1,0,1,1],
    [1,1,1,0,1]
]

In [2]:
import pandas as pd

df = pd.DataFrame(datasets, columns=items, index=users)

In [3]:
print(df)

       itemA  itemB  itemC  itemD  itemE
user1      1      0      1      1      0
user2      1      0      0      1      1
user3      1      0      1      0      0
user4      0      1      0      1      1
user5      1      1      1      0      1


In [4]:
from sklearn.metrics import jaccard_similarity_score

# 直接计算 itemA 和itemB 的相似度
jaccard_similarity_score(df['itemA'], df['itemB'])



0.2

In [5]:
# 用户相关性
from sklearn.metrics.pairwise import pairwise_distances
# scikit-learn从0.20.1不再自动修改numpy语法。所以需要养成更加规范的书写习惯
# pairwise_distances(df, metric="jaccard") ×
user_similar = 1 - pairwise_distances(df.values, metric="jaccard")

user_similar = pd.DataFrame(user_similar, columns=users, index=users)

user_similar



Unnamed: 0,user1,user2,user3,user4,user5
user1,1.0,0.5,0.666667,0.2,0.4
user2,0.5,1.0,0.25,0.5,0.4
user3,0.666667,0.25,1.0,0.0,0.5
user4,0.2,0.5,0.0,1.0,0.4
user5,0.4,0.4,0.5,0.4,1.0


In [6]:
# 物品相关性
item_similar = 1 - pairwise_distances(df.T.values, metric="jaccard")

item_similar = pd.DataFrame(item_similar, columns=items, index=items)

item_similar



Unnamed: 0,itemA,itemB,itemC,itemD,itemE
itemA,1.0,0.2,0.75,0.4,0.4
itemB,0.2,1.0,0.25,0.25,0.666667
itemC,0.75,0.25,1.0,0.2,0.2
itemD,0.4,0.25,0.2,1.0,0.5
itemE,0.4,0.666667,0.2,0.5,1.0


In [7]:
topN_users = {}
# 为每个用户找到最相似的用户

for i in user_similar.index:
    # 去出每一行数据，删除自己
    data = user_similar.loc[i].drop([i])
    # 大到小排序
    data_sort = data.sort_values(ascending = False)
    topN_users[i] = list(data_sort.index[:2])

topN_users

{'user1': ['user3', 'user2'],
 'user2': ['user4', 'user1'],
 'user3': ['user1', 'user5'],
 'user4': ['user2', 'user5'],
 'user5': ['user3', 'user4']}

In [8]:
import numpy as np
rs_results = {}

for user, users_sim in topN_users.items():
    rs_result = set()
    
    for user_sim in users_sim:
        rs_result = rs_result.union(set(df.loc[user_sim].replace(0, np.nan).dropna().index))
    
    rs_result -= set(df.loc[user_sim].replace(0, np.nan).dropna().index)

    rs_results[user] = rs_result

rs_results

{'user1': {'itemC'},
 'user2': {'itemB', 'itemE'},
 'user3': {'itemD'},
 'user4': {'itemD'},
 'user5': {'itemA', 'itemC'}}