In [1]:
#构建推荐引擎中一个非常重要的任务是寻找相似的用户，为某位用户生成的推荐信息可以同时推荐给与其相似的用户

In [47]:
import json
import numpy as np

# Compute the Pearson correlation coefficient between user1 and user2
def pearson_score(dataset,user1,user2):
    """Return the Pearson correlation score between two users' ratings.

    Only the items that appear in both users' rating mappings contribute
    to the score.

    Args:
        dataset: mapping of user -> (mapping of item -> numeric rating).
        user1: key of the first user in ``dataset``.
        user2: key of the second user in ``dataset``.

    Returns:
        The Pearson correlation coefficient as a NumPy float, or ``0`` when
        the two users have no item in common or when the shared ratings of
        either user have no variance (the coefficient is undefined there).

    Raises:
        TypeError: if ``user1`` or ``user2`` is not a key of ``dataset``.
            (The exception type is kept as-is for backward compatibility.)
    """
    for user in (user1, user2):
        if user not in dataset:
            # str.format also copes with non-string user keys.
            raise TypeError('user {} not present in the dataset'.format(user))

    ratings1 = dataset[user1]
    ratings2 = dataset[user2]

    # Items rated by both users.
    common_items = [item for item in ratings1 if item in ratings2]
    num_ratings = len(common_items)
    if num_ratings == 0:
        return 0

    # Materialise the shared ratings as arrays; np.sum() on a generator is
    # deprecated and silently falls back to the Python builtin sum.
    x = np.array([ratings1[item] for item in common_items], dtype=float)
    y = np.array([ratings2[item] for item in common_items], dtype=float)

    x_sum = np.sum(x)
    y_sum = np.sum(y)

    # Sums of squares and cross products about the mean.
    sxy = np.sum(x * y) - x_sum * y_sum / num_ratings
    sxx = np.sum(np.square(x)) - np.square(x_sum) / num_ratings
    syy = np.sum(np.square(y)) - np.square(y_sum) / num_ratings

    # Zero variance makes the coefficient undefined. "<=" also catches a
    # tiny negative value caused by rounding, which would otherwise send a
    # negative product into np.sqrt and yield NaN.
    if sxx <= 0 or syy <= 0:
        return 0
    return sxy / np.sqrt(sxx * syy)
# Find the given number of users most similar to the input user
def find_similar_users(dataset,user,num_users):
    """Return the ``num_users`` users whose ratings correlate best with ``user``.

    Args:
        dataset: mapping of user -> (mapping of item -> numeric rating).
        user: key of the reference user in ``dataset``.
        num_users: maximum number of similar users to return.

    Returns:
        A list of ``(other_user, score)`` tuples sorted by descending
        Pearson score, holding at most ``num_users`` entries.

    Raises:
        TypeError: if ``user`` is not a key of ``dataset``.
            (The exception type is kept as-is for backward compatibility.)
    """
    if user not in dataset:
        raise TypeError('User {} not present in the dataset'.format(user))

    # Score every other user against the reference user. The scores must be
    # computed on the ``dataset`` argument, not on a module-level global.
    scores = [(other, pearson_score(dataset, user, other))
              for other in dataset if other != user]

    # Highest correlation first.
    scores.sort(key=lambda pair: pair[1], reverse=True)

    return scores[:num_users]

# NOTE(review): absolute, machine-specific path; adjust it to where
# movie_ratings.json lives on your machine.
data_file=r'F:\程序员\python\Python机器学习经典实例\Chapter05\movie_ratings.json'

# JSON text is UTF-8; name the encoding explicitly so decoding does not
# depend on the platform's locale default.
with open(data_file, encoding='utf-8') as f:
    data=json.load(f)

# Print the three users most similar to the chosen user.
user='John Carson'
print('users similar to '+user+':\n')
similar_users=find_similar_users(data,user,3)
for name, score in similar_users:
    print(name,':',score)
# Bare last expression so the notebook also displays the raw result list.
similar_users
    

users similar toJohn Carson:

Michael Henry : 0.991240707162
Alex Roberts : 0.747017880834
Melissa Jones : 0.594088525786


[('Michael Henry', 0.99124070716192991),
 ('Alex Roberts', 0.74701788083399645),
 ('Melissa Jones', 0.59408852578600435)]