**欧氏距离**    
为了构建一个推荐引擎，需要定义相似度指标，以便于找到与数据库中特定用户相似的用户    
欧氏距离分数就是这样一个指标：先计算两个数据点之间的欧几里得距离，再将其转换为 (0, 1] 范围内的相似度分数，分数越高表示两个用户越相似

In [3]:
import json
import numpy as np

def euclidean_score(dataset, user1, user2):
    """Return a Euclidean-distance-based similarity score for two users.

    The score is ``1 / (1 + d)``, where ``d`` is the Euclidean distance
    between the two users' ratings over the items that both of them have
    rated. Identical ratings give 1; larger distances move towards 0.

    Args:
        dataset: Mapping of user name -> mapping of item -> numeric rating.
        user1: Name of the first user.
        user2: Name of the second user.

    Returns:
        0 when the users have no rated item in common, otherwise the
        score described above.

    Raises:
        TypeError: If either user is not a key of ``dataset``.
    """
    # Both users must exist; user1 is checked before user2.
    for name in (user1, user2):
        if name not in dataset:
            raise TypeError('User ' + name + ' not present in the dataset')

    ratings_a = dataset[user1]
    ratings_b = dataset[user2]

    # Items rated by both users, in user1's iteration order.
    shared_items = [item for item in ratings_a if item in ratings_b]

    # Nothing in common means there is no basis for a similarity score.
    if not shared_items:
        return 0

    squared_gaps = [
        np.square(ratings_a[item] - ratings_b[item]) for item in shared_items
    ]
    distance = np.sqrt(np.sum(squared_gaps))

    # Map the distance [0, inf) onto a score in (0, 1].
    return 1 / (1 + distance)


# Demo: load the ratings file and print the Euclidean score for two users.
data_file = 'movie_ratings.json'

with open(data_file, 'r') as f:
    data = json.load(f)

user1 = 'John Carson'
user2 = 'Michelle Peterson'

# This cell reports the Euclidean score, so label it as such
# (it was previously mislabelled "Pearson score:").
print("\nEuclidean score:")
print(euclidean_score(data, user1, user2))


Pearson score:
0.29429805508554946


**皮尔逊相关系数**      
欧氏距离分数是一个非常好的指标，但是它也有一些缺点：它对评分尺度敏感，两个用户的喜好趋势一致、只是其中一人习惯性打分偏高时，得分仍然会偏低。     
因此，皮尔逊相关系数常用于推荐引擎，它衡量的是两组评分之间的线性相关程度。

In [5]:
import json
import numpy as np

def pearson_score(dataset, user1, user2):
    """Return the Pearson correlation of two users' ratings.

    Only items rated by both users take part in the computation.

    Args:
        dataset: Mapping of user name -> mapping of item -> numeric rating.
        user1: Name of the first user.
        user2: Name of the second user.

    Returns:
        0 when the users share no rated item or when ``Sxx * Syy`` is
        exactly 0 (e.g. one user gave every shared item the same rating);
        otherwise ``Sxy / sqrt(Sxx * Syy)``.

    Raises:
        TypeError: If either user is not a key of ``dataset``.
    """
    # Both users must exist; user1 is checked before user2.
    for name in (user1, user2):
        if name not in dataset:
            raise TypeError('User ' + name + ' not present in the dataset')

    ratings_a = dataset[user1]
    ratings_b = dataset[user2]

    # Items rated by both users, in user1's iteration order.
    shared_items = [item for item in ratings_a if item in ratings_b]
    count = len(shared_items)

    # Nothing in common means there is no basis for a correlation.
    if count == 0:
        return 0

    xs = [ratings_a[item] for item in shared_items]
    ys = [ratings_b[item] for item in shared_items]

    # Plain sums, sums of squares and the sum of cross products.
    sum_x = np.sum(xs)
    sum_y = np.sum(ys)
    sum_xx = np.sum([np.square(x) for x in xs])
    sum_yy = np.sum([np.square(y) for y in ys])
    sum_xy = np.sum([x * y for x, y in zip(xs, ys)])

    # Co-deviation and per-user deviation terms of the Pearson formula.
    Sxy = sum_xy - (sum_x * sum_y / count)
    Sxx = sum_xx - np.square(sum_x) / count
    Syy = sum_yy - np.square(sum_y) / count

    # A zero denominator means the correlation is undefined; report 0.
    denominator_squared = Sxx * Syy
    if denominator_squared == 0:
        return 0

    return Sxy / np.sqrt(denominator_squared)


# Demo: load the ratings file and print the Pearson score for two users.
data_file = 'movie_ratings.json'

with open(data_file, 'r') as f:
    data = json.load(f)

user1, user2 = 'John Carson', 'Michelle Peterson'

print("\nPearson score:")
print(pearson_score(data, user1, user2))




Pearson score:
0.39605901719066977


**寻找数据集中的相似用户**    
为某位用户生成推荐信息可以同时推荐给与其相似的用户

In [7]:
import json
import numpy as np

def find_similar_users(dataset, user, num_users):
    """Return the ``num_users`` users most similar to ``user``.

    Similarity is the Pearson score between ``user`` and every other user
    in ``dataset``.

    Args:
        dataset: Mapping of user name -> mapping of item -> numeric rating.
        user: Name of the user to compare everyone else against.
        num_users: Maximum number of similar users to return.

    Returns:
        A 2-D NumPy array of strings with one ``[name, score]`` row per
        user, ordered from most to least similar. Because names and scores
        share one array, the scores are stored as strings; convert with
        ``float(row[1])``. The array has shape (0, 2) when the dataset
        holds no other user.

    Raises:
        TypeError: If ``user`` is not a key of ``dataset``.
    """
    if user not in dataset:
        raise TypeError('User ' + user + ' not present in the dataset')

    others = [x for x in dataset if x != user]

    # With nobody to compare against, return an empty result instead of
    # failing on the 2-D indexing below.
    if not others:
        return np.empty((0, 2), dtype=str)

    raw_scores = [pearson_score(dataset, user, x) for x in others]

    # Rank on a float array. The combined [name, score] array has a string
    # dtype, and sorting that column would order the scores
    # lexicographically (e.g. "-0.1" before "-0.9", "9e-05" above "0.9").
    numeric_scores = np.array(raw_scores, dtype=float)
    top_k = np.argsort(numeric_scores)[::-1][0:num_users]

    scores = np.array([[x, score] for x, score in zip(others, raw_scores)])

    return scores[top_k]


# Demo: list the three users most similar to one user.
data_file = 'movie_ratings.json'
with open(data_file, 'r') as f:
    data = json.load(f)

user = 'John Carson'
print("\nUsers similar to " + user + ":\n")
similar_users = find_similar_users(data, user, 3)
print("User\t\t\tSimilarity score\n")
for item in similar_users:
    # Scores come back as strings, so convert before rounding.
    similarity = round(float(item[1]), 2)
    print(item[0], '\t\t', similarity)


Users similar to John Carson:

User			Similarity score

Michael Henry 		 0.99
Alex Roberts 		 0.75
Melissa Jones 		 0.59


**生成电影推荐**     

In [8]:
import json
import numpy as np

def generate_recommendations(dataset, user):
    """Recommend items to ``user`` based on the ratings of similar users.

    Every other user with a positive Pearson score contributes their
    ratings for items that ``user`` has not rated (or has rated 0). Each
    candidate item is ranked by its similarity-weighted average rating:
    ``sum(rating * similarity) / sum(similarity)`` over contributing users.

    Args:
        dataset: Mapping of user name -> mapping of item -> numeric rating.
        user: Name of the user to generate recommendations for.

    Returns:
        Item names ordered from highest to lowest weighted average rating,
        or ``['No recommendations possible']`` when there is no candidate
        item.

    Raises:
        TypeError: If ``user`` is not a key of ``dataset``.
    """
    if user not in dataset:
        raise TypeError('User ' + user + ' not present in the dataset')

    own_ratings = dataset[user]
    total_scores = {}
    similarity_sums = {}

    for other in dataset:
        if other == user:
            continue

        similarity_score = pearson_score(dataset, user, other)

        # Only positively correlated users are allowed to vote.
        if similarity_score <= 0:
            continue

        other_ratings = dataset[other]
        for item in other_ratings:
            # Skip items the user has already given a non-zero rating.
            if item in own_ratings and own_ratings[item] != 0:
                continue

            # Accumulate across users. Overwriting here would reduce the
            # weighted average to the last contributing user's rating.
            total_scores[item] = (
                total_scores.get(item, 0)
                + other_ratings[item] * similarity_score
            )
            similarity_sums[item] = (
                similarity_sums.get(item, 0) + similarity_score
            )

    # No candidate items: nothing can be recommended.
    if not total_scores:
        return ['No recommendations possible']

    # Sort on the numeric weighted average. Putting scores and titles in
    # one NumPy array would turn the scores into strings and sort them
    # lexicographically.
    recommendations = sorted(
        total_scores,
        key=lambda item: total_scores[item] / similarity_sums[item],
        reverse=True,
    )

    return recommendations


# Demo: print numbered recommendations for two users.
data_file = 'movie_ratings.json'
with open(data_file, 'r') as f:
    data = json.load(f)

for user in ('Michael Henry', 'John Carson'):
    print("\nRecommendations for " + user + ":")
    movies = generate_recommendations(data, user)
    for rank, movie in enumerate(movies, start=1):
        print(str(rank) + '. ' + movie)


Recommendations for Michael Henry:
1. Jerry Maguire
2. Inception
3. Anger Management

Recommendations for John Carson:
1. No recommendations possible
