In [6]:
"""
Author: Myolive_Lin
Description: Code implementation of three methods for calculating user similarity
"""

import sys
import os
import numpy as np
import pandas as pd

# Put the parent of the current working directory on sys.path so the
# `data_processing` package below can be imported.
# NOTE(review): presumably the notebook is run from a sub-folder of the
# project root; confirm the expected working directory.
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
sys.path.append(parent_dir)


import importlib
from data_processing import data_generate

# Re-execute the module so that edits made to data_generate since the first
# import are picked up without restarting the kernel.
importlib.reload(data_generate)

# Import the helper after the reload so the name bound here comes from the
# freshly re-executed module.
from data_processing.data_generate import generate_random_matrix


### UserCF
下面是计算用户相似度的三种方法\
Here are three ways to calculate user similarity

**余弦相似度**，如（式2-1）所示。<u>余弦相似度(CosineSimilarity)衡量了用户向量i和用户向量j之间的向量夹角大小。\
显然，夹角越小，证明余弦相似度越大，两个用户越相似。</u>

$$ sim(i,j) = cos(i,j) = \frac{i\cdot j}{||i|| \cdot ||j||}$$

In [7]:
# Cosine Similarity
def cosine_similarity(v1,v2):
    """
    Calculate cosine similarity between two vectors
    Args:
        v1 (np.array): vector 1
        v2 (np.array): vector 2
    Returns:
        float: cosine similarity, or 0.0 when either vector has zero norm
    """
    dot = float(np.dot(v1, v2))
    denominator = np.linalg.norm(v1) * np.linalg.norm(v2)

    # An all-zero vector has no direction, so the angle is undefined.
    # Dividing would yield nan (plus a RuntimeWarning); report "no similarity"
    # instead so that callers can still sort and sum the results.
    if denominator == 0:
        return 0.0

    return dot / denominator

**皮尔逊相关系数**，如（式2-2）所示。相比余弦相似度，**皮尔逊相关系数通过使用用户平均分对各独立评分进行修正，减少了用户评分偏置的影响**
$$\text{Sim}(i, j) = \frac{\sum_{p \in P} (R_{i,p} - \bar{R_i})(R_{j,p} - \bar{R_j})}{\sqrt{\sum_{p \in P} (R_{i,p} - \bar{R_{i}})^2} \cdot \sqrt{\sum_{p \in P} (R_{j,p} - \bar{R_{j}})^2}}$$

其中，$R_{i,p}$ 代表用户i对物品p的评分，$\bar{R_i}$ 代表用户i对所有物品的平均评分，P代表所有物品的集合。

In [8]:
def Peason_similarity(vector1,vector2):
    """
    Calculate the Pearson correlation coefficient between two vectors
    Args:
        vector1 (np.array): vector 1
        vector2 (np.array): vector 2
    Returns:
        float: Pearson similarity, or 0.0 when either vector is constant
    """
    vector1 = np.asarray(vector1, dtype=float)
    vector2 = np.asarray(vector2, dtype=float)

    # Centre each vector on its own mean (the scalar mean is broadcast
    # across every element).
    centered_v1 = vector1 - np.mean(vector1)
    centered_v2 = vector2 - np.mean(vector2)

    # Euclidean norms of the centred vectors: the two square-root terms in
    # the denominator of the Pearson formula.
    norm_v1 = np.sqrt(np.sum(centered_v1 ** 2))
    norm_v2 = np.sqrt(np.sum(centered_v2 ** 2))
    denominator = norm_v1 * norm_v2

    # A constant vector has zero spread, so the correlation is undefined.
    # Return 0.0 rather than letting 0/0 produce nan.
    if denominator == 0:
        return 0.0

    # Numerator: sum of element-wise products of the centred vectors.
    product = np.sum(centered_v1 * centered_v2)

    return product / denominator


基于皮尔逊系数的思路，**还可以通过引入物品平均分的方式，减少物品评分偏置对结果的影响**，如下所示
$$\text{Sim}(i, j) = \frac{\sum_{p \in P} (R_{i,p} - \bar{R}_p)(R_{j,p} - \bar{R}_p)}{\sqrt{\sum_{p \in P} (R_{i,p} - \bar{R}_p)^2} \cdot \sqrt{\sum_{p \in P} (R_{j,p} - \bar{R}_p)^2}}$$

其中，$\bar{R}_{p}$ 代表物品p得到所有评分的平均分。

In [9]:
#Introducing the similarity calculation formula for the average score of items
def similarity_mean_item(vector1,vector2,matrix):
    """
    Calculate similarity between two vectors after subtracting the per-item
    (column) mean of the rating matrix from both of them
    Args:
        vector1 (np.array): vector 1
        vector2 (np.array): vector 2
        matrix (np.array): matrix whose column means are used for centring
    Returns:
        float: similarity, or 0.0 when either centred vector is all zeros
    """
    # Average of every column, i.e. one mean per item.
    mean_item = np.mean(matrix, axis=0)

    deviation_v1 = vector1 - mean_item
    deviation_v2 = vector2 - mean_item

    norm_v1 = np.sqrt(np.sum(deviation_v1 ** 2))
    norm_v2 = np.sqrt(np.sum(deviation_v2 ** 2))
    denominator = norm_v1 * norm_v2

    # A vector that equals the item means exactly carries no deviation
    # information; avoid the 0/0 -> nan result.
    if denominator == 0:
        return 0.0

    product = np.sum(deviation_v1 * deviation_v2)

    return product / denominator

In [95]:

def Calculate_Top_k_User(user_vecotr, matrix, k, exclude_index=None):
    """
    Select the k rows of ``matrix`` most similar to ``user_vecotr``.

    Similarity is cosine similarity; a larger value means more similar.

    Args:
        user_vecotr (np.array): rating vector of the target user
        matrix (np.array): user matrix, one row per user
        k (int): maximum number of similar users to return
        exclude_index (int, optional): row index of the target user inside
            ``matrix``. When given, that row is skipped explicitly. When
            omitted, the highest-ranked row is dropped instead, on the
            assumption that it is the target user itself; that assumption
            can fail when another row ties with the target (for example a
            duplicate or proportional rating vector).
    Returns:
        list[tuple[int, float]]: (row index, similarity) pairs sorted by
        descending similarity, at most k entries.
    """
    if k <= 0:
        return []

    scored = [
        (row_index, cosine_similarity(user_vecotr, matrix[row_index]))
        for row_index in range(len(matrix))
        if row_index != exclude_index
    ]
    scored.sort(key=lambda pair: pair[1], reverse=True)

    if exclude_index is None:
        # Legacy behaviour: position 0 is presumed to be the user itself.
        return scored[1:k + 1]
    return scored[:k]

In [96]:
# Generate a random 4 x 8 matrix (min_value=0, max_value=5, seed=42)
matrix = generate_random_matrix(4,8,0,5,42)
matrix

array([[3, 4, 2, 4, 4, 1, 2, 2],
       [2, 4, 3, 2, 5, 4, 1, 3],
       [5, 5, 1, 3, 4, 0, 3, 1],
       [5, 4, 3, 0, 0, 2, 2, 1]])

In [97]:
# Example: the three users most similar to the first user (row 0)
Calculate_Top_k_User(matrix[0],matrix,3)

[(2, 0.9408591535633023), (1, 0.8867889026274118), (3, 0.7002247996006618)]

In [112]:
# Predict the target user's rating as the similarity-weighted average of the
# ratings given by the most similar users
def get_evaluation_prediction(user_index, item_index,matrix,k):
    """
    Predict the rating of one user for one item from the k most similar users.

    Args:
        user_index (int): row index of the target user
        item_index (int): column index of the item to predict
        matrix (np.array): rating matrix, one row per user and one column per item
        k (int): number of similar users to use
    Returns:
        float: similarity-weighted average of the neighbours' ratings for the
        item, or 0.0 when there are no neighbours or their similarities sum
        to zero.
    """
    # The column being predicted is unknown for the target user, so it is
    # removed from both the user vector and the matrix before similarities
    # are computed.
    user_vector = np.delete(matrix[user_index], item_index)
    target_matrix = np.delete(matrix, item_index, axis=1)

    top_k_user = Calculate_Top_k_User(user_vector, target_matrix, k)

    weighted_rating_sum = 0.0
    similarity_sum = 0.0  # named so that the builtin `sum` is not shadowed

    # NOTE: every neighbour's entry for the item is used as-is, including 0.
    for neighbor_index, similarity in top_k_user:
        weighted_rating_sum += similarity * matrix[neighbor_index][item_index]
        similarity_sum += similarity

    # No neighbours (e.g. k == 0 or a single-user matrix) or zero total
    # similarity: there is nothing to average, so avoid dividing by zero.
    if similarity_sum == 0:
        return 0.0

    return weighted_rating_sum / similarity_sum

In [114]:
# Example: predict the first user's (index 0) rating for the 8th item (index 7)
get_evaluation_prediction(user_index= 0, item_index= 7,matrix = matrix,k = 3)

1.6982366845190702

In [115]:
# Finally, predict a score for every item the target user has not used yet;
# the recommendation list is those items sorted by predicted score
def get_recommend(user_index, matrix, top_k = 5, k = None):
    """
    Build a user-based recommendation list for one user.

    Args:
        user_index (int): row index of the target user
        matrix (np.array): rating matrix; a rating of 0 marks an unused item
        top_k (int): maximum number of recommendations to return
        k (int, optional): number of similar users used for each prediction.
            Defaults to ``top_k``, which is what this function always used
            before the two settings were separated.
    Returns:
        list[tuple[int, float]]: (item index, predicted score) pairs sorted
        by descending score, at most top_k entries.
    """
    neighbor_count = top_k if k is None else k

    # Items with a rating of 0 are treated as not yet used.
    unuse_items = [
        item_index
        for item_index, rating in enumerate(matrix[user_index])
        if rating == 0
    ]

    predict_scores = [
        (item_index, get_evaluation_prediction(user_index, item_index, matrix, k = neighbor_count))
        for item_index in unuse_items
    ]
    predict_scores.sort(key = lambda pair: pair[1], reverse = True)

    return predict_scores[:top_k]



In [116]:
# Show the rating matrix again for reference
matrix

array([[3, 4, 2, 4, 4, 1, 2, 2],
       [2, 4, 3, 2, 5, 4, 1, 3],
       [5, 5, 1, 3, 4, 0, 3, 1],
       [5, 4, 3, 0, 0, 2, 2, 1]])

In [117]:
# Example: build the final recommendation list for the user at index 3
recomend = get_recommend(user_index = 3, matrix = matrix, top_k = 5)
recomend

[(4, 4.329826164304694), (3, 3.042637466393807)]

### ItemCF
物品相似度计算
ItemCF的具体步骤如下：

1. 基于历史数据，构建以用户（假设用户总数为m）为行坐标，物品（物品总数为n）为列坐标的 $m \times n$ 维的共现矩阵。
2. **计算共现矩阵两两列向量间的相似性（相似度的计算方式与用户相似度的计算方式相同），构建 $n \times n$ 维的物品相似度矩阵。**
3. 获得用户历史行为数据中的正反馈物品列表。
4. **利用物品相似度矩阵，针对目标用户历史行为中的正反馈物品，找出相似的Top K个物品，组成相似物品集合。**
5. 对相似物品集合中的物品，利用相似度分值进行排序，生成最终的推荐列表。

公式如下
$$ R_{u,p} = \sum_{h \in H}  (w_{p,h} \cdot R_{u,h})$$ 

其中，H是目标用户的正反馈物品集合，$w_{p,h}$ **是物品p与物品h的物品相似度**，$R_{u,h}$ 是用户u对物品h的已有评分。


In [109]:
def get_item_similarity_matrix(matrix):
    """
    Build the item-to-item cosine similarity matrix.

    Args:
        matrix (np.array): 2-D matrix whose columns are compared pairwise
    Returns:
        np.array: float32 matrix with one row and one column per column of
        ``matrix``; entry [a, b] is the cosine similarity of columns a and b.
    """
    n_items = matrix.shape[1]
    similarities = np.zeros((n_items, n_items), dtype=np.float32)

    for col_a in range(n_items):
        # Only the upper triangle (diagonal included) is computed; each
        # value is mirrored into the lower triangle.
        for col_b in range(col_a, n_items):
            value = cosine_similarity(matrix[:, col_a], matrix[:, col_b])
            similarities[col_a, col_b] = value
            similarities[col_b, col_a] = value

    return similarities


In [136]:
# Regenerate the random matrix: 4 rows, 8 columns, values from
# min_value=0 to max_value=5, seed 42
matrix = generate_random_matrix(rows = 4,cols = 8,min_value = 0, max_value = 5,seed = 42)
matrix

array([[3, 4, 2, 4, 4, 1, 2, 2],
       [2, 4, 3, 2, 5, 4, 1, 3],
       [5, 5, 1, 3, 4, 0, 3, 1],
       [5, 4, 3, 0, 0, 2, 2, 1]])

In [137]:
# Compute the item similarity matrix and display it as a DataFrame
item_similarity_matrix = get_item_similarity_matrix(matrix)
pd.DataFrame(item_similarity_matrix)

Unnamed: 0,0,1,2,3,4,5,6,7
0,1.0,0.958477,0.840651,0.725258,0.700877,0.57735,0.979958,0.71566
1,0.958477,1.0,0.902976,0.847626,0.868139,0.715133,0.96554,0.876377
2,0.840651,0.902976,1.0,0.658243,0.745697,0.910031,0.786357,0.915249
3,0.725258,0.847626,0.658243,1.0,0.934646,0.486265,0.831607,0.815088
4,0.700877,0.868139,0.745697,0.934646,1.0,0.693688,0.780488,0.92338
5,0.57735,0.715133,0.910031,0.486265,0.693688,1.0,0.514345,0.901498
6,0.979958,0.96554,0.786357,0.831607,0.780488,0.514345,1.0,0.730297
7,0.71566,0.876377,0.915249,0.815088,0.92338,0.901498,0.730297,1.0


In [110]:
def get_evaluation_prediction_by_item(item_similarity_matrix, item_index, user_vector):
    """
    Predict a score for one item from the item similarity matrix and the
    user's rating vector.

    The result is the sum, over every other item, of (similarity to the
    target item) * (the user's rating of that item).

    :param item_similarity_matrix: item-to-item similarity matrix
    :param item_index: index of the target item
    :param user_vector: rating vector of the user
    :return: predicted score for the target item
    """
    # Drop the target item's own entry from both vectors so that the
    # remaining positions line up one-to-one.
    weights = np.delete(item_similarity_matrix[item_index], item_index)
    ratings = np.delete(user_vector, item_index)

    weighted = ratings * weights

    return np.sum(weighted)




In [128]:
# Example: predicted score of the 7th item (index 6) for the 4th user (index 3)
get_evaluation_prediction_by_item(item_similarity_matrix= item_similarity_matrix, item_index = 6, user_vector = matrix[3])

12.88000738620758

In [133]:
def get_item_recommend(user_index, matrix, top_k=5):
    """
    Build an item-based recommendation list for one user.

    Args:
        user_index (int): row index of the target user
        matrix (np.array): rating matrix; a rating of 0 marks an unused item
        top_k (int): maximum number of recommendations to return
    Returns:
        list[tuple[int, float]]: (item index, predicted score) pairs sorted
        by descending score, at most top_k entries.
    """
    # Candidate items are those the user has rated 0.
    candidates = [
        item_index
        for item_index, rating in enumerate(matrix[user_index])
        if rating == 0
    ]
    similarities = get_item_similarity_matrix(matrix)

    scored = [
        (
            item_index,
            get_evaluation_prediction_by_item(
                item_similarity_matrix=similarities,
                item_index=item_index,
                user_vector=matrix[user_index],
            ),
        )
        for item_index in candidates
    ]

    ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)
    return ranked[:top_k]

    

In [135]:
# Example: recommend up to 5 items that the 4th user (index 3) has not rated yet
recommend = get_item_recommend(3,matrix, 5)
recommend

[(4, 13.085762321949005), (3, 12.442351639270782)]