# 1. 目标相似度--向量的余弦相似度

In [10]:
# Make the notebook use the full width of the browser window by injecting a
# CSS rule that sets the `.container` element's width to 100%.
from IPython.display import display, HTML
display(HTML('<style>.container{width:100% !important;}</style>'))

In [11]:
import numpy as np
import pandas as pd
import sklearn

# Build some example rating vectors: 4 users, 6 books.
# Each vector holds one user's ratings for the 6 books
# (a 0 presumably marks a book the user has not rated -- confirm).
user_vector1 = np.array([4, 3, 0, 0, 5, 0])
user_vector2 = np.array([5, 0, 4, 0, 4, 0])
user_vector3 = np.array([4, 0, 5, 3, 4, 0])
user_vector4 = np.array([0, 0, 2, 4, 0, 5])

# Stack the user vectors as rows: one row per user, one column per book.
user_matrix = np.vstack([user_vector1, user_vector2, user_vector3, user_vector4])
# The transpose has one row per book and one column per user.
item_matrix = user_matrix.T

# Derive the counts from the data so they cannot drift out of sync with it
# (they were previously hard-coded as 3 and 5, contradicting the 4 x 6 data).
user_num, item_num = user_matrix.shape


In [12]:
def form_vectors(label_str: str, matrix):
    """Split a matrix into a dict of labelled row vectors.

    Args:
        label_str: Either ``'user'`` or ``'item'``; selects the key prefix
            (``vector_user`` or ``vector_item``).
        matrix: A sequence of rows (e.g. a 2-D NumPy array); each row
            becomes one vector.

    Returns:
        A dict mapping ``'<prefix><n>'`` (``n`` starting at 1) to a NumPy
        array built from row ``n - 1``. The dict is also printed.

    Raises:
        ValueError: If ``label_str`` is neither ``'user'`` nor ``'item'``.
    """
    label_prefixes = {'user': 'vector_user', 'item': 'vector_item'}
    if label_str not in label_prefixes:
        # Raise instead of `assert False`: asserts are stripped under
        # `python -O`, which would leave the prefix undefined below.
        raise ValueError(f"wrong label: {label_str!r}")
    label_prefix = label_prefixes[label_str]

    vectors = {
        f'{label_prefix}{position}': np.array(row)
        for position, row in enumerate(matrix, start=1)
    }

    print(vectors)

    return vectors


## 例1： 面向用户，把用户对物品的喜好向量化，这可以用来计算用户之间的相似度

### 例如：一个向量有6个值，代表**1个用户对于6本书的打分。**


In [13]:
# Build (and print) the labelled per-user vectors: one entry per row of user_matrix.
user_vectors = form_vectors('user', user_matrix)

{'vector_user1': array([4, 3, 0, 0, 5, 0]), 'vector_user2': array([5, 0, 4, 0, 4, 0]), 'vector_user3': array([4, 0, 5, 3, 4, 0]), 'vector_user4': array([0, 0, 2, 4, 0, 5])}


## 例2： 面向物品，把用户对物品的喜好向量化，用来计算物品之间的相似度

### 例如：一个向量，有4个值，代表**4个用户对于1本书的打分。**


In [14]:
# Build (and print) the labelled per-item vectors: one entry per row of
# item_matrix, i.e. per column of user_matrix.
item_vectors = form_vectors('item', item_matrix)

{'vector_item1': array([4, 5, 4, 0]), 'vector_item2': array([3, 0, 0, 0]), 'vector_item3': array([0, 4, 5, 2]), 'vector_item4': array([0, 0, 3, 4]), 'vector_item5': array([5, 4, 4, 0]), 'vector_item6': array([0, 0, 0, 5])}


In [15]:
def cosine_similarity(vector1, vector2):
    """Return the cosine similarity of two vectors.

    The result is ``dot(v1, v2) / (|v1| * |v2|)``, i.e. the dot product of
    the two vectors after each has been scaled to unit length.

    Args:
        vector1: First vector (array-like).
        vector2: Second vector (array-like) of the same length.

    Returns:
        The cosine similarity, or ``0.0`` when either vector has zero norm
        (the ratio is undefined there and would otherwise evaluate to NaN
        with a runtime warning).
    """
    norm_product = np.linalg.norm(vector1) * np.linalg.norm(vector2)
    if norm_product == 0:
        # An all-zero vector has no direction; treat it as similar to nothing.
        return 0.0

    return np.dot(vector1, vector2) / norm_product

In [16]:
def numpy_cos_similar(vectors):
    """Compute, print and return the pairwise cosine-similarity matrix.

    Args:
        vectors: Dict mapping a label to a vector. All vectors must have
            the same length.

    Returns:
        A pandas DataFrame indexed and labelled by the keys of ``vectors``.
        Entry (i, j) is ``dot(v_i, v_j) / (|v_i| * |v_j|)``; pairs that
        involve an all-zero vector get ``0.0``.
    """
    vector_labels = list(vectors)
    # An empty dict would produce a 1-D array; use an empty 2-D one instead
    # so the matrix operations below still apply.
    vector_array = np.array(list(vectors.values())) if vectors else np.empty((0, 0))

    # The similarities are computed with NumPy directly rather than through
    # the module-level `cosine_similarity` name, because a later cell rebinds
    # that name to scikit-learn's matrix-based function.
    dot_products = vector_array @ vector_array.T  # entry (i, j) = v_i . v_j
    norms = np.linalg.norm(vector_array, axis=1)
    norm_products = np.outer(norms, norms)  # entry (i, j) = |v_i| * |v_j|

    # Divide only where the denominator is non-zero; the remaining entries
    # keep the 0.0 they were initialised with.
    similarity_matrix = np.divide(
        dot_products,
        norm_products,
        out=np.zeros(norm_products.shape),
        where=norm_products > 0,
    )

    # Wrap the similarity matrix in a labelled DataFrame.
    similarity_df = pd.DataFrame(similarity_matrix, index=vector_labels, columns=vector_labels)

    # Print the inputs and the result.
    print("Vectors:")
    print(pd.DataFrame(vector_array, index=vector_labels))
    print("\nCosine Similarity Matrix:")
    print(similarity_df)

    # Return the result so callers that assign it do not end up with None.
    return similarity_df

In [17]:
# Print the user vectors and their pairwise cosine-similarity matrix.
user_similarity = numpy_cos_similar(user_vectors)

Vectors:
              0  1  2  3  4  5
vector_user1  4  3  0  0  5  0
vector_user2  5  0  4  0  4  0
vector_user3  4  0  5  3  4  0
vector_user4  0  0  2  4  0  5

Cosine Similarity Matrix:
              vector_user1  vector_user2  vector_user3  vector_user4
vector_user1      1.000000      0.749269      0.626680      0.000000
vector_user2      0.749269      1.000000      0.913017      0.157960
vector_user3      0.626680      0.913017      1.000000      0.403687
vector_user4      0.000000      0.157960      0.403687      1.000000


计算得到了 user1 和 user2/3/4 之间（对 items）的喜好相似度 user_similarity。user_similarity 告诉我们：user1 和 user2 之间的相似度是 0.749，user1 和 user3 之间的相似度是 0.627，user1 和 user4 之间的相似度是 0。

In [18]:
# Print the item vectors and their pairwise cosine-similarity matrix.
item_similarity = numpy_cos_similar(item_vectors)

Vectors:
              0  1  2  3
vector_item1  4  5  4  0
vector_item2  3  0  0  0
vector_item3  0  4  5  2
vector_item4  0  0  3  4
vector_item5  5  4  4  0
vector_item6  0  0  0  5

Cosine Similarity Matrix:
              vector_item1  vector_item2  vector_item3  vector_item4  \
vector_item1      1.000000      0.529813      0.789799      0.317888   
vector_item2      0.529813      1.000000      0.000000      0.000000   
vector_item3      0.789799      0.000000      1.000000      0.685728   
vector_item4      0.317888      0.000000      0.685728      1.000000   
vector_item5      0.982456      0.662266      0.710819      0.317888   
vector_item6      0.000000      0.000000      0.298142      0.800000   

              vector_item5  vector_item6  
vector_item1      0.982456      0.000000  
vector_item2      0.662266      0.000000  
vector_item3      0.710819      0.298142  
vector_item4      0.317888      0.800000  
vector_item5      1.000000      0.000000  
vector_item6      0.000000      1.000000  

我们看到 user2（上面物品向量表中标签为 1 的列）对于 item1/3/4/5 的评分分别是 5, 4, 0, 4；user3（标签为 2 的列）对于 item1/3/4/5 的评分分别是 4, 5, 3, 4。

## 使用 scikit-learn 库（输入为 NumPy 矩阵）

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
def numpy_cosine_similar(matrix):
    """Print a matrix followed by its pairwise cosine-similarity matrix.

    The similarities come from the module-level ``cosine_similarity`` name,
    which the import just above this function binds to scikit-learn's
    implementation. It is called with ``matrix`` as its only argument.

    Args:
        matrix: 2-D array-like, e.g. the output of ``np.vstack`` on several
            vectors (presumably one vector per row -- confirm against the
            scikit-learn documentation).

    Returns:
        None. The results are only printed.
    """
    pairwise_scores = cosine_similarity(matrix)

    # Emit the input first, then the result, each under its own heading.
    for chunk in ("Vectors:", matrix, "\nCosine Similarity Matrix:", pairwise_scores):
        print(chunk)

In [None]:
# Run the scikit-learn based version on the user matrix (one row per user).
numpy_cosine_similar(user_matrix)

In [None]:
# Run the scikit-learn based version on the item matrix (one row per book).
numpy_cosine_similar(item_matrix)

## 使用 scikit-learn + Pandas（输入为带标签的向量字典）

In [None]:
def pandas_cosine_similar(vectors):
    """Print labelled vectors and their labelled cosine-similarity matrix.

    Args:
        vectors: Dict mapping a label to a vector. Each entry becomes one
            column of the intermediate DataFrame.

    Returns:
        None. The results are only printed.
    """
    # One column per labelled vector.
    frame = pd.DataFrame(vectors)
    labels = frame.columns

    # Transpose so each vector is a row before handing it to the
    # module-level `cosine_similarity`, then re-attach the labels on both axes.
    scores = pd.DataFrame(cosine_similarity(frame.T), index=labels, columns=labels)

    # Emit the input first, then the result, each under its own heading.
    for chunk in ("Vectors:", frame, "\nCosine Similarity Matrix:", scores):
        print(chunk)

In [None]:
# Run the DataFrame-based version on the labelled user vectors.
pandas_cosine_similar(user_vectors)

In [None]:
# Run the DataFrame-based version on the labelled item vectors.
pandas_cosine_similar(item_vectors)

# 2.基于相似度做产品推荐度计算 

## 2.1 User-based CF :基于用户的协同过滤算法

User-Based Collaborative Filtering (User-Based CF):

In User-Based CF, recommendations are made based on the preferences and behaviors of users who are similar to the target user. The idea is that if two users have similar tastes or preferences, and one user likes an item that the other has not yet interacted with, it's likely that the second user will also like that item. The similarity between users is typically measured using metrics such as cosine similarity or Pearson correlation.

The steps involved in User-Based CF are as follows:

1. User Similarity Calculation: Compute the similarity between the target user and other users in the system.
2. Neighborhood Selection: Identify a set of users (neighborhood) who are most similar to the target user.
3. Rating Prediction: Predict the target user's preference for items by aggregating the ratings of the items from the selected neighborhood.

One drawback of User-Based CF is the scalability issue. As the number of users grows, calculating user similarities for all pairs can become computationally expensive.

问题：我们知道了用户A的喜好向量，我们想为A推荐“潜在地符合他的喜好的产品”，该怎么做？

合理假设：用户兴趣相似，那么他们对于同一个产品的评价应该类似。

思路：其它相似用户，如果他们已经使用过产品，并对产品有了评分，那么我们可以通过计算他们的加权分数，作为用户A对于该产品可能的喜好度的“推测值”。而权重，就是他们和用户A之间的相似度（见上文，余弦相似度）。


例如，我们已经计算得到了 user1 和 user2/3/4 之间（对 items）的喜好相似度 user_similarity。user_similarity 告诉我们：user1 和 user2 之间的相似度是 0.749，user1 和 user3 之间的相似度是 0.627，user1 和 user4 之间的相似度是 0。

我们看到 user2（物品向量表中标签为 1 的列）对于 item1/3/4/5 的评分分别是 5, 4, 0, 4；user3（标签为 2 的列）对于 item1/3/4/5 的评分分别是 4, 5, 3, 4。
此时，我们想知道 user1（读过 item1/5，没有读过 item3/4）对于 item3/4 可能的评分。我们可以利用 user2/3 已有的数据来预测（因为 user2/3 和 user1 很相似）—— User-Based CF。
那么，我们这样预测 user1 可能对于 item3 的喜好度：(0.749*4 + 0.627*5) / (0.749 + 0.627) ≈ 4.46（约 4.5）
我们这样预测 user1 可能对于 item4 的喜好度（user2 对 item4 的评分为 0，视为尚未评分，因此不参与加权）：0.627*3 / 0.627 = 3


## 2.2 Item-based CF :基于产品的协同过滤算法

思路类似，但是前提是不一样的。

