In [1]:
import numpy as np
import pandas as pd

In [2]:
# Two example rating vectors of equal length, used below to work through
# cosine similarity by hand.
u1 = np.asarray([4, 3, 0, 0, 5, 0])
u2 = np.asarray([5, 0, 4, 0, 4, 0])

In [3]:
# View u1 as a 1 x 6 row matrix.
# `np.mat` was removed in NumPy 2.0; `np.asmatrix` is the same function under
# its surviving name, so the result is unchanged.
np.asmatrix(u1)

matrix([[4, 3, 0, 0, 5, 0]])

In [5]:
# View u2 as a 6 x 1 column matrix (transpose of the row matrix).
# `np.asmatrix` replaces `np.mat`, which was removed in NumPy 2.0.
np.asmatrix(u2).T

matrix([[5],
        [0],
        [4],
        [0],
        [4],
        [0]])

In [6]:
# Numerator of the cosine similarity: the dot product of u1 and u2, written as
# (1 x 6) * (6 x 1) matrix multiplication.
# `np.asmatrix` replaces `np.mat`, which was removed in NumPy 2.0.
np.asmatrix(u1) * np.asmatrix(u2).T

matrix([[40]])

In [10]:
# Denominator of the cosine similarity: product of the Euclidean lengths of
# u1 and u2.
np.linalg.norm(u1) * np.linalg.norm(u2)

53.38539126015656

In [11]:
# Cosine similarity by hand: dot product divided by the product of the norms.
# `np.asmatrix` replaces `np.mat`, which was removed in NumPy 2.0; the result
# is still a 1 x 1 matrix.
np.asmatrix(u1) * np.asmatrix(u2).T / (
    np.sqrt(np.power(u1, 2).sum()) * np.sqrt(np.power(u2, 2).sum())
)

matrix([[0.74926865]])

In [12]:
# The same two rating vectors as rows of a nested list, ready to be passed to
# a pairwise similarity routine.
data = [
    [4, 3, 0, 0, 5, 0],
    [5, 0, 4, 0, 4, 0],
]

In [13]:
from sklearn.metrics.pairwise import cosine_similarity

In [14]:
# Pairwise cosine similarity between the rows of `data`; the off-diagonal entry
# should match the value computed by hand above.
cosine_similarity(data)

array([[1.        , 0.74926865],
       [0.74926865, 1.        ]])

In [18]:
# Load the ratings file. It has no header row, so the three columns are named
# explicitly.
data = pd.read_csv(
    'example.txt',
    header=None,
    names=['user', 'product_id', 'score'],
)
data

Unnamed: 0,user,product_id,score
0,1,1,4
1,1,2,3
2,1,5,5
3,2,1,5
4,2,3,4
5,2,5,4
6,3,1,4
7,3,3,5
8,3,4,3
9,3,5,4


In [34]:
# Reshape the long (user, product_id, score) records into a wide matrix with
# one row per user and one column per product, so that user-to-user similarity
# can be computed row against row.
freq_matrix = data.pivot(index='user', columns='product_id', values='score')
freq_matrix

product_id,1,2,3,4,5,6
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,4.0,3.0,,,5.0,
2,5.0,,4.0,,4.0,
3,4.0,,5.0,3.0,4.0,
4,,3.0,,,,5.0
5,,4.0,,,,4.0
6,,,2.0,4.0,,5.0


In [36]:
# Fill missing ratings with 0 so every row is a complete numeric vector.
freq_matrix = freq_matrix.fillna(0)

# Cosine similarity between rows, i.e. between users, wrapped in a DataFrame
# labelled by user on both axes.
user_similar_matrix = pd.DataFrame(
    cosine_similarity(freq_matrix),
    index=freq_matrix.index,
    columns=freq_matrix.index,
)
user_similar_matrix
# Next: find the k users most similar to the target user and recommend, via a
# similarity-weighted average, the items those users rated that the target
# user has not.

user,1,2,3,4,5,6
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1.0,0.749269,0.62668,0.218282,0.3,0.0
2,0.749269,1.0,0.913017,0.0,0.0,0.15796
3,0.62668,0.913017,1.0,0.0,0.0,0.403687
4,0.218282,0.0,0.0,1.0,0.970143,0.639137
5,0.3,0.0,0.0,0.970143,1.0,0.527046
6,0.0,0.15796,0.403687,0.639137,0.527046,1.0


In [37]:
# Show the rating matrix again, now with missing ratings filled with 0.
freq_matrix

product_id,1,2,3,4,5,6
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,4.0,3.0,0.0,0.0,5.0,0.0
2,5.0,0.0,4.0,0.0,4.0,0.0
3,4.0,0.0,5.0,3.0,4.0,0.0
4,0.0,3.0,0.0,0.0,0.0,5.0
5,0.0,4.0,0.0,0.0,0.0,4.0
6,0.0,0.0,2.0,4.0,0.0,5.0


In [67]:
# Number of nearest neighbours to use.
k = 2

# Fix one user and one item and work through a single recommendation score.
user_id = 1
item_id = 6

# Ratings that every user gave to the chosen item.
item_id_col = freq_matrix.loc[:, item_id]

# The k users most similar to `user_id`.
# The user is removed by label instead of assuming its self-similarity sorts
# strictly last: a second user with similarity 1.0 (or a NaN) would otherwise
# push the target user into its own neighbour list.
# The final sort_values() keeps the ascending display order.
similar_id_values = (
    user_similar_matrix.loc[user_id, :]
    .drop(labels=user_id)
    .nlargest(k)
    .sort_values()
)
similar_id_values

user
3    0.626680
2    0.749269
Name: 1, dtype: float64

In [68]:
# Order the neighbours by user label so that the similarity weights and the
# ratings line up element by element in the dot products that follow.
similar_id_values = similar_id_values.sort_index()

# Ratings the selected neighbours gave to the current item.
score = item_id_col.loc[similar_id_values.index]
score

user
2    0.0
3    0.0
Name: 6, dtype: float64

In [None]:
# Guard used later inside the function: when none of the neighbours rated the
# item there is nothing to average and the recommendation score is 0.
# A bare `return` is a SyntaxError at cell level, so the condition is shown as
# a plain expression here.
no_neighbour_rating = score.sum() == 0
no_neighbour_rating

In [66]:
# 1 for neighbours that actually rated the item, 0 otherwise; only their
# similarities count towards the denominator of the weighted average.
score_above_0 = (score > 0).astype(int)

# Numerator: similarity-weighted sum of the neighbours' ratings.
above_score = similar_id_values.dot(score)

# Denominator: sum of the similarities of the neighbours that rated the item.
below_score = similar_id_values.dot(score_above_0)

# Recommendation score. When no neighbour contributes the denominator is 0;
# return 0 in that case instead of evaluating 0/0 (NaN).
above_score / below_score if below_score != 0 else 0

3.0

In [70]:
def cal_recommend_index(k,
                        user_id,
                        item_id,
                        freq_matrix,user_similar_matrix
                       ):
    """Predict one user's score for one item from the k most similar users.

    The prediction is a weighted average: the neighbours' ratings of the item
    are weighted by their similarity to ``user_id``, and only neighbours with
    a rating greater than 0 contribute to the denominator.

    Parameters
    ----------
    k : int
        Number of most-similar users to consider.
    user_id :
        Row/column label of the target user in ``user_similar_matrix``.
    item_id :
        Column label of the item in ``freq_matrix``.
    freq_matrix : pandas.DataFrame
        Ratings indexed by user with one column per item.
    user_similar_matrix : pandas.DataFrame
        User-by-user similarity values, labelled by user on both axes.

    Returns
    -------
    0 when the selected neighbours' ratings sum to 0 or when the similarity
    mass of the rating neighbours is 0; otherwise the weighted average.
    """
    item_id_col = freq_matrix.loc[:, item_id]

    # Remove the target user by label rather than assuming its self-similarity
    # sorts strictly last: with a tie at 1.0 (or a NaN, which sorts last) the
    # old positional slice could select the user as its own neighbour.
    candidates = user_similar_matrix.loc[user_id, :].drop(labels=user_id,
                                                           errors="ignore")
    similar_id_values = candidates.nlargest(k)

    # Neighbours' ratings, in the same label order as their similarities.
    score = item_id_col.loc[similar_id_values.index]

    if score.sum() == 0:
        return 0

    score_above_0 = (score > 0).astype(int)
    above_score = similar_id_values.dot(score)
    below_score = similar_id_values.dot(score_above_0)

    # Neighbours that rated the item may all have similarity 0; avoid 0/0.
    if below_score == 0:
        return 0
    return above_score/below_score

In [76]:
# Try the function on a single (user, item) pair: user 2, item 4, with the
# 2 nearest neighbours.
cal_recommend_index(
    k=2,
    user_id=2,
    item_id=4,
    freq_matrix=freq_matrix,
    user_similar_matrix=user_similar_matrix,
)

3.0

In [82]:
# Create an all-zero frame with the same user/product labels as freq_matrix;
# it is filled cell by cell with a nested for loop in the next step.
predict_matrix = pd.DataFrame(
    0.0,
    index=freq_matrix.index,
    columns=freq_matrix.columns,
)
predict_matrix

product_id,1,2,3,4,5,6
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0


In [83]:
# Scan the rating matrix row by row. A 0 means the user has not rated the
# item, so a recommendation score is computed for exactly those cells.
for user_id in freq_matrix.index:
    for item_id in freq_matrix.columns:
        # Already rated: nothing to recommend for this cell.
        if freq_matrix.loc[user_id, item_id] != 0:
            continue
        final_score = cal_recommend_index(
            k=2,
            user_id=user_id,
            item_id=item_id,
            freq_matrix=freq_matrix,
            user_similar_matrix=user_similar_matrix,
        )
        # Store the computed score at the matching position of predict_matrix.
        predict_matrix.loc[user_id, item_id] = final_score

In [91]:
# Keep two decimal places.
predict_matrix = predict_matrix.round(2)
predict_matrix

product_id,1,2,3,4,5,6
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0.0,0.0,4.46,3.0,0.0,0.0
2,0.0,3.0,0.0,3.0,0.0,0.0
3,0.0,3.0,0.0,0.0,0.0,0.0
4,0.0,0.0,2.0,4.0,0.0,0.0
5,0.0,0.0,2.0,4.0,0.0,0.0
6,0.0,3.45,0.0,0.0,0.0,0.0


In [100]:
# Replace every 0 with NaN: a 0 means "no recommendation" for that cell.
predict_matrix_zero = predict_matrix.replace({0 : np.nan})

# Reshape back from wide to long with stack().
# The explicit dropna() makes the result independent of the stack
# implementation: the legacy one drops NaN by default, the new one keeps them,
# which would leave the empty recommendations in the table.
final_recommend_df = predict_matrix_zero.stack().dropna().reset_index()

# The stacked values end up in a column named 0; give it a meaningful name.
final_recommend_df = final_recommend_df.rename({0 : 'recommend_score'}, axis = 1)

# Sort by user ascending and, within each user, by recommendation score
# descending.
final_recommend_df = final_recommend_df.sort_values(by = ['user', 'recommend_score'],
                                                    ascending = [True, False]
                                                   )

In [101]:
# Long-format recommendations: one row per (user, product_id) with its score.
final_recommend_df

Unnamed: 0,user,product_id,recommend_score
0,1,3,4.46
1,1,4,3.0
2,2,2,3.0
3,2,4,3.0
4,3,2,3.0
6,4,4,4.0
5,4,3,2.0
8,5,4,4.0
7,5,3,2.0
9,6,2,3.45


In [6]:
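# Package all of the code above into reusable functions.

# Goal: once freq_matrix has been computed, passing it to the function below
# produces the final recommendations directly.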

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

def cal_recommend_index(k,
                        user_id,
                        item_id,
                        freq_matrix,user_similar_matrix
                       ):
    """Predict one user's score for one item from the k most similar users.

    The prediction is a weighted average: the neighbours' ratings of the item
    are weighted by their similarity to ``user_id``, and only neighbours with
    a rating greater than 0 contribute to the denominator.

    Parameters
    ----------
    k : int
        Number of most-similar users to consider.
    user_id :
        Row/column label of the target user in ``user_similar_matrix``.
    item_id :
        Column label of the item in ``freq_matrix``.
    freq_matrix : pandas.DataFrame
        Ratings indexed by user with one column per item.
    user_similar_matrix : pandas.DataFrame
        User-by-user similarity values, labelled by user on both axes.

    Returns
    -------
    0 when the selected neighbours' ratings sum to 0 or when the similarity
    mass of the rating neighbours is 0; otherwise the weighted average.
    """
    item_id_col = freq_matrix.loc[:, item_id]

    # Remove the target user by label rather than assuming its self-similarity
    # sorts strictly last: with a tie at 1.0 (or a NaN, which sorts last) the
    # old positional slice could select the user as its own neighbour.
    candidates = user_similar_matrix.loc[user_id, :].drop(labels=user_id,
                                                           errors="ignore")
    similar_id_values = candidates.nlargest(k)

    # Neighbours' ratings, in the same label order as their similarities.
    score = item_id_col.loc[similar_id_values.index]

    if score.sum() == 0:
        return 0

    score_above_0 = (score > 0).astype(int)
    above_score = similar_id_values.dot(score)
    below_score = similar_id_values.dot(score_above_0)

    # Neighbours that rated the item may all have similarity 0; avoid 0/0.
    if below_score == 0:
        return 0
    return above_score/below_score


def cal_recommend(freq_matrix, k = 2):
    """Build the full recommendation table from a user x item rating matrix.

    For every cell of ``freq_matrix`` equal to 0, a score is computed with
    ``cal_recommend_index`` from the ``k`` most similar users, using row-wise
    cosine similarity.

    Parameters
    ----------
    freq_matrix : pandas.DataFrame
        Ratings with one row per user and one column per item, with 0 marking
        the cells to predict. The result is sorted by a column named 'user',
        so the index is expected to carry that name.
    k : int, default 2
        Number of nearest neighbours used for each prediction.

    Returns
    -------
    pandas.DataFrame
        Long-format frame with the user and item labels and a
        'recommend_score' column (rounded to 2 decimals), without the rows
        whose score is 0, sorted by user ascending and score descending.
    """
    user_similar_matrix = cosine_similarity(freq_matrix)
    user_similar_matrix = pd.DataFrame(user_similar_matrix,
                                   index = freq_matrix.index,
                                   columns = freq_matrix.index
                                  )
    predict_matrix = pd.DataFrame(np.zeros(freq_matrix.shape),
                              index = freq_matrix.index,
                              columns = freq_matrix.columns
                             )

    for user_id in freq_matrix.index:
        for item_id in freq_matrix.columns:
            if freq_matrix.loc[user_id,item_id] == 0 :
                # Forward the caller's k; it used to be hard-coded to 2 here,
                # which silently ignored the function's own parameter.
                final_score = cal_recommend_index(k = k,
                                                  user_id = user_id,
                                                  item_id = item_id,
                                                  freq_matrix = freq_matrix,
                                                  user_similar_matrix = user_similar_matrix)
                predict_matrix.loc[user_id, item_id] = final_score

    predict_freq_matrix = np.around(predict_matrix,2)
    # Wide -> long; the stacked values land in a column named 0.
    final_recommend_df = predict_freq_matrix.stack().reset_index()
    final_recommend_df = final_recommend_df.rename({0 : 'recommend_score'}, axis = 1)
    # A score of 0 means "nothing to recommend" for that (user, item) pair.
    final_recommend_df = final_recommend_df[final_recommend_df['recommend_score'] != 0]
    final_recommend_df = final_recommend_df.sort_values(['user', 'recommend_score'], ascending = [True, False])

    return final_recommend_df

In [4]:
# Reload the ratings (the file has no header row) and build the user x product
# matrix in one step, filling missing ratings with 0.
data = pd.read_csv(
    'example.txt',
    header=None,
    names=['user', 'product_id', 'score'],
)

freq_matrix = (
    data
    .pivot(index='user', columns='product_id', values='score')
    .fillna(0)
)
freq_matrix

product_id,1,2,3,4,5,6
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,4.0,3.0,0.0,0.0,5.0,0.0
2,5.0,0.0,4.0,0.0,4.0,0.0
3,4.0,0.0,5.0,3.0,4.0,0.0
4,0.0,3.0,0.0,0.0,0.0,5.0
5,0.0,4.0,0.0,0.0,0.0,4.0
6,0.0,0.0,2.0,4.0,0.0,5.0


In [7]:
# Run the packaged pipeline end to end on the rating matrix with k = 2.
cal_recommend(freq_matrix, k = 2)

Unnamed: 0,user,product_id,recommend_score
2,1,3,4.46
3,1,4,3.0
7,2,2,3.0
9,2,4,3.0
13,3,2,3.0
21,4,4,4.0
20,4,3,2.0
27,5,4,4.0
26,5,3,2.0
31,6,2,3.45
