<h1>目录<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#读入数据集" data-toc-modified-id="读入数据集-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>读入数据集</a></span></li><li><span><a href="#构建用户和物品之间的关系矩阵" data-toc-modified-id="构建用户和物品之间的关系矩阵-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>构建用户和物品之间的关系矩阵</a></span></li><li><span><a href="#构建用户余弦相似度矩阵" data-toc-modified-id="构建用户余弦相似度矩阵-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>构建用户余弦相似度矩阵</a></span></li><li><span><a href="#基于物品的协同过滤" data-toc-modified-id="基于物品的协同过滤-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>基于物品的协同过滤</a></span></li><li><span><a href="#计算用户对于产品的推荐系数" data-toc-modified-id="计算用户对于产品的推荐系数-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>计算用户对于产品的推荐系数</a></span></li><li><span><a href="#构建预测矩阵" data-toc-modified-id="构建预测矩阵-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>构建预测矩阵</a></span></li><li><span><a href="#构建最终的推荐功能" data-toc-modified-id="构建最终的推荐功能-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>构建最终的推荐功能</a></span></li><li><span><a href="#我们要对上面处理到的数据进行分组操作" data-toc-modified-id="我们要对上面处理到的数据进行分组操作-8"><span class="toc-item-num">8&nbsp;&nbsp;</span>我们要对上面处理到的数据进行分组操作</a></span></li></ul></div>

# MovieLens数据集基于用户的协同过滤算法

In [1]:
import numpy as np
import pandas as pd

### 读入数据集

In [6]:
# Load the raw MovieLens ratings file (u.data): tab-separated with no header row,
# so pandas labels the columns with integers.
df = pd.read_csv('ml-100k/u.data',sep='\t' ,header=None)
df

Unnamed: 0,0,1,2,3
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
...,...,...,...,...
99995,880,476,3,880175444
99996,716,204,5,879795543
99997,276,1090,1,874795795
99998,13,225,2,882399156


In [7]:
# Give the columns meaningful names: user ID, item ID, rating, timestamp.
df.columns = ['用户ID','物品ID','评分','时间']
df

Unnamed: 0,用户ID,物品ID,评分,时间
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
...,...,...,...,...
99995,880,476,3,880175444
99996,716,204,5,879795543
99997,276,1090,1,874795795
99998,13,225,2,882399156


In [8]:
# Drop the timestamp column, which is not used for the recommendation.
# Rebinding via drop(..., errors='ignore') keeps this cell idempotent:
# `del df['时间']` raises KeyError when the cell is executed a second time.
df = df.drop(columns='时间', errors='ignore')

In [9]:
df

Unnamed: 0,用户ID,物品ID,评分
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1
...,...,...,...
99995,880,476,3
99996,716,204,5
99997,276,1090,1
99998,13,225,2


In [52]:
df.评分.value_counts()

4    34174
3    27145
5    21201
2    11370
1     6110
Name: 评分, dtype: int64

### 构建用户和物品之间的关系矩阵

使用透视表方法, pivot 构建数据集

In [10]:
# Build the user-item rating matrix with a pivot:
#   index   -> user ID (one row per user)
#   columns -> item ID (one column per item)
#   values  -> the rating; user/item pairs without a rating come out as NaN
df_pivot = df.pivot(index = "用户ID",columns="物品ID",values='评分')
df_pivot

物品ID,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
用户ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,3.0,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,,,,,,,,,5.0,,...,,,,,,,,,,
940,,,,2.0,,,4.0,5.0,3.0,,...,,,,,,,,,,
941,5.0,,,,,,4.0,,,,...,,,,,,,,,,
942,,,,,,,,,,,...,,,,,,,,,,


In [13]:
# Replace missing ratings (NaN) with 0 so every user row is a dense numeric vector.
df_pivot = df_pivot.fillna(0)

In [14]:
df_pivot

物品ID,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
用户ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
940,0.0,0.0,0.0,2.0,0.0,0.0,4.0,5.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
941,5.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
942,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### 构建用户余弦相似度矩阵

In [15]:
# 导入余弦相似度
from sklearn.metrics.pairwise import cosine_similarity

In [16]:
# Pairwise cosine similarity between the rows of df_pivot (one row per user),
# i.e. the user-user similarity matrix.
user_similar_matrix = cosine_similarity(df_pivot)
user_similar_matrix

array([[1.        , 0.16693098, 0.04745954, ..., 0.14861694, 0.17950788,
        0.39817474],
       [0.16693098, 1.        , 0.11059132, ..., 0.16148478, 0.17226781,
        0.10579788],
       [0.04745954, 0.11059132, 1.        , ..., 0.10124256, 0.13341615,
        0.02655587],
       ...,
       [0.14861694, 0.16148478, 0.10124256, ..., 1.        , 0.1016418 ,
        0.09511958],
       [0.17950788, 0.17226781, 0.13341615, ..., 0.1016418 , 1.        ,
        0.18246466],
       [0.39817474, 0.10579788, 0.02655587, ..., 0.09511958, 0.18246466,
        1.        ]])

In [17]:
# Pearson correlation matrix.
# DataFrame.corr correlates columns, so calling it directly on df_pivot
# produces item-item similarity rather than user-user similarity.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt.
df_pivot.corr(method='pearson')

KeyboardInterrupt: 

In [21]:
# Transpose first so that users become the columns, then correlate:
# the result is the user-user Pearson correlation matrix.
user_pearson_similar_matrix  = df_pivot.T.corr(method='pearson')
user_pearson_similar_matrix
# Pearson correlation coefficient matrix.
# NOTE(review): the recorded output is 6x6, which does not match the 943 users
# loaded above — presumably left over from a run on a toy dataset; confirm.

用户ID,1,2,3,4,5,6
用户ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1.0,0.511321,0.1624,-0.284199,-0.169842,-0.865802
2,0.511321,1.0,0.822475,-0.668261,-0.698884,-0.591708
3,0.1624,0.822475,1.0,-0.914286,-0.956183,-0.304646
4,-0.284199,-0.668261,-0.914286,1.0,0.956183,0.429274
5,-0.169842,-0.698884,-0.956183,0.956183,1.0,0.231714
6,-0.865802,-0.591708,-0.304646,0.429274,0.231714,1.0


In [23]:
# 余弦相似度矩阵对比
pd.DataFrame(user_similar_matrix)

Unnamed: 0,0,1,2,3,4,5
0,1.0,0.749269,0.62668,0.218282,0.3,0.0
1,0.749269,1.0,0.913017,0.0,0.0,0.15796
2,0.62668,0.913017,1.0,0.0,0.0,0.403687
3,0.218282,0.0,0.0,1.0,0.970143,0.639137
4,0.3,0.0,0.0,0.970143,1.0,0.527046
5,0.0,0.15796,0.403687,0.639137,0.527046,1.0


In [None]:
# 假设我们是基于最近邻的2个用户去进行推荐


### 基于物品的协同过滤

item_based

In [28]:
# Transpose so that each row is an item,
# then compute pairwise cosine similarity to obtain item-item similarity.
cosine_similarity(df_pivot.T)

array([[1.        , 0.27258651, 0.7897985 , 0.31788777, 0.98245614,
        0.        ],
       [0.27258651, 1.        , 0.        , 0.        , 0.34073314,
        0.65441051],
       [0.7897985 , 0.        , 1.        , 0.68572751, 0.71081865,
        0.18349396],
       [0.31788777, 0.        , 0.68572751, 1.        , 0.31788777,
        0.49236596],
       [0.98245614, 0.34073314, 0.71081865, 0.31788777, 1.        ,
        0.        ],
       [0.        , 0.65441051, 0.18349396, 0.49236596, 0.        ,
        1.        ]])

### 计算用户对于产品的推荐系数

直接调用写好的脚本,计算用户和产品之间的推荐系数

In [18]:
# 导入计算函数
from recommend import cal_recommend_index_by_users

In [19]:
df_pivot

物品ID,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
用户ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
940,0.0,0.0,0.0,2.0,0.0,0.0,4.0,5.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
941,5.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
942,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
# Extract the rating matrix from the DataFrame as a plain ndarray
# (rows follow df_pivot's index, columns follow df_pivot's columns).
freq_matrix = df_pivot.values
freq_matrix

array([[5., 3., 4., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [5., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 5., 0., ..., 0., 0., 0.]])

In [21]:
user_similar_matrix

array([[1.        , 0.16693098, 0.04745954, ..., 0.14861694, 0.17950788,
        0.39817474],
       [0.16693098, 1.        , 0.11059132, ..., 0.16148478, 0.17226781,
        0.10579788],
       [0.04745954, 0.11059132, 1.        , ..., 0.10124256, 0.13341615,
        0.02655587],
       ...,
       [0.14861694, 0.16148478, 0.10124256, ..., 1.        , 0.1016418 ,
        0.09511958],
       [0.17950788, 0.17226781, 0.13341615, ..., 0.1016418 , 1.        ,
        0.18246466],
       [0.39817474, 0.10579788, 0.02655587, ..., 0.09511958, 0.18246466,
        1.        ]])

In [22]:
# Smoke-test the helper by passing every argument by keyword.
# Here: neighbourhood size k=2, user at row index 0, item at column index 2.
cal_recommend_index_by_users(
    k=2,
    user_id=0,
    item_id=2,
    freq_matrix=freq_matrix,
    user_similar_matrix=user_similar_matrix,
)

3.0

In [23]:
# Same check for the user at row index 0 and the item at column index 3.
cal_recommend_index_by_users(
    k=2,
    user_id=0,
    item_id=3,
    freq_matrix=freq_matrix,
    user_similar_matrix=user_similar_matrix,
)

4.0

### 构建预测矩阵

目标: 将所有用户对于所有未评分产品的推荐系数全部计算出来

思路:

1. 先创建一个和用户评分矩阵形状相同的空矩阵,形状和freq_matrix 相同
2. 依次对这个空矩阵中的每一个格子进行计算,也就是计算每一个用户和每一个产品之间的推荐系数
3. 如果原来的freq_matrix 对应的格子中已经有数字存在了, 则这个格子不需要计算.如果数字为0,则调用上面的cal_recommend_index_by_users 函数计算对应的推荐系数,并填充到该格子中去.
4. 循环方法: 写一个两层循环, 外层循环代表 行, 内层代表 列,依次对所有格子进行计算.

In [63]:
# Nested-loop demo: the outer loop walks i over 0..4 and, for each i,
# the inner loop walks j over 0..3; a separator line is printed after each i.
for i in range(5): # i runs from 0 to 4
    for j in range(4):
        print("当前的i值是:",i,"   j值是:",j)
    print('-'*30)

当前的i值是: 0    j值是: 0
当前的i值是: 0    j值是: 1
当前的i值是: 0    j值是: 2
当前的i值是: 0    j值是: 3
------------------------------
当前的i值是: 1    j值是: 0
当前的i值是: 1    j值是: 1
当前的i值是: 1    j值是: 2
当前的i值是: 1    j值是: 3
------------------------------
当前的i值是: 2    j值是: 0
当前的i值是: 2    j值是: 1
当前的i值是: 2    j值是: 2
当前的i值是: 2    j值是: 3
------------------------------
当前的i值是: 3    j值是: 0
当前的i值是: 3    j值是: 1
当前的i值是: 3    j值是: 2
当前的i值是: 3    j值是: 3
------------------------------
当前的i值是: 4    j值是: 0
当前的i值是: 4    j值是: 1
当前的i值是: 4    j值是: 2
当前的i值是: 4    j值是: 3
------------------------------


In [24]:
freq_matrix.shape

(943, 1682)

In [25]:
# np.zeros_like creates an all-zero array with the same shape as freq_matrix;
# it is filled in with the predicted recommendation scores further down.
predict_matrix = np.zeros_like(freq_matrix)
predict_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [27]:
freq_matrix.shape[0] # 行数

943

In [28]:
freq_matrix.shape[1] # 列数

1682

In [29]:
freq_matrix # 原来的

array([[5., 3., 4., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [5., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 5., 0., ..., 0., 0., 0.]])

In [30]:
predict_matrix # 预测的

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [31]:
# Walk over every (row, column) cell of the rating matrix in row-major order;
# rows are users and columns are items.
for user_id, item_id in np.ndindex(*freq_matrix.shape):
    # A non-zero entry means the user already rated this item: nothing to predict,
    # the corresponding prediction cell keeps its initial 0.
    if freq_matrix[user_id, item_id] != 0:
        continue
    # Unrated cell: ask the helper for the recommendation score
    # (arguments are passed positionally, exactly as before).
    recommend_index = cal_recommend_index_by_users(
        2, user_id, item_id, freq_matrix, user_similar_matrix
    )
    # Store the score at the matching position of the prediction matrix.
    predict_matrix[user_id, item_id] = recommend_index

In [32]:
# 所有对应的推荐系数就都计算出来了
predict_matrix

array([[0., 0., 0., ..., 0., 0., 3.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 2., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [3., 0., 0., ..., 0., 0., 0.],
       [4., 0., 3., ..., 0., 0., 0.]])

In [36]:
# 也可以直接调用脚本,直接进行计算
from recommend import cal_recommend_index_matrix_by_users

In [76]:
# Presumably computes the whole prediction matrix in one call, with k=2 neighbours
# — verify against recommend.py.
# NOTE(review): the recorded output is 6x6 rather than (943, 1682); it looks stale.
predict_matrix2 = cal_recommend_index_matrix_by_users(freq_matrix,
                                                user_similar_matrix,k=2)
predict_matrix2

array([[0.        , 0.        , 4.45545287, 3.        , 0.        ,
        0.        ],
       [0.        , 3.        , 0.        , 3.        , 0.        ,
        0.        ],
       [0.        , 3.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 2.        , 4.        , 0.        ,
        0.        ],
       [0.        , 0.        , 2.        , 4.        , 0.        ,
        0.        ],
       [0.        , 3.45194102, 0.        , 0.        , 0.        ,
        0.        ]])

### 构建最终的推荐功能

In [37]:
predict_matrix

array([[0., 0., 0., ..., 0., 0., 3.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 2., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [3., 0., 0., ..., 0., 0., 0.],
       [4., 0., 3., ..., 0., 0., 0.]])

In [38]:
df_pivot

物品ID,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
用户ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
940,0.0,0.0,0.0,2.0,0.0,0.0,4.0,5.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
941,5.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
942,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [39]:
# Wrap the prediction array in a DataFrame,
# reusing df_pivot's row index (user IDs) and column labels (item IDs).
re_df = pd.DataFrame(predict_matrix, index=df_pivot.index, columns= df_pivot.columns)
re_df

物品ID,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
用户ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,3.0,2.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,1.0,4.0,0.0,3.000000,0.000000,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,4.0,0.0,0.0,0.0,0.0,0.0,4.000000,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
940,2.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
941,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
942,3.0,0.0,0.0,0.0,0.0,0.0,0.000000,5.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [40]:
# 假设给每个用户推荐两个产品
# 首先用stack将DataFrame转换成2重索引的Series
re_df.stack()

用户ID  物品ID
1     1       0.0
      2       0.0
      3       0.0
      4       0.0
      5       0.0
             ... 
943   1678    0.0
      1679    0.0
      1680    0.0
      1681    0.0
      1682    0.0
Length: 1586126, dtype: float64

In [41]:
# 重置索引,相当于将原来的索引变成普通的列数据, 然后生成一个新的序列索引
re_df.stack().reset_index()

Unnamed: 0,用户ID,物品ID,0
0,1,1,0.0
1,1,2,0.0
2,1,3,0.0
3,1,4,0.0
4,1,5,0.0
...,...,...,...
1586121,943,1678,0.0
1586122,943,1679,0.0
1586123,943,1680,0.0
1586124,943,1681,0.0


In [42]:
re_df_2 =re_df.stack().reset_index()
re_df_2

Unnamed: 0,用户ID,物品ID,0
0,1,1,0.0
1,1,2,0.0
2,1,3,0.0
3,1,4,0.0
4,1,5,0.0
...,...,...,...
1586121,943,1678,0.0
1586122,943,1679,0.0
1586123,943,1680,0.0
1586124,943,1681,0.0


In [43]:
# Give the score column (labelled 0 after reset_index) a descriptive name.
re_df_2 = re_df_2.rename(columns={0: '推荐指数'})

In [44]:
re_df_2

Unnamed: 0,用户ID,物品ID,推荐指数
0,1,1,0.0
1,1,2,0.0
2,1,3,0.0
3,1,4,0.0
4,1,5,0.0
...,...,...,...
1586121,943,1678,0.0
1586122,943,1679,0.0
1586123,943,1680,0.0
1586124,943,1681,0.0


### 我们要对上面处理到的数据进行分组操作

将用户ID相同的数据分成一组,然后提取出每一组中推荐指数最高的 n 条记录(物品ID和推荐指数)

In [45]:
# Group the long-format score table by user ID; each group holds one user's rows.
re_df_groupby = re_df_2.groupby(by='用户ID')
re_df_groupby

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x1a2657fdd0>

In [63]:
# Pull out the rows for user ID 1 to prototype the per-group logic.
test = re_df_2[re_df_2.用户ID == 1]
test

Unnamed: 0,用户ID,物品ID,推荐指数
0,1,1,0.0
1,1,2,0.0
2,1,3,0.0
3,1,4,0.0
4,1,5,0.0
...,...,...,...
1677,1,1678,0.0
1678,1,1679,0.0
1679,1,1680,0.0
1680,1,1681,0.0


In [64]:
# Number of items to recommend per user.
n = 5

In [65]:
test.sort_values(by='推荐指数', ascending=False )

Unnamed: 0,用户ID,物品ID,推荐指数
424,1,425,5.0
332,1,333,5.0
482,1,483,5.0
650,1,651,5.0
495,1,496,5.0
...,...,...,...
605,1,606,0.0
604,1,605,0.0
603,1,604,0.0
601,1,602,0.0


In [66]:
# Sort by recommendation score in descending order, then keep the first n rows.
# The slice uses n instead of a hard-coded 5 so the preview follows the configured value.
test.sort_values(by='推荐指数', ascending=False )[:n]

Unnamed: 0,用户ID,物品ID,推荐指数
424,1,425,5.0
332,1,333,5.0
482,1,483,5.0
650,1,651,5.0
495,1,496,5.0


In [67]:
# 将上面的过程写成函数
def get_topn(group , n):
    """Return the ``n`` highest-scoring rows of one group.

    group: DataFrame with the rows of a single group; it must contain a
        '推荐指数' (recommendation score) column.
    n: number of rows (items to recommend) to keep.
    """
    # Highest recommendation score first.
    ranked = group.sort_values(by='推荐指数', ascending=False)
    # Keep only the leading n rows of the ranking.
    return ranked[:n]

In [68]:
# No built-in aggregation does what is needed here, so a custom function is applied per group.
# The extra positional argument given to apply() (n) is forwarded to get_topn.
# Result: the top-n recommended items for every user.
topn = re_df_groupby.apply(get_topn , n)
topn

Unnamed: 0_level_0,Unnamed: 1_level_0,用户ID,物品ID,推荐指数
用户ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,424,1,425,5.0
1,332,1,333,5.0
1,482,1,483,5.0
1,650,1,651,5.0
1,495,1,496,5.0
...,...,...,...,...
943,1585193,943,750,5.0
943,1584964,943,521,5.0
943,1585462,943,1019,5.0
943,1585367,943,924,5.0


In [69]:
# Drop the second index level (the original row number), keeping only the user ID level.
topn_final = topn.droplevel(1)
topn_final
# This is the final recommendation table.

Unnamed: 0_level_0,用户ID,物品ID,推荐指数
用户ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,425,5.0
1,1,333,5.0
1,1,483,5.0
1,1,651,5.0
1,1,496,5.0
...,...,...,...
943,943,750,5.0
943,943,521,5.0
943,943,1019,5.0
943,943,924,5.0


In [71]:
# The user ID is already the index, so the duplicate '用户ID' column is removed.
# Rebinding with errors='ignore' keeps the cell re-runnable: the in-place drop raised
# KeyError on a second execution, or whenever the grouping column is absent from the
# groupby.apply result.
topn_final = topn_final.drop(columns='用户ID', errors='ignore')

In [73]:
topn_final

Unnamed: 0_level_0,物品ID,推荐指数
用户ID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,425,5.0
1,333,5.0
1,483,5.0
1,651,5.0
1,496,5.0
...,...,...
943,750,5.0
943,521,5.0
943,1019,5.0
943,924,5.0


In [74]:
# User whose recommendations are looked up.
user_id_for = 5
# Single .loc[row, column] lookup instead of chained .loc[row][column]:
# it selects the item-ID column directly, so the IDs keep their integer dtype
# even when only one row matches (a lone row would otherwise be upcast to float).
topn_final.loc[user_id_for, '物品ID']

用户ID
5    258
5    483
5    152
5    275
5    177
Name: 物品ID, dtype: int64

In [77]:
# 上面的过程也可以直接调用函数
from recommend import get_recom

In [78]:
get_recom(predict_matrix, df_pivot, 5)

Unnamed: 0_level_0,物品ID,推荐指数
用户ID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,425,5.0
1,333,5.0
2,515,5.0
2,124,5.0
3,1612,5.0
...,...,...
941,50,5.0
942,333,5.0
942,8,5.0
943,750,5.0
