## 0.读取数据

In [1]:
import pandas as pd
import numpy as np
import scipy.spatial.distance as ssd    # 用于计算皮尔逊相关系数
from collections import defaultdict

In [2]:
# 读取训练集的前三列，分别命名为'user_id', 'item_id', 'rating'
df_training_data = pd.read_csv(
    './data/movielen_rating_training.base',    # 文件路径
    names=['user_id', 'item_id', 'rating'],    # 列名称
    usecols=[0, 1, 2],                         # 只读取 0 , 1 , 2 列
    sep='\t'                                   # 以'\t'分隔数据
)
df_training_data.head()

Unnamed: 0,user_id,item_id,rating
0,1,1,5
1,1,2,3
2,1,3,4
3,1,4,3
4,1,5,3


## 1.使用Map保存用户唯一索引

**{ user_id1 : user_index1 , user_id2 : user_index2 , user_id3 : user_index3 , ... }**

In [3]:
# 不重复的user_id与item_id列表
user_id_s = df_training_data['user_id'].unique()
item_id_s = df_training_data['item_id'].unique()

# 使用map建立id与index的索引
# { user_id1 : user_index1 , user_id2 : user_index2 }
user_index_map = {}    
item_index_map = {}
for user_index in range(len(user_id_s)):
    user_id = user_id_s[user_index]
    user_index_map[user_id] = user_index#
for item_index in range(len(item_id_s)):
    item_id = item_id_s[item_index]
    item_index_map[item_id] = item_index

## 2.将打分记录转换成矩阵和字典

| |user_id|item_id|rating|
|---|---|---|---|
|0	|1	|1	|1  |
|1	|1	|2	|4  |
|2	|1	|3	|2  |
|3	|1	|4	|1  |
|4	|2	|1	|2  |
|5	|2	|2	|4  |
|...|...|...|...|

----------------------------------------转换为

| |商品1| 商品2| 商品3| 商品4| 商品5|
|:---:|:---:|:---:|:---:|:---:|:---:|
|用户1| 1分| 4分| 2分| 1分| ？|
|用户2| 2分| 4分| 2分| 1分| 5分|
|用户3| 5分| 1分| 5分| 4分| 2分|
|用户4| 2分| 5分| 3分| 4分| 5分|
|...| ...| ...| ...| ...| ...|

| map |
|:---:|
|user_index : [ item_index1, item_index2 ...] |
1: [0 ,1 ,2, 3]
2: [0 ,1 ,2, 3, 4]
3: [0 ,1 ,2, 3, 4]
4: ...
... : ...

In [4]:
# 用户与物品的打分矩阵(943, 1650)
user_item_rating_array = np.zeros(shape=(len(user_id_s), len(item_id_s)))    # shape = (943, 1650)
# 用户打分商品的索引集合
user_rating_map = defaultdict(set)    # 以set为默认创建字典，字典中每一个键值都是set，当键不存在时返回空set

for row_index in df_training_data.index:
    # 取得每一行的数据
    row_data = df_training_data.iloc[row_index]
    # 打分用户的索引
    user_index = user_index_map[row_data['user_id']]
    # 打分电影的索引
    item_index = item_index_map[row_data['item_id']]
    # 添加用户打分商品索引集合
    user_rating_map[user_index].add(item_index)    # 往一个user_index的集合set中添加item_index
    # 矩阵中行=user_index，列=item_index的元素赋值为打分
    user_item_rating_array[user_index, item_index] = row_data['rating']
user_item_rating_array

array([[5., 3., 4., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [5., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 5., 0., ..., 0., 0., 0.]])

## 3.计算用户的平均打分向量
- 1.从map中取用户1评过分的所有item索引

| map |
|:---:|
|user_index : [ item_index1, item_index2 ...] |
|---->    1: [0 ,1 ,2, 3] |
 
- 2.根据item索引中取评分

| |商品1| 商品2| 商品3| 商品4| 商品5|
|:---:|:---:|:---:|:---:|:---:|:---:|
|用户1| 1分| 4分| 2分| 1分| ？|

- 3.求

    用户1对商品的平均评分:

    1 + 4 + 2 + 1 \ 4 = 2.00

    用户2对商品的平均评分:

    2 + 4 + 2 + 1 \ 4 = 2.25


- 4.用一个列表保存所有用户平均评分
[2.00, 2.25, ...]

In [5]:
# 计算用户的平均打分向量
def calculate_user_rating_mean():
    user_rating_mean_s = []
    # 遍历所有的用户
    for user_index in range(len(user_id_s)):
        # 取用户打过分的电影索引
        item_rating_index_v = list(user_rating_map[user_index])    # 将set转为list，map里存放的都是set，set内都是item_index。
        # 打过分的电影评分向量
        item_rating_v = np.take(
            user_item_rating_array[user_index],                    # 从打分矩阵中取的user_index对应的行
            item_rating_index_v                                    # 从对应的行中取用户打过分的item
        )
        # 打分向量的平均值
        mean = round(item_rating_v.mean(), 2)                      # round()四舍五入取2位小数
        user_rating_mean_s.append(mean)
    return user_rating_mean_s                                     # 返回一个所有用户平均打分的列表


# def calculate_user_rating_mean():
#     return [
#         round(
#             np.take(
#                 user_item_rating_array[user_index],
#                 list(user_rating_map[user_index])
#             ).mean(),
#             2
#         )
#         for user_index in range(len(user_id_s))
#     ]

# 得到用户打分的平均值，列表类型，列表索引对应user_index
user_rating_mean_s = calculate_user_rating_mean()
user_rating_mean_s

[3.68,
 3.8,
 3.0,
 4.36,
 2.96,
 3.58,
 3.89,
 3.6,
 4.17,
 4.21,
 3.53,
 4.28,
 3.14,
 4.22,
 3.03,
 4.35,
 3.16,
 3.94,
 3.6,
 3.31,
 2.66,
 3.3,
 3.64,
 4.39,
 4.05,
 2.91,
 3.3,
 3.64,
 3.94,
 3.8,
 3.91,
 3.55,
 3.64,
 3.8,
 3.0,
 4.0,
 3.46,
 3.9,
 3.85,
 2.73,
 3.94,
 3.62,
 3.68,
 3.64,
 3.48,
 4.18,
 3.56,
 3.73,
 2.73,
 3.54,
 3.75,
 4.34,
 4.0,
 3.33,
 3.8,
 3.65,
 3.63,
 3.88,
 4.02,
 4.13,
 2.83,
 3.32,
 2.98,
 3.6,
 3.98,
 3.52,
 3.42,
 3.17,
 3.74,
 3.4,
 3.82,
 3.77,
 3.65,
 3.76,
 3.18,
 3.4,
 3.46,
 3.36,
 4.19,
 3.93,
 3.57,
 3.13,
 3.4,
 3.74,
 3.55,
 3.64,
 3.78,
 4.09,
 4.15,
 4.25,
 3.9,
 3.23,
 3.0,
 3.66,
 3.48,
 4.25,
 3.97,
 3.88,
 3.65,
 3.03,
 3.0,
 2.56,
 3.4,
 2.79,
 3.31,
 3.71,
 2.77,
 3.41,
 3.5,
 3.09,
 3.5,
 3.92,
 3.81,
 3.63,
 4.04,
 3.06,
 3.72,
 4.64,
 3.88,
 3.57,
 3.67,
 3.97,
 3.93,
 3.5,
 3.44,
 3.7,
 4.18,
 3.58,
 2.87,
 4.08,
 4.07,
 3.71,
 3.28,
 3.58,
 3.45,
 4.55,
 4.08,
 4.3,
 3.93,
 3.6,
 3.45,
 3.8,
 3.73,
 3.67,
 3.32,
 3.68,
 4.3,


## 4.计算用户相似度矩阵

$$ \vec{r}_1 = (1,4,2,1)  $$

$$ \overline{r}_1 = \frac{1+4+2+1}{4} $$

$$ \vec{r}^,_1 = \vec{r}_1 - \overline{r}_1 = (-1,2,0,-1)  $$




$$sim(\vec{r}^,_1,\vec{r}^,_2) = \frac{ (\vec{r}^,_1 , \vec{r}^,_2) }{ \sqrt{ (\vec{r}^,_1)^2 } \sqrt{ (\vec{r}^,_2)^2 } }$$

$$ = \frac{ -1 \times (-0.25)+2\times1.75+0\times(-0.25)+(-1)\times(-1.25) } { \sqrt{(-1)^2 + 2^2 + 0^2 + (-1)^2 } \times \sqrt{ (-0.25)^2 + 1.75^2 + (-0.25)^2 + (-1.25)^2} } = 0.937  $$


**sim = 1 - ssd.cosine(v1, v2)**

In [6]:
# 定义用户相似度函数
def calculate_sim(user_index1, user_index2):
    # 取用户1和用户2公开打分的电影集合，转换成列表
    intersection_index_s = list(
        user_rating_map[user_index1] & user_rating_map[user_index2]    # 集合取交集
    )
    # 如果没有公开的打分项，相似度0.0
    if not intersection_index_s:
        return 0.0
    # 根据公共索引，取到用户1的打分向量，并减去均值
    v1 = np.take(
        user_item_rating_array[user_index1],    # 矩阵中取行
        intersection_index_s                    # 行中取多列
    ) - user_rating_mean_s[user_index1]         # 减去均值
    # 根据公共索引，取到用户2的打分向量，并减去均值
    v2 = np.take(
        user_item_rating_array[user_index2],    # 矩阵中取行
        intersection_index_s                    # 行中取多列
    ) - user_rating_mean_s[user_index2]         # 减去均值
    # 计算相似度
    sim = 1 - ssd.cosine(v1, v2)
    # 如果相似度不是数字（如果v1或v2是0向量，将会得到无穷大），返回相似度0
    if np.isnan(sim):
        return 0.0
    # 否则相似度保留两位小数，返回结果
    else:
        return round(sim, 2)

In [7]:
# 用户相似度矩阵
user_similarity_array = np.zeros(shape=(len(user_id_s), len(user_id_s)))    # shape = (943, 943)
for user_index_i in range(len(user_id_s)):
    for user_index_j in range(user_index_i + 1, len(user_id_s)):
        # 计算用户i和用户j的相似度
        sim = calculate_sim(user_index_i, user_index_j)
        # 计算用户i和用户j的相似度 = 计算用户j和用户i的相似度
        user_similarity_array[user_index_i, user_index_j] = sim
        user_similarity_array[user_index_j, user_index_i] = sim

  dist = 1.0 - uv / np.sqrt(uu * vv)


In [8]:
user_similarity_array     # 对称矩阵

array([[ 0.  ,  0.52,  0.53, ...,  0.81, -0.18,  0.3 ],
       [ 0.52,  0.  ,  0.14, ...,  0.04,  0.25,  0.71],
       [ 0.53,  0.14,  0.  , ...,  0.74,  0.51,  1.  ],
       ...,
       [ 0.81,  0.04,  0.74, ...,  0.  , -0.46,  0.01],
       [-0.18,  0.25,  0.51, ..., -0.46,  0.  ,  0.42],
       [ 0.3 ,  0.71,  1.  , ...,  0.01,  0.42,  0.  ]])

## 5.根据用户评分矩阵和用户相似度矩阵计算预测商品分数形成——预测打分矩阵
||||||||
|---|---|---|---|---|---|---|
|0.   | 0.   | 0.   | ... | 3.18 | 3.7  | 3.31 | 
|0.   | 3.53 | 3.48 | ... | 3.3  | 3.82 | 3.43 | 
|3.08  | 2.96 | 2.64 | ... | 3.5  | 3.02 | 0.   | 
|...   | ... | ... | ... | ... | ... | ... |
|0.    | 3.81 | 3.72 | ... | 4.55 | 4.07 | 3.68 | 
|4.5   | 4.13 | 3.96 | ... | 3.77 | 4.29 | 3.9  | 
|3.67   | 0.   | 2.88 | ... | 2.91 | 3.43 | 3.04 | 

- 矩阵中为0的，存在实际打分
- 打分区间[0, 5]

### 计算过程



$$\hat{r}_{u,i}=\overline{r}_u+ \frac{\sum_{v\in{V}}sim(u,v)(r_{v,i} -\overline{r}_v) }{\sum_{v\in{V}}|sim(u,v)|} $$

$$ sim(1,2)=0.937  $$ $$  sim(1,3)=-0.872 $$ $$  sim(1,4)=0.730 $$

$$ a_2 = 用户2 评分 商品5  = 5 $$
$$ a_3 = 用户3 评分 商品5  = 2 $$
$$ a_4 = 用户4 评分 商品5  = 5 $$

$$ rating_{1,5} = \overline{r}_1 + 
\frac
{ (a_2 - \overline{r}_2)\times sim(1,2)  +  (a_3 - \overline{r}_3)\times sim(1,3)  +  (a_4 - \overline{r}_4)\times sim(1,4) }
{ |sim(1,2)|  +  |sim(1,3)|  +  |sim(1,4)| } = 4.05
$$

In [9]:
# 用户对商品的预测矩阵，已打分的商品，预测分数为0分
user_item_predict_rating_array = np.zeros_like(user_item_rating_array)    # user_item_rating_array.shape = (943, 1650)

# 对所有的用户index进行遍历
for user_index in range(len(user_id_s)):
    # 对所有的商品index进行遍历
    for item_index in range(len(item_id_s)):
        # 如果这个商品没有这个用户被打过分
        if item_index not in user_rating_map[user_index]:
            # 找到对这个商品打过分的所有用户的索引
            user_rating_index_v = np.where(
                user_item_rating_array[:, item_index] > 0
            )[0]
            # 如果没有用户对这个商品打过分，continue
            if not list(user_rating_index_v):
                continue
                
            # 根据用户打分的索引，从用户相似度矩阵中取出相似度向量
            user_sim_v = np.take(
                user_similarity_array[user_index],    # 矩阵中取行
                user_rating_index_v    # 行中取多列
            )  #  sim(1,2)=0.937    sim(1,3)=-0.872    sim(1,4)=0.730
            
            # 计算相似度绝对值加和
            user_sim_abs_sum = user_sim_v.__abs__().sum()
            # 如果相似度绝对值加和为0（存在对这个商品打过分的用户群体，但这些用户群体与目标用户的相关度都为0），continue
            if user_sim_abs_sum == 0:
                continue
            # 得到用户打分的向量，并去掉平均值
            user_rating_v = np.take(
                user_item_rating_array[:, item_index],    # 矩阵中取行
                user_rating_index_v    # 行中取多列
            ) - np.take(
                user_rating_mean_s,    # 列表
                user_rating_index_v    # 列表中取多行
            )
            # 根据公式，得到预测的结果，这里保留两位小数
            predict_rating = round(
                np.dot(user_rating_v, user_sim_v) / user_sim_abs_sum
                ,2
            ) + user_rating_mean_s[user_index]
            # 把预测的结果添加到预测矩阵中
            user_item_predict_rating_array[user_index, item_index] = predict_rating

# 打印预测矩阵
user_item_predict_rating_array

array([[0.  , 0.  , 0.  , ..., 3.18, 3.7 , 3.31],
       [0.  , 3.53, 3.48, ..., 3.3 , 3.82, 3.43],
       [3.08, 2.96, 2.64, ..., 3.5 , 3.02, 0.  ],
       ...,
       [0.  , 3.81, 3.72, ..., 4.55, 4.07, 3.68],
       [4.5 , 4.13, 3.96, ..., 3.77, 4.29, 3.9 ],
       [3.67, 0.  , 2.88, ..., 2.91, 3.43, 3.04]])

## 6.为用户推荐商品

1. 从预测打分矩阵中选取为每个用户的预测行
2. 对每行的每个预测打分从高到低排序
3. 选取最高的前n个商品推荐给用户

In [10]:
# 推荐商品，predict_quantity是推荐的商品的个数
def predict(predict_quantity):
    # 建立一个商品推荐字典，保存对user_index推荐的商品索引
    predict_item_index_map = {}
    # 例如：predict_item_index_map[user_index] = [4, 3, 2, 5]  排序 只要3个则取[4,3,2]  只要2个取[4,3]
    # 对训练集中所有的user_index进行遍历
    for user_index in range(len(user_id_s)):
        # 预测矩阵中对应user_index的向量，"-"进行从大到小倒序排序(正序排序从小到大)
        predict_item_index_v = list(np.argsort(-user_item_predict_rating_array[user_index]))
        # 取0 : min(predict_quantity,len(predict_item_index_v))最前面的商品索引
        predict_item_index_v = predict_item_index_v[0 : min(predict_quantity,len(predict_item_index_v))]
        # 添加到商品推荐字典中
        predict_item_index_map[user_index] = predict_item_index_v
    # 返回商品推荐字典
    return predict_item_index_map

# 推荐50个商品
predict_item_index_map = predict(50)
predict_item_index_map

{0: [1184,
  1337,
  1327,
  1514,
  1598,
  1624,
  1230,
  1639,
  1595,
  1236,
  1521,
  1606,
  1576,
  1601,
  1592,
  1607,
  1317,
  1493,
  1269,
  1568,
  1502,
  679,
  1089,
  1585,
  1239,
  674,
  241,
  770,
  1109,
  1319,
  673,
  972,
  173,
  952,
  1614,
  1272,
  1479,
  1486,
  1489,
  1485,
  1490,
  1484,
  1491,
  1492,
  1494,
  1481,
  1480,
  1487,
  1488,
  1475],
 1: [1365,
  1265,
  1644,
  1432,
  1573,
  1514,
  1327,
  1598,
  1595,
  1569,
  1228,
  1225,
  1624,
  1585,
  1317,
  1646,
  1168,
  1230,
  1533,
  1607,
  1601,
  1592,
  1576,
  1606,
  1599,
  1334,
  1124,
  1493,
  1568,
  1502,
  1269,
  849,
  1510,
  1237,
  1343,
  953,
  1553,
  1319,
  1279,
  703,
  1236,
  1498,
  1272,
  1332,
  1370,
  679,
  1223,
  1258,
  1028,
  1555],
 2: [1560,
  1644,
  1365,
  1570,
  1529,
  1432,
  1396,
  1500,
  1146,
  1621,
  1622,
  1434,
  1555,
  1514,
  1533,
  231,
  1563,
  1623,
  1510,
  1598,
  1063,
  1624,
  388,
  1612,
  1324,
  1

# 读取测试集

## 0.读取数据

In [11]:
# 读取测试集
df_test_data = pd.read_csv('./data/movielen_rating_test.base', sep='\t', names=['user_id','item_id','rating'], usecols=[0,1,2])
df_test_data.head()                                                                                

Unnamed: 0,user_id,item_id,rating
0,1,6,5
1,1,10,3
2,1,12,5
3,1,14,5
4,1,17,3


## 1.保存训练集中出现过的用户索引

In [12]:
# 测试集中不重复的用户id
user_test_unique_s = df_test_data['user_id'].unique()
# 创建一个列表，保存测试集中的user_id对应的user_index
user_index_test_s = []
# 对测试集中的用户id进行遍历
for user_id in user_test_unique_s:
    # 如果测试集中的用户id在训练集的用户索引map中，添加这个user_index
    if user_id in user_index_map.keys():
        user_index_test_s.append(user_index_map[user_id])
# 打印测试集中的user_index列表
user_index_test_s

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,


## 2.根据训练集的打分构建测试集的打分矩阵
例如：

| |商品1| 商品2| 商品3| 商品4| 商品5|
|:---:|:---:|:---:|:---:|:---:|:---:|
|用户1| 1分| 4分| 2分| 1分| 4分|
|用户2| 2分| 4分| 2分| 1分| 5分|
|用户3| 5分| 1分| 5分| 4分| 2分|
|用户4| 2分| 5分| 3分| 4分| 5分|
|...|...|...|...|

In [13]:
# 创建一个用户对商品打分的dataframe
df_user_item_rating_test = pd.DataFrame(np.zeros(shape=(len(user_index_test_s), len(item_id_s))))    # shape = (459, 1650)
# dataframe的index设置为训练集中存在的测试集的user_index
df_user_item_rating_test.index = user_index_test_s
# 对dataframe的index进行遍历
for row_index in df_test_data.index:
    # 取每一行
    row_data = df_test_data.loc[row_index]    # 根据索引用loc
    # 如果这个用户id在user_rating_map中，商品id也在item_index_map中，添加这次打分
    if row_data['user_id'] in user_rating_map.keys() and row_data['item_id'] in item_index_map.keys():
        item_index = item_index_map[row_data['item_id']]    # 取商品索引
        user_index = user_index_map[row_data['user_id']]    # 取用户索引
        df_user_item_rating_test[item_index][user_index] = row_data['rating']    # dataframe先列后行
        
# 打印
df_user_item_rating_test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1640,1641,1642,1643,1644,1645,1646,1647,1648,1649
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
456,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
457,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
458,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
459,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### 3.计算均方误差RMSE

$$
均方误差：RMSE = \sqrt{  \frac{ \sum_{ u,i \in T } (r_{ui} - \hat{r}_{ui})^2 }{ |T| }    }
$$

$$
平均绝对误差：MAE = \frac{ \sum_{ u,i \in T }|r_{ui} - \hat{r}_{ui}| }{ |T| }
$$

$$ r_{ui}测试集中用户u对物品i的实际评分  $$ 

$$ \hat{r}_{ui}推荐系统为用户u对物品i的预测打分  $$ 

$$ T打分集合  $$ 

- **RMSE加大了预测不准评分的惩罚**

|   |  实际打分 |  预测打分 |
|---|---|---|
|用户1对物品1| 1 | 3 |
|用户2对物品2| 2 | 1 |

$$
RMSE = \sqrt{  \frac{ (1 - 3)^2 + (2 - 1)^2 }{ 2 }   } = 1.581
$$ 

$$
MAE = \frac{|1-3|+|2-1|}{2} = 1.5
$$

In [14]:
# 计算均方误差
def calculate_RMSE():
    # acc_loss为分子，二次损失
    acc_loss = 0
    # acc_num为分母，集合项数
    acc_num = 0
    # 对测试集的index进行遍历
    for user_index in df_user_item_rating_test.index:
        # 取测试集打分矩阵中用户打分的那一行
        test_row_data = np.array(df_user_item_rating_test.loc[user_index])
        # 取行中打过分的商品
        test_index_v = np.where(test_row_data > 0)    # 打过分的商品分数必然大于0
        # 取预测打分矩阵中用户打分的那一行
        predict_row_data = user_item_predict_rating_array[user_index]
        # 取行中打过分的商品
        predict_index_v = np.where(predict_row_data > 0)    # 预测过打分的商品分数必然大于0
        # 取test_index_v和predict_index_v的交集，即预测过的打分和测试集中也出现的实际打分
        intersection_index_s = list(
            set(test_index_v[0]) & set(predict_index_v[0])    # ndarray[0]，实现ndarray一行 转 列表
        )
        # 如果交集为空，continue 
        if not intersection_index_s:
            continue
        # 根据上述的交集索引，取得测试集中的打分向量和预测矩阵中的打分向量
        test_rating_v = np.take(test_row_data, intersection_index_s)
        predict_rating_v = np.clip(
            np.take(predict_row_data, intersection_index_s), 0, 5    # clip()最小取0，最大取5
        )   
        # 计算二次损失
        acc_loss += np.square(test_rating_v - predict_rating_v).sum()
        # 分母叠加个数
        acc_num += len(intersection_index_s)
    # 得出均方误差
    return np.sqrt(acc_loss/acc_num)

# 计算均方误差
RMSE = calculate_RMSE()
RMSE

0.9663647036221744

## 4.计算准确率和召回

#### 准确率：为用户推荐的，有多少是用户感兴趣的
#### 召回率：用户感兴趣的，有多少被推荐了

$$ 
Precision  =  \frac{ \sum_{u\in{V}} |R(u) \bigcap T(u)| }{ \sum_{u\in{V}}|R(u)| }
$$

$$
Recall =   \frac{ \sum_{u\in{V}} |R(u) \bigcap T(u)| }{ \sum_{u\in{V}}|T(u)| }
$$ 

$$
R(u)：根据训练集学习，为用户推荐的推荐列表
$$

$$
T(u)：用户在测试集上的行为列表
$$

In [15]:
# 计算准确率和召回率
def calculate_precision_and_recall():
    # 推荐的商品也在测试集中出现的总数
    union_num = 0    #  为用户推荐的，且用户喜欢的
    # 推荐的商品的总数
    predict_num = 0    # 为用户推荐的
    # 测试集中出现的商品总数
    test_num = 0     # 用户喜欢的
    # 对测试集的user_index进行遍历
    for user_index in df_user_item_rating_test.index:
        # 对测试集中的user_index打过分的商品，进行从大到小倒排序，得到索引
        # （这里没有做排序功能，比如可以去测试集中倒序的前100个索引test_item_v）
        test_item_v = np.where(df_user_item_rating_test[user_index] >= 3)[0].tolist()    # 用户评分大于3分视为用户喜欢
        # 推荐的商品也在测试集中出现的总数做叠加
        union_num += len(
            set(predict_item_index_map[user_index]) & set(test_item_v)
        )
        # 推荐的商品的总数叠加
        predict_num += len(predict_item_index_map[user_index])
        # 测试集中出现的商品总数叠加
        test_num += len(test_item_v)
    # 返回正确率和召回率
    return union_num / predict_num, union_num / test_num

# 计算正确率和召回率
precision, recall = calculate_precision_and_recall()
print('precision=', precision)
print('recall=', recall)

precision= 0.0024400871459694987
recall= 0.0055232271427162445


## 5.计算覆盖度

#### 覆盖度：能够推荐出来的物品种类总数占总物品种类集合的比例

$$ Coverage = \frac{U_{u\in{V}} \dot R(u)}{|I|} $$ 

$$ R(u)对用户推荐的推荐列表 $$

$$ U 用户集合 $$

$$ I 总物品集合 $$

In [16]:
# 计算覆盖率
def calculate_coverage():
    # 推荐的物品索引集合
    predict_item_index_set = set()
    # 把所有用户推荐过的商品id都添加到predict_item_set里，然后根据predict_item_index_set的数量，计算覆盖度
    for user_index in predict_item_index_map.keys():
        for item_index in predict_item_index_map[user_index]:
            predict_item_index_set.add(item_index)
    return len(predict_item_index_set) / len(item_id_s)

# 计算并打印覆盖度
coverage = calculate_coverage()
print('coverage=', coverage)

coverage= 0.39636363636363636


# 将代码用numpy pandas简化