# 数据集加载

### 加载ratings.csv,并转换为用户电影评分矩阵

In [1]:

import os
import pandas as pd
import numpy as np

# Path of the ratings CSV that is passed to load_data().
DATA_PATH = "../dataset/ml-latest-small/ratings.csv"
# Directory in which the pickled rating and similarity matrices are cached.
CACHE_DIR = "../dataset/cache/"


def load_data(data_path):
    """
    Load the ratings CSV and return it as a user-by-movie rating matrix.

    The pivoted matrix is pickled to ``ratings_matrix.cache`` inside CACHE_DIR
    and that pickle is returned directly on later calls.

    NOTE: the cache file name does not depend on ``data_path``; once a cache
    file exists it is returned even if a different ``data_path`` is given.
    Delete the cache file to force a reload.

    :param data_path: path of the ratings CSV; only its first three columns
        (userId, movieId, rating) are read
    :return: DataFrame indexed by userId with one column per movieId, holding
        the ratings
    """
    cache_path = os.path.join(CACHE_DIR, "ratings_matrix.cache")
    if os.path.exists(cache_path):
        return pd.read_pickle(cache_path)

    # Compact dtypes for the three columns that are loaded.
    dtype = {"userId": np.int32, "movieId": np.int32, "rating": np.float32}
    ratings = pd.read_csv(data_path, dtype=dtype, usecols=range(3))
    # Pivot so that every movieId becomes a column of the user-movie matrix.
    ratings_matrix = ratings.pivot_table(index=["userId"], columns=["movieId"], values="rating")

    # to_pickle does not create missing directories, so make sure it exists.
    os.makedirs(CACHE_DIR, exist_ok=True)
    ratings_matrix.to_pickle(cache_path)
    return ratings_matrix

In [2]:
# Build the user-movie rating matrix (or load it from the pickle cache) and display it.
ratings_matrix = load_data(DATA_PATH)
ratings_matrix

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,,,,,,2.5,,,,...,,,,,,,,,,
607,4.0,,,,,,,,,,...,,,,,,,,,,
608,2.5,2.0,2.0,,,,,,,4.0,...,,,,,,,,,,
609,3.0,,,,,,,,,4.0,...,,,,,,,,,,


# 相似度计算

### 计算用户或物品两两相似度

In [3]:
def compute_pearson_similarity(ratings_matrix, based="user"):
    """
    Return the pairwise Pearson correlation matrix of users or of items.

    The result is pickled inside CACHE_DIR (``user_similarity.cache`` or
    ``item_similarity.cache``); when that file already exists it is loaded
    and returned without looking at ``ratings_matrix``.

    :param ratings_matrix: user-by-item rating DataFrame (users as rows)
    :param based: "user" for user-user similarity, "item" for item-item
    :return: square similarity DataFrame
    :raises Exception: if ``based`` is neither "user" nor "item"
    """
    if based == "user":
        cache_name = "user_similarity.cache"
    elif based == "item":
        cache_name = "item_similarity.cache"
    else:
        raise Exception("Unhandled 'based' value: %s" % based)

    cache_path = os.path.join(CACHE_DIR, cache_name)
    if os.path.exists(cache_path):
        return pd.read_pickle(cache_path)

    # DataFrame.corr() correlates columns pairwise, so users have to be moved
    # into the columns (transpose) to obtain user-user similarity.
    source = ratings_matrix.T if based == "user" else ratings_matrix
    similarity = source.corr()

    # to_pickle does not create missing directories, so make sure it exists.
    os.makedirs(CACHE_DIR, exist_ok=True)
    similarity.to_pickle(cache_path)
    return similarity


In [4]:
# Compute (or load from cache) the user-user similarity matrix and display it.
user_similarity = compute_pearson_similarity(ratings_matrix, "user")
user_similarity

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,,0.079819,0.207983,0.268749,-0.291636,-0.118773,0.469668,0.918559,-0.037987,...,0.091574,0.000000,-0.061503,-0.407556,-0.164871,0.066378,0.174557,0.268070,-0.175412,-0.032086
2,,1.000000,,,,,-0.991241,,,0.037796,...,-0.387347,,-1.000000,,,0.583333,,-0.125000,,0.623288
3,0.079819,,1.000000,,,,,,,,...,,,0.433200,,,-0.791334,-0.333333,-0.395092,,0.569562
4,0.207983,,,1.000000,-0.336525,0.148498,0.542861,0.117851,,0.485794,...,-0.222113,0.396641,0.090090,-0.080296,0.400124,0.144603,0.116518,-0.170501,-0.277350,-0.043786
5,0.268749,,,-0.336525,1.000000,0.043166,0.158114,0.028347,,-0.777714,...,0.000000,0.153303,0.234743,0.067791,-0.364156,0.244321,0.231080,-0.020546,0.384111,0.040582
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0.066378,0.583333,-0.791334,0.144603,0.244321,-0.049192,0.137771,0.253582,0.572700,-0.382955,...,0.290490,0.140613,0.318473,0.682949,0.167062,1.000000,0.114191,0.240842,0.533002,0.389185
607,0.174557,,-0.333333,0.116518,0.231080,0.255639,0.402792,0.251280,,-0.241121,...,0.698241,0.217210,0.192787,0.035806,-0.299641,0.114191,1.000000,0.200814,0.190117,0.106605
608,0.268070,-0.125000,-0.395092,-0.170501,-0.020546,0.125428,0.008081,0.434423,0.336625,-0.571043,...,0.473967,0.297646,0.086423,0.053986,-0.075673,0.240842,0.200814,1.000000,0.488929,0.147606
609,-0.175412,,,-0.277350,0.384111,0.193649,0.420288,0.141860,,,...,1.000000,0.188512,0.343303,0.641624,-0.550000,0.533002,0.190117,0.488929,1.000000,-0.521773


In [5]:
# Compute (or load from cache) the item-item similarity matrix and display it.
item_similarity = compute_pearson_similarity(ratings_matrix, "item")
item_similarity

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.330978,0.487109,1.000000,0.310971,0.106465,0.208402,0.968246,0.095913,-0.021409,...,,,,,,,,,,
2,0.330978,1.000000,0.419564,,0.562791,0.163510,0.430261,0.415227,0.277350,0.016626,...,,,,,,,,,,
3,0.487109,0.419564,1.000000,,0.602266,0.345069,0.554088,0.333333,0.458591,-0.050276,...,,,,,,,,,,
4,1.000000,,,1.000000,0.654654,,0.203653,,,0.870388,...,,,,,,,,,,
5,0.310971,0.562791,0.602266,0.654654,1.000000,0.291302,0.609119,0.555556,0.319173,0.218263,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,,,,,,,,,,,...,,,,,,,,,,
193583,,,,,,,,,,,...,,,,,,,,,,
193585,,,,,,,,,,,...,,,,,,,,,,
193587,,,,,,,,,,,...,,,,,,,,,,


# 案例--算法实现： User-Based CF 和 Item-Based CF评分预测

* **User-Based CF 评分预测公式**
$$
pred(u,i)=\hat{r}_{ui}=\frac{\sum_{v\in U}sim(u,v)*r_{vi}}{\sum_{v\in U}|sim(u,v)|}
$$
* **Item-Based CF 评分预测公式**
$$
pred(u,i)=\hat{r}_{ui}=\frac{\sum_{j\in I}sim(i,j)*r_{uj}}{\sum_{j\in I}|sim(i,j)|}
$$

In [6]:
# 评分预测方法
def predict(uid, iid, ratings_matrix, based="user", similarity=None):
    """
    Predict the rating user ``uid`` would give item ``iid``.

    The prediction is the similarity-weighted average of known ratings, using
    only neighbours whose similarity is strictly positive:

    * "user": other users similar to ``uid`` who have rated ``iid``
    * "item": other items similar to ``iid`` that ``uid`` has rated

    :param uid: user id (a row label of ``ratings_matrix``)
    :param iid: item id (a column label of ``ratings_matrix``)
    :param ratings_matrix: user-by-item rating DataFrame, NaN where unrated
    :param based: "user" for user-based CF, "item" for item-based CF
    :param similarity: optional precomputed similarity matrix matching
        ``based``.  When omitted it is obtained from
        compute_pearson_similarity(), which reads the cache file on every
        call; pass it explicitly when predicting many ratings in a loop.
    :return: predicted rating rounded to 2 decimals, or NaN when no usable
        neighbour exists
    :raises Exception: if ``based`` is neither "user" nor "item"
    """
    if based not in ("user", "item"):
        raise Exception("Unhandled 'based' value %s" % based)
    if similarity is None:
        similarity = compute_pearson_similarity(ratings_matrix, based)

    if based == "user":
        # Similarity of uid to every other user; keep positive ones only
        # (NaN compares False, so missing correlations are dropped too).
        neighbours = similarity.loc[uid].drop([uid])
        positive = neighbours[neighbours > 0]
        # Of those neighbours, keep the ones that actually rated iid.
        ratings = ratings_matrix.loc[positive.index, iid].dropna()
        weights = positive.loc[ratings.index]
    else:
        # Similarity of iid to every other item; keep positive ones only.
        neighbours = similarity[iid].drop([iid])
        positive = neighbours[neighbours > 0]
        # Of those items, keep the ones uid has rated.
        user_ratings = ratings_matrix.loc[uid].dropna()
        common = positive.index.intersection(user_ratings.index)
        weights = positive.loc[common]
        ratings = user_ratings.loc[common]

    # All weights are > 0, so the total is zero only when nothing is left.
    total_weight = weights.sum()
    if total_weight == 0:
        return np.nan
    return round((weights * ratings).sum() / total_weight, 2)

In [7]:
# User-based prediction of user 1's rating for movie 53.
# predict() returns NaN when the accumulated similarity weight is zero.
predict_score = predict(1, 53, ratings_matrix, based="user")
predict_score

nan

In [8]:
# Item-based prediction of user 1's rating for movie 3.
predict_score = predict(1, 3, ratings_matrix, based="item")
predict_score

4.56

In [9]:
# 预测全部评分的方法
def predict_all(rating_matrix, based="user"):
    """
    Predict a rating for every (user, item) pair of ``rating_matrix``.

    The full prediction matrix is pickled inside CACHE_DIR
    (``new_rating_matrix_cache_UserBased.cache`` or
    ``new_rating_matrix_cache_ItemBased.cache``).  Because the computation
    calls predict() once per cell, an existing cache file is loaded and
    returned instead of recomputing; delete it to force a fresh run.

    :param rating_matrix: user-by-item rating DataFrame
    :param based: "user" for user-based CF, "item" for item-based CF
    :return: DataFrame of predicted ratings with the same index and columns
        as ``rating_matrix``
    :raises Exception: if ``based`` is neither "user" nor "item"
    """
    # Validate up front so a bad argument fails before any expensive work.
    if based == "user":
        cache_name = "new_rating_matrix_cache_UserBased.cache"
    elif based == "item":
        cache_name = "new_rating_matrix_cache_ItemBased.cache"
    else:
        raise Exception("Unhandled 'based' value %s" % based)

    cache_path = os.path.join(CACHE_DIR, cache_name)
    if os.path.exists(cache_path):
        return pd.read_pickle(cache_path)

    predicted_rows = []
    for user in rating_matrix.index:
        predicted_rows.append(
            [predict(user, item, rating_matrix, based=based) for item in rating_matrix.columns]
        )

    # The DataFrame constructor takes a single dtype (a per-column dict is
    # rejected); every cell here is a predicted rating, so float32 is used.
    new_rating_matrix = pd.DataFrame(
        data=predicted_rows,
        index=rating_matrix.index,
        columns=rating_matrix.columns,
        dtype=np.float32,
    )

    # to_pickle does not create missing directories, so make sure it exists.
    os.makedirs(CACHE_DIR, exist_ok=True)
    new_rating_matrix.to_pickle(cache_path)
    print("全量评分全部计算完毕")
    return new_rating_matrix

In [10]:
# new_rating_matrix = predict_all(ratings_matrix, based="user")
# new_rating_matrix

### 说明：全量预测的计算量太大，这个单元我没有等它运行完就中断了，但代码本身是可以运行起来的

### 添加过滤规则

In [24]:
def _predict_all(uid, item_ids, ratings_matrix, user_similarity, based="user"):
    """
    Lazily predict user ``uid``'s rating for each item in ``item_ids``.

    :param uid: id of the user to predict for
    :param item_ids: iterable of item ids to predict
    :param ratings_matrix: user-by-item rating DataFrame
    :param user_similarity: not used by this function; kept in the signature
        for existing callers
    :param based: "user" or "item"; forwarded to predict()
    :return: generator yielding ``(uid, iid, predicted_rating)`` tuples
    """
    print("item_ids:", item_ids)
    for iid in item_ids:
        yield uid, iid, predict(uid, iid, ratings_matrix, based)


def predict_all_filter(uid, ratings_matrix, user_similarity, filter_rule=None):
    """
    Predict ratings for user ``uid`` after optionally pre-filtering the items.

    :param uid: id of the user to predict for
    :param ratings_matrix: user-by-item rating DataFrame, NaN where unrated
    :param user_similarity: forwarded unchanged to _predict_all()
    :param filter_rule: one of
        * None (or any falsy value): predict for every item
        * "unhot": skip unpopular items, keeping items with more than 10 ratings
        * "rated": skip items ``uid`` has already rated
        * ["unhot", "rated"]: apply both filters
    :return: generator yielding ``(uid, iid, predicted_rating)`` tuples
    :raises Exception: for any other ``filter_rule``.  Because this is a
        generator function, it is raised when iteration starts, not at call time.
    """
    if not filter_rule:
        item_ids = ratings_matrix.columns
    elif isinstance(filter_rule, str) and filter_rule == "unhot":
        # Number of ratings per item; "hot" means more than 10 of them.
        count = ratings_matrix.count()
        item_ids = count[count > 10].index
    elif isinstance(filter_rule, str) and filter_rule == "rated":
        # Unrated cells are NaN in the rating matrix.
        user_ratings = ratings_matrix.loc[uid]
        item_ids = user_ratings[user_ratings.isna()].index
    elif isinstance(filter_rule, list) and set(filter_rule) == {"unhot", "rated"}:
        count = ratings_matrix.count()
        hot_ids = count[count > 10].index
        user_ratings = ratings_matrix.loc[uid]
        # Select on the NaN mask itself: filtering the rating values and then
        # dropping NaN would throw away exactly the unrated items wanted here.
        unrated_ids = user_ratings[user_ratings.isna()].index
        # Index.intersection keeps a deterministic order (a set would not).
        item_ids = hot_ids.intersection(unrated_ids)
    else:
        raise Exception("无效的过滤参数")
    yield from _predict_all(uid, item_ids, ratings_matrix, user_similarity, "user")


# if __name__ == '__main__':
#     ratings_matrix = load_data(DATA_PATH)
#     user_similarity = compute_pearson_similarity(ratings_matrix,"user")
#     for result in predict_all_filter(1,ratings_matrix,user_similarity,filter_rule=["unhot", "rated"]):
#         print(result)



### 根据预测评分为指定用户进行TopN推荐

In [47]:
def top_k_rs_result(k, uid=2, filter_rule="rated"):
    """
    Return the ``k`` highest predicted ratings for one user.

    Loads the rating matrix and the user similarity matrix, predicts ratings
    for the items left after ``filter_rule``, drops items for which no
    prediction could be made (NaN) and sorts by predicted rating, descending.

    :param k: number of recommendations to return
    :param uid: id of the user to recommend for (defaults to 2)
    :param filter_rule: item pre-filter passed to predict_all_filter()
        (defaults to "rated")
    :return: DataFrame with columns ``uid``, ``iid``, ``rating`` holding at
        most ``k`` rows
    """
    ratings_matrix = load_data(DATA_PATH)
    user_similarity = compute_pearson_similarity(ratings_matrix, "user")
    predictions = predict_all_filter(uid, ratings_matrix, user_similarity, filter_rule=filter_rule)

    # The generator is consumed here; rows without a prediction are dropped.
    results = pd.DataFrame(predictions, columns=["uid", "iid", "rating"]).dropna()
    ranked = results.sort_values(by="rating", axis=0, ascending=False)
    return ranked.iloc[:k]

#     return sorted(results, key=lambda x: x[2], reverse=True)[:k]

In [48]:
from pprint import pprint
# Compute the top-20 recommendation table via top_k_rs_result().
result = top_k_rs_result(20)
# pprint(result)

<generator object predict_all_filter at 0x000002160366A3C8>
item_ids: Int64Index([     1,      2,      3,      4,      5,      6,      7,      8,
                 9,     10,
            ...
            193565, 193567, 193571, 193573, 193579, 193581, 193583, 193585,
            193587, 193609],
           dtype='int64', name='movieId', length=9695)


In [49]:
# Display the recommendation table (columns: uid, iid, rating).
result

Unnamed: 0,uid,iid,rating
4471,2,6619,5.0
7694,2,90603,5.0
4353,2,6380,5.0
7531,2,85736,5.0
7559,2,86504,5.0
9389,2,166568,5.0
7565,2,86721,5.0
7575,2,86898,5.0
5519,2,26612,5.0
5505,2,26547,5.0


In [50]:
# Extract the recommended item ids in ranked order.
rec_item_ids = list(result["iid"])
rec_item_ids

[6619,
 90603,
 6380,
 85736,
 86504,
 166568,
 86721,
 86898,
 26612,
 26547,
 2106,
 84414,
 26401,
 26326,
 26169,
 95149,
 25947,
 5088,
 162344,
 25771]