In [12]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from pprint import pprint
import os

### EDA

In [13]:
# Explicit 32-bit dtypes for the three rating columns; handed to read_csv so
# pandas does not fall back to its wider default types for them.
dtype = {"userId":np.int32, "movieId":np.int32, "rating":np.float32}

In [14]:
# Load the raw ratings file. `timestamp` is not listed in `dtype`, so its type
# is inferred by pandas.
df = pd.read_csv("data/ratings.csv", dtype=dtype)

In [15]:
df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int32  
 1   movieId    100836 non-null  int32  
 2   rating     100836 non-null  float32
 3   timestamp  100836 non-null  int64  
dtypes: float32(1), int32(2), int64(1)
memory usage: 1.9 MB


### 构建用户物品表

In [17]:
# Reshape the long ratings table into a wide user-item matrix: one row per
# userId, one column per movieId, each cell holding the rating. Cells for
# user/movie pairs without a rating are NaN.
rating_matrix = df.pivot_table(index=["userId"], columns="movieId", values="rating")
rating_matrix.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,


### 计算相似度

#### 用户相似度

In [18]:
# DataFrame.corr() computes the pairwise Pearson correlation between columns
# by default, so the matrix is transposed first to put users on the columns.
user_similar = rating_matrix.T.corr()
user_similar.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,,0.079819,0.207983,0.268749,-0.291636,-0.118773,0.469668,0.918559,-0.037987,...,0.09157371,-1.597727e-16,-0.061503,-0.407556,-0.164871,0.066378,0.174557,0.26807,-0.175412,-0.032086
2,,1.0,,,,,-0.991241,,,0.037796,...,-0.3873468,,-1.0,,,0.583333,,-0.125,,0.623288
3,0.079819,,1.0,,,,,,,,...,,,0.4332,,,-0.791334,-0.333333,-0.395092,,0.569562
4,0.207983,,,1.0,-0.336525,0.148498,0.542861,0.117851,,0.485794,...,-0.2221127,0.3966413,0.09009,-0.080296,0.400124,0.144603,0.116518,-0.170501,-0.27735,-0.043786
5,0.268749,,,-0.336525,1.0,0.043166,0.158114,0.028347,,-0.777714,...,2.71948e-16,0.1533034,0.234743,0.067791,-0.364156,0.244321,0.23108,-0.020546,0.384111,0.040582


#### 电影相似度

In [19]:
# Movies are already the columns of rating_matrix, so corr() yields the
# movie-to-movie correlation directly (no transpose needed).
item_similar = rating_matrix.corr()
item_similar.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.330978,0.487109,1.0,0.310971,0.106465,0.208402,0.968246,0.095913,-0.021409,...,,,,,,,,,,
2,0.330978,1.0,0.419564,,0.562791,0.16351,0.430261,0.415227,0.27735,0.016626,...,,,,,,,,,,
3,0.487109,0.419564,1.0,,0.602266,0.345069,0.554088,0.333333,0.458591,-0.050276,...,,,,,,,,,,
4,1.0,,,1.0,0.654654,,0.203653,,,0.870388,...,,,,,,,,,,
5,0.310971,0.562791,0.602266,0.654654,1.0,0.291302,0.609119,0.555556,0.319173,0.218263,...,,,,,,,,,,


### 读取数据

In [20]:
# Path of the raw ratings CSV, and the directory in which the rating matrix and
# similarity matrices are pickled by the functions defined below.
DATA_PATH = "./data/ratings.csv"
CACHE_DIR = "./data/cache/"

In [21]:
def load_data(data_path):
    """Load the user-item rating matrix, backed by an on-disk pickle cache.

    Parameters
    ----------
    data_path : str
        Path of the ratings CSV. Only its first three columns are read
        (``usecols=range(3)``) and they are parsed with the ``userId``,
        ``movieId`` and ``rating`` dtypes given below.

    Returns
    -------
    pandas.DataFrame
        Matrix indexed by ``userId`` with one column per ``movieId``; cells
        without a rating are NaN.

    Notes
    -----
    The cache file name is fixed, so an existing cache is returned no matter
    which ``data_path`` is passed; delete the cache file to force a rebuild.
    """
    cache_path = os.path.join(CACHE_DIR, "rating_matric.cache")
    print("开始加载数据集")

    if os.path.exists(cache_path):
        print("加载缓存中")
        # NOTE(review): read_pickle can execute arbitrary code; only load cache
        # files produced by this notebook.
        rating_matrix = pd.read_pickle(cache_path)
        print("缓存加载完毕")
    else:
        print("加载新数据")
        dtype = {"userId": np.int32, "movieId": np.int32, "rating": np.float32}
        # Honour the caller-supplied path instead of a hard-coded file name.
        df = pd.read_csv(data_path, dtype=dtype, usecols=range(3))
        rating_matrix = df.pivot_table(index=["userId"], columns="movieId", values="rating")
        # The cache directory may not exist on a fresh checkout; create it so
        # that writing the pickle cannot fail for that reason.
        os.makedirs(CACHE_DIR, exist_ok=True)
        rating_matrix.to_pickle(cache_path)
        print("新数据加载完毕")

    return rating_matrix

### 计算相似度

In [22]:
def compute_person_similarity(rating_matrix, based="user"):
    """Return the Pearson similarity matrix between users or between movies.

    The result is read from a pickle cache in ``CACHE_DIR`` when one exists;
    otherwise it is computed with ``DataFrame.corr()`` and written to the cache.

    Parameters
    ----------
    rating_matrix : pandas.DataFrame
        User-item matrix (users on the index, movies on the columns).
    based : {"user", "item"}, default "user"
        ``"user"`` correlates users with each other, ``"item"`` correlates
        movies with each other.

    Returns
    -------
    pandas.DataFrame
        Square similarity matrix.

    Raises
    ------
    ValueError
        If ``based`` is neither ``"user"`` nor ``"item"``.
    """
    # Validate the mode before touching the disk or doing any computation.
    if based == "user":
        cache_path = os.path.join(CACHE_DIR, "user_similarity.cache")
        label = "用户"
    elif based == "item":
        cache_path = os.path.join(CACHE_DIR, "item_similarity.cache")
        label = "电影"
    else:
        raise ValueError("Unhandles based value %s" % based)

    if os.path.exists(cache_path):
        print("缓存中获取%s相似度" % label)
        # NOTE(review): read_pickle can execute arbitrary code; only load cache
        # files produced by this notebook.
        similarity = pd.read_pickle(cache_path)
    else:
        print("正在计算%s相似度" % label)
        # corr() works column-wise, so users must be moved onto the columns
        # for user-user similarity.
        source = rating_matrix.T if based == "user" else rating_matrix
        similarity = source.corr()
        # Create the cache directory if needed so the (expensive) result is
        # not lost to a missing-directory error.
        os.makedirs(CACHE_DIR, exist_ok=True)
        similarity.to_pickle(cache_path)

    print("相似度计算完成")

    return similarity

### 评分预测

In [23]:
def predict(uid, iid, rating_matrix, user_similar):
    """Predict the rating user ``uid`` would give to movie ``iid``.

    The prediction is the similarity-weighted mean of the ratings given to
    ``iid`` by users that are positively correlated with ``uid``.

    Parameters
    ----------
    uid : int
        Label of the target user in ``user_similar``.
    iid : int
        Column label of the movie in ``rating_matrix``.
    rating_matrix : pandas.DataFrame
        User-item matrix (users on the index, movies on the columns, NaN for
        missing ratings).
    user_similar : pandas.DataFrame
        User-user similarity matrix.

    Returns
    -------
    The weighted mean rating passed through the built-in ``round``.

    Raises
    ------
    Exception
        If ``uid`` has no positively correlated user, or if none of those
        users rated ``iid``.
    """
    # Similarities of every other user to uid; the self-similarity entry and
    # undefined (NaN) correlations are discarded.
    similar_user = user_similar[uid].drop(uid).dropna()
    # Only positively correlated users contribute to the prediction.
    similar_user = similar_user[similar_user > 0]

    if similar_user.empty:
        raise Exception("用户<%d>无相似信息"%(uid))

    # Neighbours = similar users that actually rated the movie.
    item_ratings = rating_matrix[iid].dropna()
    neighbours = similar_user.index.intersection(item_ratings.index)
    if neighbours.empty:
        # Without this guard the weighted mean below would be 0/0.
        raise Exception("用户<%d>的相似用户均未对电影<%d>评分" % (uid, iid))

    weights = similar_user.loc[neighbours]
    ratings = item_ratings.loc[neighbours]

    # Weighted mean; all weights are > 0, so the denominator cannot be zero.
    score = (weights * ratings).sum() / weights.sum()

    return round(score)

In [24]:
predict(1, 1, rating_matrix, user_similar)

4

#### 用户对所有item进行预测

In [14]:
def predict_all(uid, rating_matrix, user_similar):
    """Yield a predicted rating for every movie column of ``rating_matrix``.

    Parameters
    ----------
    uid : int
        User to predict for.
    rating_matrix : pandas.DataFrame
        User-item matrix whose columns are the movie ids to iterate over.
    user_similar : pandas.DataFrame
        User-user similarity matrix passed through to ``predict``.

    Yields
    ------
    tuple
        ``(uid, iid, rating)`` for each movie whose prediction succeeded.
        Movies for which ``predict`` raises are reported with ``print`` and
        skipped.
    """
    for iid in rating_matrix.columns:
        try:
            # Predict for the requested user (previously hard-coded to user 1).
            rating = predict(uid=uid, iid=iid, rating_matrix=rating_matrix, user_similar=user_similar)
        except Exception as e:
            # Best effort: a movie that cannot be predicted must not stop the
            # whole iteration.
            print(e)
        else:
            yield uid, iid, rating

In [15]:
# Show only the first three predictions for user 1; the generator is lazy, so
# breaking early avoids predicting every movie.
count = 0
for count, i in enumerate(predict_all(1, rating_matrix, user_similar), start=1):
    print(i)
    if count == 3:
        break

(1, 1, 4.0)
(1, 2, 3.0)
(1, 3, 3.0)


#### 基于过滤规则的预测

In [16]:
def _predict_all(item_ids, uid, rating_matrix, user_similar):
    """Yield predicted ratings of user ``uid`` for the given movies.

    Parameters
    ----------
    item_ids : iterable
        Movie ids (column labels of ``rating_matrix``) to predict.
    uid : int
        User to predict for.
    rating_matrix : pandas.DataFrame
        User-item matrix passed through to ``predict``.
    user_similar : pandas.DataFrame
        User-user similarity matrix passed through to ``predict``.

    Yields
    ------
    tuple
        ``(uid, iid, rating)`` for each movie whose prediction succeeded.
        Movies for which ``predict`` raises are reported with ``print`` and
        skipped.
    """
    for iid in item_ids:
        try:
            # Predict for the requested user (previously hard-coded to user 1).
            rating = predict(uid=uid, iid=iid, rating_matrix=rating_matrix, user_similar=user_similar)
        except Exception as e:
            # Best effort: a movie that cannot be predicted must not stop the
            # whole iteration.
            print(e)
        else:
            yield uid, iid, rating

In [17]:
def predict_all(uid, rating_matrix, user_similar, filter_rule=None):
    """Yield predictions for user ``uid`` over a filtered set of movies.

    Parameters
    ----------
    uid : int
        User to predict for.
    rating_matrix : pandas.DataFrame
        User-item matrix (users on the index, movies on the columns).
    user_similar : pandas.DataFrame
        User-user similarity matrix.
    filter_rule : None, str, or list/tuple/set of str
        ``None`` (or any falsy value) predicts every movie.
        ``"unhot"`` drops unpopular movies, keeping those with more than 10
        ratings. ``"rated"`` drops movies ``uid`` has already rated. A
        collection such as ``["unhot", "rated"]`` applies all listed rules.

    Yields
    ------
    tuple
        ``(uid, iid, rating)`` as produced by ``_predict_all``.

    Raises
    ------
    ValueError
        If ``filter_rule`` is of an unsupported type or names an unknown rule.
        Because this is a generator, the error surfaces on first iteration.
    """
    print("执行预测")

    # Normalise the accepted spellings of filter_rule into a set of rule names.
    if not filter_rule:
        rules = set()
    elif isinstance(filter_rule, str):
        rules = {filter_rule}
    elif isinstance(filter_rule, (list, tuple, set, frozenset)):
        rules = set(filter_rule)
    else:
        raise ValueError("无效过滤参数")

    if not rules <= {"unhot", "rated"}:
        raise ValueError("无效过滤参数")

    # Boolean mask over the movie columns; every active rule narrows it.
    keep = pd.Series(True, index=rating_matrix.columns)

    if "unhot" in rules:
        # count() is the number of ratings per movie (not a rating sum).
        keep = keep & (rating_matrix.count() > 10)

    if "rated" in rules:
        # Keep only movies the user has NOT rated yet. The previous
        # `user_rating < 6` test was True for every rated movie and therefore
        # always produced an empty selection.
        keep = keep & rating_matrix.loc[uid].isna()

    # Index with the mask so the original column order is preserved.
    item_ids = rating_matrix.columns[keep.to_numpy()]

    yield from _predict_all(item_ids, uid, rating_matrix, user_similar)

In [18]:
def top_k_result(k, uid=1, filter_rule="unhot"):
    """Return the ``k`` highest predicted ratings for a user.

    Parameters
    ----------
    k : int
        Number of recommendations to return.
    uid : int, default 1
        User to recommend for. The default keeps the previous hard-coded
        behaviour.
    filter_rule : optional, default "unhot"
        Candidate filter forwarded to ``predict_all``.

    Returns
    -------
    list of tuple
        Up to ``k`` ``(uid, iid, rating)`` tuples sorted by predicted rating,
        highest first. ``sorted`` is stable, so ties keep the order in which
        ``predict_all`` produced them.
    """
    rating_matrix = load_data(DATA_PATH)
    user_similar = compute_person_similarity(rating_matrix)
    results = predict_all(uid, rating_matrix, user_similar, filter_rule=filter_rule)

    return sorted(results, key=lambda x: x[2], reverse=True)[:k]

In [19]:
# Smoke test: print the first three predictions for user 1 with the "unhot"
# filter, then stop consuming the generator.
count = 0
for count, i in enumerate(
    predict_all(1, rating_matrix, user_similar, filter_rule="unhot"), start=1
):
    print(i)
    if count == 3:
        break

执行预测
(1, 1, 4.0)
(1, 2, 3.0)
(1, 3, 3.0)


###  电影推荐

In [20]:
rating_matrix = load_data(DATA_PATH)

开始加载数据集
加载缓存中
缓存加载完毕


In [21]:
user_similar = compute_person_similarity(rating_matrix)
user_similar.head()

缓存中获取用户相似度
相似度计算完成


userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,,0.079819,0.207983,0.268749,-0.291636,-0.118773,0.469668,0.918559,-0.037987,...,0.091574,0.0,-0.061503,-0.407556,-0.164871,0.066378,0.174557,0.26807,-0.175412,-0.032086
2,,1.0,,,,,-0.991241,,,0.037796,...,-0.387347,,-1.0,,,0.583333,,-0.125,,0.623288
3,0.079819,,1.0,,,,,,,,...,,,0.4332,,,-0.791334,-0.333333,-0.395092,,0.569562
4,0.207983,,,1.0,-0.336525,0.148498,0.542861,0.117851,,0.485794,...,-0.222113,0.396641,0.09009,-0.080296,0.400124,0.144603,0.116518,-0.170501,-0.27735,-0.043786
5,0.268749,,,-0.336525,1.0,0.043166,0.158114,0.028347,,-0.777714,...,0.0,0.153303,0.234743,0.067791,-0.364156,0.244321,0.23108,-0.020546,0.384111,0.040582


In [22]:
predict(1, 1, rating_matrix, user_similar)

4.0

In [23]:
item_similar = compute_person_similarity(rating_matrix, based="item")

缓存中获取电影相似度
相似度计算完成


In [24]:
item_similar.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.330978,0.487109,1.0,0.310971,0.106465,0.208402,0.968246,0.095913,-0.021409,...,,,,,,,,,,
2,0.330978,1.0,0.419564,,0.562791,0.16351,0.430261,0.415227,0.27735,0.016626,...,,,,,,,,,,
3,0.487109,0.419564,1.0,,0.602266,0.345069,0.554088,0.333333,0.458591,-0.050276,...,,,,,,,,,,
4,1.0,,,1.0,0.654654,,0.203653,,,0.870388,...,,,,,,,,,,
5,0.310971,0.562791,0.602266,0.654654,1.0,0.291302,0.609119,0.555556,0.319173,0.218263,...,,,,,,,,,,


In [25]:
top_k_result(2)

开始加载数据集
加载缓存中
缓存加载完毕
缓存中获取用户相似度
相似度计算完成
执行预测


[(1, 280, 5.0), (1, 318, 5.0)]