基于物品的协同过滤算法

参考链接：https://blog.csdn.net/lck5602/article/details/78816593

In [36]:
import os
import math
import pickle
import numpy as np
import pandas as pd
from itertools import permutations
from sklearn.model_selection import train_test_split

# 数据预处理

In [37]:
# 加载训练集、测试集数据
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

In [38]:
# 取部分数据集
train = train[train['uid']<10]
test = test[test['uid']<5]

In [39]:
def generate_user_item_matrix(train):
    """生成用户对物品的评分矩阵"""
    file_path = '../data/user_item_matrix.pkl'
    if os.path.exists(file_path):
        uid_iid_matrix = pickle.load(open(file_path, 'rb'))
    else:
        users = train.uid.unique()
        items = train.iid.unique()
        uid_iid_matrix = np.zeros((users.shape[0], items.shape[0]), dtype=np.int8)
        uid_iid_matrix = pd.DataFrame(uid_iid_matrix, index=users, columns=items)
        for index, row in train.iterrows(): # 获取每行的index、row
            if index % 3000 == 0:
                print(index)
            uid_iid_matrix.loc[row['uid'], row['iid']] = row['score'] # 将结果返回给data
        pickle.dump(uid_iid_matrix, open(file_path, 'wb'), True)
    return uid_iid_matrix

余弦相似度
![image.png](attachment:image.png)

欧式距离
![image.png](attachment:image.png)

皮尔逊相关性
![image.png](attachment:image.png)

In [40]:
'''
无论是计算用户相似度还是物品相似度都需要相似度公式，常用的相似度公式包括：
余弦相似度、改进的余弦相似度、欧氏距离、皮尔逊相似度度量。
'''
def cosine(rate_matrix, i, j):
    """ 余弦相似度 """
    a = rate_matrix[:, i]
    b = rate_matrix[:, j]
    m = np.dot(a, b)
    n = np.sqrt(np.dot(a, a) * np.dot(b, b))
    return m / float(n)

def cosine_s(rate_matrix, i, j):
    """ 改进的余弦相似度 """
    a = rate_matrix[:, i]
    b = rate_matrix[:, j]
    intersection = a * b
    if intersection[intersection != 0].size == 0:
        return 0.0
    c = a[a != 0]
    d = b[b != 0]
    c_mean = np.mean(c)
    d_mean = np.mean(d)
    
    m = np.dot(a[intersection != 0] - c_mean, b[intersection != 0] - d_mean)
    n = np.sqrt(np.dot(c-c_mean, c-c_mean) * np.dot(d-d_mean, d-d_mean))
    if n == 0:
        return 0.0
    return m / float(n)
    
def pearson(rate_matrix, i, j):
    """ 皮尔逊相似度 """
    a = rate_matrix[:, i]
    b = rate_matrix[:, j]
    intersection = a * b
    if intersection[intersection != 0].size == 0:
        return 0.0
    
    c = a[intersection != 0] # 评价物品i的公共用户评分
    d = b[intersection != 0] 
    a_mean = np.mean(c)
    b_mean = np.mean(d)
    m = np.dot(c-a_mean, d-b_mean)
    n = np.sqrt(np.dot(c-a_mean, c-a_mean) * np.dot(d-b_mean, d-b_mean))
    if n == 0:
        return 0.0
    return m / float(n)

In [41]:
def get_rate_cosine(rate_matrix, n_iid, func):
    """ 
    将用户对物品的评分矩阵通过某种相似度度量方法得到相似度矩阵 
    :param rate_matrix 用户对物品的评分矩阵
    :param n_iid 用户数
    :param func 相似度度量方式
    :return 
    """
    file_path = '../data/user_item_similarity_matrix.pkl'
    if os.path.exists(file_path):
        rate_cos = pickle.load(open(file_path, 'rb'))
    else:
        shapes = [n_iid, n_iid]
        rate_cos = np.zeros(shapes)
        
        for i in range(shapes[0]):
            #if i % 1000 == 0:
                #print(i)
            for j in range(shapes[1]):
                if i == j:
                    rate_cos[i, j] = 1
                elif rate_cos[j, i] != 0:
                    rate_cos[i, j] = rate_cos[j, i]
                else:
                    rate_cos[i, j] = eval(func)(np.array(rate_matrix), i, j)
        iid_index = rate_matrix.columns
        rate_cos = pd.DataFrame(rate_cos, index=iid_index, columns=iid_index)
        pickle.dump(rate_cos, open(file_path, 'wb'), True)
    return rate_cos

# 构建模型

In [42]:
#train['iid'] = train['iid'].apply(str)
#train['uid'] = train['uid'].apply(str)

In [43]:
# 生成用户-物品评分矩阵
rate_matrix = generate_user_item_matrix(train).fillna(0) 

In [44]:
n_iid = rate_matrix.shape[1] # 物品数量
n_iid

3806

In [46]:
rate_cosine = get_rate_cosine(rate_matrix, n_iid, 'cosine_s').fillna(0)

In [21]:
rate_cosine

Unnamed: 0,0,8,13,18,34,38,44,59,115,124,...,9604,10087,10204,10278,10457,11001,11224,12275,13189,13821
0,1.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,1.000000,-0.270501,0.0,-0.420084,0.093250,0.353553,-0.518875,0.000000,-0.690849,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13,0.0,-0.270501,1.000000,0.0,0.069442,-0.340529,-0.191273,-0.210534,-0.055216,0.272897,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18,0.0,0.000000,0.000000,1.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
34,0.0,-0.420084,0.069442,0.0,1.000000,0.045702,-0.544581,0.363285,-0.685994,0.433019,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
38,0.0,0.093250,-0.340529,0.0,0.045702,1.000000,0.164845,0.000000,-0.323589,0.257688,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
44,0.0,0.353553,-0.191273,0.0,-0.544581,0.164845,1.000000,-0.131036,0.577350,-0.162835,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
59,0.0,-0.518875,-0.210534,0.0,0.363285,0.000000,-0.131036,1.000000,0.317744,0.526723,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
115,0.0,0.000000,-0.055216,0.0,-0.685994,-0.323589,0.577350,0.317744,1.000000,-0.094013,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
124,0.0,-0.690849,0.272897,0.0,0.433019,0.257688,-0.162835,0.526723,-0.094013,1.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
iid_index = rate_matrix.columns
iid_index

Index(['0', '8', '13', '18', '34', '38', '44', '59', '115', '124',
       ...
       '9604', '10087', '10204', '10278', '10457', '11001', '11224', '12275',
       '13189', '13821'],
      dtype='object', length=3806)

![image.png](attachment:image.png)

In [22]:
def recommendation(uid, t, iid_iid_similarity, rate_matrix, k=10):
    """ 推荐iid最相似的k个物品的index，得到用户uid对iid物品的评分 """
    score = 0 
    weight = 0
    iid_similarity = iid_iid_similarity.loc[iid, :].values # 商品iid对应所有商品相似度
    uid_action = rate_matrix.loc[uid, :].values # 用户uid对应所有商品的行为评分
    iid_action = rate_matrix.loc[:, iid].values # 物品iid得到所有用户评分
    similarity_indexs = np.argsort(iid_similarity)[-(k+1): -1] # 最相似的k个物品的index
    iid_i_mean = np.sum(iid_action) / iid_action[iid_action != 0].size
    
    # 遍历最相似的k个物品
    for j in similarity_indexs:
        # 若用户uid对物品j有评分
        if uid_action[j] != 0:
            # j物品的所有用户评分
            iid_j_action = rate_matrix.values[:, j]
            # j物品的平均评分
            iid_j_mean = np.sum(iid_j_action)/iid_j_action[iid_j_action !=0].size
            
            score += iid_similarity[j] * (uid_action[j] - iid_j_mean)
            weight += abs(iid_similarity[j])
    #print(iid_i_mean, score, weight)
    if weight == 0:
        return iid_i_mean
    else:
        return iid_i_mean + score/float(weight)

In [61]:
def predict(num, k, iid_index, iid_iid_similarity, rate_matrix):
    """ 预测 """
    result = np.zeros(num[0]) 
    count = 0 # 统计下数量
    for i in range(num[0]):
        a = test.ix[i, 'uid']
        b = test.ix[i, 'iid']
        if b not in iid_index:
            result[i] = 3
            count += 1
        else:
            # 用户对某个物品的评分
            result[i] = recommendation(a, b, iid_iid_similarity, rate_matrix, k)
    print(count)
    return result

In [65]:
# 预测
num = test.shape
result = predict(num, 5, iid_index, rate_cosine, rate_matrix)
result

22


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  import sys


array([3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3.,
       3., 3., 3., 3., 3.])

In [69]:
#Y_test_score = pd.DataFrame(np.array(result), columns=['score'])

# 把data中的score写入csv文件中
#Y_test_score.to_csv('item_cf.csv', index=False, columns=['scores'])