# 数据加载
## 文件地址

In [21]:
import os
import collections
# Data directory: the `data_set` folder located in the parent of the current
# working directory (i.e. a sibling of the folder the notebook is run from).
ROOT = os.path.join(os.path.dirname(os.getcwd()), 'data_set')

class Ml_100K():
    """Namespace of file paths for the ml-100k data located under ROOT."""
    # Download: https://github.com/rexrex9/kb4recMovielensDataProcess
    # An attribute whose name starts with a double underscore (such as __BASE here) is
    # name-mangled inside a class body: it is actually stored as _ClassName__attr
    # (here _Ml_100K__BASE), which provides a limited form of "private" attribute.
    __BASE = os.path.join(ROOT, 'ml-100k')
    # Files that live in the separate 'ml-100k-orginal' directory.
    ORGINAL_DIR = os.path.join(ROOT,'ml-100k-orginal')
    USER_DF = os.path.join(ORGINAL_DIR,'user_df.csv')
    ITEM_DF = os.path.join(ORGINAL_DIR,'item_df.csv')
    ITEM_DF_0 = os.path.join(ORGINAL_DIR, 'item_df_0.csv')

    # Files that live in the 'ml-100k' directory (__BASE).
    KG=os.path.join(__BASE,'kg_index.tsv')
    RATING = os.path.join(__BASE,'rating_index.tsv')
    RATING5 = os.path.join(__BASE, 'rating_index_5.tsv')

class Ml_latest_small():
    """Namespace of file paths for the ml-latest-small data located under ROOT."""
    # Name-mangled to _Ml_latest_small__BASE; all paths below are built from it.
    __BASE = os.path.join(ROOT,'ml-latest-small')
    RATING_TS = os.path.join(__BASE,'rating_index_ts.tsv')
    SEQS = os.path.join(__BASE, 'seqs.npy')

    SEQS_NEG = os.path.join(__BASE, 'seqsWithNeg.npy')
# Show the resolved locations. The double-underscore attribute is only reachable
# from outside the class through its mangled name, _Ml_100K__BASE.
print(ROOT)
print(Ml_100K._Ml_100K__BASE)
print(Ml_100K.RATING)

d:\DeepLearning\推荐系统\recbyhand\data_set
d:\DeepLearning\推荐系统\recbyhand\data_set\ml-100k
d:\DeepLearning\推荐系统\recbyhand\data_set\ml-100k\rating_index.tsv


In [22]:
import json

def readTriple(path,sep=None):
    '''
    Lazily yield the three-field records of a delimited UTF-8 text file.

    :param path: path of the text file to read
    :param sep: field separator; a falsy value (None or '') splits on any run of whitespace
    :return: generator of lists holding exactly three strings each;
             lines that do not split into three fields are skipped
    '''
    # The with-statement closes the file once the generator is exhausted or closed.
    with open(path, 'r', encoding='utf-8') as f:
        # Iterate the file object directly so lines are streamed one at a time
        # instead of loading the whole file into memory with readlines().
        for line in f:
            stripped = line.strip()
            fields = stripped.split(sep) if sep else stripped.split()
            # Anything that is not a triple (blank lines, malformed rows) is ignored.
            if len(fields) != 3:
                continue
            yield fields

In [23]:
import random
from tqdm import tqdm

def readRecData(path,test_ratio = 0.1, seed=None):
    '''
    Read (user, item, rating) triples from a file and split them into train and test sets.

    :param path: file handed to readTriple (split on whitespace)
    :param test_ratio: fraction of the triples that is sampled into the test set
    :param seed: optional seed for the split; when None the module-level `random`
                 state is used, exactly as before, so results vary between runs
    :return: (user id list, item id list, train triples, test triples);
             triples are (int, int, int) tuples
    '''
    print('读取用户评分三元组...')
    user_set, item_set = set(), set()
    triples = []
    for u, i, r in tqdm(readTriple(path)):
        # Convert each field once and reuse the integers.
        uid, iid, rating = int(u), int(i), int(r)
        user_set.add(uid)
        item_set.add(iid)
        triples.append((uid, iid, rating))

    # A dedicated generator keeps a seeded split reproducible without touching
    # the global random state.
    sampler = random if seed is None else random.Random(seed)
    test_set = sampler.sample(triples, int(len(triples) * test_ratio))
    # Set difference: duplicated triples collapse to one entry, every triple that was
    # sampled for testing is removed from training, and the resulting order is
    # whatever the set iteration yields.
    train_set = list(set(triples) - set(test_set))

    # Returns the user list, the item list and the two (user, item, rating) splits.
    return list(user_set), list(item_set), train_set, test_set


# 近邻协同过滤
## UserCF_01

In [24]:
# Build the user -> liked-items mapping
# NOTE(review): getSet is defined again in two later cells of this notebook;
# the definition executed last is the one in effect.
def getSet( triples ):
    '''
    :param triples: iterable of (user, item, rating)
    :return: defaultdict {user: {items rated 1 by that user}}
    '''
    liked_by_user = collections.defaultdict(set)
    for user, item, rating in triples:
        if rating != 1:
            continue
        liked_by_user[user].add(item)
    return liked_by_user

# Cosine similarity between two sets
def cos4set( set1, set2 ):
    '''
    Cosine similarity of two sets: |set1 & set2| / sqrt(|set1| * |set2|).

    :param set1: first set
    :param set2: second set
    :return: similarity as a float; 0.0 when either set is empty
    '''
    # An empty set would make the denominator zero; it shares nothing with any
    # other set, so its similarity is defined as 0 instead of raising.
    if not set1 or not set2:
        return 0.0
    return len(set1 & set2) / (len(set1) * len(set2)) ** 0.5

# k-nearest-neighbour search over a {key: set} mapping
def knn4set(trainset, k, sim_method):
    '''
    :param trainset: training mapping {sample: set}
    :param k: number of neighbours to keep per sample
    :param sim_method: similarity function taking two sets
    :return: {sample1: [neighbour1, neighbour2, neighbour3]}, most similar first
    '''
    neighbours = {}
    for anchor in tqdm(trainset):
        anchor_set = trainset[anchor]
        # Candidates are all other samples that share at least one element with
        # the anchor; each is paired with its similarity score.
        scored = [
            (other, sim_method(anchor_set, trainset[other]))
            for other in trainset
            if other != anchor and len(anchor_set & trainset[other]) != 0
        ]
        # Highest similarity first, then keep the top k keys.
        ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)
        neighbours[anchor] = [key for key, _ in ranked[:k]]
    return neighbours

# Build recommendation sets from each user's nearest neighbours
def get_recomedations_by_usrCF( user_sims, user_o_set ):
    '''
    :param user_sims: neighbours per user: {user1: [neighbour1, neighbour2, neighbour3]}
    :param user_o_set: items each user already has: {user1: {item1, item2, item3}}
    :return: recommendations per user {user1: {item1, item2, item3}};
             users with no neighbours get no entry
    '''
    recomedations = collections.defaultdict(set)
    for user, neighbours in user_sims.items():
        for sim_u in neighbours:
            # Recommend what the neighbour has, minus what the user already has.
            recomedations[user].update(user_o_set[sim_u] - user_o_set[user])
    return recomedations

# Produce UserCF recommendations
def trainUserCF( user_items_train, sim_method, user_all_items, k = 5 ):
    '''
    :param user_items_train: {user: set} mapping used to find similar users
    :param sim_method: similarity function taking two sets
    :param user_all_items: {user: set} mapping of items excluded from each user's recommendations
    :param k: number of neighbours per user
    :return: recommendations per user, as built by get_recomedations_by_usrCF
    '''
    return get_recomedations_by_usrCF(
        knn4set( user_items_train, k, sim_method ),
        user_all_items,
    )


## ItemCF_01

In [25]:
def getSet( triples ):
    '''
    :param triples: iterable of (user, item, rating)
    :return: (item_users, user_items) built from the triples whose rating is 1:
             {item: {users who like it}} and {user: {items they like}}
    '''
    # Indexed by item: the users who like that item
    fans_by_item = collections.defaultdict(set)
    # Indexed by user: the items that user likes
    liked_by_user = collections.defaultdict(set)
    for user, item, rating in triples:
        if rating != 1:
            continue
        fans_by_item[item].add(user)
        liked_by_user[user].add(item)
    return fans_by_item, liked_by_user

# Build recommendation sets from the neighbours of each item a user has
def get_recomedations_by_itemCF( item_sims, user_o_set ):
    '''
    :param item_sims: neighbours per item: {item1: [neighbour1, neighbour2, neighbour3]}
    :param user_o_set: items each user already has: {user1: {item1, item2, item3}}
    :return: recommendations per user {user1: {item1, item2, item3}};
             users none of whose items appear in item_sims get no entry
    '''
    recomedations = collections.defaultdict(set)
    for user, owned in user_o_set.items():
        for item in owned:
            # Items without a neighbour list contribute nothing.
            if item not in item_sims:
                continue
            # Recommend the item's neighbours, minus what the user already has.
            recomedations[user].update(set(item_sims[item]) - owned)
    return recomedations

# Produce ItemCF recommendations
def trainItemCF( item_users_train, sim_method, user_all_items, k = 5 ):
    '''
    :param item_users_train: {item: set} mapping used to find similar items
    :param sim_method: similarity function taking two sets
    :param user_all_items: {user: set} mapping of items each user already has
    :param k: number of neighbours per item
    :return: recommendations per user, as built by get_recomedations_by_itemCF
    '''
    return get_recomedations_by_itemCF(
        knn4set( item_users_train, k, sim_method ),
        user_all_items,
    )

# 模型评估

In [26]:
from sklearn.metrics import roc_auc_score, precision_score, recall_score, accuracy_score
import numpy as np

def precision( y_true, y_pred ):
    '''
    Thin wrapper that forwards both arguments to sklearn's precision_score.

    :param y_true: ground-truth labels (presumably binary 0/1 - confirm with callers)
    :param y_pred: predicted labels
    :return: whatever precision_score returns for these inputs
    '''
    return precision_score( y_true, y_pred )

def precision4Set( test_pos_set, test_neg_set, pred_set ):
    '''
    :param test_pos_set: items the user truly likes {iid1, iid2, iid3}
    :param test_neg_set: items the user truly dislikes {iid1, iid2, iid3}
    :param pred_set: recommended items {iid2, iid3, iid4}
    :return: precision over the recommended items that have a known label,
             or None when no recommended item is labelled
    '''
    true_pos = len( pred_set & test_pos_set )
    false_pos = len( pred_set & test_neg_set )
    labelled = true_pos + false_pos
    # A stricter variant would divide by len(pred_set) instead of the labelled count.
    if labelled > 0:
        return true_pos / labelled
    # The recommendations overlap neither the positives nor the negatives.
    return None

def recall( y_true, y_pred ):
    '''
    Thin wrapper that forwards both arguments to sklearn's recall_score.

    :param y_true: ground-truth labels (presumably binary 0/1 - confirm with callers)
    :param y_pred: predicted labels
    :return: whatever recall_score returns for these inputs
    '''
    return recall_score( y_true, y_pred )

def recall4Set( test_set, pred_set ):
    '''
    :param test_set: items the user truly likes {iid1, iid2, iid3}; must not be empty
    :param pred_set: recommended items {iid2, iid3, iid4}
    :return: recall, i.e. the share of test_set that was recommended
    '''
    # Size of the overlap divided by the size of the ground-truth set.
    recovered = pred_set & test_set
    return len(recovered) / len(test_set)

def auc(y_true,y_scores):
    '''
    Thin wrapper that forwards both arguments to sklearn's roc_auc_score.

    :param y_true: ground-truth labels
    :param y_scores: predicted scores
    :return: whatever roc_auc_score returns for these inputs
    '''
    return roc_auc_score(y_true,y_scores)

def accuracy(y_true,y_scores):
    '''
    Thin wrapper that forwards both arguments to sklearn's accuracy_score.

    :param y_true: ground-truth labels
    :param y_scores: passed as the second positional argument of accuracy_score
                     (NOTE(review): presumably these must be predicted labels rather
                     than raw scores despite the name - confirm with callers)
    :return: whatever accuracy_score returns for these inputs
    '''
    return accuracy_score(y_true,y_scores)

def MSE(y_true, y_pred):
    '''
    Mean squared error.

    :param y_true: array-like of true values
    :param y_pred: array-like of predicted values
    :return: average of the squared element-wise differences
    '''
    residual = np.array(y_true) - np.array(y_pred)
    return np.average(residual ** 2)

def RMSE(y_true, y_pred):
    '''
    Root mean squared error.

    :param y_true: array-like of true values
    :param y_pred: array-like of predicted values
    :return: square root of MSE(y_true, y_pred)
    '''
    mean_squared = MSE(y_true, y_pred)
    return mean_squared ** 0.5

def MAE(y_true,y_pred):
    '''
    Mean absolute error.

    :param y_true: array-like of true values
    :param y_pred: array-like of predicted values
    :return: average of the absolute element-wise differences
    '''
    residual = np.array(y_true) - np.array(y_pred)
    return np.average(np.abs(residual))


In [27]:
# Read the triples into set form, e.g. {uid1: {iid1, iid2, iid3}}
def getSet( triples ):
    '''
    :param triples: iterable of (user, item, rating)
    :return: (user_pos_items, item_users, user_neg_items, user_all_items), where a
             rating of 1 counts as "liked" and any other rating as "not liked"
    '''
    # Items each user likes
    liked_by_user = collections.defaultdict(set)
    # Items each user does not like
    disliked_by_user = collections.defaultdict(set)
    # Every item each user interacted with
    seen_by_user = collections.defaultdict(set)
    # Indexed by item: the users who like that item
    fans_by_item = collections.defaultdict(set)
    for user, item, rating in triples:
        seen_by_user[user].add(item)
        if rating != 1:
            disliked_by_user[user].add(item)
            continue
        liked_by_user[user].add(item)
        fans_by_item[item].add(user)
    return liked_by_user, fans_by_item, disliked_by_user, seen_by_user

In [28]:
def evaluation( test_set, user_neg_items, pred_set ):
    '''
    Print the precision and recall of the recommendations, averaged over users.

    :param test_set: {user: set of items the user truly likes}
    :param user_neg_items: {user: set of items the user truly dislikes}
    :param pred_set: {user: set of recommended items}
    :return: None; the two averages are printed
    '''
    total_r = 0.0
    total_p = 0.0
    has_p_count = 0
    evaluated_users = 0
    no_items = set()
    for uid in test_set:
        positives = test_set[uid]
        # Users without any positive test item cannot be scored.
        if len(positives) == 0:
            continue
        evaluated_users += 1
        # .get() keeps plain dicts from raising KeyError and keeps defaultdicts
        # from silently growing a new empty entry for every looked-up user.
        recommended = pred_set.get(uid, no_items)
        negatives = user_neg_items.get(uid, no_items)
        p = precision4Set( positives, negatives, recommended )
        # Only None means "undefined"; a precision of exactly 0.0 is a real result
        # and must be counted, otherwise the average is biased upwards.
        if p is not None:
            total_p += p
            has_p_count += 1
        total_r += recall4Set( positives, recommended )

    # Average over the users that actually contributed; report nan when there are none
    # instead of dividing by zero.
    avg_p = total_p / has_p_count if has_p_count else float('nan')
    avg_r = total_r / evaluated_users if evaluated_users else float('nan')
    print("Precision {:.4f} | Recall {:.4f}".format(avg_p, avg_r))

In [29]:
# Load the rating triples and hold out 10% of them as the test set; the user and
# item id lists that readRecData also returns are not needed here.
_, _, train_set, test_set = readRecData(Ml_100K.RATING, test_ratio=0.1)

读取用户评分三元组...


97466it [00:00, 1185692.96it/s]


In [30]:
# getSet returns (user_pos_items, item_users, user_neg_items, user_all_items).
# From the training split: liked items per user, users per liked item, and every
# item each user interacted with (built from the training triples only).
user_items_train, item_users_train, _ ,user_all_items= getSet(train_set)
# From the test split: the positive and negative items per user, used for scoring.
user_pos_items_test, _, user_neg_items_test,_ = getSet(test_set)

# Both models use cosine similarity between sets and 5 nearest neighbours.
recomedations_by_userCF = trainUserCF( user_items_train, cos4set, user_all_items, k=5 )
recomedations_by_itemCF = trainItemCF( item_users_train, cos4set, user_all_items, k=5 )    

100%|██████████| 942/942 [00:01<00:00, 740.30it/s]
100%|██████████| 1363/1363 [00:01<00:00, 1080.46it/s]


In [31]:
# NOTE(review): this prints the whole {user: items} mapping of the test split and
# floods the cell output; consider showing only a few entries.
print(user_pos_items_test)
# Score both recommenders against the same held-out positives and negatives.
print('user_CF')
evaluation( user_pos_items_test, user_neg_items_test , recomedations_by_userCF )
print('item_CF')
evaluation( user_pos_items_test, user_neg_items_test , recomedations_by_itemCF )

defaultdict(<class 'set'>, {880: {674, 483, 1129, 202, 1452, 877, 1390, 1109, 310, 1303, 22, 246, 1469, 1471}, 622: {6, 1355, 973, 909, 1019}, 605: {1158, 744, 1483, 985, 272, 1552, 1106, 117, 758, 727, 441, 700}, 333: {1344, 185, 1350, 263, 1160, 137, 294, 555, 1094, 1229, 1485, 345, 1370, 1180, 158}, 744: {1116, 278, 7}, 861: {1091, 5, 904, 1194, 973, 656, 185, 732}, 881: {1090, 869, 1019, 854, 887, 185, 379}, 617: {1569, 868, 904, 360, 110, 1490, 182, 503, 1304, 537, 22, 987, 508}, 933: {1056, 327, 1448, 585, 361, 750, 634, 217, 1530}, 726: {641, 1538, 771, 450, 388, 136, 623, 16, 1010, 310, 94}, 406: {1345, 450, 644, 101, 1127, 973, 887, 17, 114, 1201, 501, 534, 1431, 1370, 318}, 631: {672, 1345, 2, 360, 441, 1019, 382}, 906: {834, 333, 1392, 691, 372, 438, 1211, 191}, 552: {1056, 1063, 617, 874, 17, 177, 1203, 628, 1307, 895}, 347: {34, 711, 951, 1304, 442}, 296: {1316, 1127, 455, 495, 593, 310, 23}, 42: {1056, 839, 1130, 909, 1490, 115, 117, 504, 1208, 250, 1116, 1213, 1502}, 353