# 数据加载
## 文件地址

In [10]:
import os
import collections
ROOT = os.getcwd()
ROOT = os.path.split(ROOT)[0]
ROOT = os.path.join(ROOT,'data_set')

class Ml_100K():
    #下载地址：https://github.com/rexrex9/kb4recMovielensDataProcess
    # 以双下划线（__）开头的属性名（如这里的__BASE）在类定义里通常会被解释器进行名称改写（Name Mangling）
    # 用于实现一种有限的 “私有” 机制，实际的属性名会被改写成_类名__属性名的形式。
    __BASE = os.path.join(ROOT, 'ml-100k')
    ORGINAL_DIR = os.path.join(ROOT,'ml-100k-orginal')
    USER_DF = os.path.join(ORGINAL_DIR,'user_df.csv')
    ITEM_DF = os.path.join(ORGINAL_DIR,'item_df.csv')
    ITEM_DF_0 = os.path.join(ORGINAL_DIR, 'item_df_0.csv')

    KG=os.path.join(__BASE,'kg_index.tsv')
    RATING = os.path.join(__BASE,'rating_index.tsv')
    RATING5 = os.path.join(__BASE, 'rating_index_5.tsv')

class Ml_latest_small():
    __BASE = os.path.join(ROOT,'ml-latest-small')
    RATING_TS = os.path.join(__BASE,'rating_index_ts.tsv')
    SEQS = os.path.join(__BASE, 'seqs.npy')

    SEQS_NEG = os.path.join(__BASE, 'seqsWithNeg.npy')
print(ROOT)
print(Ml_100K._Ml_100K__BASE)
print(Ml_100K.RATING)

d:\DeepLearning\推荐系统\recbyhand\data_set
d:\DeepLearning\推荐系统\recbyhand\data_set\ml-100k
d:\DeepLearning\推荐系统\recbyhand\data_set\ml-100k\rating_index.tsv


In [11]:
import json

def readTriple(path,sep=None):
    # with语句的作用：自动管理文件资源
    # path是路径 ‘r’是read表示只读模式  utf-8是文件的编码格式
    # 文件对象被赋值给f
    with open(path,'r',encoding='utf-8') as f:
        # readlines()方法会一次性读取文件中所有的行
        # 并将每行内容作为一个字符串元素储存在一个列表中
        for line in f.readlines():
            if sep:
                # .strip()去除字符串两端的空白字符
                # .split()按照分隔符将字符串拆分成列表
                lines = line.strip().split(sep)
            else:
                lines=line.strip().split()
            # 如果长度不是3那就跳过本次循环
            if len(lines)!=3:continue
            # 长度是3 利用yield将Lines列表作为一个生产的数据项返回
            yield lines

In [12]:
import random
from tqdm import tqdm

def readRecData(path,test_ratio = 0.1):
    print('读取用户评分三元组...')
    user_set,item_set=set(),set()
    triples=[]
    for u, i, r in tqdm(readTriple(path)):
        user_set.add(int(u))
        item_set.add(int(i))
        triples.append((int(u),int(i),int(r)))

    test_set=random.sample(triples,int(len(triples)*test_ratio))
    train_set=list(set(triples)-set(test_set))

    #返回用户集合列表，物品集合列表，与用户，物品，评分三元组列表
    return list(user_set),list(item_set),train_set,test_set


# 近邻协同过滤
## UerItemCF_15
参考文件 chapter2\s36_userItemCF_15label.py

In [13]:
import numpy as np

# 得到用户-物品集合
def getSet( triples ):
    user_items = collections.defaultdict( set )
    for u, i, r in triples:
        if r == 1:
            user_items[u].add(i)
    return user_items

#字典形式读取数据, 返回{uid1:{iid1:rate,iid2:rate}}
def getDict(triples):
    user_items = collections.defaultdict(dict)
    item_users = collections.defaultdict(dict)
    for u, i, r in triples:
        user_items[u][i] = float(r)
        item_users[i][u] = float(r)
    return user_items, item_users

#集合形式读取数据, 评分大于4的为正例，反之为反例。返回{uid1:{iid1,iid2,iid3}}
def getPosAndNegSet(triples):
    user_pos_items = collections.defaultdict(set)
    user_neg_items = collections.defaultdict(set)
    for u, i, r in triples:
        if r >=4:
            user_pos_items[u].add(i)
        else:
            user_neg_items[u].add(i)
    return user_pos_items, user_neg_items

# 两个向量间的cos相似度
def cos4vector( v1, v2 ):
    return (np.dot(v1,v2))/(np.linalg.norm(v1)*np.linalg.norm(v2))

# 根据评分字典得到cos相似度
def getCosSimForDict(d1, d2):
    '''
    :param d1: 字典{iid1:rate, iid2:rate}
    :param d2: 字典{iid2:rate, iid3:rate}
    :return: 得到cos相似度
    '''
    s1 = set(d1.keys())
    s2 = set(d2.keys())
    inner = s1 & s2
    if len( inner ) == 0:
        return 0 #如果没有交集，则相似度一定为0
    a1, a2 = [],[]

    for i in inner:
        a1.append(d1[i])
        a2.append(d2[i])
    for i in s1 - inner:
        a1.append(d1[i])
        a2.append(0)
    for i in s2 - inner:
        a1.append(0)
        a2.append(d2[i])
    return cos4vector( np.array(a1), np.array(a2) )

#knn算法
def knn4Dict( trainset, k ):
    sims = {}
    for e1 in tqdm( trainset ):
        ulist=[]
        for e2 in trainset:
            if e1 == e2 :
                continue
            cosSim = getCosSimForDict( trainset[e1], trainset[e2] )
            if cosSim !=0:
                ulist.append((e2, cosSim))
        sims[e1] = [i[0] for i in sorted( ulist, key = lambda x:x[1], reverse = True)[:k]]
    return sims

#得到基于相似用户的推荐列表
def get_recomedations_by_usrCF(user_sims, user_o_set, user_items):
    recomedations = collections.defaultdict(set)
    for u in user_sims:
        for sim_u in user_sims[u]:
            recomedations[u] |= (user_items[sim_u] - set(user_o_set[u].keys()))
    return recomedations

#得到基于相似物品的推荐列表
def get_recomedations_by_itemCF(item_sims, user_o_set):
    recomedations = collections.defaultdict(set)
    for u in user_o_set:
        for item in user_o_set[u]:
            recomedations[u] |= set(item_sims[item]) - set(user_o_set[u].keys())
    return recomedations

def trainUserCF( user_items_train, user_pos_items, k = 5 ):
    user_sims = knn4Dict(user_items_train, k)
    recomedations = get_recomedations_by_usrCF(user_sims, user_items_train, user_pos_items)
    return recomedations

def trainItemCF( user_items_train, item_users_train, k = 5 ):
    item_sims = knn4Dict( item_users_train, k )
    recomedations = get_recomedations_by_itemCF( item_sims, user_items_train )
    return recomedations

# 模型评估

In [14]:
from sklearn.metrics import roc_auc_score, precision_score, recall_score, accuracy_score
import numpy as np

def precision( y_true, y_pred ):
    return precision_score( y_true, y_pred )

def precision4Set( test_pos_set, test_neg_set, pred_set ):
    '''
    :param test_pos_set: 真实的用户喜爱的物品集合{iid1,iid2,iid3}
    :param test_neg_set: 真实的用户不喜爱的物品集合{iid1,iid2,iid3}
    :param pred_set: 预测的推荐集合{iid2,iid3,iid4}
    :return: 精确率
    '''
    TP = len( pred_set & test_pos_set )
    FP = len( pred_set & test_neg_set )
    # 若推荐列表和真实的正负例样本均无交集，则返回none
    p = TP / (TP + FP) if TP + FP > 0 else None
    # p = TP/len(pred_set) #若对模型严格一点可这么去算精确度
    return p

def recall( y_true, y_pred ):
    return recall_score( y_true, y_pred )

def recall4Set( test_set, pred_set ):
    '''
    :param test_set:真实的用户喜爱的物品集合{iid1,iid2,iid3}
    :param pred_set: 预测的推荐集合{iid2,iid3,iid4}
    :return: 召回率
    '''
    #计算它们的交集数量 除以 测试集的数量 即可
    return len(pred_set & test_set)/(len(test_set))

def auc(y_true,y_scores):
    return roc_auc_score(y_true,y_scores)

def accuracy(y_true,y_scores):
    return accuracy_score(y_true,y_scores)

def MSE(y_true, y_pred):
    return np.average((np.array(y_true) - np.array(y_pred)) ** 2)

def RMSE(y_true, y_pred):
    return MSE(y_true, y_pred) ** 0.5

def MAE(y_true,y_pred):
    return np.average(abs(np.array(y_true) - np.array(y_pred)))

In [15]:
def evaluation(test_set, user_neg_items, pred_set):
    total_r = 0.0
    total_p = 0.0
    has_p_count=0
    for uid in test_set:
        if len(test_set[uid]) > 0:
            p = precision4Set(test_set[uid], user_neg_items[uid], pred_set[uid])
            if p:
                total_p += p
                has_p_count+=1
            total_r += recall4Set(test_set[uid], pred_set[uid])

    print("Precision {:.4f} | Recall {:.4f}".format(total_p / has_p_count, total_r / len(test_set)))

In [16]:
_, _, train_set, test_set = readRecData(Ml_100K.RATING5, test_ratio=0.1)

user_items_train, item_users_train = getDict(train_set)
user_pos_items_train, _ = getPosAndNegSet(train_set)

#测试集，正例集合与负例集合
user_pos_items_test, user_neg_items_test = getPosAndNegSet(test_set)

读取用户评分三元组...


97466it [00:00, 1188457.47it/s]


In [17]:
recomedations_by_userCF = trainUserCF( user_items_train, user_pos_items_train, k=5 )
recomedations_by_itemCF = trainItemCF( user_items_train, item_users_train, k=5 )

100%|██████████| 943/943 [00:38<00:00, 24.41it/s]
100%|██████████| 1584/1584 [01:00<00:00, 26.25it/s]


In [18]:
print('user_CF')
evaluation( user_pos_items_test, user_neg_items_test , recomedations_by_userCF )
print('item_CF')
evaluation( user_pos_items_test, user_neg_items_test , recomedations_by_itemCF )

user_CF
Precision 0.7122 | Recall 0.7094
item_CF
Precision 0.6843 | Recall 0.6505
