In [1]:
import pandas as pd
import math

In [52]:
class RecBasedtag:
    #用于从文件中读取的为字符串，统一格式为整数，方便后续计算
    def __init__(self):
        #用户听过达人的次数文件
        self.user_rate_file = "./data/lastfm-2k/user_artists.dat"
        #用户打标签的文件
        self.user_tag_file = "./data/lastfm-2k/user_taggedartists.dat"
        #获取达人所有的ID
        #read_table()读取以‘/t’分割的文件到DataFrame,delimiter用于对行中各字段进行切分的字符
        self.artAll = list(pd.read_table("./data/lastfm-2k/artists.dat",delimiter = '\t')['id'].values)
        #用户对达人对评分
        self.userRateDict = self.getUserRate()
        #艺术家和标签的相似度
        self.artistsTagsDict = self.getArtistsTags()
        #用户对每个标签打标的次数统计和每个标签被所有用户打标的次数统计
        self.userTagDict, self.tagUserDict = self.getUserTagNum()
        #用户最终对每个标签的喜好程度
        self.userTagPre = self.getUserTagPre()
        #self.recommendForUser = self.recommendForUser("2", K=20)
        #print(self.artAll)
    #计算某个用户user对艺术家对喜好程度, 指定某个user，k为推荐达人的个数
    def recommendForUser(self, user, K, flag = True):
        userArtistPreDict = dict()
        # 得到用户没有打标过的艺术家
        #遍历艺术家，取出一个
        for artist in self.artAll:
            #如果该艺术家在我们已经计算过的字典里
            if int(artist) in self.artistsTagsDict.keys():
                # 计算用户对艺术的喜好程度
                for tag in self.userTagPre[int(user)].keys():
                    #得到该用户对某个标签对喜好
                    rate_ut = self.userTagPre[int(user)][int(tag)]
                    #得到该艺术家对该标签的相关度
                    rel_it = (
                        0
                        if tag not in self.artistsTagsDict[int(artist)].keys()
                        else self.artistsTagsDict[int(artist)][tag]
                    )
                    #如果存在，则后添加，否则等于
                    if artist in userArtistPreDict.keys():                   
                        userArtistPreDict[int(artist)] += rate_ut * rel_it
                    else:
                        userArtistPreDict[int(artist)] = rate_ut * rel_it

        newUserArtistPreDict = dict()
        #通过flag变量来控制功能
        if flag:
            # 对推荐结果进行过滤，过滤掉用户已经听过的达人
            for artist in userArtistPreDict.keys():
                if artist not in self.userRateDict[int(user)].keys():
                    newUserArtistPreDict[artist] = userArtistPreDict[int(artist)]
            return sorted(
                newUserArtistPreDict.items(), key = lambda k:k[1], reverse = True
            )[:K]
        else:
            #是用来进行结果评估
            return sorted(
                userArtistPreDict.items(), key=lambda k: k[1], reverse=True
            )[:K]
    #获取用户对达人的评分
    def getUserRate(self):
        userRateDict = dict()
        fr = open(self.user_rate_file, "r", encoding='utf-8')
        for line in fr.readlines():
            if not line.startswith('userID'):
                userID, artistID, weight = line.split('\t')
                userRateDict.setdefault(int(userID), {})
                #对听歌次数缩放，避免计算结果太大
                userRateDict[int(userID)][int(artistID)] = float(weight) / 10000           
        return userRateDict
    
     #获得每个用户打标对标签和每个标签被打标的次数
    def getUserTagNum(self):
        userTagDict = dict()
        tagUserDict = dict()
        for line in open(self.user_tag_file, "r", encoding='utf-8'):
            if not line.startswith('userID'):
                userID, artistID, tagID = line.strip().split('\t')[:3]
                #每个标签被打标的次数
                if int(tagID) in tagUserDict.keys():
                    tagUserDict[int(tagID)] += 1
                else:
                    tagUserDict[int(tagID)] = 1
                #每个用户对每个标签的打标次数
                userTagDict.setdefault(int(userID), {})
                if int(tagID) in userTagDict[int(userID)].keys():
                    userTagDict[int(userID)][int(tagID)] += 1
                else:
                    userTagDict[int(userID)][int(tagID)] = 1
        return userTagDict, tagUserDict
    
    # 获取艺术家对应的标签基因,这里的相关度全部为1
    def getArtistsTags(self):
        artistsTagsDict = dict()
        for line in open(self.user_tag_file, 'r', encoding='utf-8'):
            if not line.startswith('userID'):
                artistID, tagID = line.split('\t')[1:3]
                artistsTagsDict.setdefault(int(artistID), {})
                # 如果艺术家有对应的标签则记录，相关度为1，否则不为1
                artistsTagsDict[int(artistID)][int(tagID)] = 1
        return artistsTagsDict
    
    # 获取用户对标签的最终兴趣度
    def getUserTagPre(self):
        userTagPre = dict()
        userTagCount = dict()
        # Num 为用户打标总条数
        Num = len(open(self.user_tag_file, "r", encoding="utf-8").readlines())
        for line in open(self.user_tag_file, "r", encoding="utf-8").readlines():
            if not line.startswith('userID'):
                userID, artistID, tagID = line.split('\t')[:3]
                userTagPre.setdefault(int(userID), {})
                userTagCount.setdefault(int(userID), {})
                #rate_ui为用户对艺术家对评分
                rate_ui = (
                    self.userRateDict[int(userID)][int(artistID)]
                    
                    if int(artistID) in self.userRateDict[int(userID)].keys()
                    else 0
                )
                #artistsTagsDict艺术家对标签的相关度，userTagPre用户对标签的主观喜好
                if int(tagID) not in userTagPre[int(userID)].keys():
                    userTagPre[int(userID)][int(tagID)] = (
                        rate_ui * self.artistsTagsDict[int(artistID)][int(tagID)]

                    )
                    userTagCount[int(userID)][int(tagID)] = 1
                else:
                    userTagPre[int(userID)][int(tagID)] += (
                            rate_ui * self.artistsTagsDict[int(artistID)][int(tagID)]
                        )
                    userTagCount[int(userID)][int(tagID)] += 1

        for userID in userTagPre.keys():
            for tagID in userTagPre[userID].keys():
                #tfidf值为用户对标签的依赖度
                #计算tf值
                tf_ut = self.userTagDict[int(userID)][int(tagID)] / sum(self.userTagDict[int(userID)].values())
                #计算idf值
                idf_ut = math.log(Num * 1.0 / (self.tagUserDict[int(tagID)] + 1))
                #用户对标签的兴趣度为主观喜好度*依赖度
                userTagPre[userID][tagID] = (
                    userTagPre[userID][tagID] / userTagCount[userID][tagID] * tf_ut * idf_ut
                )
            return userTagPre
        
    #效果评估,userRateDict为用户对达人对评分字典
    def evaluate(self, user):
        K = len(self.userRateDict[int(user)])
        recResult = self.recommendForUser(user, K=K, flag=False)
        count = 0
        for (artist, pre) in recResult:
            if artist in self.userRateDict[int(user)]:
                count += 1
        return count * 1.0 / K

In [12]:
    #获取用户对达人的评分
    def getUserRate(self):
        userRateDict = dict()
        fr = open(self.user_rate_file, "r", encoding='utf-8')
        for line in fr.readlines():
            if not line.startswith('userID'):
                userID, artistID, weight = line.split('\t')
                userRateDict.setdefault(int(userID), {})
                #对听歌次数缩放，避免计算结果太大
                userRateDict[int(userID)][int(artistID)] = float(weight) / 10000           
        return userRateDict


In [13]:
    #获得每个用户打标对标签和每个标签被打标的次数
    def getUserTagNum(self):
        userTagDict = dict()
        tagUserDict = dict()
        for line in open(self.user_tag_file, "r", encoding='utf-8'):
            if not line.startswith('userID'):
                userID, artistID, weight = line.strip().split('\t')[:3]
                #每个标签被打标的次数
                if int(tagID) in tagUserDict.keys():
                    tagUserDict[int(tagID)] += 1
                else:
                    tagUserDict[int(tagID)] = 1
                #每个用户对每个标签的打标次数
                userTagDict.setdefault(int(userID), {})
                if int(tagID) in userTagDict[int(userID)].keys():
                    userTagDict[int(userID)][int(tagID)] += 1
                else:
                    userTagDict[int(userID)][int(tagID)] = 1
        return userTagDict, tagUserDict


In [14]:
    # 获取艺术家对应的标签基因,这里的相关度全部为1
    def getArtistsTags(self):
        artistsTagsDict = dict()
        for line in open(self.user_tag_file, 'r', encoding='utf-8'):
            if not line.startswith('userID'):
                artistID, tagID = line.strip('\t')[1:3]
                artistsTagsDict.setdefault(int(artistID), {})
                # 如果艺术家有对应的标签则记录，相关度为1，否则不为1
                artistsTagsDict[int(artistID)][int(tagID)] = 1
        return artistsTagsDict


In [15]:
    # 获取用户对标签的最终兴趣度
    def getUserTagPre(self):
        userTagPre = dict()
        userTagCount = dict()
        # Num 为用户打标总条数
        Num = len(open(self.user_tag_file, "r", encoding="utf-8").readlines())
        for line in open(self.user_tag_file, "r", encoding="utf-8").readlines():
            if not line.startswith('userID'):
                userID, artistID, tagID = line.strip('\t')[:3]
                userTagPre.setdefault(int(userID), {})
                userTagCount.setdefault(int(userID), {})
                #rate_ui为用户对艺术家对评分
                rate_ui = (
                    self.userRateDict[int(userID)][int(artistID)]
                    if int(artistID) in self.userRateDict[int(userID)].keys()
                    else 0
                )
                #artistsTagsDict艺术家对标签的相关度，userTagPre用户对标签的主观喜好
                if int(tagID) not in userTagPre[int(userID)].keys():
                    userTagPre[int(userID)][int(tagID)] = (
                        rate_ui * self.artistsTagsDict[int(artistID)][int(tagID)]

                    )
                    userTagCount[int(userID)][int(tagID)] = 1
                else:
                    userTagPre[int(userID)][int(tagID)] += (
                            rate_ui * self.artistsTagsDict[int(artistID)][int(tagID)]
                        )
                    userTagCount[int(userID)][int(tagID)] += 1

        for userID in userTagPre.keys():
            for tagID in userTagPre[userID].keys():
                #tfidf值为用户对标签的依赖度
                #计算tf值
                tf_ut = self.userTagDict[int(userID)][int(tagID)] / sum(self.userTagDict[int(userID)].values())
                #计算idf值
                idf_ut = math.log(Num * 1.0 / (self.tagUserDict[int(tagID)] + 1))
                #用户对标签的兴趣度为主观喜好度*依赖度
                userTagPre[userID][tagID] = (
                    userTagPre[userID][tagID] / userTagCount[userID][tagID] * tf_ut * idf_ut
                )
            return userTagPre



In [16]:
    #计算某个用户user对艺术家对喜好程度, 指定某个user，k为推荐达人的个数
    def recommendForUser(self, user, K, flag = True):
        userArtistPreDict = dict()
        # 得到用户没有打标过的艺术家
        #遍历艺术家，取出一个
        for artist in self.artistsAll:
            #如果该艺术家在我们已经计算过的字典里
            if int(artistID) in self.artistsTagsDict.keys():
                # 计算用户对艺术的喜好程度
                for tag in self.userTagPre[int(user)].keys():
                    #得到该用户对某个标签对喜好
                    rate_ut = self.userTagPre[int(user)][int(tag)]
                    #得到该艺术家对该标签的相关度
                    rel_it = (
                        0
                        if tag not in self.artistsTagsDict[int(artist)].keys()
                        else self.artistsTagsDict[int(artist)][tag]
                    )
                    #如果存在，则后添加，否则等于
                    if artist in userArtistPreDict.keys():                   
                        userArtistPreDict[int(artist)] += rate_ut * rel_it
                    else:
                        userArtistPreDict[int(artist)] = rate_ut * rel_it

        newUserArtistPreDict = dict()
        #通过flag变量来控制功能
        if flag:
            # 对推荐结果进行过滤，过滤掉用户已经听过的达人
            for artist in userArtistPreDict.keys():
                if artist not in self.userRateDict[int(user)].keys():
                    newUserArtistPreDict[artist] = userArtistPreDict[int(artist)]
            return sorted(
                newUserArtistPreDict.items(), key = lambda k:k[1], reverse = True
            )[:K]
        else:
            #是用来进行结果评估
            return sorted(
                userArtistPreDict.items(), key=lambda k: k[1], reverse=True
            )[:K]



In [17]:
    #效果评估,userRateDict为用户对达人对评分字典
    def evaluate(self, user):
        K = len(self.userRateDict[int(user)])
        recResult = self.recommendForUser(user, K=K, flag=False)
        count = 0
        for (artist, pre) in recResult:
            if artist in self.userRateDict[int(user)]:
                count += 1
        return count * 1.0 / K

In [60]:
if __name__ == "__main__":
    user_ID = "2"
    K = 20
    rbt = RecBasedtag()
    print(rbt.recommendForUser(user_ID, K=K))
    print('我是分割线-----------------------------------------------------------------------------------------------')
    print(rbt.evaluate(user_ID))


[(5803, 0.9397784544070824), (6582, 0.9397784544070824), (18229, 0.9280932264044133), (18232, 0.9280932264044133), (1965, 0.9269016485847453), (15675, 0.9269016485847453), (1801, 0.9004958140614302), (1835, 0.9004958140614302), (2605, 0.9004958140614302), (2668, 0.9004958140614302), (4852, 0.9004958140614302), (4863, 0.9004958140614302), (3992, 0.8579588652986282), (8068, 0.8565790775053892), (748, 0.8460267445255922), (2673, 0.8460267445255922), (4316, 0.8460267445255922), (10522, 0.8460267445255922), (175, 0.8433451847266689), (10519, 0.8425740993544554)]
我是分割线-----------------------------------------------------------------------------------------------
0.22
