In [2]:
# Item Based Collaborative Filtering

In [3]:
import sys
import random
import math
import os
from operator import itemgetter
from collections import defaultdict

In [4]:
# Seed the `random` module so the run is repeatable: with the same seed value,
# every subsequent random number (and therefore the train/test split) is the same.
random.seed(0)

In [69]:
class ItemBasedCF(object):
    """Top-N recommendation using item-based collaborative filtering.

    Intended call order: ``generate_dataset`` (load and split the ratings),
    ``calc_movie_sim`` (build the movie-to-movie similarity matrix), then
    ``recommend`` for one user or ``evaluate`` for the whole training set.
    Progress and results are reported on ``sys.stderr``.
    """

    def __init__(self):
        # user id -> {movie id -> rating}, filled by generate_dataset().
        self.trainset = {}
        self.testset = {}

        # n_sim_movie: how many most-similar movies are consulted per watched movie (K).
        # n_rec_movie: how many movies are returned per user (N).
        self.n_sim_movie = 20
        self.n_rec_movie = 10

        # movie_sim_mat: movie -> {other movie -> similarity}.
        # movie_popular: movie -> number of training users who rated it.
        # movie_count:   number of distinct movies in the training set.
        self.movie_sim_mat = {}
        self.movie_popular = {}
        self.movie_count = 0

        # Status messages are written to stderr, like every other log line of the class.
        print('Similar movie number = %d' % self.n_sim_movie, file=sys.stderr)
        print('Recommended movie number = %d' % self.n_rec_movie, file=sys.stderr)

    @staticmethod
    def loadfile(filename):
        """Lazily yield the lines of ``filename`` with CR/LF characters stripped.

        This is a generator: nothing is read until it is iterated. A progress
        message is written to stderr every 100000 lines and once more when the
        file has been read completely.

        Args:
            filename: path of the text file to read.

        Yields:
            str: one line at a time, without its line terminator.
        """
        # `with` guarantees the handle is closed even when the consumer stops
        # iterating early or an exception propagates through the generator.
        with open(filename, 'r') as fp:
            for i, line in enumerate(fp):
                yield line.strip('\r\n')
                if i % 100000 == 0:
                    print('loading %s(%s)' % (filename, i), file=sys.stderr)
        print('load %s succ' % filename, file=sys.stderr)

    def generate_dataset(self, filename, pivot=0.7):
        """Load rating data and split it randomly into training and test sets.

        Every non-blank line must have the form ``user::movie::rating::timestamp``.
        A rating goes to ``self.trainset`` when ``random.random() < pivot`` and
        to ``self.testset`` otherwise; both are updated in place.

        Args:
            filename: path of the ratings file.
            pivot: probability that a rating is assigned to the training set.

        Raises:
            ValueError: if a non-blank line does not have exactly four
                ``::``-separated fields or its rating is not an integer.
        """
        trainset_len = 0
        testset_len = 0

        for line in self.loadfile(filename):
            # Blank lines (e.g. a trailing empty line) carry no rating; skip
            # them instead of failing on the 4-way unpack below.
            if not line.strip():
                continue

            # Fields: user id, movie id, rating, timestamp (timestamp is unused).
            user, movie, rating, _ = line.split('::')

            # Split the data by pivot.
            if random.random() < pivot:
                self.trainset.setdefault(user, {})[movie] = int(rating)
                trainset_len += 1
            else:
                self.testset.setdefault(user, {})[movie] = int(rating)
                testset_len += 1

        print('split training set and test set succ', file=sys.stderr)
        print('train set = %s' % trainset_len, file=sys.stderr)
        print('test set = %s' % testset_len, file=sys.stderr)

    def calc_movie_sim(self):
        """Build the movie similarity matrix from ``self.trainset``.

        Populates ``self.movie_popular``, ``self.movie_count`` and
        ``self.movie_sim_mat``. The similarity of two movies is the number of
        users who rated both, divided by the square root of the product of the
        two movies' rater counts.

        All three attributes are rebuilt from scratch on every call, so calling
        the method again (for example after re-running a notebook cell) gives
        the same result instead of accumulating onto the previous one.
        """
        print('counting movies number and popularity...', file=sys.stderr)

        # Start from empty structures: the loops below only ever add to them.
        self.movie_popular = {}
        self.movie_sim_mat = {}

        for movies in self.trainset.values():
            for movie in movies:
                # Count item popularity.
                self.movie_popular[movie] = self.movie_popular.get(movie, 0) + 1

        print('count movies number and popularity succ', file=sys.stderr)

        # Save the total number of movies.
        self.movie_count = len(self.movie_popular)
        print('total movie number = %d' % self.movie_count, file=sys.stderr)

        # Count co-rated users between items.
        itemsim_mat = self.movie_sim_mat
        print('building co-rated users matrix...', file=sys.stderr)

        for movies in self.trainset.values():
            for m1 in movies:
                co_rated = itemsim_mat.setdefault(m1, defaultdict(int))
                for m2 in movies:
                    if m1 == m2:
                        continue
                    co_rated[m2] += 1

        print('build co-rated users matrix succ', file=sys.stderr)

        # Calculate similarity matrix.
        print('calculating movie similarity matrix...', file=sys.stderr)
        simfactor_count = 0
        PRINT_STEP = 2000000

        for m1, related_movies in itemsim_mat.items():
            for m2, count in related_movies.items():
                # Cosine-style similarity. Overwriting the value of an existing
                # key does not resize the dict, so it is safe while iterating.
                related_movies[m2] = count / math.sqrt(
                    self.movie_popular[m1] * self.movie_popular[m2])
                simfactor_count += 1
                # Progress report.
                if simfactor_count % PRINT_STEP == 0:
                    print('calculating movie similarity factor(%d)' %
                          simfactor_count, file=sys.stderr)

        print('calculate movie similarity matrix(similarity factor) succ',
              file=sys.stderr)
        print('Total similarity factor number = %d' %
              simfactor_count, file=sys.stderr)

    def recommend(self, user):
        """Find the K most similar movies per watched movie and recommend N movies.

        For every movie the user rated in the training set, the K
        (``n_sim_movie``) most similar movies are looked up; each one the user
        has not rated yet accumulates ``similarity * rating`` as its score.

        Args:
            user: user id, as used for the keys of ``self.trainset``.

        Returns:
            list[tuple]: up to N (``n_rec_movie``) ``(movie, score)`` pairs,
            highest score first. Empty when the user has no training data.
        """
        K = self.n_sim_movie
        N = self.n_rec_movie
        rank = {}
        # A user without training ratings simply gets no recommendations.
        watched_movies = self.trainset.get(user, {})

        # NOTE(review): an earlier profiling remark attributed about 98% of the
        # run time to this loop - presumably the per-movie sort; TODO confirm.
        for movie, rating in watched_movies.items():
            # `.get` keeps this usable when a movie has no similarity row
            # (e.g. calc_movie_sim() has not been run yet).
            neighbours = sorted(self.movie_sim_mat.get(movie, {}).items(),
                                key=itemgetter(1), reverse=True)[:K]
            for related_movie, similarity_factor in neighbours:
                if related_movie in watched_movies:
                    continue
                rank[related_movie] = (rank.get(related_movie, 0)
                                       + similarity_factor * rating)

        # Return the N best movies.
        return sorted(rank.items(), key=itemgetter(1), reverse=True)[:N]

    @staticmethod
    def _safe_ratio(numerator, denominator):
        """Return ``numerator / denominator`` as a float, or 0.0 for a zero denominator."""
        if not denominator:
            return 0.0
        return numerator / (1.0 * denominator)

    def evaluate(self):
        """Print precision, recall, coverage and popularity of the recommender.

        Recommendations are generated for every user of the training set and
        compared with that user's movies in the test set. A metric whose
        denominator is zero (no users, empty test set, no movies) is reported
        as 0.0 instead of raising ``ZeroDivisionError``.
        """
        print('Evaluation start...', file=sys.stderr)

        N = self.n_rec_movie
        # Variables for precision and recall.
        hit = 0         # recommended movies that also appear in the user's test set
        rec_count = 0   # recommendation slots: N per user, even if fewer were produced
        test_count = 0  # total number of test-set movies over all evaluated users
        # Variable for coverage.
        all_rec_movies = set()
        # Variable for popularity.
        popular_sum = 0

        # Compare the recommendations with the held-out test data.
        for i, user in enumerate(self.trainset):
            if i % 500 == 0:
                print('recommended for %d users' % i, file=sys.stderr)
            test_movies = self.testset.get(user, {})
            rec_movies = self.recommend(user)
            for movie, _ in rec_movies:
                if movie in test_movies:
                    hit += 1
                all_rec_movies.add(movie)
                popular_sum += math.log(1 + self.movie_popular[movie])
            rec_count += N
            test_count += len(test_movies)

        precision = self._safe_ratio(hit, rec_count)
        recall = self._safe_ratio(hit, test_count)
        coverage = self._safe_ratio(len(all_rec_movies), self.movie_count)
        popularity = self._safe_ratio(popular_sum, rec_count)

        print('precision=%.4f\trecall=%.4f\tcoverage=%.4f\tpopularity=%.4f' %
              (precision, recall, coverage, popularity), file=sys.stderr)

In [70]:
if __name__ == '__main__':
    # Location of the ratings file, relative to the working directory.
    ratingfile = os.path.join('ml-1m', 'ratings.dat')

    # Build the recommender. The name `itemcf` is kept at module level because
    # the inspection cells further down refer to it.
    itemcf = ItemBasedCF()

    # Pipeline, in order:
    #   1. load the ratings and split them with the default pivot of
    #      generate_dataset() into itemcf.trainset / itemcf.testset,
    #   2. compute the movie-to-movie similarity matrix,
    #   3. evaluate the quality of the recommendations.
    itemcf.generate_dataset(ratingfile)
    itemcf.calc_movie_sim()
    itemcf.evaluate()

Similar movie number = 20
Recommended movie number = 10
loading ml-1m\ratings.dat(0)
loading ml-1m\ratings.dat(100000)
loading ml-1m\ratings.dat(200000)
loading ml-1m\ratings.dat(300000)
loading ml-1m\ratings.dat(400000)
loading ml-1m\ratings.dat(500000)
loading ml-1m\ratings.dat(600000)
loading ml-1m\ratings.dat(700000)
loading ml-1m\ratings.dat(800000)
loading ml-1m\ratings.dat(900000)
loading ml-1m\ratings.dat(1000000)
load ml-1m\ratings.dat succ
split training set and test set succ
train set = 699977
test set = 300232
counting movies number and popularity...
count movies number and popularity succ
total movie number = 3658
building co-rated users matrix...
build co-rated users matrix succ
calculating movie similarity matrix...
calculating movie similarity factor(2000000)
calculating movie similarity factor(4000000)
calculating movie similarity factor(6000000)
calculating movie similarity factor(8000000)
calculating movie similarity factor(10000000)
calculate movie similarity matrix

In [None]:
# Inspect the recommendations produced for a single user, next to that user's
# held-out test-set ratings (an empty dict when the user has none).
user = "2"
recommended = itemcf.recommend(user)
held_out = itemcf.testset.get(user, {})
print("推荐结果", recommended)
print("---", held_out)

In [59]:
# Peek at the first two users of the training set. Stop as soon as two entries
# have been printed instead of walking every remaining user with a manual counter.
for i, (key, values) in enumerate(itemcf.trainset.items()):
    if i >= 2:
        break
    print(key, values)

1 {'661': 3, '914': 3, '3408': 4, '2355': 5, '1197': 3, '594': 4, '919': 4, '938': 4, '2398': 4, '2918': 4, '1035': 5, '2018': 4, '3105': 5, '1270': 5, '527': 5, '48': 5, '1097': 4, '1721': 4, '1545': 4, '745': 3, '3186': 4, '1566': 4, '588': 4, '1907': 4, '783': 4, '1836': 5, '1022': 5, '2762': 4, '150': 5, '1961': 5, '1962': 4, '2692': 4, '1029': 5, '1207': 4, '531': 4, '3114': 4, '608': 4, '1246': 4}
2 {'1357': 5, '3068': 4, '1537': 4, '2268': 5, '2628': 3, '1103': 3, '1210': 4, '1792': 3, '1687': 3, '1213': 2, '2881': 3, '3105': 4, '434': 2, '2126': 3, '3035': 4, '292': 3, '2236': 5, '3071': 4, '902': 2, '368': 4, '1259': 5, '3147': 5, '1544': 4, '1293': 5, '1188': 4, '3256': 2, '3257': 3, '2278': 3, '2490': 3, '3654': 3, '2852': 3, '1945': 5, '982': 4, '1873': 4, '2858': 4, '1225': 5, '442': 3, '265': 4, '1408': 3, '1084': 3, '3699': 2, '480': 5, '1442': 4, '1265': 3, '1193': 5, '2353': 4, '3334': 4, '2427': 2, '590': 5, '1196': 5, '1552': 3, '736': 4, '593': 5, '2359': 3, '95': 2