In [1]:
# User Based Collaborative Filtering

import sys
import random
import math
import os
from operator import itemgetter
from collections import defaultdict

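# Seed the RNG so runs are reproducible: with the same seed, every subsequent
# random number (and therefore the train/test split) is identical.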
random.seed(0)

class UserBasedCF(object):
    ''' TopN recommendation - User Based Collaborative Filtering.

    Typical usage: ``generate_dataset`` -> ``calc_user_sim`` -> ``recommend``
    / ``evaluate``. Progress and results are logged to ``sys.stderr``.
    '''

    def __init__(self):
        # user -> {movie: rating}; filled by generate_dataset().
        self.trainset = {}
        self.testset = {}

        # n_sim_user: how many of the most similar users are consulted;
        # n_rec_movie: how many movies a recommendation returns.
        self.n_sim_user = 20
        self.n_rec_movie = 10

        # user_sim_mat: user -> {other user: similarity};
        # movie_popular: movie -> number of training users who rated it;
        # movie_count: number of distinct movies in the training set.
        self.user_sim_mat = {}
        self.movie_popular = {}
        self.movie_count = 0

        print('Similar user number = %d' % self.n_sim_user, file=sys.stderr)
        print('recommended movie number = %d' % self.n_rec_movie, file=sys.stderr)

    @staticmethod
    def loadfile(filename):
        ''' Lazily yield the lines of ``filename`` with CR/LF stripped.

        The file is opened with a context manager so the handle is released
        even if the caller stops iterating early or raises mid-iteration.
        Progress is logged to stderr every 100000 lines.
        '''
        with open(filename, 'r') as fp:
            for i, line in enumerate(fp):
                yield line.strip('\r\n')
                if i % 100000 == 0:
                    print('loading %s(%s)' % (filename, i), file=sys.stderr)
        print('load %s succ' % filename, file=sys.stderr)

    def generate_dataset(self, filename, pivot=0.7):
        ''' Load rating data and randomly split it into training and test sets.

        Each non-blank line must look like ``user::movie::rating::timestamp``.
        A record goes to ``self.trainset`` with probability ``pivot`` and to
        ``self.testset`` otherwise; the rating is stored as ``int``.

        Raises:
            ValueError: if a non-blank line does not have exactly four
                ``::``-separated fields or the rating is not an integer.
        '''
        trainset_len = 0
        testset_len = 0

        for line in self.loadfile(filename):
            # Blank lines (e.g. a trailing newline at end of file) carry no
            # record; skip them instead of failing on the unpack below.
            if not line.strip():
                continue
            # user ID, movie ID, rating, timestamp (timestamp is unused)
            user, movie, rating, _ = line.split('::')
            # split the data by pivot
            if random.random() < pivot:
                self.trainset.setdefault(user, {})[movie] = int(rating)
                trainset_len += 1
            else:
                self.testset.setdefault(user, {})[movie] = int(rating)
                testset_len += 1

        print('split training set and test set succ', file=sys.stderr)
        print('train set = %s' % trainset_len, file=sys.stderr)
        print('test set = %s' % testset_len, file=sys.stderr)

    def calc_user_sim(self):
        ''' Calculate the user-user similarity matrix from ``self.trainset``.

        Fills ``self.user_sim_mat``, ``self.movie_popular`` and
        ``self.movie_count``. Previously computed values are discarded first,
        so calling this method again (e.g. re-running a notebook cell)
        recomputes from scratch instead of accumulating onto stale results.
        '''
        # Reset in place so any outside reference to these dicts stays valid.
        self.movie_popular.clear()
        self.user_sim_mat.clear()

        # build inverse table for item-users
        # key=movieID, value=set of userIDs who have rated this movie
        print('building movie-users inverse table...', file=sys.stderr)
        movie2users = dict()

        for user, movies in self.trainset.items():
            for movie in movies:
                # inverse table for item-users
                movie2users.setdefault(movie, set()).add(user)
                # count item popularity at the same time
                self.movie_popular[movie] = self.movie_popular.get(movie, 0) + 1
        print('build movie-users inverse table succ', file=sys.stderr)

        # save the total movie number, which will be used in evaluation
        self.movie_count = len(movie2users)
        print('total movie number = %d' % self.movie_count, file=sys.stderr)

        # count co-rated items between users
        usersim_mat = self.user_sim_mat
        print('building user co-rated movies matrix...', file=sys.stderr)

        for movie, users in movie2users.items():
            for u in users:
                # Every training user gets a row, even with no neighbours.
                usersim_mat.setdefault(u, defaultdict(int))
                for v in users:
                    if u == v:
                        continue
                    usersim_mat[u][v] += 1
        print('build user co-rated movies matrix succ', file=sys.stderr)

        # calculate similarity matrix
        print('calculating user similarity matrix...', file=sys.stderr)
        simfactor_count = 0
        PRINT_STEP = 2000000

        for u, related_users in usersim_mat.items():
            for v, count in related_users.items():
                # Cosine similarity on the sets of rated movies:
                # |N(u) & N(v)| / sqrt(|N(u)| * |N(v)|).
                # Only existing keys are overwritten, so mutating the row
                # while iterating over it is safe.
                usersim_mat[u][v] = count / math.sqrt(
                    len(self.trainset[u]) * len(self.trainset[v]))
                simfactor_count += 1
                # progress log
                if simfactor_count % PRINT_STEP == 0:
                    print('calculating user similarity factor(%d)' %
                          simfactor_count, file=sys.stderr)

        print('calculate user similarity matrix(similarity factor) succ',
              file=sys.stderr)
        print('Total similarity factor number = %d' %
              simfactor_count, file=sys.stderr)

    def recommend(self, user):
        ''' Find the K most similar users and recommend up to N movies.

        Returns a list of ``(movie, score)`` pairs sorted by descending score,
        where the score is the summed similarity of the neighbours who rated
        the movie. Movies the user already rated in the training set are
        excluded. A user without training data or without a similarity row
        gets an empty list rather than a ``KeyError``.
        '''
        K = self.n_sim_user
        N = self.n_rec_movie
        rank = dict()
        watched_movies = self.trainset.get(user, {})
        neighbours = self.user_sim_mat.get(user, {})

        for similar_user, similarity_factor in sorted(neighbours.items(),
                                                      key=itemgetter(1), reverse=True)[0:K]:
            for movie in self.trainset[similar_user]:
                if movie in watched_movies:
                    continue
                # predict the user's "interest" for each movie
                rank[movie] = rank.get(movie, 0) + similarity_factor

        # return the N best movies
        return sorted(rank.items(), key=itemgetter(1), reverse=True)[0:N]

    def evaluate(self):
        ''' Print and return precision, recall, coverage and popularity.

        Recommendations are generated for every user in the training set and
        compared against that user's test-set movies. Any metric whose
        denominator is zero (empty training set, no held-out ratings, or
        ``calc_user_sim`` not run yet) is reported as 0.0 instead of raising
        ``ZeroDivisionError``.

        Returns:
            tuple: ``(precision, recall, coverage, popularity)`` as floats.
        '''
        print('Evaluation start...', file=sys.stderr)

        N = self.n_rec_movie
        # variables for precision and recall
        hit = 0  # recommended movies that also appear in the user's test set
        rec_count = 0  # recommendation slots: N per user, even if fewer are returned
        test_count = 0  # total number of test-set movies over all evaluated users
        # variables for coverage
        all_rec_movies = set()
        # variables for popularity
        popular_sum = 0

        for i, user in enumerate(self.trainset):
            if i % 500 == 0:
                print('recommended for %d users' % i, file=sys.stderr)
            test_movies = self.testset.get(user, {})
            rec_movies = self.recommend(user)
            # compare the recommendations with the held-out test movies
            for movie, _ in rec_movies:
                if movie in test_movies:
                    hit += 1
                all_rec_movies.add(movie)
                popular_sum += math.log(1 + self.movie_popular[movie])
            rec_count += N
            test_count += len(test_movies)

        precision = hit / (1.0 * rec_count) if rec_count else 0.0
        recall = hit / (1.0 * test_count) if test_count else 0.0
        coverage = (len(all_rec_movies) / (1.0 * self.movie_count)
                    if self.movie_count else 0.0)
        popularity = popular_sum / (1.0 * rec_count) if rec_count else 0.0

        print('precision=%.4f\trecall=%.4f\tcoverage=%.4f\tpopularity=%.4f' %
              (precision, recall, coverage, popularity), file=sys.stderr)
        return precision, recall, coverage, popularity
        

if __name__ == '__main__':
    
    # Create the user-based CF recommender; `usercf` stays in the notebook
    # namespace and is reused by later cells.
    usercf = UserBasedCF()
    
    # Split the ratings roughly 7:3 (generate_dataset's default pivot=0.7) into
    # a training set and a test set, stored in usercf.trainset / usercf.testset.
    ratingfile = os.path.join('ml-1m', 'ratings.dat')
    usercf.generate_dataset(ratingfile)
    
    # Compute the similarity between users from the training set.
    usercf.calc_user_sim()
    
    # Evaluate the recommendations (precision, recall, coverage, popularity).
    usercf.evaluate()

Similar user number = 20
recommended movie number = 10
loading ml-1m\ratings.dat(0)
loading ml-1m\ratings.dat(100000)
loading ml-1m\ratings.dat(200000)
loading ml-1m\ratings.dat(300000)
loading ml-1m\ratings.dat(400000)
loading ml-1m\ratings.dat(500000)
loading ml-1m\ratings.dat(600000)
loading ml-1m\ratings.dat(700000)
loading ml-1m\ratings.dat(800000)
loading ml-1m\ratings.dat(900000)
loading ml-1m\ratings.dat(1000000)
load ml-1m\ratings.dat succ
split training set and test set succ
train set = 700450
test set = 299759
building movie-users inverse table...
build movie-users inverse table succ
total movie number = 3666
building user co-rated movies matrix...
build user co-rated movies matrix succ
calculating user similarity matrix...
calculating user similarity factor(2000000)
calculating user similarity factor(4000000)
calculating user similarity factor(6000000)
calculating user similarity factor(8000000)
calculating user similarity factor(10000000)
calculating user similarity factor

In [3]:
# Inspect the recommendations for one user next to that user's held-out test
# ratings. Requires `usercf` from the cell that builds and trains the model.
# The printed label "推荐结果" means "recommendation results".
user = "2"
print("推荐结果", usercf.recommend(user))
print("---", usercf.testset.get(user, {}))

推荐结果 [('1610', 3.193723637064897), ('648', 3.1459797538042378), ('1580', 2.933141002883578), ('474', 2.929314189509274), ('349', 2.876340119372603), ('260', 2.862660797889707), ('1608', 2.6470684567336105), ('316', 2.6384692015144235), ('377', 2.632365370039093), ('733', 2.617808695399308)]
--- {'1537': 4, '648': 4, '2628': 3, '2916': 3, '3108': 3, '3035': 4, '1253': 3, '1610': 5, '292': 3, '2236': 5, '368': 4, '3147': 5, '1544': 4, '1293': 5, '3255': 4, '2278': 3, '2490': 3, '1834': 4, '2852': 3, '982': 4, '1225': 5, '515': 5, '3699': 2, '2353': 4, '590': 5, '1198': 4, '593': 5, '1955': 4, '1957': 5, '163': 4, '21': 1, '1090': 2, '349': 4, '2728': 3, '2943': 4, '3678': 3, '1246': 5}
