In [10]:
import random
import pandas as pd

def LoadMovieLensData(filepath, train_rate, seed=3):
    """
    Read a ratings file and randomly split its (user, item) pairs into a
    training table and a test table.

    :param filepath: path to a "::"-separated file without a header row; its
                     columns are read as UserID, MovieID, Rating, TimeStamp
    :param train_rate: probability with which each rating goes to the training set
    :param seed: value passed to random.seed() before splitting (default 3),
                 so the same file always yields the same split
    :return: (train, test), each built by PreProcessData from the pairs
             assigned to that side
    """
    ratings = pd.read_table(filepath, sep="::", header=None,
                            names=["UserID", "MovieID", "Rating", "TimeStamp"],
                            engine='python')
    train = []
    test = []
    random.seed(seed)
    # Walk the two needed columns directly rather than DataFrame.iterrows(),
    # which builds a Series per row. Rows are visited in file order and exactly
    # one random draw is made per row, so the split for a given seed is stable.
    for user, item in zip(ratings['UserID'], ratings['MovieID']):
        bucket = train if random.random() < train_rate else test
        bucket.append([int(user), int(item)])
    return PreProcessData(train), PreProcessData(test)

def PreProcessData(originData):
    """
    Group (user, item) pairs into a user -> item-set table shaped like:
        {user1: {item_a, item_b, item_c, ...},
         user2: {item_d, item_e, ...},
         ...
        }

    :param originData: iterable of two-element (user, item) pairs
    :return: plain dict mapping each user to the set of its items
    """
    table = {}
    for pair in originData:
        user, item = pair
        if user not in table:
            table[user] = set()
        table[user].add(item)
    return table



In [11]:
def ItemMatrix(trainData, similarity, return_counts=False):
    """
    Build the item co-occurrence matrix from a user -> item-set table.

    For every user, each ordered pair (i, j) of distinct items in that user's
    set adds a weight to matrix[i][j]:
      * "cosine": 1 per co-occurrence
      * "iuf":    1 / log(1 + number of items of that user)
      * any other value: pairs are recorded with weight 0

    :param trainData: dict mapping user -> set of items
    :param similarity: "cosine" or "iuf"
    :param return_counts: when True, also return the per-item user counts
                          (the N argument that ItemSimilarityMatrix expects)
    :return: dict {item: {other_item: weight}}; every item seen gets a row,
             possibly empty. With return_counts=True, a tuple
             (matrix, counts) where counts maps item -> number of users.
    """
    import math  # imported here because math is only imported in a later cell

    counts = dict()  # item -> number of users whose set contains it
    matrix = dict()  # item -> {co-occurring item -> accumulated weight}
    for user, items in trainData.items():
        # The weight depends only on the user, so compute it once per user.
        if similarity == "cosine":
            weight = 1
        elif similarity == "iuf" and len(items) > 1:
            weight = 1. / math.log1p(len(items) * 1.)
        else:
            # Unknown measure, or fewer than two items (no pairs to weight).
            weight = 0
        for i in items:
            row = matrix.setdefault(i, dict())
            counts[i] = counts.get(i, 0) + 1
            for j in items:
                if i != j:
                    row[j] = row.get(j, 0) + weight
    if return_counts:
        return matrix, counts
    return matrix

In [12]:
def ItemSimilarityMatrix(ItemMatrix, N, isNorm):
    """
    Turn an item co-occurrence matrix into an item similarity matrix.

    Each entry is computed as cij / sqrt(N[i] * N[j]).

    :param ItemMatrix: dict {item: {other_item: co-occurrence weight}}
    :param N: mapping item -> count used in the denominator
    :param isNorm: when truthy, divide every row by that row's largest value
    :return: new dict {item: {other_item: similarity}} with one row per row
             of ItemMatrix (empty rows stay empty)
    """
    import math  # imported here because math is only imported in a later cell

    itemSimMatrix = dict()
    for i, related_items in ItemMatrix.items():
        # Build each row as a whole: assigning itemSimMatrix[i][j] directly
        # would raise KeyError because the row does not exist yet.
        itemSimMatrix[i] = {
            j: cij / math.sqrt(N[i] * N[j]) for j, cij in related_items.items()
        }
    # Optionally scale each row by its largest value.
    if isNorm:
        for i, relations in itemSimMatrix.items():
            if not relations:
                continue  # max() of an empty row would raise ValueError
            max_num = max(relations.values())
            if max_num == 0:
                continue  # all-zero row: nothing to scale, avoid dividing by zero
            # Replacing the value of an existing key is safe while iterating.
            itemSimMatrix[i] = {k: v / max_num for k, v in relations.items()}
    return itemSimMatrix



In [13]:
def ItemSimilarityMatrix(ItemMatrix, N, isNorm):
    """
    Turn an item co-occurrence matrix into an item similarity matrix.

    Each entry is computed as cij / sqrt(N[i] * N[j]).

    :param ItemMatrix: dict {item: {other_item: co-occurrence weight}}
    :param N: mapping item -> count used in the denominator
    :param isNorm: when truthy, divide every row by that row's largest value
    :return: new dict {item: {other_item: similarity}} with one row per row
             of ItemMatrix (empty rows stay empty)
    """
    import math  # imported here because math is only imported in a later cell

    itemSimMatrix = dict()
    for i, related_items in ItemMatrix.items():
        # Build each row as a whole: assigning itemSimMatrix[i][j] directly
        # would raise KeyError because the row does not exist yet.
        itemSimMatrix[i] = {
            j: cij / math.sqrt(N[i] * N[j]) for j, cij in related_items.items()
        }
    # Optionally scale each row by its largest value.
    if isNorm:
        for i, relations in itemSimMatrix.items():
            if not relations:
                continue  # max() of an empty row would raise ValueError
            max_num = max(relations.values())
            if max_num == 0:
                continue  # all-zero row: nothing to scale, avoid dividing by zero
            # Replacing the value of an existing key is safe while iterating.
            itemSimMatrix[i] = {k: v / max_num for k, v in relations.items()}
    return itemSimMatrix

In [14]:
def recommend(trainData, itemSimMatrix, user, N, K):
    """
    Score unseen items for one user from the items that user already has.

    :param trainData: user -> item-set table
    :param itemSimMatrix: item similarity matrix {item: {other_item: similarity}}
    :param user: the user to recommend for
    :param N: number of items to return
    :param K: number of most similar items taken for each item the user has
    :return: dict of at most N items -> accumulated similarity, in descending
             score order
    """
    liked = trainData[user]
    scores = {}
    for liked_item in liked:
        # Neighbours of this item, strongest similarity first.
        neighbours = sorted(
            itemSimMatrix[liked_item].items(),
            key=lambda pair: pair[1],
            reverse=True,
        )
        for candidate, weight in neighbours[:K]:
            # Items the user already has are never recommended back.
            if candidate not in liked:
                scores[candidate] = scores.get(candidate, 0.) + weight
    # Keep the N highest-scoring candidates.
    ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
    return dict(ranked[:N])

In [15]:
import math
import random
import pandas as pd
from collections import defaultdict
from operator import itemgetter

def LoadMovieLensData(filepath, train_rate, seed=3):
    """
    Read a ratings file and randomly split its (user, item) pairs into a
    training table and a test table.

    :param filepath: path to a "::"-separated file without a header row; its
                     columns are read as UserID, MovieID, Rating, TimeStamp
    :param train_rate: probability with which each rating goes to the training set
    :param seed: value passed to random.seed() before splitting (default 3),
                 so the same file always yields the same split
    :return: (train, test), each built by PreProcessData from the pairs
             assigned to that side
    """
    ratings = pd.read_table(filepath, sep="::", header=None,
                            names=["UserID", "MovieID", "Rating", "TimeStamp"],
                            engine='python')
    train = []
    test = []
    random.seed(seed)
    # Walk the two needed columns directly rather than DataFrame.iterrows(),
    # which builds a Series per row. Rows are visited in file order and exactly
    # one random draw is made per row, so the split for a given seed is stable.
    for user, item in zip(ratings['UserID'], ratings['MovieID']):
        bucket = train if random.random() < train_rate else test
        bucket.append([int(user), int(item)])
    return PreProcessData(train), PreProcessData(test)

def PreProcessData(originData):
    """
    Group (user, item) pairs into a user -> item-set table shaped like:
        {user1: {item_a, item_b, item_c, ...},
         user2: {item_d, item_e, ...},
         ...
        }

    :param originData: iterable of two-element (user, item) pairs
    :return: plain dict mapping each user to the set of its items
    """
    table = {}
    for pair in originData:
        user, item = pair
        if user not in table:
            table[user] = set()
        table[user].add(item)
    return table


class ItemCF(object):
    """
    Item-based collaborative filtering.

    train() builds an item-item similarity matrix from a user -> item-set
    table; recommend() then scores items a user does not have yet by summing
    their similarity to the items the user already has.
    """

    def __init__(self, trainData, similarity="cosine", norm=True):
        """
        :param trainData: dict mapping user -> set of items
        :param similarity: "cosine" (each co-occurrence counts 1) or "iuf"
                           (each co-occurrence counts 1 / log(1 + items of that user));
                           any other value leaves all co-occurrence weights at 0
        :param norm: when truthy, each row of the similarity matrix is divided
                     by its largest value
        """
        self._trainData = trainData
        self._similarity = similarity
        self._isNorm = norm
        self._itemSimMatrix = dict()  # item similarity matrix, filled by train()

    def similarity(self):
        """
        (Re)compute the item similarity matrix and store it on the instance.

        The matrix is built from scratch on every call, so calling train()
        more than once yields the same result as calling it once.
        """
        popularity = defaultdict(int)  # item -> number of users who have it
        cooccurrence = dict()          # item -> {other item -> accumulated weight}
        for user, items in self._trainData.items():
            # The weight depends only on the user, so compute it once per user.
            if self._similarity == "cosine":
                weight = 1
            elif self._similarity == "iuf" and len(items) > 1:
                weight = 1. / math.log1p(len(items) * 1.)
            else:
                # Unknown measure, or fewer than two items (no pairs to weight).
                weight = 0
            for i in items:
                row = cooccurrence.setdefault(i, dict())
                popularity[i] += 1
                for j in items:
                    if i != j:
                        row[j] = row.get(j, 0) + weight

        sim_matrix = {
            i: {j: cij / math.sqrt(popularity[i] * popularity[j])
                for j, cij in related_items.items()}
            for i, related_items in cooccurrence.items()
        }

        # Optionally scale each row by its largest value.
        if self._isNorm:
            for i, relations in sim_matrix.items():
                if not relations:
                    # Item never co-occurred with another one (e.g. the only
                    # item of its user); max() of an empty row would raise.
                    continue
                max_num = max(relations.values())
                if max_num == 0:
                    continue  # all-zero row: avoid dividing by zero
                # Replacing the value of an existing key is safe while iterating.
                sim_matrix[i] = {k: v / max_num for k, v in relations.items()}

        self._itemSimMatrix = sim_matrix

    def recommend(self, user, N, K):
        """
        :param user: the user to recommend for
        :param N: number of items to return
        :param K: number of most similar items taken for each item the user has
        :return: dict of at most N items -> accumulated similarity, in
                 descending score order
        :raises KeyError: if user is not in the training data, or if one of the
                          user's items has no row in the similarity matrix
                          (e.g. train() has not been called)
        """
        recommends = dict()
        # Items the user already has.
        items = self._trainData[user]
        for item in items:
            # Take the K items most similar to each item the user has.
            for i, sim in sorted(self._itemSimMatrix[item].items(), key=itemgetter(1), reverse=True)[:K]:
                if i in items:
                    continue  # never recommend something the user already has
                recommends.setdefault(i, 0.)
                recommends[i] += sim
        # Rank candidates by accumulated similarity and keep the top N.
        return dict(sorted(recommends.items(), key=itemgetter(1), reverse=True)[:N])

    def train(self):
        """Fit the model by computing the item similarity matrix."""
        self.similarity()

if __name__ == "__main__":
    train, test = LoadMovieLensData("ratings.dat", 0.8)
    # train/test are user -> item-set dicts, so len() counts users, not ratings.
    print("train data size: %d, test data size: %d" % (len(train), len(test)))
    # Keep the instance under its own name: binding it to `ItemCF` would replace
    # the class, and any later ItemCF(...) call would fail.
    model = ItemCF(train, similarity='iuf', norm=True)
    model.train()

    # Recommend 5 items (using the 80 nearest items) for each of these 4 users.
    for user_id in (1, 2, 3, 4):
        print(model.recommend(user_id, 5, 80))

train data size: 6040, test data size: 6030
{1196: 23.965756454677646, 1097: 23.182727104691942, 1198: 22.94448230324663, 1: 22.036373892558373, 1265: 20.785969662808093}
{1580: 49.09537130022913, 377: 42.20039910397584, 608: 41.58398854994261, 2916: 40.4354238235108, 296: 39.01577825731824}
{1036: 20.575069918297654, 592: 19.545923446539252, 1210: 18.631538385064193, 2174: 18.288138210241225, 1240: 17.97079899608672}
{589: 14.280943815751622, 2571: 14.18409548185235, 1200: 14.024903571665726, 858: 13.68667814399541, 1387: 13.252895283195155}
