**Examples of Collaborative Filtering based Recommendation Systems**...

reference website: https://blog.csdn.net/Jack_yun_feng/article/details/100176399

The idea of this model-based part is to use a linear-regression-style model to predict the rating that each user would give to each product.

Recommend the products with the highest predicted ratings to the user.

The algorithm is not complex, but it still has a problem.

The rating of a product can only tell us that the item is good; it cannot tell us whether the target user would actually buy it.

In [1]:
# !pip install ipynb



In [2]:
# from ipynb.fs.full.user_base import *

User ID A --> recommend item ID List:  ['B002OVV7F0', 'B002XQ1YL8', 'B002L6NZHG', 'B001SYYMF6', 'B001PTFWSA', 'B001CTUX5K', 'B000OWHKMU', 'B000LZLILE', 'B000F1OVRC', 'B0009OAI18']

User ID A --> products that user has bought and rating:  [('0205616461', 5.0), ('B002OVV7F0', 3.0), ('B0031IH5FQ', 5.0), ('B006GQPZ8E', 4.0)]
Spliting data...
Data Splited!!!


In [3]:
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Load the complete ratings table from the CSV file in the working directory.
dataset = pd.read_csv("ratings_Beauty.csv")
# Keep a 1000-row subset (the first 1000 rows) alongside the full table.
dataset_1000 = dataset.iloc[:1000]
# Last expression of the cell: the notebook renders the first five rows.
dataset.head(n=5)

Unnamed: 0,UserId,ProductId,Rating,Timestamp
0,A39HTATAQ9V7YF,205616461,5.0,1369699200
1,A3JM6GV9MNOF9X,558925278,3.0,1355443200
2,A1Z513UWSAAO0F,558925278,5.0,1404691200
3,A1WMRR494NWEWV,733001998,4.0,1382572800
4,A3IAAVS479H7M7,737104473,1.0,1274227200


Step 1: Make a class to encapsulate all the related functions (to prevent naming conflicts with functions from other classes).

__init__ : includes all the variable initialization and useful parameters.

        number_epochs : number of learning iterations.

        alpha : the learning rate

        reg : the regularization coefficient

        columns : the required column_names from the dataset

fit : further processing parameters

        1) Group dataset based on 'UserId' and 'ProductId', respectively.

        2) Calculate global mean of the 'Rating'.

        3) Initialize bu (user_bias) and bi (item_bias).

        4) Call sgd() to train the model

sgd : Stochastic gradient descent

        Optimizing the biases to increase the accuracy of the prediction.

predict : Predicting equation

        predict_rating = global_mean + user_bias(UserId) + item_bias(ProductId)


In [4]:
class BaselineCFBySGD(object):
    """Baseline collaborative-filtering predictor trained with SGD.

    The predicted rating of a (user, item) pair is::

        global_mean + bu[user] + bi[item]

    where ``bu`` and ``bi`` are per-user and per-item bias terms fitted by
    stochastic gradient descent with a ``reg * bias`` shrinkage term.
    Training runs immediately when the object is constructed.
    """

    def __init__(self, dataset, number_epochs, alpha, reg, columns=None):
        """Store the hyper-parameters and train the model.

        :param dataset: pandas DataFrame containing at least the columns
            named in ``columns``.
        :param number_epochs: number of full passes over ``dataset``.
        :param alpha: learning rate.
        :param reg: regularization coefficient.
        :param columns: names of the user, item and rating columns, in that
            order.  Defaults to ``["UserId", "ProductId", "Rating"]``.
        """
        self.dataset = dataset
        # Number of gradient-descent iterations (epochs).
        self.number_epochs = number_epochs
        # Learning rate.
        self.alpha = alpha
        # Regularization parameter.
        self.reg = reg
        # Names of the user / item / rating fields in the dataset.  A fresh
        # list is built per instance so that no mutable default is shared.
        if columns is None:
            columns = ["UserId", "ProductId", "Rating"]
        self.columns = list(columns)
        # Start training.
        self.fit()

    def fit(self):
        """Prepare the grouped rating tables and global mean, then train.

        Sets ``users_ratings``, ``items_ratings``, ``global_mean``, ``bu``
        and ``bi`` on the instance.

        :return: None
        """
        user_col, item_col, rating_col = (
            self.columns[0], self.columns[1], self.columns[2])
        # Per-user rating data: lists of item ids and ratings for each user.
        # Only the needed columns are aggregated.
        self.users_ratings = (
            self.dataset.groupby(user_col)[[item_col, rating_col]].agg([list]))
        # Per-item rating data: lists of user ids and ratings for each item.
        self.items_ratings = (
            self.dataset.groupby(item_col)[[user_col, rating_col]].agg([list]))
        # Global average rating.
        self.global_mean = self.dataset[rating_col].mean()
        # Train the model parameters with SGD.
        self.bu, self.bi = self.sgd()

    def sgd(self):
        """Optimize the user and item biases with stochastic gradient descent.

        :return: ``(bu, bi)`` -- dicts mapping user id / item id to its bias.
        """
        # Initialize every bias to 0.
        bu = dict(zip(self.users_ratings.index, np.zeros(len(self.users_ratings))))
        bi = dict(zip(self.items_ratings.index, np.zeros(len(self.items_ratings))))

        # Select the three fields by name so that training does not depend
        # on the number or order of columns in the dataset.
        rows = self.dataset[[self.columns[0], self.columns[1], self.columns[2]]]
        mean, alpha, reg = self.global_mean, self.alpha, self.reg

        for _ in range(self.number_epochs):
            for uid, iid, real_rating in rows.itertuples(index=False, name=None):
                error = real_rating - (mean + bu[uid] + bi[iid])

                # Both updates use the error computed before either bias
                # is modified.
                bu[uid] += alpha * (error - reg * bu[uid])
                bi[iid] += alpha * (error - reg * bi[iid])

        return bu, bi

    def predict(self, uid, iid):
        """Return the predicted rating of item ``iid`` for user ``uid``.

        :param uid: user id.
        :param iid: item id.
        :return: ``global_mean + bu[uid] + bi[iid]``, or ``0`` when the user
            or the item was not seen during training.
        """
        try:
            predict_rating = self.global_mean + self.bu[uid] + self.bi[iid]
        except (KeyError, TypeError):
            # Unknown (or unhashable) id: no bias is available for it.
            # NOTE: 0 is kept as the fallback for backward compatibility.
            predict_rating = 0
        return predict_rating


In [5]:
def data_split(dataset, x=0.8, random_state=None):
    '''
    Randomly split the dataset into a training set and a test set.

    Test rows whose user does not occur in the training set are removed
    from the test set.

    :param dataset: DataFrame with a "UserId" column and unique index labels
    :param x: fraction of rows put into the training set, e.g. x=0.8 leaves
              0.2 for the test set
    :param random_state: optional seed forwarded to ``DataFrame.sample`` to
                         make the split reproducible
    :return: (trainset, testset)
    '''
    print("Spliting data...")

    trainset = dataset.sample(frac=x, random_state=random_state)
    testset = dataset.drop(trainset.index)

    # Compare against the training user *values*: `value in series` would
    # test the Series index labels instead, so `isin` is used here.
    has_known_user = testset["UserId"].isin(trainset["UserId"])
    testset = testset[has_known_user]
    print("Data Splited!!!")

    return trainset, testset

In [6]:
# trainset = dataset[:100000]
# bcf = BaselineCFBySGD(trainset, 50, 0.034, 0.1, ["UserId", "ProductId", "Rating"])

In [7]:
# trainUserDic, trainItemDic = genUserDic(np.array(trainset))

In [15]:
def modelRecommendation(bcf, uid, items, k=10):
    """Rank ``items`` for user ``uid`` by predicted rating, best first.

    :param bcf: model object exposing ``predict(uid, iid)``.
    :param uid: id of the user to recommend for.
    :param items: iterable of candidate item ids.
    :param k: maximum number of recommendations to return.
    :return: ``(item_ids, scored)`` -- the top item ids and the matching
        ``(item_id, predicted_rating)`` pairs, both in descending order of
        predicted rating.
    """
    # Score every candidate, then order by score (ties keep input order).
    scored = sorted(
        ((candidate, bcf.predict(uid, candidate)) for candidate in items),
        key=lambda pair: pair[1],
        reverse=True,
    )
    item_ids = [pair[0] for pair in scored]

    # Truncate only when there are at least k candidates.
    if len(scored) >= k:
        item_ids, scored = item_ids[:k], scored[:k]
    return item_ids, scored

In [9]:
# uids = pd.unique(trainset["UserId"])
# uid = uids[0]
# iids = pd.unique(trainset["ProductId"])

# a,b = modelRecommendation(bcf, uid,iids)

6384


In [10]:
# iids = pd.unique(trainset["ProductId"])
# len(iids)

6384

In [29]:
# def recall(testUsers, testUserDic, itemDic, userDic, recomMethod = recommendation, k = 10, bcf = None,trainset=None):
#     """计算召回率"""
#     hit = 0
#     all_ = 0
#     count = 0
#     for userId in testUsers:
#         # count += 1
#         # print(count)
#         if recomMethod == recommendation:
#             recomList, neighbors = recomMethod(userId, userDic, itemDic)
#         elif recomMethod == modelRecommendation: recomList, recomListMark = recomMethod(bcf, userId, pd.unique(trainset["ProductId"]),k = k)
#         elif recomMethod == hybridRecommendation: recomList, recomListMark = recomMethod(userId, k = k)
#         # print("recomList", recomList)
#         # print("testUserDic[userId]", testUserDic[userId])
#         for tureItem, mark in testUserDic[userId]:
#             if tureItem in recomList:
#                 hit += 1
#         all_ += len(testUserDic[userId])
#     return hit / (all_ * 1.)


In [34]:
# listTestUsers = list(trainUserDic.keys())[:10]                    
# testUsers = set(listTestUsers)

In [35]:
# for x in range(1000):
#     a = recall(testUsers, trainUserDic, trainItemDic, trainUserDic,recomMethod=modelRecommendation, bcf=bcf,trainset=trainset, k=x)
#     print(a)

0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
