# 开始，数据准备

In [None]:
# 基于项目的协同过滤推荐算法实现
import os
import random
import math
from operator import itemgetter
import shelve
from contextlib import closing
import pandas as pd
import numpy as np

In [None]:
# Read a text file lazily, skipping its header row.
def load_file(filename):
    """Yield the data lines of ``filename`` one at a time.

    The file is opened as UTF-8 text. Its first line (the column-title
    row) is discarded; every following line is yielded with carriage-return
    and newline characters stripped from both ends. Once the generator is
    exhausted a confirmation message is printed.
    """
    with open(filename, 'r', encoding='utf-8') as handle:
        # Consume the title row; the default keeps an empty file from raising.
        next(handle, None)
        for raw_line in handle:
            yield raw_line.strip('\r\n')
    print('Load %s success!' % filename)


In [None]:
def get_dataset(filename, pivot=0.75):
    """Randomly split the ratings in ``filename`` into train and test sets.

    Every line produced by ``load_file`` must consist of exactly four
    comma-separated fields (user, movie, rating, timestamp). For each line
    one ``random.random()`` draw is made: values below ``pivot`` send the
    rating to the training set, all others to the test set.

    Returns:
        ``(trainSet, testSet)`` -- two dicts of the form
        ``{user: {movie: rating}}``; ratings are kept as the strings read
        from the file.
    """
    trainSet, testSet = {}, {}
    trainSet_len = testSet_len = 0
    for record in load_file(filename):
        # The timestamp column is parsed but not used.
        user, movie, rating, timestamp = record.split(',')
        goes_to_train = random.random() < pivot
        bucket = trainSet if goes_to_train else testSet
        bucket.setdefault(user, {})[movie] = rating
        if goes_to_train:
            trainSet_len += 1
        else:
            testSet_len += 1
    print('Split trainingSet and testSet success!')
    print('TrainSet = %s' % trainSet_len)
    print('TestSet = %s' % testSet_len)
    return trainSet, testSet

In [None]:
# Load the ratings file and split it into training and test sets.
path = 'ratings.csv'
# The result of os.path.exists() used to be discarded, so the check did
# nothing; act on it so a missing file is reported right here.
if not os.path.exists(path):
    raise FileNotFoundError('ratings file not found: %s' % path)
trainSet, testSet = get_dataset(path)

## 数据显示 trainSet

In [None]:
# Display the nested trainSet dict ({user: {movie: rating}}) as a table.
pd.DataFrame(trainSet)

## TestSet 测试数据

In [None]:
# Display the nested testSet dict ({user: {movie: rating}}) as a table.
pd.DataFrame(testSet)

In [None]:
# Count how often each movie was watched and build the movie-movie matrix.
def count_movie(trainSet):
    """Compute movie popularity and the movie co-occurrence matrix.

    Args:
        trainSet: dict of the form ``{user: {movie: rating}}``; only the
            keys are used here, the ratings are ignored.

    Returns:
        ``(movie_popular, movie_sim_matrix)`` where ``movie_popular[m]`` is
        the number of users that have ``m`` in their history, and
        ``movie_sim_matrix[m1][m2]`` is the number of users that have both
        ``m1`` and ``m2`` (``m1 != m2``) in their history.
    """
    # Popularity: one increment per (user, movie) pair.
    movie_popular = {}
    for watched in trainSet.values():
        for movie in watched:
            movie_popular[movie] = movie_popular.get(movie, 0) + 1
    movie_count = len(movie_popular)
    print("Total movie number = %d" % movie_count)

    # Co-occurrence: every ordered pair of distinct movies of one user
    # contributes 1 to the corresponding cell.
    movie_sim_matrix = {}
    for watched in trainSet.values():
        for m1 in watched:
            for m2 in watched:
                if m1 != m2:
                    row = movie_sim_matrix.setdefault(m1, {})
                    row[m2] = row.get(m2, 0) + 1
    print("Build 同现矩阵co-rated users matrix success!")
    return movie_popular,movie_sim_matrix

In [None]:
# Movie popularity counts and the movie-movie co-occurrence matrix.
movie_popular, movie_sim_matrix = count_movie(trainSet)

In [None]:
# Sanity check: print the number of entries in each structure.
print(len(movie_popular), len(movie_sim_matrix))

## 相似度算法
### 相似度算法一

In [None]:
# Movie-to-movie similarity: |A ∩ B| / sqrt(|A| * |B|)
def calc_movie_sim(movie_popular, movie_sim_matrix):
    """Return cosine-style similarities computed from co-occurrence counts.

    Args:
        movie_popular: ``{movie: number of users who watched it}``.
        movie_sim_matrix: ``{m1: {m2: co-occurrence count}}``.

    Returns:
        A new dict ``{m1: {m2: similarity}}`` with
        ``similarity = count / sqrt(popular[m1] * popular[m2])``, or ``0``
        when either movie has a popularity of 0.

    The input matrix is left untouched. Overwriting it in place would
    destroy the raw counts that other similarity measures need, and would
    make every "similarity matrix" in the notebook the same object.
    """
    print("Calculating movie similarity matrix ...")
    similarity = {}
    for m1, related_movies in movie_sim_matrix.items():
        row = similarity[m1] = {}
        for m2, count in related_movies.items():
            # Zero-vector guard: a movie nobody watched has no similarity.
            if movie_popular[m1] == 0 or movie_popular[m2] == 0:
                row[m2] = 0
            else:
                row[m2] = count / math.sqrt(movie_popular[m1] * movie_popular[m2])
    print('Calculate movie similarity matrix success!')
    return similarity

## 相似性矩阵

In [None]:
# calc_movie_sim takes (movie_popular, movie_sim_matrix); passing trainSet
# as an extra first argument raised a TypeError.
movie_sim_matrix_1 = calc_movie_sim(movie_popular, movie_sim_matrix)

In [None]:
# Display the first similarity matrix as a table.
pd.DataFrame(movie_sim_matrix_1)

### 显示结果

In [None]:
# Display the per-movie popularity counts.
pd.Series(movie_popular)

## movie_movie 矩阵

* movie_sim_matrix

In [None]:
# Display the movie-movie matrix as a table.
pd.DataFrame(movie_sim_matrix)

## 相似度算法二
## Jaccard 相似度算法：不使用评分，只考虑是否看过（0 或 1）
电影 A、B 之间的相似度 = 同时看过 A 和 B 的人数 / 看过 A 或 B 的人数

In [None]:
# Movie-to-movie similarity, Jaccard index: |A ∩ B| / |A ∪ B|
def calc_movie_sim_jacard(movie_popular, movie_sim_matrix):
    """Return Jaccard similarities computed from co-occurrence counts.

    Args:
        movie_popular: ``{movie: number of users who watched it}``.
        movie_sim_matrix: ``{m1: {m2: co-occurrence count}}``.

    Returns:
        A new dict ``{m1: {m2: similarity}}`` with
        ``similarity = count / (popular[m1] + popular[m2] - count)``, or
        ``0`` when either movie has a popularity of 0.

    The input matrix is left untouched. Overwriting it in place would
    replace the raw counts with similarity values, so a second similarity
    pass over the same matrix would silently work on the wrong numbers.
    """
    print("Calculating movie similarity matrix ...")
    similarity = {}
    for m1, related_movies in movie_sim_matrix.items():
        row = similarity[m1] = {}
        for m2, count in related_movies.items():
            # Zero-vector guard: a movie nobody watched has no similarity.
            if movie_popular[m1] == 0 or movie_popular[m2] == 0:
                row[m2] = 0
            else:
                # Jaccard = intersection / union.
                row[m2] = count / (movie_popular[m1] + movie_popular[m2] - count)
    print('Calculate movie similarity matrix success!')
    return similarity

In [None]:
# Compute the Jaccard similarity matrix and display it as a table.
movie_sim_matrix_2 = calc_movie_sim_jacard(movie_popular, movie_sim_matrix)
pd.DataFrame(movie_sim_matrix_2)

## 统计用户

In [None]:
# Return the list of users.
def get_user_List(trainSet):
    """Return the keys of ``trainSet`` (the user ids) as a list, in order."""
    return [user for user, _ in trainSet.items()]
def get_movie_name_list(filename):
    """Return the set of distinct movie ids found in ``filename``.

    Every line produced by ``load_file`` must consist of exactly four
    comma-separated fields (user, movie, rating, timestamp); only the
    second field is collected.
    """
    nameSet = set()
    for line in load_file(filename):
        _user, movie, _rating, _timestamp = line.split(',')
        nameSet.add(movie)
    # The previous message ("Split trainingSet and testSet success!") was
    # copy-pasted from get_dataset and did not describe this function.
    print('Load movie id set success!')
    return nameSet

In [None]:
# Users come from the training split; movie ids are re-read from the ratings file.
name_list = get_user_List(trainSet)
movie_list = get_movie_name_list(path)
# movie_list
print("len of movie", len(movie_list))
print("len of user",len(name_list))

## 推荐电影

In [None]:
# For a target user, look at the K movies most similar to each movie the
# user has watched and recommend the N best ones the user has not watched.
def recommend(user, n_sim_movie, n_rec_movie, trainSet, movie_sim_matrix):
    """Recommend up to ``n_rec_movie`` unseen movies for ``user``.

    Args:
        user: key into ``trainSet``.
        n_sim_movie: K, how many most-similar movies to consider for each
            watched movie.
        n_rec_movie: N, maximum number of recommendations returned.
        trainSet: ``{user: {movie: rating}}``; each rating must be
            convertible with ``float()``.
        movie_sim_matrix: ``{movie: {other_movie: similarity}}``.

    Returns:
        A list of ``(movie, score)`` tuples sorted by descending score,
        where score is the sum of ``similarity * rating`` over the watched
        movies that have the candidate among their K nearest neighbours.
        An empty list is returned when ``user`` has no history in
        ``trainSet``.
    """
    watched_movies = trainSet.get(user)
    if watched_movies is None:
        # Previously this path printed a message and then crashed on the
        # unbound ``watched_movies``; %-formatting also copes with
        # non-string user ids.
        print('%s does not exist' % user)
        return []

    rank = {}
    for movie, rating in watched_movies.items():
        weight = float(rating)
        # A movie that never co-occurred with another one has no row in the
        # similarity matrix; treat it as having no neighbours.
        neighbours = sorted(movie_sim_matrix.get(movie, {}).items(),
                            key=itemgetter(1), reverse=True)[:n_sim_movie]
        for related_movie, w in neighbours:
            # Only recommend movies the user has not interacted with yet.
            if related_movie in watched_movies:
                continue
            rank[related_movie] = rank.get(related_movie, 0) + w * weight
    return sorted(rank.items(), key=itemgetter(1), reverse=True)[:n_rec_movie]


In [None]:
# Recommendations for every user in name_list using movie_sim_matrix_2
# (K=20 similar movies, N=10 recommendations), then shown as a table.
rec_matrix = {}
for i in name_list:
    rec_user = recommend(i,20,10, trainSet, movie_sim_matrix_2)
    rec_matrix.setdefault(i, rec_user)
pd.DataFrame(rec_matrix)

In [None]:
# Recommendations for every user in name_list using movie_sim_matrix_1
# (K=20 similar movies, N=10 recommendations), then shown as a table.
rec_matrix_1 = {}
for i in name_list:
    rec_user = recommend(i,20,10, trainSet, movie_sim_matrix_1)
    rec_matrix_1.setdefault(i, rec_user)
pd.DataFrame(rec_matrix_1)

In [None]:
# print(rec_matrix['1'])
# print(rec_matrix_2['1'])
# print(testSet['1'])
# print(trainSet['1'])
# Side-by-side view of the recommendations for user '1':
# column '1' comes from rec_matrix, column '2' from rec_matrix_1.
table = {}
table.setdefault('1',rec_matrix['1'])
table.setdefault('2',rec_matrix_1['1'])
pd.DataFrame(table)

In [None]:
# Print the recommendation list stored for user '1'.
print(rec_matrix['1'])

In [None]:
# Pairwise similarity matrix driven by a pluggable similarity function.
def calc_movie_sim_test(trainSet, func):
    """Build a square similarity matrix over the keys of ``trainSet``.

    For every ordered pair of keys ``(k1, k2)`` the entry is ``1`` when the
    two keys are equal and ``func(trainSet.get(k1), trainSet.get(k2))``
    otherwise.

    Returns:
        ``{k1: {k2: similarity}}`` covering every pair of keys.
    """
    return {
        first: {
            second: 1 if first == second
            else func(trainSet.get(first), trainSet.get(second))
            for second in trainSet.keys()
        }
        for first in trainSet.keys()
    }


In [None]:
# 用户之间的相似度
# Placeholder similarity: a random integer in [1, 4) for every pair.
func = lambda x,y:np.random.randint(1,4)
movie_sim_matrix_3 = calc_movie_sim_test(trainSet, func)
# Show the matrix that was just stored. Calling calc_movie_sim_test a second
# time recomputed every pair and, because func is random, displayed values
# that differed from movie_sim_matrix_3.
pd.DataFrame(movie_sim_matrix_3)

In [None]:
import numpy as np
import pandas as pd

# 1.1 Euclidean distance
def euclidean(x,y):
    '''Euclidean distance: square root of the summed squared differences.'''
    squared_diff = np.power(x - y, 2)
    return np.sqrt(np.sum(squared_diff))

# 1.2 Manhattan distance
def manhattan(x,y):
    '''Manhattan distance: sum of the absolute differences.'''
    return np.sum(np.abs(x - y))

# 1.3 Chebyshev distance
def chebyshev(x,y):
    '''Chebyshev distance: largest absolute difference between x and y.'''
    xy = np.abs(x - y)
    # The original returned np.max(tmp); ``tmp`` was never defined here.
    return np.max(xy)

# 1.4 Minkowski distance
def minkowski(x,y,p):
    '''Minkowski distance of order p: (sum |x - y| ** p) ** (1 / p).'''
    powered = np.power(np.abs(x - y), p)
    return np.power(np.sum(powered), 1/p)

# 1.5 Standardized Euclidean distance
def stand_euclidean(x,y,s):
    '''Euclidean distance after dividing each difference by ``s``.'''
    scaled = (x - y) / s
    return np.sqrt(np.sum(np.power(scaled, 2)))
# 2.1 Cosine similarity
def cosine(x,y):
    '''Cosine similarity of two labelled vectors.

    ``x`` and ``y`` may be anything ``pd.Series`` accepts (list, dict,
    Series); values are converted to float. The vectors are aligned on
    their index and a label missing on one side counts as 0.

    Returns the dot product divided by the product of the two L2 norms, or
    0.0 when either vector has zero norm.
    '''
    x = pd.Series(x).astype(float)
    y = pd.Series(y).astype(float)
    # The original multiplied by an undefined ``b`` and never summed the
    # element-wise products, so it could not return a scalar.
    dot = x.mul(y, fill_value=0).sum()
    norm_product = np.linalg.norm(x.to_numpy(), 2) * np.linalg.norm(y.to_numpy(), 2)
    if norm_product == 0:
        # Zero vector: the similarity is undefined, report no similarity.
        return 0.0
    return dot / norm_product

def tanimoto(x,y):
    '''Tanimoto coefficient of two labelled vectors.

    ``x`` and ``y`` may be anything ``pd.Series`` accepts (list, dict,
    Series); values are converted to float. The vectors are aligned on
    their index and a label missing on one side counts as 0.

    Returns ``x.y / (|x|^2 + |y|^2 - x.y)``, or 0.0 when the denominator
    is zero (both vectors are zero).
    '''
    x = pd.Series(x).astype(float)
    y = pd.Series(y).astype(float)
    # The original multiplied by an undefined ``b``, did not sum the
    # products, and used the norms instead of the squared norms.
    dot = x.mul(y, fill_value=0).sum()
    x_sq = (x * x).sum()
    y_sq = (y * y).sum()
    denominator = x_sq + y_sq - dot
    if denominator == 0:
        return 0.0
    return dot / denominator
# print("夹角余弦",cosine([3,0,2],[0,5,3]))
# print("Tanimoto系数",cosine([3,0,2],[0,5,3]))

In [None]:
# Cosine similarity
def cosine(x,y):
    """Cosine similarity of two labelled vectors.

    ``x`` and ``y`` may be anything ``pd.Series`` accepts (list, dict,
    Series); values are converted to float, so numeric strings work. The
    vectors are aligned on their index and a label missing on one side
    counts as 0.

    Returns the dot product divided by the product of the two L2 norms, or
    0.0 when either vector has zero norm (instead of NaN from 0/0).
    """
    x = pd.Series(x).astype(float)
    y = pd.Series(y).astype(float)
    dot = x.mul(y, fill_value=0).sum()
    norm_product = np.linalg.norm(x.to_numpy(), 2) * np.linalg.norm(y.to_numpy(), 2)
    if norm_product == 0:
        # Zero vector: the similarity is undefined, report no similarity.
        return 0.0
    return dot / norm_product
# z = a.astype(float).mul(b.astype(float),fill_value=0)
# # z.sum()
# cosine(a,b)
# x.values.astype(float)

# cosine(trainSet.get('1'),trainSet.get('2'))

In [None]:
# Rebuild the pairwise matrix with cosine as the similarity function and display it.
func = cosine
movie_sim_matrix_3 = calc_movie_sim_test(trainSet, func)
pd.DataFrame(movie_sim_matrix_3)

In [None]:
# Recommendations for every user in name_list using movie_sim_matrix_2.
# No recommend_2 is defined in this notebook, so the call raised a
# NameError; use recommend(), which also needs the training set.
rec_matrix_2 = {}
for i in name_list:
    rec_user = recommend(i,20,10, trainSet, movie_sim_matrix_2)
    rec_matrix_2.setdefault(i, rec_user)
pd.DataFrame(rec_matrix_2)

In [None]:
# Display the per-user recommendation lists held in rec_matrix.
pd.DataFrame(rec_matrix)

In [None]:
# Display the test-set entries stored for user '1'.
pd.Series(testSet['1'])

In [None]:
# Generate recommendations and score them by precision, recall and coverage.
def evaluate(self):
    """Print precision, recall and coverage of the recommendations.

    ``self`` must provide ``n_rec_movie``, ``trainSet``, ``testSet``,
    ``movie_count`` and a ``recommend(user)`` method that returns
    ``(movie, weight)`` pairs.

    * precision = hits / (number of users * n_rec_movie)
    * recall    = hits / total number of test-set movies
    * coverage  = distinct recommended movies / movie_count

    A metric whose denominator is 0 is reported as 0.0 instead of raising
    ``ZeroDivisionError``.
    """
    print('Evaluating start ...')
    N = self.n_rec_movie
    # Counters for precision and recall.
    hit = 0
    rec_count = 0
    test_count = 0
    # Distinct recommended movies, for coverage.
    all_rec_movies = set()

    for user in self.trainSet:
        test_movies = self.testSet.get(user, {})
        rec_movies = self.recommend(user)
        for movie, _ in rec_movies:
            if movie in test_movies:
                hit += 1
            all_rec_movies.add(movie)
        # N is counted for every user, whatever the length of rec_movies.
        rec_count += N
        test_count += len(test_movies)

    precision = hit / (1.0 * rec_count) if rec_count else 0.0
    recall = hit / (1.0 * test_count) if test_count else 0.0
    coverage = len(all_rec_movies) / (1.0 * self.movie_count) if self.movie_count else 0.0
    print('precisioin=%.4f\trecall=%.4f\tcoverage=%.4f' % (
        precision, recall, coverage))


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# scikit_learn?
# !python -m pip install --upgrade pip
# !pip uninstall sklearn
from sklearn.metrics import f1_score

In [None]:
# Hand-written binary label vectors (21 entries each) used by the metric cells.
y_true = [0,0,0,0,0,1,0,0,0,0,0,1,1,1,1,0,0,0,0,0,1]
y_pred = [0,0,0,0,0,1,1,0,0,0,0,0,1,1,0,0,0,0,0,1,0]

# 计算f1_Score

In [None]:
# F1 score of y_pred against y_true.
f1_score(y_true=y_true, y_pred=y_pred)

In [None]:
f1_score?

In [None]:
# Two random 0/1 vectors of length 20; row 0 is passed first and row 1 second.
x = np.random.randint(0,2,(2,20))

f1_score(x[0],x[1])

In [None]:
from sklearn.metrics import accuracy_score,precision_score, recall_score
# Accuracy
accu = accuracy_score(y_true, y_pred)

# Precision, recall and F1 on the same label vectors.
pre = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)

# Print all four metrics on one line.
print("accu = %f, precision = %f, recallrate = %f, f1_score = %f "%(accu, pre, recall, f1))