# movielens电影推荐系统

In [157]:
from numpy import *
import pandas as pd
import math
from sklearn.cross_validation import train_test_split

## 准备数据

In [158]:
movies_df = pd.read_csv(r'D:\dataSet\movielens\movies.csv')
ratings_df = pd.read_csv(r'D:\dataSet\movielens\ratings.csv')
tags_df = pd.read_csv(r'D:\dataSet\movielens\tags.csv')

In [159]:
movies_df.head(1)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [160]:
ratings_df.drop('timestamp', axis=1, inplace = True)   #现在不考虑时间

In [161]:
ratings_df.head(5)        #一位用户会对多部电影评分

Unnamed: 0,userId,movieId,rating
0,1,31,2.5
1,1,1029,3.0
2,1,1061,3.0
3,1,1129,2.0
4,1,1172,4.0


In [162]:
def splitData():
    train_index = [] ; test_index = []    #根据index随机划分ratings_df
    for index in ratings_df.index:
        if random.randint(10) == 5:
            test_index.append(index)
        else:
            train_index.append(index)
    
    train_ratings_df = ratings_df.iloc[train_index]
    test_ratings_df = ratings_df.iloc[test_index]
    
    return train_ratings_df, test_ratings_df

train_ratings_df, test_ratings_df = splitData()

## 构建模型

In [163]:
def userSimilarity():  #得到训练集中用户相似度矩阵
    train_item_user = {}   #value:list, 喜欢该item的用户列表,评分>=3.0代表喜欢
    for movieId in set(train_ratings_df['movieId']):
        train_item_user[movieId] = array(train_ratings_df.loc[(train_ratings_df['movieId'] == movieId) & \
                        (train_ratings_df['rating'] >= 3.0)]['userId'])
    #把评分>=3.0视为正反馈
    
    primal_userSim = {}
    user_like_dict = {}
    for users_list in train_item_user.values():  #遍历每一项物品:对该物品下的用户进行两两配对,得到用户之间的交集
        for preIndex in range(0, len(users_list)):  #前一个用户
            #得到分母,记录用户喜欢的电影数
            user_like_dict.setdefault(users_list[preIndex], 0)
            user_like_dict[users_list[preIndex]] += 1
            #遍历到最后1个时,由于当前列表最后一个用户已经与前面的用户两两建立了关系,所以只要记录最后一个用户看过该电影就OK
            if preIndex == (len(users_list)-1):
                break
                
            primal_userSim.setdefault(users_list[preIndex], {})
            #记录用户之间的交集数
            for nextIndex in range(preIndex + 1, len(users_list)):  #两两配对中的第二个用户
                primal_userSim[users_list[preIndex]].setdefault(users_list[nextIndex], 0)
                primal_userSim.setdefault(users_list[nextIndex], {})
                primal_userSim[users_list[nextIndex]].setdefault(users_list[preIndex], 0)  
                primal_userSim[users_list[preIndex]][users_list[nextIndex]] += 1 / math.log(1 + len(users_list))  #降低了热门物品对相似度的影响 
                primal_userSim[users_list[nextIndex]][users_list[preIndex]] += 1 / math.log(1 + len(users_list))

    for user in primal_userSim.keys():
        for item in primal_userSim[user].items():
            primal_userSim[user][item[0]] /= math.sqrt((user_like_dict[user]) * (user_like_dict[item[0]])) 
    
    return primal_userSim

In [164]:
user_sim_dict = userSimilarity()

In [165]:
def getClosestdUser(userId, closestN):
    closestUser_Score = sorted(user_sim_dict[userId].items(), key=lambda x:x[1], reverse=True)[:closestN]    
    closestUser = [item[0] for item in closestUser_Score]
    
    return closestUser

In [166]:
def closeUser_to_item_interest():  #返回相似用户对该电影的兴趣程度,目前是最简单的形式;我觉得可以考虑返回该相似用户对该电影的评分
    return 1

def getTotalItemUserDict():
    total_item_user_dict = {}
    for movieId in set(ratings_df['movieId']):
        total_item_user_dict[movieId] = array(ratings_df.loc[ratings_df['movieId'] == movieId]['userId']) 
    return total_item_user_dict

total_item_user_dict = getTotalItemUserDict()

def getUserItemDict(userId, closestUser_list, closestK):  #根据相似的K个用户判断用户对物品的评分
    userItemDict = {}
    for unSeenMovieId in set(ratings_df.loc[ratings_df['userId'] != userId]['movieId']):
        userItemDict.setdefault(unSeenMovieId, 0)  #如果相似用户都不喜欢该电影,则该用户对该电影的兴趣估计为0
        for closeUser in closestUser_list:
            if closeUser not in total_item_user_dict[unSeenMovieId]:   #如果该相似用户不喜欢该电影
                continue
            userItemDict[unSeenMovieId] += user_sim_dict[userId][closeUser] * closeUser_to_item_interest()
            
    return userItemDict

def recommend(userId, closestK, topN):  #参考最相似的closestK个用户来推荐topN个物品
    closestUser = getClosestdUser(userId, closestK)
    user_item_dict = getUserItemDict(userId, closestUser, closestK)
    recommend_result = sorted(user_item_dict.items(), key=lambda x:x[1], reverse=True)[:topN] 
    
    return recommend_result[:topN]

result = recommend(1, 10, 5)
result

[(914, 0.15086589432081035),
 (1371, 0.14546128940907177),
 (4085, 0.12133494565239011),
 (2278, 0.12056757144328015),
 (1339, 0.11885996796900761)]

In [167]:
def PrecisionRecall(num_closest=10, topN=5):
    fenzi = 0 ; fenmu_precision = 0 ; fenmu_recall = 0
    for test_user in list(set(test_ratings_df['userId']))[:100]:
        recommend_result = recommend(test_user, num_closest, topN)
        recommend_moviesId = [item[0] for item in recommend_result]
        test_user_liked_movies = test_ratings_df.loc[(test_ratings_df['userId'] == test_user) \
                                        & (test_ratings_df['rating'] >= 3.5)]['movieId']  
        intersection = set(recommend_moviesId).intersection(set(test_user_liked_movies)) 
        #print(len(intersection))
        fenzi += len(intersection)
        fenmu_precision += len(recommend_result)
        fenmu_recall += len(test_user_liked_movies)
        
    precision = fenzi / fenmu_precision
    recall = fenzi / fenmu_recall
    print('precision: %f, recall: %f' %(precision, recall))
    
    return precision, recall

In [168]:
precision, recall = PrecisionRecall(10, 15)
#0.032667, recall: 0.051633
#0.045, 0.067

precision: 0.036667, recall: 0.055388
