# movielens电影推荐系统

In [23]:
from numpy import *
import pandas as pd
from sklearn.cross_validation import train_test_split

## 准备数据

In [24]:
# Load the MovieLens movies, ratings and tags tables (in that order) from
# the local dataset folder.
movies_df, ratings_df, tags_df = [
    pd.read_csv(csv_path)
    for csv_path in (
        r'D:\dataSet\movielens\movies.csv',
        r'D:\dataSet\movielens\ratings.csv',
        r'D:\dataSet\movielens\tags.csv',
    )
]

In [25]:
# Display the first row of the movies table.
movies_df.head(1)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [26]:
ratings_df.drop('timestamp', axis=1, inplace = True)   # rating timestamps are not used for now

In [94]:
ratings_df.head(5)        # one user rates many movies

Unnamed: 0,userId,movieId,rating
0,1,31,2.5
1,1,1029,3.0
2,1,1061,3.0
3,1,1129,2.0
4,1,1172,4.0


In [28]:
def splitData(ratings=None, test_ratio=0.1, seed=None):
    """Randomly split a ratings table into a training part and a test part.

    Each row is assigned to the test part independently with probability
    ``test_ratio`` (the default 0.1 matches the original one-in-ten draw).

    Args:
        ratings: DataFrame to split. Defaults to the notebook-level
            ``ratings_df``.
        test_ratio: Probability, in [0, 1], that a row goes to the test part.
        seed: Optional seed for the random generator. Pass a fixed value to
            get the same split every time the notebook is re-run.

    Returns:
        A ``(train_ratings_df, test_ratings_df)`` tuple of DataFrames that
        keep the index labels of ``ratings``.

    Raises:
        ValueError: If ``test_ratio`` is outside [0, 1].
    """
    import numpy as np  # local import: do not rely on `from numpy import *`

    if not 0.0 <= test_ratio <= 1.0:
        raise ValueError('test_ratio must be between 0 and 1')
    if ratings is None:
        ratings = ratings_df

    rng = np.random.RandomState(seed)
    # A boolean mask selects rows by position, so the split is correct even
    # when the index is not 0..n-1 (feeding index labels to .iloc was not).
    in_test = rng.random_sample(len(ratings)) < test_ratio

    return ratings[~in_test], ratings[in_test]

# Split once at notebook level; getUser_Genres, getRecommendResult and
# PrecisionRecall read these two frames as globals.
train_ratings_df, test_ratings_df = splitData()

In [29]:
#tags_df.head(5)         #一个用户会对多部电影打标签;发现标签的种类太多太主观,不好用来对用户分类

## 构建模型

根据用户对电影的评分判断用户喜欢的电影类型,预测用户对所有电影的评分然后推荐topN
calScore 函数的设定很重要:不能简单地把评分直接相加作为用户对每个类别的感兴趣程度。例如,类别 A、B 当前分数相等,用户又看了一部只属于类别 A 的电影并给了很低的评分;直接相加后 A 的分数反而高于 B(A>B),但用户其实并不喜欢这部电影。

In [81]:
def calScore(rating):
    """Return the amount one rating adds to each genre of the rated movie.

    The raw rating is returned unchanged, so even a low rating adds a
    positive amount. The original notes describe giving ratings below 3.5
    a negative contribution; that is not implemented here.
    """
    genre_delta = rating
    return genre_delta

def getUser_Genres(ratings=None, movies=None, score_func=None):
    """Build each user's genre-interest profile from a ratings table.

    For every rating, ``score_func(rating)`` is added to each genre of the
    rated movie.

    Args:
        ratings: DataFrame with ``userId``, ``movieId`` and ``rating``
            columns. Defaults to the notebook-level ``train_ratings_df``.
        movies: DataFrame with ``movieId`` and ``genres`` columns, where
            genres are separated by ``|``. Defaults to the notebook-level
            ``movies_df``.
        score_func: Callable mapping a rating to a genre contribution.
            Defaults to ``calScore``.

    Returns:
        ``{userId: {genre: accumulated score}}``. Every user present in
        ``ratings`` gets an entry. Ratings for movies missing from
        ``movies`` are skipped instead of raising IndexError.
    """
    if ratings is None:
        ratings = train_ratings_df
    if movies is None:
        movies = movies_df
    if score_func is None:
        score_func = calScore

    # Index the movie table once instead of scanning it for every rating.
    # setdefault keeps the first row per movieId, like the original `[0]`.
    genres_by_movie = {}
    for movie_id, genres in zip(movies['movieId'], movies['genres']):
        genres_by_movie.setdefault(movie_id, genres.split('|'))

    user_genres = {}
    rating_rows = zip(ratings['userId'], ratings['movieId'], ratings['rating'])
    for user_id, movie_id, rating in rating_rows:
        genre_scores = user_genres.setdefault(user_id, {})
        for genre in genres_by_movie.get(movie_id, ()):
            genre_scores[genre] = genre_scores.get(genre, 0) + score_func(rating)

    return user_genres

In [82]:
# Build the per-user genre profile (read later by getRecommendResult) and
# display it.
user_genres_dict = getUser_Genres()
user_genres_dict

{1.0: {'Drama': 18.0,
  'Animation': 6.0,
  'Children': 5.0,
  'Musical': 3.0,
  'Thriller': 17.0,
  'Action': 12.0,
  'Adventure': 17.5,
  'Sci-Fi': 12.0,
  'War': 2.0,
  'Fantasy': 6.5,
  'Horror': 6.0,
  'Romance': 3.5,
  'Comedy': 10.0,
  'Crime': 5.0,
  'Western': 3.0},
 2.0: {'Action': 51.0,
  'Adventure': 45.0,
  'Thriller': 56.0,
  'Drama': 119.0,
  'Romance': 68.0,
  'Comedy': 91.0,
  'Mystery': 16.0,
  'Crime': 29.0,
  'War': 19.0,
  'IMAX': 8.0,
  'Sci-Fi': 11.0,
  'Fantasy': 32.0,
  'Western': 10.0,
  'Horror': 10.0,
  'Children': 28.0,
  'Animation': 22.0,
  'Musical': 15.0},
 3.0: {'Adventure': 29.5,
  'Children': 11.0,
  'Fantasy': 12.5,
  'Crime': 34.0,
  'Drama': 87.5,
  'Comedy': 62.5,
  'Thriller': 28.5,
  'Romance': 30.0,
  'War': 21.0,
  'Action': 48.0,
  'Animation': 11.5,
  'Musical': 5.0,
  'IMAX': 11.5,
  'Sci-Fi': 19.0,
  'Western': 4.0,
  'Horror': 6.5,
  'Mystery': 3.5,
  'Documentary': 7.5},
 4.0: {'Action': 228.0,
  'Adventure': 225.0,
  'Thriller': 161.0,

In [89]:
def calMovieScore(genres_weight):
    """Return what one matching genre adds to a candidate movie's score.

    ``genres_weight`` is the user's accumulated interest score for that
    genre; it is passed through unchanged.
    """
    contribution = genres_weight
    return contribution

def getRecommendResult(userId, topN, ratings=None, movies=None,
                       user_genres=None, score_func=None):
    """Rank movies the user has not rated and return the best ``topN``.

    A candidate is any movie rated by some other user in ``ratings`` that
    ``userId`` has not rated there. Its score is the sum of
    ``score_func(weight)`` over the movie's genres found in the user's
    genre profile, multiplied by the number of ratings the movie received
    from other users. That multiplier is what the original loop computed
    by adding the genre score once per rating row.

    Args:
        userId: User to recommend for.
        topN: Maximum number of recommendations to return.
        ratings: DataFrame with ``userId`` and ``movieId`` columns.
            Defaults to the notebook-level ``train_ratings_df``.
        movies: DataFrame with ``movieId`` and ``genres`` columns.
            Defaults to the notebook-level ``movies_df``.
        user_genres: ``{userId: {genre: weight}}`` profile. Defaults to the
            notebook-level ``user_genres_dict``.
        score_func: Callable mapping a genre weight to a score contribution.
            Defaults to ``calMovieScore``.

    Returns:
        A list of ``(movieId, score)`` tuples sorted by descending score,
        at most ``topN`` long. It is empty when the user has no genre
        profile. Movies missing from ``movies`` are never recommended.
    """
    if ratings is None:
        ratings = train_ratings_df
    if movies is None:
        movies = movies_df
    if user_genres is None:
        user_genres = user_genres_dict
    if score_func is None:
        score_func = calMovieScore

    genre_weights = user_genres.get(userId)
    if not genre_weights:
        # No training profile for this user, so there is nothing to rank by.
        return []

    # Build the mask from the same frame it indexes.
    is_own = ratings['userId'] == userId
    already_rated = set(ratings.loc[is_own, 'movieId'])

    # Index the movie table once instead of scanning it per rating row.
    genres_by_movie = {}
    for movie_id, genres in zip(movies['movieId'], movies['genres']):
        genres_by_movie.setdefault(movie_id, genres)

    # Count other users' ratings per unseen movie. Dict insertion order
    # keeps first-appearance order, so ties sort as they did before.
    rating_counts = {}
    for movie_id in ratings.loc[~is_own, 'movieId'].tolist():
        if movie_id in already_rated or movie_id not in genres_by_movie:
            continue
        rating_counts[movie_id] = rating_counts.get(movie_id, 0) + 1

    scored = []
    for movie_id, n_ratings in rating_counts.items():
        # dict.fromkeys de-duplicates genres while keeping their order.
        movie_genres = dict.fromkeys(genres_by_movie[movie_id].split('|'))
        genre_score = sum(
            score_func(genre_weights[genre])
            for genre in movie_genres
            if genre in genre_weights
        )
        scored.append((movie_id, genre_score * n_ratings))

    scored.sort(key=lambda pair: pair[1], reverse=True)
    return scored[:topN]

def recommend(userId, topN):
    """Return the top-N recommendations for a user as a table.

    Args:
        userId: User to recommend for.
        topN: Maximum number of movies to recommend.

    Returns:
        A DataFrame holding the ``movies_df`` columns plus ``score``, sorted
        by descending score. It is empty when there are no recommendations.
    """
    recommendations = getRecommendResult(userId, topN)  # list of (movieId, score)

    # Build the frame from the tuples directly: passing them through
    # array() turned movieId into float and failed on an empty list.
    scores_df = pd.DataFrame(recommendations, columns=['movieId', 'score'])
    # Match the key dtype of the movie table so the merge also works when
    # scores_df is empty (its columns would otherwise be object-typed).
    scores_df['movieId'] = scores_df['movieId'].astype(movies_df['movieId'].dtype)
    scores_df['score'] = scores_df['score'].astype(float)

    result_df = pd.merge(movies_df, scores_df, on='movieId')
    return result_df.sort_values('score', ascending=False)

In [90]:
# Top-5 recommendations for user 1, displayed as a table.
recommend_result = recommend(userId = 1, topN = 5)
recommend_result

Unnamed: 0,movieId,title,genres,score
1,296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,14100.0
3,480,Jurassic Park (1993),Action|Adventure|Sci-Fi|Thriller,14040.0
4,780,Independence Day (a.k.a. ID4) (1996),Action|Adventure|Sci-Fi|Thriller,11056.5
0,260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,10790.0
2,380,True Lies (1994),Action|Adventure|Comedy|Romance|Thriller,10500.0


In [91]:
# Model evaluation
def PrecisionRecall(topN, test_ratings=None, recommender=None,
                    max_users=10, like_threshold=3.5):
    """Compute top-N precision and recall against held-out ratings.

    A test movie counts as liked when its rating is strictly greater than
    ``like_threshold``. A hit is a recommended movie that the user liked in
    the test data.

    Args:
        topN: Number of recommendations requested per user. It is also the
            per-user precision denominator.
        test_ratings: DataFrame with ``userId``, ``movieId`` and ``rating``
            columns. Defaults to the notebook-level ``test_ratings_df``.
        recommender: Callable ``(userId, topN) -> [(movieId, score), ...]``.
            Defaults to ``getRecommendResult``.
        max_users: Evaluate only the first ``max_users`` user ids in
            ascending order; ``None`` evaluates every test user.
        like_threshold: Rating a movie must exceed to count as liked.

    Returns:
        ``(precision, recall)``. A metric whose denominator is zero is
        reported as 0.0 instead of raising ZeroDivisionError.
    """
    if test_ratings is None:
        test_ratings = test_ratings_df
    if recommender is None:
        recommender = getRecommendResult

    hits = 0
    recommended_total = 0
    liked_total = 0

    # Sort the ids so the evaluated subset does not depend on set
    # iteration order.
    test_users = sorted(set(test_ratings['userId']))[:max_users]
    for test_user in test_users:
        recommended_ids = {movie_id for movie_id, _ in recommender(test_user, topN)}

        is_liked = (
            (test_ratings['userId'] == test_user)
            & (test_ratings['rating'] > like_threshold)
        )
        liked_ids = set(test_ratings.loc[is_liked, 'movieId'])

        hits += len(recommended_ids & liked_ids)
        recommended_total += topN
        liked_total += len(liked_ids)

    precision = hits / recommended_total if recommended_total else 0.0
    recall = hits / liked_total if liked_total else 0.0

    return precision, recall

In [92]:
def main():
    """Evaluate the recommender at top-5 and print recall and precision."""
    # Re-running every cell draws a new random train/test split, so the
    # printed numbers change from run to run.
    metrics = PrecisionRecall(5)
    precision, recall = metrics
    report = 'recall: %f, precision: %f' % (recall, precision)
    print(report)


if __name__ == '__main__':
    main()

recall: 0.033898, precision: 0.040000
