In [15]:
import numpy as np
import pandas as pd

from gensim.models import TfidfModel
from pprint import pprint
from gensim.corpora import Dictionary

import collections
from functools import reduce

import json
import os

In [40]:
def get_movie_dataset():
    """Load movies and their tags from CSV and merge them into one dataset.

    Reads ``all-tags.csv`` (only the columns at positions 1 and 2, one of
    which must be ``movieId``; the other is presumably the tag text — verify
    against the file) and ``movies.csv`` (indexed by ``movieId``, with a
    ``genres`` column of ``|``-separated genre names).

    :return: DataFrame indexed by ``movieId`` with columns ``title``,
        ``genres`` (list of genre names) and ``tags``. For a movie that has
        tag rows, ``tags`` is its genres followed by its tags; for a movie
        without any tag rows, ``tags`` is an empty list.
    """
    # Load the per-movie tags; rows with a missing value are discarded, then
    # every movie's tags are collected into one list.
    tag_rows = pd.read_csv("all-tags.csv", usecols=range(1, 3)).dropna()
    tags_by_movie = tag_rows.groupby("movieId").agg(list)

    # Load the movie list and split the genre string into separate words.
    movies = pd.read_csv("movies.csv", index_col="movieId")
    movies["genres"] = movies["genres"].apply(lambda x: x.split("|"))

    # Left join on movieId: movies without tags get a missing value, and tag
    # rows for movie ids that are not in movies.csv are dropped.
    joined = movies.join(tags_by_movie)

    # Each row is read positionally as (movieId, title, genres, tags); this
    # assumes movies.csv holds exactly the title and genres columns, in that
    # order, besides movieId.
    records = []
    for row in joined.itertuples():
        movie_id, title, genres, tags = row[0], row[1], row[2], row[3]
        # A movie that has tags carries a list here; anything else is the
        # join's missing-value filler. Checking the type avoids depending on
        # the filler being the very same object as np.nan.
        combined = genres + tags if isinstance(tags, list) else []
        records.append((movie_id, title, genres, combined))

    movies_dataset = pd.DataFrame(records, columns=['movieId', 'title', 'genres', 'tags'])
    return movies_dataset.set_index("movieId")

def create_movie_profile(movie_dataset, topn=30):
    """Build a keyword profile for every movie using TF-IDF over its tags.

    :param movie_dataset: DataFrame indexed by movie id whose first two
        columns are the title and the list of genres, and which has a
        ``tags`` column holding a list of words per movie.
    :param topn: maximum number of highest-weighted TF-IDF tags kept per
        movie (genres are added on top of these). Defaults to 30.
    :return: DataFrame indexed by ``movieId`` with columns ``title``,
        ``profile`` (list of keywords) and ``weights`` (dict mapping each
        keyword to its weight).
    """
    documents = movie_dataset["tags"].values

    # Build the vocabulary: every distinct word gets an integer id.
    dct = Dictionary(documents)
    # Turn each movie's word list into (word id, count) pairs.
    corpus = [dct.doc2bow(doc) for doc in documents]
    # Fit the TF-IDF model on the whole corpus.
    model = TfidfModel(corpus)

    rows = []
    # corpus was built from the same frame in the same order, so the two
    # sequences line up one-to-one.
    for bow, record in zip(corpus, movie_dataset.itertuples()):
        mid, title, genres = record[0], record[1], record[2]
        # Keep only the topn tags with the largest TF-IDF weight.
        ranked = sorted(model[bow], key=lambda pair: pair[1], reverse=True)[:topn]
        weights = {dct[token_id]: weight for token_id, weight in ranked}
        # Genres are always part of the profile, with a fixed weight of 1.0
        # (this overrides the TF-IDF weight if the genre is also a top tag).
        for genre in genres:
            weights[genre] = 1.0
        rows.append((mid, title, list(weights), weights))

    movie_profile = pd.DataFrame(rows, columns=["movieId", "title", "profile", "weights"])
    movie_profile.set_index("movieId", inplace=True)
    return movie_profile

def create_inverted_table(movie_profile):
    """Invert the movie profiles into a keyword -> movies lookup.

    :param movie_profile: object whose ``'weights'`` entry yields
        ``(movie id, {keyword: weight})`` pairs from ``.items()``.
    :return: dict mapping each keyword to a list of ``(movie id, weight)``
        tuples, in the order the movies are encountered.
    """
    table = {}
    for movie_id, keyword_weights in movie_profile['weights'].items():
        for keyword in keyword_weights:
            # Create the bucket on first sight of a keyword, then append.
            bucket = table.setdefault(keyword, [])
            bucket.append((movie_id, keyword_weights[keyword]))
    return table

def get_watch_record():
    """Load the viewing history from ``ratings.csv``.

    Only the first two columns of the file are read (``userId`` and
    ``movieId``, both as 32-bit integers).

    :return: DataFrame indexed by ``userId`` whose ``movieId`` column holds
        the list of movie ids found for that user.
    """
    column_types = {"userId": np.int32, "movieId": np.int32}
    ratings = pd.read_csv("ratings.csv", usecols=range(2), dtype=column_types)
    # One row per user, with all of that user's movie ids gathered in a list.
    return ratings.groupby("userId").agg(list)

# File that holds the stored recommendations, keyed by user id (as a string).
_RECOMMENDS_FILE = 'recommends.json'


def create_user_profile(watch_record, movie_profile=None, topn=50):
    """Derive each user's interest words from the movies they have watched.

    :param watch_record: DataFrame indexed by user id with a single column
        holding the list of watched movie ids.
    :param movie_profile: DataFrame indexed by movie id with a ``profile``
        column (list of keywords per movie). When omitted, the notebook-level
        ``movie_profile`` variable is used, as older callers relied on.
    :param topn: maximum number of interest words kept per user (default 50).
    :return: dict mapping user id to a list of ``(word, weight)`` pairs,
        most frequent first; weights are counts divided by the user's highest
        count, rounded to 4 decimals. A user none of whose movies has a
        profile gets an empty list.
    :raises NameError: if ``movie_profile`` is omitted and no notebook-level
        ``movie_profile`` exists.
    """
    if movie_profile is None:
        # Backward compatibility with callers that pass only watch_record.
        movie_profile = globals().get("movie_profile")
        if movie_profile is None:
            raise NameError("name 'movie_profile' is not defined; pass movie_profile explicitly")

    # Plain dict lookup per movie instead of a DataFrame selection per user.
    profile_by_movie = movie_profile["profile"].to_dict()

    user_profile = {}
    for uid, mids in watch_record.itertuples():
        # Count how often each keyword occurs across the user's movies.
        # Watched movies that have no profile row are skipped rather than
        # aborting the whole run with a KeyError.
        counter = collections.Counter()
        for mid in mids:
            counter.update(profile_by_movie.get(mid, ()))

        interest_words = counter.most_common(topn)
        if not interest_words:
            user_profile[uid] = []
            continue
        # Normalise by the highest count so the top word has weight 1.0.
        maxcount = interest_words[0][1]
        user_profile[uid] = [(w, round(c / maxcount, 4)) for w, c in interest_words]
    return user_profile


def _recommend_for_user(interest_words, inverted_table, movie_dataset, watched, topn=100):
    """Score the movies one user has not watched and return the best ``topn``.

    Returns a list of ``(movie id, title, score)`` tuples, highest score first.
    """
    seen = set(watched)
    weights_by_movie = {}   # movie id -> [0.2, 0.5, ...]
    for interest_word, interest_weight in interest_words:
        for mid, _relate_weight in inverted_table[interest_word]:
            # Leave out movies the user has already watched.
            if mid in seen:
                continue
            # Only the user's interest weight is counted; the word-to-movie
            # weight (_relate_weight) is deliberately not used here.
            weights_by_movie.setdefault(mid, []).append(interest_weight)

    ranked = sorted(
        ((mid, sum(weights)) for mid, weights in weights_by_movie.items()),
        key=lambda pair: pair[1],
        reverse=True,
    )[:topn]
    # Titles are looked up only for the rows that are actually returned.
    return [(mid, movie_dataset.loc[mid, 'title'], score) for mid, score in ranked]


def _load_recommends(path=_RECOMMENDS_FILE):
    """Return the stored recommendations, creating an empty file if none exists."""
    if not os.path.exists(path):
        with open(path, 'w') as f:
            json.dump({}, f)
        return {}
    with open(path, 'r') as f:
        return json.load(f)


def _save_recommends(recommends_data, path=_RECOMMENDS_FILE):
    """Replace the recommendations file with ``recommends_data``."""
    # Mode 'w' truncates first, so a shorter document can never leave stale
    # bytes from the previous content behind it.
    with open(path, 'w') as f:
        json.dump(recommends_data, f, indent=4)


def update_all_recommends():
    """Recompute the recommendations of every user and overwrite ``recommends.json``.

    The file ends up holding one entry per user id (as a string), each a list
    of up to 100 ``[movie id, title, score]`` rows for movies the user has
    not watched, highest score first.
    """
    movie_dataset = get_movie_dataset()
    movie_profile = create_movie_profile(movie_dataset)
    inverted_table = create_inverted_table(movie_profile)
    watch_record = get_watch_record()
    user_profile = create_user_profile(watch_record, movie_profile)

    recommends_data = {}
    for uid, interest_words in user_profile.items():
        recommends_data[str(uid)] = _recommend_for_user(
            interest_words,
            inverted_table,
            movie_dataset,
            watch_record.loc[uid, 'movieId'],
        )

    _save_recommends(recommends_data)


def update_one_recommend(user_id):
    """Recompute the recommendations of one user inside ``recommends.json``.

    Entries of other users are kept as they are. The file is created (empty)
    if it does not exist. If ``user_id`` has no viewing history, no entry is
    changed.

    :param user_id: id of the user, as it appears in the ``userId`` column of
        the ratings data.
    """
    movie_dataset = get_movie_dataset()
    movie_profile = create_movie_profile(movie_dataset)
    inverted_table = create_inverted_table(movie_profile)
    watch_record = get_watch_record()

    recommends_data = _load_recommends()
    if user_id not in watch_record.index:
        return

    # Profile only the requested user instead of every user in the data.
    user_profile = create_user_profile(watch_record.loc[[user_id]], movie_profile)
    for uid, interest_words in user_profile.items():
        recommends_data[str(uid)] = _recommend_for_user(
            interest_words,
            inverted_table,
            movie_dataset,
            watch_record.loc[uid, 'movieId'],
        )

    _save_recommends(recommends_data)

In [41]:
# Rebuild the recommendations of every user and write them to recommends.json.
update_all_recommends()

In [None]:
# Refresh only the recommends.json entry of the user whose id is 1.
update_one_recommend(1)

In [13]:
# Build the notebook-level tables that the cells below read directly:
# the merged movie/tag dataset, the per-movie keyword profiles, and the
# keyword -> [(movie id, weight), ...] inverted table.
movie_dataset = get_movie_dataset()
movie_profile = create_movie_profile(movie_dataset)
inverted_table = create_inverted_table(movie_profile)

In [9]:
# Build the per-user viewing history and interest-word profiles with the
# helper functions defined earlier in this notebook, instead of repeating
# their bodies here. create_user_profile uses the notebook-level
# `movie_profile` table, so the cell that builds it must have been run first.
watch_record = get_watch_record()
user_profile = create_user_profile(watch_record)

In [8]:

# For every user, score the movies they have not watched by summing the
# weights of the interest words that lead to each movie, then print the
# 100 best-scoring ones as (movie id, title, score).
for uid, interest_words in user_profile.items():
    # Set membership keeps the "already watched" check cheap per candidate.
    watched = set(watch_record.loc[uid, 'movieId'])

    result_table = {}   # movie id -> [0.2, 0.5, ...]
    for interest_word, interest_weight in interest_words:
        for mid, relate_weight in inverted_table[interest_word]:
            # Leave out movies the user has already watched.
            if mid in watched:
                continue
            # Only the user's interest weight is counted; relate_weight (how
            # strongly the word is tied to the movie) is not used.
            result_table.setdefault(mid, []).append(interest_weight)

    top_scored = sorted(
        ((mid, sum(weights)) for mid, weights in result_table.items()),
        key=lambda pair: pair[1],
        reverse=True,
    )[:100]
    # Titles are looked up only for the rows that are actually shown.
    rs_result = [(mid, movie_dataset.loc[mid, 'title'], score) for mid, score in top_scored]
    print(uid)
    pprint(rs_result)