In [12]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from surprise import SVD, Dataset, Reader
from surprise.model_selection import cross_validate
import warnings; warnings.simplefilter('ignore')

# 一般推荐

In [13]:
md = pd.read_csv('./data/movies_metadata.csv')
md.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [14]:
md.shape

(45466, 24)

In [15]:
md['genres'] = md['genres'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

## 计算权重公式如下
Weighted Rating (WR) =  
$$ (\frac{v}{v + m} * R) + (\frac{m}{v + m} * C)$$


v is the number of votes for the movie

m is the minimum votes required to be listed in the chart

R is the average rating of the movie

C is the mean vote across the whole report

In [16]:
vote_counts = md[md['vote_count'].notnull()]['vote_count'].astype('int')
vote_averages = md[md['vote_average'].notnull()]['vote_average'].astype('int')
C = vote_averages.mean()
C

5.244896612406511

In [17]:
m = vote_counts.quantile(0.95)
m

434.0

In [22]:
md['year'] = pd.to_datetime(md['release_date'], errors='coerce').apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)

In [23]:
qualified = md[(md['vote_count'] >= m) & (md['vote_count'].notnull()) & (md['vote_average'].notnull())][['title', 'year', 'vote_count', 'vote_average', 'popularity', 'genres']]
qualified['vote_count'] = qualified['vote_count'].astype('int')
qualified['vote_average'] = qualified['vote_average'].astype('int')
qualified.shape

(2274, 6)

In [25]:
def weighted_rating(x):
    v = x['vote_count']
    R = x['vote_average']
    return (v / (v+m) * R) + (m / (m+v) * C)

In [26]:
qualified['wr'] = qualified.apply(weighted_rating, axis=1)

In [27]:
qualified = qualified.sort_values('wr', ascending=False).head(250)

## 推荐的Top15电影

In [28]:
qualified.head(15)

Unnamed: 0,title,year,vote_count,vote_average,popularity,genres,wr
15480,Inception,2010,14075,8,29.108149,"[Action, Thriller, Science Fiction, Mystery, A...",7.917588
12481,The Dark Knight,2008,12269,8,123.167259,"[Drama, Action, Crime, Thriller]",7.905871
22879,Interstellar,2014,11187,8,32.213481,"[Adventure, Drama, Science Fiction]",7.897107
2843,Fight Club,1999,9678,8,63.869599,[Drama],7.881753
4863,The Lord of the Rings: The Fellowship of the Ring,2001,8892,8,32.070725,"[Adventure, Fantasy, Action]",7.871787
292,Pulp Fiction,1994,8670,8,140.950236,"[Thriller, Crime]",7.86866
314,The Shawshank Redemption,1994,8358,8,51.645403,"[Drama, Crime]",7.864
7000,The Lord of the Rings: The Return of the King,2003,8226,8,29.324358,"[Adventure, Fantasy, Action]",7.861927
351,Forrest Gump,1994,8147,8,48.307194,"[Comedy, Drama, Romance]",7.860656
5814,The Lord of the Rings: The Two Towers,2002,7641,8,29.423537,"[Adventure, Fantasy, Action]",7.851924


In [29]:
s = md.apply(lambda x: pd.Series(x['genres']), axis=1).stack().reset_index(level=1, drop=True)
s.name = 'genre'
gen_md = md.drop('genres', axis=1).join(s)

In [30]:

def build_chart(genre, percentile=0.85):
    df = gen_md[gen_md['genre'] == genre]
    vote_counts = df[df['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = df[df['vote_average'].notnull()]['vote_average'].astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(percentile)
    
    qualified = df[(df['vote_count'] >= m) & (df['vote_count'].notnull()) & (df['vote_average'].notnull())][['title', 'year', 'vote_count', 'vote_average', 'popularity']]
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')
    
    qualified['wr'] = qualified.apply(lambda x: (x['vote_count']/(x['vote_count']+m) * x['vote_average']) + (m/(m+x['vote_count']) * C), axis=1)
    qualified = qualified.sort_values('wr', ascending=False).head(250)
    
    return qualified

## 浪漫系列的Top15电影

In [31]:
build_chart('Romance').head(15)

Unnamed: 0,title,year,vote_count,vote_average,popularity,wr
10309,Dilwale Dulhania Le Jayenge,1995,661,9,34.457024,8.565285
351,Forrest Gump,1994,8147,8,48.307194,7.971357
876,Vertigo,1958,1162,8,18.20822,7.811667
40251,Your Name.,2016,1030,8,34.461252,7.789489
883,Some Like It Hot,1959,835,8,11.845107,7.745154
1132,Cinema Paradiso,1988,834,8,14.177005,7.744878
19901,Paperman,2012,734,8,7.198633,7.713951
37863,Sing Street,2016,669,8,10.672862,7.689483
882,The Apartment,1960,498,8,11.994281,7.599317
38718,The Handmaiden,2016,453,8,16.727405,7.566166


# 基于内容推荐

In [39]:
links_small = pd.read_csv('./data/links_small.csv')
links_small = links_small[links_small['tmdbId'].notnull()]['tmdbId'].astype('int')

(9112,)

In [35]:
md = md.drop([19730, 29503, 35587])

In [40]:
md['id'] = md['id'].astype('int')

In [41]:
smd = md[md['id'].isin(links_small)]
smd.shape

(9099, 25)

In [47]:
smd['tagline'] = smd['tagline'].fillna('')
smd['description'] = smd['overview'] + smd['tagline']
smd['description'] = smd['description'].fillna('')

In [49]:
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(smd['description'])

In [52]:
tfidf_matrix.shape

(9099, 268124)

## 利用余弦相似度计算电影间的距离

$$ cos(x,y) = \frac{x * yT}{||x|| * ||y||} $$


In [53]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [54]:
cosine_sim[0]

array([1.        , 0.00680476, 0.        , ..., 0.        , 0.00344913,
       0.        ])

In [56]:
smd = smd.reset_index()
titles = smd['title']
indices = pd.Series(smd.index, index=smd['title'])

In [57]:
def get_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

In [63]:
get_recommendations('The Godfather').head(10)

973      The Godfather: Part II
8387                 The Family
3509                       Made
4196         Johnny Dangerously
29               Shanghai Triad
5667                       Fury
2412             American Movie
1582    The Godfather: Part III
4221                    8 Women
2159              Summer of Sam
Name: title, dtype: object

In [62]:
get_recommendations('Inception').head(10)

5239                              Cypher
141                                Crumb
6398                         Renaissance
653                            Lone Star
1703                               House
4739                    The Pink Panther
319                                 Cobb
2828    What Ever Happened to Baby Jane?
8867                     Pitch Perfect 2
979          Once Upon a Time in America
Name: title, dtype: object

## 基于元数据推荐

In [65]:
credits = pd.read_csv('./data/credits.csv')
keywords = pd.read_csv('./data/keywords.csv')

In [66]:
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
md['id'] = md['id'].astype('int')

In [67]:
md.shape

(45463, 25)

In [68]:
md = md.merge(credits, on = 'id')
md = md.merge(keywords, on = 'id')

In [69]:
smd = md[md['id'].isin(links_small)]
smd.shape

(9219, 28)

In [70]:
smd['cast'] = smd['cast'].apply(literal_eval)
smd['crew'] = smd['crew'].apply(literal_eval)
smd['keywords'] = smd['keywords'].apply(literal_eval)
smd['cast_size'] = smd['cast'].apply(lambda x: len(x))
smd['crew_size'] = smd['crew'].apply(lambda x: len(x))

In [76]:
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

In [77]:
smd['director'] = smd['crew'].apply(get_director)

In [81]:
# 前三名
# smd['cast'] = smd['cast'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
smd['cast'] = smd['cast'].apply(lambda x: x[:3] if len(x) >=3 else x)

In [82]:
smd['keywords'] = smd['keywords'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [83]:
smd['cast'] = smd['cast'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

In [85]:
smd['director'] = smd['director'].astype('str').apply(lambda x: str.lower(x.replace(" ", "")))
smd['director'] = smd['director'].apply(lambda x: [x,x, x])

In [94]:
s = smd.apply(lambda x: pd.Series(x['keywords']), axis=1).stack().reset_index(level=1, drop=True)
s.name = 'keyword'

In [95]:
s = s.value_counts()
s[:5]

independent film        610
woman director          550
murder                  399
duringcreditsstinger    327
based on novel          318
Name: keyword, dtype: int64

In [96]:
s = s[s > 1]

In [98]:
stemmer = SnowballStemmer('english')
stemmer.stem('dogs')

'dog'

In [100]:
def filter_keywords(x):
    words = []
    for i in x:
        if i in s:
            words.append(i)
    return words

In [101]:
smd['keywords'] = smd['keywords'].apply(filter_keywords)
smd['keywords'] = smd['keywords'].apply(lambda x: [stemmer.stem(i) for i in x])
smd['keywords'] = smd['keywords'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

In [102]:
smd['soup'] = smd['keywords'] + smd['cast'] + smd['director'] + smd['genres']
smd['soup'] = smd['soup'].apply(lambda x: ' '.join(x))

In [103]:
count = CountVectorizer(analyzer='word', ngram_range=(1, 2), min_df=0, stop_words='english')
count_matrix = count.fit_transform(smd['soup'])

In [104]:
cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [105]:
smd = smd.reset_index()
titles = smd['title']
indices = pd.Series(smd.index, index=smd['title'])

### 重新编写的get_recommdentions方法

In [107]:
def improved_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:26]
    movie_indices = [i[0] for i in sim_scores]
    
    movies = smd.iloc[movie_indices][['title', 'vote_count', 'vote_average', 'year']]
    vote_counts = movies[movies['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = movies[movies['vote_average'].notnull()]['vote_average'].astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(0.60)
    qualified = movies[(movies['vote_count'] >= m) & (movies['vote_count'].notnull()) & (movies['vote_average'].notnull())]
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')
    qualified['wr'] = qualified.apply(weighted_rating, axis=1)
    qualified = qualified.sort_values('wr', ascending=False).head(10)
    return qualified

In [108]:
improved_recommendations('The Dark Knight')

Unnamed: 0,title,vote_count,vote_average,year,wr
7648,Inception,14075,8,2010,7.917588
8613,Interstellar,11187,8,2014,7.897107
6623,The Prestige,4510,8,2006,7.758148
3381,Memento,4168,8,2000,7.740175
8031,The Dark Knight Rises,9263,7,2012,6.921448
6218,Batman Begins,7511,7,2005,6.904127
7583,Kick-Ass,4747,7,2010,6.852979
8682,The Equalizer,2997,7,2014,6.77799
1134,Batman Returns,1706,6,1992,5.846862
9024,Batman v Superman: Dawn of Justice,7189,5,2016,5.013943


In [110]:
improved_recommendations('The Godfather')

Unnamed: 0,title,vote_count,vote_average,year,wr
284,The Shawshank Redemption,8358,8,1994,7.864
994,The Godfather: Part II,3418,8,1974,7.689586
986,GoodFellas,3211,8,1990,7.671958
981,Apocalypse Now,2112,8,1979,7.530356
1602,The Godfather: Part III,1589,7,1990,6.623473
1100,Dracula,1087,7,1992,6.499201
2998,The Conversation,377,7,1974,6.060771
2808,Midnight Express,309,7,1978,5.974812
7464,The Bad Lieutenant: Port of Call - New Orleans,331,6,2009,5.571615
642,Jack,340,5,1996,5.137319


# 基于协同过滤推荐

In [112]:
reader = Reader()

In [113]:
ratings = pd.read_csv('./data/ratings_small.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [118]:
from surprise.model_selection import KFold

data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
kf = KFold(n_splits=5)
kf.split(data)

<generator object KFold.split at 0x7f9b2f3bc580>

In [125]:
svd = SVD()
cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8964  0.8960  0.8954  0.8938  0.9045  0.8972  0.0037  
MAE (testset)     0.6914  0.6904  0.6893  0.6893  0.6952  0.6911  0.0022  
Fit time          3.61    3.59    3.59    3.51    3.56    3.57    0.03    
Test time         0.09    0.08    0.08    0.08    0.07    0.08    0.00    


{'test_rmse': array([0.89643694, 0.89603316, 0.89537797, 0.89380801, 0.90446479]),
 'test_mae': array([0.69135475, 0.69035256, 0.6893486 , 0.68931899, 0.69524611]),
 'fit_time': (3.606393814086914,
  3.586665153503418,
  3.5938589572906494,
  3.5111989974975586,
  3.556964874267578),
 'test_time': (0.08506298065185547,
  0.07992219924926758,
  0.07663631439208984,
  0.07637190818786621,
  0.07418417930603027)}

In [127]:
trainset = data.build_full_trainset()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f9b48c8c520>

In [128]:
ratings[ratings['userId'] == 1]

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
5,1,1263,2.0,1260759151
6,1,1287,2.0,1260759187
7,1,1293,2.0,1260759148
8,1,1339,3.5,1260759125
9,1,1343,2.0,1260759131


In [129]:
svd.predict(1, 302, 3)

Prediction(uid=1, iid=302, r_ui=3, est=2.766589174925106, details={'was_impossible': False})

# 混合推荐

构建一个简单的混合推荐器，它将我们在基于内容和基于协作过滤器的引擎中实现的技术结合在一起。

输入:用户ID和电影标题

输出:类似的电影根据特定用户的预期评级进行排序。

In [130]:
def convert_int(x):
    try:
        return int(x)
    except:
        return np.nan

In [134]:
id_map = pd.read_csv('./data/links_small.csv')[['movieId', 'tmdbId']]
id_map['tmdbId'] = id_map['tmdbId'].apply(convert_int)
id_map.columns = ['movieId', 'id']
id_map = id_map.merge(smd[['title', 'id']], on='id').set_index('title')

In [135]:
indices_map = id_map.set_index('id')

In [136]:
def hybrid(userId, title):
    idx = indices[title]
    tmdbId = id_map.loc[title]['id']
    #print(idx)
    movie_id = id_map.loc[title]['movieId']
    
    sim_scores = list(enumerate(cosine_sim[int(idx)]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:26]
    movie_indices = [i[0] for i in sim_scores]
    
    movies = smd.iloc[movie_indices][['title', 'vote_count', 'vote_average', 'year', 'id']]
    movies['est'] = movies['id'].apply(lambda x: svd.predict(userId, indices_map.loc[x]['movieId']).est)
    movies = movies.sort_values('est', ascending=False)
    return movies.head(10)

In [139]:
hybrid(2, 'The Godfather')

Unnamed: 0,title,vote_count,vote_average,year,id,est
994,The Godfather: Part II,3418.0,8.3,1974,240,4.429065
284,The Shawshank Redemption,8358.0,8.5,1994,278,4.311179
986,GoodFellas,3211.0,8.2,1990,769,3.957546
2998,The Conversation,377.0,7.5,1974,592,3.921289
981,Apocalypse Now,2112.0,8.0,1979,28,3.79568
4631,A Decade Under the Influence,9.0,8.0,2003,38868,3.714562
2742,...And Justice for All,118.0,7.1,1979,17443,3.701462
1765,The Paradine Case,42.0,6.3,1947,31667,3.650567
5907,Hearts of Darkness: A Filmmaker's Apocalypse,61.0,8.0,1991,4539,3.609857
1346,The Rainmaker,239.0,6.7,1997,11975,3.588877


In [140]:
hybrid(500, 'Avatar')

Unnamed: 0,title,vote_count,vote_average,year,id,est
4347,Piranha Part Two: The Spawning,41.0,3.9,1981,31646,3.487422
974,Aliens,3282.0,7.7,1986,679,3.27341
8658,X-Men: Days of Future Past,6155.0,7.5,2014,127585,3.176958
8401,Star Trek Into Darkness,4479.0,7.4,2013,54138,3.148655
1241,The Fifth Element,3962.0,7.3,1997,18,3.045705
8096,John Carter,2170.0,6.1,2012,49529,3.035169
2132,Superman II,642.0,6.5,1980,8536,3.008443
8724,Jupiter Ascending,2816.0,5.2,2015,76757,2.996394
4404,Treasure Planet,980.0,7.2,2002,9016,2.982294
1011,The Terminator,4208.0,7.4,1984,218,2.969003


### 结论
在这个notebook中，基于不同的想法和算法构建了4个不同的推荐引擎：

1. Simple Recommender：该系统使用整体TMDB投票数和投票平均值来构建一般和特定类型的热门电影图表。 IMDB加权评级系统用于计算最终执行排序的评级。

2. 基于内容的推荐器：我们构建了两个基于内容的引擎；一种以电影概述和标语为输入，另一种以演员、剧组、流派和关键字等元数据来进行预测。我们还设计了一个简单的过滤器，以更偏向于获得更多投票和更高评级的电影。

3. 协同过滤：我们使用强大的Surprise库构建了基于单值分解的协同过滤器。获得的 RMSE 小于 1，引擎给出了给定用户和电影的估计评级。

4. 混合引擎：我们汇集了来自内容和协作过滤的想法，构建了一个引擎，该引擎根据内部为该用户计算的估计评分向该用户提供电影建议。