# metadata-based-recommenderのサンプル

## 概要

以下にサンプルのmetadata-based-recommenderを実装する  
description-based-recommenderは、「文章」で類似度を計測するのに対して  
metadata-based-recommenderは、「要素」で類似度を計測する。  
（例えば、映画の種類とかディレクターとかの情報）  

In [1]:
import pandas as pd
metadata_orig_df = pd.read_csv('./csv/movies_metadata.csv', low_memory=False)[0:10000]
metadata_df = pd.DataFrame({ 'id': metadata_orig_df['id'],'title': metadata_orig_df['title'], 'genres': metadata_orig_df['genres'] })
metadata_df.head()

Unnamed: 0,id,title,genres
0,862,Toy Story,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '..."
1,8844,Jumanji,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '..."
2,15602,Grumpier Old Men,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ..."
3,31357,Waiting to Exhale,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam..."
4,11862,Father of the Bride Part II,"[{'id': 35, 'name': 'Comedy'}]"


In [2]:
metadata_df.shape

(10000, 3)

In [3]:
cred_df = pd.read_csv('./csv/credits.csv', low_memory=False)
cred_df.head()

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862


In [4]:
cred_df.shape

(45476, 3)

In [5]:
keyword_df = pd.read_csv('./csv/keywords.csv', low_memory=False)
keyword_df.head()

Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [6]:
keyword_df.shape

(46419, 2)

3つの情報を結合させるためにIDをint型に変換する

In [7]:
import numpy as np
def clean_ids(x):
    try:
        return int(x)
    except:
        return np.nan
    
metadata_df['id'] = metadata_df['id'].apply(clean_ids)
metadata_df['id'] = metadata_df[metadata_df['id'].notnull()]
metadata_df['id'] = metadata_df['id'].astype('int')

keyword_df['id'] = keyword_df['id'].astype('int')

cred_df['id'] = cred_df['id'].astype('int')

In [8]:
metadata_df = metadata_df.merge(keyword_df, on='id')
metadata_df = metadata_df.merge(cred_df, on='id')

metadata_df.head()

Unnamed: 0,id,title,genres,keywords,cast,crew
0,862,Toy Story,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...","[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de..."
1,8844,Jumanji,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...","[{'id': 10090, 'name': 'board game'}, {'id': 1...","[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de..."
2,15602,Grumpier Old Men,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...","[{'id': 1495, 'name': 'fishing'}, {'id': 12392...","[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de..."
3,31357,Waiting to Exhale,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...","[{'id': 818, 'name': 'based on novel'}, {'id':...","[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de..."
4,11862,Father of the Bride Part II,"[{'id': 35, 'name': 'Comedy'}]","[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n...","[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de..."


In [9]:
metadata_df.shape

(10048, 6)

文字列データをPythonObjectに変換する

In [10]:
from ast import literal_eval
df = metadata_df

df['crew'] = df['crew'].apply(literal_eval)
df['cast'] = df['cast'].apply(literal_eval)
df['keywords'] = df['keywords'].apply(literal_eval)
df['genres'] = df['genres'].apply(literal_eval)

In [11]:
df.iloc[0]['crew'][0]

{'credit_id': '52fe4284c3a36847f8024f49',
 'department': 'Directing',
 'gender': 2,
 'id': 7879,
 'job': 'Director',
 'name': 'John Lasseter',
 'profile_path': '/7EdqiNbr4FRjIhKHyPPdFfEEEFG.jpg'}

In [12]:
df.iloc[0]['cast'][0]

{'cast_id': 14,
 'character': 'Woody (voice)',
 'credit_id': '52fe4284c3a36847f8024f95',
 'gender': 2,
 'id': 31,
 'name': 'Tom Hanks',
 'order': 0,
 'profile_path': '/pQFoyx7rp09CJTAb932F2g8Nlho.jpg'}

In [13]:
df.iloc[0]['keywords'][0]

{'id': 931, 'name': 'jealousy'}

In [14]:
df.iloc[0]['genres'][0]

{'id': 16, 'name': 'Animation'}

crewの情報からdirewctorを取り出す

In [15]:
def get_director(x):
    for crew_member in x:
        if crew_member['job'] == 'Director':
            return crew_member['name']
    return np.nan

df['director'] = df['crew'].apply(get_director)
df['director'].head()

0      John Lasseter
1       Joe Johnston
2      Howard Deutch
3    Forest Whitaker
4      Charles Shyer
Name: director, dtype: object

3要素ずつ取り出し
cast, keywords, gneresを変換

In [16]:
def generate_list(x):
    if isinstance(x, list):
        names = [ele['name'] for ele in x]
        if len(names) > 3:
            return names[:3]
        return names
    
    return []

df['cast'] = df['cast'].apply(generate_list)
df['keywords'] = df['keywords'].apply(generate_list)
df['genres'] = df['genres'].apply(generate_list)

df[['title', 'cast', 'director', 'keywords', 'genres']].head(3)

Unnamed: 0,title,cast,director,keywords,genres
0,Toy Story,"[Tom Hanks, Tim Allen, Don Rickles]",John Lasseter,"[jealousy, toy, boy]","[Animation, Comedy, Family]"
1,Jumanji,"[Robin Williams, Jonathan Hyde, Kirsten Dunst]",Joe Johnston,"[board game, disappearance, based on children'...","[Adventure, Fantasy, Family]"
2,Grumpier Old Men,"[Walter Matthau, Jack Lemmon, Ann-Margret]",Howard Deutch,"[fishing, best friend, duringcreditsstinger]","[Romance, Comedy]"


文字列を全て結合する
例えば、[Tom Hanks]は[tomhanks]になる

In [17]:
def sanitize(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''
        
df['cast'] = df['cast'].apply(sanitize)
df['director'] = df['director'].apply(sanitize)
df['Tom Hanks'] = df['genres'].apply(sanitize)
df['keywords'] = df['keywords'].apply(sanitize)

比較対象の文字列を生成する

In [18]:
def create_soup(x):
    return ' '.join(x['keywords']) + ' ' + ' '.join(x['cast']) + ' ' + x['director'] + ' ' + ' '.join(x['genres'])

df['soup'] = df.apply(create_soup, axis=1)
df.iloc[0]['soup']

'jealousy toy boy tomhanks timallen donrickles johnlasseter Animation Comedy Family'

CountVectorizerを生成して類似度を算出

In [21]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(df['soup'])

cosine_sim = cosine_similarity(count_matrix, count_matrix)
indicies = pd.Series(df.index, index=df['title'])


In [22]:
def recommend_movie(title, cosine_sim=cosine_sim, df=df, indicies=indicies):
    idx = indicies[title]
    
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1],  reverse=True)
    sim_scores = sim_scores[1:11]
    
    movie_indices = [i[0] for i in  sim_scores]
    
    return df['title'].iloc[movie_indices]


recommend_movie('The Wizard of Oz')

4787    Harry Potter and the Philosopher's Stone
5705     Harry Potter and the Chamber of Secrets
1996                                Return to Oz
1055         Willy Wonka & the Chocolate Factory
1920                            Babes in Toyland
2298                      Santa Claus: The Movie
3229                          The Legend of Lobo
3505                    The Slipper and the Rose
6291                             Treasure Island
59                    The Indian in the Cupboard
Name: title, dtype: object