# Knowledge-based recommenderのサンプル

## 概要
Knowledge-based recommenderのサンプルを以下に実装する


In [1]:
import pandas as pd
import numpy as np

# Read the whole metadata CSV, then keep only the first 10,000 rows.
metadata_path = './csv/movies_metadata.csv'
df = pd.read_csv(metadata_path, low_memory=False).iloc[:10000]

# List the available columns.
df.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [2]:
# Keep only the columns used by the rest of the notebook. The explicit .copy()
# makes the result an independent frame, so the column assignments in the
# following cells do not trigger pandas' SettingWithCopyWarning.
df = df[['title', 'genres', 'release_date', 'runtime', 'vote_average', 'vote_count']].copy()

df.columns

Index(['title', 'genres', 'release_date', 'runtime', 'vote_average',
       'vote_count'],
      dtype='object')

In [3]:
# Parse the release dates; values that cannot be parsed become NaT.
df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce')

# Take the year directly from the datetime accessor. Missing dates (NaT) yield
# NaN here, which the integer conversion below turns into 0.
# (Comparing with `!= np.nan` cannot detect missing values: NaN never equals
# anything, so such a test is always True.)
df['year'] = df['release_date'].dt.year

def convert_int(x):
    """Convert ``x`` to ``int``, falling back to 0 when it is not convertible.

    Args:
        x: Any value accepted by ``int()`` (number, numeric string, ...).

    Returns:
        ``int(x)`` on success; 0 if ``int()`` rejects the value (wrong type,
        non-numeric string, NaN, or infinity).
    """
    try:
        return int(x)
    # Catch only the errors int() raises for bad input, so that
    # KeyboardInterrupt / SystemExit are not silently swallowed.
    except (TypeError, ValueError, OverflowError):
        return 0
    
# Normalise every year to an int; values convert_int cannot convert become 0.
df['year'] = df['year'].map(convert_int)

In [4]:
# The raw date column is dropped; the derived `year` column remains.
df = df.drop(columns=['release_date'])
df.head()

Unnamed: 0,title,genres,runtime,vote_average,vote_count,year
0,Toy Story,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",81.0,7.7,5415.0,1995
1,Jumanji,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",104.0,6.9,2413.0,1995
2,Grumpier Old Men,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",101.0,6.5,92.0,1995
3,Waiting to Exhale,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",127.0,6.1,34.0,1995
4,Father of the Bride Part II,"[{'id': 35, 'name': 'Comedy'}]",106.0,5.7,173.0,1995


In [5]:
from ast import literal_eval


def extract_genre_names(raw):
    """Return the genre names held in one cell of the ``genres`` column.

    Args:
        raw: Either the raw CSV text (a Python-literal list of dicts that each
            carry a ``'name'`` key), a missing value, or a list that has
            already been parsed by a previous run of this cell.

    Returns:
        A list of genre names; an empty list for missing or non-list values.

    Raises:
        ValueError, SyntaxError: if ``raw`` is a string that is not a valid
            Python literal (raised by ``ast.literal_eval``).
    """
    if isinstance(raw, str):
        raw = literal_eval(raw)
    if not isinstance(raw, list):
        # Covers NaN and any literal that is not a list.
        return []
    # Entries that are already plain names are passed through unchanged, which
    # keeps the cell safe to re-run on an already-converted column.
    return [entry['name'] if isinstance(entry, dict) else entry for entry in raw]


df['genres'] = df['genres'].apply(extract_genre_names)

df.head()

Unnamed: 0,title,genres,runtime,vote_average,vote_count,year
0,Toy Story,"[Animation, Comedy, Family]",81.0,7.7,5415.0,1995
1,Jumanji,"[Adventure, Fantasy, Family]",104.0,6.9,2413.0,1995
2,Grumpier Old Men,"[Romance, Comedy]",101.0,6.5,92.0,1995
3,Waiting to Exhale,"[Comedy, Drama, Romance]",127.0,6.1,34.0,1995
4,Father of the Bride Part II,[Comedy],106.0,5.7,173.0,1995


In [6]:
# One entry per (movie, genre): explode() repeats a movie's index label once for
# each element of its genre list; a movie with an empty list gets a single NaN.
# This avoids building a pd.Series per row, which is slow.
s = df['genres'].explode()

s.name = 'genre'
# Joining on the repeated index labels yields one row per movie/genre pair.
gen_df = df.drop('genres', axis=1).join(s)
gen_df.head()

  """Entry point for launching an IPython kernel.


Unnamed: 0,title,runtime,vote_average,vote_count,year,genre
0,Toy Story,81.0,7.7,5415.0,1995,Animation
0,Toy Story,81.0,7.7,5415.0,1995,Comedy
0,Toy Story,81.0,7.7,5415.0,1995,Family
1,Jumanji,104.0,6.9,2413.0,1995,Adventure
1,Jumanji,104.0,6.9,2413.0,1995,Fantasy


## ロジック部分

### データの絞り込み
SQLでの検索に相当する、条件によるデータのフィルタリング部分。
特に難しいことはなく、通常の検索処理と同じ。

### スコア計算
スコアの計算モデルを自分で組まないといけないので、metabase-recommenderなどに比べて難易度が高いかもしれない。
求人の紹介なら、年収・人の数・知名度などがパラメータになりそう。

組めたとしても、「どういう基準でリコメンドしているのか？」という質問に対する説明がかなり難しい。

In [7]:
def build_chart(gen_df, percentile=0.8, genre=None, low_time=None,
                high_time=None, low_year=None, high_year=None):
    """Filter movies by genre, runtime and year, then rank them by a weighted rating.

    Every filter left as ``None`` is read from standard input (prompted in the
    order genre, shortest duration, longest duration, earliest year, latest
    year). Supplying all of them makes the call non-interactive, so the result
    can be reproduced on "Run All".

    The score of a movie is::

        v / (v + m) * R  +  m / (m + v) * C

    where ``v`` is its ``vote_count``, ``R`` its ``vote_average``, ``C`` the
    mean ``vote_average`` of the filtered movies and ``m`` the ``percentile``
    quantile of their ``vote_count``. Only movies with ``v >= m`` are ranked.

    Args:
        gen_df: DataFrame with the columns ``genre``, ``runtime``, ``year``,
            ``vote_average`` and ``vote_count``. It is not modified.
        percentile: Quantile of ``vote_count`` used as the cut-off ``m``.
        genre: Genre to match exactly, or ``None`` to prompt.
        low_time: Minimum runtime (inclusive), or ``None`` to prompt.
        high_time: Maximum runtime (inclusive), or ``None`` to prompt.
        low_year: Earliest year (inclusive), or ``None`` to prompt.
        high_year: Latest year (inclusive), or ``None`` to prompt.

    Returns:
        A new DataFrame holding the qualifying movies plus a ``score`` column,
        sorted by ``score`` in descending order (empty if nothing matches).

    Raises:
        ValueError: if a prompted numeric answer is not a valid integer.
    """
    def _ask(label, cast):
        # Same interaction as before: print the label, then read one line.
        print(label)
        return cast(input())

    # Prompt texts are kept verbatim so the interactive output is unchanged.
    if genre is None:
        genre = _ask("Genres:", str)
    if low_time is None:
        low_time = _ask("Shoutest Duration:", int)
    if high_time is None:
        high_time = _ask("Longest Duration:", int)
    if low_year is None:
        low_year = _ask("Earliest Year:", int)
    if high_year is None:
        high_year = _ask("Latest Year:", int)

    # Plain search conditions; this part could equally be done in SQL.
    matches = (
        (gen_df['genre'] == genre)
        & (gen_df['runtime'] >= low_time)
        & (gen_df['runtime'] <= high_time)
        & (gen_df['year'] >= low_year)
        & (gen_df['year'] <= high_year)
    )
    movies = gen_df.loc[matches]

    C = movies['vote_average'].mean()
    m = movies['vote_count'].quantile(percentile)
    # .copy() so that adding the score column never touches the caller's frame.
    q_movies = movies.loc[movies['vote_count'] >= m].copy()

    # Vectorised weighted rating; also well-defined when q_movies is empty.
    v = q_movies['vote_count']
    R = q_movies['vote_average']
    q_movies['score'] = (v / (v + m) * R) + (m / (m + v) * C)

    return q_movies.sort_values('score', ascending=False)

In [9]:
# Interactive: build_chart reads the filter values from standard input, then the
# first five rows of the score-sorted result are displayed.
build_chart(gen_df).head()

Genres:
Animation
Shoutest Duration:
30
Longest Duration:
120
Earliest Year:
1990
Latest Year:
2005


Unnamed: 0,title,runtime,vote_average,vote_count,year,genre,score
359,The Lion King,89.0,8.0,5520.0,1994,Animation,7.768508
9698,Howl's Moving Castle,119.0,8.2,2049.0,2004,Animation,7.640806
0,Toy Story,81.0,7.7,5415.0,1995,Animation,7.51284
6232,Finding Nemo,100.0,7.6,6292.0,2003,Animation,7.449359
4756,"Monsters, Inc.",92.0,7.5,6150.0,2001,Animation,7.360774
