<a href="https://colab.research.google.com/github/KevinTheRainmaker/MovieRecSys/blob/main/MovieRecSys.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive

drive.mount('/content/gdrive')
root_path = 'gdrive/My Drive/data/TMDB_5000/' 

Mounted at /content/gdrive


## Dataset

TMDB 5000 Datasets from Kaggle

In [2]:
import os
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

movies = pd.read_csv(os.path.join(root_path, 'tmdb_5000_movies.csv'))
credits = pd.read_csv(os.path.join(root_path, 'tmdb_5000_credits.csv'))

In [3]:
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


In [4]:
credits.head(1)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


### Combine datasets

In [5]:
credits.rename(columns = {'movie_id' : 'id'}, inplace = True)

In [6]:
df = movies.merge(credits, on=['title', 'id'])
df.columns

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count', 'cast', 'crew'],
      dtype='object')

### Simple EDA

In [7]:
df.shape

(4803, 22)

In [8]:
df['original_language'].value_counts()

en    4505
fr      70
es      32
zh      27
de      27
hi      19
ja      16
it      14
cn      12
ru      11
ko      11
pt       9
da       7
sv       5
nl       4
fa       4
th       3
he       3
ta       2
cs       2
ro       2
id       2
ar       2
vi       1
sl       1
ps       1
no       1
ky       1
hu       1
pl       1
af       1
nb       1
tr       1
is       1
xx       1
te       1
el       1
Name: original_language, dtype: int64

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4803 entries, 0 to 4802
Data columns (total 22 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4803 non-null   int64  
 1   genres                4803 non-null   object 
 2   homepage              1712 non-null   object 
 3   id                    4803 non-null   int64  
 4   keywords              4803 non-null   object 
 5   original_language     4803 non-null   object 
 6   original_title        4803 non-null   object 
 7   overview              4800 non-null   object 
 8   popularity            4803 non-null   float64
 9   production_companies  4803 non-null   object 
 10  production_countries  4803 non-null   object 
 11  release_date          4802 non-null   object 
 12  revenue               4803 non-null   int64  
 13  runtime               4801 non-null   float64
 14  spoken_languages      4803 non-null   object 
 15  status               

In [10]:
df.isnull().sum()

budget                     0
genres                     0
homepage                3091
id                         0
keywords                   0
original_language          0
original_title             0
overview                   3
popularity                 0
production_companies       0
production_countries       0
release_date               1
revenue                    0
runtime                    2
spoken_languages           0
status                     0
tagline                  844
title                      0
vote_average               0
vote_count                 0
cast                       0
crew                       0
dtype: int64

In [11]:
df.duplicated().sum()

0

### Extract some features
- id
- title
- overview
- genres
- keywords
- cast
- crew
- popularity
- vote average
- vote count

In [12]:
extract = df[['id','title','overview','genres','keywords','cast','crew', 'popularity','vote_average','vote_count']].copy()

# drop missing values (overview)
extract.dropna(inplace=True)

In [13]:
pd.set_option('max_colwidth', 100)
extract[['genres','keywords']].sample()

Unnamed: 0,genres,keywords
3793,"[{""id"": 18, ""name"": ""Drama""}, {""id"": 10749, ""name"": ""Romance""}]",[]


In [14]:
extract.shape

(4800, 10)

In [15]:
extract.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4800 entries, 0 to 4802
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            4800 non-null   int64  
 1   title         4800 non-null   object 
 2   overview      4800 non-null   object 
 3   genres        4800 non-null   object 
 4   keywords      4800 non-null   object 
 5   cast          4800 non-null   object 
 6   crew          4800 non-null   object 
 7   popularity    4800 non-null   float64
 8   vote_average  4800 non-null   float64
 9   vote_count    4800 non-null   int64  
dtypes: float64(2), int64(2), object(6)
memory usage: 412.5+ KB


### Data Preprocessing

In [16]:
type(extract['genres'][0])

str

In [17]:
# genres, keywords: str to list 
from ast import literal_eval
extract['genres'] = extract['genres'].apply(literal_eval)
extract['keywords'] = extract['keywords'].apply(literal_eval)

In [18]:
type(extract['genres'][0])

list

In [19]:
extract['genres'][0]

[{'id': 28, 'name': 'Action'},
 {'id': 12, 'name': 'Adventure'},
 {'id': 14, 'name': 'Fantasy'},
 {'id': 878, 'name': 'Science Fiction'}]

In [20]:
# extract value only from 'name' key in dictionary
extract['genres'] = extract['genres'].apply(lambda x : [y['name'] for y in x])
extract['genres'] = extract['genres'].apply(lambda x:[i.replace(" ", "") for i in x])
extract['keywords'] = extract['keywords'].apply(lambda x : [y['name'] for y in x])
extract['keywords'] = extract['keywords'].apply(lambda x:[i.replace(" ", "-") for i in x])

In [21]:
extract['genres'][0]

['Action', 'Adventure', 'Fantasy', 'ScienceFiction']

In [22]:
extract['keywords'][0]

['culture-clash',
 'future',
 'space-war',
 'space-colony',
 'society',
 'space-travel',
 'futuristic',
 'romance',
 'space',
 'alien',
 'tribe',
 'alien-planet',
 'cgi',
 'marine',
 'soldier',
 'battle',
 'love-affair',
 'anti-war',
 'power-relations',
 'mind-and-soul',
 '3d']

In [23]:
type(extract['genres'][0])

list

In [24]:
pd.set_option('max_colwidth', 100)
extract[['genres','keywords']].sample()

Unnamed: 0,genres,keywords
1353,"[Drama, Comedy]","[autism, based-on-article]"


In [25]:
# cast
extract[['cast']].head(1)

Unnamed: 0,cast
0,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""credit_id"": ""5602a8a7c3a3685532001c9a"", ""gender"": ..."


In [26]:
def convert_cast(obj):
  L = []
  counter = 0
  for i in literal_eval(obj):
    if counter != 3: # max: 3
      L.append(i['name'])
      counter+=1
    else:
      break
  return L

In [27]:
extract['cast'] = extract['cast'].apply(convert_cast)
extract['cast'] = extract['cast'].apply(lambda x:[i.replace(" ", "") for i in x])

In [28]:
extract[['cast']].sample()

Unnamed: 0,cast
3606,"[OwenWilson, LakeBell, PierceBrosnan]"


In [29]:
# crew
def fetch_director(obj):
  L = []
  for i in literal_eval(obj):
    if i['job']=='Director':
      L.append(i['name'])
      break
  return L

In [30]:
# crew to director
extract['crew'] = extract['crew'].apply(fetch_director)
extract.rename(columns = {'crew' : 'director'}, inplace = True)
extract['director'] = extract['director'].apply(lambda x:[i.replace(" ", "") for i in x])

In [31]:
extract[['director']].head(1)

Unnamed: 0,director
0,[JamesCameron]


In [32]:
# overview
# extract['overview'] = extract['overview'].apply(lambda x: x.lower().split())

In [33]:
extract.columns

Index(['id', 'title', 'overview', 'genres', 'keywords', 'cast', 'director',
       'popularity', 'vote_average', 'vote_count'],
      dtype='object')

In [34]:
extract['tags'] = extract['genres'] + extract['cast'] + extract['director']
movies_df = extract#[['id','title','overview','tags']]

In [35]:
movies_df.sample()

Unnamed: 0,id,title,overview,genres,keywords,cast,director,popularity,vote_average,vote_count,tags
412,9341,The Core,Geophysicist Dr. Josh Keyes discovers that an unknown force has caused the earth's inner core to...,"[Action, Thriller, Adventure, ScienceFiction]","[magnetic-field, center-of-the-earth, disaster-film]","[AaronEckhart, HilarySwank, DelroyLindo]",[JonAmiel],29.211255,5.4,516,"[Action, Thriller, Adventure, ScienceFiction, AaronEckhart, HilarySwank, DelroyLindo, JonAmiel]"


In [36]:
movies_df.shape

(4800, 11)

## Measure Content Simillarity

In [37]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [38]:
# genres
movies_df['genres_literal'] = movies_df['genres'].apply(lambda x: (' ').join(x)) 

count_vect = CountVectorizer(min_df=0, ngram_range=(1,1)) 
'''
max_df / min_df: 
  토큰이 나타난 횟수를 기준으로, 토큰의 빈도가 max_df로 지정한 값을 초과 하거나 
  min_df로 지정한 값보다 작은 경우에는 무시한다. 
  인수 값은 정수인 경우 횟수, 부동소수점인 경우 비중을 뜻한다

ngram_range: 
  (min_n, max_n)으로, BoW 생성에 사용할 토큰의 크기인 n-gram의 범위를 결정.
  여기서는 장르의 공백을 제거했으므로, min_n과 max_n을 모두 모노그램으로 한다.
'''
genre_mat = count_vect.fit_transform(movies_df['genres_literal']) # csr_matrix: CSR 형식 희소 행렬
genre_mat.shape

(4800, 20)

In [39]:
genre_sim = cosine_similarity(genre_mat, genre_mat)
print(genre_sim.shape)
print(genre_sim)

(4800, 4800)
[[1.         0.8660254  0.57735027 ... 0.         0.         0.        ]
 [0.8660254  1.         0.66666667 ... 0.         0.         0.        ]
 [0.57735027 0.66666667 1.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 1.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         1.        ]]


In [40]:
genre_sim[1]

array([0.8660254 , 1.        , 0.66666667, ..., 0.        , 0.        ,
       0.        ])

In [41]:
genre_sim_sorted_ind = genre_sim.argsort()[:, ::-1]
print(genre_sim_sorted_ind[:1])

[[   0 3493   46 ... 3037 3036 2399]]


In [42]:
# keywords
movies_df['keywords_literal'] = movies_df['keywords'].apply(lambda x: (' ').join(x)) 

count_vect = CountVectorizer(min_df=0, ngram_range=(1,1)) 

keyword_mat = count_vect.fit_transform(movies_df['keywords_literal'])
keyword_mat.shape

(4800, 7159)

In [43]:
keyword_sim = cosine_similarity(keyword_mat, keyword_mat)
print(keyword_sim.shape)
print(keyword_sim)

(4800, 4800)
[[1.         0.029173   0.         ... 0.04612656 0.         0.        ]
 [0.029173   1.         0.         ... 0.06324555 0.         0.        ]
 [0.         0.         1.         ... 0.         0.         0.        ]
 ...
 [0.04612656 0.06324555 0.         ... 1.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         1.        ]]


keyword는 추천에 큰 의미가 없는 듯 하다

In [44]:
keyword_sim_sorted_ind = keyword_sim.argsort()[:, ::-1]
print(keyword_sim_sorted_ind[:1])

[[   0 2403  278 ... 3069 3068 2399]]


In [45]:
# cast
movies_df['cast_literal'] = movies_df['cast'].apply(lambda x: (' ').join(x)) 
count_vect = CountVectorizer(min_df=10, ngram_range=(1,1)) # 10개 이상의 작품에 나온 배우
cast_mat = count_vect.fit_transform(movies_df['cast_literal'])
print(cast_mat.shape)
cast_sim = cosine_similarity(cast_mat, cast_mat)

(4800, 284)


cast 또한 추천에 있어 큰 의미를 보이지 않는다

In [46]:
cast_sim_sorted_ind = cast_sim.argsort()[:, ::-1]
print(cast_sim_sorted_ind[:1])

[[   0 2153 3157 ... 3203 3204 2399]]


In [47]:
# director
movies_df['director_literal'] = movies_df['director'].apply(lambda x: (' ').join(x)) 
count_vect = CountVectorizer(min_df=5, ngram_range=(1,1)) # 5개 이상 작품을 만든 감독
director_mat = count_vect.fit_transform(movies_df['director_literal'])
print(director_mat.shape)
director_sim = cosine_similarity(director_mat, director_mat)

print(director_sim.shape)
print(director_sim)

(4800, 221)
(4800, 4800)
[[1. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


director도 마찬가지

In [48]:
director_sim_sorted_ind = director_sim.argsort()[:, ::-1]
print(director_sim_sorted_ind[:1])

[[   0   25  279 ... 3201 3202 2399]]


**결론적으로, 장르를 제외하면 유의미한 유사도 판단 기준이 없다.**

In [49]:
new_df = movies_df[['id','title','genres_literal','popularity','vote_average','vote_count']]

## Recommendation using Genre Contents Filtering

In [50]:
new_df[new_df['title'] == 'Batman Begins']

Unnamed: 0,id,title,genres_literal,popularity,vote_average,vote_count
119,272,Batman Begins,Action Crime Drama,115.040024,7.5,7359


In [51]:
new_df[new_df['title'] == 'Batman Begins'].index.values # index number

array([119])

### Weighted vote

$Weighted\ Rating = {v\over {v+m}}\times r + {m\over {v+m}}\times c$

- $v$: 개별 영화에 평점을 투표한 횟수 = vote_count

- $m$: 평점을 부여하기 위한 최소 투표 횟수 (조절 가능. 높을수록 투표수가 많은 영화를 우선시)

- $r$: 개별 영화에 대한 평균 평점 = vote_average

- $c$: 전체 영화에 대한 평균 평점 = movies_df['vote_average'].mean()

In [52]:
m = movies_df['vote_count'].quantile(0.6) # m: 상위 60%
c = movies_df['vote_average'].mean()
print('c:', round(c,3), '\nm:', round(m,3))

c: 6.093 
m: 371.0


In [53]:
percentile = 0.6

c = movies_df['vote_average'].mean()
m = movies_df['vote_count'].quantile(percentile)

def weighted_vote_average(record):
    v = record['vote_count']
    r = record['vote_average']
    return ((v/(v+m)) * r) + ((m/(v+m)) * c) # weighted rating

movies_df['weighted_vote'] = movies_df.apply(weighted_vote_average, axis=1)

In [54]:
def recommend(movie):
  similar_movies = find_sim_movie(movies_df, genre_sim_sorted_ind, movie, 10)
  return similar_movies[['title','overview','genres','popularity','weighted_vote']]

In [55]:
def find_sim_movie(df, sorted_ind, title_name, top_n = 10):
    title_movie = df[df['title'] == title_name]
    title_index = title_movie.index.values
    
    #top_n의 2배에 해당하는 유사 장르 인덱스 추출
    similar_indexes = sorted_ind[title_index, :(top_n * 2)]
    similar_indexes = similar_indexes.reshape(-1)
    
    #기준 영화 인덱스는 제외
    similar_indexes = similar_indexes[similar_indexes != title_index]
    
    #top_n의 2배에 해당하는 후보군에서 weighted_vote가 높은 순으로 top_n만큼 추출
    return df.iloc[similar_indexes].sort_values('weighted_vote', ascending=False)[:top_n]

In [56]:
# Recommend
recommend('Avatar')

Unnamed: 0,title,overview,genres,popularity,weighted_vote
46,X-Men: Days of Future Past,The ultimate X-Men ensemble fights a war for the survival of the species across two time periods...,"[Action, Adventure, Fantasy, ScienceFiction]",118.078691,7.418471
322,The Fifth Element,"In 2257, a taxi driver is unintentionally given the task of saving a young girl who is part of t...","[Adventure, Fantasy, Action, Thriller, ScienceFiction]",109.528572,7.194777
813,Superman,"Mild-mannered Clark Kent works as a reporter at the Daily Planet alongside his crush, Lois Lane ...","[Action, Adventure, Fantasy, ScienceFiction]",48.507081,6.685048
14,Man of Steel,"A young boy learns that he has extraordinary powers and is not of this earth. As a young man, he...","[Action, Adventure, Fantasy, ScienceFiction]",99.398009,6.477559
870,Superman II,"Three escaped criminals from the planet Krypton test the Man of Steel's mettle. Led by Gen. Zod,...","[Action, Adventure, Fantasy, ScienceFiction]",30.515175,6.348972
232,The Wolverine,"Wolverine faces his ultimate nemesis - and tests of his physical, emotional, and mortal limits -...","[Action, ScienceFiction, Adventure, Fantasy]",15.953444,6.282634
3208,Star Wars: Clone Wars: Volume 1,"The Saga continues with the Emmy-winning ""Star Wars: Clone Wars."" This animated micro-series, di...","[Action, Adventure, Animation, Fantasy, ScienceFiction]",1.881466,6.222292
1390,Underworld: Rise of the Lycans,"A prequel to the first two Underworld films, this fantasy explains the origins of the feud betwe...","[Fantasy, Action, Adventure, ScienceFiction, Thriller]",51.927471,6.177917
1191,Small Soldiers,"When missile technology is used to enhance toy action figures, the toys soon begin to take their...","[Comedy, Adventure, Fantasy, ScienceFiction, Action]",23.088571,6.154957
1932,Sheena,Sheena's white parents are killed while on Safari. She is raised by the mystical witch woman of ...,"[Action, Adventure, Comedy, Fantasy, ScienceFiction]",4.020194,6.031736


In [57]:
recommended_list = list(recommend('Avatar').title.values)

for i in recommended_list:
  print(i)

X-Men: Days of Future Past
The Fifth Element
Superman
Man of Steel
Superman II
The Wolverine
Star Wars: Clone Wars: Volume 1
Underworld: Rise of the Lycans
Small Soldiers
Sheena


## Fronend/Streamlit

In [59]:
# dump pickle to use in streamlit
import pickle
pickle.dump(movies_df.to_dict(), open('movies_dict.pkl', 'wb'))
pickle.dump(genre_sim_sorted_ind, open('genre_sim_sorted_ind.pkl', 'wb'))