<a href="https://colab.research.google.com/github/KevinTheRainmaker/MovieRecSys/blob/main/MovieRecSys.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive

# Mount Google Drive so the dataset files stored there can be read from this runtime.
MOUNT_POINT = '/content/gdrive'
drive.mount(MOUNT_POINT)
# Anchor the data directory to the mount point. The earlier relative form
# ('gdrive/My Drive/...') only resolved while the working directory was /content.
root_path = MOUNT_POINT + '/My Drive/data/TMDB_5000/'

Mounted at /content/gdrive


## Dataset

TMDB 5000 Datasets from Kaggle

In [2]:
import os
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Read the two TMDB 5000 CSV files (movie metadata and credits) from the data directory.
movies_csv = os.path.join(root_path, 'tmdb_5000_movies.csv')
credits_csv = os.path.join(root_path, 'tmdb_5000_credits.csv')
movies = pd.read_csv(movies_csv)
credits = pd.read_csv(credits_csv)

In [3]:
# Show one randomly chosen row of the movies table.
movies.sample()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
96,160000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 53, ""nam...",http://inceptionmovie.warnerbros.com/,27205,"[{""id"": 1014, ""name"": ""loss of lover""}, {""id"":...",en,Inception,"Cobb, a skilled thief who commits corporate es...",167.58371,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2010-07-14,825532764,148.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Your mind is the scene of the crime.,Inception,8.1,13752


In [4]:
# Show one randomly chosen row of the credits table.
credits.sample()

Unnamed: 0,movie_id,title,cast,crew
1341,16911,The Inhabited Island,"[{""cast_id"": 1, ""character"": ""Maxim Kammerer"",...","[{""credit_id"": ""54e46c62c3a368454b00a240"", ""de..."


### Combine datasets

In [5]:
# Give the credits key the same column name as in the movies table ('id') so the two can be merged on it.
credits = credits.rename(columns={'movie_id': 'id'})

In [6]:
# Inner-join movie metadata with credits on the shared (title, id) pair, then peek at one random row.
df = pd.merge(movies, credits, how='inner', on=['title', 'id'])
df.sample()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew
3777,0,"[{""id"": 14, ""name"": ""Fantasy""}, {""id"": 12, ""na...",,27551,"[{""id"": 12554, ""name"": ""dragon""}, {""id"": 14665...",en,The Barbarians,Orphaned brothers Kutchek and Gore are adopted...,1.631993,"[{""name"": ""Cannon Films"", ""id"": 4110}]",...,0,87.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Warriors. Conquerors. Heroes.,The Barbarians,5.1,8,"[{""cast_id"": 1, ""character"": ""Kutchek"", ""credi...","[{""credit_id"": ""58b45b74c3a368525401aa86"", ""de..."


### Simple EDA

In [7]:
# (rows, columns) of the merged frame.
df.shape

(4803, 22)

In [8]:
# Number of movies per original_language value.
df['original_language'].value_counts()

en    4505
fr      70
es      32
zh      27
de      27
hi      19
ja      16
it      14
cn      12
ru      11
ko      11
pt       9
da       7
sv       5
nl       4
fa       4
th       3
he       3
ta       2
cs       2
ro       2
id       2
ar       2
vi       1
sl       1
ps       1
no       1
ky       1
hu       1
pl       1
af       1
nb       1
tr       1
is       1
xx       1
te       1
el       1
Name: original_language, dtype: int64

In [9]:
# Column dtypes and non-null counts of the merged frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4803 entries, 0 to 4802
Data columns (total 22 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4803 non-null   int64  
 1   genres                4803 non-null   object 
 2   homepage              1712 non-null   object 
 3   id                    4803 non-null   int64  
 4   keywords              4803 non-null   object 
 5   original_language     4803 non-null   object 
 6   original_title        4803 non-null   object 
 7   overview              4800 non-null   object 
 8   popularity            4803 non-null   float64
 9   production_companies  4803 non-null   object 
 10  production_countries  4803 non-null   object 
 11  release_date          4802 non-null   object 
 12  revenue               4803 non-null   int64  
 13  runtime               4801 non-null   float64
 14  spoken_languages      4803 non-null   object 
 15  status               

In [10]:
# Number of missing values in each column of the merged frame.
df.isnull().sum()

budget                     0
genres                     0
homepage                3091
id                         0
keywords                   0
original_language          0
original_title             0
overview                   3
popularity                 0
production_companies       0
production_countries       0
release_date               1
revenue                    0
runtime                    2
spoken_languages           0
status                     0
tagline                  844
title                      0
vote_average               0
vote_count                 0
cast                       0
crew                       0
dtype: int64

### Extract some features
- genres
- id
- keywords
- title
- overview
- cast
- crew

In [11]:
# Keep only the columns used to build content features and drop every row that is missing any of them.
# NOTE: dropna keeps the original index labels, so labels and row positions no longer line up afterwards.
feature_columns = ['id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']
extract = df.loc[:, feature_columns].dropna().copy()
extract.sample()

Unnamed: 0,id,title,overview,genres,keywords,cast,crew
4450,367551,American Hero,"Melvin, a reluctant hero who is far from super...","[{""id"": 28, ""name"": ""Action""}, {""id"": 35, ""nam...",[],"[{""cast_id"": 0, ""character"": ""Melvin"", ""credit...","[{""credit_id"": ""56414f6f92514128ad000f37"", ""de..."


In [12]:
# Confirm that no missing values remain in the selected columns.
extract.isnull().sum()

id          0
title       0
overview    0
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [13]:
# Number of rows that are exact duplicates of an earlier row.
extract.duplicated().sum()

0

In [14]:
# (rows, columns) after dropping rows with missing values.
extract.shape

(4800, 7)

### Data Preprocessing

In [15]:
# genres, keywords
from ast import literal_eval

# Both columns hold a serialized list of {"id": ..., "name": ...} dicts;
# parse each cell and keep only the 'name' values.
for column in ('genres', 'keywords'):
  extract[column] = extract[column].apply(
      lambda raw: [entry['name'] for entry in literal_eval(raw)]
  )

In [16]:
# Turn multi-word values into single tokens:
#   genres   - lower-cased, spaces replaced by underscores (ex: Science Fiction to science_fiction)
#   keywords - spaces replaced by underscores only (ex: space war to space_war)
def _underscore(text):
  return text.replace(" ", "_")

extract['genres'] = extract['genres'].apply(lambda names: [_underscore(name.lower()) for name in names])
extract['keywords'] = extract['keywords'].apply(lambda words: [_underscore(word) for word in words])

In [17]:
# Spot-check the cleaned genre and keyword lists on one random row.
extract[['genres','keywords']].sample()

Unnamed: 0,genres,keywords
165,"[drama, action, science_fiction]","[california, san_francisco, monster, general, ..."


In [18]:
# cast: at this point each cell is still the raw serialized list of cast dicts
extract['cast']

0       [{"cast_id": 242, "character": "Jake Sully", "...
1       [{"cast_id": 4, "character": "Captain Jack Spa...
2       [{"cast_id": 1, "character": "James Bond", "cr...
3       [{"cast_id": 2, "character": "Bruce Wayne / Ba...
4       [{"cast_id": 5, "character": "John Carter", "c...
                              ...                        
4798    [{"cast_id": 1, "character": "El Mariachi", "c...
4799    [{"cast_id": 1, "character": "Buzzy", "credit_...
4800    [{"cast_id": 8, "character": "Oliver O\u2019To...
4801    [{"cast_id": 3, "character": "Sam", "credit_id...
4802    [{"cast_id": 3, "character": "Herself", "credi...
Name: cast, Length: 4800, dtype: object

In [19]:
def convert_cast(obj, limit=3):
  """Return the names of the first `limit` entries of a serialized cast list.

  Args:
    obj: String containing a Python-literal sequence of dicts, each with a
      'name' key; it is parsed with `literal_eval`.
    limit: Maximum number of names to return. Defaults to 3; `None` returns
      every name.

  Returns:
    List of the 'name' values, in their original order, at most `limit` long.
  """
  from itertools import islice

  # islice stops after `limit` entries, so later entries are never inspected.
  return [member['name'] for member in islice(literal_eval(obj), limit)]

In [20]:
# Replace the raw cast string with the names returned by convert_cast,
# then remove spaces so every name becomes a single token.
cast_names = extract['cast'].apply(convert_cast)
extract['cast'] = cast_names.apply(lambda names: [name.replace(" ", "") for name in names])

In [21]:
# Spot-check one random row after the cast column was converted.
extract.sample()

Unnamed: 0,id,title,overview,genres,keywords,cast,crew
3238,773,Little Miss Sunshine,"A family loaded with quirky, colorful characte...","[comedy, drama]","[california, brother_sister_relationship, wife...","[GregKinnear, ToniCollette, SteveCarell]","[{""credit_id"": ""52fe4275c3a36847f802011b"", ""de..."


In [22]:
# crew
def fetch_director(obj):
  """Return a one-element list with the first director's name, or [] if none.

  `obj` is a string holding a literal list of crew dicts; the first entry whose
  'job' equals 'Director' supplies the name.
  """
  for member in literal_eval(obj):
    if member['job'] == 'Director':
      return [member['name']]
  return []

In [23]:
# crew to director
# Derive the director list from the crew string, strip spaces from the name
# so it is a single token, and discard the raw crew column.
director_names = extract['crew'].apply(fetch_director)
extract['director'] = director_names.apply(lambda names: [name.replace(" ", "") for name in names])
extract = extract.drop(columns=['crew'])

In [24]:
# Spot-check one random row after crew was replaced by the director column.
extract.sample()

Unnamed: 0,id,title,overview,genres,keywords,cast,director
1208,209451,Jersey Boys,From director Clint Eastwood comes the big-scr...,"[music, drama]","[biography, based_on_play]","[ChristopherWalken, VincentPiazza, FreyaTingley]",[ClintEastwood]


In [25]:
# overview
# extract['overview'] = extract['overview'].apply(lambda x: x.lower().split())

In [26]:
# extract.sample()

In [27]:
# Build one combined token list per movie: Series `+` on these list-valued
# columns concatenates the lists row by row, in the order given here.
tag_columns = ['genres', 'keywords', 'cast', 'director']
combined = extract[tag_columns[0]]
for column in tag_columns[1:]:
  combined = combined + extract[column]
extract['tags'] = combined
# movies_df is the same object as extract (no copy); the column subset stays disabled.
movies_df = extract  # [['id','title','overview','tags']]

In [28]:
# Spot-check one random row, now including the combined tags column.
movies_df.sample()

Unnamed: 0,id,title,overview,genres,keywords,cast,director,tags
2584,18681,Eye of the Beholder,A reclusive surveillance expert is hired to sp...,"[drama, mystery, thriller]","[beautiful_woman, serial_killer, secret_servic...","[EwanMcGregor, AshleyJudd, PatrickBergin]",[StephanElliott],"[drama, mystery, thriller, beautiful_woman, se..."


In [29]:
# Keyword list of the row whose index label is 0.
movies_df['keywords'][0]

['culture_clash',
 'future',
 'space_war',
 'space_colony',
 'society',
 'space_travel',
 'futuristic',
 'romance',
 'space',
 'alien',
 'tribe',
 'alien_planet',
 'cgi',
 'marine',
 'soldier',
 'battle',
 'love_affair',
 'anti_war',
 'power_relations',
 'mind_and_soul',
 '3d']

## Measure Content Similarity

In [30]:
from sklearn.feature_extraction.text import CountVectorizer

In [31]:
# genres
# Join each movie's genre tokens into one space-separated string for the vectorizer.
movies_df['genres_literal'] = movies_df['genres'].apply(' '.join)

# min_df / max_df: terms whose document frequency is below min_df or above max_df are ignored.
# min_df=1 keeps every term, exactly like the former min_df=0, and is also accepted by
# newer scikit-learn releases, whose parameter validation rejects an integer min_df below 1.
# ngram_range=(min_n, max_n): sizes of the n-grams used to build the bag of words -
# here unigrams up to bigrams.
count_vect = CountVectorizer(min_df=1, ngram_range=(1, 2))

genre_mat = count_vect.fit_transform(movies_df['genres_literal'])  # csr_matrix: sparse matrix in CSR format
genre_mat.shape

(4800, 272)

In [32]:
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity between every pair of rows of the genre matrix
# (with a single argument the matrix is compared against itself).
genre_sim = cosine_similarity(genre_mat)
print(genre_sim.shape, genre_sim, sep='\n')

(4800, 4800)
[[1.         0.6761234  0.50709255 ... 0.         0.         0.        ]
 [0.6761234  1.         0.4        ... 0.         0.         0.        ]
 [0.50709255 0.4        1.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 1.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         1.        ]]


In [33]:
# Per row, the column positions ordered from highest to lowest similarity:
# argsort sorts ascending, so every row is reversed afterwards.
genre_order_ascending = genre_sim.argsort(axis=1)
genre_sim_sorted_ind = genre_order_ascending[:, ::-1]
print(genre_sim_sorted_ind[:1])

[[   0   14 3493 ... 3037 3036 2399]]


In [34]:
# keywords
# Join each movie's keyword tokens into one space-separated string for the vectorizer.
movies_df['keywords_literal'] = movies_df['keywords'].apply(' '.join)

# min_df / max_df: terms whose document frequency is below min_df or above max_df are ignored.
# min_df=1 keeps every term, exactly like the former min_df=0, and is also accepted by
# newer scikit-learn releases, whose parameter validation rejects an integer min_df below 1.
# ngram_range=(min_n, max_n): sizes of the n-grams used to build the bag of words -
# here unigrams up to bigrams.
count_vect = CountVectorizer(min_df=1, ngram_range=(1, 2))

keyword_mat = count_vect.fit_transform(movies_df['keywords_literal'])  # csr_matrix: sparse matrix in CSR format
keyword_mat.shape

(4800, 39186)

In [35]:
# Cosine similarity between every pair of rows of the keyword matrix
# (with a single argument the matrix is compared against itself).
keyword_sim = cosine_similarity(keyword_mat)
print(keyword_sim.shape, keyword_sim, sep='\n')

(4800, 4800)
[[1. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 1. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 1.]]


In [36]:
# Per row, the column positions ordered from highest to lowest similarity:
# argsort sorts ascending, so every row is reversed afterwards.
keyword_order_ascending = keyword_sim.argsort(axis=1)
keyword_sim_sorted_ind = keyword_order_ascending[:, ::-1]
print(keyword_sim_sorted_ind[:1])

[[   0   47  492 ... 3176 3177 2399]]


In [None]:
# cast + director
