<a href="https://colab.research.google.com/github/KevinTheRainmaker/MovieRecSys/blob/main/MovieRecSys.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [59]:
from google.colab import drive

drive.mount('/content/gdrive')
root_path = 'gdrive/My Drive/data/TMDB_5000/' 

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


## Dataset

TMDB 5000 Datasets from Kaggle

In [60]:
import os
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

movies = pd.read_csv(os.path.join(root_path, 'tmdb_5000_movies.csv'))
credits = pd.read_csv(os.path.join(root_path, 'tmdb_5000_credits.csv'))

In [61]:
movies.sample()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
1685,30000000,"[{""id"": 35, ""name"": ""Comedy""}]",,4967,"[{""id"": 128, ""name"": ""love triangle""}, {""id"": ...",en,Keeping the Faith,"Best friends since they were kids, Rabbi Jacob...",8.085872,"[{""name"": ""Spyglass Entertainment"", ""id"": 158}...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2000-04-14,37036004,127.0,"[{""iso_639_1"": ""he"", ""name"": ""\u05e2\u05b4\u05...",Released,"If you have to believe in something, you may a...",Keeping the Faith,5.9,159


In [62]:
credits.sample()

Unnamed: 0,movie_id,title,cast,crew
3372,13006,Split Second,"[{""cast_id"": 1, ""character"": ""Harley Stone"", ""...","[{""credit_id"": ""57011d5092514106c6004157"", ""de..."


### Combine datasets

In [63]:
credits.rename(columns = {'movie_id' : 'id'}, inplace = True)

In [64]:
df = movies.merge(credits, on=['title', 'id'])
df.sample()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew
656,65000000,"[{""id"": 35, ""name"": ""Comedy""}, {""id"": 18, ""nam...",,9440,"[{""id"": 833, ""name"": ""white house""}, {""id"": 84...",en,Primary Colors,In this adaptation of the best-selling roman à...,9.110247,"[{""name"": ""Universal Pictures"", ""id"": 33}, {""n...",...,0,143.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,What went down on the way to the top.,Primary Colors,6.1,94,"[{""cast_id"": 10, ""character"": ""Jack Stanton"", ...","[{""credit_id"": ""52fe44f7c3a36847f80b4b73"", ""de..."


### Simple EDA

In [65]:
df.shape

(4803, 22)

In [66]:
df['original_language'].value_counts()

en    4505
fr      70
es      32
zh      27
de      27
hi      19
ja      16
it      14
cn      12
ru      11
ko      11
pt       9
da       7
sv       5
nl       4
fa       4
th       3
he       3
ta       2
cs       2
ro       2
id       2
ar       2
vi       1
sl       1
ps       1
no       1
ky       1
hu       1
pl       1
af       1
nb       1
tr       1
is       1
xx       1
te       1
el       1
Name: original_language, dtype: int64

In [67]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4803 entries, 0 to 4802
Data columns (total 22 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4803 non-null   int64  
 1   genres                4803 non-null   object 
 2   homepage              1712 non-null   object 
 3   id                    4803 non-null   int64  
 4   keywords              4803 non-null   object 
 5   original_language     4803 non-null   object 
 6   original_title        4803 non-null   object 
 7   overview              4800 non-null   object 
 8   popularity            4803 non-null   float64
 9   production_companies  4803 non-null   object 
 10  production_countries  4803 non-null   object 
 11  release_date          4802 non-null   object 
 12  revenue               4803 non-null   int64  
 13  runtime               4801 non-null   float64
 14  spoken_languages      4803 non-null   object 
 15  status               

In [68]:
df.isnull().sum()

budget                     0
genres                     0
homepage                3091
id                         0
keywords                   0
original_language          0
original_title             0
overview                   3
popularity                 0
production_companies       0
production_countries       0
release_date               1
revenue                    0
runtime                    2
spoken_languages           0
status                     0
tagline                  844
title                      0
vote_average               0
vote_count                 0
cast                       0
crew                       0
dtype: int64

### Extract some features
- genres
- id
- keywords
- title
- overview
- cast
- crew
- popularity
- vote average
- vote count

In [69]:
extract = df[['id','title','overview','genres','keywords','cast','crew', 'popularity','vote_average','vote_count']].copy()
extract.dropna(inplace=True)
extract.sample()

Unnamed: 0,id,title,overview,genres,keywords,cast,crew,popularity,vote_average,vote_count
1313,622,The Ninth Gate,An all-expenses-paid international search for ...,"[{""id"": 27, ""name"": ""Horror""}, {""id"": 9648, ""n...","[{""id"": 242, ""name"": ""new york""}, {""id"": 1523,...","[{""cast_id"": 25, ""character"": ""Dean Corso"", ""c...","[{""credit_id"": ""52fe425fc3a36847f801952f"", ""de...",30.359164,6.3,756


In [70]:
extract.isnull().sum()

id              0
title           0
overview        0
genres          0
keywords        0
cast            0
crew            0
popularity      0
vote_average    0
vote_count      0
dtype: int64

In [71]:
extract.duplicated().sum()

0

In [72]:
extract.shape

(4800, 10)

### Data Preprocessing

In [73]:
# genres, keywords
from ast import literal_eval
extract['genres'] = extract['genres'].apply(literal_eval)
extract['keywords'] = extract['keywords'].apply(literal_eval)
extract['genres'] = extract['genres'].apply(lambda x : [y['name'] for y in x])
extract['keywords'] = extract['keywords'].apply(lambda x : [y['name'] for y in x])

In [74]:
extract['genres'] = extract['genres'].apply(lambda x:[i.lower().replace(" ", "_") for i in x]) # ex: Science Fiction to science_fiction
extract['keywords'] = extract['keywords'].apply(lambda x:[i.replace(" ", "_") for i in x]) # ex: space war to space_war

In [75]:
extract[['genres','keywords']].sample()

Unnamed: 0,genres,keywords
735,"[thriller, action, comedy, science_fiction]","[bomb, intelligence, chauffeur, wound, secret_..."


In [76]:
# cast
extract['cast']

0       [{"cast_id": 242, "character": "Jake Sully", "...
1       [{"cast_id": 4, "character": "Captain Jack Spa...
2       [{"cast_id": 1, "character": "James Bond", "cr...
3       [{"cast_id": 2, "character": "Bruce Wayne / Ba...
4       [{"cast_id": 5, "character": "John Carter", "c...
                              ...                        
4798    [{"cast_id": 1, "character": "El Mariachi", "c...
4799    [{"cast_id": 1, "character": "Buzzy", "credit_...
4800    [{"cast_id": 8, "character": "Oliver O\u2019To...
4801    [{"cast_id": 3, "character": "Sam", "credit_id...
4802    [{"cast_id": 3, "character": "Herself", "credi...
Name: cast, Length: 4800, dtype: object

In [78]:
def convert_cast(obj):
  L = []
  counter = 0
  for i in literal_eval(obj):
    if counter != 3: # max: 3
      L.append(i['name'])
      counter+=1
    else:
      break
  return L

In [79]:
extract['cast'] = extract['cast'].apply(convert_cast)
extract['cast'] = extract['cast'].apply(lambda x:[i.replace(" ", "") for i in x])

In [80]:
extract.sample()

Unnamed: 0,id,title,overview,genres,keywords,cast,crew,popularity,vote_average,vote_count
3971,191229,Iguana,A grotesquely disfigured harpooner called Igua...,[],[],"[EverettMcGill, FabioTesti, MichaelMadsen]","[{""credit_id"": ""52fe4c7d9251416c910f68ab"", ""de...",0.214704,6.0,1


In [81]:
# crew
def fetch_director(obj):
  L = []
  for i in literal_eval(obj):
    if i['job']=='Director':
      L.append(i['name'])
      break
  return L

In [82]:
# crew to director
extract['director'] = extract['crew'].apply(fetch_director)
extract = extract.drop(columns=['crew'])
extract['director'] = extract['director'].apply(lambda x:[i.replace(" ", "") for i in x])

In [83]:
extract.sample()

Unnamed: 0,id,title,overview,genres,keywords,cast,popularity,vote_average,vote_count,director
1988,19908,Zombieland,Columbus has made a habit of running from what...,"[comedy, horror]","[washington_d.c., sister_sister_relationship, ...","[JesseEisenberg, WoodyHarrelson, EmmaStone]",57.300674,7.2,3550,[RubenFleischer]


In [84]:
# overview
# extract['overview'] = extract['overview'].apply(lambda x: x.lower().split())

In [85]:
# extract.sample()

In [86]:
extract['tags'] = extract['genres'] + extract['keywords'] + extract['cast'] + extract['director']
movies_df = extract#[['id','title','overview','tags']]

In [87]:
movies_df.sample()

Unnamed: 0,id,title,overview,genres,keywords,cast,popularity,vote_average,vote_count,director,tags
3665,78814,We Have Your Husband,"American-born Jayne Valseca, her husband Eduar...","[tv_movie, crime, drama, thriller]",[],"[TeriPolo, EsaiMorales, NicholasGonzalez]",0.102003,5.0,3,[EricBross],"[tv_movie, crime, drama, thriller, TeriPolo, E..."


In [88]:
movies_df.shape

(4800, 11)

In [89]:
movies_df['keywords'][0]

['culture_clash',
 'future',
 'space_war',
 'space_colony',
 'society',
 'space_travel',
 'futuristic',
 'romance',
 'space',
 'alien',
 'tribe',
 'alien_planet',
 'cgi',
 'marine',
 'soldier',
 'battle',
 'love_affair',
 'anti_war',
 'power_relations',
 'mind_and_soul',
 '3d']

## Measure Content Simillarity

In [90]:
from sklearn.feature_extraction.text import CountVectorizer

In [91]:
# genres
movies_df['genres_literal'] = movies_df['genres'].apply(lambda x: (' ').join(x)) 

count_vect = CountVectorizer(min_df=0, ngram_range=(1,2)) 
# max_df / min_df: 토큰이 나타난 횟수를 기준으로, max_df 값보다 크거나, min_df 값보다 작으면 무시
# ngram_range: (min_n, max_n)으로, BoW 생성에 사용할 토큰의 크기인 n-gram의 범위를 결정 - 여기서는 최소 모노그램, 최대 바이그램

genre_mat = count_vect.fit_transform(movies_df['genres_literal']) # csr_matrix: CSR 형식 희소 행렬
genre_mat.shape

(4800, 272)

In [92]:
from sklearn.metrics.pairwise import cosine_similarity

genre_sim = cosine_similarity(genre_mat, genre_mat)
print(genre_sim.shape)
print(genre_sim)

(4800, 4800)
[[1.         0.6761234  0.50709255 ... 0.         0.         0.        ]
 [0.6761234  1.         0.4        ... 0.         0.         0.        ]
 [0.50709255 0.4        1.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 1.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         1.        ]]


In [93]:
genre_sim_sorted_ind = genre_sim.argsort()[:, ::-1]
print(genre_sim_sorted_ind[:1])

[[   0   14 3493 ... 3037 3036 2399]]


In [94]:
# keywords
movies_df['keywords_literal'] = movies_df['keywords'].apply(lambda x: (' ').join(x)) 

count_vect = CountVectorizer(min_df=0, ngram_range=(1,10)) 

keyword_mat = count_vect.fit_transform(movies_df['keywords_literal']) # csr_matrix: CSR 형식 희소 행렬
keyword_mat.shape

(4800, 178859)

In [95]:
keyword_sim = cosine_similarity(keyword_mat, keyword_mat)
print(keyword_sim.shape)
print(keyword_sim)

(4800, 4800)
[[1. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 1. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 1.]]


In [96]:
keyword_sim_sorted_ind = keyword_sim.argsort()[:, ::-1]
print(keyword_sim_sorted_ind[:1])

[[   0  492   47 ... 3176 3177 2399]]


In [97]:
# cast + director = people
movies_df['people'] = movies_df['cast'] + movies_df['director']
movies_df['people_literal'] = movies_df['people'].apply(lambda x: (' ').join(x)) 
count_vect = CountVectorizer(min_df=0, ngram_range=(1,3)) 
people_mat = count_vect.fit_transform(movies_df['people_literal'])
people_sim = cosine_similarity(people_mat, people_mat)
people_sim_sorted_ind = people_sim.argsort()[:, ::-1]

In [98]:
print(people_sim_sorted_ind[:1])

[[   0 2403 1245 ... 3202 3203 2399]]


## Recommendation using Contents Filtering

In [99]:
def find_sim_movie(df, sorted_ind, title_name, top_n = 10):
    title_movie = df[df['title'] == title_name]
    title_index = title_movie.index.values
    
    #top_n의 2배에 해당하는 유사 장르 인덱스 추출
    similar_indexes = sorted_ind[title_index, :(top_n * 2)]
    similar_indexes = similar_indexes.reshape(-1)
    
    #기준 영화 인덱스는 제외
    similar_indexes = similar_indexes[similar_indexes != title_index]
    
    #top_n의 2배에 해당하는 후보군에서 weighted_vote가 높은 순으로 top_n만큼 추출
    return df.iloc[similar_indexes].sort_values('weighted_vote', ascending=False)[:top_n]

In [100]:
c = movies_df['vote_average'].mean()
m = movies_df['vote_count'].quantile(0.6)
print('c:', round(c,3), '\nm:', round(m,3))

c: 6.093 
m: 371.0


In [101]:
percentile = 0.6

c = movies_df['vote_average'].mean()
m = movies_df['vote_count'].quantile(percentile)

def weighted_vote_average(record):
    v = record['vote_count']
    r = record['vote_average']
#     print(v,r)
    return ((v/(v+m)) * r) + ((m/(v+m)) * c)

movies_df['weighted_vote'] = movies_df.apply(weighted_vote_average, axis=1)

In [102]:
movies_df[['title','vote_average','weighted_vote', 'vote_count']].sort_values('weighted_vote', ascending=False)[:10]

Unnamed: 0,title,vote_average,weighted_vote,vote_count
1881,The Shawshank Redemption,8.5,8.395869,8205
3337,The Godfather,8.4,8.263358,5893
662,Fight Club,8.3,8.216309,9413
3232,Pulp Fiction,8.3,8.206941,8428
65,The Dark Knight,8.2,8.13682,12002
1818,Schindler's List,8.3,8.125781,4329
3865,Whiplash,8.3,8.122956,4254
809,Forrest Gump,8.2,8.105793,7927
2294,Spirited Away,8.3,8.10555,3840
2731,The Godfather: Part II,8.3,8.079232,3338


In [103]:
def find_sim_movie(df, sorted_ind, title_name, top_n = 10):
    title_movie = df[df['title'] == title_name]
    title_index = title_movie.index.values
    
    #top_n의 2배에 해당하는 유사 장르 인덱스 추출
    similar_indexes = sorted_ind[title_index, :(top_n * 2)]
    similar_indexes = similar_indexes.reshape(-1)
    
    #기준 영화 인덱스는 제외
    similar_indexes = similar_indexes[similar_indexes != title_index]
    
    #top_n의 2배에 해당하는 후보군에서 weighted_vote가 높은 순으로 top_n만큼 추출
    return df.iloc[similar_indexes].sort_values('weighted_vote', ascending=False)[:top_n]

In [104]:
similar_movies = find_sim_movie(movies_df, genre_sim_sorted_ind, 'The Godfather', 10)
similar_movies[['title','vote_average','weighted_vote']]

Unnamed: 0,title,vote_average,weighted_vote
875,Moulin Rouge!,7.4,7.109798
637,Les Misérables,7.1,6.934311
1684,Walk the Line,7.3,6.888771
4704,Once,7.3,6.75652
1080,Across the Universe,7.1,6.621603
3211,Crazy Heart,6.8,6.39329
2524,Coal Miner's Daughter,7.2,6.262265
2350,Center Stage,6.8,6.230959
4097,We Are Your Friends,6.3,6.225337
3212,The Rose,6.8,6.142537


In [106]:
recommend = similar_movies

Unnamed: 0,id,title,overview,genres,keywords,cast,popularity,vote_average,vote_count,director,tags,genres_literal,keywords_literal,people,people_literal,weighted_vote
875,824,Moulin Rouge!,A celebration of love and creative inspiration...,"[drama, music, romance]","[duke, musical, writer's_block, music, termina...","[NicoleKidman, EwanMcGregor, JohnLeguizamo]",57.374341,7.4,1300,[BazLuhrmann],"[drama, music, romance, duke, musical, writer'...",drama music romance,duke musical writer's_block music terminal_ill...,"[NicoleKidman, EwanMcGregor, JohnLeguizamo, Ba...",NicoleKidman EwanMcGregor JohnLeguizamo BazLuh...,7.109798
637,82695,Les Misérables,An adaptation of the successful stage musical ...,"[drama, music, romance]","[france, robbery, brothel, mayor, star, musica...","[HughJackman, RussellCrowe, AnneHathaway]",48.356214,7.1,1884,[TomHooper],"[drama, music, romance, france, robbery, broth...",drama music romance,france robbery brothel mayor star musical arre...,"[HughJackman, RussellCrowe, AnneHathaway, TomH...",HughJackman RussellCrowe AnneHathaway TomHooper,6.934311
1684,69,Walk the Line,A chronicle of country music legend Johnny Cas...,"[drama, music, romance]","[germany, prison, music_record, adultery, coun...","[JoaquinPhoenix, ReeseWitherspoon, GinniferGoo...",35.580032,7.3,718,[JamesMangold],"[drama, music, romance, germany, prison, music...",drama music romance,germany prison music_record adultery country_m...,"[JoaquinPhoenix, ReeseWitherspoon, GinniferGoo...",JoaquinPhoenix ReeseWitherspoon GinniferGoodwi...,6.888771
4704,5723,Once,A vacuum repairman moonlights as a street musi...,"[drama, music, romance]","[rock_and_roll, pop, irland, music_style, love...","[GlenHansard, MarkétaIrglová, HughWalsh]",19.052179,7.3,453,[JohnCarney],"[drama, music, romance, rock_and_roll, pop, ir...",drama music romance,rock_and_roll pop irland music_style love_of_o...,"[GlenHansard, MarkétaIrglová, HughWalsh, JohnC...",GlenHansard MarkétaIrglová HughWalsh JohnCarney,6.75652
1080,4688,Across the Universe,Musical based on The Beatles songbook and set ...,"[adventure, drama, music, romance]","[riot, protest, musical, music, cultural_diffe...","[EvanRachelWood, JimSturgess, JoeAnderson]",11.872841,7.1,410,[JulieTaymor],"[adventure, drama, music, romance, riot, prote...",adventure drama music romance,riot protest musical music cultural_difference...,"[EvanRachelWood, JimSturgess, JoeAnderson, Jul...",EvanRachelWood JimSturgess JoeAnderson JulieTa...,6.621603
3211,25196,Crazy Heart,When reporter Jean Craddock interviews Bad Bla...,"[drama, music, romance]","[taxi, country_music, journalist, guitar, bar,...","[JeffBridges, MaggieGyllenhaal, ColinFarrell]",17.697042,6.8,274,[ScottCooper],"[drama, music, romance, taxi, country_music, j...",drama music romance,taxi country_music journalist guitar bar music...,"[JeffBridges, MaggieGyllenhaal, ColinFarrell, ...",JeffBridges MaggieGyllenhaal ColinFarrell Scot...,6.39329
2524,16769,Coal Miner's Daughter,"Biography of Loretta Lynn, a country and weste...","[drama, music]","[country_music, female_friendship, biography, ...","[SissySpacek, TommyLeeJones, LevonHelm]",3.826685,7.2,67,[MichaelApted],"[drama, music, country_music, female_friendshi...",drama music,country_music female_friendship biography lore...,"[SissySpacek, TommyLeeJones, LevonHelm, Michae...",SissySpacek TommyLeeJones LevonHelm MichaelApted,6.262265
2350,10560,Center Stage,A group of 12 teenagers from various backgroun...,"[drama, music]","[new_york, competition, dancer, dance, ball, c...","[AmandaSchull, ZoeSaldana, SusanMayPratt]",6.961736,6.8,90,[NicholasHytner],"[drama, music, new_york, competition, dancer, ...",drama music,new_york competition dancer dance ball career ...,"[AmandaSchull, ZoeSaldana, SusanMayPratt, Nich...",AmandaSchull ZoeSaldana SusanMayPratt Nicholas...,6.230959
4097,301351,We Are Your Friends,Young Cole Carter dreams of hitting the big ti...,"[drama, music, romance]",[dj],"[ZacEfron, EmilyRatajkowski, WesBentley]",28.819424,6.3,658,[MaxJoseph],"[drama, music, romance, dj, ZacEfron, EmilyRat...",drama music romance,dj,"[ZacEfron, EmilyRatajkowski, WesBentley, MaxJo...",ZacEfron EmilyRatajkowski WesBentley MaxJoseph,6.225337
3212,16323,The Rose,Midler is the rock-and-roll singer Mary Rose F...,"[drama, music, romance]","[drug, rock_band, roses]","[BetteMidler, AlanBates, FredericForrest]",7.000949,6.8,28,[MarkRydell],"[drama, music, romance, drug, rock_band, roses...",drama music romance,drug rock_band roses,"[BetteMidler, AlanBates, FredericForrest, Mark...",BetteMidler AlanBates FredericForrest MarkRydell,6.142537
