In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import random

In [2]:
df = pd.read_csv("../data/processed/movie_data_processed.csv", index_col= False)
df["Year"] = df["Year"].fillna("Unknown")
df.head(3)

Unnamed: 0,Title,Year,Rated,Runtime,Genre,Director,Writer,Actors,Plot,Language,Country,Awards,Poster,Ratings,imdbRating,imdbVotes,imdbID,BoxOffice,Status,wr
0,Breaking Bad,Unknown,TV-MA,49 min,"Crime, Drama, Thriller",Unknown,Vince Gilligan,"Bryan Cranston, Aaron Paul, Anna Gunn",A chemistry teacher diagnosed with inoperable ...,"English, Spanish",United States,Won 16 Primetime Emmys. 169 wins & 269 nominat...,https://m.media-amazon.com/images/M/MV5BMzU5ZG...,"[{'Source': 'Internet Movie Database', 'Value'...",9.5,2225876,tt0903747,Unknown,Finished,9.464575
1,The Shawshank Redemption,1994-01-01,R,142 min,Drama,Frank Darabont,"Stephen King, Frank Darabont","Tim Robbins, Morgan Freeman, Bob Gunton",A banker convicted of uxoricide forms a friend...,English,United States,Nominated for 7 Oscars. 21 wins & 42 nominatio...,https://m.media-amazon.com/images/M/MV5BMDAyY2...,"[{'Source': 'Internet Movie Database', 'Value'...",9.3,2945396,tt0111161,"$28,767,189",Finished,9.274599
2,Band of Brothers,2001-01-01,TV-MA,594 min,"Drama, History, War",Unknown,Unknown,"Scott Grimes, Damian Lewis, Ron Livingston",The story of Easy Company of the U.S. Army 101...,"English, Dutch, French, German, Lithuanian","United Kingdom, United States",Won 6 Primetime Emmys. 34 wins & 26 nomination...,https://m.media-amazon.com/images/M/MV5BMTQ3NT...,"[{'Source': 'Internet Movie Database', 'Value'...",9.4,542582,tt0185906,Unknown,Finished,9.262584


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6548 entries, 0 to 6547
Data columns (total 20 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Title       6548 non-null   object 
 1   Year        6548 non-null   object 
 2   Rated       6548 non-null   object 
 3   Runtime     6548 non-null   object 
 4   Genre       6548 non-null   object 
 5   Director    6548 non-null   object 
 6   Writer      6548 non-null   object 
 7   Actors      6548 non-null   object 
 8   Plot        6548 non-null   object 
 9   Language    6548 non-null   object 
 10  Country     6548 non-null   object 
 11  Awards      6548 non-null   object 
 12  Poster      6548 non-null   object 
 13  Ratings     6548 non-null   object 
 14  imdbRating  6548 non-null   float64
 15  imdbVotes   6548 non-null   int64  
 16  imdbID      6548 non-null   object 
 17  BoxOffice   6548 non-null   object 
 18  Status      6548 non-null   object 
 19  wr          6548 non-null  

# 1. Simple Recommender

> Make general recommendations to all users, based on popularity and/or genre. The basic idea behind this system is that more popular and critically acclaimed movies are more likely to be liked by the average audience. An example might be the IMDB Top 250

In [4]:
def get_simple_recommendations(df, value, collumn):
    # Lọc các dòng chứa genre
    filtered_df = df[df[collumn].str.contains(value, na=False)]
    
    # Sắp xếp theo trọng số và lấy top 250
    filtered_df = filtered_df.sort_values('wr', ascending=False).head(10)
    
    return filtered_df[['Title', 'Year', 'Genre','imdbVotes', 'imdbRating', collumn]]

In [5]:
result = get_simple_recommendations(df, 'Action', "Genre")
result.head(10)

Unnamed: 0,Title,Year,Genre,imdbVotes,imdbRating,Genre.1
4,Game of Thrones,Unknown,"Action, Adventure, Drama",2359163,9.2,"Action, Adventure, Drama"
7,Avatar: The Last Airbender,Unknown,"Animation, Action, Adventure",386757,9.3,"Animation, Action, Adventure"
13,The Dark Knight,2008-01-01,"Action, Crime, Drama",2919777,9.0,"Action, Crime, Drama"
14,Attack on Titan,Unknown,"Animation, Action, Adventure",551083,9.1,"Animation, Action, Adventure"
15,The Lord of the Rings: The Return of the King,2003-01-01,"Action, Adventure, Drama",2015572,9.0,"Action, Adventure, Drama"
22,The Lord of the Rings: The Fellowship of the Ring,2001-01-01,"Action, Adventure, Drama",2045291,8.9,"Action, Adventure, Drama"
26,Fullmetal Alchemist: Brotherhood,Unknown,"Animation, Action, Adventure",205897,9.1,"Animation, Action, Adventure"
27,Arcane,Unknown,"Animation, Action, Adventure",293759,9.0,"Animation, Action, Adventure"
28,Inception,2010-01-01,"Action, Adventure, Sci-Fi",2592712,8.8,"Action, Adventure, Sci-Fi"
31,The Lord of the Rings: The Two Towers,2002-01-01,"Action, Adventure, Drama",1819421,8.8,"Action, Adventure, Drama"


In [6]:
result = get_simple_recommendations(df, 'Stephen King', "Writer")
result.head(10)

Unnamed: 0,Title,Year,Genre,imdbVotes,imdbRating,Writer
1,The Shawshank Redemption,1994-01-01,Drama,2945396,9.3,"Stephen King, Frank Darabont"
59,The Green Mile,1999-01-01,"Crime, Drama, Fantasy",1433920,8.6,"Stephen King, Frank Darabont"
125,The Shining,1980-01-01,"Drama, Horror",1126290,8.4,"Stephen King, Stanley Kubrick, Diane Johnson"
330,Stand by Me,1986-01-01,"Adventure, Comedy, Drama",450642,8.1,"Stephen King, Raynold Gideon, Bruce A. Evans"
700,Misery,1990-01-01,"Drama, Thriller",240915,7.8,"Stephen King, William Goldman"
1485,Carrie,1976-01-01,"Horror, Mystery",210615,7.4,"Stephen King, Lawrence D. Cohen"
1726,Doctor Sleep,2019-01-01,"Drama, Fantasy, Horror",224755,7.3,"Mike Flanagan, Stephen King"
2160,The Mist,2007-01-01,"Horror, Sci-Fi, Thriller",349565,7.1,"Frank Darabont, Stephen King"
2585,Dolores Claiborne,1995-01-01,"Crime, Drama, Mystery",49323,7.4,"Stephen King, Tony Gilroy"
2619,The Dead Zone,1983-01-01,"Drama, Horror, Sci-Fi",79060,7.2,"Stephen King, Jeffrey Boam"


# 2. Content based

> For the content-based method, the system will evaluate the characteristics of the recommended items. It will suggest items based on the user's profile or based on the content and attributes of items similar to the item the user has selected.

For example, when the user watches the movie **The Shawshank Redemption**, the system will recommend the movie **The Green Mile**, which is in the same **Drama** genre or writer **Stephen King** as the movie the user likes.

&rarr; Therefore, the system only needs to know which movie the user watches, not the ratings data, which helps it work even when the user does not have the habit of rating movies. And of course, it only recommends movies with similar characteristics, not a variety of movies or movies that are highly rated by the movie-watching community.

In [7]:
tf = TfidfVectorizer(analyzer='word', stop_words='english')
tfidf_matrix = tf.fit_transform(df['Plot'])

In [8]:
tfidf_matrix.shape

(6548, 16163)

In [9]:
# def computeCosineSimilarity(word_matrix):
#     cosine_similarity = linear_kernel(word_matrix, word_matrix)
#     return cosine_similarity

In [10]:
def computePearsonCorrelation(word_matrix):
    return np.corrcoef(word_matrix)

In [11]:
indices = pd.Series(df.index, index=df['Title'])
indices

Title
Breaking Bad                   0
The Shawshank Redemption       1
Band of Brothers               2
Chernobyl                      3
Game of Thrones                4
                            ... 
Bounce                      6543
Mine                        6544
Drive Me Crazy              6545
Suspect Zero                6546
The Reef                    6547
Length: 6548, dtype: int64

In [12]:
def get_content_based_recommendations(movie_title, similarity_scores):
    # Fetch index of movie based on given title
    movie_idx = indices[movie_title]
    
    # Fetch similarity score of all movies with the given movie
    # Fetch it as a tuple of (index, score)
    similarity_scores = list(enumerate(similarity_scores[movie_idx]))
    
    # Sort the above score
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    
    # Pick index and score of 10 most similar movies
    # Skip the 0th index since it is same movie (itself)
    similarity_scores = similarity_scores[1:11]
    
    # Find the indices of these similar movies
    movie_similar_indices = [i[0] for i in similarity_scores]
    
    # Find title of these top movies and return
    return df.iloc[movie_similar_indices][['Title', 'imdbID', 'Year', 'Genre','imdbVotes', 'imdbRating', 'wr']].sort_values(by='wr', ascending=False)

In [13]:
similarity = cosine_similarity(tfidf_matrix)

In [14]:
get_content_based_recommendations('The Dark Knight', similarity)

Unnamed: 0,Title,imdbID,Year,Genre,imdbVotes,imdbRating,wr
207,Batman Begins,tt0372784,2005-01-01,"Action, Crime, Drama",1601743,8.2,8.16797
219,Kill Bill: Vol. 1,tt0266697,2003-01-01,"Action, Crime, Thriller",1214597,8.2,8.157936
562,The Batman,tt1877830,2022-01-01,"Action, Crime, Drama",828211,7.8,7.748807
697,Gotham,tt3749900,Unknown,"Action, Crime, Drama",243328,7.8,7.635636
716,The Penguin,tt15435876,2024-01-01,"Crime, Drama, Fantasy",33845,8.8,7.624992
1112,Batman,tt0096895,1989-01-01,"Action, Adventure",412616,7.5,7.41446
2178,Batman Returns,tt0103776,1992-01-01,"Action, Crime, Fantasy",332740,7.1,7.019165
5145,Batman: The Killing Joke,tt4853102,2016-01-01,"Animation, Action, Crime",62497,6.4,6.235893
6230,Becky,tt10314450,2020-01-01,"Action, Crime, Drama",25862,6.0,5.888669
6303,Domino,tt0421054,2005-01-01,"Action, Biography, Crime",69894,5.9,5.865767


In [15]:
tags_df = pd.read_csv('../data/processed/tags_processed.csv')

In [16]:
tags_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 724905 entries, 0 to 724904
Data columns (total 4 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   userId   724905 non-null  int64 
 1   movieId  724905 non-null  int64 
 2   tag      724905 non-null  object
 3   imdbId   724905 non-null  object
dtypes: int64(2), object(2)
memory usage: 22.1+ MB


In [17]:
tags_grouped = tags_df.groupby('imdbId')['tag'].apply(list).reset_index()
del tags_df
tags_grouped.rename(columns={"imdbId": "imdbID"}, inplace=True)
tags_grouped

Unnamed: 0,imdbID,tag
0,tt0000417,"[classic, experimental, sci-fi, black and whit..."
1,tt0000439,"[less than 300 ratings, not available from Net..."
2,tt0004972,"[racism, Racist, Racist History, racist, contr..."
3,tt0010323,"[german expressionism, psychology, serial kill..."
4,tt0012349,"[Tumey's DVDs, charity, orphan, poverty, silen..."
...,...,...
5766,tt9806192,"[animation, beautiful, cinematography, dreamli..."
5767,tt9812474,"[grief, atmospheric, fantasy, A24, atmospheric..."
5768,tt9845564,"[convoy, rapist, revenge, trapped inside, winter]"
5769,tt9873892,"[cheesy, cliche, Black, Blaxploitation, satire]"


In [18]:
df = df.merge(tags_grouped, on='imdbID', how='left')
df.head(1)

Unnamed: 0,Title,Year,Rated,Runtime,Genre,Director,Writer,Actors,Plot,Language,...,Awards,Poster,Ratings,imdbRating,imdbVotes,imdbID,BoxOffice,Status,wr,tag
0,Breaking Bad,Unknown,TV-MA,49 min,"Crime, Drama, Thriller",Unknown,Vince Gilligan,"Bryan Cranston, Aaron Paul, Anna Gunn",A chemistry teacher diagnosed with inoperable ...,"English, Spanish",...,Won 16 Primetime Emmys. 169 wins & 269 nominat...,https://m.media-amazon.com/images/M/MV5BMzU5ZG...,"[{'Source': 'Internet Movie Database', 'Value'...",9.5,2225876,tt0903747,Unknown,Finished,9.464575,


In [19]:
tag_counts = df.apply(lambda x: pd.Series(x['tag']),axis=1).stack().reset_index(level=1, drop=True)
tag_counts.name = 'tag'

In [20]:
tag_counts = tag_counts.value_counts()
tag_counts[:5]

tag
sci-fi         4621
atmospheric    4167
action         3804
funny          3314
comedy         3201
Name: count, dtype: int64

In [21]:
tag_counts = tag_counts[tag_counts > 1]

In [22]:
def filter_tags(x):
    words = []
    for i in x:
        if i in tag_counts:
            words.append(i)
    return words

In [23]:
df["tag"] = df["tag"].fillna('')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6548 entries, 0 to 6547
Data columns (total 21 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Title       6548 non-null   object 
 1   Year        6548 non-null   object 
 2   Rated       6548 non-null   object 
 3   Runtime     6548 non-null   object 
 4   Genre       6548 non-null   object 
 5   Director    6548 non-null   object 
 6   Writer      6548 non-null   object 
 7   Actors      6548 non-null   object 
 8   Plot        6548 non-null   object 
 9   Language    6548 non-null   object 
 10  Country     6548 non-null   object 
 11  Awards      6548 non-null   object 
 12  Poster      6548 non-null   object 
 13  Ratings     6548 non-null   object 
 14  imdbRating  6548 non-null   float64
 15  imdbVotes   6548 non-null   int64  
 16  imdbID      6548 non-null   object 
 17  BoxOffice   6548 non-null   object 
 18  Status      6548 non-null   object 
 19  wr          6548 non-null  

In [24]:
reco_features = ['Title', 'Director', 'Actors', 'tag', 'Genre']

In [25]:
def cleanUpData(data):
    if isinstance(data, list):
        return [str.lower(val.replace(" ", "")) for val in data]
    else:
        #Check if director exists. If not, return empty string
        if isinstance(data, str):
            return str.lower(data.replace(" ", ""))
        else:
            return ''

In [26]:
# Apply data cleanup to reco features
modified_features = ['Director', 'Actors', 'tag', 'Genre']

for feature in modified_features:
    df[feature] = df[feature].apply(cleanUpData)
    
df[reco_features].head(5)

Unnamed: 0,Title,Director,Actors,tag,Genre
0,Breaking Bad,unknown,"bryancranston,aaronpaul,annagunn",,"crime,drama,thriller"
1,The Shawshank Redemption,frankdarabont,"timrobbins,morganfreeman,bobgunton","[basedonabook, morganfreeman, twistending, fri...",drama
2,Band of Brothers,unknown,"scottgrimes,damianlewis,ronlivingston","[accurate, gritty, war, worldwarii, notamovie,...","drama,history,war"
3,Chernobyl,unknown,"jessiebuckley,jaredharris,stellanskarsgård",,"drama,history,thriller"
4,Game of Thrones,unknown,"emiliaclarke,peterdinklage,kitharington",,"action,adventure,drama"


In [27]:
# Chuyển đổi các cột thành danh sách
df['Director'] = df['Director'].apply(lambda x: x.split(',') if isinstance(x, str) else [])
df['Actors'] = df['Actors'].apply(lambda x: x.split(',') if isinstance(x, str) else [])
df['Genre'] = df['Genre'].apply(lambda x: x.split(',') if isinstance(x, str) else [])
df[reco_features].head(1)

Unnamed: 0,Title,Director,Actors,tag,Genre
0,Breaking Bad,[unknown],"[bryancranston, aaronpaul, annagunn]",,"[crime, drama, thriller]"


In [28]:
def createSoup(data):
    # Loại bỏ trùng lặp trong mỗi danh sách trước khi nối
    director = ' '.join(data['Director'])
    actors = ' '.join(data['Actors'])
    tag = ' '.join(set(data['tag']))  # Dùng set để loại bỏ trùng lặp
    genre = ' '.join(data['Genre'])
    
    # Ghép tất cả lại thành chuỗi duy nhất
    return f"{director} {actors} {tag} {genre}"

createSoup(df.iloc[0,:])

'unknown bryancranston aaronpaul annagunn  crime drama thriller'

In [29]:
# Create a new feature Soup with mixed data
df['soup'] = df.apply(createSoup, axis=1)

reco_features = ['Title', 'Director', 'Actors', 'tag', 'Genre', 'soup']
df[reco_features].head(5)

Unnamed: 0,Title,Director,Actors,tag,Genre,soup
0,Breaking Bad,[unknown],"[bryancranston, aaronpaul, annagunn]",,"[crime, drama, thriller]",unknown bryancranston aaronpaul annagunn crim...
1,The Shawshank Redemption,[frankdarabont],"[timrobbins, morganfreeman, bobgunton]","[basedonabook, morganfreeman, twistending, fri...",[drama],frankdarabont timrobbins morganfreeman bobgunt...
2,Band of Brothers,[unknown],"[scottgrimes, damianlewis, ronlivingston]","[accurate, gritty, war, worldwarii, notamovie,...","[drama, history, war]",unknown scottgrimes damianlewis ronlivingston ...
3,Chernobyl,[unknown],"[jessiebuckley, jaredharris, stellanskarsgård]",,"[drama, history, thriller]",unknown jessiebuckley jaredharris stellanskars...
4,Game of Thrones,[unknown],"[emiliaclarke, peterdinklage, kitharington]",,"[action, adventure, drama]",unknown emiliaclarke peterdinklage kitharingto...


In [30]:
# Define a CountVectorizer Object
cntVec = CountVectorizer(stop_words='english')

# Remove NaN from soup with empty strings
df['soup'] = df['soup'].fillna('')

# Construct CountVectorizer matrix by fitting and transforming the data
cntVec_matrix = cntVec.fit_transform(df['soup'])

print("Shape of CountVectorizer matrix =", cntVec_matrix.shape)

# Topmost frequently occuring words
words = cntVec.get_feature_names_out()
counts = cntVec_matrix.sum(axis=0).reshape(-1,1).tolist()
print("Most frequently occuring words in plot overview:")
word_count = dict(sorted(zip(words, counts), key=lambda x : x[1], reverse=True)[:20])
print(word_count)

Shape of CountVectorizer matrix = (6548, 73221)
Most frequently occuring words in plot overview:
{'drama': [4338], 'comedy': [2963], 'action': [2336], 'crime': [1767], 'romance': [1556], 'adventure': [1545], 'thriller': [1347], 'nudity': [1113], 'mystery': [990], 'horror': [897], 'fi': [872], 'unknown': [857], 'sci': [856], 'murder': [853], 'violence': [801], 'fantasy': [753], 'funny': [704], 'animation': [701], 'sdvds': [678], 'biography': [677]}


In [39]:
# Find recommendations based on Cosine Similarity
similarity = cosine_similarity(cntVec_matrix)
get_content_based_recommendations('Spectre', similarity)

Unnamed: 0,Title,imdbID,Year,Genre,imdbVotes,imdbRating,wr
569,Skyfall,tt1074638,2012-01-01,"[action, adventure, thriller]",739511,7.8,7.742838
756,Mission: Impossible - Fallout,tt4912910,2018-01-01,"[action, adventure, thriller]",385191,7.7,7.598233
1329,Mission: Impossible - Rogue Nation,tt2381249,2015-01-01,"[action, adventure, thriller]",416465,7.4,7.320066
1540,No Time to Die,tt2382320,2021-01-01,"[action, adventure, thriller]",453414,7.3,7.230764
2774,Extraction II,tt12263384,2023-01-01,"[action, crime, thriller]",155496,7.0,6.85012
3964,Jason Bourne,tt4196776,2016-01-01,"[action, thriller]",245974,6.6,6.53267
4685,Plane,tt5884796,2023-01-01,"[action, adventure, thriller]",84481,6.5,6.349922
5224,War,tt7430722,2019-01-01,"[action, adventure, thriller]",34154,6.5,6.213585
5817,Copshop,tt5748448,2021-01-01,"[action, thriller]",40435,6.2,6.046084
6268,Power Rangers,tt3717490,2017-01-01,"[action, adventure, fantasy]",115015,5.9,5.877103


# 3. Item-based Collaborative Filtering

- Based on user's ratings, we create an interaction matrix between user rating and movies, which then we can use to compute correlation between movies,  
 then we filter out 10 movies with highest correlation coefficient.

In [40]:
df_ratings = pd.read_csv('../data/processed/ratings_processed.csv')
df_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21201140 entries, 0 to 21201139
Data columns (total 3 columns):
 #   Column  Dtype  
---  ------  -----  
 0   userId  int64  
 1   rating  float64
 2   imdbID  object 
dtypes: float64(1), int64(1), object(1)
memory usage: 485.3+ MB


In [41]:
df_ratings = df_ratings.merge(df[['imdbID', 'Title']], on= 'imdbID', how= 'inner')

df_ratings.head()

Unnamed: 0,userId,rating,imdbID,Title
0,1,8.0,tt0114388,Sense and Sensibility
1,3,10.0,tt0114388,Sense and Sensibility
2,15,9.0,tt0114388,Sense and Sensibility
3,28,8.0,tt0114388,Sense and Sensibility
4,29,8.0,tt0114388,Sense and Sensibility


In [42]:
# Create User-Item interaction matrix
matrix = df_ratings.pivot_table(index='userId', columns='Title', values='rating')

# Free memory
del df_ratings

matrix.head()

Title,10 Cloverfield Lane,10 Things I Hate About You,12 Angry Men,12 Monkeys,12 Years a Slave,127 Hours,13 Going on 30,1408,1917,2 Fast 2 Furious,...,You've Got Mail,Young Frankenstein,Young Guns,Zero Dark Thirty,Zodiac,Zombieland,Zoolander,Zootopia,eXistenZ,xXx
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,10.0,10.0,,,,,,,...,2.0,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,4.0,
5,,,,,,,,,,,...,,,,,,,,,,


In [43]:
def get_collaborative_filtering_recommendations(movie):
    
    # Fetch ratings for movie
    movie_user_rating = matrix[movie]

    # Find correlation between movies
    similar_to_movie= matrix.corrwith(movie_user_rating)

    # Getting correlated movies
    corr_movies = pd.DataFrame(similar_to_movie, columns=['Correlation'])
    corr_movies = corr_movies.sort_values(by='Correlation', ascending=False)
    
    corr_movies_indeces = corr_movies[1:11].index
    
    return df[df['Title'].isin(corr_movies_indeces)][['Title', 'imdbID', 'Year', 'Genre','imdbVotes', 'imdbRating', 'wr']].sort_values(by='wr', ascending=False)

In [44]:
get_collaborative_filtering_recommendations('Spectre')

Unnamed: 0,Title,imdbID,Year,Genre,imdbVotes,imdbRating,wr
386,Casino Royale,tt0381061,2006-01-01,"[action, adventure, thriller]",702997,8.0,7.934089
569,Skyfall,tt1074638,2012-01-01,"[action, adventure, thriller]",739511,7.8,7.742838
1329,Mission: Impossible - Rogue Nation,tt2381249,2015-01-01,"[action, adventure, thriller]",416465,7.4,7.320066
2813,Mission: Impossible III,tt0317919,2006-01-01,"[action, adventure, thriller]",395751,6.9,6.841569
3856,Live and Let Die,tt0070328,1973-01-01,"[action, adventure, thriller]",116799,6.7,6.554277
4233,Quantum of Solace,tt0830515,2008-01-01,"[action, adventure, mystery]",475392,6.5,6.468051
4389,Tomorrow Never Dies,tt0120347,1997-01-01,"[action, adventure, thriller]",206514,6.5,6.430333
4557,Diamonds Are Forever,tt0066995,1971-01-01,"[action, adventure, thriller]",115496,6.5,6.383962
4720,The World Is Not Enough,tt0143145,1999-01-01,"[action, adventure, thriller]",211725,6.4,6.341011
5713,Die Another Day,tt0246460,2002-01-01,"[action, adventure, thriller]",231107,6.1,6.070799


# 4. Weighted-Mixed Hybrid

- Weighted means movies are ranked based on weighted average score.
- Mixed means we take both recommendations lists as the result.

In [46]:
def get_hybrid_recommendations(movie):
    content_based_recommends = get_content_based_recommendations(movie, similarity)
    collaborative_filtering_recommends = get_collaborative_filtering_recommendations(movie)
    
    # Combine 2 recommendations lists
    hybrid_recommends = pd.concat([content_based_recommends, collaborative_filtering_recommends], ignore_index=True) \
                            .sort_values(by='wr', ascending=False) \
                            .drop_duplicates(['Title'],ignore_index= True)
    
    return hybrid_recommends

In [47]:
get_hybrid_recommendations('Spectre')

Unnamed: 0,Title,imdbID,Year,Genre,imdbVotes,imdbRating,wr
0,Casino Royale,tt0381061,2006-01-01,"[action, adventure, thriller]",702997,8.0,7.934089
1,Skyfall,tt1074638,2012-01-01,"[action, adventure, thriller]",739511,7.8,7.742838
2,Mission: Impossible - Fallout,tt4912910,2018-01-01,"[action, adventure, thriller]",385191,7.7,7.598233
3,Mission: Impossible - Rogue Nation,tt2381249,2015-01-01,"[action, adventure, thriller]",416465,7.4,7.320066
4,No Time to Die,tt2382320,2021-01-01,"[action, adventure, thriller]",453414,7.3,7.230764
5,Extraction II,tt12263384,2023-01-01,"[action, crime, thriller]",155496,7.0,6.85012
6,Mission: Impossible III,tt0317919,2006-01-01,"[action, adventure, thriller]",395751,6.9,6.841569
7,Live and Let Die,tt0070328,1973-01-01,"[action, adventure, thriller]",116799,6.7,6.554277
8,Jason Bourne,tt4196776,2016-01-01,"[action, thriller]",245974,6.6,6.53267
9,Quantum of Solace,tt0830515,2008-01-01,"[action, adventure, mystery]",475392,6.5,6.468051


# 5. Evaluation Metric

- The code below is a metric that Netflix and Amazon uses for their recommendation system evaluation.
- However, it is implemented for user-based recommender rather than item-based recommender because it requires user-oriented recommendations for the evaluation  
to be correct, not general opinions such as the result of item-based recommender.

In [None]:
# # Number of users to sample
# user_count = 10

# # Hit count, which represents accurate recommend attempts
# hit = 0

# for i in range(1, user_count + 1):
#     # Get user's unrated and rated movies
#     user_nan = matrix.columns[matrix[i-1:i].isna().any()].tolist()
#     user_rated = matrix.columns[matrix[i-1:i].notna().any()].tolist()
    
#     # Mixed 99 user-unrated sample with 1 user-rated sample
#     random_unrated = random.sample(user_nan, 99)
#     random_rated = random.sample(user_rated, 1)
#     test_titles = random_unrated + random_rated
    
#     # Run recommendation system on all of the sample title
#     all_recommends = pd.DataFrame()
#     for title in test_titles:
#         recommends = get_collaborative_filtering_recommendations(title)
#         all_recommends = pd.concat([all_recommends, recommends])

#     # Count occurences
#     counted_recommend = all_recommends.value_counts().reset_index(name='Counts')
    
#     # Get top 10 occurences in recommended list
#     top_10_occur = counted_recommend[0:10]
    
#     if test_titles[0] in top_10_occur['Title']:
#         hit += 1

# # Calculated hit ratio
# hit_ratio = hit / user_count
# print('Accuracy: ', hit_ratio)