In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
df = pd.read_csv("../data/processed/movie_data_processed.csv", index_col= False)
df["Year"] = df["Year"].fillna("Unknown")
df.head(1)

Unnamed: 0,Title,Year,Rated,Runtime,Genre,Director,Writer,Actors,Plot,Language,Country,Awards,Poster,Ratings,imdbRating,imdbVotes,imdbID,BoxOffice,wr
0,Breaking Bad,Unknown,TV-MA,49 min,"Crime, Drama, Thriller",Unknown,Vince Gilligan,"Bryan Cranston, Aaron Paul, Anna Gunn",A chemistry teacher diagnosed with inoperable ...,"English, Spanish",United States,Won 16 Primetime Emmys. 169 wins & 269 nominat...,https://m.media-amazon.com/images/M/MV5BMzU5ZG...,"[{'Source': 'Internet Movie Database', 'Value'...",9.5,2225876,tt0903747,Unknown,9.465909


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7675 entries, 0 to 7674
Data columns (total 19 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Title       7675 non-null   object 
 1   Year        7675 non-null   object 
 2   Rated       7675 non-null   object 
 3   Runtime     7675 non-null   object 
 4   Genre       7675 non-null   object 
 5   Director    7675 non-null   object 
 6   Writer      7675 non-null   object 
 7   Actors      7675 non-null   object 
 8   Plot        7675 non-null   object 
 9   Language    7675 non-null   object 
 10  Country     7675 non-null   object 
 11  Awards      7675 non-null   object 
 12  Poster      7675 non-null   object 
 13  Ratings     7675 non-null   object 
 14  imdbRating  7675 non-null   float64
 15  imdbVotes   7675 non-null   int64  
 16  imdbID      7675 non-null   object 
 17  BoxOffice   7675 non-null   object 
 18  wr          7675 non-null   float64
dtypes: float64(2), int64(1), ob

# 1. Simple Recommender

> Đưa ra các đề xuất tổng quát cho mọi người dùng, dựa trên mức độ phổ biến và/hoặc thể loại phim. Ý tưởng cơ bản đằng sau hệ thống này là những bộ phim nổi tiếng hơn và được giới phê bình đánh giá cao hơn sẽ có xác suất được khán giả bình thường thích cao hơn. Một ví dụ có thể là IMDB Top 250

In [4]:
def build_chart(df, genre, percentile=0.85):
    # Lọc các dòng chứa genre
    filtered_df = df[df['Genre'].str.contains(genre, na=False)]
    
    # Chuyển các cột cần thiết về kiểu số
    vote_counts = filtered_df[filtered_df['imdbVotes'].notnull()]['imdbVotes'].astype('int')
    vote_averages = filtered_df[filtered_df['imdbRating'].notnull()]['imdbRating'].astype('float')
    
    # Tính giá trị trung bình và ngưỡng
    C = vote_averages.mean()
    m = vote_counts.quantile(percentile)
    
    # Chọn các dòng đủ điều kiện
    qualified = filtered_df[
        (filtered_df['imdbVotes'] >= m) & 
        (filtered_df['imdbVotes'].notnull()) & 
        (filtered_df['imdbRating'].notnull())
    ][['Title', 'Year', 'Genre','imdbVotes', 'imdbRating']]
    
    # Chuyển các giá trị cần thiết về kiểu số
    qualified['imdbVotes'] = qualified['imdbVotes'].astype('int')
    qualified['imdbRating'] = qualified['imdbRating'].astype('float')
    
    # Tính trọng số (weighted rating)
    qualified['wr'] = qualified.apply(
        lambda x: (x['imdbVotes'] / (x['imdbVotes'] + m) * x['imdbRating']) + 
                  (m / (m + x['imdbVotes']) * C), 
        axis=1
    )
    
    # Sắp xếp theo trọng số và lấy top 250
    qualified = qualified.sort_values('wr', ascending=False).head(250)
    
    return qualified


In [5]:
result = build_chart(df, 'Drama', percentile=0.85)
result.head(10)

Unnamed: 0,Title,Year,Genre,imdbVotes,imdbRating,wr
0,Breaking Bad,Unknown,"Crime, Drama, Thriller",2225876,9.5,9.315011
1,The Shawshank Redemption,1994-01-01,Drama,2945396,9.3,9.169902
7,Game of Thrones,Unknown,"Action, Adventure, Drama",2359163,9.2,9.047768
8,The Godfather,1972-01-01,"Crime, Drama",2048917,9.2,9.026738
6,Chernobyl,2019-01-01,"Drama, History, Thriller",902682,9.3,8.928219
22,The Dark Knight,2008-01-01,"Action, Crime, Drama",2919777,9.0,8.887786
23,The Lord of the Rings: The Return of the King,2003-01-01,"Action, Adventure, Drama",2015572,9.0,8.841926
2,Band of Brothers,2001-01-01,"Drama, History, War",542582,9.4,8.820549
3,Band of Brothers,2001-01-01,"Drama, History, War",542582,9.4,8.820549
4,Band of Brothers,2001-01-01,"Drama, History, War",542582,9.4,8.820549


In [6]:
result = build_chart(df, 'Action', percentile=0.85)
result.head(10)

Unnamed: 0,Title,Year,Genre,imdbVotes,imdbRating,wr
7,Game of Thrones,Unknown,"Action, Adventure, Drama",2359163,9.2,8.896898
22,The Dark Knight,2008-01-01,"Action, Crime, Drama",2919777,9.0,8.771123
23,The Lord of the Rings: The Return of the King,2003-01-01,"Action, Adventure, Drama",2015572,9.0,8.684446
30,The Lord of the Rings: The Fellowship of the Ring,2001-01-01,"Action, Adventure, Drama",2045291,8.9,8.603717
36,Inception,2010-01-01,"Action, Adventure, Sci-Fi",2592712,8.8,8.570973
41,The Lord of the Rings: The Two Towers,2002-01-01,"Action, Adventure, Drama",1819421,8.8,8.490158
42,The Lord of the Rings: The Two Towers,2002-01-01,"Action, Adventure, Drama",1819421,8.8,8.490158
53,The Matrix,1999-01-01,"Action, Sci-Fi",2087394,8.7,8.439045
54,Star Wars: Episode V - The Empire Strikes Back,1980-01-01,"Action, Adventure, Fantasy",1400371,8.7,8.337895
71,Star Wars: Episode IV - A New Hope,1977-01-01,"Action, Adventure, Fantasy",1474225,8.6,8.272532


# 2. Content based

In [7]:
tf = TfidfVectorizer(analyzer='word', stop_words='english')
tfidf_matrix = tf.fit_transform(df['Plot'])

In [8]:
tfidf_matrix.shape

(7675, 16473)

In [9]:
# def computeCosineSimilarity(word_matrix):
#     cosine_similarity = linear_kernel(word_matrix, word_matrix)
#     return cosine_similarity

In [10]:
def computePearsonCorrelation(word_matrix):
    return np.corrcoef(word_matrix)

In [11]:
indices = pd.Series(df.index, index=df['Title'])
indices

Title
Breaking Bad                   0
The Shawshank Redemption       1
Band of Brothers               2
Band of Brothers               3
Band of Brothers               4
                            ... 
The Reef                    7670
Silent Night                7671
Absentia                    7672
Absentia                    7673
Life of Crime               7674
Length: 7675, dtype: int64

In [12]:
# def get_content_based_recommendations(title):
#     idx = indices[title]
#     sim_scores = list(enumerate(cosine_sim[idx]))
#     sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
#     sim_scores = sim_scores[1:11]
#     movie_indices = [i[0] for i in sim_scores]
#     return df.iloc[movie_indices][['Title', 'imdbID', 'Year', 'Genre','imdbVotes', 'imdbRating', 'wr']].sort_values(by='wr', ascending=False)

In [13]:
def get_content_based_recommendations(movie_title, similarity_scores):
    # Fetch index of movie based on given title
    movie_idx = indices[movie_title]
    
    # Fetch similarity score of all movies with the given movie
    # Fetch it as a tuple of (index, score)
    similarity_scores = list(enumerate(similarity_scores[movie_idx]))
    
    # Sort the above score
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    
    # Pick index and score of 10 most similar movies
    # Skip the 0th index since it is same movie (itself)
    similarity_scores = similarity_scores[1:11]
    
    # Find the indices of these similar movies
    movie_similar_indices = [i[0] for i in similarity_scores]
    
    # Find title of these top movies and return
    return df.iloc[movie_similar_indices][['Title', 'imdbID', 'Year', 'Genre','imdbVotes', 'imdbRating', 'wr']].sort_values(by='wr', ascending=False)

In [14]:
similarity = cosine_similarity(tfidf_matrix)

In [15]:
get_content_based_recommendations('The Dark Knight', similarity)

Unnamed: 0,Title,imdbID,Year,Genre,imdbVotes,imdbRating,wr
250,Batman Begins,tt0372784,2005-01-01,"Action, Crime, Drama",1601743,8.2,8.169103
637,The Batman,tt1877830,2022-01-01,"Action, Crime, Drama",828211,7.8,7.750532
638,The Batman,tt1877830,2022-01-01,"Action, Crime, Drama",815820,7.8,7.749799
790,Gotham,tt3749900,Unknown,"Action, Crime, Drama",243328,7.8,7.640803
1269,Batman,tt0096895,1989-01-01,"Action, Adventure",412616,7.5,7.41715
1271,Batman,tt0096895,1989-01-01,"Action, Adventure",411015,7.5,7.416842
2468,Batman Returns,tt0103776,1992-01-01,"Action, Crime, Fantasy",332740,7.1,7.021463
5922,Batman: The Killing Joke,tt4853102,2016-01-01,"Animation, Action, Crime",62497,6.4,6.237243
7273,Becky,tt10314450,2020-01-01,"Action, Crime, Drama",25862,6.0,5.884377
7368,Domino,tt0421054,2005-01-01,"Action, Biography, Crime",69894,5.9,5.863342


In [16]:
tags_df = pd.read_csv('../data/processed/tags_processed.csv')

In [17]:
tags_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 724905 entries, 0 to 724904
Data columns (total 4 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   userId   724905 non-null  int64 
 1   movieId  724905 non-null  int64 
 2   tag      724905 non-null  object
 3   imdbId   724905 non-null  object
dtypes: int64(2), object(2)
memory usage: 22.1+ MB


In [18]:
tags_grouped = tags_df.groupby('imdbId')['tag'].apply(list).reset_index()
del tags_df
tags_grouped.rename(columns={"imdbId": "imdbID"}, inplace=True)
tags_grouped

Unnamed: 0,imdbID,tag
0,tt0000417,"[classic, experimental, sci-fi, black and whit..."
1,tt0000439,"[less than 300 ratings, not available from Net..."
2,tt0004972,"[racism, Racist, Racist History, racist, contr..."
3,tt0010323,"[german expressionism, psychology, serial kill..."
4,tt0012349,"[Tumey's DVDs, charity, orphan, poverty, silen..."
...,...,...
5766,tt9806192,"[animation, beautiful, cinematography, dreamli..."
5767,tt9812474,"[grief, atmospheric, fantasy, A24, atmospheric..."
5768,tt9845564,"[convoy, rapist, revenge, trapped inside, winter]"
5769,tt9873892,"[cheesy, cliche, Black, Blaxploitation, satire]"


In [19]:
df = df.merge(tags_grouped, on='imdbID', how='left')
df.head(1)

Unnamed: 0,Title,Year,Rated,Runtime,Genre,Director,Writer,Actors,Plot,Language,Country,Awards,Poster,Ratings,imdbRating,imdbVotes,imdbID,BoxOffice,wr,tag
0,Breaking Bad,Unknown,TV-MA,49 min,"Crime, Drama, Thriller",Unknown,Vince Gilligan,"Bryan Cranston, Aaron Paul, Anna Gunn",A chemistry teacher diagnosed with inoperable ...,"English, Spanish",United States,Won 16 Primetime Emmys. 169 wins & 269 nominat...,https://m.media-amazon.com/images/M/MV5BMzU5ZG...,"[{'Source': 'Internet Movie Database', 'Value'...",9.5,2225876,tt0903747,Unknown,9.465909,


In [20]:
tag_counts = df.apply(lambda x: pd.Series(x['tag']),axis=1).stack().reset_index(level=1, drop=True)
tag_counts.name = 'tag'

In [21]:
tag_counts = tag_counts.value_counts()
tag_counts[:5]

tag
sci-fi                5365
atmospheric           4816
action                4589
funny                 3698
visually appealing    3560
Name: count, dtype: int64

In [22]:
tag_counts = tag_counts[tag_counts > 1]

In [23]:
def filter_tags(x):
    words = []
    for i in x:
        if i in tag_counts:
            words.append(i)
    return words

In [24]:
df["tag"] = df["tag"].fillna('')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7675 entries, 0 to 7674
Data columns (total 20 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Title       7675 non-null   object 
 1   Year        7675 non-null   object 
 2   Rated       7675 non-null   object 
 3   Runtime     7675 non-null   object 
 4   Genre       7675 non-null   object 
 5   Director    7675 non-null   object 
 6   Writer      7675 non-null   object 
 7   Actors      7675 non-null   object 
 8   Plot        7675 non-null   object 
 9   Language    7675 non-null   object 
 10  Country     7675 non-null   object 
 11  Awards      7675 non-null   object 
 12  Poster      7675 non-null   object 
 13  Ratings     7675 non-null   object 
 14  imdbRating  7675 non-null   float64
 15  imdbVotes   7675 non-null   int64  
 16  imdbID      7675 non-null   object 
 17  BoxOffice   7675 non-null   object 
 18  wr          7675 non-null   float64
 19  tag         7675 non-null  

In [25]:
reco_features = ['Title', 'Director', 'Actors', 'tag', 'Genre']

In [26]:
def cleanUpData(data):
    if isinstance(data, list):
        return [str.lower(val.replace(" ", "")) for val in data]
    else:
        #Check if director exists. If not, return empty string
        if isinstance(data, str):
            return str.lower(data.replace(" ", ""))
        else:
            return ''

In [27]:
# Apply data cleanup to reco features
modified_features = ['Director', 'Actors', 'tag', 'Genre']

for feature in modified_features:
    df[feature] = df[feature].apply(cleanUpData)
    
df[reco_features].head(5)

Unnamed: 0,Title,Director,Actors,tag,Genre
0,Breaking Bad,unknown,"bryancranston,aaronpaul,annagunn",,"crime,drama,thriller"
1,The Shawshank Redemption,frankdarabont,"timrobbins,morganfreeman,bobgunton","[basedonabook, morganfreeman, twistending, fri...",drama
2,Band of Brothers,unknown,"scottgrimes,damianlewis,ronlivingston","[accurate, gritty, war, worldwarii, notamovie,...","drama,history,war"
3,Band of Brothers,unknown,"scottgrimes,damianlewis,ronlivingston","[accurate, gritty, war, worldwarii, notamovie,...","drama,history,war"
4,Band of Brothers,unknown,"scottgrimes,damianlewis,ronlivingston","[accurate, gritty, war, worldwarii, notamovie,...","drama,history,war"


In [28]:
# Chuyển đổi các cột thành danh sách
df['Director'] = df['Director'].apply(lambda x: x.split(',') if isinstance(x, str) else [])
df['Actors'] = df['Actors'].apply(lambda x: x.split(',') if isinstance(x, str) else [])
df['Genre'] = df['Genre'].apply(lambda x: x.split(',') if isinstance(x, str) else [])
df[reco_features].head(1)

Unnamed: 0,Title,Director,Actors,tag,Genre
0,Breaking Bad,[unknown],"[bryancranston, aaronpaul, annagunn]",,"[crime, drama, thriller]"


In [29]:
def createSoup(data):
    # Loại bỏ trùng lặp trong mỗi danh sách trước khi nối
    director = ' '.join(data['Director'])
    actors = ' '.join(data['Actors'])
    tag = ' '.join(set(data['tag']))  # Dùng set để loại bỏ trùng lặp
    genre = ' '.join(data['Genre'])
    
    # Ghép tất cả lại thành chuỗi duy nhất
    return f"{director} {actors} {tag} {genre}"

createSoup(df.iloc[0,:])

'unknown bryancranston aaronpaul annagunn  crime drama thriller'

In [30]:
# Create a new feature Soup with mixed data
df['soup'] = df.apply(createSoup, axis=1)

reco_features = ['Title', 'Director', 'Actors', 'tag', 'Genre', 'soup']
df[reco_features].head(5)

Unnamed: 0,Title,Director,Actors,tag,Genre,soup
0,Breaking Bad,[unknown],"[bryancranston, aaronpaul, annagunn]",,"[crime, drama, thriller]",unknown bryancranston aaronpaul annagunn crim...
1,The Shawshank Redemption,[frankdarabont],"[timrobbins, morganfreeman, bobgunton]","[basedonabook, morganfreeman, twistending, fri...",[drama],frankdarabont timrobbins morganfreeman bobgunt...
2,Band of Brothers,[unknown],"[scottgrimes, damianlewis, ronlivingston]","[accurate, gritty, war, worldwarii, notamovie,...","[drama, history, war]",unknown scottgrimes damianlewis ronlivingston ...
3,Band of Brothers,[unknown],"[scottgrimes, damianlewis, ronlivingston]","[accurate, gritty, war, worldwarii, notamovie,...","[drama, history, war]",unknown scottgrimes damianlewis ronlivingston ...
4,Band of Brothers,[unknown],"[scottgrimes, damianlewis, ronlivingston]","[accurate, gritty, war, worldwarii, notamovie,...","[drama, history, war]",unknown scottgrimes damianlewis ronlivingston ...


In [31]:
# Define a CountVectorizer Object
from sklearn.feature_extraction.text import CountVectorizer
cntVec = CountVectorizer(stop_words='english')

# Remove NaN from soup with empty strings
df['soup'] = df['soup'].fillna('')

# Construct CountVectorizer matrix by fitting and transforming the data
cntVec_matrix = cntVec.fit_transform(df['soup'])

print("Shape of CountVectorizer matrix =", cntVec_matrix.shape)

# Topmost frequently occuring words
words = cntVec.get_feature_names_out()
counts = cntVec_matrix.sum(axis=0).reshape(-1,1).tolist()
print("Most frequently occuring words in plot overview:")
word_count = dict(sorted(zip(words, counts), key=lambda x : x[1], reverse=True)[:20])
print(word_count)

Shape of CountVectorizer matrix = (7675, 74721)
Most frequently occuring words in plot overview:
{'drama': [5056], 'comedy': [3367], 'action': [2799], 'crime': [2093], 'adventure': [1821], 'romance': [1753], 'thriller': [1614], 'nudity': [1238], 'mystery': [1171], 'horror': [1093], 'fi': [1018], 'murder': [1012], 'sci': [988], 'unknown': [976], 'violence': [943], 'fantasy': [887], 'animation': [818], 'funny': [800], 'basedonabook': [792], 'death': [767]}


In [32]:
# Find recommendations based on Cosine Similarity
similarity = cosine_similarity(cntVec_matrix)
get_content_based_recommendations('Spectre', similarity)

Unnamed: 0,Title,imdbID,Year,Genre,imdbVotes,imdbRating,wr
647,Skyfall,tt1074638,2012-01-01,"[action, adventure, thriller]",739511,7.8,7.744757
859,Mission: Impossible - Fallout,tt4912910,2018-01-01,"[action, adventure, thriller]",385191,7.7,7.60151
1512,Mission: Impossible - Rogue Nation,tt2381249,2015-01-01,"[action, adventure, thriller]",416465,7.4,7.322539
1768,No Time to Die,tt2382320,2021-01-01,"[action, adventure, thriller]",453414,7.3,7.232876
3154,Extraction II,tt12263384,2023-01-01,"[action, crime, thriller]",155496,7.0,6.853877
4548,Jason Bourne,tt4196776,2016-01-01,"[action, thriller]",245974,6.6,6.534086
5393,Plane,tt5884796,2023-01-01,"[action, adventure, thriller]",84481,6.5,6.351961
5394,Plane,tt5884796,2023-01-01,"[action, adventure, thriller]",84481,6.5,6.351961
6759,Copshop,tt5748448,2021-01-01,"[action, thriller]",40435,6.2,6.045134
7308,Power Rangers,tt3717490,2017-01-01,"[action, adventure, fantasy]",115015,5.9,5.875561


# 3. Item-based Collaborative Filtering

In [33]:
df_ratings = pd.read_csv('../data/processed/ratings_processed.csv')
df_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21201140 entries, 0 to 21201139
Data columns (total 3 columns):
 #   Column  Dtype  
---  ------  -----  
 0   userId  int64  
 1   rating  float64
 2   imdbID  object 
dtypes: float64(1), int64(1), object(1)
memory usage: 485.3+ MB


In [34]:
df_movies_ratings = pd.merge(df_ratings, df[['imdbID','Title', 'Year', 'Genre','imdbVotes', 'imdbRating', 'wr']], on= 'imdbID', how= 'inner')
df_movies_ratings

Unnamed: 0,userId,rating,imdbID,Title,Year,Genre,imdbVotes,imdbRating,wr
0,1,8.0,tt0114388,Sense and Sensibility,1995-01-01,"[drama, romance]",128329,7.7,7.431359
1,1,8.0,tt0114388,Sense and Sensibility,1995-01-01,"[drama, romance]",127437,7.7,7.429738
2,3,10.0,tt0114388,Sense and Sensibility,1995-01-01,"[drama, romance]",128329,7.7,7.431359
3,3,10.0,tt0114388,Sense and Sensibility,1995-01-01,"[drama, romance]",127437,7.7,7.429738
4,15,9.0,tt0114388,Sense and Sensibility,1995-01-01,"[drama, romance]",128329,7.7,7.431359
...,...,...,...,...,...,...,...,...,...
23573574,200778,3.0,tt0387808,Idiocracy,2006-01-01,"[adventure, comedy, sci-fi]",189394,6.5,6.426009
23573575,200825,6.0,tt0387808,Idiocracy,2006-01-01,"[adventure, comedy, sci-fi]",189394,6.5,6.426009
23573576,200886,9.0,tt0387808,Idiocracy,2006-01-01,"[adventure, comedy, sci-fi]",189394,6.5,6.426009
23573577,200930,8.0,tt0387808,Idiocracy,2006-01-01,"[adventure, comedy, sci-fi]",189394,6.5,6.426009


In [35]:
# Create User-Item interaction matrix
matrix = df_movies_ratings.pivot_table(index='userId', columns='Title', values='rating')

del df_movies_ratings

matrix.head()

Title,10 Cloverfield Lane,10 Things I Hate About You,12 Angry Men,12 Monkeys,12 Years a Slave,127 Hours,13 Going on 30,1408,1917,2 Fast 2 Furious,...,You've Got Mail,Young Frankenstein,Young Guns,Zero Dark Thirty,Zodiac,Zombieland,Zoolander,Zootopia,eXistenZ,xXx
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,10.0,10.0,,,,,,,...,2.0,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,4.0,
5,,,,,,,,,,,...,,,,,,,,,,


In [36]:
def get_collaborative_filtering_recommendations(movie):
    
    # Fetch ratings for movie
    movie_user_rating = matrix[movie]

    # Find correlation between movies
    similar_to_movie= matrix.corrwith(movie_user_rating)

    # Getting correlated movies
    corr_movies = pd.DataFrame(similar_to_movie, columns=['Correlation'])
    corr_movies = corr_movies.sort_values(by='Correlation', ascending=False)
    
    corr_movies_indeces = corr_movies[1:11].index
    
    return df[df['Title'].isin(corr_movies_indeces)][['Title', 'imdbID', 'Year', 'Genre','imdbVotes', 'imdbRating', 'wr']].sort_values(by='wr', ascending=False)

In [37]:
get_collaborative_filtering_recommendations('Spectre')

Unnamed: 0,Title,imdbID,Year,Genre,imdbVotes,imdbRating,wr
446,Casino Royale,tt0381061,2006-01-01,"[action, adventure, thriller]",702997,8.0,7.936341
647,Skyfall,tt1074638,2012-01-01,"[action, adventure, thriller]",739511,7.8,7.744757
1512,Mission: Impossible - Rogue Nation,tt2381249,2015-01-01,"[action, adventure, thriller]",416465,7.4,7.322539
3206,Mission: Impossible III,tt0317919,2006-01-01,"[action, adventure, thriller]",395751,6.9,6.843139
4415,Live and Let Die,tt0070328,1973-01-01,"[action, adventure, thriller]",116799,6.7,6.557166
4887,Quantum of Solace,tt0830515,2008-01-01,"[action, adventure, mystery]",475392,6.5,6.468695
5057,Tomorrow Never Dies,tt0120347,1997-01-01,"[action, adventure, thriller]",206514,6.5,6.431592
5247,Diamonds Are Forever,tt0066995,1971-01-01,"[action, adventure, thriller]",115496,6.5,6.385759
5438,The World Is Not Enough,tt0143145,1999-01-01,"[action, adventure, thriller]",211725,6.4,6.341898
6630,Die Another Day,tt0246460,2002-01-01,"[action, adventure, thriller]",231107,6.1,6.070657


# 4. Weighted Hybrid

In [None]:
def get_hybrid_recommendations(movie):
    content_based_recommends = get_content_based_recommendations(movie, similarity)
    collaborative_filtering_recommends = get_collaborative_filtering_recommendations(movie)
    
    hybrid_recommends = pd.concat([content_based_recommends, collaborative_filtering_recommends], ignore_index=True) \
                            .sort_values(by='wr', ascending=False) \
                            .drop_duplicates(['Title'],ignore_index= True)
    
    return hybrid_recommends

In [88]:
get_hybrid_recommendations('Spectre')

Unnamed: 0,Title,imdbID,Year,Genre,imdbVotes,imdbRating,wr
0,Casino Royale,tt0381061,2006-01-01,"[action, adventure, thriller]",702997,8.0,7.936341
1,Skyfall,tt1074638,2012-01-01,"[action, adventure, thriller]",739511,7.8,7.744757
2,Mission: Impossible - Fallout,tt4912910,2018-01-01,"[action, adventure, thriller]",385191,7.7,7.60151
3,Mission: Impossible - Rogue Nation,tt2381249,2015-01-01,"[action, adventure, thriller]",416465,7.4,7.322539
4,No Time to Die,tt2382320,2021-01-01,"[action, adventure, thriller]",453414,7.3,7.232876
5,Extraction II,tt12263384,2023-01-01,"[action, crime, thriller]",155496,7.0,6.853877
6,Mission: Impossible III,tt0317919,2006-01-01,"[action, adventure, thriller]",395751,6.9,6.843139
7,Live and Let Die,tt0070328,1973-01-01,"[action, adventure, thriller]",116799,6.7,6.557166
8,Jason Bourne,tt4196776,2016-01-01,"[action, thriller]",245974,6.6,6.534086
9,Quantum of Solace,tt0830515,2008-01-01,"[action, adventure, mystery]",475392,6.5,6.468695
