In [90]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [91]:
df = pd.read_csv("../data/processed/movie_data_processed.csv")
df = df.drop("Unnamed: 0", axis=1)
df["Year"] = df["Year"].fillna("Unknown")
df.head(1)

Unnamed: 0,Title,Year,Rated,Runtime,Genre,Director,Writer,Actors,Plot,Language,Country,Awards,Poster,Ratings,imdbRating,imdbVotes,imdbID,BoxOffice,wr
0,The Shawshank Redemption,1994-01-01,R,142 min,Drama,Frank Darabont,"Stephen King, Frank Darabont","Tim Robbins, Morgan Freeman, Bob Gunton",A banker convicted of uxoricide forms a friend...,English,United States,Nominated for 7 Oscars. 21 wins & 42 nominatio...,https://m.media-amazon.com/images/M/MV5BMDAyY2...,"[{'Source': 'Internet Movie Database', 'Value'...",9.3,2945396,tt0111161,"$28,767,189",9.277415


In [92]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6479 entries, 0 to 6478
Data columns (total 19 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Title       6479 non-null   object 
 1   Year        6479 non-null   object 
 2   Rated       6479 non-null   object 
 3   Runtime     6479 non-null   object 
 4   Genre       6479 non-null   object 
 5   Director    6479 non-null   object 
 6   Writer      6479 non-null   object 
 7   Actors      6479 non-null   object 
 8   Plot        6479 non-null   object 
 9   Language    6479 non-null   object 
 10  Country     6479 non-null   object 
 11  Awards      6479 non-null   object 
 12  Poster      6479 non-null   object 
 13  Ratings     6479 non-null   object 
 14  imdbRating  6479 non-null   float64
 15  imdbVotes   6479 non-null   int64  
 16  imdbID      6479 non-null   object 
 17  BoxOffice   6479 non-null   object 
 18  wr          6479 non-null   float64
dtypes: float64(2), int64(1), ob

# 1. Simple Recommender

> Đưa ra các đề xuất tổng quát cho mọi người dùng, dựa trên mức độ phổ biến và/hoặc thể loại phim. Ý tưởng cơ bản đằng sau hệ thống này là những bộ phim nổi tiếng hơn và được giới phê bình đánh giá cao hơn sẽ có xác suất được khán giả bình thường thích cao hơn. Một ví dụ có thể là IMDB Top 250

In [93]:
def build_chart(df, genre, percentile=0.85):
    # Lọc các dòng chứa genre
    filtered_df = df[df['Genre'].str.contains(genre, na=False)]
    
    # Chuyển các cột cần thiết về kiểu số
    vote_counts = filtered_df[filtered_df['imdbVotes'].notnull()]['imdbVotes'].astype('int')
    vote_averages = filtered_df[filtered_df['imdbRating'].notnull()]['imdbRating'].astype('float')
    
    # Tính giá trị trung bình và ngưỡng
    C = vote_averages.mean()
    m = vote_counts.quantile(percentile)
    
    # Chọn các dòng đủ điều kiện
    qualified = filtered_df[
        (filtered_df['imdbVotes'] >= m) & 
        (filtered_df['imdbVotes'].notnull()) & 
        (filtered_df['imdbRating'].notnull())
    ][['Title', 'Year', 'Genre','imdbVotes', 'imdbRating']]
    
    # Chuyển các giá trị cần thiết về kiểu số
    qualified['imdbVotes'] = qualified['imdbVotes'].astype('int')
    qualified['imdbRating'] = qualified['imdbRating'].astype('float')
    
    # Tính trọng số (weighted rating)
    qualified['wr'] = qualified.apply(
        lambda x: (x['imdbVotes'] / (x['imdbVotes'] + m) * x['imdbRating']) + 
                  (m / (m + x['imdbVotes']) * C), 
        axis=1
    )
    
    # Sắp xếp theo trọng số và lấy top 250
    qualified = qualified.sort_values('wr', ascending=False).head(250)
    
    return qualified


In [94]:
result = build_chart(df, 'Drama', percentile=0.85)
result.head(10)

Unnamed: 0,Title,Year,Genre,imdbVotes,imdbRating,wr
0,The Shawshank Redemption,1994-01-01,Drama,2945396,9.3,9.170986
2,The Godfather,1972-01-01,"Crime, Drama",2048917,9.2,9.027422
6,The Dark Knight,2008-01-01,"Action, Crime, Drama",2919777,9.0,8.887695
7,The Lord of the Rings: The Return of the King,2003-01-01,"Action, Adventure, Drama",2015572,9.0,8.841526
1,Band of Brothers,2001-01-01,"Drama, History, War",542582,9.4,8.818954
8,Schindler's List,1993-01-01,"Biography, Drama, History",1474085,9.0,8.789769
9,The Godfather Part II,1974-01-01,"Crime, Drama",1387339,9.0,8.778162
11,Pulp Fiction,1994-01-01,"Crime, Drama",2257639,8.9,8.76477
12,The Lord of the Rings: The Fellowship of the Ring,2001-01-01,"Action, Adventure, Drama",2045291,8.9,8.751888
14,Fight Club,1999-01-01,Drama,2371394,8.8,8.677986


In [95]:
result = build_chart(df, 'Action', percentile=0.85)
result.head(10)

Unnamed: 0,Title,Year,Genre,imdbVotes,imdbRating,wr
6,The Dark Knight,2008-01-01,"Action, Crime, Drama",2919777,9.0,8.757474
7,The Lord of the Rings: The Return of the King,2003-01-01,"Action, Adventure, Drama",2015572,9.0,8.665146
12,The Lord of the Rings: The Fellowship of the Ring,2001-01-01,"Action, Adventure, Drama",2045291,8.9,8.584236
13,Inception,2010-01-01,"Action, Adventure, Sci-Fi",2592712,8.8,8.554965
16,The Lord of the Rings: The Two Towers,2002-01-01,"Action, Adventure, Drama",1819421,8.8,8.468007
21,The Matrix,1999-01-01,"Action, Sci-Fi",2087394,8.7,8.419059
22,Star Wars: Episode V - The Empire Strikes Back,1980-01-01,"Action, Adventure, Fantasy",1400371,8.7,8.30939
28,Star Wars: Episode IV - A New Hope,1977-01-01,"Action, Adventure, Fantasy",1469927,8.6,8.243895
38,Gladiator,2000-01-01,"Action, Adventure, Drama",1657460,8.5,8.194831
5,Attack on Titan,Unknown,"Animation, Action, Adventure",551083,9.1,8.187164


# 2. Content based

In [96]:
tf = TfidfVectorizer(analyzer='word', stop_words='english')
tfidf_matrix = tf.fit_transform(df['Plot'])

In [97]:
tfidf_matrix.shape

(6479, 15893)

In [98]:
def computeCosineSimilarity(word_matrix):
    cosine_similarity = linear_kernel(word_matrix, word_matrix)
    return cosine_similarity

In [99]:
def computePearsonCorrelation(word_matrix):
    return np.corrcoef(word_matrix)

In [100]:
indices = pd.Series(df.index, index=df['Title'])
indices

Title
The Shawshank Redemption       0
Band of Brothers               1
The Godfather                  2
Planet Earth                   3
Planet Earth II                4
                            ... 
Monster Trucks              6474
Lost River                  6475
Pet                         6476
The Hallow                  6477
Kurtlar Vadisi: Irak        6478
Length: 6479, dtype: int64

In [101]:
# def get_recommendations(title):
#     idx = indices[title]
#     sim_scores = list(enumerate(cosine_sim[idx]))
#     sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
#     sim_scores = sim_scores[1:11]
#     movie_indices = [i[0] for i in sim_scores]
#     return df.iloc[movie_indices][['Title', 'imdbID', 'Year', 'Genre','imdbVotes', 'imdbRating', 'wr']].sort_values(by='wr', ascending=False)

In [102]:
def get_recommendations(movie_title, similarity_scores):
    # Fetch index of movie based on given title
    movie_idx = indices[movie_title]
    
    # Fetch similarity score of all movies with the given movie
    # Fetch it as a tuple of (index, score)
    similarity_scores = list(enumerate(similarity_scores[movie_idx]))
    
    # Sort the above score
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    
    # Pick index and score of 10 most similar movies
    # Skip the 0th index since it is same movie (itself)
    similarity_scores = similarity_scores[1:11]
    
    # Find the indices of these similar movies
    movie_similar_indices = [i[0] for i in similarity_scores]
    
    # Find title of these top movies and return
    return df.iloc[movie_similar_indices][['Title', 'imdbID', 'Year', 'Genre','imdbVotes', 'imdbRating', 'wr']].sort_values(by='wr', ascending=False)

In [103]:
cosine_similarity = computeCosineSimilarity(tfidf_matrix)

In [104]:
get_recommendations('The Dark Knight', cosine_similarity)

Unnamed: 0,Title,imdbID,Year,Genre,imdbVotes,imdbRating,wr
107,Batman Begins,tt0372784,2005-01-01,"Action, Crime, Drama",1601743,8.2,8.17131
112,Kill Bill: Vol. 1,tt0266697,2003-01-01,"Action, Crime, Thriller",1214597,8.2,8.162303
357,The Batman,tt1877830,2022-01-01,"Action, Crime, Drama",815820,7.8,7.753201
814,Batman,tt0096895,1989-01-01,"Action, Adventure",411015,7.5,7.422088
1713,Batman Returns,tt0103776,1992-01-01,"Action, Crime, Fantasy",332740,7.1,7.0259
4479,Batman: Bad Blood,tt4870838,2016-01-01,"Animation, Action, Adventure, Sci-Fi",21219,6.8,6.286207
4668,Batman: The Killing Joke,tt4853102,2016-01-01,"Animation, Action, Crime",62497,6.4,6.239464
4783,Batman: Gotham by Gaslight,tt7167630,2018-01-01,"Animation, Action, Adventure",19366,6.7,6.21011
5960,Becky,tt10314450,2020-01-01,"Action, Crime, Drama",25862,6.0,5.874475
6023,Domino,tt0421054,2005-01-01,"Action, Biography, Crime",69894,5.9,5.85795


In [105]:
tags_df = pd.read_csv('../data/processed/tags_processed.csv')

In [106]:
tags_df = tags_df.drop("Unnamed: 0", axis = 1)

In [107]:
tags_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 739220 entries, 0 to 739219
Data columns (total 4 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   userId   739220 non-null  int64 
 1   movieId  739220 non-null  int64 
 2   tag      739220 non-null  object
 3   imdbId   739220 non-null  object
dtypes: int64(2), object(2)
memory usage: 22.6+ MB


In [108]:
tags_grouped = tags_df.groupby('imdbId')['tag'].apply(list).reset_index()
tags_grouped.rename(columns={"imdbId": "imdbID"}, inplace=True)
tags_grouped

Unnamed: 0,imdbID,tag
0,tt0000417,"[classic, experimental, sci-fi, black and whit..."
1,tt0000439,"[less than 300 ratings, not available from Net..."
2,tt0004972,"[racism, Racist, Racist History, racist, contr..."
3,tt0010323,"[german expressionism, psychology, serial kill..."
4,tt0012349,"[Tumey's DVDs, charity, orphan, poverty, silen..."
...,...,...
6179,tt9812474,"[grief, atmospheric, fantasy, A24, atmospheric..."
6180,tt9844522,"[can't play along, inconsistent villain, pasti..."
6181,tt9845564,"[convoy, rapist, revenge, trapped inside, winter]"
6182,tt9873892,"[cheesy, cliche, Black, Blaxploitation, satire]"


In [109]:
df = df.merge(tags_grouped, on='imdbID', how='left')
df.head(1)

Unnamed: 0,Title,Year,Rated,Runtime,Genre,Director,Writer,Actors,Plot,Language,Country,Awards,Poster,Ratings,imdbRating,imdbVotes,imdbID,BoxOffice,wr,tag
0,The Shawshank Redemption,1994-01-01,R,142 min,Drama,Frank Darabont,"Stephen King, Frank Darabont","Tim Robbins, Morgan Freeman, Bob Gunton",A banker convicted of uxoricide forms a friend...,English,United States,Nominated for 7 Oscars. 21 wins & 42 nominatio...,https://m.media-amazon.com/images/M/MV5BMDAyY2...,"[{'Source': 'Internet Movie Database', 'Value'...",9.3,2945396,tt0111161,"$28,767,189",9.277415,"[based on a book, Morgan Freeman, twist ending..."


In [110]:
tag_counts = df.apply(lambda x: pd.Series(x['tag']),axis=1).stack().reset_index(level=1, drop=True)
tag_counts.name = 'tag'

In [111]:
tag_counts = tag_counts.value_counts()
tag_counts[:5]

tag
sci-fi         4709
atmospheric    4204
action         3915
funny          3369
comedy         3256
Name: count, dtype: int64

In [112]:
tag_counts = tag_counts[tag_counts > 1]

In [113]:
def filter_tags(x):
    words = []
    for i in x:
        if i in tag_counts:
            words.append(i)
    return words

In [114]:
df["tag"] = df["tag"].fillna('')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6479 entries, 0 to 6478
Data columns (total 20 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Title       6479 non-null   object 
 1   Year        6479 non-null   object 
 2   Rated       6479 non-null   object 
 3   Runtime     6479 non-null   object 
 4   Genre       6479 non-null   object 
 5   Director    6479 non-null   object 
 6   Writer      6479 non-null   object 
 7   Actors      6479 non-null   object 
 8   Plot        6479 non-null   object 
 9   Language    6479 non-null   object 
 10  Country     6479 non-null   object 
 11  Awards      6479 non-null   object 
 12  Poster      6479 non-null   object 
 13  Ratings     6479 non-null   object 
 14  imdbRating  6479 non-null   float64
 15  imdbVotes   6479 non-null   int64  
 16  imdbID      6479 non-null   object 
 17  BoxOffice   6479 non-null   object 
 18  wr          6479 non-null   float64
 19  tag         6479 non-null  

In [115]:
reco_features = ['Title', 'Director', 'Actors', 'tag', 'Genre']

In [116]:
def cleanUpData(data):
    if isinstance(data, list):
        return [str.lower(val.replace(" ", "")) for val in data]
    else:
        #Check if director exists. If not, return empty string
        if isinstance(data, str):
            return str.lower(data.replace(" ", ""))
        else:
            return ''

In [117]:
# Apply data cleanup to reco features
modified_features = ['Director', 'Actors', 'tag', 'Genre']

for feature in modified_features:
    df[feature] = df[feature].apply(cleanUpData)
    
df[reco_features].head(5)

Unnamed: 0,Title,Director,Actors,tag,Genre
0,The Shawshank Redemption,frankdarabont,"timrobbins,morganfreeman,bobgunton","[basedonabook, morganfreeman, twistending, fri...",drama
1,Band of Brothers,unknown,"scottgrimes,damianlewis,ronlivingston","[accurate, gritty, war, worldwarii, notamovie,...","drama,history,war"
2,The Godfather,francisfordcoppola,"marlonbrando,alpacino,jamescaan","[alpacino, atmospheric, greatacting, masterpie...","crime,drama"
3,Planet Earth,unknown,"sigourneyweaver,davidattenborough,nikolaydrozdov","[nature, amazingphotography, impossibleshots, ...","documentary,family"
4,Planet Earth II,unknown,"davidattenborough,gordonbuchanan,barriebritton","[antonishing, bbc, biology, davidattenborough,...",documentary


In [118]:
# Chuyển đổi các cột thành danh sách
df['Director'] = df['Director'].apply(lambda x: x.split(',') if isinstance(x, str) else [])
df['Actors'] = df['Actors'].apply(lambda x: x.split(',') if isinstance(x, str) else [])
df['Genre'] = df['Genre'].apply(lambda x: x.split(',') if isinstance(x, str) else [])
df[reco_features].head(1)

Unnamed: 0,Title,Director,Actors,tag,Genre
0,The Shawshank Redemption,[frankdarabont],"[timrobbins, morganfreeman, bobgunton]","[basedonabook, morganfreeman, twistending, fri...",[drama]


In [119]:
def createSoup(data):
    # Loại bỏ trùng lặp trong mỗi danh sách trước khi nối
    director = ' '.join(data['Director'])
    actors = ' '.join(data['Actors'])
    tag = ' '.join(set(data['tag']))  # Dùng set để loại bỏ trùng lặp
    genre = ' '.join(data['Genre'])
    
    # Ghép tất cả lại thành chuỗi duy nhất
    return f"{director} {actors} {tag} {genre}"

createSoup(df.iloc[0,:])



In [120]:
# Create a new feature Soup with mixed data
df['soup'] = df.apply(createSoup, axis=1)

reco_features = ['Title', 'Director', 'Actors', 'tag', 'Genre', 'soup']
df[reco_features].head(5)

Unnamed: 0,Title,Director,Actors,tag,Genre,soup
0,The Shawshank Redemption,[frankdarabont],"[timrobbins, morganfreeman, bobgunton]","[basedonabook, morganfreeman, twistending, fri...",[drama],frankdarabont timrobbins morganfreeman bobgunt...
1,Band of Brothers,[unknown],"[scottgrimes, damianlewis, ronlivingston]","[accurate, gritty, war, worldwarii, notamovie,...","[drama, history, war]",unknown scottgrimes damianlewis ronlivingston ...
2,The Godfather,[francisfordcoppola],"[marlonbrando, alpacino, jamescaan]","[alpacino, atmospheric, greatacting, masterpie...","[crime, drama]",francisfordcoppola marlonbrando alpacino james...
3,Planet Earth,[unknown],"[sigourneyweaver, davidattenborough, nikolaydr...","[nature, amazingphotography, impossibleshots, ...","[documentary, family]",unknown sigourneyweaver davidattenborough niko...
4,Planet Earth II,[unknown],"[davidattenborough, gordonbuchanan, barriebrit...","[antonishing, bbc, biology, davidattenborough,...",[documentary],unknown davidattenborough gordonbuchanan barri...


In [121]:
# Define a CountVectorizer Object
from sklearn.feature_extraction.text import CountVectorizer
cntVec = CountVectorizer(stop_words='english')

# Remove NaN from soup with empty strings
df['soup'] = df['soup'].fillna('')

# Construct CountVectorizer matrix by fitting and transforming the data
cntVec_matrix = cntVec.fit_transform(df['soup'])

print("Shape of CountVectorizer matrix =", cntVec_matrix.shape)

# Topmost frequently occuring words
words = cntVec.get_feature_names_out()
counts = cntVec_matrix.sum(axis=0).reshape(-1,1).tolist()
print("Most frequently occuring words in plot overview:")
word_count = dict(sorted(zip(words, counts), key=lambda x : x[1], reverse=True)[:20])
print(word_count)

Shape of CountVectorizer matrix = (6479, 75021)
Most frequently occuring words in plot overview:
{'drama': [4245], 'comedy': [2980], 'action': [2315], 'crime': [1751], 'romance': [1635], 'adventure': [1498], 'thriller': [1415], 'nudity': [1219], 'horror': [998], 'mystery': [967], 'murder': [945], 'fi': [916], 'sci': [893], 'violence': [854], 'fantasy': [765], 'basedonabook': [755], 'funny': [743], 'biography': [720], 'death': [720], 'sdvds': [708]}


In [123]:
# Find recommendations based on Cosine Similarity
cosine_similarity = computeCosineSimilarity(cntVec_matrix)
get_recommendations('Spectre', cosine_similarity)

Unnamed: 0,Title,imdbID,Year,Genre,imdbVotes,imdbRating,wr
28,Star Wars: Episode IV - A New Hope,tt0076759,1977-01-01,"[action, adventure, fantasy]",1469927,8.6,8.563773
234,Casino Royale,tt0381061,2006-01-01,"[action, adventure, thriller]",702997,8.0,7.94073
360,Skyfall,tt1074638,2012-01-01,"[action, adventure, thriller]",739511,7.8,7.74849
625,Goldfinger,tt0058150,1964-01-01,"[action, adventure, thriller]",204856,7.7,7.533562
1166,No Time to Die,tt2382320,2021-01-01,"[action, adventure, thriller]",453414,7.3,7.236965
1433,From Russia with Love,tt0057076,1963-01-01,"[action, adventure, thriller]",147094,7.3,7.120429
1478,GoldenEye,tt0113189,1995-01-01,"[action, adventure, thriller]",272864,7.2,7.104297
1595,Dr. No,tt0055928,1962-01-01,"[action, adventure, thriller]",181827,7.2,7.060826
3718,Quantum of Solace,tt0830515,2008-01-01,"[action, adventure, mystery]",475392,6.5,6.469883
3878,Tomorrow Never Dies,tt0120347,1997-01-01,"[action, adventure, thriller]",206514,6.5,6.43391
