In [1]:
import json
import warnings
from datetime import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 70)

In [3]:
df = pd.read_csv('data.csv')

In [4]:
df.head()

Unnamed: 0,budget,genres,homepage,id,plot_keywords,language,original_title,overview,popularity,production_companies,production_countries,release_date,gross,duration,spoken_languages,status,tagline,movie_title,vote_average,num_voted_users,title_year,country,director_name,actor_1_name,actor_2_name,actor_3_name
0,237000000,Action|Adventure|Fantasy|Science Fiction,http://www.avatarmovie.com/,19995,culture clash|future|space war|space colony|so...,English,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{'name': 'Ingenious Film Partners', 'id': 289...","[{'iso_3166_1': 'US', 'name': 'United States o...",2009-12-10,2787965087,162.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,2009.0,United States of America,James Cameron,Sam Worthington,Zoe Saldana,Sigourney Weaver
1,300000000,Adventure|Fantasy|Action,http://disney.go.com/disneypictures/pirates/,285,ocean|drug abuse|exotic island|east india trad...,English,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{'name': 'Walt Disney Pictures', 'id': 2}, {'...","[{'iso_3166_1': 'US', 'name': 'United States o...",2007-05-19,961000000,169.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,2007.0,United States of America,Gore Verbinski,Johnny Depp,Orlando Bloom,Keira Knightley
2,245000000,Action|Adventure|Crime,http://www.sonypictures.com/movies/spectre/,206647,spy|based on novel|secret agent|sequel|mi6|bri...,Français,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{'name': 'Columbia Pictures', 'id': 5}, {'nam...","[{'iso_3166_1': 'GB', 'name': 'United Kingdom'...",2015-10-26,880674609,148.0,"[{'iso_639_1': 'fr', 'name': 'Français'}, {'is...",Released,A Plan No One Escapes,Spectre,6.3,4466,2015.0,United Kingdom,Sam Mendes,Daniel Craig,Christoph Waltz,Léa Seydoux
3,250000000,Action|Crime|Drama|Thriller,http://www.thedarkknightrises.com/,49026,dc comics|crime fighter|terrorist|secret ident...,English,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{'name': 'Legendary Pictures', 'id': 923}, {'...","[{'iso_3166_1': 'US', 'name': 'United States o...",2012-07-16,1084939099,165.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,2012.0,United States of America,Christopher Nolan,Christian Bale,Michael Caine,Gary Oldman
4,260000000,Action|Adventure|Science Fiction,http://movies.disney.com/john-carter,49529,based on novel|mars|medallion|space travel|pri...,English,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{'name': 'Walt Disney Pictures', 'id': 2}]","[{'iso_3166_1': 'US', 'name': 'United States o...",2012-03-07,284139100,132.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,2012.0,United States of America,Andrew Stanton,Taylor Kitsch,Lynn Collins,Samantha Morton


## Recommender system based on overview

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

import re

In [6]:
#Replace NaN with an empty string
df['overview'] = df['overview'].fillna('')

In [7]:
documents = df['overview']

In [8]:
# Function to preprocess text
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Remove stop words
    text = ' '.join(word for word in text.split() if word not in ENGLISH_STOP_WORDS)
    return text

# Preprocess each document
preprocessed_documents = [preprocess_text(doc) for doc in documents]

In [9]:
# Create a TfidfVectorizer object with n-gram range from unigrams to trigrams
# vectorizer = TfidfVectorizer(ngram_range=(1, 3))
vectorizer = TfidfVectorizer()

# Fit the vectorizer to the preprocessed data and transform the documents into a feature matrix
X_overview = vectorizer.fit_transform(preprocessed_documents)

# Print the TF-IDF feature matrix
print(np.shape(X_overview.toarray()))

(4803, 23289)


In [10]:
import numpy as np

# Get the feature names (tokens) from the vectorizer
feature_names = vectorizer.get_feature_names_out()

# Get the TF-IDF matrix
tfidf_matrix = X_overview.toarray()

# Calculate the mean TF-IDF score for each token across all documents
mean_tfidf_scores = np.mean(tfidf_matrix, axis=0)

# Create a dictionary mapping each token to its mean TF-IDF score
token_tfidf_scores = dict(zip(feature_names, mean_tfidf_scores))

# Sort the tokens by their mean TF-IDF scores
sorted_tokens = sorted(token_tfidf_scores.items(), key=lambda x: x[1], reverse=True)

# Print the sorted tokens with their mean TF-IDF scores
for token, score in sorted_tokens:
    print(f"Token: {token}, Mean TF-IDF Score: {score}")


Token: life, Mean TF-IDF Score: 0.0137205980016377
Token: new, Mean TF-IDF Score: 0.012568350381471522
Token: young, Mean TF-IDF Score: 0.012409313453186402
Token: world, Mean TF-IDF Score: 0.010572782730578347
Token: man, Mean TF-IDF Score: 0.010550622882998097
Token: family, Mean TF-IDF Score: 0.009912600764423421
Token: story, Mean TF-IDF Score: 0.009671995696007189
Token: love, Mean TF-IDF Score: 0.008813448086856567
Token: friends, Mean TF-IDF Score: 0.0075466213843456325
Token: woman, Mean TF-IDF Score: 0.007272409777867022
Token: years, Mean TF-IDF Score: 0.007247974903619932
Token: finds, Mean TF-IDF Score: 0.00707272750436334
Token: father, Mean TF-IDF Score: 0.006800039104829699
Token: time, Mean TF-IDF Score: 0.006645288098344073
Token: lives, Mean TF-IDF Score: 0.006616499883675835
Token: film, Mean TF-IDF Score: 0.006568396404066435
Token: war, Mean TF-IDF Score: 0.006429494220876632
Token: home, Mean TF-IDF Score: 0.0064176787482174745
Token: help, Mean TF-IDF Score: 0.00

In [11]:
# Compute the Cosine Similarity matrix based on the count_matrix
from sklearn.metrics.pairwise import cosine_similarity

cosine_matrix_1 = cosine_similarity(X_overview, X_overview)

In [12]:
cosine_matrix_1.shape

(4803, 4803)

In [13]:
# Construct a reverse map of indices and movie titles using a dictionary
indices = dict(zip(df['original_title'], df.index))

In [14]:
indices

{'Avatar': 0,
 "Pirates of the Caribbean: At World's End": 1,
 'Spectre': 2,
 'The Dark Knight Rises': 3,
 'John Carter': 4,
 'Spider-Man 3': 5,
 'Tangled': 6,
 'Avengers: Age of Ultron': 7,
 'Harry Potter and the Half-Blood Prince': 8,
 'Batman v Superman: Dawn of Justice': 9,
 'Superman Returns': 10,
 'Quantum of Solace': 11,
 "Pirates of the Caribbean: Dead Man's Chest": 12,
 'The Lone Ranger': 13,
 'Man of Steel': 14,
 'The Chronicles of Narnia: Prince Caspian': 15,
 'The Avengers': 16,
 'Pirates of the Caribbean: On Stranger Tides': 17,
 'Men in Black 3': 18,
 'The Hobbit: The Battle of the Five Armies': 19,
 'The Amazing Spider-Man': 20,
 'Robin Hood': 21,
 'The Hobbit: The Desolation of Smaug': 22,
 'The Golden Compass': 23,
 'King Kong': 24,
 'Titanic': 25,
 'Captain America: Civil War': 26,
 'Battleship': 27,
 'Jurassic World': 28,
 'Skyfall': 29,
 'Spider-Man 2': 30,
 'Iron Man 3': 31,
 'Alice in Wonderland': 32,
 'X-Men: The Last Stand': 33,
 'Monsters University': 34,
 'Tra

In [15]:
def recommender_function(title, cosine_matrix):
    idx = indices.get(title)

    if idx:
        movie_scores = cosine_matrix[idx]

        score_df = df[['original_title']]

        score_df['score'] = movie_scores

        score_df_sorted = score_df.sort_values(by='score', ascending=False)

        top_N = 5
        
        score_df_sorted.iloc[1:top_N + 1].head()
        
        return score_df_sorted.iloc[1:top_N + 1]
    
    else:
        print('not valid movie title')


In [16]:
recommender_function('The Avengers', cosine_matrix_1)

Unnamed: 0,original_title,score
7,Avengers: Age of Ultron,0.139673
3144,Plastic,0.117688
1715,Timecop,0.107696
4124,This Thing of Ours,0.101177
3033,The Corruptor,0.093981


## Recommender system based on actors and director

In [17]:
df.columns

Index(['budget', 'genres', 'homepage', 'id', 'plot_keywords', 'language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'gross', 'duration',
       'spoken_languages', 'status', 'tagline', 'movie_title', 'vote_average',
       'num_voted_users', 'title_year', 'country', 'director_name',
       'actor_1_name', 'actor_2_name', 'actor_3_name'],
      dtype='object')

In [18]:
df2 = df[['original_title', 'director_name', 'actor_1_name', 'actor_2_name', 'actor_3_name']]

In [19]:
df2.head()

Unnamed: 0,original_title,director_name,actor_1_name,actor_2_name,actor_3_name
0,Avatar,James Cameron,Sam Worthington,Zoe Saldana,Sigourney Weaver
1,Pirates of the Caribbean: At World's End,Gore Verbinski,Johnny Depp,Orlando Bloom,Keira Knightley
2,Spectre,Sam Mendes,Daniel Craig,Christoph Waltz,Léa Seydoux
3,The Dark Knight Rises,Christopher Nolan,Christian Bale,Michael Caine,Gary Oldman
4,John Carter,Andrew Stanton,Taylor Kitsch,Lynn Collins,Samantha Morton


In [20]:
#Replace NaN with an empty string
df2['director_name'] = df2['director_name'].fillna('')
df2['actor_1_name'] = df2['actor_1_name'].fillna('')
df2['actor_2_name'] = df2['actor_2_name'].fillna('')
df2['actor_3_name'] = df2['actor_3_name'].fillna('')

In [21]:
# Lowercase the strings and remove spaces
df2['director_name'] = df2['director_name'].str.lower().str.replace(' ', '')
df2['actor_1_name'] = df2['actor_1_name'].str.lower().str.replace(' ', '')
df2['actor_2_name'] = df2['actor_2_name'].str.lower().str.replace(' ', '')
df2['actor_3_name'] = df2['actor_3_name'].str.lower().str.replace(' ', '')

# Create a new column 'actors_and_director' by combining director and actors
df2['actors_and_director'] = df2['director_name'] + ' ' + df2['actor_1_name'] + ' ' + df2['actor_2_name'] + ' ' + df2['actor_3_name']

# Print the DataFrame to see the new column
df2.head()

Unnamed: 0,original_title,director_name,actor_1_name,actor_2_name,actor_3_name,actors_and_director
0,Avatar,jamescameron,samworthington,zoesaldana,sigourneyweaver,jamescameron samworthington zoesaldana sigourn...
1,Pirates of the Caribbean: At World's End,goreverbinski,johnnydepp,orlandobloom,keiraknightley,goreverbinski johnnydepp orlandobloom keirakni...
2,Spectre,sammendes,danielcraig,christophwaltz,léaseydoux,sammendes danielcraig christophwaltz léaseydoux
3,The Dark Knight Rises,christophernolan,christianbale,michaelcaine,garyoldman,christophernolan christianbale michaelcaine ga...
4,John Carter,andrewstanton,taylorkitsch,lynncollins,samanthamorton,andrewstanton taylorkitsch lynncollins samanth...


In [22]:
documents = df2['actors_and_director']

In [23]:
from sklearn.feature_extraction.text import CountVectorizer


In [24]:
# Create a CountVectorizer object with n-gram range from unigrams to trigrams
vectorizer = CountVectorizer()

# Fit the vectorizer to the preprocessed data and transform the documents into a feature matrix
X_actors = vectorizer.fit_transform(documents)

# Print the feature matrix
print(X_actors.toarray())

X_actors.shape

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


(4803, 8091)

In [25]:
cosine_matrix_2 = cosine_similarity(X_actors, X_actors)

In [26]:
recommender_function('The Avengers', cosine_matrix_2)

Unnamed: 0,original_title,score
7,Avengers: Age of Ultron,0.75
421,Zodiac,0.5
26,Captain America: Civil War,0.5
3748,The Kids Are All Right,0.25
79,Iron Man 2,0.25


## Recommender system based on actors, directors and overview

In [27]:
X_total = np.concatenate((X_overview.toarray(), X_actors.toarray()), axis=1)

In [28]:
X_actors.shape

(4803, 8091)

In [29]:
X_overview.shape

(4803, 23289)

In [30]:
X_total.shape

(4803, 31380)

In [31]:
cosine_matrix_3 = cosine_similarity(X_total, X_total)

In [32]:
recommender_function('The Dark Knight Rises', cosine_matrix_3)

Unnamed: 0,original_title,score
119,Batman Begins,0.628132
1196,The Prestige,0.603438
65,The Dark Knight,0.447528
1181,JFK,0.218528
1246,Quest for Camelot,0.213137


## add genres

In [33]:
df['genres'] = df['genres'].fillna('')

In [34]:
def rewrite_genres(genres):
    
    genres_list = genres.split('|')
    
    if len(genres_list) > 2:
        return genres_list[0] + " " + genres_list[1] + " " + genres_list[2] + " " 
    
    elif len(genres_list) == 2:
        return genres_list[0] + " " + genres_list[1]
    
    elif len(genres_list) == 1:
        return genres_list[0] 
    
    else:
        return ""

In [35]:
df2['genres'] = df['genres'].apply(rewrite_genres)

In [36]:
df2['genres'] = df2['genres'].apply(str.lower)

In [37]:
df2.head()

Unnamed: 0,original_title,director_name,actor_1_name,actor_2_name,actor_3_name,actors_and_director,genres
0,Avatar,jamescameron,samworthington,zoesaldana,sigourneyweaver,jamescameron samworthington zoesaldana sigourn...,action adventure fantasy
1,Pirates of the Caribbean: At World's End,goreverbinski,johnnydepp,orlandobloom,keiraknightley,goreverbinski johnnydepp orlandobloom keirakni...,adventure fantasy action
2,Spectre,sammendes,danielcraig,christophwaltz,léaseydoux,sammendes danielcraig christophwaltz léaseydoux,action adventure crime
3,The Dark Knight Rises,christophernolan,christianbale,michaelcaine,garyoldman,christophernolan christianbale michaelcaine ga...,action crime drama
4,John Carter,andrewstanton,taylorkitsch,lynncollins,samanthamorton,andrewstanton taylorkitsch lynncollins samanth...,action adventure science fiction


In [38]:
df2['actors_and_director_genres'] = df2['actors_and_director'] + ' ' + df2['genres']
df2['actors_and_director_genres'][0]

'jamescameron samworthington zoesaldana sigourneyweaver action adventure fantasy '

In [39]:
documents = df2['actors_and_director_genres']

In [40]:
# Create a CountVectorizer object with n-gram range from unigrams to trigrams
vectorizer = CountVectorizer()

# Fit the vectorizer to the preprocessed data and transform the documents into a feature matrix
X_actors_genres = vectorizer.fit_transform(documents)

# Print the feature matrix
print(X_actors_genres.toarray())

X_actors_genres.shape

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


(4803, 8113)

In [41]:
X_total = np.concatenate((X_overview.toarray(), X_actors_genres.toarray()), axis=1)

In [42]:
X_total.shape

(4803, 31402)

In [43]:
cosine_matrix_4 = cosine_similarity(X_total, X_total)

In [44]:
recommender_function('The Dark Knight Rises', cosine_matrix_4)

Unnamed: 0,original_title,score
119,Batman Begins,0.767582
65,The Dark Knight,0.654705
4638,Amidst the Devil's Wings,0.53033
1196,The Prestige,0.502149
3073,Romeo Is Bleeding,0.5


In [45]:
recommender_function('The Godfather', cosine_matrix_4)

Unnamed: 0,original_title,score
2731,The Godfather: Part II,0.629136
867,The Godfather: Part III,0.553232
3012,The Outsiders,0.428571
1525,Apocalypse Now,0.428571
2649,The Son of No One,0.403994


In [49]:
"""
suggestion type:

1: base on overview
2: base on actors and director
3: base on actors and director and overview
4: base on actors, director, overview and genres
"""
title = 'The Godfather'
suggestion_type = 4
    
cosine_matrix_name = 'cosine_matrix_' + str(int(suggestion_type))

cosine_matrix_variable = globals()[cosine_matrix_name]

# Assuming recommender_function is defined elsewhere
recommender_function(title, cosine_matrix_variable)

Unnamed: 0,original_title,score
2731,The Godfather: Part II,0.629136
867,The Godfather: Part III,0.553232
3012,The Outsiders,0.428571
1525,Apocalypse Now,0.428571
2649,The Son of No One,0.403994
