In [None]:
import pandas as pd

# Load the movie sheet from the Excel workbook into a DataFrame.
movies = pd.read_excel('Imdb1000_datasheet.xlsx')
# Preview the first rows. `head` must be *called*: printing `movies.head`
# only shows the bound-method repr, not a five-row preview.
movies.head()

In [None]:
# Print the dtype pandas assigned to each column of `movies`.
print(movies.dtypes)

In [None]:
#fill null value
fillted_movie = movies[movies['keywords'].isnull()]
fillted_movie

In [None]:
# Hand-written keywords, joined straight into one comma-separated string.
keywords_list = ", ".join((
    "sequel",
    "reference to dance dance revolution",
    "long distance relationship",
    "love",
    "kissing booth",
    "high school",
    "halloween",
    "halloween costume",
    "halloween dance",
    "ghostbusters costume",
    "marshmallow costume",
    "dancing",
    "pumpkin carving",
    "competition",
    "thanksgiving dinner",
    "homecoming",
    "bestfriends",
    "graduation",
    "applying for college",
    "numbered sequel",
    "high school senior",
    "second part",
    "male male kiss",
    "feeling",
    "friendship",
    "summer",
    "2010s",
    "spit take",
    "based on novel",
))
keywords_list

In [None]:
# Write the hand-made keyword string into the row with id 583083.
movies.loc[movies['id'] == 583083, 'keywords'] = keywords_list
# Show the row from `movies` itself to confirm the fill. `fillted_movie` was
# selected with a boolean mask before this assignment, so it is a separate
# copy and would still display the old missing value.
movies.loc[movies['id'] == 583083]

In [None]:
# Select the rows whose 'production_countries' value is missing.
fillted_movie2 = movies.loc[movies['production_countries'].isna()]
fillted_movie2

In [None]:
# Set the production country for the row with id 27576.
movies.loc[movies['id'] == 27576, 'production_countries'] = 'United States of America'
# Show the row from `movies` itself to confirm the fill. `fillted_movie2` was
# selected with a boolean mask before this assignment, so it is a separate
# copy and would still display the old missing value.
movies.loc[movies['id'] == 27576]

In [None]:
# Count the missing values remaining in each column.
movies.isna().sum()

In [None]:
# Display the `movies` frame as it stands at this point in the notebook.
movies

In [189]:
'''
Make combined features column
    - Convert non-string columns (vote_average, release_date) to string
    - Replace missing values in the feature columns with an empty string, so a
      single null field cannot turn the whole combined text of a row into NaN
    - Join the feature columns into one column (combined_features),
      separated by a single space
'''

# Columns that make up the combined text, in the order they are joined.
FEATURE_COLUMNS = [
    'genres',
    'keywords',
    'vote_average',
    'original_language',
    'overview',
    'release_date',
]

movies['vote_average'] = movies['vote_average'].astype(str)
movies['release_date'] = movies['release_date'].astype(str)

# For rows without nulls this yields the same text as chaining `+ " " +`;
# unlike `+`, it does not propagate NaN from one column to the whole string.
movies['combined_features'] = (
    movies[FEATURE_COLUMNS]
    .fillna('')
    .astype(str)
    .agg(' '.join, axis=1)
)

In [None]:
'''
Build the TF-IDF matrix from combined_features
    - ENGLISH_STOP_WORDS is a frozenset, so it is copied into a list before
      being passed to the vectorizer
    - TfidfVectorizer (at most 10000 features) is fitted on combined_features
      and used to transform it in one step
'''

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

list_stopwords = list(ENGLISH_STOP_WORDS)
tfidf = TfidfVectorizer(
    stop_words=list_stopwords,
    max_features=10000,
)

vector = tfidf.fit_transform(movies['combined_features'])
# (number of movies, number of TF-IDF features)
vector.shape

In [None]:
'''
Pairwise cosine similarity
    - Compare every TF-IDF row with every other row; the result is returned
      as a dense numpy array (dense_output=True is the library default)
'''

from sklearn.metrics.pairwise import cosine_similarity

cosine_sim = cosine_similarity(vector, dense_output=True)
cosine_sim

In [192]:
'''
Search movie index
    - Create a dictionary with the lower-cased movie title as the key and the
      movie's row position as the value (O(n))
    - Create a function that searches for the movie index based on the movie title (O(1))
'''

# Store row *positions* (enumerate) rather than index labels: the value is used
# to index `cosine_sim` rows and is matched against `movies.iloc`, both of which
# are positional. With a default RangeIndex the two are identical; with any
# other index, labels would point at the wrong row.
# If several movies share a title, the last one in the frame wins.
title_index = {
    title.lower(): position
    for position, title in enumerate(movies['original_title'].astype(str))
}

def search_movie_index(title, title_index):
    """Look up a movie's index by title, ignoring letter case.

    Parameters
    ----------
    title : str
        Title to search for; it is lower-cased before the lookup.
    title_index : dict
        Mapping of lower-cased title -> movie index.

    Returns
    -------
    The value stored for the title, or None when the title is not a key.
    """
    lookup_key = title.lower()
    return title_index.get(lookup_key)

In [193]:
def recommend_movies(title, cosine_sim, movies, title_index, num_recommend=10):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Title of the movie to base the recommendation on; resolved to a row
        position with ``search_movie_index``.
    cosine_sim : numpy.ndarray
        Similarity matrix. Row ``movie_index`` is read as that movie's score
        against every movie (assumed to be in the same row order as ``movies``).
    movies : pandas.DataFrame
        Frame with an 'original_title' column, indexed positionally here.
    title_index : dict
        Mapping of lower-cased title -> row position.
    num_recommend : int, default 10
        Maximum number of titles to return; values below 0 are treated as 0.

    Returns
    -------
    pandas.Series
        'original_title' values ordered from most to least similar, never
        containing the queried movie itself.
    str
        A fixed message when ``title`` is not found in ``title_index``.
    """
    movie_index = search_movie_index(title, title_index)
    if movie_index is None:
        return "Your movie is not in our database, we will update soon!"

    scores = cosine_sim[movie_index]
    # Stable descending sort: ties keep row order, so results are deterministic.
    ranked = (-scores).argsort(kind='stable')
    # Remove the queried movie explicitly. Assuming it is the single top-scoring
    # entry breaks as soon as another movie ties with it (e.g. a duplicate row):
    # the query itself would be recommended and a real match dropped.
    ranked = ranked[ranked != movie_index]
    similar_index = ranked[:max(num_recommend, 0)]

    return movies.iloc[similar_index]['original_title']

In [None]:
# Example query: recommendations for 'The Dark Knight' with the default count.
recommend_movies(
    'The Dark Knight',
    cosine_sim=cosine_sim,
    movies=movies,
    title_index=title_index,
)

In [None]:
import pickle

# Persist the similarity matrix, the title lookup and the movies frame.
# Each file is opened in a `with` block so its handle is flushed and closed
# as soon as the dump finishes, instead of being left open.
# NOTE: only unpickle these files from a trusted location; loading a pickle
# can execute arbitrary code.
for pickle_path, artifact in (
    ('cosine_sim.pkl', cosine_sim),
    ('title_index.pkl', title_index),
    ('movies.pkl', movies),
):
    with open(pickle_path, 'wb') as pickle_file:
        pickle.dump(artifact, pickle_file)