In [1]:
import numpy as np
import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

In [2]:
# Load the TMDB "top 10K movies" CSV export and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="./data/top10K-TMDB-movies.csv")
df.head(n=5)

Unnamed: 0,id,title,genre,original_language,overview,popularity,release_date,vote_average,vote_count
0,278,The Shawshank Redemption,"Drama,Crime",en,Framed in the 1940s for the double murder of h...,94.075,1994-09-23,8.7,21862
1,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,Romance",hi,"Raj is a rich, carefree, happy-go-lucky second...",25.408,1995-10-19,8.7,3731
2,238,The Godfather,"Drama,Crime",en,"Spanning the years 1945 to 1955, a chronicle o...",90.585,1972-03-14,8.7,16280
3,424,Schindler's List,"Drama,History,War",en,The true story of how businessman Oskar Schind...,44.761,1993-12-15,8.6,12959
4,240,The Godfather: Part II,"Drama,Crime",en,In the continuing saga of the Corleone crime f...,57.749,1974-12-20,8.6,9811


In [3]:
# Narrow the frame to the identifier, the title and the two text fields
# that are later combined into the "tags" column.
columns = [
    "id",
    "title",
    "genre",
    "overview",
]
df = df[columns]

In [4]:
# Print dtypes and non-null counts per column to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        10000 non-null  int64 
 1   title     10000 non-null  object
 2   genre     9997 non-null   object
 3   overview  9987 non-null   object
dtypes: int64(1), object(3)
memory usage: 312.6+ KB


In [5]:
# Discard every row that has a missing value in any remaining column.
df = df.dropna()

In [6]:
# Merge genre and overview into one free-text "tags" column (separated by a
# single space), then discard the two source columns.
df = (
    df
    .assign(tags=lambda frame: frame["genre"] + " " + frame["overview"])
    .drop(columns=["genre", "overview"])
)

In [7]:
# Show the combined tags text of the row labelled 0.
df.loc[0, "tags"]

'Drama,Crime Framed in the 1940s for the double murder of his wife and her lover, upstanding banker Andy Dufresne begins a new life at the Shawshank prison, where he puts his accounting skills to work for an amoral warden. During his long stretch in prison, Dufresne comes to be admired by the other inmates -- including an older prisoner named Red -- for his integrity and unquenchable sense of hope.'

In [11]:
# Write the current frame to CSV without the index column, presumably so the
# prepared data can be reused outside this notebook.
# NOTE(review): this cell's execution count (11) is out of sequence and is
# shared with the cell that defines `recommend`; restart the kernel and run
# all cells to confirm the notebook still works in this order.
df.to_csv("./data/top10K-TMDB-movies_with_tags.csv", index=False)

In [8]:
# Fit a TF-IDF vectoriser (ignoring English stop words) on every movie's tags
# and encode them; tfidf_matrix has one row per row of df, in the same order.
tfidf = TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf.fit_transform(df["tags"])

In [9]:
# Manual walk-through of the similarity lookup for one title.
movie_title = "Iron Man"
# Vectorise the tags of the row(s) carrying that title with the fitted vectoriser.
movie_matrix = tfidf.transform(df.loc[df["title"] == movie_title, "tags"])
similarity_scores = cosine_similarity(movie_matrix, tfidf_matrix)
# Rank all movies against the first matching row, best first, and keep ten.
top_similar_indices = np.argsort(similarity_scores)[0, ::-1][:10]
top_similar_indices

array([ 969, 3992, 3562, 8906, 2100, 7017, 7365, 1813, 6835, 7518],
      dtype=int64)

In [10]:
# Translate the ranked row positions into movie titles.
df["title"].iloc[top_similar_indices].tolist()

['Iron Man',
 'Iron Man 2',
 'Iron Man 3',
 'Clown',
 'Avengers: Age of Ultron',
 'Tau',
 'Time Trap',
 'Spider-Man: Homecoming',
 'The New Mutants',
 'Cradle 2 the Grave']

In [11]:
def recommend(movie_title, top_n=10):
    """Return the titles of the movies whose tags best match a given movie.

    The tags of the first row of ``df`` whose ``title`` equals
    ``movie_title`` are vectorised with the fitted ``tfidf`` vectoriser and
    compared, by cosine similarity, against every row of ``tfidf_matrix``.

    Parameters
    ----------
    movie_title : str
        Exact title to look up in ``df["title"]``.
    top_n : int, default 10
        Maximum number of titles to return.

    Returns
    -------
    list of str
        Up to ``top_n`` titles, most similar first. The queried movie is not
        filtered out of the ranking, so it normally appears in first place.

    Raises
    ------
    ValueError
        If ``top_n`` is negative or no row of ``df`` has the given title.
    """
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n!r}")

    matching_tags = df.loc[df["title"] == movie_title, "tags"]
    if matching_tags.empty:
        # Fail with a readable message instead of scikit-learn's
        # "Found array with 0 sample(s)" error further down.
        raise ValueError(f"Movie title not found: {movie_title!r}")

    # Only the first match is ever ranked, so only that row is vectorised.
    movie_matrix = tfidf.transform(matching_tags.iloc[:1])
    similarity_scores = cosine_similarity(movie_matrix, tfidf_matrix)

    # argsort is ascending; reverse it to put the best matches first.
    top_similar_indices = similarity_scores.argsort()[0, ::-1][:top_n]

    # These are row positions in tfidf_matrix, so index df positionally
    # (iloc): df's index labels are not guaranteed to be contiguous.
    return df.iloc[top_similar_indices]["title"].tolist()

In [12]:
# Try the helper on a second title.
recommend("The Godfather")

['The Godfather',
 'The Godfather: Part II',
 'The Godfather: Part III',
 'Blood Ties',
 'Proud Mary',
 'The Best of Youth',
 'Four Brothers',
 'Joe',
 'Xtreme',
 'Rampart']

In [13]:
# Persist the TF-IDF matrix. The context manager flushes and closes the file
# handle even if pickling raises, instead of leaving it to garbage collection.
# NOTE(review): only the matrix is written; the fitted `tfidf` vectoriser is
# not saved here, so confirm the consumer of tfidf.pkl does not need it.
with open("tfidf.pkl", "wb") as pickle_file:
    pickle.dump(tfidf_matrix, pickle_file)