In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns



# Model Building (content-based recommendation)

In [3]:
movies = pd.read_csv('./cleaned_datasets/movies.dat', 
                 delimiter='::', 
                 encoding = 'latin1',
                 header = None,
                 engine = 'python',
                 names = ['movie_id', 'title', 'genres']
)

In [4]:
content_rec = movies.copy()

# clean genres column
content_rec['genres'] = content_rec['genres'].apply(
    lambda genres: [genre.lower().replace("'", '').replace('-', '_') 
                    for genre in genres.split('|')]
)

# clean title column (separate title and year)
content_rec[['title', 'year']] = content_rec['title'].str.extract('(.+) \((\d{4})\)', expand = True)

# check if cleaned properly
content_rec[content_rec['title'].isnull() | content_rec['year'].isnull() | content_rec['genres'].isnull()]

Unnamed: 0,movie_id,title,genres,year
988,1001,,[comedy],


In [5]:
# fix title of movie_id == 10001
pattern = r'^(.*?) \(([^)]+)\)\((\d{4})\)$'
temp_title_and_year = movies[movies['movie_id'] == 1001]['title'].str.extract(pattern)

content_rec.at[988, 'title'] = temp_title_and_year[0].values[0]
content_rec.at[988, 'year'] = temp_title_and_year[2].values[0]

content_rec[content_rec['movie_id'] == 1001]

Unnamed: 0,movie_id,title,genres,year
988,1001,"Associate, The",[comedy],1982


In [6]:
# fix some title strings
content_rec[content_rec.title.str.contains(',')]

Unnamed: 0,movie_id,title,genres,year
10,11,"American President, The","[comedy, drama, romance]",1995
28,29,"City of Lost Children, The","[adventure, sci_fi]",1995
39,40,"Cry, the Beloved Country",[drama],1995
49,50,"Usual Suspects, The","[crime, thriller]",1995
53,54,"Big Green, The","[childrens, comedy]",1995
...,...,...,...,...
3866,3936,"Phantom of the Opera, The","[drama, thriller]",1943
3868,3938,"Slumber Party Massacre, The",[horror],1982
3869,3939,"Slumber Party Massacre II, The",[horror],1987
3870,3940,"Slumber Party Massacre III, The",[horror],1990


In [7]:
def fix_comma(movie_title):
    title_split = movie_title.split(', ')
    return title_split[-1] + ' ' + ' '.join(title_split[:-1])

condition = content_rec.title.str.contains(',')
content_rec.loc[condition, 'title'] = content_rec.loc[condition, 'title'].apply(fix_comma)
content_rec[content_rec.title.str.contains(',')]

Unnamed: 0,movie_id,title,genres,year
1006,1019,"20,000 Leagues Under the Sea","[adventure, childrens, fantasy, sci_fi]",1954
1962,2031,"$1,000,000 Duck","[childrens, comedy]",1971


In [8]:
import requests

headers = {
    "accept": "application/json",
    "Authorization": "Bearer eyJhbGciOiJIUzI1NiJ9.eyJhdWQiOiIxNWRkMTQ2ZWFlYTA2NmJiMWFhZjJhOWYxZjQ2Y2I4MiIsInN1YiI6IjY1YmYwNmYyNDM5OTliMDE4NGM3MDNhZiIsInNjb3BlcyI6WyJhcGlfcmVhZCJdLCJ2ZXJzaW9uIjoxfQ.FgcvH65nHD5eOftKjOZJUI8PhkoCDtnvsf35KxN2RPQ"
}

In [None]:
# !!!!!!!!!!!!!!!! WARNING !!!!!!!!!!!!!!!!!
# The below code requests plots for all movies, so it will take a very long time
# Advised to just use the pickled file and import it

def fetch_plot(row):
    movie_title = row['title']
    movie_year = row['year']
    
    search_url = "https://api.themoviedb.org/3/search/movie"
    params = {
        'query': movie_title,
        'year': movie_year,
    }
    
    res = requests.get(search_url, headers=headers, params=params)
    
    if res.status_code == 200:
        search_results = res.json()
        if 0 < search_results['total_results']:
            return search_results['results'][0]['overview']
        else:
            return "ERROR: movie not found"
    else:
        return "ERROR: cannot call TMDB API"
            
plots = content_rec.apply(fetch_plot, axis = 1)
plots.to_pickle('./datasets/movie_plot_inorder.pkl')

In [16]:
plots = pd.read_pickle('./cleaned_datasets/movie_plot_inorder.pkl')
content_rec['plot'] = plots
content_rec

Unnamed: 0,movie_id,title,genres,year,plot,bag_of_words
0,1,Toy Story,"[animation, childrens, comedy]",1995,"Led by Woody, Andy's toys live happily in his ...",Toy Story Toy Story animation childrens comedy...
1,2,Jumanji,"[adventure, childrens, fantasy]",1995,When siblings Judy and Peter discover an encha...,Jumanji Jumanji adventure childrens fantasy ad...
2,3,Grumpier Old Men,"[comedy, romance]",1995,A family wedding reignites the ancient feud be...,Grumpier Old Men Grumpier Old Men comedy roman...
3,4,Waiting to Exhale,"[comedy, drama]",1995,"Cheated on, mistreated and stepped on, the wom...",Waiting to Exhale Waiting to Exhale comedy dra...
4,5,Father of the Bride Part II,[comedy],1995,Just when George Banks has recovered from his ...,Father of the Bride Part II Father of the Brid...
...,...,...,...,...,...,...
3878,3948,Meet the Parents,[comedy],2000,"Greg Focker is ready to marry his girlfriend, ...",Meet the Parents Meet the Parents comedy comed...
3879,3949,Requiem for a Dream,[drama],2000,The drug-induced utopias of four Coney Island ...,Requiem for a Dream Requiem for a Dream drama ...
3880,3950,Tigerland,[drama],2000,A group of recruits go through Advanced Infant...,Tigerland Tigerland drama drama drama drama 20...
3881,3951,Two Family House,[drama],2000,Buddy Visalo (Michael Rispoli) is a factory wo...,Two Family House Two Family House drama drama ...


In [10]:
# check if there are any rows with missing plots
condition = (content_rec['plot'] == '') | (content_rec['plot'].str.contains('ERROR'))
content_rec[condition]

Unnamed: 0,movie_id,title,genres,year,plot
29,30,Shanghai Triad (Yao a yao yao dao waipo qiao),[drama],1995,ERROR: movie not found
82,83,Once Upon a Time... When We Were Colored,[drama],1995,ERROR: movie not found
119,121,The Boys of St. Vincent,[drama],1993,ERROR: movie not found
125,127,The (Saimt el Qusur) Silence of the Palace,[drama],1994,ERROR: movie not found
126,128,Jupiter's Wife,[documentary],1994,ERROR: movie not found
...,...,...,...,...,...
3820,3890,Back Stage,[documentary],2000,ERROR: movie not found
3832,3902,Goya in Bordeaux (Goya en Bodeos),[drama],1999,ERROR: movie not found
3834,3904,An Uninvited Guest,[drama],2000,ERROR: movie not found
3837,3907,The Prince of Central Park,[drama],1999,ERROR: movie not found


In [11]:
# if we don't have plot, replace with empty string
content_rec.loc[condition, 'plot'] = content_rec.loc[condition, 'plot'].apply(lambda plot_str: '')

content_rec[content_rec['plot'] == '']

Unnamed: 0,movie_id,title,genres,year,plot
29,30,Shanghai Triad (Yao a yao yao dao waipo qiao),[drama],1995,
82,83,Once Upon a Time... When We Were Colored,[drama],1995,
119,121,The Boys of St. Vincent,[drama],1993,
125,127,The (Saimt el Qusur) Silence of the Palace,[drama],1994,
126,128,Jupiter's Wife,[documentary],1994,
...,...,...,...,...,...
3820,3890,Back Stage,[documentary],2000,
3832,3902,Goya in Bordeaux (Goya en Bodeos),[drama],1999,
3834,3904,An Uninvited Guest,[drama],2000,
3837,3907,The Prince of Central Park,[drama],1999,


In [12]:
# NLP text processing; create bag of words to feed into tf-idf vectorization
# join [title, genres, year, plot] into one string, multiplied by their respective weight
content_rec['bag_of_words'] = (content_rec['title'] + ' ' 
    + content_rec['genres'].apply(lambda genres: ' '.join(genres)) + ' ' 
    + content_rec['year'] + ' ' 
    + content_rec['plot'] + ' '
)

content_rec

Unnamed: 0,movie_id,title,genres,year,plot,bag_of_words
0,1,Toy Story,"[animation, childrens, comedy]",1995,"Led by Woody, Andy's toys live happily in his ...",Toy Story animation childrens comedy 1995 Led ...
1,2,Jumanji,"[adventure, childrens, fantasy]",1995,When siblings Judy and Peter discover an encha...,Jumanji adventure childrens fantasy 1995 When ...
2,3,Grumpier Old Men,"[comedy, romance]",1995,A family wedding reignites the ancient feud be...,Grumpier Old Men comedy romance 1995 A family ...
3,4,Waiting to Exhale,"[comedy, drama]",1995,"Cheated on, mistreated and stepped on, the wom...",Waiting to Exhale comedy drama 1995 Cheated on...
4,5,Father of the Bride Part II,[comedy],1995,Just when George Banks has recovered from his ...,Father of the Bride Part II comedy 1995 Just w...
...,...,...,...,...,...,...
3878,3948,Meet the Parents,[comedy],2000,"Greg Focker is ready to marry his girlfriend, ...",Meet the Parents comedy 2000 Greg Focker is re...
3879,3949,Requiem for a Dream,[drama],2000,The drug-induced utopias of four Coney Island ...,Requiem for a Dream drama 2000 The drug-induce...
3880,3950,Tigerland,[drama],2000,A group of recruits go through Advanced Infant...,Tigerland drama 2000 A group of recruits go th...
3881,3951,Two Family House,[drama],2000,Buddy Visalo (Michael Rispoli) is a factory wo...,Two Family House drama 2000 Buddy Visalo (Mich...


In [13]:
# Apply word transformers (ex: Word2Vec) or sentence transformers (ex: BERT) to vectorize as NLP
    # in this case we should use word transformers b/c
    # there is no 'text context' (context independent) in bag of words
# Try getting cosine similarity or use tf-idf

# Define the weights for each column
    # weights must be in whole numbers since TF-IDF count word frequencies,
    # for each column it will be multiplied (repeated) by 'weights'
column_weights = {
    'title': 2,
    'genres': 4,
    'year': 1,
    'plot': 8,
}

content_rec['bag_of_words'] = (
    content_rec['title'].apply(lambda title: (title + ' ') * column_weights['title']) +
    content_rec['genres'].apply(lambda genres: (' '.join(genres) + ' ') * column_weights['genres']) +
    content_rec['year'].apply(lambda title: (title + ' ') * column_weights['year']) +
    content_rec['year'].apply(lambda title: (title + ' ') * column_weights['year'])
)
content_rec

Unnamed: 0,movie_id,title,genres,year,plot,bag_of_words
0,1,Toy Story,"[animation, childrens, comedy]",1995,"Led by Woody, Andy's toys live happily in his ...",Toy Story Toy Story animation childrens comedy...
1,2,Jumanji,"[adventure, childrens, fantasy]",1995,When siblings Judy and Peter discover an encha...,Jumanji Jumanji adventure childrens fantasy ad...
2,3,Grumpier Old Men,"[comedy, romance]",1995,A family wedding reignites the ancient feud be...,Grumpier Old Men Grumpier Old Men comedy roman...
3,4,Waiting to Exhale,"[comedy, drama]",1995,"Cheated on, mistreated and stepped on, the wom...",Waiting to Exhale Waiting to Exhale comedy dra...
4,5,Father of the Bride Part II,[comedy],1995,Just when George Banks has recovered from his ...,Father of the Bride Part II Father of the Brid...
...,...,...,...,...,...,...
3878,3948,Meet the Parents,[comedy],2000,"Greg Focker is ready to marry his girlfriend, ...",Meet the Parents Meet the Parents comedy comed...
3879,3949,Requiem for a Dream,[drama],2000,The drug-induced utopias of four Coney Island ...,Requiem for a Dream Requiem for a Dream drama ...
3880,3950,Tigerland,[drama],2000,A group of recruits go through Advanced Infant...,Tigerland Tigerland drama drama drama drama 20...
3881,3951,Two Family House,[drama],2000,Buddy Visalo (Michael Rispoli) is a factory wo...,Two Family House Two Family House drama drama ...


In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

# Fit a TF-IDF vectorizer on bag of words
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(content_rec['bag_of_words'])

# Train the KNN model for content-based filtering:
# Compute similarity matrix from vectorized [title, year, genres, plot]: 
    # there are several methods of calculating distance (similarity)
    # 1. Euclidean distance
    # 2. Manhattan distance
    # 3. Jaccard distance
    # 4. Cosine distance (using this one)
content_model_knn = NearestNeighbors(metric='cosine', algorithm='brute')
content_model_knn.fit(tfidf_matrix)

In [15]:
from fuzzywuzzy import fuzz
from functools import partial

# Fix wrong input by user
def matching_score(a, b):
    # calculate the Levenshtein distance to find closest title
        # if exactly same score == 100
    return fuzz.ratio(a, b) 
    
def find_closest_title(title):
    leven_scores = list(enumerate(content_rec['title'].apply(matching_score, b = title)))
    sorted_lev_scores = sorted(leven_scores, key = lambda x: x[1], reverse = True)

    closest_idx = sorted_lev_scores[0][0]
    closest_movie = content_rec.loc[closest_idx]
    distance_score = sorted_lev_scores[0][1]

    closest_movie_id = closest_movie['movie_id']
    closest_movie_title = closest_movie['title']
        
    return (closest_movie_id, closest_movie_title, distance_score)

def get_movie_from_idx(idx):
    # get movie details from idx
    title = content_rec.loc[idx, 'title']
    year = content_rec.loc[idx, 'year']

    return title, year
    
def get_idx_from_title(title):
    # convert title to idx
    return content_rec[content_rec['title'] == title].index.values[0]
    
def get_content_based_recs(movie_name, num_recommendations = 5):
    _, closest_name, distance = find_closest_title(movie_name)
    if distance != 100:
        print(f"...WARNING: fixing movie name '{movie_name}' '{closest_name}'")
        movie_name = closest_name

    movie_idx = get_idx_from_title(movie_name)

    distances_cb, idx_cb = content_model_knn.kneighbors(
        tfidf_matrix[movie_idx], 
        n_neighbors = num_recommendations + 1
    )
    
    # actual recommendation
    print(f"Recommended movies similar to '{movie_name}' (content based filtering): ")
    cb_recommendations = []
    for i in range(1, len(distances_cb.flatten())):
        movie_id_cb = movies.iloc[idx_cb.flatten()[i]]['title']
        cb_recommendations.append(movie_id_cb)
    
    for movie in cb_recommendations:
        print(f'\t{movie}')

    return idx_cb, distances_cb

idx_cb, distances_cb = get_content_based_recs('oy story')

Recommended movies similar to 'Toy Story' (content based filtering): 
	Toy Story 2 (1999)
	Balto (1995)
	Bug's Life, A (1998)
	King and I, The (1999)
	Tarzan (1999)


In [None]:
# export models
import joblib

joblib.dump(tfidf_matrix, './models/tfidf_matrix.pkl')
joblib.dump(content_model_knn, './models/content_based_model.pkl')
joblib.dump(content_rec, './models/content_rec.pkl')