In [1]:
import torch
import pandas as pd
from sentence_transformers import SentenceTransformer, util, models
from sklearn.feature_extraction.text import TfidfVectorizer
from rank_bm25 import BM25Okapi

In [None]:
# Load the raw movie metadata and preview the first rows, one field per line
movies = pd.read_csv('archive/movies_metadata.csv')
movies.head().T

In [None]:
# List the distinct original-language codes present in the dataset
movies['original_language'].unique()

In [5]:
# Keep only the movies whose original language is English, as an independent copy
movies_en = movies.loc[movies['original_language'] == 'en'].copy()

In [None]:
# Drop every row that lacks a value in any of the fields listed below.
# dropna returns a new frame, so movies_en itself is left untouched.
movies_processed = movies_en.dropna(
    subset=[
        'genres', 'id', 'original_language', 'overview',
        'production_companies', 'production_countries',
        'tagline', 'vote_average', 'vote_count', 'title',
    ]
)

In [9]:
# Resource/time budget: continue with a fixed-seed random subset of 1000 movies
movies_processed = movies_processed.sample(random_state=9542, n=1000)

# Persist the sampled subset as CSV, leaving out the index column
movies_processed.to_csv(path_or_buf="processed.csv", index=False)

In [None]:
# Keep only the columns needed by the similarity-based models.
# An explicit copy is taken because a later cell adds an 'OverallInfo' column to
# this frame; assigning to a bare column subset of movies_processed would raise
# pandas' SettingWithCopyWarning.
cols_keep = ['id', 'title', 'overview']
movies_reduced = movies_processed[cols_keep].copy()
movies_reduced.head()

In [None]:
# Build one text field per movie of the form "<title>. <overview>"
movies_reduced['OverallInfo'] = movies_reduced['title'] + '. ' + movies_reduced['overview']
movies_reduced.head()

In [12]:
# Rank every movie against a query embedding by cosine similarity, best match first
def sort_by_cosine(query_embedding,data):
    '''
    Score each row of the movies dataset against a query embedding.

    query_embedding: vector representation of the "query" description.
    data: movies dataset whose OverallInfo column holds one embedding per row
          and whose id column holds the matching movie id.

    Returns a list of (similarity, id) pairs sorted in decreasing order, where
    similarity is the value returned by util.cos_sim(document, query).
    '''
    embeddings = data.OverallInfo
    movie_ids = data.id
    ranked = [
        (util.cos_sim(embeddings.iloc[row], query_embedding), movie_ids.iloc[row])
        for row in range(len(embeddings))
    ]
    return sorted(ranked, reverse=True)

In [13]:
def tfidf_cosine(query,data):
    '''
    Rank all movie descriptions against one query description using TF-IDF
    vectors and cosine similarity.

    query: text of the query description.
    data: movies dataset with the text in OverallInfo and the movie id in id.

    The TF-IDF vocabulary (English stop words removed) is fitted on the
    OverallInfo column of data, and the query is projected onto it.
    Returns the (similarity, id) pairs produced by sort_by_cosine.
    '''
    vectorizer = TfidfVectorizer(stop_words="english")
    vectorized = data.copy()
    doc_matrix = vectorizer.fit_transform(vectorized.OverallInfo).toarray()
    query_vector = vectorizer.transform([query]).toarray()[0]
    # Replace the text column by one dense TF-IDF row vector per movie
    vectorized.OverallInfo = list(doc_matrix)
    return sort_by_cosine(query_vector, vectorized)

In [14]:
def glove_cosine(query,data):
    '''
    Rank all movie descriptions against one query description using
    GloVe-based sentence embeddings and cosine similarity.

    query: text of the query description.
    data: movies dataset with the text in OverallInfo and the movie id in id.

    Returns the (similarity, id) pairs produced by sort_by_cosine.
    '''
    # Loading the model is expensive and this function is called once per movie,
    # so the loaded model is kept on the function object and reused across calls.
    model = getattr(glove_cosine, '_model', None)
    if model is None:
        model = SentenceTransformer('sentence-transformers/average_word_embeddings_glove.840B.300d')
        glove_cosine._model = model
    query_embedding = model.encode(query)
    data_cp = data.copy()
    # Replace the text column by one embedding vector per movie
    data_cp.OverallInfo = list(model.encode(data_cp.OverallInfo.tolist()))
    return sort_by_cosine(query_embedding, data_cp)

In [15]:
def minilm_cosine(query,data):
    '''
    Rank all movie descriptions against one query description using
    MiniLM-based (derived from BERT) sentence embeddings and cosine similarity.

    query: text of the query description.
    data: movies dataset with the text in OverallInfo and the movie id in id.

    The model location is resolved in this order:
      1. the MINILM_MODEL_PATH environment variable, if set;
      2. the original author's local download, if that directory exists;
      3. the hub identifier 'all-MiniLM-L6-v2'.

    Returns the (similarity, id) pairs produced by sort_by_cosine.
    '''
    import os  # local import so this cell does not depend on an edited import cell

    # Loading the model is expensive and this function is called once per movie,
    # so the loaded model is kept on the function object and reused across calls.
    model = getattr(minilm_cosine, '_model', None)
    if model is None:
        # The hub repo did not respond for the original author, who loaded a local
        # download instead; that path is kept as a fallback so their setup still works.
        legacy_local_copy = '/Users/l24cui/Library/CloudStorage/OneDrive-TheUniversityofWesternOntario/Western/W23/CS9542 Artificial Intelligence II/Project/book-recommendation/all-MiniLM-L6-v2'
        model_source = os.environ.get('MINILM_MODEL_PATH')
        if not model_source:
            model_source = legacy_local_copy if os.path.isdir(legacy_local_copy) else 'all-MiniLM-L6-v2'
        model = SentenceTransformer(model_source)
        minilm_cosine._model = model
    query_embedding = model.encode(query)
    data_cp = data.copy()
    # Replace the text column by one embedding vector per movie
    data_cp.OverallInfo = list(model.encode(data_cp.OverallInfo.tolist()))
    return sort_by_cosine(query_embedding, data_cp)

In [16]:
# Result containers, one per similarity measure:
# representation name -> {query movie id -> ranked (similarity, id) pairs}
sim_cos = {representation: {} for representation in ("tfidf", "glove", "bert")}
sim_bm25 = {representation: {} for representation in ("tfidf", "glove", "bert")}

In [None]:
# For every movie, rank the whole collection against its description with cosine
# similarity, once per representation (TF-IDF, GloVe, BERT/MiniLM). Results are
# stored under the query movie's id.
# NOTE: each helper re-vectorises the full dataset on every call, so the cost of
# this loop grows quadratically with the number of movies.
for i, mdesc in enumerate(movies_reduced.OverallInfo):
    mid = movies_reduced.id.iloc[i]
    sim_cos["tfidf"][mid] = tfidf_cosine(mdesc, movies_reduced)
    sim_cos["glove"][mid] = glove_cosine(mdesc, movies_reduced)
    sim_cos["bert"][mid] = minilm_cosine(mdesc, movies_reduced)
# To reuse previously saved results instead of recomputing them:
#sim_cos = torch.load('cosine.pt')

In [18]:
# Persist the cosine-similarity results to disk
torch.save(obj=sim_cos, f='cosine.pt')

In [19]:
# Rank every movie against a query by BM25 score, best match first
def sort_by_bm25(query_embedding,data):
    '''
    Score each row of the movies dataset against a query with BM25Okapi.

    query_embedding: representation of the "query" description, passed to
                     BM25Okapi.get_scores unchanged.
    data: movies dataset; the OverallInfo column is used as the BM25 corpus
          and the id column supplies the movie id of each row.

    Returns a list of (similarity, id) pairs sorted in decreasing order of
    BM25 score.

    NOTE(review): the callers in this notebook store numeric vectors in
    OverallInfo, whereas BM25 is normally applied to tokenized text. Confirm
    that scoring vector components as "terms" is intended.
    '''
    ranker = BM25Okapi(data.OverallInfo.tolist())
    scores = ranker.get_scores(query_embedding)
    ranked = []
    for row in range(len(data.OverallInfo)):
        ranked.append((scores[row], data.id.iloc[row]))
    return sorted(ranked, reverse=True)

In [20]:
def tfidf_bm25(query,data):
    '''
    Rank all movie descriptions against one query description using TF-IDF
    vectors and BM25 similarity.

    query: text of the query description.
    data: movies dataset with the text in OverallInfo and the movie id in id.

    The TF-IDF vocabulary (English stop words removed) is fitted on the
    OverallInfo column of data, and the query is projected onto it.
    Returns the (similarity, id) pairs produced by sort_by_bm25.
    '''
    vectorizer = TfidfVectorizer(stop_words="english")
    vectorized = data.copy()
    doc_matrix = vectorizer.fit_transform(vectorized.OverallInfo).toarray()
    query_vector = vectorizer.transform([query]).toarray()[0]
    # Replace the text column by one dense TF-IDF row vector per movie
    vectorized.OverallInfo = list(doc_matrix)
    return sort_by_bm25(query_vector, vectorized)

In [21]:
def glove_bm25(query,data):
    '''
    Rank all movie descriptions against one query description using
    GloVe-based sentence embeddings and BM25 similarity.

    query: text of the query description.
    data: movies dataset with the text in OverallInfo and the movie id in id.

    Returns the (similarity, id) pairs produced by sort_by_bm25.
    '''
    # Loading the model is expensive and this function is called once per movie,
    # so the loaded model is kept on the function object and reused across calls.
    model = getattr(glove_bm25, '_model', None)
    if model is None:
        model = SentenceTransformer('sentence-transformers/average_word_embeddings_glove.840B.300d')
        glove_bm25._model = model
    query_embedding = model.encode(query)
    data_cp = data.copy()
    # Replace the text column by one embedding vector per movie
    data_cp.OverallInfo = list(model.encode(data_cp.OverallInfo.tolist()))
    return sort_by_bm25(query_embedding, data_cp)

In [22]:
def minilm_bm25(query,data):
    '''
    Rank all movie descriptions against one query description using
    MiniLM-based (derived from BERT) sentence embeddings and BM25 similarity.

    query: text of the query description.
    data: movies dataset with the text in OverallInfo and the movie id in id.

    The model location is resolved in this order:
      1. the MINILM_MODEL_PATH environment variable, if set;
      2. the original author's local download, if that directory exists;
      3. the hub identifier 'all-MiniLM-L6-v2'.

    Returns the (similarity, id) pairs produced by sort_by_bm25.
    '''
    import os  # local import so this cell does not depend on an edited import cell

    # Loading the model is expensive and this function is called once per movie,
    # so the loaded model is kept on the function object and reused across calls.
    model = getattr(minilm_bm25, '_model', None)
    if model is None:
        # The hub repo did not respond for the original author, who loaded a local
        # download instead; that path is kept as a fallback so their setup still works.
        legacy_local_copy = '/Users/l24cui/Library/CloudStorage/OneDrive-TheUniversityofWesternOntario/Western/W23/CS9542 Artificial Intelligence II/Project/book-recommendation/all-MiniLM-L6-v2'
        model_source = os.environ.get('MINILM_MODEL_PATH')
        if not model_source:
            model_source = legacy_local_copy if os.path.isdir(legacy_local_copy) else 'all-MiniLM-L6-v2'
        model = SentenceTransformer(model_source)
        minilm_bm25._model = model
    query_embedding = model.encode(query)
    data_cp = data.copy()
    # Replace the text column by one embedding vector per movie
    data_cp.OverallInfo = list(model.encode(data_cp.OverallInfo.tolist()))
    return sort_by_bm25(query_embedding, data_cp)

In [23]:
# For every movie, rank the whole collection against its description with BM25,
# once per representation (TF-IDF, GloVe, BERT/MiniLM). Results are stored under
# the query movie's id.
# NOTE: each helper re-vectorises the full dataset on every call, so the cost of
# this loop grows quadratically with the number of movies.
for i, mdesc in enumerate(movies_reduced.OverallInfo):
    mid = movies_reduced.id.iloc[i]
    sim_bm25["tfidf"][mid] = tfidf_bm25(mdesc, movies_reduced)
    sim_bm25["glove"][mid] = glove_bm25(mdesc, movies_reduced)
    sim_bm25["bert"][mid] = minilm_bm25(mdesc, movies_reduced)
# To reuse previously saved results instead of recomputing them:
#sim_bm25 = torch.load('bm25.pt')

In [24]:
# Persist the BM25 similarity results to disk
torch.save(obj=sim_bm25, f='bm25.pt')