# Movie Topic Model and Recommendation Model

**Author**: XueMei Jin, Sanhe Hu

## 1. Install Machine Learning Python Librarys

In [1]:
%pip install boto3
%pip install pathlib_mate==0.0.15
%pip install numpy==1.17.0
%pip install pandas==0.23.4
%pip install scikit-learn==0.21.3
%pip install nltk==3.4.4
%pip install stop_words==2018.7.23


Collecting boto3
  Using cached https://files.pythonhosted.org/packages/54/af/5fd15d2273a44ec915ffed2b9c32eaddeea0036e931ae43e7c26cf675bf4/boto3-1.9.210-py2.py3-none-any.whl
Collecting s3transfer<0.3.0,>=0.2.0 (from boto3)
  Using cached https://files.pythonhosted.org/packages/16/8a/1fc3dba0c4923c2a76e1ff0d52b305c44606da63f718d14d3231e21c51b0/s3transfer-0.2.1-py2.py3-none-any.whl
Collecting botocore<1.13.0,>=1.12.210 (from boto3)
  Using cached https://files.pythonhosted.org/packages/a8/02/1b48bdbc12021cbe68b14fdc8f666fc2edf5b0f096965b71a3f1d05c5dca/botocore-1.12.210-py2.py3-none-any.whl
Collecting jmespath<1.0.0,>=0.7.1 (from boto3)
  Using cached https://files.pythonhosted.org/packages/83/94/7179c3832a6d45b266ddb2aac329e101367fbdb11f425f13771d27f225bb/jmespath-0.9.4-py2.py3-none-any.whl
Collecting urllib3<1.26,>=1.20; python_version >= "3.4" (from botocore<1.13.0,>=1.12.210->boto3)
  Using cached https://files.pythonhosted.org/packages/e6/60/247f23a7121ae632d62811ba7f273d0e58972d75e5

In [94]:
data_dir = Path(Path.home(), "rdso-challenge-movie-data")

In [96]:
if not data_dir.exists():
    !aws s3 sync s3://rdso-lab-rdso-lbd-svcs-prod/01-raw/the-movie-db/movie ${HOME}/rdso-challenge-movie-data --profile rdso_sandbox

In [6]:
import os
import boto3

# use IAM role in EC2
if ("AWS_ACCESS_KEY_ID" in os.environ) and ("AWS_SECRET_ACCESS_KEY" in os.environ):
    boto_ses = boto3.session.Session()
else:
    boto_ses = boto3.session.Session(profile_name="rdso_sandbox")

In [1]:
import json
import numpy as np
import pandas as pd
from pathlib_mate import PathCls as Path

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
import nltk 
nltk.download("stopwords")
from stop_words import get_stop_words


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sanhehu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [55]:
# Load movie title and plot.
# in www.themoviedb.org data set, the `original_title` field is movie title, the `overview` field is movie plot
# this block takes 5-10 seconds.
df_idf = list()
for p in Path(data_dir).select_file():
    try:
        data = json.loads(p.read_text(encoding="utf-8"))
        imdb_id = data["imdb_id"]
        the_movie_db_id = data["id"]
        title = data["original_title"]
        overview = data["overview"]
        tagline = data.get("tagline", "")
        genres = ", ".join([dct["name"] for dct in data.get("genres", [])])
        vote_average = data["vote_average"]
        vote_count = data["vote_count"]
        poster_path = data["poster_path"]
        df_idf.append((
            imdb_id, 
            the_movie_db_id,
            title, 
            overview, 
            tagline, 
            genres, 
            vote_average,
            vote_count,
            poster_path,
        ))
    except:
        pass
df_idf = pd.DataFrame(
    df_idf, 
    columns=[
        "imdb_id", "the_movie_db_id", "title", "overview", "tagline", "genres", 
        "vote_average", "vote_count", "poster_path"
    ]
)
df_idf.index = df_idf["imdb_id"] # used to access row by imdb_id
df_idf["text"] = df_idf["title"] + df_idf["overview"]
df_idf.head(3)

Unnamed: 0_level_0,imdb_id,the_movie_db_id,title,overview,tagline,genres,vote_average,vote_count,poster_path,text
imdb_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
tt5027162,tt5027162,476791,Aurora Borealis - Északi fény,The Aurora Borealis is a story of family that ...,,Drama,6.1,5,/rS7EMVkHhDMKR1uOEg7isekmptP.jpg,Aurora Borealis - Északi fényThe Aurora Boreal...
tt1556190,tt1556190,72721,Nostalgia de la luz,"In Chile's Atacama Desert, astronomers peer de...",,Documentary,7.8,59,/sVGSAr0EQQUOXFxlnUyIpAfZ0SJ.jpg,"Nostalgia de la luzIn Chile's Atacama Desert, ..."
tt3859310,tt3859310,341689,How to Talk to Girls at Parties,"Croydon, 1977. A trio of punk teenagers goes t...",Some girls are out of this world,"Comedy, Music, Romance, Science Fiction",6.4,168,/v6mPfyGshwXd1R6kQlMEyZ8Zu2s.jpg,"How to Talk to Girls at PartiesCroydon, 1977. ..."


In [56]:
# Load english stop words
stopwords = get_stop_words("en")
cv = CountVectorizer(max_df=0.85, stop_words=stopwords, max_features=1000)
word_count_vector = cv.fit_transform(df_idf['text'].apply(lambda x: np.str_(x)))

# Define TF-IDF transformer
tfidf_transformer = TfidfTransformer(smooth_idf=True,use_idf=True)
tfidf_transformer.fit(word_count_vector)

  'stop_words.' % sorted(inconsistent))


TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)

In [57]:
def sort_coo_matrix(coo_matrix):
    """
    :type coo_matrix: scipy.sparse.coo.coo_matrix
    
    :rtype: List[Tuple[int, float]]
    :return: List[Tuple[word_index, weight]]
    """
    tuples = zip(coo_matrix.col, coo_matrix.data)
    return list(sorted(tuples, key=lambda x: x[1], reverse=True))


def extract_topn_from_vector(feature_names, sorted_items, topn=5):
    """
    get the feature names and tf-idf score of top n items
    
    :type feature_names: List[str]
    :type sorted_items: List[Tuple[int, float]]
    
    :rtype: Dict[str, float]
    :return: Dict[keyword, weight]
    """
    # use only topn items from vector
    sorted_items = sorted_items[:topn]
 
    score_vals = []
    feature_vals = []
    
    # word index and corresponding tf-idf score
    for idx, score in sorted_items:
        # keep track of feature name and its corresponding score
        score_vals.append(round(score, 3))
        feature_vals.append(feature_names[idx])
 
    # create a tuples of feature,score
    results= {}
    for idx in range(len(feature_vals)):
        results[feature_vals[idx]]=score_vals[idx]
    
    return results


feature_names = cv.get_feature_names()

keyword_list = list()
for text in df_idf["text"]:
    try:
        tf_idf_vector = tfidf_transformer.transform(cv.transform([text, ]))
        sorted_items = sort_coo(tf_idf_vector.tocoo()) 
        keywords = extract_topn_from_vector(feature_names, sorted_items, 5)
        keyword = json.dumps(keywords)
        keyword_list.append(keyword)
    except:
        keyword_list.append(None)
df_idf["keyword"] = keyword_list
df_idf.head(3)

Unnamed: 0_level_0,imdb_id,the_movie_db_id,title,overview,tagline,genres,vote_average,vote_count,poster_path,text,keyword
imdb_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
tt5027162,tt5027162,476791,Aurora Borealis - Északi fény,The Aurora Borealis is a story of family that ...,,Drama,6.1,5,/rS7EMVkHhDMKR1uOEg7isekmptP.jpg,Aurora Borealis - Északi fényThe Aurora Boreal...,
tt1556190,tt1556190,72721,Nostalgia de la luz,"In Chile's Atacama Desert, astronomers peer de...",,Documentary,7.8,59,/sVGSAr0EQQUOXFxlnUyIpAfZ0SJ.jpg,"Nostalgia de la luzIn Chile's Atacama Desert, ...",
tt3859310,tt3859310,341689,How to Talk to Girls at Parties,"Croydon, 1977. A trio of punk teenagers goes t...",Some girls are out of this world,"Comedy, Music, Romance, Science Fiction",6.4,168,/v6mPfyGshwXd1R6kQlMEyZ8Zu2s.jpg,"How to Talk to Girls at PartiesCroydon, 1977. ...",


## Movie Keyword Result Demo

In [58]:
movie_id_mapper = {
    "tt1853728": "Django Unchained",
} # type: Dict[str, str], Dict[imdb_id, movie_title]

sub_df = df_idf.loc[list(movie_id_mapper), ["keyword",]]
for ind, row in sub_df.iterrows():
    movie_title = movie_id_mapper[ind]
    print(movie_title, row["keyword"])

Django Unchained None


## Movie Recommendation Title

**TODO**: Description ...

In [89]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

df_idf["text_new"] = df_idf["genres"].astype(str) + "_" \
    + df_idf["genres"].astype(str) + "_" \
    + df_idf["genres"].astype(str) + "_" \
    + df_idf["title"].astype(str) + "_" \
    + df_idf["title"].astype(str) + "_" \
    + df_idf["overview"].astype(str) + "_" \
    + df_idf["overview"].astype(str) + "_" \
    + df_idf["tagline"].astype(str) + "_" \
    + df_idf["tagline"].astype(str)
df_idf["text_new"].head(3)

imdb_id
tt5027162    Drama_Drama_Drama_Aurora Borealis - Északi fén...
tt1556190    Documentary_Documentary_Documentary_Nostalgia ...
tt3859310    Comedy, Music, Romance, Science Fiction_Comedy...
Name: text_new, dtype: object

In [90]:
# Calculate average of vote
C = df_idf["vote_average"].mean()

# Calculate the minimum number of votes
m = df_idf["vote_count"].quantile(0.90)

# Filter out all qualified movies into a new dataframe
q_movies = df_idf.copy().loc[df_idf["vote_count"] >= m]

# Function that computes the weighted rating of each movie
def weighted_rating(x, m=m, C=C):
    v = x["vote_count"]
    R = x["vote_average"]
    return (v/(v+m) * R) + (m/(m+v) * C)

# Define a new feature 'score' and calculate its value with weighted_rating()`
q_movies["score"] = q_movies.apply(weighted_rating, axis=1)

# Sort movies based on score calculated above
q_movies = q_movies.sort_values('score', ascending=False)

# Display the top 10 movies
q_movies[['title', 'vote_count', 'vote_average', 'score']].head(10)

Unnamed: 0_level_0,title,vote_count,vote_average,score
imdb_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
tt5311514,君の名は。,4374,8.6,8.320712
tt4154796,Avengers: Endgame,8667,8.4,8.262408
tt2582802,Whiplash,8132,8.4,8.253879
tt1375666,Inception,22772,8.3,8.247949
tt4154756,Avengers: Infinity War,14822,8.3,8.220941
tt4633694,Spider-Man: Into the Spider-Verse,4743,8.4,8.159411
tt0816692,Interstellar,19150,8.2,8.140894
tt1675434,Intouchables,10206,8.2,8.091512
tt5726616,Call Me by Your Name,5235,8.3,8.088757
tt2380307,Coco,9069,8.2,8.07862


In [91]:
# Define a TF-IDF Vectorizer Object and remove all english stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words="english")

# Replace NaN with an empty string
df_idf["text_new"] = df_idf["text_new"].fillna("")
df_idf.drop_duplicates("title", inplace=True)
df_idf.drop_duplicates("the_movie_db_id", inplace=True)

# Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(df_idf["text_new"])

# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

df_idf.head(3)

Unnamed: 0_level_0,imdb_id,the_movie_db_id,title,overview,tagline,genres,vote_average,vote_count,poster_path,text,keyword,text_new,similar_movies
imdb_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
tt5027162,tt5027162,476791,Aurora Borealis - Északi fény,The Aurora Borealis is a story of family that ...,,Drama,6.1,5,/rS7EMVkHhDMKR1uOEg7isekmptP.jpg,Aurora Borealis - Északi fényThe Aurora Boreal...,,Drama_Drama_Drama_Aurora Borealis - Északi fén...,"[{""the_movie_db_id"":47194,""title"":""Blood Night..."
tt1556190,tt1556190,72721,Nostalgia de la luz,"In Chile's Atacama Desert, astronomers peer de...",,Documentary,7.8,59,/sVGSAr0EQQUOXFxlnUyIpAfZ0SJ.jpg,"Nostalgia de la luzIn Chile's Atacama Desert, ...",,Documentary_Documentary_Documentary_Nostalgia ...,"[{""the_movie_db_id"":473400,""title"":""Cielo"",""po..."
tt3859310,tt3859310,341689,How to Talk to Girls at Parties,"Croydon, 1977. A trio of punk teenagers goes t...",Some girls are out of this world,"Comedy, Music, Romance, Science Fiction",6.4,168,/v6mPfyGshwXd1R6kQlMEyZ8Zu2s.jpg,"How to Talk to Girls at PartiesCroydon, 1977. ...",,"Comedy, Music, Romance, Science Fiction_Comedy...","[{""the_movie_db_id"":86101,""title"":""The Sleeper..."


We use Cosine similarity to compute cosine distance.

The similarity will be equal to 1 if the two vectors are identical, and it will be 0 if the two are orthogonal. In other words, the similarity is a number bounded between 0 and 1 that tells us how much the two vectors are similar.

In [92]:
# Function that takes in movie title as input and outputs most similar movies

# the_movie_db_id -> df_idf DataFrame row number (0 ~ N) mapper
the_movie_db_id_to_df_row_number_mapper = {
    the_movie_db_id: row_num
    for row_num, the_movie_db_id in enumerate(df_idf["the_movie_db_id"])
}
imdb_id_to_the_movie_db_id = {
    imdb_id: the_movie_db_id
    for imdb_id, the_movie_db_id in zip(df_idf.index, df_idf["the_movie_db_id"])
}

def get_recommendations(the_movie_db_id, cosine_sim=cosine_sim):
    # Get the index of the movie that matches the title
    df_row_num = the_movie_db_id_to_df_row_number_mapper[the_movie_db_id]

    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[df_row_num]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    return movie_indices


# populate recommendation movies
import json

similar_movies_list = list()
for index, row in df_idf.iterrows():
    the_movie_db_id = row["the_movie_db_id"]
    movie_indices = get_recommendations(the_movie_db_id)
    sub_df = df_idf.loc[df_idf.index[movie_indices], ["the_movie_db_id", "title", "poster_path"]]
    json_text = sub_df.to_json(orient="records")
    similar_movies_list.append(json_text)
    
df_idf["similar_movies"] = similar_movies_list
df_idf[["title", "similar_movies"]].head(5)

Unnamed: 0_level_0,title,similar_movies
imdb_id,Unnamed: 1_level_1,Unnamed: 2_level_1
tt5027162,Aurora Borealis - Északi fény,"[{""the_movie_db_id"":405473,""title"":""A Date for..."
tt1556190,Nostalgia de la luz,"[{""the_movie_db_id"":473400,""title"":""Cielo"",""po..."
tt3859310,How to Talk to Girls at Parties,"[{""the_movie_db_id"":86101,""title"":""The Sleeper..."
tt9759978,내안의 그놈,"[{""the_movie_db_id"":439396,""title"":""Mavi Gece""..."
tt5662106,Lemonade,"[{""the_movie_db_id"":500900,""title"":""Ten Years ..."


## Movie Recommendation Model

In [93]:
movie_id_mapper = {
    "tt1853728": "Django Unchained",
    "tt2527336": "Star Wars: The Last Jedi",
    "tt1228705": "Iron Man 2",
    "tt1229238": "Mission: Impossible - Ghost Protocol",
    "tt0435761": "Toy Story 3",
} # type: Dict[str, str], Dict[imdb_id, movie_title]

sub_df = df_idf.loc[list(movie_id_mapper), ["similar_movies",]]
for ind, row in sub_df.iterrows():
    movie_title = movie_id_mapper[ind]
    print("=" * 80)
    print(movie_title)
    for recommended_movie in json.loads(row["similar_movies"]):
        print("\t", recommended_movie)

Django Unchained
	 {'the_movie_db_id': 54054, 'title': 'One for the Money', 'poster_path': '/g8BxtvxvfbiM1UsJUyFjqeygrye.jpg'}
	 {'the_movie_db_id': 333993, 'title': 'Critters: Bounty Hunter', 'poster_path': '/1KDtbPfcWKc2vEtgYg6PhOMOGqE.jpg'}
	 {'the_movie_db_id': 256467, 'title': 'Honour', 'poster_path': '/ywBmHEjfYAH4ycjFW9BQreA9A3g.jpg'}
	 {'the_movie_db_id': 456781, 'title': 'Boone : The Bounty Hunter', 'poster_path': '/akqKTfd6EbUQamcvVrJS1CxOdUH.jpg'}
	 {'the_movie_db_id': 41441, 'title': '불꽃처럼 나비처럼', 'poster_path': '/bGBChmdEJ10OcrUq6SSZ8DFFSTo.jpg'}
	 {'the_movie_db_id': 293859, 'title': 'Covert Operation', 'poster_path': '/d7sEm5AdeTMerZ4gwFxLspDyVHQ.jpg'}
	 {'the_movie_db_id': 169800, 'title': 'The Retrieval', 'poster_path': '/buXVsenVhRxWpTGTplzJujJ38zk.jpg'}
	 {'the_movie_db_id': 388764, 'title': '赏金猎人', 'poster_path': '/h121QRQzEcchU7tkbHjgcWwbuwH.jpg'}
	 {'the_movie_db_id': 405882, 'title': "Don't Kill It", 'poster_path': '/iQey3UeqU4uQzr82GQJ1pZMwe40.jpg'}
	 {'the_movie