In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from skimage import io
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
df = pd.read_csv('../datasets/movies.csv')
df.head(n=2)

Unnamed: 0,id,imdb_id,title,genres,original_language,overview,popularity,production_companies,release_date,budget,...,runtime,status,tagline,vote_average,vote_count,credits,keywords,poster_path,backdrop_path,recommendations
0,414906,tt1877830,The Batman,Crime-Mystery-Thriller,en,In his second year of fighting crime Batman un...,8195.597,6th & Idaho-Dylan Clark Productions-DC Films-W...,2022-03-01,185000000,...,176.0,Released,Unmask the truth.,7.8,4165,Robert Pattinson-Zoë Kravitz-Paul Dano-Jeffrey...,crime fighter-secret identity-nightclub-politi...,/74xTEgt7R36Fpooo50r9T25onhq.jpg,/tRS6jvPM9qPrrnx2KRp3ew96Yot.jpg,335787-508947-696806-777270-833425-718032-5050...
1,634649,tt10872600,Spider-Man: No Way Home,Action-Adventure-Science Fiction,en,Peter Parker is unmasked and no longer able to...,4716.995,Marvel Studios-Pascal Pictures-Columbia Pictures,2021-12-15,200000000,...,148.0,Released,The Multiverse unleashed.,8.1,12300,Tom Holland-Zendaya-Benedict Cumberbatch-Jacob...,new york city-loss of loved one-showdown-secre...,/1g0dhYtq4irTY1GPXvft6k4YLjm.jpg,/iQFcwSGbZXMkeyKrxbPnwnRo5fl.jpg,646380-624860-568124-524434-580489-425909-4766...


<h2>Feature Extraction</h2>

In [3]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
from nltk.corpus import wordnet, stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
df['overview']= df['overview'].fillna('')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/klebervasconcelos/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/klebervasconcelos/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/klebervasconcelos/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
sample_df = df.sample(n=5)
sample_df.head()

Unnamed: 0,id,imdb_id,title,genres,original_language,overview,popularity,production_companies,release_date,budget,...,runtime,status,tagline,vote_average,vote_count,credits,keywords,poster_path,backdrop_path,recommendations
277193,278826,tt0092998,Eye on the Sparrow,Drama-TV Movie,en,True story of a blind couple who fought the au...,1.754,,1987-12-07,0,...,96.0,Released,,7.0,1,Mare Winningham-Keith Carradine-Sandy McPeak-K...,,/drMT2OnphALjOngvnQmhpObtRP1.jpg,,
62213,251689,tt1706481,Sex Express Coffee,Crime-Drama-Thriller,es,Police detective Escobar attempts to capture a...,2.465,MCM Studios,2010-05-25,0,...,87.0,Released,"Here, the holiest temptation falls.",4.2,9,Ricardo Bonno-Fernando Consagra-Diego de Erice...,police investigation-online hookup,/uyKD8lPJcndgxY8ORSNplKapCM0.jpg,/mHAXyacdn2do4RGF0BQ2392Js9Y.jpg,
35277,169025,tt2638096,The Real Life of Teachers,Comedy,fr,A comedy about a band of voyeuristic students ...,4.816,Universal Pictures France,2013-02-20,0,...,97.0,Released,,4.8,146,Lucien Jean-Baptiste-Audrey Fleurot-Emir Seghi...,,/2j81UTWwKCISexGjxpWEaiX2fCg.jpg,/cCZ2a3HSAsmUPD2CjCcHKzyXAUK.jpg,182219-325844-418333-66129-344268-262551-28373...
156305,313778,,TNA Hardcore Justice 2010,Drama-Action,en,Total Nonstop Action (TNA) Wrestling President...,1.96,Total Nonstop Action (TNA),2010-08-08,0,...,360.0,Released,,5.0,1,Rob Van Dam-Sabu-Scott Levy-Tommy Dreamer-Mark...,wrestling-pro wrestling,/wnjkyTwaSiONNqvmwHS8Xs0wlJY.jpg,,
269544,270283,tt1485686,Boronia Boys,,en,Kane and Darren are two 'rag and bone men' who...,0.915,Star Baby Productions,2009-08-25,0,...,90.0,Released,,2.0,1,Cameron Nugent-Tim Burns-Elspeth Ballantyne-Ma...,,/lrh8QOH9Se0Rv59DTrS7XUXKZJ5.jpg,,


In [5]:
sample_df['overview']=sample_df['overview'].str.lower().apply(word_tokenize).apply(nltk.tag.pos_tag) # Applying part of speech tags.
sample_df.head()

Unnamed: 0,id,imdb_id,title,genres,original_language,overview,popularity,production_companies,release_date,budget,...,runtime,status,tagline,vote_average,vote_count,credits,keywords,poster_path,backdrop_path,recommendations
277193,278826,tt0092998,Eye on the Sparrow,Drama-TV Movie,en,"[(true, JJ), (story, NN), (of, IN), (a, DT), (...",1.754,,1987-12-07,0,...,96.0,Released,,7.0,1,Mare Winningham-Keith Carradine-Sandy McPeak-K...,,/drMT2OnphALjOngvnQmhpObtRP1.jpg,,
62213,251689,tt1706481,Sex Express Coffee,Crime-Drama-Thriller,es,"[(police, NNS), (detective, JJ), (escobar, NN)...",2.465,MCM Studios,2010-05-25,0,...,87.0,Released,"Here, the holiest temptation falls.",4.2,9,Ricardo Bonno-Fernando Consagra-Diego de Erice...,police investigation-online hookup,/uyKD8lPJcndgxY8ORSNplKapCM0.jpg,/mHAXyacdn2do4RGF0BQ2392Js9Y.jpg,
35277,169025,tt2638096,The Real Life of Teachers,Comedy,fr,"[(a, DT), (comedy, NN), (about, IN), (a, DT), ...",4.816,Universal Pictures France,2013-02-20,0,...,97.0,Released,,4.8,146,Lucien Jean-Baptiste-Audrey Fleurot-Emir Seghi...,,/2j81UTWwKCISexGjxpWEaiX2fCg.jpg,/cCZ2a3HSAsmUPD2CjCcHKzyXAUK.jpg,182219-325844-418333-66129-344268-262551-28373...
156305,313778,,TNA Hardcore Justice 2010,Drama-Action,en,"[(total, JJ), (nonstop, JJ), (action, NN), ((,...",1.96,Total Nonstop Action (TNA),2010-08-08,0,...,360.0,Released,,5.0,1,Rob Van Dam-Sabu-Scott Levy-Tommy Dreamer-Mark...,wrestling-pro wrestling,/wnjkyTwaSiONNqvmwHS8Xs0wlJY.jpg,,
269544,270283,tt1485686,Boronia Boys,,en,"[(kane, NN), (and, CC), (darren, NNS), (are, V...",0.915,Star Baby Productions,2009-08-25,0,...,90.0,Released,,2.0,1,Cameron Nugent-Tim Burns-Elspeth Ballantyne-Ma...,,/lrh8QOH9Se0Rv59DTrS7XUXKZJ5.jpg,,


In [6]:
sample_df['overview'].head()

277193    [(true, JJ), (story, NN), (of, IN), (a, DT), (...
62213     [(police, NNS), (detective, JJ), (escobar, NN)...
35277     [(a, DT), (comedy, NN), (about, IN), (a, DT), ...
156305    [(total, JJ), (nonstop, JJ), (action, NN), ((,...
269544    [(kane, NN), (and, CC), (darren, NNS), (are, V...
Name: overview, dtype: object

In [7]:
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN
    
sample_df['overview']= sample_df['overview'].apply(lambda x: [(word, get_wordnet_pos(pos_tag)) for (word, pos_tag) in x])
sample_df['overview'].head()

277193    [(true, a), (story, n), (of, n), (a, n), (blin...
62213     [(police, n), (detective, a), (escobar, n), (a...
35277     [(a, n), (comedy, n), (about, n), (a, n), (ban...
156305    [(total, a), (nonstop, a), (action, n), ((, n)...
269544    [(kane, n), (and, n), (darren, n), (are, v), (...
Name: overview, dtype: object

In [8]:
# Applying word lemmatizer.

wnl = WordNetLemmatizer()

sample_df['overview']= sample_df['overview'].apply(lambda x: [wnl.lemmatize(word, tag) for word, tag in x])

sample_df['overview']= sample_df['overview'].apply(lambda x: ' '.join(x))

sample_df.head()


Unnamed: 0,id,imdb_id,title,genres,original_language,overview,popularity,production_companies,release_date,budget,...,runtime,status,tagline,vote_average,vote_count,credits,keywords,poster_path,backdrop_path,recommendations
277193,278826,tt0092998,Eye on the Sparrow,Drama-TV Movie,en,true story of a blind couple who fight the aut...,1.754,,1987-12-07,0,...,96.0,Released,,7.0,1,Mare Winningham-Keith Carradine-Sandy McPeak-K...,,/drMT2OnphALjOngvnQmhpObtRP1.jpg,,
62213,251689,tt1706481,Sex Express Coffee,Crime-Drama-Thriller,es,police detective escobar attempt to capture an...,2.465,MCM Studios,2010-05-25,0,...,87.0,Released,"Here, the holiest temptation falls.",4.2,9,Ricardo Bonno-Fernando Consagra-Diego de Erice...,police investigation-online hookup,/uyKD8lPJcndgxY8ORSNplKapCM0.jpg,/mHAXyacdn2do4RGF0BQ2392Js9Y.jpg,
35277,169025,tt2638096,The Real Life of Teachers,Comedy,fr,a comedy about a band of voyeuristic student w...,4.816,Universal Pictures France,2013-02-20,0,...,97.0,Released,,4.8,146,Lucien Jean-Baptiste-Audrey Fleurot-Emir Seghi...,,/2j81UTWwKCISexGjxpWEaiX2fCg.jpg,/cCZ2a3HSAsmUPD2CjCcHKzyXAUK.jpg,182219-325844-418333-66129-344268-262551-28373...
156305,313778,,TNA Hardcore Justice 2010,Drama-Action,en,total nonstop action ( tna ) wrestle president...,1.96,Total Nonstop Action (TNA),2010-08-08,0,...,360.0,Released,,5.0,1,Rob Van Dam-Sabu-Scott Levy-Tommy Dreamer-Mark...,wrestling-pro wrestling,/wnjkyTwaSiONNqvmwHS8Xs0wlJY.jpg,,
269544,270283,tt1485686,Boronia Boys,,en,kane and darren be two 'rag and bone men ' who...,0.915,Star Baby Productions,2009-08-25,0,...,90.0,Released,,2.0,1,Cameron Nugent-Tim Burns-Elspeth Ballantyne-Ma...,,/lrh8QOH9Se0Rv59DTrS7XUXKZJ5.jpg,,


In [9]:
def lammatization():
    import nltk
    nltk.download('punkt')
    nltk.download('averaged_perceptron_tagger')
    nltk.download('wordnet')
    from nltk.corpus import wordnet, stopwords
    from nltk.stem import WordNetLemmatizer
    from nltk.tokenize import word_tokenize
    df['overview']= df['overview'].fillna('')

    # Tokenizing the tweet base texts (splitting the text into individual words)
    # and applying part of speech tags to each word

    df['overview']=df['overview'].str.lower().apply(word_tokenize).apply(nltk.tag.pos_tag) 


    # Converting part of speeches to wordnet format.

    def get_wordnet_pos(tag):
        if tag.startswith('J'):
            return wordnet.ADJ
        elif tag.startswith('V'):
            return wordnet.VERB
        elif tag.startswith('N'):
            return wordnet.NOUN
        elif tag.startswith('R'):
            return wordnet.ADV
        else:
            return wordnet.NOUN


    df['overview']= df['overview'].apply(lambda x: [(word, get_wordnet_pos(pos_tag)) for (word, pos_tag) in x])

    # Applying word lemmatizer.

    wnl = WordNetLemmatizer()

    df['overview']= df['overview'].apply(lambda x: [wnl.lemmatize(word, tag) for word, tag in x])


    df['overview']= df['overview'].apply(lambda x: ' '.join(x))

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

df.drop_duplicates(inplace=True, ignore_index=True)
#df = df.groupby('title').first().reset_index()
df.fillna(value={i: ' ' for i in ['overview', 'genres', 'keywords', 'credits']}, inplace=True)

# lambda func for str split join
strOp= lambda x: ' '.join(x.split('-'))

df.overview = df.overview + df.keywords.apply(strOp) + df.genres.apply(strOp) + df.credits.apply(lambda x: ' '.join(x.replace(' ', '').split('-')[:3]))

#Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words='english')

#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(df['overview'])

display(pd.DataFrame(
    tfidf_matrix[:10, 7000:7070].toarray(),
    columns= tfidf.get_feature_names_out()[7000:7070],
    index = df.title[:10]).round())

print(tfidf_matrix.shape)

Unnamed: 0_level_0,actionmattfarley,actionmatthewchamp,actionmatthewmercer,actionmatthewmodine,actionmatthewreese,actionmatthewwillig,actionmatthewwolf,actionmatthiasschoenaerts,actionmattmccolm,actionmattmccoy,...,actionmichaelbeck,actionmichaelbell,actionmichaelbiehn,actionmichaelbloom,actionmichaelbowen,actionmichaelbrandon,actionmichaelbrianrawlins,actionmichaelbugard,actionmichaelcaine,actionmichaelcera
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
The Batman,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Spider-Man: No Way Home,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
The Outfit,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Turning Red,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Sonic the Hedgehog 2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Yaksha: Ruthless Operations,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Moonfall,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Uncharted,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
All the Old Knives,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Blacklight,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


(230160, 460309)


<h3>Recommender Function<h3>

In [13]:
from sklearn.metrics.pairwise import cosine_similarity
import skimage.io as io

# Function that takes in movie title as input and outputs most similar movies
def get_recommendations(title):
    # Get the index of the movie that matches the title
    idx = df.index[df['title'] == title][0]
    print(idx)
    # show given movie poster
    # try:
    #     a = io.imread(f'https://image.tmdb.org/t/p/w500/{df.loc[idx, "poster_path"]}')
    #     plt.imshow(a)
    #     plt.axis('off')
    #     plt.title(title)
    #     plt.show()
    # except:pass
    
    # print('Recommendations\n')


    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(
        cosine_similarity(
            tfidf_matrix,
            tfidf_matrix[idx])))
    
  

    # # Sort the movies based on the similarity scores
    # sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # # Get the scores of the 10 most similar movies
    # sim_scores = sim_scores[1:10]

    # # Get the movie indices
    # movie_indices = [i[0] for i in sim_scores]

    # # Return the top 10 most similar movies
    # result = df.iloc[movie_indices]
    
    # # show reco. movie posters
    # fig, ax = plt.subplots(3, 3, figsize=(15,20))
    # ax=ax.flatten()
    # for i, j in enumerate(result.poster_path):
    #     try:
    #         ax[i].axis('off')
    #         ax[i].set_title(result.iloc[i].title,fontsize=22)
    #         a = io.imread(f'https://image.tmdb.org/t/p/w500/{j}')
    #         ax[i].imshow(a)          
    #     except: pass
    # fig.tight_layout()
    # fig.show()

In [14]:
get_recommendations("Godzilla")

745
