# TF-IDF vectors


In [7]:
import os
import numpy as np
import pandas as pd

DATA_PATH = "../data/raw/"


In [4]:
# Read the TED talks table and keep only its "transcript" column
ted = pd.read_csv(os.path.join(DATA_PATH, "ted.csv"))["transcript"]


In [5]:
# TfidfVectorizer converts raw text documents into a TF-IDF weighted matrix
from sklearn.feature_extraction.text import TfidfVectorizer

# Default settings: no stop-word list is configured for this vectorizer
vectorizer = TfidfVectorizer()

# Learn the vocabulary from the TED transcripts and vectorize them in one pass
tfidf_matrix = vectorizer.fit_transform(ted)

# Shape is (number of documents, number of vocabulary terms)
print(tfidf_matrix.shape)


(500, 29158)


## Cosine Similarity


In [8]:
# Two small 2-D example vectors
A = np.array([1, 3])
B = np.array([-2, 2])

# Dot product of the two vectors: 1 * (-2) + 3 * 2
dot_prod = np.dot(A, B)
print(dot_prod)


4


In [9]:
# Five short example sentences; each string is treated as one document
# when the corpus is vectorized with TF-IDF further down.
corpus = [
    "The sun is the largest celestial body in the solar system",
    "The solar system consists of the sun and eight revolving planets",
    "Ra was the Egyptian Sun God",
    "The Pyramids were the pinnacle of Egyptian architecture",
    "The quick brown fox jumps over the lazy dog",
]


In [10]:
from sklearn.metrics.pairwise import cosine_similarity


In [11]:
# Separate vectorizer (default settings) for the five-sentence toy corpus
tfidf_vectorizer = TfidfVectorizer()

# Fit on the corpus and build its TF-IDF matrix.
# NOTE: this rebinds `tfidf_matrix`, replacing the TED matrix computed earlier.
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)

# Pairwise cosine similarity between all corpus documents: entry [i, j]
# compares document i with document j, so the diagonal is each document vs. itself
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
print(cosine_sim)


[[1.         0.36413198 0.18314713 0.18435251 0.16336438]
 [0.36413198 1.         0.15054075 0.21704584 0.11203887]
 [0.18314713 0.15054075 1.         0.21318602 0.07763512]
 [0.18435251 0.21704584 0.21318602 1.         0.12960089]
 [0.16336438 0.11203887 0.07763512 0.12960089 1.        ]]


## `linear_kernel` vs `cosine_similarity`


In [13]:
import time
from sklearn.metrics.pairwise import linear_kernel


In [14]:
# Time both kernels on the same TF-IDF matrix. TfidfVectorizer L2-normalises
# its rows by default, so the plain dot product computed by linear_kernel gives
# the same values as cosine_similarity without the extra normalisation step.
for similarity_fn in (cosine_similarity, linear_kernel):
    # perf_counter() is the clock intended for measuring short intervals;
    # time.time() is a wall clock and can jump if the system time is adjusted.
    start = time.perf_counter()

    # Compute the pairwise similarity matrix
    cosine_sim = similarity_fn(tfidf_matrix, tfidf_matrix)

    # Stop the clock before printing so console output is not part of the timing
    elapsed = time.perf_counter() - start

    # Print the similarity matrix, then the time the computation took
    print(cosine_sim)
    print("Time taken: %s seconds" % elapsed)


[[1.         0.36413198 0.18314713 0.18435251 0.16336438]
 [0.36413198 1.         0.15054075 0.21704584 0.11203887]
 [0.18314713 0.15054075 1.         0.21318602 0.07763512]
 [0.18435251 0.21704584 0.21318602 1.         0.12960089]
 [0.16336438 0.11203887 0.07763512 0.12960089 1.        ]]
Time taken: 0.002093791961669922 seconds
[[1.         0.36413198 0.18314713 0.18435251 0.16336438]
 [0.36413198 1.         0.15054075 0.21704584 0.11203887]
 [0.18314713 0.15054075 1.         0.21318602 0.07763512]
 [0.18435251 0.21704584 0.21318602 1.         0.12960089]
 [0.16336438 0.11203887 0.07763512 0.12960089 1.        ]]
Time taken: 0.001165151596069336 seconds


In [24]:
# Movie metadata; read without an index column, so the index is the row position
metadata = pd.read_csv(os.path.join(DATA_PATH, "metadata.csv"))

# Map each title to its row position. Series.drop_duplicates() compares the
# *values* (the row positions, which are always unique), so it never removes
# anything; repeated titles have to be filtered on the index instead. Keeping
# the first occurrence guarantees indices[title] is a single integer.
indices = pd.Series(metadata.index, index=metadata["title"])
indices = indices[~indices.index.duplicated(keep="first")]

# Plot overviews with missing values replaced by empty strings, so the
# vectorizer only ever receives str inputs.
# NOTE(review): assumes the rows of movie_plots.csv line up with metadata.csv — confirm.
movie_plots = pd.read_csv(os.path.join(DATA_PATH, "movie_plots.csv"))["overview"].fillna("")


In [25]:
def get_recommendations(title, cosine_sim, indices, items=None, top_n=10):
    """Return the entries most similar to ``title``, best match first.

    Parameters
    ----------
    title : str
        Key to look up in ``indices``.
    cosine_sim : 2-D array-like
        Pairwise similarity matrix; ``cosine_sim[i][j]`` is the score between
        items ``i`` and ``j``.
    indices : pandas.Series or mapping
        Maps a title to its integer row position in ``cosine_sim``.
    items : pandas.Series, optional
        Series the results are taken from by position. Defaults to the
        notebook-level ``metadata["title"]``.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series
        Up to ``top_n`` entries of ``items``, ordered by decreasing similarity.
        The queried item itself is never included.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    if items is None:
        # Backward-compatible default: titles from the movie metadata table
        items = metadata["title"]

    # Row position of the item that matches the title
    idx = indices[title]
    if isinstance(idx, pd.Series):
        # A title that occurs more than once yields a Series; use the first match
        idx = idx.iloc[0]

    scores = cosine_sim[idx]

    # Rank every *other* row by score. The query row is removed explicitly
    # rather than by dropping the first sorted element: an item with an
    # all-zero vector scores 0 against itself, and an exact duplicate ties at
    # the top, so the first element is not guaranteed to be the query item.
    # sorted() is stable, so ties keep their original row order.
    ranked = sorted(
        (pos for pos in range(len(scores)) if pos != idx),
        key=lambda pos: scores[pos],
        reverse=True,
    )

    return items.iloc[ranked[:top_n]]


In [26]:
# Vectorizer that filters out scikit-learn's built-in English stop-word list
tfidf = TfidfVectorizer(stop_words="english")

# TF-IDF matrix of the movie plot overviews (rebinds `tfidf_matrix`)
tfidf_matrix = tfidf.fit_transform(movie_plots)

# Pairwise dot products between all overview vectors (rebinds `cosine_sim`)
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Print the recommendations returned for the given movie title
print(get_recommendations("The Dark Knight Rises", cosine_sim, indices))


1                              Batman Forever
2                                      Batman
3                              Batman Returns
8                  Batman: Under the Red Hood
9                            Batman: Year One
10    Batman: The Dark Knight Returns, Part 1
11    Batman: The Dark Knight Returns, Part 2
5                Batman: Mask of the Phantasm
7                               Batman Begins
4                              Batman & Robin
Name: title, dtype: object


In [27]:
# Keep only the "title" column of the TED index table (rebinds `indices`).
# NOTE(review): get_recommendations looks entries up with indices[title];
# confirm that this Series is keyed the way that lookup expects.
indices = pd.read_csv(os.path.join(DATA_PATH, "ted_indices.csv"))["title"]

# Alias for the transcript Series loaded at the top of the notebook
transcripts = ted


In [32]:
def get_recommendations(title, cosine_sim, indices, items=None, top_n=10):
    """Return the entries most similar to the talk ``title``, best match first.

    NOTE: this redefines ``get_recommendations`` from earlier in the notebook;
    once this cell has run, the default result source is the ``ted`` Series.

    Parameters
    ----------
    title : str
        Key to look up in ``indices``.
    cosine_sim : 2-D array-like
        Pairwise similarity matrix; ``cosine_sim[i][j]`` is the score between
        items ``i`` and ``j``.
    indices : pandas.Series or mapping
        Maps a title to its integer row position in ``cosine_sim``.
    items : pandas.Series, optional
        Series the results are taken from by position. Defaults to the
        notebook-level ``ted`` Series.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series
        Up to ``top_n`` entries of ``items``, ordered by decreasing similarity.
        The queried item itself is never included.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    if items is None:
        # Backward-compatible default: the TED Series loaded earlier
        items = ted

    # Row position of the talk that matches the title
    idx = indices[title]
    if isinstance(idx, pd.Series):
        # A title that occurs more than once yields a Series; use the first match
        idx = idx.iloc[0]

    # Pairwise similarity scores of this talk against every talk
    scores = cosine_sim[idx]

    # Rank every *other* row by score. The query row is removed explicitly
    # rather than by dropping the first sorted element: an item with an
    # all-zero vector scores 0 against itself, and an exact duplicate ties at
    # the top, so the first element is not guaranteed to be the query item.
    # sorted() is stable, so ties keep their original row order.
    ranked = sorted(
        (pos for pos in range(len(scores)) if pos != idx),
        key=lambda pos: scores[pos],
        reverse=True,
    )

    return items.iloc[ranked[:top_n]]


In [33]:
# Vectorizer that filters out scikit-learn's built-in English stop-word list
tfidf = TfidfVectorizer(stop_words="english")

# TF-IDF matrix of the TED transcripts (rebinds `tfidf_matrix`)
tfidf_matrix = tfidf.fit_transform(transcripts)

# Pairwise dot products between all transcript vectors (rebinds `cosine_sim`)
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Print the recommendations returned for the given talk title
print(get_recommendations("5 ways to kill your dreams", cosine_sim, indices))


490    We've evolved with tools, and tools have evolv...
127    I'd like to start by asking you all to go to y...
139    The problem I want to talk with you about is r...
445    Most of us think of motion as a very visual th...
479    So, as researchers, something that we often do...
219    I'm going to talk about a technology that we'r...
8      I grew up in Northern Ireland, right up in the...
260    You hear that this is the era of environment —...
143    Let's imagine a sculptor building a statue, ju...
202    I love paper, and I love technology, and what ...
Name: transcript, dtype: object


## Word embeddings


In [37]:
import spacy

# Load the "en_core_web_md" spaCy pipeline used by the similarity cells below
nlp = spacy.load("en_core_web_md")
# Example sentence whose tokens are compared pairwise in the next cell
sent = "I like apples and oranges"


In [38]:
# Run the example sentence through the spaCy pipeline
doc = nlp(sent)

# Print the similarity of every ordered token pair, including each token with itself
for token1, token2 in ((left, right) for left in doc for right in doc):
    print(token1.text, token2.text, token1.similarity(token2))


I I 1.0
I like 0.5554912686347961
I apples 0.20442722737789154
I and 0.3160785734653473
I oranges 0.18824081122875214
like I 0.5554912686347961
like like 1.0
like apples 0.32987144589424133
like and 0.5267484784126282
like oranges 0.2771747410297394
apples I 0.20442722737789154
apples like 0.32987144589424133
apples apples 1.0
apples and 0.24097730219364166
apples oranges 0.7780942320823669
and I 0.3160785734653473
and like 0.5267484784126282
and apples 0.24097730219364166
and and 1.0
and oranges 0.19245944917201996
oranges I 0.18824081122875214
oranges like 0.2771747410297394
oranges apples 0.7780942320823669
oranges and 0.19245944917201996
oranges oranges 1.0


In [39]:
# Three multi-line text samples (they read like song lyrics), one "\n"-separated
# line per verse line; they are parsed and compared with Doc.similarity in the next cell.
hopes = "\nBeyond the horizon of the place we lived when we were young\nIn a world of magnets and miracles\nOur thoughts strayed constantly and without boundary\nThe ringing of the division bell had begun\nAlong the Long Road and on down the Causeway\nDo they still meet there by the Cut\nThere was a ragged band that followed in our footsteps\nRunning before times took our dreams away\nLeaving the myriad small creatures trying to tie us to the ground\nTo a life consumed by slow decay\nThe grass was greener\nThe light was brighter\nWhen friends surrounded\nThe nights of wonder\nLooking beyond the embers of bridges glowing behind us\nTo a glimpse of how green it was on the other side\nSteps taken forwards but sleepwalking back again\nDragged by the force of some in a tide\nAt a higher altitude with flag unfurled\nWe reached the dizzy heights of that dreamed of world\nEncumbered forever by desire and ambition\nThere's a hunger still unsatisfied\nOur weary eyes still stray to the horizon\nThough down this road we've been so many times\nThe grass was greener\nThe light was brighter\nThe taste was sweeter\nThe nights of wonder\nWith friends surrounded\nThe dawn mist glowing\nThe water flowing\nThe endless river\nForever and ever\n"
hey = "\nHey you, out there in the cold\nGetting lonely, getting old\nCan you feel me?\nHey you, standing in the aisles\nWith itchy feet and fading smiles\nCan you feel me?\nHey you, don't help them to bury the light\nDon't give in without a fight\nHey you out there on your own\nSitting naked by the phone\nWould you touch me?\nHey you with you ear against the wall\nWaiting for someone to call out\nWould you touch me?\nHey you, would you help me to carry the stone?\nOpen your heart, I'm coming home\nBut it was only fantasy\nThe wall was too high\nAs you can see\nNo matter how he tried\nHe could not break free\nAnd the worms ate into his brain\nHey you, out there on the road\nAlways doing what you're told\nCan you help me?\nHey you, out there beyond the wall\nBreaking bottles in the hall\nCan you help me?\nHey you, don't tell me there's no hope at all\nTogether we stand, divided we fall\n"
mother = "\nMother do you think they'll drop the bomb?\nMother do you think they'll like this song?\nMother do you think they'll try to break my balls?\nOoh, ah\nMother should I build the wall?\nMother should I run for President?\nMother should I trust the government?\nMother will they put me in the firing mine?\nOoh ah,\nIs it just a waste of time?\nHush now baby, baby, don't you cry.\nMama's gonna make all your nightmares come true.\nMama's gonna put all her fears into you.\nMama's gonna keep you right here under her wing.\nShe won't let you fly, but she might let you sing.\nMama's gonna keep baby cozy and warm.\nOoh baby, ooh baby, ooh baby,\nOf course mama's gonna help build the wall.\nMother do you think she's good enough, for me?\nMother do you think she's dangerous, to me?\nMother will she tear your little boy apart?\nOoh ah,\nMother will she break my heart?\nHush now baby, baby don't you cry.\nMama's gonna check out all your girlfriends for you.\nMama won't let anyone dirty get through.\nMama's gonna wait up until you get in.\nMama will always find out where you've been.\nMama's gonna keep baby healthy and clean.\nOoh baby, ooh baby, ooh baby,\nYou'll always be baby to me.\nMother, did it need to be so high?\n"


In [40]:
# Parse the three texts into spaCy Doc objects (same order as before: mother, hopes, hey)
mother_doc, hopes_doc, hey_doc = (nlp(text) for text in (mother, hopes, hey))

# Document-level similarity of `mother` against `hopes`
print(mother_doc.similarity(hopes_doc))

# Document-level similarity of `mother` against `hey`
print(mother_doc.similarity(hey_doc))


0.8700284224193019
0.9604979727674751
