# Cosine Similarity

In [1]:
from sklearn.metrics.pairwise import cosine_similarity

# Two example 3-dimensional vectors
A, B = (4, 7, 1), (5, 2, 3)

# Each vector is wrapped in a list so it is passed as a one-row 2-D input;
# the result is a matrix with a single row
score = cosine_similarity([A], [B])

# Show that row, which holds the cosine score of A and B
print(score[0])

[0.73881883]


In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Three short documents to vectorize
corpus = pd.Series([
    'The lion is the king of the jungle',
    'Lions have lifespans of a decade',
    'The lion is an endangered species',
])

# TF-IDF vectorizer that ignores common English stop words
vectorizer = TfidfVectorizer(stop_words='english')

# Learn the vocabulary and build the document-term matrix (one row per document)
tfidf_matrix = vectorizer.fit_transform(corpus)

# Dense view of the weights, followed by the (documents, terms) shape
print(tfidf_matrix.toarray())
tfidf_matrix.shape

[[0.         0.         0.62276601 0.62276601 0.         0.4736296
  0.         0.        ]
 [0.57735027 0.         0.         0.         0.57735027 0.
  0.57735027 0.        ]
 [0.         0.62276601 0.         0.         0.         0.4736296
  0.         0.62276601]]


(3, 8)

In [3]:
from sklearn.metrics.pairwise import linear_kernel
# Document-to-document similarity from the dot products of the TF-IDF rows
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
data = corpus.values

# Query document: position 1 in the corpus
idx = 1
print(cosine_sim[idx])

# Pair every document position with its similarity to the query
sim_scores = list(enumerate(cosine_sim[idx]))
print(sim_scores)

# Rank by score (highest first), skip the top-ranked entry -- here the query
# itself with a score of 1.0 -- and keep at most the next ten
sim_scores = sorted(sim_scores, key=lambda pair: pair[1], reverse=True)[1:11]
movie_indices = [position for position, _ in sim_scores]
[data[position] for position in movie_indices]

[0. 1. 0.]
[(0, 0.0), (1, 1.0), (2, 0.0)]


['The lion is the king of the jungle', 'The lion is an endangered species']

# TED talk recommendations

In [4]:
import pandas as pd
# Load the TED talks table from the local data directory
ted = pd.read_csv('data/ted.csv')

In [5]:
# Preview the first five rows (transcript text and talk URL)
ted.head()

Unnamed: 0,transcript,url
0,"We're going to talk — my — a new lecture, just...",https://www.ted.com/talks/al_seckel_says_our_b...
1,"This is a representation of your brain, and yo...",https://www.ted.com/talks/aaron_o_connell_maki...
2,It's a great honor today to share with you The...,https://www.ted.com/talks/carter_emmart_demos_...
3,"My passions are music, technology and making t...",https://www.ted.com/talks/jared_ficklin_new_wa...
4,It used to be that if you wanted to get a comp...,https://www.ted.com/talks/jeremy_howard_the_wo...


In [6]:
# Summarize the columns, their dtypes and their non-null counts
ted.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   transcript  500 non-null    object
 1   url         500 non-null    object
dtypes: object(2)
memory usage: 7.9+ KB


In [7]:
# Drop rows containing missing values and summarize again; an unchanged
# row count means nothing was missing
ted.dropna().info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 500 entries, 0 to 499
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   transcript  500 non-null    object
 1   url         500 non-null    object
dtypes: object(2)
memory usage: 11.7+ KB


No null values: `dropna()` keeps all 500 rows, so no transcript or URL is missing.

In [8]:
from sklearn.metrics.pairwise import linear_kernel

# TF-IDF vectorizer that skips common English stop words
tfidf = TfidfVectorizer(stop_words='english')

# Fit on the talk transcripts: one TF-IDF row per talk
tfidf_matrix = tfidf.fit_transform(ted['transcript'].values)

# Talk-to-talk similarity matrix from the dot products of the TF-IDF rows
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [9]:
# Query talk: row position 1 of the TED frame
idx = 1
inp = ted.iloc[idx].transcript

# Rank every talk by its similarity to the query (highest first), skip the
# top-ranked entry and keep the next ten
sim_scores = sorted(enumerate(cosine_sim[idx]), key=lambda pair: pair[1], reverse=True)[1:11]

# Row positions of the recommended talks
recommender = [position for position, _ in sim_scores]
[ted.iloc[position] for position in recommender]

[transcript    Let's imagine a sculptor building a statue, ju...
 url           https://www.ted.com/talks/george_tulevski_the_...
 Name: 143, dtype: object,
 transcript    I'd like to apologize, first of all, to all of...
 url           https://www.ted.com/talks/robin_ince_science_v...
 Name: 222, dtype: object,
 transcript    The AlloSphere: it's a three-story metal spher...
 url           https://www.ted.com/talks/joann_kuchera_morin_...
 Name: 414, dtype: object,
 transcript    I'll just start talking about the 17th century...
 url           https://www.ted.com/talks/kary_mullis_on_what_...
 Name: 65, dtype: object,
 transcript    I'm here today to start a revolution. Now befo...
 url           https://www.ted.com/talks/steve_keil_a_manifes...
 Name: 353, dtype: object,
 transcript    We grew up interacting with the physical objec...
 url           https://www.ted.com/talks/pranav_mistry_the_th...
 Name: 271, dtype: object,
 transcript    In the year 1919, a virtually unknown German

# Movie recommender

In [10]:
# Remote location of the movie overviews CSV
url = "https://assets.datacamp.com/production/repositories/4375/datasets/83f27c4ad045c098d3db5596154316e4ee0a28a8/movie_overviews.csv"

In [11]:
import urllib.request 
# Download the CSV into the local data directory; the call returns the
# local path together with the response headers
urllib.request.urlretrieve(url, 'data/movie_overviews.csv')

('data/movie_overviews.csv', <http.client.HTTPMessage at 0x7f14889cb400>)

In [12]:
# Load the downloaded movie overviews and preview the first five rows
movies = pd.read_csv('data/movie_overviews.csv')
movies.head()

Unnamed: 0,id,title,overview,tagline
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ...",
1,8844,Jumanji,When siblings Judy and Peter discover an encha...,Roll the dice and unleash the excitement!
2,15602,Grumpier Old Men,A family wedding reignites the ancient feud be...,Still Yelling. Still Fighting. Still Ready for...
3,31357,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",Friends are the people who let you be yourself...
4,11862,Father of the Bride Part II,Just when George Banks has recovered from his ...,Just When His World Is Back To Normal... He's ...


In [16]:
# Summarize the columns and non-null counts to spot missing overviews
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9099 entries, 0 to 9098
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        9099 non-null   int64 
 1   title     9099 non-null   object
 2   overview  9087 non-null   object
 3   tagline   7033 non-null   object
dtypes: int64(1), object(3)
memory usage: 284.5+ KB


In [20]:
# Keep only the overviews that are present; the Series retains the original row labels
summary = movies['overview'].dropna()

# Plain array of the overview strings, addressed by position
overview = summary.values

# Inspect the overview at position 1
overview[1]

"When siblings Judy and Peter discover an enchanted board game that opens the door to a magical world, they unwittingly invite Alan -- an adult who's been trapped inside the game for 26 years -- into their living room. Alan's only hope for freedom is to finish the game, which proves risky as all three find themselves running from giant rhinoceroses, evil monkeys and other terrifying creatures."

In [21]:
# A float max_df is a proportion of documents and has to lie within
# [0.0, 1.0]. The previous value of 2.0 is outside that range: older
# scikit-learn releases silently treated it as "keep every term", newer ones
# reject it. 1.0 keeps every term explicitly.
vectorizer = TfidfVectorizer(stop_words='english', max_df=1.0)

# One TF-IDF row per overview, in the same order as `overview`
vector = vectorizer.fit_transform(overview)

In [26]:
# Pairwise cosine similarity between all overview vectors
# (one row and one column per overview)
cosine_sim = cosine_similarity(vector, vector)

In [31]:
# Query overview: position 1 among the non-null overviews
idx = 1
inp = summary.iloc[idx]

# Rank every overview by its similarity to the query (highest first), skip
# the top-ranked entry and keep the next ten
sim_scores = sorted(enumerate(cosine_sim[idx]), key=lambda pair: pair[1], reverse=True)[1:11]

# Positions of the recommended overviews
recommender = [position for position, _ in sim_scores]
[summary.iloc[position] for position in recommender]

["Wreck-It Ralph is the 9-foot-tall, 643-pound villain of an arcade video game named Fix-It Felix Jr., in which the game's titular hero fixes buildings that Ralph destroys. Wanting to prove he can be a good guy and not just a villain, Ralph escapes his game and lands in Hero's Duty, a first-person shooter where he helps the game's hero battle against alien invaders. He later enters Sugar Rush, a kart racing game set on tracks made of candies, cookies and other sweets. There, Ralph meets Vanellope von Schweetz who has learned that her game is faced with a dire threat that could affect the entire arcade, and one that Ralph may have inadvertently started.",
 "Video game experts are recruited by the military to fight 1980s-era video game characters who've attacked New York.",
 'A game designer on the run from assassins must play her latest virtual reality creation with a marketing trainee to determine if the game has been damaged.',
 'After the mysterious, brutal death of an old friend, a 

## Real recommender

In [39]:
# Re-check the non-null counts before filtering out movies without an overview
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9099 entries, 0 to 9098
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        9099 non-null   int64 
 1   title     9099 non-null   object
 2   overview  9087 non-null   object
 3   tagline   7033 non-null   object
dtypes: int64(1), object(3)
memory usage: 284.5+ KB


In [48]:
# Keep only the movies that have an overview. The original row labels are
# retained, so a label no longer equals the row position once a row has been
# filtered out.
final_movies = movies[movies.overview.notnull()]

In [49]:
# Confirm that every remaining row has an overview
final_movies.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9087 entries, 0 to 9098
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        9087 non-null   int64 
 1   title     9087 non-null   object
 2   overview  9087 non-null   object
 3   tagline   7033 non-null   object
dtypes: int64(1), object(3)
memory usage: 355.0+ KB


In [51]:
# Overview texts in row order: row i of the matrices below belongs to
# final_movies.iloc[i]
inp = final_movies.overview.values

# A float max_df is a proportion of documents and has to lie within
# [0.0, 1.0]. The previous value of 2.0 is outside that range: older
# scikit-learn releases silently treated it as "keep every term", newer ones
# reject it. 1.0 keeps every term explicitly.
vectorizer = TfidfVectorizer(stop_words='english', max_df=1.0)
vector = vectorizer.fit_transform(inp)

# Movie-to-movie similarity from the dot products of the TF-IDF rows
cosine_sim = linear_kernel(vector, vector)

In [57]:
# Query movie: row position 1 of final_movies
idx = 1
search = final_movies.overview.iloc[idx]

# cosine_sim rows are ordered by row position, while final_movies keeps the
# labels of the unfiltered frame, so map each overview to its position rather
# than its label. Only the first occurrence of a repeated overview is kept so
# that the lookup returns a single value.
indices = pd.Series(range(len(final_movies)), index=final_movies['overview'])
indices = indices[~indices.index.duplicated(keep='first')]
idx2 = indices[search]

# Rank every movie by its similarity to the query, highest first
sim_scores = list(enumerate(cosine_sim[idx2]))
sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
# Skip the top-ranked entry and keep the next ten
sim_scores = sim_scores[1:11]
# Row positions of the recommended movies
recommender = [i[0] for i in sim_scores]
[final_movies.iloc[i].title for i in recommender]

['Wreck-It Ralph',
 'Pixels',
 'eXistenZ',
 'Stay Alive',
 'Gamer',
 "Grandma's Boy",
 'The Last Starfighter',
 'Dungeons & Dragons',
 'Panic Room',
 'Casino Royale']

## Analysis

In [86]:
# Rebuild the similarity matrix from 2- and 3-word phrases, ignoring phrases
# that occur in more than 70% of the overviews
inp = final_movies.overview.values
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7, ngram_range=(2, 3))
vector = vectorizer.fit_transform(inp)
cosine_sim = linear_kernel(vector, vector)

# Query movie: row position 1 of final_movies
idx = 1
search = final_movies.overview.iloc[idx]

# cosine_sim rows are ordered by row position, while final_movies keeps the
# labels of the unfiltered frame, so map each overview to its position rather
# than its label. Only the first occurrence of a repeated overview is kept so
# that the lookup returns a single value.
indices = pd.Series(range(len(final_movies)), index=final_movies['overview'])
indices = indices[~indices.index.duplicated(keep='first')]
idx2 = indices[search]

# Rank every movie by its similarity to the query, highest first
sim_scores = list(enumerate(cosine_sim[idx2]))
sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
# Skip the top-ranked entry and keep the next ten
sim_scores = sim_scores[1:11]
# Row positions of the recommended movies
recommender = [i[0] for i in sim_scores]
[final_movies.iloc[i].title for i in recommender]

['Guardians of the Galaxy',
 'Quarantine',
 'Night of the Living Dead',
 "Amityville: It's About Time",
 'The Giant Spider Invasion',
 'Peter Pan',
 'The Wizard of Oz',
 'The Smurfs',
 'Zathura: A Space Adventure',
 'Martha Marcy May Marlene']

In [87]:
# The similarity matrix is ordered by row position, while final_movies keeps
# the labels of the unfiltered frame (they run up to 9098 for 9087 rows), so
# map each overview to its position rather than its label. Only the first
# occurrence of a repeated overview is kept so that a lookup returns a single
# value.
indices = pd.Series(range(len(final_movies)), index=final_movies['overview'])
indices = indices[~indices.index.duplicated(keep='first')]

def get_recommendation(overview, similarity, top_n=10):
    """Return the rows of ``similarity`` that score highest against ``overview``.

    Parameters
    ----------
    overview : str
        Overview text looked up in the module-level ``indices`` Series; the
        value stored there selects the query row of ``similarity``.
    similarity : 2-D array-like
        Pairwise similarity matrix; row ``k`` holds the scores of entry ``k``
        against every entry.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    list of int
        Up to ``top_n`` row numbers of ``similarity``, best score first and
        never including the query row itself. Entries with equal scores keep
        their original order, and zero-score entries are not filtered out.

    Raises
    ------
    KeyError
        If ``overview`` is not present in ``indices``.
    """
    query = indices[overview]
    # A repeated overview makes the lookup return several entries; use the first
    if isinstance(query, pd.Series):
        query = query.iloc[0]

    # Leave the query row out explicitly instead of assuming it sorts first:
    # it does not when its scores are all zero or tie with another entry.
    scores = [(row, score) for row, score in enumerate(similarity[query]) if row != query]
    # list.sort is stable, so ties stay in their original order
    scores.sort(key=lambda pair: pair[1], reverse=True)
    return [row for row, _ in scores[:max(top_n, 0)]]

In [88]:
def rec(idx):
    """Print a movie's title and overview followed by its recommended titles.

    ``idx`` is the row position of the movie in ``final_movies``. The
    recommendations come from ``get_recommendation`` applied to the
    module-level ``cosine_sim`` matrix.
    """
    selected = final_movies.iloc[idx]
    print("Title: ", selected.title)
    print("Over view: ", selected.overview)

    # Turn the recommended row positions into titles, one per line
    positions = get_recommendation(selected.overview, cosine_sim)
    titles = [final_movies.iloc[position].title for position in positions]
    print("Recommanded Movies title: \n\n", '\n'.join(titles))

In [89]:
# Recommendations for the movie at row position 1 of final_movies
rec(1)

Title:  Jumanji
Over view:  When siblings Judy and Peter discover an enchanted board game that opens the door to a magical world, they unwittingly invite Alan -- an adult who's been trapped inside the game for 26 years -- into their living room. Alan's only hope for freedom is to finish the game, which proves risky as all three find themselves running from giant rhinoceroses, evil monkeys and other terrifying creatures.
Recommanded Movies title: 

 Guardians of the Galaxy
Quarantine
Night of the Living Dead
Amityville: It's About Time
The Giant Spider Invasion
Peter Pan
The Wizard of Oz
The Smurfs
Zathura: A Space Adventure
Martha Marcy May Marlene


In [90]:
# Recommendations for the movie at row position 2 of final_movies
rec(2)

Title:  Grumpier Old Men
Over view:  A family wedding reignites the ancient feud between next-door neighbors and fishing buddies John and Max. Meanwhile, a sultry Italian divorcée opens a restaurant at the local bait shop, alarming the locals who worry she'll scare the fish away. But she's less interested in seafood than she is in cooking up a hot time with Max.
Recommanded Movies title: 

 Grumpy Old Men
The Hundred-Foot Journey
Thor
Leon: The Professional
The Anniversary Party
Toy Story
Jumanji
Waiting to Exhale
Father of the Bride Part II
Heat


In [91]:
# Recommendations for the movie at row position 100 of final_movies
rec(100)

Title:  Braveheart
Over view:  Enraged at the slaughter of Murron, his new bride and childhood love, Scottish warrior William Wallace slays a platoon of the local English lord's soldiers. This leads the village to revolt and, eventually, the entire country to rise up against English rule.
Recommanded Movies title: 

 Certified Copy
Killers
Black Death
Trouble Every Day
Toy Story
Jumanji
Grumpier Old Men
Waiting to Exhale
Father of the Bride Part II
Heat


In [92]:
# Recommendations for the movie at row position 150 of final_movies
rec(150)

Title:  Jeffrey
Over view:  Jeffery, a young gay man in New York, decides that sex is too much and decided to become celibate. He immediately meets the man of his dreams and must decide whether or not love is worth the danger of a boyfriend dying.
Recommanded Movies title: 

 Boys Life: Three Stories of Love, Lust, and Liberation
Summertime
Leon: The Professional
Ladybird Ladybird
Boy Crazy
Six Degrees of Separation
Belle Époque
The Beastmaster
The Age of Adaline
The Butcher's Wife


In [93]:
# Recommendations for the movie at row position 250 of final_movies
rec(250)

Title:  Miami Rhapsody
Over view:  Gwyn Marcus has always wanted a marriage like her parents. She has just accepted the proposal of her boyfriend Matt, but she has misgivings about their future together. Her fear of commitment grows as she learns of the various affairs that her family is having. With her sister getting married and her brother already married, her mother is growing concerned about Gwyn's being the last single person in the family. But the more she thinks about marriage, the more she must search for the balance between career, marriage and family.
Recommanded Movies title: 

 St. Vincent
Shag
Sex and the City
The Human Condition I: No Greater Love
The Daytrippers
My Best Friend's Wedding
La Cage aux folles
Leap Year
Margot at the Wedding
Walking and Talking
