# LAB 4 - Semantic Embeddings

## Exercise 2 – Word2vec


### 1. Import spacy library and load the en_core_web_md model


In [None]:
# Install spacy library
!pip install spacy

In [None]:
# Download the English model
import spacy
spacy.cli.download("en_core_web_md")
nlp = spacy.load("en_core_web_md")

### 2. Show the word vector for “football”, how long is it?


In [None]:
#Get the representation of the word "football"
football = nlp("football")
print(football.vector)

### 3. Show the word vector for “frankfurteria”, how long is it?

Only words in model’s vocabulary have vectors, the rest are called out-ofvocabulary (OOV)

In [None]:
#Get the representation of the word "frankfurteria" and show the length of the vector

vec_frankfurteria = nlp("frankfurteria").vector

print(vec_frankfurteria, "\nLenght of the vector:", len(vec_frankfurteria))

As you can see frankfurteria is an Out Of Vocabulary word

###  4. Check whether the word “flowers” is in the model vocabulary


In [None]:
# Check whether the word "flowers" is in the vocabulary

if "flowers" in nlp.vocab.strings:
  print("The word 'flowers' is in the vocabulary")

else:
  print("The word 'flowers' is not in the vocabulary")

#Get the representation of the word "flowers"
#print(nlp("flowers").vector)

### 5. Create a sentence including the word ”football”, and show the sentence vector


In [None]:
sentence = "Football is a great sport"
sent_vec = nlp(sentence).vector
print(sent_vec)

### 6. How long is the sentence vector? How is it calculated?


In [None]:
print("\nLenght of the vector:", len(sent_vec))

Even the sentence have many words, the length is 300 because they do the average of the vectors of all the sentence 


## Exercise 3 – Similarity

### 1. Define the two utterances “I visited Scotland” and “I went to Edinburgh”


In [None]:
utt1 = "I visited Scotland"
utt2 = "I went to Edinburgh"
utt1_vec = nlp(utt1).vector
utt2_vec = nlp(utt2).vector

### 2. Calculate the similarity between these two sentences


In [None]:
utt1 = nlp("I visited Scotland")
utt2 = nlp("I went to Edinburgh")

utt1.similarity(utt2)

In [None]:
import numpy as np
from numpy.linalg import norm
 
 
# compute cosine similarity
cosine = np.dot(utt1_vec,utt2_vec)/(norm(utt1_vec)*norm(utt2_vec))
print("The cosine similarity is:",cosine)



Define two similar sentences and calculate their similarity,
then define two very different sentences and calculate their similarity

In [None]:
# Compute similarities of 2 similar sentences
sent_1 = "I do not like football"
sent_2 = "I hate soccer"
sent_1_vec = nlp(sent_1).vector
sent_2_vec = nlp(sent_2).vector
cosine = np.dot(sent_1_vec,sent_2_vec)/(norm(sent_1_vec)*norm(sent_2_vec))
print(f"The cosine similarity for sentences: 'I do not like football' and 'I hate soccer' is:\n{cosine}")



In [None]:
# Compute similarities of 2 dissimilar sentences
sent_1 = "I do not like football"
sent_2 = "I love soccer"
sent_1_vec = nlp(sent_1).vector
sent_2_vec = nlp(sent_2).vector
cosine = np.dot(sent_1_vec,sent_2_vec)/(norm(sent_1_vec)*norm(sent_2_vec))
print(f"The cosine similarity for sentences: 'I do not like football' and 'I hate soccer' is:\n{cosine}")



### 3. Consider the following words [cat, dog, tiger, elephant, bird, monkey, lion, cheetah, burger, pizza, food, cheese, wine, salad, noodles, fruit, vegetables]

In [None]:
embed_list = []
word_list = ["cat", "dog", "tiger", "elephant", "bird", "monkey", "lion","cheetah", "burger", "pizza", "food", "cheese", "wine", "salad", "noodles", "fruit", "vegetables"]

### 4. Calculate the word vector for every word




In [None]:
for word in word_list:
    embed_list.append(nlp(word).vector)

### 5. Apply a PCA, consider the first two components, and epresent the words in the feature space

In [None]:
import numpy as np
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
    
pca = PCA(n_components=2)
embed_pca = pca.fit_transform(embed_list)

x = [vector[0] for vector in embed_pca] # x-axis
y = [vector[1] for vector in embed_pca] # y-axis

plt.scatter(x,y, marker='o') #
for i, word in enumerate(word_list):
    plt.annotate(word, (x[i], y[i]))
    
plt.show()

#Print the amount of variance explained by each of the selected components
var_explained = pca.explained_variance_ratio_
print(f"The amount of variance explained by each of the selected components is:\n component 1: {var_explained[0]}\n component 2: {var_explained[1]}\n")


In [None]:
#Print the amount of variance explained by each of the selected components
var_explained = pca.explained_variance_ratio_
print(f"The amount of variance explained by each of the selected components is:\n component 1: {var_explained[0]}\n component 2: {var_explained[1]}\n")

Now let's do it in 3d

In [None]:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

# 3D PCA

pca = PCA(n_components=3)
embed_pca = pca.fit_transform(embed_list)

x = [vector[0] for vector in embed_pca] # x-axis
y = [vector[1] for vector in embed_pca] # y-axis
z = [vector[2] for vector in embed_pca] # z-axis

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(x, y, z, marker='o')

for i, word in enumerate(word_list):
    ax.text(x[i], y[i], z[i], word)
    
plt.show()

#Print the amount of variance explained by each of the selected components
var_explained = pca.explained_variance_ratio_
print(f"The amount of variance explained by each of the selected components is:\n component 1: {var_explained[0]}\n component 2: {var_explained[1]}\n component 3: {var_explained[2]}")


In [None]:
#Print the amount of variance explained by each of the selected components
var_explained = pca.explained_variance_ratio_
print(f"The amount of variance explained by each of the selected components is:\n component 1: {var_explained[0]}\n component 2: {var_explained[1]}\n component 3: {var_explained[2]}")

Define a new set of words (at least 20 different words), and
represent them in the feature space

In [None]:
common_words = ['Apple', 'Banana', 'Grapes', 'Pear', 'Orange', 'Melon', 'Tomato', 'pineaple', 
                'raspberry', 'watermelon', 'Information', 'Data', 'Bit', 'Computer', 'Mouse', 
                'Tower', 'Screen', 'Music', 'Network', 'Phone']
embed_list1 =[]
for word1 in common_words:
    embed_list1.append(nlp(word1).vector)
pca = PCA(n_components=2)
embed_pca1 = pca.fit_transform(embed_list1)

x = [vector1[0] for vector1 in embed_pca1] # x-axis
y = [vector1[1] for vector1 in embed_pca1] # y-axis

plt.scatter(x,y, marker='o') #
for i, word1 in enumerate(common_words):
    plt.annotate(word1, (x[i], y[i]))



plt.show()

#Print the amount of variance explained by each of the selected components
var_explained = pca.explained_variance_ratio_
print(f"The amount of variance explained by each of the selected components is:\n component 1: {var_explained[0]}\n component 2: {var_explained[1]}\n")

In [None]:
#Print the amount of variance explained by each of the selected components
var_explained = pca.explained_variance_ratio_
print(f"The amount of variance explained by each of the selected components is:\n component 1: {var_explained[0]}\n component 2: {var_explained[1]}\n")

In [None]:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
common_words = ['Apple', 'Banana', 'Grapes', 'Pear', 'Orange', 'Melon', 'Tomato', 'pineaple', 
                'raspberry', 'watermelon', 'Information', 'Data', 'Bit', 'Computer', 'Mouse', 
                'Tower', 'Screen', 'Music', 'Network', 'Phone']
# 3D PCA
embed_list =[]
for word in common_words:
    embed_list.append(nlp(word).vector)

pca = PCA(n_components=3)
embed_pca = pca.fit_transform(embed_list)

x = [vector[0] for vector in embed_pca] # x-axis
y = [vector[1] for vector in embed_pca] # y-axis
z = [vector[2] for vector in embed_pca] # z-axis

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(x, y, z, marker='o')

for i, word in enumerate(common_words):
    ax.text(x[i], y[i], z[i], word)
    
plt.show()

#Print the amount of variance explained by each of the selected components
var_explained = pca.explained_variance_ratio_
print(f"The amount of variance explained by each of the selected components is:\n component 1: {var_explained[0]}\n component 2: {var_explained[1]}\n component 3: {var_explained[2]}")

In [None]:
var_explained = pca.explained_variance_ratio_
print(f"The amount of variance explained by each of the selected components is:\n component 1: {var_explained[0]}\n component 2: {var_explained[1]}\n component 3: {var_explained[2]}")

## Exercise IV – Categorizing text with semantic similarity

### 1. Define a set of sentences, e.g., “I purchased a science fiction book last week. I loved this fragrance: light, floral and feminine. I purchased a bottle of wine.”

In [None]:
sent = nlp("I purchased a science fiction book last week. I loved this fragrance: light, floral and feminine. I purchased a bottle of wine.")

### 2. Define a keyword, e.g., perfume


In [None]:
keyword = nlp("perfume")

### 3. Calculate the similarity between each sentence and the keyword


In [None]:
# Similarity between the sentence and the keyword
sent.similarity(keyword)


### 4. Could we filter out the sentences which are not related with the keyword?


In [None]:
# filter words that are not related with the sentence
#filtered_sent = [word for word in sent if word.is_alpha and not word.is_stop]

Load the Alexa’s review dataset, and filter out the reviews
which are not associated with the “music” property

In [113]:
import pandas as pd
data = pd.read_csv('contents/amazon_alexa.tsv', delimiter = '\t')

filtered_df = data[data['verified_reviews'].lower().contains("music")]
filtered_df
data

AttributeError: 'Series' object has no attribute 'lower'