<a href="https://colab.research.google.com/github/Mrigakshi24-ux/Training-Sheet/blob/main/GloVe.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install glove-python-binary



In [2]:
import os
import urllib.request
import matplotlib.pyplot as plt
from scipy import spatial
import numpy as np
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from glove import Corpus, Glove

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# Download the pre-trained GloVe vectors hosted at this link. The file is large, so the
# download takes a few minutes; it is skipped when the archive is already present so that
# re-running the notebook does not fetch it again.
# NOTE(review): the archive is saved relative to the working directory, while the unzip
# cell reads /content/glove.6B.zip — this assumes the notebook runs from /content (Colab).
GLOVE_URL = 'https://nlp.stanford.edu/data/glove.6B.zip'
GLOVE_ZIP = 'glove.6B.zip'
if not os.path.exists(GLOVE_ZIP):
  # Fetch under a temporary name first so an interrupted download is never
  # mistaken for a complete archive on the next run.
  urllib.request.urlretrieve(GLOVE_URL, GLOVE_ZIP + '.part')
  os.replace(GLOVE_ZIP + '.part', GLOVE_ZIP)

('glove.6B.zip', <http.client.HTTPMessage at 0x7f8419820490>)

In [4]:
# zip file, unzip it by providing source and destination for the file. Here 50, 100 means dimensions.
!unzip "/content/glove.6B.zip" -d "/content/"

Archive:  /content/glove.6B.zip
replace /content/glove.6B.50d.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: /content/glove.6B.50d.txt  
replace /content/glove.6B.100d.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: /content/glove.6B.100d.txt  y

replace /content/glove.6B.200d.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename:   inflating: /content/glove.6B.200d.txt  y

replace /content/glove.6B.300d.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename:   inflating: /content/glove.6B.300d.txt  y



In [5]:
# Build an embedding dictionary (word -> vector) from any one of the files extracted above.
embed_dict = {}
# State the encoding explicitly instead of relying on the platform default, which is not
# UTF-8 everywhere and would fail or mis-decode on non-ASCII tokens.
with open('/content/glove.6B.200d.txt', 'r', encoding='utf-8') as f:
  for line in f:
    # each line holds the word followed by its vector components, separated by whitespace
    values = line.split()
    if not values:
      # tolerate a blank line (e.g. at the end of the file) instead of raising IndexError
      continue
    word = values[0]
    embed_dict[word] = np.asarray(values[1:], 'float32')

In [6]:
# To find similar words we use the Euclidean distance: sort the keys (words) in ascending
# order (least distance first) of the distance between each word vector and the given vector.
def find_similar(word_vector, embed=None):
  """Return every word in `embed`, ordered from nearest to farthest from `word_vector`.

  Args:
    word_vector: 1-D vector to compare against; it must have the same length as
      the vectors stored in `embed`.
    embed: mapping of word -> vector. When omitted, the notebook-level `embed_dict`
      is used. It is looked up at call time rather than at definition time, so
      re-running the cell that builds `embed_dict` takes effect without having to
      re-run this cell as well.

  Returns:
    A list with all keys of `embed`, sorted by ascending Euclidean distance.
  """
  if embed is None:
    embed = embed_dict

  def distance_to_target(word):
    return spatial.distance.euclidean(embed[word], word_vector)

  return sorted(embed.keys(), key=distance_to_target)

In [7]:
def howSimilar(word1, word2, embed=None):
  """Return the cosine similarity between the vectors of `word1` and `word2`.

  Args:
    word1: first word; must be a key of `embed`.
    word2: second word; must be a key of `embed`.
    embed: mapping of word -> vector. When omitted, the notebook-level `embed_dict`
      is used. It is looked up at call time rather than at definition time, so
      re-running the cell that builds `embed_dict` takes effect without having to
      re-run this cell as well.

  Returns:
    The single similarity value taken from the 1x1 matrix that
    `cosine_similarity` returns for the two vectors.
  """
  if embed is None:
    embed = embed_dict
  # cosine_similarity works on 2-D inputs, so each vector is wrapped as a one-row matrix
  first_row = [list(embed[word1])]
  second_row = [list(embed[word2])]
  return cosine_similarity(first_row, second_row)[0][0]

In [8]:
# Show the 10 nearest neighbours of 'river'. The slice starts at 1 to skip the first
# entry, which is expected to be 'river' itself (distance 0 to its own vector).
find_similar(embed_dict['river'])[1:11]

['rivers',
 'tributary',
 'confluence',
 'creek',
 'along',
 'tributaries',
 'valley',
 'flows',
 'danube',
 'upstream']

In [9]:
# NOTE(review): this compares the singular 'man' with the plural 'women' — confirm that
# 'woman' (or 'men') was not the intended word.
howSimilar('man', 'women')

0.40427884

In [10]:
# cosine similarity between the vectors for 'girl' and 'women'
howSimilar('girl', 'women')

0.47667465

## On a text

In [11]:
# Toy input: four short sentences that are preprocessed and used to fit a small GloVe model.
lines =  ["Hello this is a tutorial to convert word to integer",
         "It is a beautiful day",
         "Jack is going to office",
         "I want to go Manali, it's very pretty"]

In [12]:
# preprocess: lowercase -> tokenize -> drop stopwords -> strip symbols -> lemmatize
# The result `doc` is a list with one list of tokens per input line.

# Load the English stopword list once, as a set, instead of re-reading it for every token.
stop_words = set(nltk.corpus.stopwords.words('english'))

# symbols to delete from every token
symb = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
strip_symbols = str.maketrans('', '', symb)

wordnet_lemmatizer = WordNetLemmatizer()

doc = []
for line in lines:
  # lower + tokenize on whitespace
  tokens = line.lower().split()
  # stopwords: matched before the symbols are stripped, so tokens are compared as written
  tokens = [tok for tok in tokens if tok not in stop_words]
  # symbols: delete them, then discard tokens that consisted of symbols only
  tokens = [tok.translate(strip_symbols) for tok in tokens]
  tokens = [tok for tok in tokens if tok]
  # lemmatize: map (not filter) each token so that the lemma itself is what gets stored.
  # NOTE(review): pos='a' lemmatizes every token as an adjective — confirm this is intended.
  doc.append([wordnet_lemmatizer.lemmatize(tok, pos='a') for tok in tokens])

doc

[['hello', 'tutorial', 'convert', 'word', 'integer'],
 ['beautiful', 'day'],
 ['jack', 'going', 'office'],
 ['want', 'go', 'manali', 'pretty']]

In [13]:
# create a Corpus object
corpus = Corpus() 
# fit the corpus on the tokenised sentences to build the co-occurrence matrix that GloVe is trained on
corpus.fit(doc, window=10) # window is the number of surrounding words taken as context, here 10; adjust it to the requirement

In [14]:
# Create a Glove object; it is fitted on the co-occurrence matrix built above to produce the embeddings.
# The learning rate (training uses gradient descent) and the number of components can be set here.
glove = Glove(no_components=5, learning_rate=0.05) # no_components determines the number of dimensions of each output vector

In [15]:
# train for 30 epochs on 4 threads, printing progress for each epoch
glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)  # first argument is the co-occurrence matrix

# attach the corpus dictionary to the glove object so that vectors can be looked up by word
glove.add_dictionary(corpus.dictionary)
# save the trained model to the file 'glove.model'
glove.save('glove.model')

Performing 30 training epochs with 4 threads
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19
Epoch 20
Epoch 21
Epoch 22
Epoch 23
Epoch 24
Epoch 25
Epoch 26
Epoch 27
Epoch 28
Epoch 29


In [19]:
# look up the index of 'manali' in the dictionary and use it to fetch its learned vector
# (5 values, matching no_components=5)
glove.word_vectors[glove.dictionary['manali']]

array([ 0.07884263, -0.09927628, -0.02481225, -0.05884735,  0.02997702])