### GLOVE

In [1]:
import requests
import zipfile
import os
from tqdm import tqdm
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors
from scipy.spatial.distance import cosine

In [2]:
def download_file(url):
    response = requests.get(url, stream=True)
    total_size = int(response.headers.get('content-length', 0))
    block_size = 1024  # 1 Kibibyte
    t=tqdm(total=total_size, unit='iB', unit_scale=True)
    with open('glove.6B.zip', 'wb') as file:
        for data in response.iter_content(block_size):
            t.update(len(data))
            file.write(data)
    t.close()
    if total_size != 0 and t.n != total_size:
        print("ERROR, something went wrong")

# Check if the file already exists
if not os.path.exists('glove.6B.zip'):
    # URL of the GloVe embeddings
    url = 'http://nlp.stanford.edu/data/glove.6B.zip'

    download_file(url)

    # Create a directory to extract the zip file to
    if not os.path.exists('glove'):
        os.makedirs('glove')

    # Extract the zip file
    with zipfile.ZipFile('glove.6B.zip', 'r') as zip_ref:
        zip_ref.extractall('glove')
else:
    print("File already exists.")

# Check if the conversion is already done
if not os.path.exists('glove/glove.6B.300d.txt.word2vec'):
    print("Converting...")
    # After downloading and extracting, convert the GloVe vectors into word2vec format:
    glove_input_file = 'glove/glove.6B.300d.txt'
    word2vec_output_file = 'glove/glove.6B.300d.txt.word2vec'
    #glove2word2vec(glove_input_file, word2vec_output_file)
    KeyedVectors.load_word2vec_format(glove_input_file, binary=False, no_header=True).save_word2vec_format(word2vec_output_file)

# Now we can load these vectors into a Gensim model:
model = KeyedVectors.load_word2vec_format(word2vec_output_file, binary=False)

File already exists.
Converting...


In [1]:
# And calculate cosine similarity between two words
similarity_score = model.similarity('joy', 'happiness')

print("Similarity", similarity_score)

NameError: name 'model' is not defined

### BERT

In [2]:
from transformers import BertModel, BertTokenizer
import torch
from scipy.spatial.distance import cosine

# Initialize the BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Define a function to get word embeddings
def get_word_embedding(word):
    inputs = tokenizer(word, return_tensors='pt')
    outputs = model(**inputs)
    return outputs.last_hidden_state[0, 1, :].detach().numpy()



Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Similarity between joy and happiness:  0.8387427926063538


In [16]:
# Get the embeddings for the words
word1_embedding = get_word_embedding('I feel happy')
word2_embedding = get_word_embedding('I feel joy')

# Calculate cosine similarity
similarity = 1 - cosine(word1_embedding, word2_embedding)

print("Similarity between joy and happiness: ", similarity)

Similarity between joy and happiness:  0.7349165678024292
