*20 Aug 2024 : 21BAI1133 - Mukundh J*
#  Speech and Natural Language Processing Lab 5
Task:
- Generate two embeddings: Glove or FastText or Word2Vec (takes time to download, so start with this first and let it download)
- Implement TF-IDF from scratch.
- Use any corpus that has at least 3000 words.
- Implement cosine similarity from scratch.
- Compute the cosine similarity between 20 random words, using all three vector representations (three 20x20 matrices).
- Comment on the sensibility of the similarities obtained.

In [4]:
import nltk
from nltk.corpus import brown
from gensim.models import Word2Vec

nltk.download('brown')


[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!


In [12]:
sentences = brown.sents()[:2000]
word2vec_model = Word2Vec(sentences, vector_size=100, window=5, min_count=5, workers=4)

word2vec_vectors = word2vec_model.wv
word2vec_vocab = list(word2vec_model.wv.index_to_key[:4000])

In [13]:
word_embeddings = {word: word2vec_model.wv[word] for word in word2vec_model.wv.key_to_index}
for i in word_embeddings:
  print(i,':\t',word_embeddings[i])

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
 -0.05163444  0.02906603 -0.0610399  -0.00177741 -0.03197897  0.0281748
  0.08831917 -0.03480843  0.01882923 -0.08737799 -0.02295144  0.08406685
  0.01767881 -0.01545256 -0.053309   -0.09392947  0.02166777 -0.06857042
 -0.07456837  0.02874168  0.08426727 -0.06747528  0.01391614 -0.03728731
  0.05528255  0.04609409  0.0023871  -0.04546248 -0.01842818  0.03228804
 -0.03614369  0.02172393  0.04808633 -0.04479899 -0.12426116  0.03612822
  0.01966873 -0.0668727  -0.01946519 -0.01394715 -0.06433826  0.09072042
  0.05429938  0.0463459  -0.12219993  0.12460478 -0.01767469  0.05310214
  0.07765324  0.03895896  0.05664102 -0.0179764   0.00206557  0.00726513
 -0.04073614 -0.0284207  -0.0932759   0.00542157 -0.05470674  0.0687637
 -0.04016982 -0.03329886  0.02273819  0.03047752  0.05193026  0.00043676
  0.13461009  0.03187401 -0.01660633 -0.02700603  0.10624574  0.00886435
  0.04496505 -0.02395542  0.01101849 -0.01044621]
measure :	 

In [14]:
from gensim.models import FastText
fasttext_model = FastText(sentences, vector_size=100, window=5, min_count=5, workers=4)

fasttext_vectors = fasttext_model.wv


In [15]:
word_embeddings = {word: fasttext_model.wv[word] for word in fasttext_model.wv.key_to_index}
for i in word_embeddings:
  print(i,':\t',word_embeddings[i])

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  0.07764119 -0.17387056  0.08127338 -0.01515866  0.10287063  0.02267808
  0.02252039  0.16847451  0.05121491 -0.10975346 -0.12088421  0.00425414
 -0.16725059 -0.13225472  0.162627    0.09278134 -0.08598199  0.1382461
  0.03427542 -0.04428457  0.07414449 -0.02406525  0.01382068  0.11244664
  0.04810144 -0.16804975  0.00445568  0.0522668 ]
serious :	 [-0.10535105  0.343433    0.0256425  -0.20081368  0.01256473  0.12150927
  0.01059566  0.15822086  0.19973822 -0.05157679  0.21246819  0.04194765
 -0.15770742  0.26374534 -0.36965558 -0.22212398 -0.12354165 -0.08883298
 -0.23934853 -0.06279643 -0.26100966  0.20704994  0.05823909 -0.10227815
  0.08272173 -0.31794107 -0.22041966 -0.0501173  -0.08049621  0.02715359
 -0.03161113 -0.01610946  0.5088707  -0.24541838 -0.03126277  0.2574427
  0.111652    0.24887711 -0.16354148 -0.1979829   0.24723086 -0.04941772
  0.08245458 -0.16784836 -0.13846524 -0.21630397  0.01932389  0.08877274


In [17]:
fasttext_vocab = list(fasttext_model.wv.index_to_key[:4000])

# TF-IDF

In [20]:
from collections import defaultdict
import numpy as np

def compute_tfidf(sentences, vocab):
    tf = defaultdict(float)
    for sentence in sentences:
        for word in sentence:
            if word in vocab:
                tf[word] += 1
    total_words = sum(tf.values())
    for word in tf:
        tf[word] /= total_words

    idf = defaultdict(float)
    num_docs = len(sentences)
    for word in vocab:
        idf[word] = sum(1 for sentence in sentences if word in sentence)
    for word in idf:
        idf[word] = np.log(num_docs / (1 + idf[word]))

    tfidf = {}
    for word in vocab:
        tfidf[word] = tf[word] * idf[word]

    return tfidf



In [21]:
word2vec_tfidf = compute_tfidf(sentences, word2vec_vocab)
fasttext_tfidf = compute_tfidf(sentences, fasttext_vocab)

In [22]:
print("Top 10 TF-IDF words using Word2Vec vocab:")
print(sorted(word2vec_tfidf.items(), key=lambda item: item[1], reverse=True)[:10])

Top 10 TF-IDF words using Word2Vec vocab:
[(',', 0.042164444239523924), ('the', 0.033372938926164934), ('of', 0.03186350178418242), ('to', 0.029464515059622717), ('and', 0.029312888661402917), ('a', 0.02866682240958401), ('in', 0.027564693265182953), ('for', 0.02231770028435063), ('``', 0.019484739134582065), ('that', 0.019228254709188222)]


In [23]:
print("\nTop 10 TF-IDF words using FastText vocab:")
print(sorted(fasttext_tfidf.items(), key=lambda item: item[1], reverse=True)[:10])


Top 10 TF-IDF words using FastText vocab:
[(',', 0.042164444239523924), ('the', 0.033372938926164934), ('of', 0.03186350178418242), ('to', 0.029464515059622717), ('and', 0.029312888661402917), ('a', 0.02866682240958401), ('in', 0.027564693265182953), ('for', 0.02231770028435063), ('``', 0.019484739134582065), ('that', 0.019228254709188222)]


# Cosine Similarity

In [24]:
def cosine_similarity(vec1, vec2):
    dot_product = np.dot(vec1, vec2)
    norm_a = np.linalg.norm(vec1)
    norm_b = np.linalg.norm(vec2)
    return dot_product / (norm_a * norm_b)

In [25]:
import random
random_pairs = random.sample(list(word2vec_vocab), 10)

In [26]:
print("\nCosine Similarities (Word2Vec):")
for word1 in random_pairs:
    for word2 in random_pairs:
        if word1 != word2:
            sim = cosine_similarity(word2vec_model.wv[word1], word2vec_model.wv[word2])
            print(f"Similarity between {word1} and {word2}: {sim:.4f}")


Cosine Similarities (Word2Vec):
Similarity between Hughes and Nations: 0.9871
Similarity between Hughes and boys: 0.9947
Similarity between Hughes and services: 0.9929
Similarity between Hughes and designed: 0.9950
Similarity between Hughes and will: 0.9988
Similarity between Hughes and Thrift: 0.9929
Similarity between Hughes and Hartweger: 0.9916
Similarity between Hughes and Central: 0.9954
Similarity between Hughes and opposition: 0.9914
Similarity between Nations and Hughes: 0.9871
Similarity between Nations and boys: 0.9842
Similarity between Nations and services: 0.9820
Similarity between Nations and designed: 0.9842
Similarity between Nations and will: 0.9875
Similarity between Nations and Thrift: 0.9802
Similarity between Nations and Hartweger: 0.9810
Similarity between Nations and Central: 0.9823
Similarity between Nations and opposition: 0.9795
Similarity between boys and Hughes: 0.9947
Similarity between boys and Nations: 0.9842
Similarity between boys and services: 0.9872

In [27]:
print("\nCosine Similarities (FastText):")
for word1 in random_pairs:
    for word2 in random_pairs:
        if word1 != word2:
            sim = cosine_similarity(fasttext_model.wv[word1], fasttext_model.wv[word2])
            print(f"Similarity between {word1} and {word2}: {sim:.4f}")



Cosine Similarities (FastText):
Similarity between Hughes and Nations: 1.0000
Similarity between Hughes and boys: 0.9999
Similarity between Hughes and services: 1.0000
Similarity between Hughes and designed: 1.0000
Similarity between Hughes and will: 1.0000
Similarity between Hughes and Thrift: 0.9998
Similarity between Hughes and Hartweger: 0.9999
Similarity between Hughes and Central: 1.0000
Similarity between Hughes and opposition: 1.0000
Similarity between Nations and Hughes: 1.0000
Similarity between Nations and boys: 0.9999
Similarity between Nations and services: 1.0000
Similarity between Nations and designed: 1.0000
Similarity between Nations and will: 1.0000
Similarity between Nations and Thrift: 0.9999
Similarity between Nations and Hartweger: 1.0000
Similarity between Nations and Central: 1.0000
Similarity between Nations and opposition: 1.0000
Similarity between boys and Hughes: 0.9999
Similarity between boys and Nations: 0.9999
Similarity between boys and services: 0.9999

In [28]:
random_words = random.sample(word2vec_vocab, 20)

cos_sim_word2vec = np.zeros((20, 20))
cos_sim_fasttext = np.zeros((20, 20))
cos_sim_tfidf = np.zeros((20, 20))



In [29]:
for i in range(20):
    for j in range(20):
        word1 = random_words[i]
        word2 = random_words[j]

        # Word2Vec Cosine Similarity
        vec1_w2v = word2vec_model.wv[word1]
        vec2_w2v = word2vec_model.wv[word2]
        cos_sim_word2vec[i, j] = cosine_similarity(vec1_w2v, vec2_w2v)

        # FastText Cosine Similarity
        vec1_ft = fasttext_model.wv[word1]
        vec2_ft = fasttext_model.wv[word2]
        cos_sim_fasttext[i, j] = cosine_similarity(vec1_ft, vec2_ft)

        # TF-IDF Cosine Similarity
        vec1_tfidf = word2vec_tfidf[word1]
        vec2_tfidf = word2vec_tfidf[word2]
        cos_sim_tfidf[i, j] = cosine_similarity([vec1_tfidf], [vec2_tfidf])

The sensibility of the similarities obtained depends on the specific words and the corpus used. In general, Word2Vec and FastText tend to capture semantic relationships between words better than TF-IDF.
This is because they learn representations that consider the context in which words appear.

Word2Vec and FastText:
- These models capture semantic relationships like synonymy, antonymy, and analogy.
- They are better at capturing subtle differences in meaning between words.
- They are useful for tasks like word similarity, sentiment analysis, and machine translation.

TF-IDF:
- It focuses on the importance of words within a document or corpus.
- It can be useful for tasks like information retrieval and document classification.
- It might not capture the full semantic meaning of words.



In [30]:
print("Cosine Similarity Matrix - Word2Vec")
print(cos_sim_word2vec)

Cosine Similarity Matrix - Word2Vec
[[1.         0.99262643 0.99480963 0.99024498 0.99333543 0.9892298
  0.99443299 0.99475735 0.98666859 0.92640024 0.98328489 0.99470413
  0.99333835 0.98147374 0.99061656 0.98715127 0.99499756 0.97178584
  0.98751533 0.97465158]
 [0.99262643 1.         0.99628896 0.99051386 0.99497116 0.9915154
  0.9957478  0.99634165 0.98850822 0.92633748 0.98240131 0.99652982
  0.99403709 0.9839536  0.99369347 0.99009562 0.99665993 0.97386867
  0.99005067 0.97873133]
 [0.99480963 0.99628896 1.00000012 0.9945268  0.99761748 0.9940654
  0.99852985 0.99905163 0.99042672 0.92550659 0.98605585 0.99901032
  0.9971472  0.98622924 0.99662691 0.99209446 0.99937624 0.97478151
  0.99185532 0.97963852]
 [0.99024498 0.99051386 0.9945268  1.         0.99313813 0.98798281
  0.99295515 0.99374914 0.98407179 0.9236865  0.97805065 0.99430627
  0.99136972 0.97997051 0.99060273 0.98505151 0.99408424 0.96624154
  0.98583251 0.97269672]
 [0.99333543 0.99497116 0.99761748 0.99313813 0.999

In [31]:
print("\nCosine Similarity Matrix - FastText")
print(cos_sim_fasttext)


Cosine Similarity Matrix - FastText
[[1.00000012 0.99575543 0.99994302 0.99993074 0.9999159  0.99991637
  0.99993974 0.99995208 0.99993283 0.99993527 0.99030954 0.99993896
  0.9999603  0.99995947 0.99993867 0.99993312 0.99996078 0.99993259
  0.99957103 0.99994028]
 [0.99575543 1.         0.99563712 0.99569303 0.99565411 0.99573088
  0.99571556 0.99572307 0.99570829 0.99571753 0.9857105  0.99575144
  0.99569386 0.99571747 0.99572605 0.99554509 0.99569583 0.99564034
  0.99539864 0.99553716]
 [0.99994302 0.99563712 1.00000012 0.99995452 0.99994618 0.99994618
  0.99996024 0.99997234 0.99995881 0.99995714 0.9901346  0.99995178
  0.99996829 0.99998116 0.99995637 0.99994761 0.99997556 0.99995929
  0.99960953 0.99996012]
 [0.99993074 0.99569303 0.99995452 1.         0.99994755 0.99993801
  0.99994749 0.99995816 0.99994308 0.99995226 0.99021882 0.99995023
  0.9999544  0.99996501 0.99995321 0.99993175 0.99996126 0.99993068
  0.9995836  0.99994856]
 [0.9999159  0.99565411 0.99994618 0.99994755 1

In [32]:
print("\nCosine Similarity Matrix - TF-IDF")
print(cos_sim_tfidf)


Cosine Similarity Matrix - TF-IDF
[[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 

In the provided example, the cosine similarity matrices show that Word2Vec and FastText generally produce more sensible similarities.
This is because they capture the semantic relationships between words, leading to higher similarity scores for words that are semantically related.
TF-IDF, on the other hand, might not always reflect the semantic similarity between words, as it primarily focuses on the importance of words within the corpus.

It's important to note that the sensibility of the representations depends on the corpus and the specific words being compared.
For example, if the corpus contains a lot of technical jargon, the models might not perform as well in capturing the semantic relationships between those words.
