# Text Embeddings

 - Text does not have an inherent semantic distance between words (character-based measures such as the Hamming distance do not capture meaning).
 - Given the text: "This is the first document" and "Is this the first document?", how similar are these? 
 - Transform text (Graph) into vectors!

## Bag-Of-Words 
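 - Idea: Create a vocabulary array of size x, containing every distinct word in the corpus
 - For each of the y documents: count how often each vocabulary word occurs
 - Result: a y×x matrix (one row per document, one column per vocabulary word) containing the embeddings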

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

### Our Documents

In [5]:
# Toy corpus of four short documents. The first and the last document use the
# same words and differ only in word order, capitalisation and punctuation.
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]

### Vectorizer counts occurrences in the corpus

In [6]:
# Learn the vocabulary from the corpus and, in the same pass, count how often
# each vocabulary word occurs in every document. The result is a sparse
# document-term matrix (it is densified with `.toarray()` for display later).
bow_vectoricer: CountVectorizer = CountVectorizer()
bow_embedding = bow_vectoricer.fit_transform(corpus)

#### Returns the Feature names

In [7]:
# Vocabulary learned during fitting; entry i names column i of the count matrix.
bow_vectoricer.get_feature_names_out()

array(['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third',
       'this'], dtype=object)

In [8]:
# Convert the sparse count matrix to a dense array for display
# (rows = documents, columns = vocabulary words).
bow_embedding.toarray()

array([[0, 1, 1, 1, 0, 0, 1, 0, 1],
       [0, 2, 0, 1, 0, 1, 1, 0, 1],
       [1, 0, 0, 1, 1, 0, 1, 1, 1],
       [0, 1, 1, 1, 0, 0, 1, 0, 1]])

## TF-IDF

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [10]:
# Same workflow as the bag-of-words cell, but the raw counts are re-weighted
# with TF-IDF: words that appear in many documents ('is', 'the', 'this')
# receive lower weights than words that are specific to a few documents.
tfidf_vectorizer: TfidfVectorizer = TfidfVectorizer()
tfidf_embeddings = tfidf_vectorizer.fit_transform(corpus)

In [11]:
# Vocabulary learned during fitting; entry i names column i of the TF-IDF matrix.
tfidf_vectorizer.get_feature_names_out()

array(['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third',
       'this'], dtype=object)

In [12]:
# Convert the sparse TF-IDF matrix to a dense array for display
# (rows = documents, columns = vocabulary words).
tfidf_embeddings.toarray()

array([[0.        , 0.46979139, 0.58028582, 0.38408524, 0.        ,
        0.        , 0.38408524, 0.        , 0.38408524],
       [0.        , 0.6876236 , 0.        , 0.28108867, 0.        ,
        0.53864762, 0.28108867, 0.        , 0.28108867],
       [0.51184851, 0.        , 0.        , 0.26710379, 0.51184851,
        0.        , 0.26710379, 0.51184851, 0.26710379],
       [0.        , 0.46979139, 0.58028582, 0.38408524, 0.        ,
        0.        , 0.38408524, 0.        , 0.38408524]])

## Word2Vec

 - Get the co-occurrence of words in the document.
 - The higher the co-occurrence between two words, the more similar their vectors should be.

In [13]:
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

### Tokenize
 - We want the embeddings of the words
 - Thus we need to split the sentences into the words

In [14]:
# Split every document on single spaces to obtain one token list per document.
# NOTE(review): unlike the vectorizers used earlier, this keeps capitalisation
# and punctuation, so 'document', 'document.' and 'document?' are treated as
# three different words by Word2Vec.
tokens = [document.split(" ") for document in corpus]

In [15]:
# Inspect the token lists produced by the whitespace split.
tokens

[['This', 'is', 'the', 'first', 'document.'],
 ['This', 'document', 'is', 'the', 'second', 'document.'],
 ['And', 'this', 'is', 'the', 'third', 'one.'],
 ['Is', 'this', 'the', 'first', 'document?']]

In [16]:
# Train a small Word2Vec model on the tokenised corpus.
word2vec = Word2Vec(
    sentences=tokens,
    vector_size=9,  # dimensionality of each word vector
    window=2,       # context: up to two words on either side of the target
    min_count=1,    # keep every word, even those that occur only once
    sg=1,           # 1 selects skip-gram (0 would select CBOW)
)

In [17]:
# The trained word vectors are exposed through the model's `wv` attribute.
w2v_embedding: KeyedVectors = word2vec.wv

In [18]:
# Distinct tokens across all documents.
vocab = {word for line in tokens for word in line}

In [19]:
# Print the learned vector for every vocabulary word the model knows.
# `vocab` is a set, so the order of the listing is not guaranteed.
for word in vocab:
    if word not in w2v_embedding:
        continue
    print(f"{word}: {w2v_embedding[word]}")

third: [ 0.03140464  0.0599922   0.07838441 -0.06337231  0.02067937  0.06765798
 -0.05332528 -0.03455647  0.07553378]
This: [-0.0032909  -0.08512489  0.10683048  0.0553562   0.10259048 -0.09064353
  0.04995332 -0.04596751  0.00916151]
document: [-0.03155611 -0.06859469 -0.00455803 -0.09298832 -0.06222236  0.07893932
  0.03725044  0.08028522  0.0755583 ]
second: [ 0.07102815 -0.09577431  0.04073042  0.05766537  0.06379931  0.08296576
 -0.06852973  0.0122846   0.06719203]
one.: [-0.01615424 -0.10232265  0.04856641  0.00635378  0.08269591 -0.00903735
 -0.02931856 -0.09726511 -0.00951822]
Is: [-0.08536667 -0.01677566  0.02745128 -0.00986536  0.06150129 -0.03047973
  0.02510632  0.06060106  0.09273084]
the: [-0.00595808  0.00262702  0.05670388  0.10010304 -0.10336611 -0.07907566
  0.07176525  0.09969987 -0.05572698]
document?: [ 0.09443673 -0.04957192  0.05022183 -0.07542852 -0.03941385  0.10444671
 -0.01754775  0.00355158 -0.04598591]
And: [ 0.01812751  0.00211019  0.03859597  0.00241975  

## BERT Embeddings

In [20]:
from transformers import BertModel, BertTokenizer
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [21]:
# Load the tokenizer and the model weights from the same pretrained checkpoint
# so that the token ids produced by the tokenizer match the model's vocabulary.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")

In [22]:
# Encode the whole corpus as one batch of PyTorch tensors; shorter sentences
# are padded (id 0, attention_mask 0) to the length of the longest one.
# NOTE(review): `input` shadows the Python builtin of the same name; consider
# renaming it together with its uses in the following cells.
input = tokenizer(corpus, return_tensors="pt", padding=True, truncation=True)


In [23]:
# Inspect the encoded batch: input_ids, token_type_ids and attention_mask.
input

{'input_ids': tensor([[ 101, 2023, 2003, 1996, 2034, 6254, 1012,  102,    0],
        [ 101, 2023, 6254, 2003, 1996, 2117, 6254, 1012,  102],
        [ 101, 1998, 2023, 2003, 1996, 2353, 2028, 1012,  102],
        [ 101, 2003, 2023, 1996, 2034, 6254, 1029,  102,    0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 0]])}

In [24]:
# Run the forward pass without gradient tracking (inference only).
with torch.no_grad():
    outputs = model(**input)
# Hidden states of the last layer: one contextual vector per token position,
# indexed as [sentence, token].
word_embeddings = outputs.last_hidden_state

#### Embedding of the first token in the first sentence (the special `[CLS]` token, id 101):

In [25]:
# Vector for sentence 0, token position 0 (the position holding id 101 in
# `input_ids`).
print(word_embeddings[0, 0])

tensor([-1.7925e-01,  2.4680e-01,  2.5877e-01, -1.9693e-01, -2.1139e-01,
        -5.1831e-01,  5.0253e-02,  2.8839e-01,  1.1750e-02, -2.5680e-01,
        -1.9406e-01, -1.9011e-01,  1.1257e-01,  2.8728e-01,  1.0411e-01,
        -4.6118e-02, -2.3991e-01,  4.7052e-01,  2.7609e-01, -3.5730e-01,
        -2.3314e-01, -4.6894e-01, -6.8953e-02, -2.2837e-01, -2.8003e-01,
        -5.4488e-02, -4.0301e-02,  7.6617e-02, -2.5184e-01,  4.3663e-02,
         1.3813e-01,  2.9083e-02, -6.6083e-02,  3.3753e-01,  2.7510e-01,
         1.3768e-01,  1.2088e-01, -7.7740e-02,  2.3999e-01, -3.1664e-02,
         1.0138e-01,  8.5622e-02,  6.9635e-02, -2.0513e-01,  1.4577e-01,
        -4.3383e-01, -2.5102e+00, -1.3555e-01, -6.0130e-02, -8.5688e-02,
         6.8202e-02, -3.6489e-01,  6.4345e-02,  2.8158e-01, -9.2559e-02,
         2.7825e-01, -4.3326e-01,  5.4751e-01, -8.6024e-02,  2.6318e-01,
         1.5872e-01, -1.7574e-01,  1.9737e-01,  1.1068e-02,  3.8963e-02,
         8.8484e-02, -1.6272e-01,  1.3865e-02, -3.9