### Experimenting with BERTScore

In [10]:
import sys
sys.path.append(r"D:\COURS\A4\S8 - ESILV\Stage\Work\Repositories\bert_score")

from bert_score.score import score as bscore

In [11]:
o = bscore(["I am Marius"], ["My name is marius"], lang="en")

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### Experimenting with Roberta tokenizer

In [1]:
from transformers import RobertaTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
test = tokenizer("I am Marius")

In [3]:
out = tokenizer("I am Marius")

In [20]:
out

{'input_ids': [0, 100, 524, 1127, 6125, 2], 'attention_mask': [1, 1, 1, 1, 1, 1]}

In [21]:
test.input_ids

[0, 100, 524, 1127, 6125, 2]

### Experimenting with AutoModel class

In [4]:
from transformers import AutoModel
from transformers import RobertaTokenizer

In [5]:
model = AutoModel.from_pretrained("roberta-large")
tokenizer = RobertaTokenizer.from_pretrained("roberta-large")

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
sentence = "I am Marius"
tokens = tokenizer(sentence)
token_ids = tokens["input_ids"]
masks = tokens["attention_mask"]

In [8]:
type(masks)

list

In [None]:
out = model(token_ids, attention_mask=masks, output_hidden_states=True)

### BERT Embedding - Discovery

In [34]:
from transformers import BertTokenizer, BertModel
import torch
import numpy as np

model = BertModel.from_pretrained('bert-base-uncased',
                                  output_hidden_states=True)
model.eval()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [15]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
text = "My name is Marius."
marked_text = "[CLS]"+text+"[SEP]"
tokenized_text = tokenizer.tokenize(marked_text)
print(tokenized_text)

['[CLS]', 'my', 'name', 'is', 'marius', '.', '[SEP]']


In [16]:
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
for tup in zip(tokenized_text, indexed_tokens):
    print('{:<12}{:>6,}'.format(tup[0], tup[1]))

[CLS]          101
my           2,026
name         2,171
is           2,003
marius      20,032
.            1,012
[SEP]          102


In [17]:
#mark tokenks as belonging to sentence 1.
segment_ids = [1]*len(tokenized_text)
print(segment_ids)

[1, 1, 1, 1, 1, 1, 1]


In [18]:
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensor = torch.tensor([segment_ids])
print(tokens_tensor)
print(segments_tensor)

tensor([[  101,  2026,  2171,  2003, 20032,  1012,   102]])
tensor([[1, 1, 1, 1, 1, 1, 1]])


In [21]:
with torch.no_grad():
    outputs = model(tokens_tensor, segments_tensor)
outputs.keys()

odict_keys(['last_hidden_state', 'pooler_output', 'hidden_states'])

In [27]:
bert_hidden_states = outputs[-1]
len(bert_hidden_states)

13

In [31]:
token_embeddings = torch.stack(bert_hidden_states, dim=0)
token_embeddings.size()

torch.Size([13, 1, 7, 768])

In [32]:
#here we have 1 sentence so we remove the batch size
token_embeddings = torch.squeeze(token_embeddings, dim=1)
token_embeddings.size()

torch.Size([13, 7, 768])

In [33]:
#swap dim 0 and 1
token_embeddings = token_embeddings.permute(1, 0, 2)
token_embeddings.size()

torch.Size([7, 13, 768])

#### Word Vectors

In [47]:
#concatenate layers
token_vec_cat = []
for token in token_embeddings:
    cat_vec = torch.cat((token[-1], token[-2], token[-3], token[-4]), dim=0)
    token_vec_cat.append(cat_vec)
print("Shape:", len(token_vec_cat),"x", len(token_vec_cat[0]))

Shape: 7 x 3072


In [59]:
for i in range(len(token_vec_cat)):
    if len(token_vec_cat[i] != 3072):
        print(i)
        print(len(token_vec_cat[i]))

0
3072
1
3072
2
3072
3
3072
4
3072
5
3072
6
3072


In [78]:
test = [token.tolist() for token in token_vec_cat]
test = np.array(test)

In [79]:
test.shape

(7, 3072)

In [48]:
#sum layers
token_vec_sum = []
for token in token_embeddings:
    cat_vec = torch.sum(token[-4:], dim=0)
    token_vec_sum.append(cat_vec)
print("Shape:", len(token_vec_sum),"x", len(token_vec_sum[0]))

Shape: 7 x 768


#### Sentence Vectors

In [51]:
token_vecs = bert_hidden_states[-2][0]
sentence_embedding = torch.mean(token_vecs, dim=0)
sentence_embedding.size()

torch.Size([768])

### Vector Visualization

In [82]:
from umap import UMAP
import plotly.express as px

In [80]:
umap3D = UMAP(n_components=3, init='random', random_state=0)
proj3D = umap3D.fit_transform(test)

  warn(


In [85]:
f = px.scatter_3d(proj3D, x=0, y=1, z=2)
f.show()

### BERT Sentence-level Embedding - Paragraphs 

In [6]:
from transformers import BertTokenizer, BertModel
import torch
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
model = BertModel.from_pretrained('bert-base-uncased',
                                  output_hidden_states=True)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Downloading (…)lve/main/config.json: 100%|██████████| 570/570 [00:00<?, ?B/s] 
Downloading pytorch_model.bin: 100%|██████████| 440M/440M [00:11<00:00, 40.0MB/s] 
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification m

In [4]:
corpus = "Wikipedia is a multilingual, free, online encyclopedia written and maintained by a community of volunteers, known as Wikipedians, through open collaboration and using a wiki-based editing system called MediaWiki. Wikipedia is the largest and most-read reference work in history. It is consistently one of the 10 most popular websites ranked by Similarweb and formerly Alexa; as of 2023, Wikipedia was ranked the 5th most popular site in the world according to Semrush. It is hosted by the Wikimedia Foundation, an American non-profit organization funded mainly through donations."
delimiter="."
print(corpus)

Wikipedia is a multilingual, free, online encyclopedia written and maintained by a community of volunteers, known as Wikipedians, through open collaboration and using a wiki-based editing system called MediaWiki. Wikipedia is the largest and most-read reference work in history. It is consistently one of the 10 most popular websites ranked by Similarweb and formerly Alexa; as of 2023, Wikipedia was ranked the 5th most popular site in the world according to Semrush. It is hosted by the Wikimedia Foundation, an American non-profit organization funded mainly through donations.


In [20]:
splited = [sentence+delimiter for sentence in corpus.split(".")]
max_len = max(len(x) for x in splited)

input_ids = []
attention_masks = []
for sentence in splited:
    encoded = tokenizer.encode_plus(sentence, 
                                    add_special_tokens=True,
                                    max_length=max_len+1,
                                    pad_to_max_length=True,
                                    return_attention_mask=True,
                                    return_tensors='pt',
                                    truncation=True)
    input_ids.append(encoded["input_ids"])
    attention_masks.append(encoded["attention_mask"])

inputs_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)

with torch.no_grad():
    output = model(inputs_ids, attention_mask=attention_masks)
hidden_state = output.last_hidden_state
cls_emb = hidden_state[:,0,:]
cls_emb = cls_emb.detach().numpy()
np.shape(cls_emb)



(5, 768)

In [19]:
cls_emb

array([[-0.10181614, -0.04256969, -0.37916115, ..., -0.01706432,
         0.38435265,  0.77578974],
       [-0.15074998,  0.16900486, -0.4508034 , ..., -0.41086987,
         0.6468059 ,  0.44568998],
       [-0.00590458,  0.07766522, -0.09788167, ..., -0.05835519,
         0.6742643 ,  0.6657301 ],
       [-0.46835747,  0.30540103, -0.3483228 , ..., -0.31408456,
         0.18837832,  0.8075387 ],
       [-0.1548306 ,  0.25915366,  0.14536104, ..., -0.2707459 ,
         0.49331143,  0.22958532]], dtype=float32)

In [14]:
len(splited)

5

### BERT Word-level Embedding - Paragraphs

In [1]:
from transformers import BertTokenizer, BertModel
import torch
from umap import UMAP
import plotly.express as px
import numpy as np

model = BertModel.from_pretrained('bert-base-uncased',
                                  output_hidden_states=True)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

corpus = "Wikipedia is a multilingual, free, online encyclopedia written and maintained by a community of volunteers, known as Wikipedians, through open collaboration and using a wiki-based editing system called MediaWiki. Wikipedia is the largest and most-read reference work in history. It is consistently one of the 10 most popular websites ranked by Similarweb and formerly Alexa; as of 2023, Wikipedia was ranked the 5th most popular site in the world according to Semrush. It is hosted by the Wikimedia Foundation, an American non-profit organization funded mainly through donations."
delimiter="."
print(corpus)

  from .autonotebook import tqdm as notebook_tqdm
  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Wikipedia is a multilingual, free, online encyclopedia written and maintained by a community of volunteers, known as Wikipedians, through open collaboration and using a wiki-based editing system called MediaWiki. Wikipedia is the largest and most-read reference work in history. It is consistently one of the 10 most popular websites ranked by Similarweb and formerly Alexa; as of 2023, Wikipedia was ranked the 5th most popular site in the world according to Semrush. It is hosted by the Wikimedia Foundation, an American non-profit organization funded mainly through donations.


In [2]:
splited = [sentence+delimiter for sentence in corpus.split(".")]
max_len = max(len(x) for x in splited)

input_ids = []
attention_masks = []
for sentence in splited:
    encoded = tokenizer.encode_plus(sentence, 
                                    add_special_tokens=True,
                                    max_length=max_len+1,
                                    pad_to_max_length=True,
                                    return_attention_mask=True,
                                    return_tensors='pt',
                                    truncation=True)
    input_ids.append(encoded["input_ids"])
    attention_masks.append(encoded["attention_mask"])

#inputs_ids = torch.Tensor(len(input_ids),1, max_len+1)
#torch.cat(input_ids, out=inputs_ids)
inputs_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)

with torch.no_grad():
    output = model(inputs_ids, attention_mask=attention_masks)



In [3]:
hidden_states = output.hidden_states

In [4]:
token_embeddings = torch.stack(hidden_states, dim=0)
print(token_embeddings.size())
token_embeddings = torch.squeeze(token_embeddings, dim=1)
print(token_embeddings.size())


torch.Size([13, 5, 213, 768])
torch.Size([13, 5, 213, 768])


In [5]:
token_embeddings = token_embeddings.permute(1,2,0,3)
print(token_embeddings.size())

torch.Size([5, 213, 13, 768])


In [6]:
embs = []
for batch in token_embeddings:
    for token in batch:
        emb = torch.cat((token[-1], token[-2], token[-3], token[-4]), dim=0)
        embs.append(emb)

In [7]:
embs[0].size()

torch.Size([3072])

In [25]:
len(embs)

1065

In [10]:
test = [token.tolist() for token in embs]
test = np.array(test)

In [24]:
test.shape

(1065, 3072)

In [20]:
umap3D = UMAP(n_components=3, init='random', random_state=0)
proj3D = umap3D.fit_transform(test).T

In [21]:
labels = tokenizer.convert_ids_to_tokens(tokenizer.encode(corpus))
labels = np.pad(labels)

In [23]:
len(proj3D[0])

1065

In [27]:
data = {"x": proj3D[0],
        "y": proj3D[1],
        "z": proj3D[2], 
        "labels": labels}

In [None]:
f = px.scatter_3d(data, x="x", y="y", z="z", labels="labels")
f.show()

In [12]:
f = px.scatter_3d(proj3D, x=0, y=1, z=2)
f.show()

### BERT Sentence Transformers

In [17]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')

#Our sentences we like to encode
sentences = ['This framework generates embeddings for each input sentence',
    'Sentences are passed as a list of string.',
    'The quick brown fox jumps over the lazy dog.']

#Sentences are encoded by calling model.encode()
embeddings = model.encode(sentences)

#Print the embeddings
for sentence, embedding in zip(sentences, embeddings):
    print("Sentence:", sentence)
    print("Embedding:", embedding)
    print("")

Downloading (…)e9125/.gitattributes: 100%|██████████| 1.18k/1.18k [00:00<?, ?B/s]
Downloading (…)_Pooling/config.json: 100%|██████████| 190/190 [00:00<?, ?B/s] 
Downloading (…)7e55de9125/README.md: 100%|██████████| 10.6k/10.6k [00:00<?, ?B/s]
Downloading (…)55de9125/config.json: 100%|██████████| 612/612 [00:00<?, ?B/s] 
Downloading (…)ce_transformers.json: 100%|██████████| 116/116 [00:00<?, ?B/s] 
Downloading (…)125/data_config.json: 100%|██████████| 39.3k/39.3k [00:00<00:00, 485kB/s]
Downloading pytorch_model.bin: 100%|██████████| 90.9M/90.9M [00:02<00:00, 44.5MB/s]
Downloading (…)nce_bert_config.json: 100%|██████████| 53.0/53.0 [00:00<?, ?B/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 112/112 [00:00<?, ?B/s] 
Downloading (…)e9125/tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 2.92MB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 350/350 [00:00<?, ?B/s] 
Downloading (…)9125/train_script.py: 100%|██████████| 13.2k/13.2k [00:00<?, ?B/s]
Downloading (…)7e5

Sentence: This framework generates embeddings for each input sentence
Embedding: [-1.37173552e-02 -4.28515412e-02 -1.56286098e-02  1.40537675e-02
  3.95537615e-02  1.21796317e-01  2.94333659e-02 -3.17524336e-02
  3.54959927e-02 -7.93139786e-02  1.75878350e-02 -4.04370092e-02
  4.97259349e-02  2.54912134e-02 -7.18700662e-02  8.14968720e-02
  1.47072482e-03  4.79627065e-02 -4.50336114e-02 -9.92174670e-02
 -2.81769820e-02  6.45046234e-02  4.44670580e-02 -4.76217046e-02
 -3.52952480e-02  4.38671671e-02 -5.28566055e-02  4.33045556e-04
  1.01921499e-01  1.64072420e-02  3.26996632e-02 -3.45986634e-02
  1.21339280e-02  7.94871151e-02  4.58342070e-03  1.57778412e-02
 -9.68210399e-03  2.87625976e-02 -5.05806208e-02 -1.55793503e-02
 -2.87907012e-02 -9.62282438e-03  3.15556787e-02  2.27348972e-02
  8.71449485e-02 -3.85027118e-02 -8.84718373e-02 -8.75501521e-03
 -2.12343540e-02  2.08923593e-02 -9.02077407e-02 -5.25732078e-02
 -1.05638644e-02  2.88310759e-02 -1.61455162e-02  6.17835438e-03
 -1.23234

In [19]:
embedding.shape

(384,)

### Embedding a corpus

In [86]:
ex = "Wikipedia is a multilingual, free, online encyclopedia written and maintained by a community of volunteers, known as Wikipedians, through open collaboration and using a wiki-based editing system called MediaWiki. Wikipedia is the largest and most-read reference work in history. It is consistently one of the 10 most popular websites ranked by Similarweb and formerly Alexa; as of 2023, Wikipedia was ranked the 5th most popular site in the world according to Semrush. It is hosted by the Wikimedia Foundation, an American non-profit organization funded mainly through donations."
print(ex)

Wikipedia is a multilingual, free, online encyclopedia written and maintained by a community of volunteers, known as Wikipedians, through open collaboration and using a wiki-based editing system called MediaWiki. Wikipedia is the largest and most-read reference work in history. It is consistently one of the 10 most popular websites ranked by Similarweb and formerly Alexa; as of 2023, Wikipedia was ranked the 5th most popular site in the world according to Semrush. It is hosted by the Wikimedia Foundation, an American non-profit organization funded mainly through donations.


In [111]:
def tokenizeCorpus(corpus, model=BertModel.from_pretrained('bert-base-uncased'), tokenizer = BertTokenizer.from_pretrained('bert-base-uncased'), delimiter="."):
    splited = [sentence+delimiter for sentence in corpus.split(".")]
    SEPmarked = [sentence+" [SEP]" for sentence in splited[:-1]]
    marked_text = "[CLS] "+"".join(SEPmarked)[:-6]
    tokenized_text = tokenizer.tokenize(marked_text)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    segment_ids = []
    i = 0
    for token in tokenized_text:
        segment_ids.append(i)
        if token == "[SEP]":
            i += 1
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensor = torch.tensor([segment_ids])
    with torch.no_grad():
        outputs = model(tokens_tensor, segments_tensor)
    hidden_states = outputs[-1]
    
    
    return outputs

o = tokenizeCorpus(ex)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [112]:
o

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan],
         ...,
         [nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan]]]), pooler_output=tensor([[nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
         nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
         nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
         nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
         nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,