In [3]:
# imports
import torch
from transformers import AutoTokenizer, AutoModel
import pandas as pd

In [4]:
# Load the preprocessed sentences and keep them as plain
# (Category, Sentence, Length Label) tuples, one per CSV row.
txt = list(
    pd.read_csv('D:\\preprocessed_sentences.csv')
    .loc[:, ['Category', 'Sentence', 'Length Label']]
    .itertuples(index=False, name=None)
)

In [5]:
# word embeddings function
def word_embeddings(model, tokenizer, wrangled_txt):
    """Compute one contextual vector per token for every sentence.

    Each sentence is tokenized and run through ``model`` on its own (batch of
    one). A token's vector is the element-wise sum of its vectors in the last
    four entries of the hidden-state sequence returned by the model.

    The reduction happens right after each forward pass, so only a
    ``(n_tokens, dim)`` tensor is retained per sentence instead of every
    layer's hidden states for the whole input list.

    Args:
        model: Callable invoked as ``model(token_ids, ones)`` with two
            ``(1, n_tokens)`` tensors. The last element of its return value
            must be a sequence of per-layer hidden states, each with a leading
            batch dimension of size 1. NOTE(review): for Hugging Face models
            the second positional argument is presumably the attention mask
            (all ones here) -- confirm against the model's ``forward`` docs.
        tokenizer: Object providing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``.
        wrangled_txt: Iterable of ``(category, sentence, length_label)`` items.
            No special tokens are added here; the sentence string must already
            contain whatever markers the model needs.

    Returns:
        list: ``(category, token_vectors, length_label)`` tuples in input
        order, where ``token_vectors`` is a list of 1-D tensors, one per token.

    Raises:
        ValueError: If the last model output is a single tensor rather than a
            sequence of hidden states.
    """
    embedded = []
    for item in wrangled_txt:
        category, sentence, length_label = item[0], item[1], item[2]
        token_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentence))
        token_tensor = torch.tensor([token_ids])  # add the batch dimension
        ones_tensor = torch.ones_like(token_tensor)
        with torch.no_grad():
            outputs = model(token_tensor, ones_tensor)
        hidden_states = outputs[-1]
        if isinstance(hidden_states, torch.Tensor):
            # Slicing a tensor with [-4:] would silently sum the wrong axis.
            raise ValueError(
                "model did not return a sequence of hidden states; "
                "load it with output_hidden_states=True and return_dict=False"
            )
        # (<=4, 1, n_tokens, dim) -> sum over layers -> drop batch -> (n_tokens, dim)
        last_layers = torch.stack(tuple(hidden_states[-4:]), dim=0)
        summed = last_layers.sum(dim=0).squeeze(dim=0)
        embedded.append((category, list(summed.unbind(dim=0)), length_label))
    return embedded

In [6]:
# sentence embeddings function
def word_embedding_to_sentence_embedding(word_embedding):
    """Average a sentence's token vectors into a single vector.

    The mean is computed with one vectorised sum and divide instead of a
    Python loop that extracts one scalar per dimension.

    Args:
        word_embedding: ``(category, token_vectors, length_label)`` tuple as
            returned by ``word_embeddings``. Only index 1 is read; it must be a
            non-empty sequence of equal-length 1-D tensors.

    Returns:
        list[float]: Element-wise mean over all token vectors, as Python floats.

    Raises:
        IndexError: If there are no token vectors (same exception type the
            previous implementation raised when indexing the first vector).
    """
    token_vectors = word_embedding[1]
    if len(token_vectors) == 0:
        raise IndexError("cannot average an empty list of token vectors")
    stacked = torch.stack(list(token_vectors), dim=0)  # (n_tokens, dim)
    # Sum then divide, so integer inputs are promoted by true division.
    return (stacked.sum(dim=0) / len(token_vectors)).tolist()

# RobBERT-2023

In [8]:
# Load the RobBERT-2023 (large) tokenizer and encoder. The model is configured
# to return plain tuples that include the hidden states.
robbert_tokenizer = AutoTokenizer.from_pretrained("DTAI-KULeuven/robbert-2023-dutch-large")
robbert_model = AutoModel.from_pretrained(
    "DTAI-KULeuven/robbert-2023-dutch-large",
    return_dict=False,
    output_hidden_states=True,
)
# Switch to evaluation mode; as the last expression this also displays the architecture.
robbert_model.eval()

Some weights of RobertaModel were not initialized from the model checkpoint at DTAI-KULeuven/robbert-2023-dutch-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50000, 1024, padding_idx=1)
    (position_embeddings): Embedding(514, 1024, padding_idx=1)
    (token_type_embeddings): Embedding(1, 1024)
    (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0-23): 24 x RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSdpaSelfAttention(
            (query): Linear(in_features=1024, out_features=1024, bias=True)
            (key): Linear(in_features=1024, out_features=1024, bias=True)
            (value): Linear(in_features=1024, out_features=1024, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=1024, out_features=1024, bias=True)
            (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  

In [26]:
# data preprocessing: wrap every sentence in RoBERTa-style boundary tokens.
# The closing '</s>' is included so the input carries both delimiters, in line
# with the BERT-style cells of this notebook that add '[CLS]' and '[SEP]'.
wrangled_txt = [(i[0], '<s> ' + i[1] + ' </s>', i[2]) for i in txt]
len(wrangled_txt)

7935

In [28]:
# RobBERT, sentences 0-999: embed tokens, average per sentence, write CSV.
wrangled_txt_1 = wrangled_txt[:1000]
word_embeddings_robbert_1 = word_embeddings(
    robbert_model, robbert_tokenizer, wrangled_txt_1
)
sentence_embeddings_robbert_1 = []
for word_embedding in word_embeddings_robbert_1:
    sentence_embedding = word_embedding_to_sentence_embedding(word_embedding)
    sentence_embeddings_robbert_1 += [
        (word_embedding[0], sentence_embedding, word_embedding[2])
    ]
df_robbert_1 = pd.DataFrame(
    data=sentence_embeddings_robbert_1,
    columns=['Category', 'Sentence Embedding', 'Length Label'],
)
df_robbert_1.to_csv('D:\\robbert_embeddings_1.csv')

In [29]:
# RobBERT, sentences 1000-1999: embed tokens, average per sentence, write CSV.
wrangled_txt_2 = wrangled_txt[1000:2000]
word_embeddings_robbert_2 = word_embeddings(
    robbert_model, robbert_tokenizer, wrangled_txt_2
)
sentence_embeddings_robbert_2 = []
for word_embedding in word_embeddings_robbert_2:
    sentence_embedding = word_embedding_to_sentence_embedding(word_embedding)
    sentence_embeddings_robbert_2 += [
        (word_embedding[0], sentence_embedding, word_embedding[2])
    ]
df_robbert_2 = pd.DataFrame(
    data=sentence_embeddings_robbert_2,
    columns=['Category', 'Sentence Embedding', 'Length Label'],
)
df_robbert_2.to_csv('D:\\robbert_embeddings_2.csv')

In [30]:
# RobBERT, sentences 2000-2999: embed tokens, average per sentence, write CSV.
wrangled_txt_3 = wrangled_txt[2000:3000]
word_embeddings_robbert_3 = word_embeddings(
    robbert_model, robbert_tokenizer, wrangled_txt_3
)
sentence_embeddings_robbert_3 = []
for word_embedding in word_embeddings_robbert_3:
    sentence_embedding = word_embedding_to_sentence_embedding(word_embedding)
    sentence_embeddings_robbert_3 += [
        (word_embedding[0], sentence_embedding, word_embedding[2])
    ]
df_robbert_3 = pd.DataFrame(
    data=sentence_embeddings_robbert_3,
    columns=['Category', 'Sentence Embedding', 'Length Label'],
)
df_robbert_3.to_csv('D:\\robbert_embeddings_3.csv')

In [31]:
# RobBERT, sentences 3000-3999: embed tokens, average per sentence, write CSV.
wrangled_txt_4 = wrangled_txt[3000:4000]
word_embeddings_robbert_4 = word_embeddings(
    robbert_model, robbert_tokenizer, wrangled_txt_4
)
sentence_embeddings_robbert_4 = []
for word_embedding in word_embeddings_robbert_4:
    sentence_embedding = word_embedding_to_sentence_embedding(word_embedding)
    sentence_embeddings_robbert_4 += [
        (word_embedding[0], sentence_embedding, word_embedding[2])
    ]
df_robbert_4 = pd.DataFrame(
    data=sentence_embeddings_robbert_4,
    columns=['Category', 'Sentence Embedding', 'Length Label'],
)
df_robbert_4.to_csv('D:\\robbert_embeddings_4.csv')

In [32]:
# RobBERT, sentences 4000-4999: embed tokens, average per sentence, write CSV.
wrangled_txt_5 = wrangled_txt[4000:5000]
word_embeddings_robbert_5 = word_embeddings(
    robbert_model, robbert_tokenizer, wrangled_txt_5
)
sentence_embeddings_robbert_5 = []
for word_embedding in word_embeddings_robbert_5:
    sentence_embedding = word_embedding_to_sentence_embedding(word_embedding)
    sentence_embeddings_robbert_5 += [
        (word_embedding[0], sentence_embedding, word_embedding[2])
    ]
df_robbert_5 = pd.DataFrame(
    data=sentence_embeddings_robbert_5,
    columns=['Category', 'Sentence Embedding', 'Length Label'],
)
df_robbert_5.to_csv('D:\\robbert_embeddings_5.csv')

In [33]:
# RobBERT, sentences 5000-5999: embed tokens, average per sentence, write CSV.
wrangled_txt_6 = wrangled_txt[5000:6000]
word_embeddings_robbert_6 = word_embeddings(
    robbert_model, robbert_tokenizer, wrangled_txt_6
)
sentence_embeddings_robbert_6 = []
for word_embedding in word_embeddings_robbert_6:
    sentence_embedding = word_embedding_to_sentence_embedding(word_embedding)
    sentence_embeddings_robbert_6 += [
        (word_embedding[0], sentence_embedding, word_embedding[2])
    ]
df_robbert_6 = pd.DataFrame(
    data=sentence_embeddings_robbert_6,
    columns=['Category', 'Sentence Embedding', 'Length Label'],
)
df_robbert_6.to_csv('D:\\robbert_embeddings_6.csv')

In [34]:
# RobBERT, sentences 6000-6999: embed tokens, average per sentence, write CSV.
wrangled_txt_7 = wrangled_txt[6000:7000]
word_embeddings_robbert_7 = word_embeddings(
    robbert_model, robbert_tokenizer, wrangled_txt_7
)
sentence_embeddings_robbert_7 = []
for word_embedding in word_embeddings_robbert_7:
    sentence_embedding = word_embedding_to_sentence_embedding(word_embedding)
    sentence_embeddings_robbert_7 += [
        (word_embedding[0], sentence_embedding, word_embedding[2])
    ]
df_robbert_7 = pd.DataFrame(
    data=sentence_embeddings_robbert_7,
    columns=['Category', 'Sentence Embedding', 'Length Label'],
)
df_robbert_7.to_csv('D:\\robbert_embeddings_7.csv')

In [35]:
# RobBERT, sentences 7000 onward: embed tokens, average per sentence, write CSV.
wrangled_txt_8 = wrangled_txt[7000:]
word_embeddings_robbert_8 = word_embeddings(
    robbert_model, robbert_tokenizer, wrangled_txt_8
)
sentence_embeddings_robbert_8 = []
for word_embedding in word_embeddings_robbert_8:
    sentence_embedding = word_embedding_to_sentence_embedding(word_embedding)
    sentence_embeddings_robbert_8 += [
        (word_embedding[0], sentence_embedding, word_embedding[2])
    ]
df_robbert_8 = pd.DataFrame(
    data=sentence_embeddings_robbert_8,
    columns=['Category', 'Sentence Embedding', 'Length Label'],
)
df_robbert_8.to_csv('D:\\robbert_embeddings_8.csv')

# BERTje

In [42]:
# Load the BERTje tokenizer and encoder. The model is configured to return
# plain tuples that include the hidden states.
bertje_tokenizer = AutoTokenizer.from_pretrained("GroNLP/bert-base-dutch-cased")
bertje_model = AutoModel.from_pretrained(
    "GroNLP/bert-base-dutch-cased",
    return_dict=False,
    output_hidden_states=True,
)
# Switch to evaluation mode; as the last expression this also displays the architecture.
bertje_model.eval()

Some weights of BertModel were not initialized from the model checkpoint at GroNLP/bert-base-dutch-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30073, 768, padding_idx=3)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [43]:
# preprocessing data: frame every sentence with the '[CLS]' ... '[SEP]' markers
wrangled_txt = [
    (category, '[CLS] ' + sentence + ' [SEP]', length_label)
    for category, sentence, length_label in txt
]

In [44]:
# BERTje, sentences 0-999: embed tokens, average per sentence, write CSV.
wrangled_txt_1 = wrangled_txt[:1000]
word_embeddings_bertje_1 = word_embeddings(
    bertje_model, bertje_tokenizer, wrangled_txt_1
)
sentence_embeddings_bertje_1 = []
for word_embedding in word_embeddings_bertje_1:
    sentence_embedding = word_embedding_to_sentence_embedding(word_embedding)
    sentence_embeddings_bertje_1 += [
        (word_embedding[0], sentence_embedding, word_embedding[2])
    ]
df_bertje_1 = pd.DataFrame(
    data=sentence_embeddings_bertje_1,
    columns=['Category', 'Sentence Embedding', 'Length Label'],
)
df_bertje_1.to_csv('D:\\bertje_embeddings_1.csv')

In [45]:
# BERTje, sentences 1000-1999: embed tokens, average per sentence, write CSV.
wrangled_txt_2 = wrangled_txt[1000:2000]
word_embeddings_bertje_2 = word_embeddings(
    bertje_model, bertje_tokenizer, wrangled_txt_2
)
sentence_embeddings_bertje_2 = []
for word_embedding in word_embeddings_bertje_2:
    sentence_embedding = word_embedding_to_sentence_embedding(word_embedding)
    sentence_embeddings_bertje_2 += [
        (word_embedding[0], sentence_embedding, word_embedding[2])
    ]
df_bertje_2 = pd.DataFrame(
    data=sentence_embeddings_bertje_2,
    columns=['Category', 'Sentence Embedding', 'Length Label'],
)
df_bertje_2.to_csv('D:\\bertje_embeddings_2.csv')

In [46]:
# BERTje, sentences 2000-2999: embed tokens, average per sentence, write CSV.
wrangled_txt_3 = wrangled_txt[2000:3000]
word_embeddings_bertje_3 = word_embeddings(
    bertje_model, bertje_tokenizer, wrangled_txt_3
)
sentence_embeddings_bertje_3 = []
for word_embedding in word_embeddings_bertje_3:
    sentence_embedding = word_embedding_to_sentence_embedding(word_embedding)
    sentence_embeddings_bertje_3 += [
        (word_embedding[0], sentence_embedding, word_embedding[2])
    ]
df_bertje_3 = pd.DataFrame(
    data=sentence_embeddings_bertje_3,
    columns=['Category', 'Sentence Embedding', 'Length Label'],
)
df_bertje_3.to_csv('D:\\bertje_embeddings_3.csv')

In [47]:
# BERTje, sentences 3000-3999: embed tokens, average per sentence, write CSV.
wrangled_txt_4 = wrangled_txt[3000:4000]
word_embeddings_bertje_4 = word_embeddings(
    bertje_model, bertje_tokenizer, wrangled_txt_4
)
sentence_embeddings_bertje_4 = []
for word_embedding in word_embeddings_bertje_4:
    sentence_embedding = word_embedding_to_sentence_embedding(word_embedding)
    sentence_embeddings_bertje_4 += [
        (word_embedding[0], sentence_embedding, word_embedding[2])
    ]
df_bertje_4 = pd.DataFrame(
    data=sentence_embeddings_bertje_4,
    columns=['Category', 'Sentence Embedding', 'Length Label'],
)
df_bertje_4.to_csv('D:\\bertje_embeddings_4.csv')

In [48]:
# BERTje, sentences 4000-4999: embed tokens, average per sentence, write CSV.
wrangled_txt_5 = wrangled_txt[4000:5000]
word_embeddings_bertje_5 = word_embeddings(
    bertje_model, bertje_tokenizer, wrangled_txt_5
)
sentence_embeddings_bertje_5 = []
for word_embedding in word_embeddings_bertje_5:
    sentence_embedding = word_embedding_to_sentence_embedding(word_embedding)
    sentence_embeddings_bertje_5 += [
        (word_embedding[0], sentence_embedding, word_embedding[2])
    ]
df_bertje_5 = pd.DataFrame(
    data=sentence_embeddings_bertje_5,
    columns=['Category', 'Sentence Embedding', 'Length Label'],
)
df_bertje_5.to_csv('D:\\bertje_embeddings_5.csv')

In [49]:
# BERTje, sentences 5000-5999: embed tokens, average per sentence, write CSV.
wrangled_txt_6 = wrangled_txt[5000:6000]
word_embeddings_bertje_6 = word_embeddings(
    bertje_model, bertje_tokenizer, wrangled_txt_6
)
sentence_embeddings_bertje_6 = []
for word_embedding in word_embeddings_bertje_6:
    sentence_embedding = word_embedding_to_sentence_embedding(word_embedding)
    sentence_embeddings_bertje_6 += [
        (word_embedding[0], sentence_embedding, word_embedding[2])
    ]
df_bertje_6 = pd.DataFrame(
    data=sentence_embeddings_bertje_6,
    columns=['Category', 'Sentence Embedding', 'Length Label'],
)
df_bertje_6.to_csv('D:\\bertje_embeddings_6.csv')

In [50]:
# BERTje, sentences 6000-6999: embed tokens, average per sentence, write CSV.
wrangled_txt_7 = wrangled_txt[6000:7000]
word_embeddings_bertje_7 = word_embeddings(
    bertje_model, bertje_tokenizer, wrangled_txt_7
)
sentence_embeddings_bertje_7 = []
for word_embedding in word_embeddings_bertje_7:
    sentence_embedding = word_embedding_to_sentence_embedding(word_embedding)
    sentence_embeddings_bertje_7 += [
        (word_embedding[0], sentence_embedding, word_embedding[2])
    ]
df_bertje_7 = pd.DataFrame(
    data=sentence_embeddings_bertje_7,
    columns=['Category', 'Sentence Embedding', 'Length Label'],
)
df_bertje_7.to_csv('D:\\bertje_embeddings_7.csv')

In [51]:
# BERTje, sentences 7000 onward: embed tokens, average per sentence, write CSV.
wrangled_txt_8 = wrangled_txt[7000:]
word_embeddings_bertje_8 = word_embeddings(
    bertje_model, bertje_tokenizer, wrangled_txt_8
)
sentence_embeddings_bertje_8 = []
for word_embedding in word_embeddings_bertje_8:
    sentence_embedding = word_embedding_to_sentence_embedding(word_embedding)
    sentence_embeddings_bertje_8 += [
        (word_embedding[0], sentence_embedding, word_embedding[2])
    ]
df_bertje_8 = pd.DataFrame(
    data=sentence_embeddings_bertje_8,
    columns=['Category', 'Sentence Embedding', 'Length Label'],
)
df_bertje_8.to_csv('D:\\bertje_embeddings_8.csv')

# EuroBERT

In [66]:
# Load the EuroBERT-210m tokenizer and encoder. The model is configured to
# return plain tuples that include the hidden states.
# NOTE(review): trust_remote_code=True lets code shipped with the checkpoint
# run locally -- only keep it for sources you trust.
eurobert_tokenizer = AutoTokenizer.from_pretrained("EuroBERT/EuroBERT-210m")
eurobert_model = AutoModel.from_pretrained(
    "EuroBERT/EuroBERT-210m",
    return_dict=False,
    output_hidden_states=True,
    trust_remote_code=True,
)
# Switch to evaluation mode; as the last expression this also displays the architecture.
eurobert_model.eval()

EuroBertModel(
  (embed_tokens): Embedding(128256, 768, padding_idx=128001)
  (layers): ModuleList(
    (0-11): 12 x EuroBertDecoderLayer(
      (self_attn): EuroBertAttention(
        (q_proj): Linear(in_features=768, out_features=768, bias=False)
        (k_proj): Linear(in_features=768, out_features=768, bias=False)
        (v_proj): Linear(in_features=768, out_features=768, bias=False)
        (o_proj): Linear(in_features=768, out_features=768, bias=False)
      )
      (mlp): EuroBertMLP(
        (gate_proj): Linear(in_features=768, out_features=3072, bias=False)
        (up_proj): Linear(in_features=768, out_features=3072, bias=False)
        (down_proj): Linear(in_features=3072, out_features=768, bias=False)
        (act_fn): SiLU()
      )
      (input_layernorm): EuroBertRMSNorm((768,), eps=1e-05)
      (post_attention_layernorm): EuroBertRMSNorm((768,), eps=1e-05)
    )
  )
  (norm): EuroBertRMSNorm((768,), eps=1e-05)
  (rotary_emb): EuroBertRotaryEmbedding()
)

In [68]:
# preprocessing data
# NOTE(review): '[CLS]' / '[SEP]' are BERT WordPiece markers. EuroBERT's
# tokenizer (128256-entry vocabulary, see the printout above) is not expected
# to treat them as special tokens, so they would be split into ordinary
# sub-words and averaged into the sentence vector. Use the tokenizer's own
# begin/end-of-sequence tokens instead -- confirm with
# eurobert_tokenizer.all_special_tokens.
# Plain concatenation (no f-string) so a missing token fails loudly with a
# TypeError instead of inserting the text 'None'.
wrangled_txt = [
    (i[0], eurobert_tokenizer.bos_token + i[1] + eurobert_tokenizer.eos_token, i[2])
    for i in txt
]

In [72]:
# EuroBERT, sentences 0-999: embed tokens, average per sentence, write CSV.
wrangled_txt_1 = wrangled_txt[:1000]
word_embeddings_eurobert_1 = word_embeddings(
    eurobert_model, eurobert_tokenizer, wrangled_txt_1
)
sentence_embeddings_eurobert_1 = []
for word_embedding in word_embeddings_eurobert_1:
    sentence_embedding = word_embedding_to_sentence_embedding(word_embedding)
    sentence_embeddings_eurobert_1 += [
        (word_embedding[0], sentence_embedding, word_embedding[2])
    ]
df_eurobert_1 = pd.DataFrame(
    data=sentence_embeddings_eurobert_1,
    columns=['Category', 'Sentence Embedding', 'Length Label'],
)
df_eurobert_1.to_csv('D:\\eurobert_embeddings_1.csv')

In [73]:
# EuroBERT, sentences 1000-1999: embed tokens, average per sentence, write CSV.
wrangled_txt_2 = wrangled_txt[1000:2000]
word_embeddings_eurobert_2 = word_embeddings(
    eurobert_model, eurobert_tokenizer, wrangled_txt_2
)
sentence_embeddings_eurobert_2 = []
for word_embedding in word_embeddings_eurobert_2:
    sentence_embedding = word_embedding_to_sentence_embedding(word_embedding)
    sentence_embeddings_eurobert_2 += [
        (word_embedding[0], sentence_embedding, word_embedding[2])
    ]
df_eurobert_2 = pd.DataFrame(
    data=sentence_embeddings_eurobert_2,
    columns=['Category', 'Sentence Embedding', 'Length Label'],
)
df_eurobert_2.to_csv('D:\\eurobert_embeddings_2.csv')

In [74]:
# EuroBERT, sentences 2000-2999: embed tokens, average per sentence, write CSV.
wrangled_txt_3 = wrangled_txt[2000:3000]
word_embeddings_eurobert_3 = word_embeddings(
    eurobert_model, eurobert_tokenizer, wrangled_txt_3
)
sentence_embeddings_eurobert_3 = []
for word_embedding in word_embeddings_eurobert_3:
    sentence_embedding = word_embedding_to_sentence_embedding(word_embedding)
    sentence_embeddings_eurobert_3 += [
        (word_embedding[0], sentence_embedding, word_embedding[2])
    ]
df_eurobert_3 = pd.DataFrame(
    data=sentence_embeddings_eurobert_3,
    columns=['Category', 'Sentence Embedding', 'Length Label'],
)
df_eurobert_3.to_csv('D:\\eurobert_embeddings_3.csv')

In [75]:
# EuroBERT, sentences 3000-3999: embed tokens, average per sentence, write CSV.
wrangled_txt_4 = wrangled_txt[3000:4000]
word_embeddings_eurobert_4 = word_embeddings(
    eurobert_model, eurobert_tokenizer, wrangled_txt_4
)
sentence_embeddings_eurobert_4 = []
for word_embedding in word_embeddings_eurobert_4:
    sentence_embedding = word_embedding_to_sentence_embedding(word_embedding)
    sentence_embeddings_eurobert_4 += [
        (word_embedding[0], sentence_embedding, word_embedding[2])
    ]
df_eurobert_4 = pd.DataFrame(
    data=sentence_embeddings_eurobert_4,
    columns=['Category', 'Sentence Embedding', 'Length Label'],
)
df_eurobert_4.to_csv('D:\\eurobert_embeddings_4.csv')

In [76]:
# EuroBERT, sentences 4000-4999: embed tokens, average per sentence, write CSV.
wrangled_txt_5 = wrangled_txt[4000:5000]
word_embeddings_eurobert_5 = word_embeddings(
    eurobert_model, eurobert_tokenizer, wrangled_txt_5
)
sentence_embeddings_eurobert_5 = []
for word_embedding in word_embeddings_eurobert_5:
    sentence_embedding = word_embedding_to_sentence_embedding(word_embedding)
    sentence_embeddings_eurobert_5 += [
        (word_embedding[0], sentence_embedding, word_embedding[2])
    ]
df_eurobert_5 = pd.DataFrame(
    data=sentence_embeddings_eurobert_5,
    columns=['Category', 'Sentence Embedding', 'Length Label'],
)
df_eurobert_5.to_csv('D:\\eurobert_embeddings_5.csv')

In [78]:
# EuroBERT, sentences 5000-5999: embed tokens, average per sentence, write CSV.
wrangled_txt_6 = wrangled_txt[5000:6000]
word_embeddings_eurobert_6 = word_embeddings(
    eurobert_model, eurobert_tokenizer, wrangled_txt_6
)
sentence_embeddings_eurobert_6 = []
for word_embedding in word_embeddings_eurobert_6:
    sentence_embedding = word_embedding_to_sentence_embedding(word_embedding)
    sentence_embeddings_eurobert_6 += [
        (word_embedding[0], sentence_embedding, word_embedding[2])
    ]
df_eurobert_6 = pd.DataFrame(
    data=sentence_embeddings_eurobert_6,
    columns=['Category', 'Sentence Embedding', 'Length Label'],
)
df_eurobert_6.to_csv('D:\\eurobert_embeddings_6.csv')

In [79]:
# EuroBERT, sentences 6000-6999: embed tokens, average per sentence, write CSV.
wrangled_txt_7 = wrangled_txt[6000:7000]
word_embeddings_eurobert_7 = word_embeddings(
    eurobert_model, eurobert_tokenizer, wrangled_txt_7
)
sentence_embeddings_eurobert_7 = []
for word_embedding in word_embeddings_eurobert_7:
    sentence_embedding = word_embedding_to_sentence_embedding(word_embedding)
    sentence_embeddings_eurobert_7 += [
        (word_embedding[0], sentence_embedding, word_embedding[2])
    ]
df_eurobert_7 = pd.DataFrame(
    data=sentence_embeddings_eurobert_7,
    columns=['Category', 'Sentence Embedding', 'Length Label'],
)
df_eurobert_7.to_csv('D:\\eurobert_embeddings_7.csv')

In [80]:
# EuroBERT, sentences 7000 onward: embed tokens, average per sentence, write CSV.
wrangled_txt_8 = wrangled_txt[7000:]
word_embeddings_eurobert_8 = word_embeddings(
    eurobert_model, eurobert_tokenizer, wrangled_txt_8
)
sentence_embeddings_eurobert_8 = []
for word_embedding in word_embeddings_eurobert_8:
    sentence_embedding = word_embedding_to_sentence_embedding(word_embedding)
    sentence_embeddings_eurobert_8 += [
        (word_embedding[0], sentence_embedding, word_embedding[2])
    ]
df_eurobert_8 = pd.DataFrame(
    data=sentence_embeddings_eurobert_8,
    columns=['Category', 'Sentence Embedding', 'Length Label'],
)
df_eurobert_8.to_csv('D:\\eurobert_embeddings_8.csv')

# mBERT

In [83]:
# tokenizer, model: load multilingual BERT (cased). The model is configured to
# return plain tuples that include the hidden states.
mbert_tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')
mbert_model = AutoModel.from_pretrained(
    "bert-base-multilingual-cased",
    return_dict=False,
    output_hidden_states=True,
)
# Switch to evaluation mode; as the last expression this also displays the architecture.
mbert_model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(119547, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=Fals

In [85]:
# preprocessing data: frame every sentence with the '[CLS]' ... '[SEP]' markers
wrangled_txt = [
    (category, '[CLS] ' + sentence + ' [SEP]', length_label)
    for category, sentence, length_label in txt
]

In [86]:
# mBERT, sentences 0-999: embed tokens, average per sentence, write CSV.
wrangled_txt_1 = wrangled_txt[:1000]
word_embeddings_mbert_1 = word_embeddings(
    mbert_model, mbert_tokenizer, wrangled_txt_1
)
sentence_embeddings_mbert_1 = []
for word_embedding in word_embeddings_mbert_1:
    sentence_embedding = word_embedding_to_sentence_embedding(word_embedding)
    sentence_embeddings_mbert_1 += [
        (word_embedding[0], sentence_embedding, word_embedding[2])
    ]
df_mbert_1 = pd.DataFrame(
    data=sentence_embeddings_mbert_1,
    columns=['Category', 'Sentence Embedding', 'Length Label'],
)
df_mbert_1.to_csv('D:\\mbert_embeddings_1.csv')

In [87]:
# mBERT, sentences 1000-1999: embed tokens, average per sentence, write CSV.
wrangled_txt_2 = wrangled_txt[1000:2000]
word_embeddings_mbert_2 = word_embeddings(
    mbert_model, mbert_tokenizer, wrangled_txt_2
)
sentence_embeddings_mbert_2 = []
for word_embedding in word_embeddings_mbert_2:
    sentence_embedding = word_embedding_to_sentence_embedding(word_embedding)
    sentence_embeddings_mbert_2 += [
        (word_embedding[0], sentence_embedding, word_embedding[2])
    ]
df_mbert_2 = pd.DataFrame(
    data=sentence_embeddings_mbert_2,
    columns=['Category', 'Sentence Embedding', 'Length Label'],
)
df_mbert_2.to_csv('D:\\mbert_embeddings_2.csv')

In [88]:
# mBERT, sentences 2000-2999: embed tokens, average per sentence, write CSV.
wrangled_txt_3 = wrangled_txt[2000:3000]
word_embeddings_mbert_3 = word_embeddings(
    mbert_model, mbert_tokenizer, wrangled_txt_3
)
sentence_embeddings_mbert_3 = []
for word_embedding in word_embeddings_mbert_3:
    sentence_embedding = word_embedding_to_sentence_embedding(word_embedding)
    sentence_embeddings_mbert_3 += [
        (word_embedding[0], sentence_embedding, word_embedding[2])
    ]
df_mbert_3 = pd.DataFrame(
    data=sentence_embeddings_mbert_3,
    columns=['Category', 'Sentence Embedding', 'Length Label'],
)
df_mbert_3.to_csv('D:\\mbert_embeddings_3.csv')

In [89]:
# mBERT, sentences 3000-3999: embed tokens, average per sentence, write CSV.
wrangled_txt_4 = wrangled_txt[3000:4000]
word_embeddings_mbert_4 = word_embeddings(
    mbert_model, mbert_tokenizer, wrangled_txt_4
)
sentence_embeddings_mbert_4 = []
for word_embedding in word_embeddings_mbert_4:
    sentence_embedding = word_embedding_to_sentence_embedding(word_embedding)
    sentence_embeddings_mbert_4 += [
        (word_embedding[0], sentence_embedding, word_embedding[2])
    ]
df_mbert_4 = pd.DataFrame(
    data=sentence_embeddings_mbert_4,
    columns=['Category', 'Sentence Embedding', 'Length Label'],
)
df_mbert_4.to_csv('D:\\mbert_embeddings_4.csv')

In [90]:
# mBERT, sentences 4000-4999: embed tokens, average per sentence, write CSV.
wrangled_txt_5 = wrangled_txt[4000:5000]
word_embeddings_mbert_5 = word_embeddings(
    mbert_model, mbert_tokenizer, wrangled_txt_5
)
sentence_embeddings_mbert_5 = []
for word_embedding in word_embeddings_mbert_5:
    sentence_embedding = word_embedding_to_sentence_embedding(word_embedding)
    sentence_embeddings_mbert_5 += [
        (word_embedding[0], sentence_embedding, word_embedding[2])
    ]
df_mbert_5 = pd.DataFrame(
    data=sentence_embeddings_mbert_5,
    columns=['Category', 'Sentence Embedding', 'Length Label'],
)
df_mbert_5.to_csv('D:\\mbert_embeddings_5.csv')

In [91]:
# mBERT, sentences 5000-5999: embed tokens, average per sentence, write CSV.
wrangled_txt_6 = wrangled_txt[5000:6000]
word_embeddings_mbert_6 = word_embeddings(
    mbert_model, mbert_tokenizer, wrangled_txt_6
)
sentence_embeddings_mbert_6 = []
for word_embedding in word_embeddings_mbert_6:
    sentence_embedding = word_embedding_to_sentence_embedding(word_embedding)
    sentence_embeddings_mbert_6 += [
        (word_embedding[0], sentence_embedding, word_embedding[2])
    ]
df_mbert_6 = pd.DataFrame(
    data=sentence_embeddings_mbert_6,
    columns=['Category', 'Sentence Embedding', 'Length Label'],
)
df_mbert_6.to_csv('D:\\mbert_embeddings_6.csv')

In [92]:
# mBERT, sentences 6000-6999: embed tokens, average per sentence, write CSV.
wrangled_txt_7 = wrangled_txt[6000:7000]
word_embeddings_mbert_7 = word_embeddings(
    mbert_model, mbert_tokenizer, wrangled_txt_7
)
sentence_embeddings_mbert_7 = []
for word_embedding in word_embeddings_mbert_7:
    sentence_embedding = word_embedding_to_sentence_embedding(word_embedding)
    sentence_embeddings_mbert_7 += [
        (word_embedding[0], sentence_embedding, word_embedding[2])
    ]
df_mbert_7 = pd.DataFrame(
    data=sentence_embeddings_mbert_7,
    columns=['Category', 'Sentence Embedding', 'Length Label'],
)
df_mbert_7.to_csv('D:\\mbert_embeddings_7.csv')

In [93]:
# mBERT, sentences 7000 onward: embed tokens, average per sentence, write CSV.
wrangled_txt_8 = wrangled_txt[7000:]
word_embeddings_mbert_8 = word_embeddings(
    mbert_model, mbert_tokenizer, wrangled_txt_8
)
sentence_embeddings_mbert_8 = []
for word_embedding in word_embeddings_mbert_8:
    sentence_embedding = word_embedding_to_sentence_embedding(word_embedding)
    sentence_embeddings_mbert_8 += [
        (word_embedding[0], sentence_embedding, word_embedding[2])
    ]
df_mbert_8 = pd.DataFrame(
    data=sentence_embeddings_mbert_8,
    columns=['Category', 'Sentence Embedding', 'Length Label'],
)
df_mbert_8.to_csv('D:\\mbert_embeddings_8.csv')