In [1]:
# imports
import torch
from transformers import AutoTokenizer, AutoModel
import pandas as pd

In [2]:
# load the preprocessed sentences as (category, sentence, length label) tuples
sentence_columns = ['Category', 'Sentence', 'Length Label']
sentences_df = pd.read_csv('D:\\preprocessed_sentences_Lassy_Klein.csv')
txt = list(sentences_df[sentence_columns].itertuples(index=False, name=None))

In [3]:
# word embeddings function
def word_embeddings(model, tokenizer, wrangled_txt):
    """Compute one contextual vector per token for every sentence.

    Each sentence is split with ``tokenizer.tokenize`` (no special tokens are
    added here, so the caller must already have placed them in the text),
    fed through ``model`` on its own as a batch of one, and every token is
    represented by the element-wise sum of its last four hidden-state layers.

    Parameters
    ----------
    model : callable
        Invoked as ``model(input_ids, attention_mask=mask)``. The last element
        of its return value must be the sequence of per-layer hidden states,
        each shaped ``(1, n_tokens, hidden_size)``.
    tokenizer : object
        Must provide ``tokenize(str)`` and ``convert_tokens_to_ids(list)``.
    wrangled_txt : iterable of tuple
        ``(category, sentence, length_label)`` entries.

    Returns
    -------
    list of tuple
        ``(category, token_vectors, length_label)`` per input entry, where
        ``token_vectors`` is a list with one 1-D tensor per token.
    """
    embeddings = []
    for entry in wrangled_txt:
        category, sentence, length_label = entry[0], entry[1], entry[2]

        token_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentence))
        input_ids = torch.tensor([token_ids])
        # The second positional argument of the BERT/RoBERTa-style forward is
        # the attention mask, not segment ids; pass it by keyword so the
        # all-ones tensor is unambiguous.
        attention_mask = torch.ones_like(input_ids)

        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)
        hidden_states = outputs[-1]

        # Reduce right away instead of keeping every layer of every sentence:
        # (4, 1, n_tokens, hidden) -> sum over layers -> drop the batch dim.
        last_four = torch.stack(tuple(hidden_states[-4:]), dim=0)
        summed = last_four.sum(dim=0).squeeze(0)

        token_vectors = list(summed.unbind(dim=0))
        embeddings.append((category, token_vectors, length_label))
    return embeddings

In [4]:
# sentence embeddings function
def word_embedding_to_sentence_embedding(word_embedding):
    """Average a sentence's token vectors into a single sentence vector.

    Parameters
    ----------
    word_embedding : tuple
        ``(category, token_vectors, length_label)`` where ``token_vectors`` is
        a non-empty sequence of equally sized 1-D tensors. Only element 1 is
        read.

    Returns
    -------
    list of float
        The element-wise mean over all token vectors.

    Raises
    ------
    IndexError
        If ``token_vectors`` is empty.
    """
    token_vectors = word_embedding[1]
    if len(token_vectors) == 0:
        raise IndexError("cannot average an empty list of token vectors")

    stacked = torch.stack(list(token_vectors), dim=0)
    if not stacked.is_floating_point():
        # mean() is undefined for integer tensors; the division-based
        # average always produced floats, so keep doing that.
        stacked = stacked.to(torch.float32)
    # One vectorised reduction replaces hidden_size * n_tokens scalar ops.
    return stacked.mean(dim=0).tolist()

# RobBERT-2023

In [6]:
# tokenizer, model (RobBERT-2023)
ROBBERT_CHECKPOINT = "DTAI-KULeuven/robbert-2023-dutch-large"
robbert_tokenizer = AutoTokenizer.from_pretrained(ROBBERT_CHECKPOINT)
robbert_model = AutoModel.from_pretrained(
    ROBBERT_CHECKPOINT,
    output_hidden_states=True,  # word_embeddings() reads the per-layer states
    return_dict=False,          # plain tuple output; hidden states come last
)
robbert_model.eval()

Some weights of RobertaModel were not initialized from the model checkpoint at DTAI-KULeuven/robbert-2023-dutch-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50000, 1024, padding_idx=1)
    (position_embeddings): Embedding(514, 1024, padding_idx=1)
    (token_type_embeddings): Embedding(1, 1024)
    (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0-23): 24 x RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSdpaSelfAttention(
            (query): Linear(in_features=1024, out_features=1024, bias=True)
            (key): Linear(in_features=1024, out_features=1024, bias=True)
            (value): Linear(in_features=1024, out_features=1024, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=1024, out_features=1024, bias=True)
            (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  

In [7]:
# data preprocessing
# Wrap each sentence in both RoBERTa markers. Previously only '<s> ' was
# prepended: the closing '</s>' was missing (the other models' cells add both
# a start and an end marker), and the extra space changed the first word's
# BPE piece. No spaces are inserted so the pieces match what the tokenizer's
# own encoding would produce.
# NOTE(review): verify with robbert_tokenizer.tokenize(wrangled_txt[0][1]).
wrangled_txt = [
    (category, '<s>' + sentence + '</s>', length_label)
    for category, sentence, length_label in txt
]

In [8]:
# word embeddings RobBERT: one summed-last-four-layers vector per token
word_embeddings_robbert = word_embeddings(
    model=robbert_model,
    tokenizer=robbert_tokenizer,
    wrangled_txt=wrangled_txt,
)

In [9]:
# sentence embeddings RobBERT: average the token vectors of each sentence,
# keeping the category and length label alongside
sentence_embeddings_robbert = [
    (entry[0], word_embedding_to_sentence_embedding(entry), entry[2])
    for entry in word_embeddings_robbert
]

In [10]:
# dataframe RobBERT: one row per sentence
robbert_columns = ['Category', 'Sentence Embedding', 'Length Label']
df_robbert = pd.DataFrame(data=sentence_embeddings_robbert, columns=robbert_columns)

# BERTje

In [12]:
# tokenizer, model (BERTje)
BERTJE_CHECKPOINT = "GroNLP/bert-base-dutch-cased"
bertje_tokenizer = AutoTokenizer.from_pretrained(BERTJE_CHECKPOINT)
bertje_model = AutoModel.from_pretrained(
    BERTJE_CHECKPOINT,
    output_hidden_states=True,  # word_embeddings() reads the per-layer states
    return_dict=False,          # plain tuple output; hidden states come last
)
bertje_model.eval()

Some weights of BertModel were not initialized from the model checkpoint at GroNLP/bert-base-dutch-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30073, 768, padding_idx=3)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [13]:
# preprocessing data: wrap every sentence in BERT's [CLS] ... [SEP] markers
wrangled_txt = [
    (category, '[CLS] ' + sentence + ' [SEP]', length_label)
    for category, sentence, length_label in txt
]

In [14]:
# word embeddings BERTje: one summed-last-four-layers vector per token
word_embeddings_bertje = word_embeddings(
    model=bertje_model,
    tokenizer=bertje_tokenizer,
    wrangled_txt=wrangled_txt,
)

In [15]:
# sentence embeddings BERTje: average the token vectors of each sentence,
# keeping the category and length label alongside
sentence_embeddings_bertje = [
    (entry[0], word_embedding_to_sentence_embedding(entry), entry[2])
    for entry in word_embeddings_bertje
]

In [16]:
# dataframe BERTje: one row per sentence
bertje_columns = ['Category', 'Sentence Embedding', 'Length Label']
df_bertje = pd.DataFrame(data=sentence_embeddings_bertje, columns=bertje_columns)

# EuroBERT

In [18]:
# tokenizer, model (EuroBERT)
EUROBERT_CHECKPOINT = "EuroBERT/EuroBERT-210m"
eurobert_tokenizer = AutoTokenizer.from_pretrained(EUROBERT_CHECKPOINT)
eurobert_model = AutoModel.from_pretrained(
    EUROBERT_CHECKPOINT,
    trust_remote_code=True,     # NOTE(review): runs model code shipped with the checkpoint
    output_hidden_states=True,  # word_embeddings() reads the per-layer states
    return_dict=False,          # plain tuple output; hidden states come last
)
eurobert_model.eval()

EuroBertModel(
  (embed_tokens): Embedding(128256, 768, padding_idx=128001)
  (layers): ModuleList(
    (0-11): 12 x EuroBertDecoderLayer(
      (self_attn): EuroBertAttention(
        (q_proj): Linear(in_features=768, out_features=768, bias=False)
        (k_proj): Linear(in_features=768, out_features=768, bias=False)
        (v_proj): Linear(in_features=768, out_features=768, bias=False)
        (o_proj): Linear(in_features=768, out_features=768, bias=False)
      )
      (mlp): EuroBertMLP(
        (gate_proj): Linear(in_features=768, out_features=3072, bias=False)
        (up_proj): Linear(in_features=768, out_features=3072, bias=False)
        (down_proj): Linear(in_features=3072, out_features=768, bias=False)
        (act_fn): SiLU()
      )
      (input_layernorm): EuroBertRMSNorm((768,), eps=1e-05)
      (post_attention_layernorm): EuroBertRMSNorm((768,), eps=1e-05)
    )
  )
  (norm): EuroBertRMSNorm((768,), eps=1e-05)
  (rotary_emb): EuroBertRotaryEmbedding()
)

In [19]:
# preprocessing data
# '[CLS]' / '[SEP]' are BERT WordPiece markers. EuroBERT's tokenizer does not
# define them, so the literals would be split into ordinary sub-word pieces
# and averaged into the sentence embedding. Use the tokenizer's own
# begin/end-of-sequence markers instead (no spaces, so the sentence's pieces
# are unchanged).
# NOTE(review): verify with eurobert_tokenizer.tokenize(wrangled_txt[0][1]).
wrangled_txt = [
    (
        category,
        eurobert_tokenizer.bos_token + sentence + eurobert_tokenizer.eos_token,
        length_label,
    )
    for category, sentence, length_label in txt
]

In [20]:
# word embeddings EuroBERT: one summed-last-four-layers vector per token
word_embeddings_eurobert = word_embeddings(
    model=eurobert_model,
    tokenizer=eurobert_tokenizer,
    wrangled_txt=wrangled_txt,
)

In [21]:
# sentence embeddings EuroBERT: average the token vectors of each sentence,
# keeping the category and length label alongside
sentence_embeddings_eurobert = [
    (entry[0], word_embedding_to_sentence_embedding(entry), entry[2])
    for entry in word_embeddings_eurobert
]

In [22]:
# dataframe EuroBERT: one row per sentence
eurobert_columns = ['Category', 'Sentence Embedding', 'Length Label']
df_eurobert = pd.DataFrame(data=sentence_embeddings_eurobert, columns=eurobert_columns)

# mBERT

In [26]:
# tokenizer, model (mBERT)
MBERT_CHECKPOINT = "bert-base-multilingual-cased"
mbert_tokenizer = AutoTokenizer.from_pretrained(MBERT_CHECKPOINT)
mbert_model = AutoModel.from_pretrained(
    MBERT_CHECKPOINT,
    output_hidden_states=True,  # word_embeddings() reads the per-layer states
    return_dict=False,          # plain tuple output; hidden states come last
)
mbert_model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(119547, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=Fals

In [27]:
# preprocessing data: wrap every sentence in BERT's [CLS] ... [SEP] markers
wrangled_txt = [
    (category, '[CLS] ' + sentence + ' [SEP]', length_label)
    for category, sentence, length_label in txt
]

In [28]:
# word embeddings mBERT: one summed-last-four-layers vector per token
word_embeddings_mbert = word_embeddings(
    model=mbert_model,
    tokenizer=mbert_tokenizer,
    wrangled_txt=wrangled_txt,
)

In [29]:
# sentence embeddings mBERT: average the token vectors of each sentence,
# keeping the category and length label alongside
sentence_embeddings_mbert = [
    (entry[0], word_embedding_to_sentence_embedding(entry), entry[2])
    for entry in word_embeddings_mbert
]

In [30]:
# dataframe mBERT: one row per sentence
mbert_columns = ['Category', 'Sentence Embedding', 'Length Label']
df_mbert = pd.DataFrame(data=sentence_embeddings_mbert, columns=mbert_columns)

# Saving the sentence embeddings

In [32]:
# write each model's sentence embeddings to its own csv file
embedding_outputs = [
    (df_robbert, 'D:\\sentence_embeddings_robbert_lassy_klein.csv'),    # robBERT
    (df_bertje, 'D:\\sentence_embeddings_bertje_lassy_klein.csv'),      # BERTje
    (df_eurobert, 'D:\\sentence_embeddings_eurobert_lassy_klein.csv'),  # EuroBERT
    (df_mbert, 'D:\\sentence_embeddings_mbert_lassy_klein.csv'),        # mBERT
]
for frame, csv_path in embedding_outputs:
    frame.to_csv(csv_path)