In [49]:
import torch
from transformers import LongformerTokenizer, LongformerModel
import pandas as pd
from tqdm import tqdm

In [44]:
# The tokenizer and the encoder are both restored from the same pretrained
# Longformer checkpoint, so the checkpoint identifier is spelled out once.
checkpoint_name = 'allenai/longformer-base-4096'
tokenizer = LongformerTokenizer.from_pretrained(checkpoint_name)
model = LongformerModel.from_pretrained(checkpoint_name)

Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing LongformerModel: ['lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing LongformerModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [45]:
# Define a function to compute the document embedding vector
def get_document_embedding(document):
    """Return a mean-pooled Longformer embedding for a single document.

    The document is tokenized with the module-level ``tokenizer``, fed through
    the module-level ``model`` as a batch of one, and the first model output
    (one hidden-state vector per token) is averaged over the token axis.

    Args:
        document: Text to embed; it is passed straight to ``tokenizer.encode``.

    Returns:
        list[float]: One value per hidden dimension, each being the mean of
        that dimension over all token positions (special tokens included).
    """
    # Truncate to the tokenizer's maximum length so that an over-long document
    # is clipped instead of overflowing the model's position embeddings.
    tokens = tokenizer.encode(document, add_special_tokens=True, truncation=True)
    # Convert the token IDs to a tensor with a leading batch axis of size 1.
    input_ids = torch.tensor(tokens).unsqueeze(0)
    # Inference only: disabling autograd avoids building an unused graph,
    # which saves memory and time on every call.
    with torch.no_grad():
        outputs = model(input_ids)
    # Drop only the batch axis (a bare squeeze() would also collapse any other
    # size-1 axis), leaving a (tokens, hidden) matrix.
    token_states = outputs[0].squeeze(0)
    # Average over the token axis. The mean is taken in float64 to match the
    # precision of accumulating Python floats one by one.
    return token_states.double().mean(dim=0).tolist()


In [54]:
# Source CSV for the embedding step; every column is later read as a list of
# documents to embed.
annotated_csv_path = '/Users/carina/Downloads/courses/final thesis/dataset/annotated text.csv'
df = pd.read_csv(annotated_csv_path)

In [55]:
# Build a frame with the same column labels as `df`, where every cell holds
# the embedding vector of the corresponding cell in `df`.
df_vec = pd.DataFrame()
for column_name in df.columns:
    documents = df[column_name].tolist()
    # One progress bar per column; each document is embedded independently.
    df_vec[column_name] = [
        get_document_embedding(text) for text in tqdm(documents)
    ]

100%|█████████████████████████████████████████| 480/480 [16:20<00:00,  2.04s/it]
100%|█████████████████████████████████████████| 480/480 [16:24<00:00,  2.05s/it]
100%|█████████████████████████████████████████| 480/480 [16:07<00:00,  2.01s/it]
100%|█████████████████████████████████████████| 480/480 [15:29<00:00,  1.94s/it]
100%|█████████████████████████████████████████| 480/480 [15:24<00:00,  1.93s/it]
100%|█████████████████████████████████████████| 480/480 [15:50<00:00,  1.98s/it]


In [57]:
# Persist the embeddings with pandas' default CSV settings (the row index is
# written as the first column).
embeddings_csv_path = '/Users/carina/Downloads/courses/final thesis/precessed data/doc2vec_longformer.csv'
df_vec.to_csv(embeddings_csv_path)