In [2]:
import torch
import pandas as pd
from transformers import BertTokenizer, BertModel
import matplotlib.pyplot as plt
# Tokenizer for the 'bert-base-uncased' checkpoint; it is handed to
# bert_text_preparation() further down.
# NOTE(review): the name `tokenizer` is rebound to a Longformer tokenizer in a
# later cell, so re-running the BERT cells after that one uses the wrong object.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [3]:




# Load the pre-processed corpus from disk.
df = pd.read_csv('../../data/processed/light.csv')

# Pull the two columns used downstream into plain Python lists.
# No rows are filtered here: every row contributes one text and one year.
texts = df['text'].tolist()
timestamps = df['year'].tolist()



In [4]:
def bert_text_preparation(texts, tokenizer, max_seq_length=612):
    """Convert raw texts into BERT-ready tokens, token ids and segment ids.

    Each text is shortened to at most ``max_seq_length - 2`` *characters* by
    trimming the same amount from both ends (the middle is kept), wrapped in
    ``[CLS]`` / ``[SEP]``, tokenized and mapped to ids. Texts that already fit
    are left untouched.

    Note:
        The length limit is applied to characters, not to tokens, so it does
        not by itself bound the number of tokens produced.
        NOTE(review): confirm the resulting token counts stay within the
        maximum input length of the model these tensors are fed to.

    Args:
        texts (list): List of strings (sentences/documents) to be converted.
        tokenizer (obj): Tokenizer exposing ``tokenize(str)`` and
            ``convert_tokens_to_ids(list)``.
        max_seq_length (int): Character budget per text, including two
            positions reserved for ``[CLS]`` and ``[SEP]``.

    Returns:
        list: One list of tokens per input text.
        obj: Torch tensor of token ids, shape ``(len(texts), longest)``,
            right-padded with 0 to the longest sequence in the batch.
        obj: Torch tensor of the same shape holding 1 for every real token
            and 0 for padding.
        For an empty ``texts`` the list is empty and both tensors have
        shape ``(0, 0)``.
    """
    tokenized_texts = []
    id_sequences = []
    segment_sequences = []

    # Two positions are reserved for the [CLS] and [SEP] markers.
    char_budget = max_seq_length - 2

    for text in texts:
        overflow = len(text) - char_budget
        # Only trim when the text is actually too long. Slicing unconditionally
        # turns an overflow of exactly 0 into text[0:0], i.e. an empty string.
        if overflow > 0:
            head = overflow // 2
            tail = overflow - head
            # Clamp so a budget smaller than the text can never wrap the slice
            # around through a negative end index.
            text = text[head:max(len(text) - tail, head)]

        marked_text = "[CLS] " + text + " [SEP]"
        tokens = tokenizer.tokenize(marked_text)
        token_ids = tokenizer.convert_tokens_to_ids(tokens)

        tokenized_texts.append(tokens)
        id_sequences.append(torch.tensor(token_ids))
        segment_sequences.append(torch.tensor([1] * len(token_ids)))

    # pad_sequence rejects an empty list, so return empty tensors explicitly.
    if not tokenized_texts:
        return (
            tokenized_texts,
            torch.empty((0, 0), dtype=torch.long),
            torch.empty((0, 0), dtype=torch.long),
        )

    # Right-pad with zeros up to the longest sequence in this batch.
    tokens_tensors = torch.nn.utils.rnn.pad_sequence(id_sequences, batch_first=True)
    segments_tensors = torch.nn.utils.rnn.pad_sequence(segment_sequences, batch_first=True)

    return tokenized_texts, tokens_tensors, segments_tensors


In [5]:
# Load the pretrained BERT encoder and request the hidden states of all layers
# in the model output; the embedding-extraction cell reads them.
model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True)


In [6]:
# Tokenize the whole corpus: per-text token lists plus padded id / segment tensors.
tokenized_text, tokens_tensor, segments_tensors = bert_text_preparation(
    texts=texts,
    tokenizer=tokenizer,
)



In [None]:
# Forward pass for inference only, so no gradient tracking is needed.
with torch.no_grad():
    # segments_tensors holds 1 on real tokens and 0 on padding. BertModel's second
    # positional parameter is `attention_mask`, so that is the role this tensor
    # has always played here; passing it by keyword makes that explicit.
    outputs = model(tokens_tensor, attention_mask=segments_tensors)
    # With output_hidden_states=True the third output is the tuple of
    # per-layer hidden states.
    hidden_states = outputs[2]

# Selecting the output embeddings from the last layer
token_embeddings = hidden_states[-1]

# Embedding of the first occurrence of the token "sovereignty" in the first text.
# `tokenized_text` is the token list returned by bert_text_preparation above;
# list.index raises ValueError if the token does not occur in that text.
word_index = tokenized_text[0].index("sovereignty")
word_embedding = token_embeddings[0, word_index].numpy()


In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Compare two word vectors with cosine similarity.
# NOTE(review): `word_embedding_sovereignty` and `word_embedding_territorial` are
# not assigned in any cell above this one in the visible notebook (the previous
# cell only creates `word_embedding`) — they must be defined before this runs.

# Reshape embeddings to be 2D single-row arrays, the input layout cosine_similarity works on
embedding_sovereignty = word_embedding_sovereignty.reshape(1, -1)
embedding_territorial = word_embedding_territorial.reshape(1, -1)

# Calculate cosine similarity; the result is indexed as [0][0] below to get the scalar
cosine_sim = cosine_similarity(embedding_sovereignty, embedding_territorial)

print("Cosine Similarity between 'sovereignty' and 'territorial integrity':", cosine_sim[0][0])


In [None]:
import plotly.express as px

# Plot how the similarity between the two terms changes across time points.
# NOTE(review): `num_time_points` and `get_word_embedding_at_time_point` are not
# defined anywhere in the visible notebook — this cell is a template until they exist.

# Accumulators for the plot. Despite its name, `distances` stores cosine
# similarities (higher = more similar), not distances.
distances = []
time_points = []

# Collect embeddings at different time points
for time_point in range(num_time_points):
    # Look up the embedding of each term at this time point
    word_embedding_sovereignty = get_word_embedding_at_time_point("sovereignty", time_point)
    word_embedding_territorial = get_word_embedding_at_time_point("territorial integrity", time_point)

    # Calculate cosine similarity on single-row 2D views of the two vectors
    cosine_sim = cosine_similarity(word_embedding_sovereignty.reshape(1, -1), word_embedding_territorial.reshape(1, -1))

    # Record the scalar similarity together with its time point
    distances.append(cosine_sim[0][0])
    time_points.append(time_point)

# Line chart with one marker per time point
fig = px.line(x=time_points, y=distances, markers=True, labels={'x': 'Time Points', 'y': 'Cosine Similarity'},
              title='Cosine Similarity between "sovereignty" and "territorial integrity" over Time')

# Show the interactive plot
fig.show()


In [29]:
from transformers import LongformerTokenizer, LongformerModel

# Load the Longformer tokenizer and encoder from the 'allenai/longformer-base-4096' checkpoint.
# NOTE(review): this rebinds `tokenizer` and `model`, which the BERT cells above
# also use — after this cell runs, re-executing those cells picks up the
# Longformer objects. Distinct names would avoid the hidden-state dependency.
tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
model = LongformerModel.from_pretrained('allenai/longformer-base-4096')


python(44409) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Downloading vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/597M [00:00<?, ?B/s]

In [30]:
def longformer_text_preparation(texts, tokenizer, max_seq_length=4096):
    """Encode texts into fixed-length id and attention-mask batches.

    Every text is tokenized on its own, truncated to ``max_seq_length`` tokens
    and padded up to exactly that length, then all rows are joined into one
    batch.

    Args:
        texts (list): List of strings to encode.
        tokenizer (obj): Callable tokenizer that accepts ``return_tensors``,
            ``truncation``, ``padding`` and ``max_length`` and returns a
            mapping with ``'input_ids'`` and ``'attention_mask'`` tensors of
            shape ``(1, max_seq_length)``.
        max_seq_length (int): Number of tokens per encoded text.

    Returns:
        obj: Torch tensor of token ids, shape ``(len(texts), max_seq_length)``.
        obj: Torch tensor attention mask of the same shape.
        For an empty ``texts`` both tensors have shape ``(0, max_seq_length)``.
    """
    id_rows = []
    mask_rows = []

    for text in texts:
        # The tokenizer truncates by *tokens*. Slicing the raw string to
        # max_seq_length characters first would throw away text long before
        # the token limit is reached, so the full text is passed through.
        encoding = tokenizer(
            text,
            return_tensors='pt',
            truncation=True,
            padding='max_length',
            max_length=max_seq_length,
        )
        id_rows.append(encoding['input_ids'])
        mask_rows.append(encoding['attention_mask'])

    # torch.cat rejects an empty list, so build the empty batch explicitly.
    if not id_rows:
        return (
            torch.empty((0, max_seq_length), dtype=torch.long),
            torch.empty((0, max_seq_length), dtype=torch.long),
        )

    # Each encoding already carries a leading batch axis of size 1, so
    # concatenate along it. Stacking would add another axis and produce
    # (N, 1, L) instead of the (N, L) batch layout.
    tokenized_texts = torch.cat(id_rows, dim=0)
    attention_masks = torch.cat(mask_rows, dim=0)

    return tokenized_texts, attention_masks


In [None]:

# Encode the corpus for Longformer: token ids plus matching attention masks.
tokenized_texts, attention_masks = longformer_text_preparation(
    texts=texts,
    tokenizer=tokenizer,
)

# Inference only, so gradient tracking is switched off for the forward pass.
with torch.no_grad():
    outputs = model(attention_mask=attention_masks, input_ids=tokenized_texts)

# Keep the final-layer hidden states from the model output.
hidden_states = outputs.last_hidden_state
