# Word embeddings generated by a BERT model

In [None]:
import os
import re
import time
from datetime import datetime

import nltk
import numpy as np
import pandas as pd
import streamlit as st
import torch
from transformers import AutoTokenizer, BertModel, BertTokenizer

# Initialize BERT model and tokenizer
model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True)
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

# Read the input DataFrame
df = pd.read_csv('../../../data/processed/paragraph.csv')

# Create the output folder if it doesn't exist
output_folder = "../../../output/bert_embeddings_paragraph"
os.makedirs(output_folder, exist_ok=True)


start_time = time.time()
text = row['text']


In [28]:
df.tail()

Unnamed: 0.1,Unnamed: 0,ccode_iso,session,year,paragraph_index,text
1604952,1604953,ZWE,77,2022,20,"Notwithstanding our success, the ongoing delet..."
1604953,1604954,ZWE,77,2022,21,"At the international level, Zimbabwe has adopt..."
1604954,1604955,ZWE,77,2022,22,The spread of terrorism and the intensificatio...
1604955,1604956,ZWE,77,2022,23,Zimbabwe stands committed to playing its part ...
1604956,1604957,ZWE,77,2022,24,"In conclusion, Zimbabwe reaffirms its commitme..."


In [25]:

# Loop through each row in the DataFrame
for index, row in df.iterrows():
    # Extract relevant information from the row
    ccode_iso = row['ccode_iso']
    year = row['year']
    text = row['text']
  
    # Tokenize the text
    tokenized_text = tokenizer.tokenize(text)
    
    # Truncate or pad text to fit within max_seq_length
    truncate_length = len(tokenized_text) - 512 + 2  # +2 to account for [CLS] and [SEP]
    truncated_text = tokenized_text[truncate_length//2 : -truncate_length//2]

    # Add special tokens [CLS] and [SEP], convert tokens to ids, and create attention mask
    marked_text = ["[CLS] "] + truncated_text + [" [SEP]"]
    indexed_tokens = tokenizer.convert_tokens_to_ids(marked_text)
    attention_mask = [1] * len(indexed_tokens)

    # Pad sequences to max_seq_length
    if len(indexed_tokens) < 512:
        indexed_tokens.append(0)
        attention_mask.append(0)

    # Convert lists to PyTorch tensors
    tokens_tensors = torch.tensor([indexed_tokens])
    attention_masks = torch.tensor([attention_mask])

    # Run the BERT model
    with torch.no_grad():
        outputs = model(input_ids=tokens_tensors.view(-1, tokens_tensors.size(-1)), attention_mask=attention_masks.view(-1, attention_masks.size(-1)))

    # Extract the hidden states and create a DataFrame
    hidden_states = outputs[2][0].squeeze().numpy()
    pd_words = pd.Series(marked_text, name='term')
    df_outputs = pd.DataFrame(hidden_states)
    df_outputs['term'] = pd_words

    # Move 'term' column to the first position
    df_outputs = df_outputs[['term'] + [col for col in df_outputs.columns if col != 'term']]

    # Save the DataFrame to a CSV file
    output_file = os.path.join(output_folder, f'embedding_{ccode_iso}_{year}.csv')
    df_outputs.to_csv(output_file, index=False)

# Record the end time
end_time = time.time()

# Calculate and print the total time taken
total_time = end_time - start_time
print(f'Total time taken: {total_time} seconds')

I consider it a great honour and privilege to share with you the opportunities and responsibilities of the United Nations on this momentous occasion. This world Organization embodies the hopes and aspirations of the peoples of the world for peace, prosperity and prospects of a better and more fruitful life, It is our task to reaffirm and to help realize the aims and purposes which are expressed in the Preamble to the Charter of the United Nations with a degree of eloquence that only the urgent desire of a generation which had suffered the scourges of two world wars could formulate. It is for us, the representatives of Members of the United Nations, never to become oblivious to the sufferings and experiences which created the urgent desire to pursue the aims and the purposes of the United Nations. We must solemnly rededicate ourselves every day in discharging our functions. We must pledge ourselves to serve humanity. We must persist in our efforts in searching for ways and means of pres

TypeError: TextEncodeInput must be Union[TextInputSequence, Tuple[InputSequence, InputSequence]]

In [23]:
tokenizer.tokenize(df.text[1])

['In',
 'order',
 'to',
 'accomplish',
 'these',
 'ends',
 ',',
 'we',
 'must',
 'push',
 'forward',
 'the',
 'cultural',
 ',',
 'economic',
 'and',
 'social',
 'development',
 'of',
 'the',
 'under',
 '-',
 'developed',
 'areas',
 'of',
 'the',
 'world',
 'through',
 'the',
 'aid',
 'and',
 'assistance',
 'of',
 'the',
 'United',
 'Nations',
 'in',
 'all',
 'its',
 'phases',
 '.',
 'In',
 'the',
 'realization',
 'of',
 'these',
 'aims',
 'and',
 'objectives',
 ',',
 'happily',
 'the',
 'moral',
 'duties',
 'of',
 'the',
 'peoples',
 'of',
 'the',
 'United',
 'Nations',
 ',',
 'for',
 'which',
 'their',
 'collective',
 'conscience',
 'acts',
 'as',
 'their',
 'guide',
 ',',
 'coincide',
 '##s',
 'with',
 'their',
 'collective',
 'practical',
 'self',
 '-',
 'interest',
 '.']