# Get embedding vectors with GPT-2

In [1]:
import torch
import numpy as np
from transformers import GPT2Tokenizer, GPT2Model
import pandas as pd
from tqdm import tqdm
import string

In [2]:
# Fetch the pretrained GPT-2 weights and the matching tokenizer.
# output_hidden_states=True requests the per-layer hidden states, which the
# embedding loop below reads from the model output.
GPT2_CHECKPOINT = 'gpt2'
model = GPT2Model.from_pretrained(GPT2_CHECKPOINT, output_hidden_states=True)
tokenizer = GPT2Tokenizer.from_pretrained(GPT2_CHECKPOINT)

In [3]:
# Translation table mapping every ASCII punctuation code point to None, so that
# str.translate(punctuation_map) deletes punctuation in the next step.
punctuation_map = {ord(symbol): None for symbol in string.punctuation}

In [4]:
# Read the annotated paragraphs table; every column of it is embedded in the loop below.
annotated_csv = '/Users/carina/Downloads/courses/final thesis/dataset/annotated text.csv'
df_paragraphs = pd.read_csv(annotated_csv)

In [5]:
# For every column of df_paragraphs, run each paragraph through GPT-2 and keep
# one last-layer embedding vector per token position.
# Result: df_emvecOfaw[head] holds, for each row, a list of numpy vectors
# (one per token, in token order).
heads = df_paragraphs.columns.values.tolist()
df_emvecOfaw = pd.DataFrame()
for head in heads:
    paras = df_paragraphs[head].tolist()

    words_embeddings = []
    for paragraphs in tqdm(paras):
        # Normalise the text: lowercase, then strip ASCII punctuation.
        paragraphs = paragraphs.lower().translate(punctuation_map)

        # Tokenize the paragraph and convert the tokens to IDs.
        token_ids = tokenizer.encode(paragraphs, add_special_tokens=True)
        if not token_ids:
            # Nothing is left after cleaning; the model cannot be run on an
            # empty sequence, so record an empty embedding list for this row.
            words_embeddings.append([])
            continue
        input_ids = torch.tensor(token_ids).unsqueeze(0)  # batch size 1

        # Generate embeddings with the GPT-2 model. This is inference only, so
        # disable autograd to avoid building a graph for every paragraph.
        with torch.no_grad():
            outputs = model(input_ids)

        # Get the hidden states of the last layer from the model.
        if len(outputs) >= 3:
            last_layer_hidden_states = outputs[2][-1]
        else:
            last_layer_hidden_states = outputs.last_hidden_state

        # Take the vectors by position. Looking a token up with list.index()
        # would return its FIRST occurrence, so a repeated token would wrongly
        # receive the contextual vector of its first appearance.
        token_vectors = last_layer_hidden_states[0].detach().numpy()
        words_embeddings.append(list(token_vectors))
    df_emvecOfaw[head] = words_embeddings

100%|█████████████████████████████████████████| 480/480 [04:43<00:00,  1.69it/s]
100%|█████████████████████████████████████████| 480/480 [04:52<00:00,  1.64it/s]
100%|█████████████████████████████████████████| 480/480 [04:12<00:00,  1.90it/s]
100%|█████████████████████████████████████████| 480/480 [03:35<00:00,  2.22it/s]
100%|█████████████████████████████████████████| 480/480 [03:55<00:00,  2.04it/s]
100%|█████████████████████████████████████████| 480/480 [04:23<00:00,  1.82it/s]


In [6]:
# Persist the GPT-2 embeddings table as CSV.
gpt2_output_csv = '/Users/carina/Downloads/courses/final thesis/precessed data/ex2/emvec_GPT2_allwords.csv'
df_emvecOfaw.to_csv(gpt2_output_csv)

# Get embedding vectors with BERT

In [None]:
import torch
from transformers import BertTokenizer, BertModel
import pandas as pd
from tqdm import tqdm
import string
import numpy as np

In [None]:
# Fetch the pretrained uncased BERT weights and the matching tokenizer.
BERT_CHECKPOINT = 'bert-base-uncased'
model = BertModel.from_pretrained(BERT_CHECKPOINT)
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)

In [None]:
# Translation table that maps each ASCII punctuation code point to None;
# passing it to str.translate removes punctuation from a paragraph.
punctuation_map = {ord(mark): None for mark in string.punctuation}

In [None]:
# Define the paragraph and words to generate embeddings for
df_paragraphs = pd.read_csv('/Users/carina/Downloads/courses/final thesis/dataset/annotated text.csv')

In [None]:
# For every column of df_paragraphs, run each paragraph through BERT and keep
# one last-layer embedding vector per wordpiece token.
# Result: df_emvecOfaw[head] holds, for each row, a list of numpy vectors
# (one per token, in token order).
heads = df_paragraphs.columns.values.tolist()
df_emvecOfaw = pd.DataFrame()
for head in heads:
    paras = df_paragraphs[head].tolist()

    words_embeddings = []
    for paragraphs in tqdm(paras):
        # Strip ASCII punctuation. No explicit lowercasing is done here;
        # presumably the 'bert-base-uncased' tokenizer handles that — TODO confirm.
        paragraphs = paragraphs.translate(punctuation_map)

        # Tokenize the paragraph and convert the tokens to IDs.
        # NOTE(review): ids are built via tokenize()/convert_tokens_to_ids(), so
        # no [CLS]/[SEP] tokens appear to be added — confirm this is intended.
        tokens = tokenizer.tokenize(paragraphs)
        if not tokens:
            # Nothing is left after cleaning; the model cannot be run on an
            # empty sequence, so record an empty embedding list for this row.
            words_embeddings.append([])
            continue
        token_ids = tokenizer.convert_tokens_to_ids(tokens)

        # Convert the input IDs to a PyTorch tensor.
        input_ids = torch.tensor(token_ids).unsqueeze(0)  # Batch size 1

        # Generate embeddings with the BERT model: a single forward pass, with
        # autograd disabled because this is inference only.
        with torch.no_grad():
            outputs = model(input_ids)

        # Take the vectors by position. Looking a token up with list.index()
        # would return its FIRST occurrence, so a repeated token would wrongly
        # receive the contextual vector of its first appearance.
        token_vectors = outputs.last_hidden_state[0].detach().numpy()
        words_embeddings.append(list(token_vectors))
    df_emvecOfaw[head] = words_embeddings

In [None]:
# Persist the BERT embeddings table as CSV.
bert_output_csv = '/Users/carina/Downloads/courses/final thesis/precessed data/ex2/emvec_BERT_allwords.csv'
df_emvecOfaw.to_csv(bert_output_csv)