In [14]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
from transformers import BertTokenizer, BertModel
import numpy as np
from tqdm import tqdm
import pickle

In [16]:
# Pre-trained model identifier; passed to both the tokenizer and the BERT
# encoder via `from_pretrained`, and embedded in the output pickle file name.
model_name = "google-bert/bert-base-uncased"
# Name of the dataset split; used as the prefix of the input CSV
# ("<file_name>_split.csv") and as part of the output pickle file name.
file_name = "train"

In [15]:
# Load the pre-trained BERT tokenizer identified by `model_name`, so the
# token ids produced here correspond to the encoder loaded from the same name.
tokenizer = BertTokenizer.from_pretrained(model_name)

In [17]:
# Read the CSV for the configured split (e.g. "train_split.csv") into a DataFrame
data = pd.read_csv(f"{file_name}_split.csv")

In [18]:
# Select the 'tweet' column: the raw text that is tokenized and embedded below
text_data = data['tweet']

In [20]:
# Model setup for feature extraction.
#
# NOTE: an earlier version of this cell also tokenized the whole corpus (without
# truncation), padded it to the longest sequence and built one big `input_ids`
# tensor. That tensor was never fed to the model: the embedding loop tokenizes
# each tweet itself (with truncation) and overwrites those variables, so the
# extra pass only cost time and memory. It has been removed.

# Initialize the pre-trained BERT encoder identified by `model_name`
model = BertModel.from_pretrained(model_name)

# Switch to evaluation mode so the forward passes used for feature extraction
# run without training-time behavior such as dropout
model.eval()

# List that collects one pooled vector per text sample
bert_vectors = []

In [None]:
# Embed every text sample with BERT, one sample at a time, and mean-pool the
# token representations into a single vector per sample.

# Maximum number of tokens passed to the model; longer inputs are truncated.
MAX_SEQ_LEN = 512

# Start from an empty list inside this cell so it can be re-run safely:
# re-running after an interruption would otherwise append duplicate rows, and
# re-running after `bert_vectors` has been converted to a numpy array would
# fail because arrays have no `.append`.
bert_vectors = []

for text in tqdm(text_data):
    # Tokenize this sample only (adds the special tokens, truncates long inputs)
    token_ids = tokenizer.encode(text, add_special_tokens=True, max_length=MAX_SEQ_LEN, truncation=True)

    # Convert to a tensor and add the batch dimension expected by the model
    sample_input = torch.tensor(token_ids).unsqueeze(0)

    # Forward pass without gradient tracking (inference only)
    with torch.no_grad():
        outputs = model(sample_input)

    # First element of the model output: the last layer's token representations
    last_hidden = outputs[0]

    # Average over the token dimension. The sample is processed alone and
    # unpadded, so every position is a real token and no attention mask is
    # needed for the mean. Then drop the batch dimension.
    pooled_output = last_hidden.mean(dim=1).squeeze(0).numpy()

    # Collect the pooled vector for this sample
    bert_vectors.append(pooled_output)

 74%|███████▍  | 6275/8480 [18:22<05:55,  6.21it/s]

In [None]:
# Turn the collected per-sample vectors into a single numpy array
bert_vectors = np.asarray(bert_vectors)

In [None]:
# Print the array of pooled vectors as a quick visual sanity check
print(bert_vectors)

In [None]:
# Display the array's dimensions
# (presumably one row per text sample — TODO confirm against len(text_data))
bert_vectors.shape

In [None]:
# Save bert_vectors together with the labels column into a pickle file
output_path = "bert_vectors_"+file_name+"_"+model_name+".pkl"

# `model_name` can contain "/" (e.g. "google-bert/bert-base-uncased"), which
# makes the first part of the file name a directory. Create that directory
# first; otherwise open(..., "wb") raises FileNotFoundError.
output_dir = os.path.dirname(output_path)
if output_dir:  # empty when the path has no directory component
    os.makedirs(output_dir, exist_ok=True)

with open(output_path, "wb") as f:
    pickle.dump((bert_vectors, data['label']), f)