In [1]:
import pandas as pd
import torch
import numpy as np
from transformers import BertTokenizer, BertModel
from tqdm import tqdm
import os

# Load dataset
df = pd.read_csv("/kaggle/input/google-dataset/go_emotions_dataset.csv")

# Use the 'headline' column (you can change this to 'short_description' or combine both)
texts = df['text'].astype(str).tolist()

# Load BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")
model.eval()

# Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# BERT embedding function using [CLS] token
def get_bert_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state[:, 0, :].squeeze().cpu().numpy()

# Generate embeddings
embeddings = []
for text in tqdm(texts, desc="Generating BERT embeddings"):
    emb = get_bert_embedding(text)
    embeddings.append(emb)

# Convert to numpy array and save
embeddings_array = np.array(embeddings)
np.save("bert_Google_Emotion_embeddings.npy", embeddings_array)

print("Embeddings saved to 'bert_Google_Emotion_embeddings.npy'")

2025-05-07 10:45:41.591957: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746614742.092143      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746614742.234106      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Generating BERT embeddings: 100%|██████████| 211225/211225 [27:26<00:00, 128.32it/s]


Embeddings saved to 'bert_Google_Emotion_embeddings.npy'
