Let's in this case instead of using GloVe let's use having the same structure as before the BERT tranformer, using its embedding 

In [2]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers, models
from tensorflow.keras.callbacks import EarlyStopping

# Hides the GPU from TensorFlow
tf.config.set_visible_devices([], 'GPU') 

# Load the dataset
print("Loading dataset...")
df = pd.read_csv('data/goemotions/goemotions_filtered.csv')

print(f"Dataset shape: {df.shape}")
print("\nFirst few rows of the dataset:")
print(df.head())
print("\nColumn names:")
print(df.columns.tolist())

# Extract texts and labels
texts = df['text'].tolist()

# Get the one-hot encoded labels
labels = df.iloc[:, -27:].values  # Convert to numpy array

# check dimensions
print(f"\nNumber of training examples: {len(texts)}")
print(f"Example text: {texts[0][:100]}...")
print(f"Example label shape: {labels[0].shape}")

# 2. Tokenize and pad
max_words = 10000
max_len = 100
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
X = pad_sequences(sequences, maxlen=max_len)
y = np.array(labels)

# 3. Load GloVe embeddings
embedding_dim = 50
embeddings_index = {}
current_dir = os.path.dirname(os.path.abspath(__file__))
glove_path = 'glove.6B.50d.txt'
with open(glove_path, encoding='utf8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
 
# 4. Prepare embedding matrix
word_index = tokenizer.word_index
num_words = min(max_words, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, embedding_dim))
for word, i in word_index.items():
    if i >= max_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
    

Loading dataset...
Dataset shape: (207814, 37)

First few rows of the dataset:
                                                text       id  \
0                                    That game hurt.  eew5j0j   
1     You do right, if you don't care then fuck 'em!  ed2mah1   
2                                 Man I love reddit.  eeibobj   
3  [NAME] was nowhere near them, he was by the Fa...  eda6yn6   
4  Right? Considering it’s such an important docu...  eespn2i   

                author            subreddit    link_id   parent_id  \
0                Brdd9                  nrl  t3_ajis4z  t1_eew18eq   
1             Labalool          confessions  t3_abru74  t1_ed2m7g7   
2        MrsRobertshaw             facepalm  t3_ahulml   t3_ahulml   
3  American_Fascist713  starwarsspeculation  t3_ackt2f  t1_eda65q2   
4         ImperialBoss           TrueReddit  t3_aizyuz  t1_eesoak0   

    created_utc  rater_id  example_very_unclear  admiration  ...  love  \
0  1.548381e+09         1          

NameError: name '__file__' is not defined