In this case, instead of GloVe embeddings, let's use the embeddings of the BERT transformer, keeping the same model structure as before.

Note: the previous model, in `goemotions_training_template`, reached an accuracy of 0.37 during training.

In [10]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models
from tensorflow.keras.callbacks import EarlyStopping
from transformers import TFBertModel, BertTokenizer
from sklearn.model_selection import train_test_split # Import train_test_split

# Optional: uncomment the next line to hide the GPU from TensorFlow (adjust based on your setup).
# tf.config.set_visible_devices([], 'GPU')
# To use the GPU, leave the line above commented out and ensure TensorFlow
# is installed with GPU support.

# Load the dataset
# The CSV path is relative to the notebook's working directory; adjust it if needed.
# 'goemotions_filtered.csv' is assumed to already be filtered as per the paper.
print("Loading dataset...")
df = pd.read_csv('data/goemotions/goemotions_filtered.csv')

# Quick overview of what was loaded: dimensions, a preview, and the column names.
print(f"Dataset shape: {df.shape}")
print("\nFirst few rows of the dataset:", df.head(), sep="\n")
print("\nColumn names:", df.columns.tolist(), sep="\n")

# Extract texts and labels
texts = df['text'].tolist()

# The 27 GoEmotions emotion categories, selected by name rather than by position.
# NOTE(review): a positional slice of the last 27 columns does not line up with a
# 37-column frame whose first 9 columns are metadata (text ... example_very_unclear):
# it skips 'admiration' and picks up whatever the final column is (presumably
# 'neutral' -- confirm against the printed column names).
EMOTION_COLUMNS = [
    'admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring',
    'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval',
    'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief',
    'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization',
    'relief', 'remorse', 'sadness', 'surprise',
]

missing_emotion_columns = [col for col in EMOTION_COLUMNS if col not in df.columns]
if missing_emotion_columns:
    # Unexpected schema: keep the notebook runnable with the previous positional
    # selection, but make the mismatch visible instead of silently mislabelling.
    print(
        f"Warning: expected emotion columns not found: {missing_emotion_columns}. "
        f"Falling back to the last {len(EMOTION_COLUMNS)} columns."
    )
    label_columns = df.columns[-len(EMOTION_COLUMNS):].tolist()
else:
    label_columns = EMOTION_COLUMNS

# Get the one-hot encoded labels
labels_df = df[label_columns]  # Keep as DataFrame temporarily for easier manipulation
labels = labels_df.values  # Convert to numpy array

# check dimensions
print(f"\nNumber of examples: {len(texts)}")
print(f"Example text: {texts[0][:100]}...")
print(f"Example label shape: {labels[0].shape}")

# Identify and remove rows with emotion labels that appear only once.
# A class with a single member cannot be distributed by a stratified split; the
# split used later in this notebook is not stratified, so this step mainly guards
# against re-enabling stratification.
print("\nChecking for and removing rows with single-instance emotion labels...")

# Number of positive rows per emotion label
label_counts = labels_df.sum(axis=0)

# Labels that occur exactly once in the whole dataset
single_instance_labels = label_counts[label_counts == 1].index.tolist()

if single_instance_labels:
    print(f"Found single-instance labels: {single_instance_labels}")
    # Boolean mask of rows carrying *any* of the single-instance labels
    has_single_instance_label = labels_df[single_instance_labels].sum(axis=1) > 0
    rows_to_remove_indices = df.index[has_single_instance_label]

    print(f"Removing {len(rows_to_remove_indices)} rows containing single-instance labels.")

    # Keep every row that does not carry a single-instance label
    df_filtered_for_split = df.loc[~has_single_instance_label]

    # Re-extract texts and labels from the filtered dataframe, using the same
    # label columns as above so both extractions always agree
    texts_filtered = df_filtered_for_split['text'].tolist()
    labels_filtered = df_filtered_for_split[label_columns].values
    print(f"Dataset shape after removing single-instance label rows: {df_filtered_for_split.shape}")
else:
    print("No single-instance labels found. Proceeding with original data.")
    texts_filtered = texts
    labels_filtered = labels

# 2. Split data into train, validation, and test sets (80/10/10 split)
# The filtered texts/labels are the input to the split.
print("\nSplitting data into train, validation, and test sets...")

# Stage 1: keep 80% for training and hold out 20% (validation + test).
# No `stratify` argument is passed, so class proportions are not enforced.
X_train, X_temp, y_train, y_temp = train_test_split(
    texts_filtered,
    labels_filtered,
    test_size=0.2,
    random_state=42,
)

# Stage 2: cut the held-out 20% in half -> 10% validation and 10% test of the total.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
)

# Report the resulting split sizes.
print(
    f"\nTrain set size: {len(X_train)}\n"
    f"Validation set size: {len(X_val)}\n"
    f"Test set size: {len(X_test)}"
)



Loading dataset...
Dataset shape: (207814, 37)

First few rows of the dataset:
                                                text       id  \
0                                    That game hurt.  eew5j0j   
1     You do right, if you don't care then fuck 'em!  ed2mah1   
2                                 Man I love reddit.  eeibobj   
3  [NAME] was nowhere near them, he was by the Fa...  eda6yn6   
4  Right? Considering it’s such an important docu...  eespn2i   

                author            subreddit    link_id   parent_id  \
0                Brdd9                  nrl  t3_ajis4z  t1_eew18eq   
1             Labalool          confessions  t3_abru74  t1_ed2m7g7   
2        MrsRobertshaw             facepalm  t3_ahulml   t3_ahulml   
3  American_Fascist713  starwarsspeculation  t3_ackt2f  t1_eda65q2   
4         ImperialBoss           TrueReddit  t3_aizyuz  t1_eesoak0   

    created_utc  rater_id  example_very_unclear  admiration  ...  love  \
0  1.548381e+09         1          

In [11]:
# BERT and model

# 3. Tokenize using BERT tokenizer
BERT_MODEL_NAME = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME)

max_len = 100 # Keep the same max length for consistency, adjust if needed


def encode_texts(texts, tokenizer, max_length):
    """Tokenize a list of strings into fixed-length TensorFlow tensors.

    Every split goes through this single function so that train, validation and
    test data are guaranteed to be encoded with identical settings.

    Args:
        texts: list of input strings to encode.
        tokenizer: the tokenizer to call (here a ``BertTokenizer``).
        max_length: sequences are padded to, and truncated at, this many tokens.

    Returns:
        The tokenizer's encoding; this notebook reads its ``'input_ids'`` and
        ``'attention_mask'`` entries.
    """
    return tokenizer(
        texts,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='tf',  # Return TensorFlow tensors
    )


# Tokenize and encode each split separately
print("\nTokenizing and encoding data splits...")

encoded_train = encode_texts(X_train, tokenizer, max_len)
X_train_input_ids = encoded_train['input_ids']
X_train_attention_mask = encoded_train['attention_mask']

encoded_val = encode_texts(X_val, tokenizer, max_len)
X_val_input_ids = encoded_val['input_ids']
X_val_attention_mask = encoded_val['attention_mask']

encoded_test = encode_texts(X_test, tokenizer, max_len)
X_test_input_ids = encoded_test['input_ids']
X_test_attention_mask = encoded_test['attention_mask']

# Wrap the label arrays of all three splits as float32 TensorFlow constants.
# (Plain numpy arrays would also be accepted by model.fit.)
y_train_tf, y_val_tf, y_test_tf = (
    tf.constant(split_labels, dtype=tf.float32)
    for split_labels in (y_train, y_val, y_test)
)


# 4. Load pre-trained BERT model
bert_model = TFBertModel.from_pretrained(BERT_MODEL_NAME)

# Keep the BERT weights fixed during the initial training phase, so only the
# layers added on top of it are updated.
bert_model.trainable = False



Tokenizing and encoding data splits...


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [12]:
# 5. Build the model with BERT and a classification head

# Seed TensorFlow's global RNG so weight initialisation, dropout and batch
# shuffling are repeatable across runs.
tf.random.set_seed(42)

# Width of the output layer, taken from the label tensor so the head always
# matches the targets (27 emotion labels in this notebook).
num_labels = int(y_train_tf.shape[-1])

input_ids = layers.Input(shape=(max_len,), dtype=tf.int32, name='input_ids')
attention_mask = layers.Input(shape=(max_len,), dtype=tf.int32, name='attention_mask')

# Pass inputs through the BERT model, use pooled output (element 1 of its outputs)
bert_output = bert_model(input_ids, attention_mask=attention_mask)[1]

# Add classification layers on top of BERT output
x = layers.Dropout(0.2)(bert_output)
x = layers.Dense(64, activation='relu')(x)
x = layers.Dropout(0.2)(x)
# One independent sigmoid per emotion: this is a multi-label problem.
output_layer = layers.Dense(num_labels, activation='sigmoid', name='output_layer')(x)

model = models.Model(inputs=[input_ids, attention_mask], outputs=output_layer)

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    # The string 'accuracy' is resolved by Keras from the tensor shapes; with a
    # multi-unit output it becomes categorical (argmax) accuracy, which is not
    # meaningful for multi-label targets. Use explicit per-label metrics instead.
    # The binary accuracy keeps the name 'accuracy' so the history keys
    # ('accuracy' / 'val_accuracy') are unchanged. Because positives are sparse,
    # precision and recall are reported alongside it.
    metrics=[
        keras.metrics.BinaryAccuracy(name='accuracy'),
        keras.metrics.Precision(name='precision'),
        keras.metrics.Recall(name='recall'),
    ]
)

model.summary()

# 6. Train the model using the train and validation sets
# Stop when the validation loss has not improved for 2 epochs and roll back to
# the best weights seen.
early_stop = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

print("\nTraining the model...")
history = model.fit(
    {'input_ids': X_train_input_ids, 'attention_mask': X_train_attention_mask},
    y_train_tf,
    epochs=4,
    batch_size=16,
    validation_data=({'input_ids': X_val_input_ids, 'attention_mask': X_val_attention_mask}, y_val_tf),
    callbacks=[early_stop]
)

# 7. Evaluate the model on the test set (only done once the model is finalized)
print("\nEvaluating the model on the test set...")
# return_dict=True yields {metric_name: value}, so reading the results does not
# depend on how many metrics were compiled or on their order.
test_metrics = model.evaluate(
    {'input_ids': X_test_input_ids, 'attention_mask': X_test_attention_mask},
    y_test_tf,
    batch_size=16,
    return_dict=True
)
loss = test_metrics['loss']
accuracy = test_metrics['accuracy']
print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")
print(f"Test Precision: {test_metrics['precision']:.4f}")
print(f"Test Recall: {test_metrics['recall']:.4f}")


# 8. Save model
#model.save('emotion_model_bert_80_10_10_split.keras')





Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_ids (InputLayer)      [(None, 100)]                0         []                            
                                                                                                  
 attention_mask (InputLayer  [(None, 100)]                0         []                            
 )                                                                                                
                                                                                                  
 tf_bert_model_2 (TFBertMod  TFBaseModelOutputWithPooli   1094822   ['input_ids[0][0]',           
 el)                         ngAndCrossAttentions(last_   40         'attention_mask[0][0]']      
                             hidden_state=(None, 100, 7                                     

KeyboardInterrupt: 