In this case, instead of using GloVe, let's keep the same structure as before but use the BERT transformer and its embeddings.

As a note, the previous model in goemotions_training_template reached an accuracy of 0.37 during training.

In [22]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models
from tensorflow.keras.callbacks import EarlyStopping
from transformers import TFBertModel, BertTokenizer
from sklearn.model_selection import train_test_split # Import train_test_split

# Hides the GPU from TensorFlow if needed, adjust based on your setup
# tf.config.set_visible_devices([], 'GPU')
# If you want to use GPU, ensure TensorFlow is installed with GPU support
# and remove or comment the line above.

# Read the GoEmotions CSV (expected to be pre-filtered as described in the paper)
print("Loading dataset...")
df = pd.read_csv('data/goemotions/goemotions_filtered.csv')

# Quick inspection of what was loaded
print(f"Dataset shape: {df.shape}")
print("\nFirst few rows of the dataset:")
print(df.head())
print("\nColumn names:")
print(df.columns.tolist())

# The trailing 27 columns are treated as the one-hot emotion labels.
# NOTE(review): the printed shape is 37 columns and the head shows 9 metadata
# columns before `admiration`, which would leave 28 trailing columns — confirm
# that slicing the last 27 does not drop the first emotion column.
labels_df = df.iloc[:, -27:]  # kept as a DataFrame for the per-label counts below
labels = labels_df.values
texts = df['text'].tolist()

# Sanity-check sizes
print(f"\nNumber of examples: {len(texts)}")
print(f"Example text: {texts[0][:100]}...")
print(f"Example label shape: {labels[0].shape}")

# Drop rows that carry a label occurring exactly once in the whole dataset,
# since such a class cannot be distributed across several splits.
print("\nChecking for and removing rows with single-instance emotion labels...")

# Column-wise totals: how many rows carry each label
label_counts = labels_df.sum()
single_instance_labels = label_counts.index[(label_counts == 1).to_numpy()].tolist()

if not single_instance_labels:
    print("No single-instance labels found. Proceeding with original data.")
    texts_filtered, labels_filtered = texts, labels
else:
    print(f"Found single-instance labels: {single_instance_labels}")
    # Rows carrying at least one of the single-instance labels
    has_rare_label = labels_df[single_instance_labels].sum(axis=1) > 0
    rows_to_remove_indices = df.index[has_rare_label.to_numpy()]

    print(f"Removing {len(rows_to_remove_indices)} rows containing single-instance labels.")

    df_filtered_for_split = df.drop(index=rows_to_remove_indices)

    # Rebuild texts and labels from the reduced frame
    texts_filtered = df_filtered_for_split['text'].tolist()
    labels_filtered = df_filtered_for_split.iloc[:, -27:].values
    print(f"Dataset shape after removing single-instance label rows: {df_filtered_for_split.shape}")

# 2. 80/10/10 train/validation/test split on the filtered data (no stratification)
print("\nSplitting data into train, validation, and test sets...")
# Step 1: hold out 20% as a temporary pool
X_train, X_temp, y_train, y_temp = train_test_split(
    texts_filtered,
    labels_filtered,
    test_size=0.2,
    random_state=42,
)

# Step 2: cut the pool in half -> 10% validation, 10% test
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
)

print(f"\nTrain set size: {len(X_train)}")
print(f"Validation set size: {len(X_val)}")
print(f"Test set size: {len(X_test)}")



  from .autonotebook import tqdm as notebook_tqdm


Loading dataset...
Dataset shape: (207814, 37)

First few rows of the dataset:
                                                text       id  \
0                                    That game hurt.  eew5j0j   
1     You do right, if you don't care then fuck 'em!  ed2mah1   
2                                 Man I love reddit.  eeibobj   
3  [NAME] was nowhere near them, he was by the Fa...  eda6yn6   
4  Right? Considering it’s such an important docu...  eespn2i   

                author            subreddit    link_id   parent_id  \
0                Brdd9                  nrl  t3_ajis4z  t1_eew18eq   
1             Labalool          confessions  t3_abru74  t1_ed2m7g7   
2        MrsRobertshaw             facepalm  t3_ahulml   t3_ahulml   
3  American_Fascist713  starwarsspeculation  t3_ackt2f  t1_eda65q2   
4         ImperialBoss           TrueReddit  t3_aizyuz  t1_eesoak0   

    created_utc  rater_id  example_very_unclear  admiration  ...  love  \
0  1.548381e+09         1          

In [11]:
# BERT tokenization and backbone loading

# 3. Tokenizer matching the pre-trained checkpoint
BERT_MODEL_NAME = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME)

max_len = 100  # fixed sequence length: shorter texts are padded, longer ones truncated


def _encode_split(split_texts):
    """Tokenize one data split into fixed-length TensorFlow tensors."""
    return tokenizer(
        split_texts,
        max_length=max_len,
        padding='max_length',
        truncation=True,
        return_tensors='tf',
    )


# Each split is encoded independently, in train -> val -> test order
print("\nTokenizing and encoding data splits...")

encoded_train, encoded_val, encoded_test = (
    _encode_split(split) for split in (X_train, X_val, X_test)
)

X_train_input_ids, X_train_attention_mask = encoded_train['input_ids'], encoded_train['attention_mask']
X_val_input_ids, X_val_attention_mask = encoded_val['input_ids'], encoded_val['attention_mask']
X_test_input_ids, X_test_attention_mask = encoded_test['input_ids'], encoded_test['attention_mask']

# Labels as float32 tensors (numpy arrays would also be accepted by model.fit)
y_train_tf, y_val_tf, y_test_tf = (
    tf.constant(split_labels, dtype=tf.float32) for split_labels in (y_train, y_val, y_test)
)


# 4. Pre-trained BERT encoder, frozen so only the classification head is trained
bert_model = TFBertModel.from_pretrained(BERT_MODEL_NAME)
bert_model.trainable = False



Tokenizing and encoding data splits...


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [12]:
# 5. Build the model with BERT and a classification head
input_ids = layers.Input(shape=(max_len,), dtype=tf.int32, name='input_ids')
attention_mask = layers.Input(shape=(max_len,), dtype=tf.int32, name='attention_mask')

# Pass inputs through the BERT model; element [1] of the output is the pooled representation
bert_output = bert_model(input_ids, attention_mask=attention_mask)[1]

# Classification head on top of the pooled BERT output
x = layers.Dropout(0.2)(bert_output)
x = layers.Dense(64, activation='relu')(x)
x = layers.Dropout(0.2)(x)
output_layer = layers.Dense(27, activation='sigmoid', name='output_layer')(x) # 27 emotion labels

model = models.Model(inputs=[input_ids, attention_mask], outputs=output_layer)

# The plain string 'accuracy' is resolved by Keras from the output shape; with a
# 27-unit output it becomes categorical (argmax) accuracy, which is not meaningful
# for independent multi-label targets. Use per-label binary accuracy instead and
# keep the metric name so history keys ('accuracy', 'val_accuracy') are unchanged.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.5)]
)

model.summary()

# 6. Train the model using the train and validation sets
# Stop when validation loss has not improved for 2 epochs and roll back to the best weights
early_stop = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

print("\nTraining the model...")
history = model.fit(
    {'input_ids': X_train_input_ids, 'attention_mask': X_train_attention_mask},
    y_train_tf,
    epochs=4,
    batch_size=16,
    validation_data=({'input_ids': X_val_input_ids, 'attention_mask': X_val_attention_mask}, y_val_tf),
    callbacks=[early_stop]
)

# 7. Evaluate the model on the test set (only done once the model is finalized)
print("\nEvaluating the model on the test set...")
loss, accuracy = model.evaluate(
    {'input_ids': X_test_input_ids, 'attention_mask': X_test_attention_mask},
    y_test_tf,
    batch_size=16
)
print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")


# 8. Save model
#model.save('emotion_model_bert_80_10_10_split.keras')





Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_ids (InputLayer)      [(None, 100)]                0         []                            
                                                                                                  
 attention_mask (InputLayer  [(None, 100)]                0         []                            
 )                                                                                                
                                                                                                  
 tf_bert_model_2 (TFBertMod  TFBaseModelOutputWithPooli   1094822   ['input_ids[0][0]',           
 el)                         ngAndCrossAttentions(last_   40         'attention_mask[0][0]']      
                             hidden_state=(None, 100, 7                                     

KeyboardInterrupt: 

Let's use a model cited in one of the papers on this dataset, but with the GloVe approach, and check the results.

In [23]:
import os
import pandas as pd
import numpy as np
import re
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers, models
from tensorflow.keras.callbacks import EarlyStopping

# Preprocessing function (Paper 1 style)
def clean_text(text):
    """Lowercase `text`, strip URLs and every character that is not a-z or
    whitespace, then collapse whitespace runs and trim the ends."""
    cleaned = text.lower()
    # Applied in order: URLs first, then non-letters, then whitespace runs
    for pattern, replacement in (
        (r'http\S+', ''),
        (r'[^a-z\s]', ''),
        (r'\s+', ' '),
    ):
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

# Read the CSV and normalise the comment text
df = pd.read_csv('data/goemotions/goemotions_filtered.csv')
df['text'] = df['text'].map(clean_text)

# Keep only comments with 2 to 30 whitespace-separated tokens.
# The counts live in a standalone Series, so the frame keeps its original columns.
token_counts = df['text'].map(lambda cleaned: len(cleaned.split()))
df = df[token_counts.between(2, 30)]

# Comment texts and the trailing 27 columns, used as the emotion labels
labels = df.iloc[:, -27:].values  # 27 emotion labels
texts = df['text'].tolist()

# Integer-encode the texts and pad every sequence to a common length
max_words = 10000
max_len = 30
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
X = pad_sequences(sequences, maxlen=max_len, padding='post')
y = np.array(labels)

# Paper 1 split: 80% train, then the remaining 20% halved into dev and test
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

# Parse the GloVe text file into a {word: vector} lookup
embedding_dim = 50
glove_path = 'glove.6B.50d.txt'
embedding_index = {}
with open(glove_path, encoding='utf8') as glove_file:
    for raw_line in glove_file:
        fields = raw_line.split()
        # First field is the word, the remaining fields are its coefficients
        embedding_index[fields[0]] = np.asarray(fields[1:], dtype='float32')

# Build the embedding matrix: row i holds the GloVe vector of the word with
# tokenizer index i; rows for words missing from GloVe stay all-zero.
word_index = tokenizer.word_index
num_words = min(max_words, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, embedding_dim))
for token, row in word_index.items():
    if row >= max_words:
        continue  # outside the vocabulary cap
    glove_vector = embedding_index.get(token)
    if glove_vector is None:
        continue
    embedding_matrix[row] = glove_vector


In [2]:
# CNN model from Paper 1 (non-BERT baseline)
# Frozen GloVe embedding -> Conv1D -> global max pool -> dense head with 27 sigmoid outputs
model_paper_1 = models.Sequential([
    layers.Input(shape=(max_len,)),
    layers.Embedding(
        input_dim=num_words,
        output_dim=embedding_dim,
        weights=[embedding_matrix],
        input_length=max_len,
        trainable=False
    ),
    layers.Conv1D(128, kernel_size=5, activation='relu'),
    layers.GlobalMaxPooling1D(),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(27, activation='sigmoid')
])

# The plain string 'accuracy' is resolved by Keras from the output shape; with a
# 27-unit output it becomes categorical (argmax) accuracy, which is not meaningful
# for multi-label targets. Per-label binary accuracy is used instead, under the
# same metric name so history keys stay unchanged.
# NOTE(review): 2e-5 is a very small step size for a head trained from scratch —
# confirm it is the value the paper uses for this CNN.
model_paper_1.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
    loss='binary_crossentropy',
    metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.5)]
)

model_paper_1.summary()

# Early stopping on validation loss, restoring the best weights seen
early_stop = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

# Train
history = model_paper_1.fit(
    X_train, y_train,
    epochs=10,
    batch_size=16,  # Per Paper 1
    validation_data=(X_val, y_val),
    callbacks=[early_stop]
)

# Evaluate on the held-out test split
test_loss, test_acc = model_paper_1.evaluate(X_test, y_test)
print(f"\nTest Loss: {test_loss:.4f}, Test Accuracy: {test_acc:.4f}")

# Save
#model_paper_1.save('glove_cnn_emotion_model_paper_1.keras')



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 30, 50)            500000    
                                                                 
 conv1d (Conv1D)             (None, 26, 128)           32128     
                                                                 
 global_max_pooling1d (Glob  (None, 128)               0         
 alMaxPooling1D)                                                 
                                                                 
 dense (Dense)               (None, 128)               16512     
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense_1 (Dense)             (None, 27)                3483      
                                                       

Let's also check other metrics for this model, such as F1 score, precision and recall.

In [5]:
# Show the ground-truth multi-hot label matrix of the test split
print(y_test)
# Run the CNN baseline on the test inputs; the result holds one sigmoid
# probability per label for every test example
y_pred_probs = model_paper_1.predict(X_test)
print(y_pred_probs)

[[0 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 1]
 [0 0 0 ... 0 0 0]]
[[0.01282421 0.0060567  0.02719866 ... 0.07010121 0.02137885 0.19038928]
 [0.33976814 0.06680038 0.12600544 ... 0.00922946 0.01667458 0.16431417]
 [0.0059865  0.00387016 0.01877972 ... 0.02847379 0.01070489 0.22515517]
 ...
 [0.02139626 0.06044342 0.0661814  ... 0.01927025 0.0389699  0.45850116]
 [0.04290602 0.03060021 0.07629908 ... 0.00409844 0.01940195 0.35640788]
 [0.01302919 0.01293397 0.01934005 ... 0.01312545 0.0076209  0.08929104]]


In [6]:
from sklearn.metrics import classification_report, accuracy_score, f1_score

# Binarise the probabilities: a label counts as predicted when its probability
# reaches 0.5. The cut-off can be tuned later by comparing metrics across thresholds.
y_pred = (y_pred_probs >= 0.5).astype(int)

# Micro-averaged F1 pools all label decisions; macro-averaged F1 is the mean of per-label F1
f1_micro, f1_macro = (
    f1_score(y_test, y_pred, average=averaging) for averaging in ('micro', 'macro')
)

In [7]:
# Micro F1 on the first line, macro F1 on the second
print(f1_micro, f1_macro, sep="\n")

0.1559246785058175
0.07034166726858461


In [None]:
# play with different thresholds for the above model
thresholds = [0.3, 0.4, 0.5, 0.6, 0.7]
for threshold in thresholds:
    print("For this threshold: ", threshold)
    y_pred = (y_pred_probs >= threshold).astype(int)
    f1_micro = f1_score(y_test, y_pred, average='micro')
    print(f1_micro)
    f1_macro = f1_score(y_test, y_pred, average='macro')
    print(f1_macro)
  

For this threshold:  0.3
0.3092157613981315
0.11816237404617404
For this threshold:  0.4
0.2311104949396922
0.08917674478729902
For this threshold:  0.5
0.1559246785058175
0.07034166726858461
For this threshold:  0.6
0.10355387218953985
0.05592704056700485
For this threshold:  0.7
0.06371225651036258
0.039453039046557455


Let's now follow paper 1.
The pre-processing steps in that paper are:
- Text cleaning (lowercasing, removing punctuation and URLs, whitespace normalization).
- Token count filtering (keep comments between 2 and 30 tokens).
- Tokenization and padding.
- Class imbalance correction using per-label weighting. 

In [4]:
import os
import re
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers, models

#1. Preprocess text 
def clean_text(text):
    """Normalise a comment: lowercase it, drop URLs, keep only a-z letters and
    whitespace, and squeeze whitespace runs into single spaces (ends trimmed)."""
    lowered = text.lower()
    without_urls = re.sub(r'http\S+', '', lowered)
    letters_only = re.sub(r'[^a-z\s]', '', without_urls)
    return re.sub(r'\s+', ' ', letters_only).strip()

#2. Load and prepare dataset ---
df = pd.read_csv('data/goemotions/goemotions_filtered.csv')

# Clean text, then keep only comments with 2 to 30 whitespace-separated tokens
df['text'] = df['text'].apply(clean_text)
df['token_count'] = df['text'].apply(lambda x: len(x.split()))
df = df[df['token_count'].between(2, 30)]

# Extract texts and labels.
# `token_count` was appended as the LAST column above, so it must be excluded
# before taking the trailing 27 columns; otherwise the token counts would be
# used as a label and the label window would be shifted by one column.
texts = df['text'].tolist()
labels = df.drop(columns=['token_count']).iloc[:, -27:].values




In [5]:
# Integer-encode the cleaned texts and pad them to a fixed length
max_words = 10000
max_len = 30
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
X = pad_sequences(sequences, maxlen=max_len, padding='post')
y = np.array(labels)

# 80/10/10 train/val/test split
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

# Per-label weights for class imbalance: inverse training frequency
# (epsilon guards against division by zero), rescaled so the weights sum to
# the number of labels.
label_frequencies = y_train.sum(axis=0)
label_weights = 1.0 / (label_frequencies + 1e-6)
label_weights = label_weights / label_weights.sum() * label_weights.size
label_weights_tensor = tf.convert_to_tensor(label_weights, dtype=tf.float32)

def weighted_binary_crossentropy(y_true, y_pred):
    """Binary cross-entropy with a per-label weight applied before averaging.

    Args:
        y_true: multi-hot ground-truth labels, one column per label.
        y_pred: predicted probabilities with the same shape as `y_true`.

    Returns:
        Scalar tensor: mean over all elements of the element-wise binary
        cross-entropy multiplied by `label_weights_tensor` (broadcast over
        the label axis).
    """
    y_pred = tf.convert_to_tensor(y_pred)
    # Labels read from the CSV are integers; cast explicitly so the element-wise
    # loss never depends on Keras casting the targets implicitly.
    y_true = tf.cast(y_true, y_pred.dtype)
    bce = tf.keras.backend.binary_crossentropy(y_true, y_pred)
    return tf.reduce_mean(bce * label_weights_tensor)


In [8]:
#3. Load GloVe embeddings into a {word: vector} dictionary
embedding_dim = 50
embedding_index = {}
with open('glove.6B.50d.txt', encoding='utf8') as glove_file:
    for entry in glove_file:
        parts = entry.split()
        # parts[0] is the word; the rest are its vector components
        embedding_index[parts[0]] = np.asarray(parts[1:], dtype='float32')

# Embedding matrix indexed by tokenizer id; words absent from GloVe keep a zero row
word_index = tokenizer.word_index
num_words = min(max_words, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, embedding_dim))
for vocab_word, vocab_id in word_index.items():
    known_vector = embedding_index.get(vocab_word) if vocab_id < max_words else None
    if known_vector is not None:
        embedding_matrix[vocab_id] = known_vector


In [10]:
# Build an LSTM model
from tensorflow.keras.optimizers import Adam
# Frozen GloVe embedding -> bidirectional LSTM -> dropout -> dense head with 27 sigmoid outputs
model_LSTM = models.Sequential([
    layers.Embedding(num_words, embedding_dim, weights=[embedding_matrix], input_length=max_len, trainable=False),
    layers.Bidirectional(layers.LSTM(256)), # Hidden layer dimensionality is 256
    layers.Dropout(0.7),                    # Dropout set to 0.7
    layers.Dense(128, activation='relu'),
    layers.Dense(27, activation='sigmoid')
])

# NOTE(review): 0.1 is a very large step size for Adam — confirm the paper pairs
# this learning rate with Adam rather than another optimizer/decay schedule.
custom_optimizer = Adam(learning_rate=0.1)

# The plain string 'accuracy' is resolved by Keras from the output shape; with a
# 27-unit output it becomes categorical (argmax) accuracy, which is not meaningful
# for multi-label targets. Per-label binary accuracy is used instead, under the
# same metric name.
# NOTE(review): the notebook defines `weighted_binary_crossentropy` for class
# imbalance, but this model is compiled with the unweighted loss — confirm which
# one is intended.
model_LSTM.compile(
    optimizer=custom_optimizer, # Use the custom optimizer
    loss='binary_crossentropy',
    metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.5)]
)

model_LSTM.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 30, 50)            500000    
                                                                 
 bidirectional_1 (Bidirecti  (None, 512)               628736    
 onal)                                                           
                                                                 
 dropout_1 (Dropout)         (None, 512)               0         
                                                                 
 dense_2 (Dense)             (None, 128)               65664     
                                                                 
 dense_3 (Dense)             (None, 27)                3483      
                                                                 
Total params: 1197883 (4.57 MB)
Trainable params: 697883 (2.66 MB)
Non-trainable params: 500000 (1.91 MB)
______________

I stopped this run on my computer because it was taking too long.

In [11]:
#5. Fit the LSTM for 4 epochs (one log line per epoch), then score it on the test split
model_LSTM.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=4,
    batch_size=16,
    verbose=2,
)
loss, acc = model_LSTM.evaluate(X_test, y_test)
print(f"Custom LSTM Model Test Loss: {loss:.4f}, Accuracy: {acc:.4f}")

Epoch 1/4




KeyboardInterrupt: 

In [None]:
# check other metrics such as f1 score 

In [None]:
# test the same data without the pre-processing steps used above (See if the results are different)

Following another paper, let's use the initial GoEmotions data and apply the pre-processing steps from "Fine-Grained Classification for Emotion Detection Using Advanced Neural Models and GoEmotions Dataset" (the 3rd paper).

The pre-processing steps used in this paper are:
- Emoji Conversion
- Contraction Expansion
- Acronym and Misspelling Correction
- Text Normalization
- Tokenization

Let's use the architecture as well as the parameters used which are: 
Paper's CNN: Embedding -> Conv1D(256 filters) -> Dropout -> Dense(output)
Learning rate: 0.0002, Epochs: 12, Optimizer: Adam, Loss: Binary Cross-entropy

In [32]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers, models
from tensorflow.keras.optimizers import Adam # Import Adam optimizer
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.metrics import F1Score # Import F1Score
import re
import emoji # For emoji conversion
import contractions # For expanding contractions
from sklearn.model_selection import train_test_split # Import for train-test split

# Hides the GPU from TensorFlow if not needed or causing issues
# tf.config.set_visible_devices([], 'GPU')

# --- Pre-processing Function based on the Paper ---
def preprocess_text(text):
    """
    Applies pre-processing steps as described in the paper:
    1. Convert Emojis to text
    2. Expand Contractions
    3. Fix specific Acronyms and Misspellings
    4. Lowercase text
    5. Normalize repeated characters (digits are left untouched)

    Args:
        text: raw comment text. Any non-string value (e.g. NaN) is accepted.

    Returns:
        The normalised string, with whitespace collapsed and trimmed;
        an empty string when `text` is not a string.
    """
    if not isinstance(text, str):
        return "" # Return empty string for non-string inputs

    # 1. Convert Emojis to text (space delimiters keep the emoji name a separate token)
    text = emoji.demojize(text, delimiters=(" ", " ")) # e.g., 👍 -> thumbs_up

    # 2. Expand Contractions
    text = contractions.fix(text) # e.g., "I'll" -> "I will"

    # 3. Fix specific Acronyms and Misspellings (examples from paper)
    text = re.sub(r'\b(Cuz|coz)\b', 'because', text, flags=re.IGNORECASE)
    text = re.sub(r'\b(Ikr)\b', 'I know right', text, flags=re.IGNORECASE)
    text = re.sub(r'\b(Faux pas)\b', 'mistake', text, flags=re.IGNORECASE)
    # Add more rules as needed

    # 4. Lowercase text
    text = text.lower()

    # 5. Normalize repeated characters (e.g., "coooool" -> "cool").
    # Reduces 3 or more repetitions of a non-digit character to 2. Digits are
    # excluded on purpose: collapsing them would change numbers ("1000" -> "100").
    text = re.sub(r'(\D)\1{2,}', r'\1\1', text)

    # Remove extra spaces that might have been introduced
    text = re.sub(r'\s+', ' ', text).strip()
    return text



# 1. Load the dataset
# Failures in this section are re-raised instead of calling exit(): in a notebook
# exit() shuts the kernel down (losing all state) rather than surfacing the error.
print("Loading dataset...")
# Ensure the path to your CSV is correct
try:
    df = pd.read_csv('data/goemotions/goemotions_filtered.csv')
except FileNotFoundError:
    print("Error: 'data/goemotions/goemotions_filtered.csv' not found.")
    print("Please ensure the dataset is in the correct path or update the path in the script.")
    raise


print(f"Dataset shape: {df.shape}")
print("\nFirst few rows of the dataset:")
print(df.head())
original_columns = df.columns.tolist() # Get column names BEFORE any modifications
print("\nOriginal Column names:")
print(original_columns)

# Get the one-hot encoded labels from the original DataFrame
# The paper mentions 28 emotions. This script assumes 27 based on previous context.
# It's assumed the label columns are the last N columns in the *original* CSV.
# NOTE(review): the printed shape is 37 columns and the head shows 9 metadata
# columns before `admiration`, which would leave 28 trailing columns — confirm
# that 27 does not drop the first emotion column.
num_label_columns = 27 # Adjust if necessary based on your CSV structure
if len(original_columns) < num_label_columns:
    raise ValueError(
        f"Error: DataFrame has fewer than {num_label_columns} columns. Cannot extract labels as expected."
    )

label_column_names = original_columns[-num_label_columns:]
print(f"\nIdentified label columns: {label_column_names}")

# Extract labels using the identified column names from the original DataFrame
try:
    labels_from_df = df[label_column_names].values
    # Verify that all label columns are numeric
    if not np.issubdtype(labels_from_df.dtype, np.number):
        print(f"Warning: Labels extracted from columns {label_column_names} are not all numeric. Attempting conversion.")
        # This conversion might fail if there's genuine non-numeric text
        labels_from_df = df[label_column_names].astype(float).values
except KeyError as e:
    print(f"Error extracting label columns: {e}. Check column names in your CSV and `num_label_columns`.")
    raise
except ValueError as e:
    print(f"Error converting label columns to numeric: {e}. One of the identified label columns likely contains non-numeric text.")
    raise

# Extract texts and apply pre-processing
print("\nApplying pre-processing to texts...")
# Ensure 'text' column exists
if 'text' not in df.columns:
    raise KeyError("Error: 'text' column not found in DataFrame. Please check your CSV file.")
# The new column is added only after the label columns were fixed by name above,
# so it cannot shift the label selection.
df['processed_text'] = df['text'].apply(preprocess_text) # Now add the processed_text column
texts = df['processed_text'].tolist()



# 2. Integer-encode the processed texts and pad/truncate them to a fixed length
max_words = 10000  # vocabulary cap
max_len = 100      # sequence length after padding/truncation
tokenizer = Tokenizer(num_words=max_words, oov_token="<unk>")
# The vocabulary is built from ALL texts (train and test alike)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
X_padded = pad_sequences(sequences, maxlen=max_len, padding='post', truncating='post')

# Keras expects float32 targets
y_labels = np.array(labels_from_df, dtype=np.float32)


print(f"Shape of X_padded (all data before split): {X_padded.shape}")
print(f"Shape of y_labels (all labels before split): {y_labels.shape}, dtype: {y_labels.dtype}")

# 2.b. Hold out 20% as the test set, stratifying when possible.
# For a multi-label matrix the stratification key is simplified to the index
# of the first maximal label per row; a 1-D label vector is used as-is.
if y_labels.ndim > 1 and y_labels.shape[1] > 1:
    stratify_target = y_labels.argmax(axis=1)
elif y_labels.ndim == 1:
    stratify_target = y_labels
else:
    stratify_target = None

try:
    X_train_val, X_test, y_train_val, y_test = train_test_split(
        X_padded,
        y_labels,
        test_size=0.2,
        random_state=42,
        stratify=stratify_target,
    )
except ValueError as split_error:
    # Stratification was rejected by train_test_split: fall back to a plain random split
    print(f"Stratification failed: {split_error}. Falling back to non-stratified split.")
    X_train_val, X_test, y_train_val, y_test = train_test_split(
        X_padded,
        y_labels,
        test_size=0.2,
        random_state=42,
    )


print(f"Shape of X_train_val (training + validation data): {X_train_val.shape}")
print(f"Shape of y_train_val (training + validation labels): {y_train_val.shape}, dtype: {y_train_val.dtype}")
print(f"Shape of X_test (test data): {X_test.shape}")
print(f"Shape of y_test (test labels): {y_test.shape}, dtype: {y_test.dtype}")


# 3. Load GloVe embeddings
embedding_dim = 50  # GloVe embeddings dimension (e.g., 50, 100, 200, 300)
embeddings_index = {}
# Ensure the path to your GloVe file is correct
glove_path = 'glove.6B.50d.txt' # Using 50d embeddings
try:
    with open(glove_path, encoding='utf8') as f:
        for line in f:
            # Each line is "<word> <coef_1> ... <coef_n>"
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    print(f"Found {len(embeddings_index)} word vectors in GloVe.")
except FileNotFoundError:
    print(f"Error: GloVe file '{glove_path}' not found.")
    print("Please download GloVe embeddings (e.g., glove.6B.50d.txt) and place it in the correct path or update the path.")
    # Re-raise instead of exit(): in a notebook exit() shuts the kernel down
    # (losing all state) rather than surfacing the error.
    raise

# 4. Prepare embedding matrix
word_index = tokenizer.word_index # Use the same tokenizer fitted on all texts
num_words = min(max_words, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, embedding_dim))
for word, i in word_index.items():
    if i >= num_words: # Use num_words which is min(max_words, actual_vocab_size+1)
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

print(f"Shape of embedding matrix: {embedding_matrix.shape}")

# 5. Build the CNN model (aligned with the paper)
# Paper's CNN: Embedding -> Conv1D(256 filters) -> Dropout -> Dense(output)
# Learning rate: 0.0002, Epochs: 12, Optimizer: Adam, Loss: Binary Cross-entropy
model = models.Sequential([
    layers.Embedding(
        input_dim=num_words,
        output_dim=embedding_dim,
        weights=[embedding_matrix],
        input_length=max_len,
        trainable=False  # keep the pre-trained GloVe vectors fixed
    ),
    layers.Conv1D(
        filters=256,
        kernel_size=3,
        activation='relu'
    ),
    layers.GlobalMaxPooling1D(),
    layers.Dropout(0.2),
    layers.Dense(y_labels.shape[1], activation='sigmoid')  # one sigmoid unit per label column
])

# Optimizer with learning rate from the paper
custom_optimizer = Adam(learning_rate=0.0002)

# The plain string 'accuracy' is resolved by Keras from the output shape; with a
# multi-unit output it becomes categorical (argmax) accuracy, which is not
# meaningful for multi-label targets. Per-label binary accuracy is reported
# instead, under the same metric name so history keys stay unchanged.
model.compile(
    optimizer=custom_optimizer,
    loss='binary_crossentropy', # As per paper
    metrics=[
        F1Score(average='micro', threshold=0.5, name='f1_score'), # 'micro' is good for multi-label
        tf.keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.5),
    ]
)

model.summary()

# 6. Train the model
# Paper specifies 12 epochs for the CNN model
epochs_from_paper = 12
batch_size = 32 # Common batch size, paper doesn't specify for CNN

# Early stopping is a good practice, though not explicitly mentioned for the CNN in the paper
early_stop = EarlyStopping(monitor='val_f1_score', mode='max', patience=3, restore_best_weights=True, verbose=1) # Monitor val_f1_score

print("\nStarting model training...")
history = model.fit(
    X_train_val, y_train_val, # Train on the training+validation split
    epochs=epochs_from_paper,
    batch_size=batch_size,
    validation_split=0.25, # Create validation set from X_train_val (0.25 of 0.8 original data = 0.2 of total)
    callbacks=[early_stop]
)

# 7. Evaluate on Test Set (Important for unbiased performance measure)
print("\nEvaluating model on the test set...")
test_results = model.evaluate(X_test, y_test, verbose=0)
test_metric_names = model.metrics_names
print("Test Set Evaluation:")
for name, value in zip(test_metric_names, test_results):
    print(f"{name}: {value:.4f}")


# 8. Save model
model_save_path = 'emotion_model_cnn_paper_aligned.keras'
model.save(model_save_path)
print(f"\nModel saved to {model_save_path}")

# 9. Make predictions function
def predict_emotion(text_input, trained_model, tokenizer_instance, max_len_sequences, current_label_column_names):
    """Predicts top 3 emotions for a given text.

    Args:
        text_input: raw text to classify; it is run through `preprocess_text`.
        trained_model: model whose `predict` returns one score per label.
        tokenizer_instance: fitted tokenizer providing `texts_to_sequences`.
        max_len_sequences: length the token sequence is padded/truncated to.
        current_label_column_names: label names, indexed like the model outputs.

    Returns:
        A list of up to three `(label_name, score)` tuples ordered from the
        highest score down, or an empty list when the text yields no tokens.
    """
    # Pre-process the input text
    processed_text_input = preprocess_text(text_input)

    # Tokenize the input text
    sequence = tokenizer_instance.texts_to_sequences([processed_text_input])

    # Handle text that produced no tokens. This must be checked on the token
    # list itself: the padded batch always has exactly one row, so testing its
    # first dimension can never detect an empty input.
    if not sequence or len(sequence[0]) == 0:
        print("Warning: Text could not be tokenized effectively.")
        return []

    padded_sequence = pad_sequences(sequence, maxlen=max_len_sequences, padding='post', truncating='post')

    # Scores for the single input row
    prediction_probs = trained_model.predict(padded_sequence)[0]

    # Get the top 3 emotions
    top_3_indices = prediction_probs.argsort()[-3:][::-1] # Indices of top 3 scores

    top_3_emotions_with_scores = []
    for idx in top_3_indices:
        if idx < len(current_label_column_names):
            top_3_emotions_with_scores.append((current_label_column_names[idx], prediction_probs[idx]))
        else:
            print(f"Warning: Predicted index {idx} is out of bounds for emotion labels.")

    return top_3_emotions_with_scores

def _show_prediction(sample_text):
    """Predict on one sample with the trained model, print the ranked
    (label, score) pairs, and return them."""
    # Labels are the 'label_column_names' identified during data loading
    ranked = predict_emotion(sample_text, model, tokenizer, max_len, label_column_names)
    print("\nExample prediction for:", sample_text)
    if not ranked:
        print("No predictions could be made.")
    else:
        for emotion_label, score in ranked:
            print(f"{emotion_label}: {score:.4f}")
    return ranked


# Example predictions: one clearly positive sample, one clearly negative sample
test_text = "I am so incredibly happy and excited today, it feels amazing! 😄🎉"
predictions = _show_prediction(test_text)

test_text_2 = "This is so frustrating and annoying, I can't believe this happened... 😠"
predictions_2 = _show_prediction(test_text_2)



Loading dataset...
Dataset shape: (207814, 37)

First few rows of the dataset:
                                                text       id  \
0                                    That game hurt.  eew5j0j   
1     You do right, if you don't care then fuck 'em!  ed2mah1   
2                                 Man I love reddit.  eeibobj   
3  [NAME] was nowhere near them, he was by the Fa...  eda6yn6   
4  Right? Considering it’s such an important docu...  eespn2i   

                author            subreddit    link_id   parent_id  \
0                Brdd9                  nrl  t3_ajis4z  t1_eew18eq   
1             Labalool          confessions  t3_abru74  t1_ed2m7g7   
2        MrsRobertshaw             facepalm  t3_ahulml   t3_ahulml   
3  American_Fascist713  starwarsspeculation  t3_ackt2f  t1_eda65q2   
4         ImperialBoss           TrueReddit  t3_aizyuz  t1_eesoak0   

    created_utc  rater_id  example_very_unclear  admiration  ...  love  \
0  1.548381e+09         1          