In [1]:
# Importing the required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense, Embedding, Bidirectional, LSTM, Flatten, GlobalMaxPool1D, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras import backend as K

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score

from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup

import torch
from torch.utils.data import DataLoader, RandomSampler, TensorDataset, SequentialSampler

import seaborn as sns

from tqdm import tqdm

2024-03-25 15:37:13.495442: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-25 15:37:13.495499: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-25 15:37:13.497016: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-03-25 15:37:13.506547: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Load the labelled emotion dataset; later cells read its 'sentence' and 'emotion' columns.
# NOTE(review): the preview below also shows an 'Unnamed: 0' column, which looks like a
# saved index — consider index_col=0 if it is not needed.
train = pd.read_csv('emotions_all.csv')

train.head() # Display the first 5 rows of the train dataset

Unnamed: 0,sentence,emotion
0,That game hurt.,sadness
1,Man I love reddit.,happiness
2,Right? Considering it’s such an important docu...,happiness
3,"He isn't as big, but he's still quite popular....",disgust
4,That's crazy; I went to a super [RELIGION] hig...,happiness


In [3]:
# Upper bound on the number of rows kept per emotion, so that no class dominates.
MAX_ROWS_PER_EMOTION = 4000

# Emotions in order of first appearance in `train`.
unique_emotions = train['emotion'].unique()

# For each emotion keep at most the first MAX_ROWS_PER_EMOTION rows.
# `.head(n)` already returns the whole frame when it has fewer than n rows,
# so no explicit length check is needed.
capped_frames = [
    train[train['emotion'] == emotion].head(MAX_ROWS_PER_EMOTION)
    for emotion in unique_emotions
]

# Concatenate once (rather than growing a DataFrame inside the loop) and
# renumber the rows. An input without any emotions yields an empty frame.
filtered_train = (
    pd.concat(capped_frames) if capped_frames else train.head(0)
).reset_index(drop=True)

filtered_train

Unnamed: 0,sentence,emotion
0,That game hurt.,sadness
1,"I wanted to downvote this, but it's not your f...",sadness
2,That is odd.,sadness
3,So happy for [NAME]. So sad he's not here. Ima...,sadness
4,"Dark and funny, but not really nice guy. He ha...",sadness
...,...,...
23995,when my mother had a nervous illness,fear
23996,i am saying now i did feel less inhibited than...,fear
23997,i don t remember feeling anything not even fea...,fear
23998,i was feeling reluctant about performing patti...,fear


In [4]:
# Check how many rows each emotion has after the per-emotion cap was applied.
filtered_train['emotion'].value_counts()

emotion
sadness      4000
happiness    4000
disgust      4000
surprise     4000
anger        4000
fear         4000
Name: count, dtype: int64

In [5]:
# Load the unlabelled test set; the file is tab-separated, hence the explicit separator.
test = pd.read_csv('test.csv', sep='\t')

test.head() # Display the first 5 rows of the test dataset

Unnamed: 0,id,sentence
0,0,Girls are happy when they get flowers
1,1,His jaw dropped in disbelief when he saw the p...
2,2,Sometimes the ugly stench makes me wanna throw...
3,3,The foul odor from the garbage bin was disgust...
4,4,"I can’t believe it, they lost the game in the ..."


In [6]:
# Hold out 20% of the balanced data for validation; the fixed seed makes the
# split repeatable between runs.
X_train, X_val, y_train, y_val = train_test_split(
    filtered_train['sentence'],
    filtered_train['emotion'],
    test_size=0.2,
    random_state=42,
)

In [7]:
# Map every emotion name to an integer class id (0, 1, 2, ...) following the
# order of `unique_emotions`; the same mapping is reused for training,
# validation and for decoding predictions.
global_label_dict = dict(zip(unique_emotions, range(len(unique_emotions))))

def encode_data(tokenizer, sentences, labels, label_dict, max_length=64):
    """Tokenize sentences and convert their labels to integer class ids.

    Args:
        tokenizer: Object exposing ``encode_plus`` (a ``BertTokenizer`` in this notebook).
        sentences: Iterable of raw text strings.
        labels: Iterable of label values, each of which must be a key of ``label_dict``.
        label_dict: Mapping from label value to integer class id.
        max_length: Length every sentence is padded or truncated to.

    Returns:
        Tuple ``(input_ids, attention_masks, labels)`` of tensors. The first two
        are the per-sentence encodings concatenated along dim 0; the third holds
        the class ids looked up in ``label_dict``.

    Raises:
        KeyError: If a label is not present in ``label_dict``.
    """
    # Resolve the labels first so an unknown label fails before any tokenization.
    class_ids = [label_dict[name] for name in labels]

    # One encoding per sentence, each returned as tensors with a leading batch dim.
    encodings = [
        tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        for text in sentences
    ]

    id_rows = [enc['input_ids'] for enc in encodings]
    mask_rows = [enc['attention_mask'] for enc in encodings]

    # Stack the single-sentence tensors into one batch tensor each.
    return (
        torch.cat(id_rows, dim=0),
        torch.cat(mask_rows, dim=0),
        torch.tensor(class_ids),
    )

In [8]:
# Load the pretrained tokenizer for the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Encode both splits with the same tokenizer and the same label mapping so the
# class ids are consistent between training and validation.
train_inputs, train_masks, train_labels = encode_data(tokenizer, X_train.tolist(), y_train.tolist(), global_label_dict)
val_inputs, val_masks, val_labels = encode_data(tokenizer, X_val.tolist(), y_val.tolist(), global_label_dict)

In [9]:
# Mini-batch size shared by the training and validation loaders.
batch_size = 64

# Training batches are drawn in random order.
train_dataset = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_size)

# Validation batches are read in dataset order.
val_dataset = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_dataset)
validation_dataloader = DataLoader(val_dataset, sampler=val_sampler, batch_size=batch_size)

In [10]:
# The classification head needs one output per emotion in the label mapping.
num_labels = len(global_label_dict)  # Number of unique labels

# Load the pretrained 'bert-base-uncased' weights with a sequence-classification head.
# Attention maps and hidden states are not requested from the model.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=num_labels,  # The number of output labels
    output_attentions=False,
    output_hidden_states=False,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# Use CUDA when it is available, otherwise fall back to the CPU. Every batch
# fed to the model later has to be moved to this same device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)  # Move the model to the GPU if available

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [12]:
# Number of full passes over the training set.
epochs = 10

# Use PyTorch's own AdamW: the AdamW class exported by transformers is
# deprecated upstream in favour of torch.optim.AdamW.
# weight_decay=0.0 is passed explicitly because torch's default is non-zero,
# whereas the optimizer was previously built without any weight decay.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=2e-5,           # Learning rate
    eps=1e-8,          # Adam's epsilon for numerical stability
    weight_decay=0.0,
)

# The scheduler is stepped once per batch, so the schedule spans
# (batches per epoch) * (epochs) steps.
total_steps = len(train_dataloader) * epochs

# Linear learning-rate schedule over the whole run, with no warmup steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)



In [None]:
def flat_accuracy(preds, labels):
    """Return the fraction of rows whose highest-scoring class equals the label.

    Args:
        preds: 2-D array of per-class scores, one row per example.
        labels: NumPy array of integer class ids; it is flattened before comparison.

    Returns:
        Number of matching predictions divided by the number of labels.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    true_classes = labels.flatten()
    n_correct = np.sum(predicted_classes == true_classes)
    return n_correct / len(true_classes)

# Training loop: one optimisation pass followed by one validation pass per epoch.
for epoch_i in range(epochs):
    print(f"{'='*8} Epoch {epoch_i+1} / {epochs} {'='*8}")

    # ---- Training ----
    model.train()
    total_loss = 0

    for batch in tqdm(train_dataloader, desc="Training", total=len(train_dataloader)):
        # Move batch to the same device as the model
        b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)

        # Clear gradients accumulated by the previous step.
        model.zero_grad()

        # Labels are passed so that the loss can be read from the output.
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels)

        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()

        # Clip the gradient norm to 1.0 before applying the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        # The learning-rate schedule advances once per batch.
        scheduler.step()

    avg_train_loss = total_loss / len(train_dataloader)
    print(f"Average training loss: {avg_train_loss:.2f}")

    # ---- Validation ----
    model.eval()
    eval_correct = 0.0     # sum of (batch accuracy * batch size)
    nb_eval_examples = 0   # number of validation examples seen

    for batch in tqdm(validation_dataloader, desc="Validation", total=len(validation_dataloader)):
        # Move batch to the same device as the model
        b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)

        # No gradients are needed for evaluation.
        with torch.no_grad():
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask)

        # Move logits and labels to CPU for evaluation
        logits = outputs.logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Weight each batch by its size so a smaller final batch does not
        # skew the overall accuracy.
        eval_correct += flat_accuracy(logits, label_ids) * len(label_ids)
        nb_eval_examples += len(label_ids)

    # Guard against an empty validation loader.
    eval_accuracy = eval_correct / nb_eval_examples if nb_eval_examples else 0.0
    print(f"Validation Accuracy: {eval_accuracy:.2f}")

print("Training complete")



Training: 100%|██████████| 300/300 [01:47<00:00,  2.78it/s]


Average training loss: 1.50


Validation: 100%|██████████| 75/75 [00:08<00:00,  8.46it/s]


Validation Accuracy: 0.50


Training:  66%|██████▌   | 197/300 [01:09<00:36,  2.82it/s]

In [None]:
# Pull the raw sentences and their ids out of the test frame as plain lists;
# the ids are kept so predictions can be matched back to rows in the output file.
test_sentences = test['sentence'].tolist()
test_ids = test['id'].tolist()

In [None]:
def encode_test_data(tokenizer, sentences, max_length=64):
    """Tokenize unlabelled sentences into model-ready tensors.

    Args:
        tokenizer: Object exposing ``encode_plus`` (a ``BertTokenizer`` in this notebook).
        sentences: Iterable of raw text strings.
        max_length: Length every sentence is padded or truncated to.

    Returns:
        Tuple ``(input_ids, attention_masks)`` of tensors, each built by
        concatenating the per-sentence encodings along dim 0.
    """
    # Encode each sentence separately; every result carries a leading batch dim.
    encodings = [
        tokenizer.encode_plus(
            text,
            add_special_tokens=True,   # Add '[CLS]' and '[SEP]'
            max_length=max_length,     # Pad & truncate all sentences
            padding='max_length',      # Explicitly pad to the max length
            truncation=True,           # Explicitly truncate to the max length
            return_attention_mask=True,
            return_tensors='pt',       # Return PyTorch tensors
        )
        for text in sentences
    ]

    id_rows = [enc['input_ids'] for enc in encodings]
    mask_rows = [enc['attention_mask'] for enc in encodings]

    # Join the single-sentence tensors into one batch tensor each.
    return torch.cat(id_rows, dim=0), torch.cat(mask_rows, dim=0)

In [None]:
# Tokenize the test sentences with the same tokenizer and default max_length
# that were used for the training data.
test_inputs, test_masks = encode_test_data(tokenizer, test_sentences)

In [None]:
# Create a TensorDataset for the test data (inputs and masks only; there are no labels)
test_dataset = TensorDataset(test_inputs, test_masks)

# Create a DataLoader for the test set. The sequential sampler keeps the row
# order, so predictions line up with the test ids.
# NOTE(review): batch size is hard-coded to 64 here rather than reusing `batch_size`.
test_dataloader = DataLoader(test_dataset, sampler=SequentialSampler(test_dataset), batch_size=64)  # Adjust batch size if necessary

In [None]:
model.eval()  # Set the model to evaluation mode

# Predicted class index for every test sentence, in test-set order.
predictions = []

for batch in test_dataloader:
    # The model lives on `device`, so its inputs must be moved there as well;
    # feeding CPU tensors to a model on the GPU raises a device-mismatch error.
    b_input_ids, b_input_mask = tuple(t.to(device) for t in batch)

    with torch.no_grad():  # Disable gradient tracking during inference
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

    # Bring the logits back to the CPU before converting to NumPy; calling
    # .numpy() directly on a CUDA tensor fails.
    logits = outputs.logits.detach().cpu().numpy()

    # The predicted class is the index of the largest logit in each row.
    predictions.extend(np.argmax(logits, axis=1))

In [None]:
# Invert the emotion -> index mapping used for encoding the labels, then turn
# each predicted class index back into its emotion name.
inverse_label_dict = {v: k for k, v in global_label_dict.items()}
predicted_emotions = [inverse_label_dict[pred] for pred in predictions]

In [None]:
# Pair every test id with the emotion predicted for it.
results_df = pd.DataFrame({
    'id': test_ids,
    'emotion': predicted_emotions
})

In [None]:
# Show the predictions table.
results_df

In [None]:
# Save to CSV (optional). index=False keeps the DataFrame index out of the
# file, so it contains only the 'id' and 'emotion' columns.
results_df.to_csv('bert_model_more_epochs_gpu.csv', index=False)