In [None]:
import os

# Project folder (a Colab Google Drive path). The relative paths used by the
# later cells (dataset, checkpoints, saved model) resolve against it.
directory_path = '/content/drive/MyDrive/Test/Text_Emotion_Analysis'

# Report where the kernel started before moving anywhere.
current_directory = os.getcwd()
print(f"Previous working directory: {current_directory}")

# Switch into the project folder, then re-read the cwd so the printed value
# confirms the change actually took effect.
os.chdir(directory_path)
current_directory = os.getcwd()
print(f"Current working directory: {current_directory}")

Previous working directory: /content
Current working directory: /content/drive/MyDrive/Test/Text_Emotion_Analysis


# Load and Preprocess the Dataset

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer

In [None]:

# Read the labelled dataset; the path is relative to the working directory.
# The later cells read the 'text' and 'emotion' columns from this frame.
df = pd.read_csv('Dataset/data.csv')

# Show the first five rows as a quick sanity check of the load.
print(df.head(n=5))


                                                text  emotion
0  im feeling rather rotten so im not very ambiti...  sadness
1          im updating my blog because i feel shitty  sadness
2  i never make her separate from me because i do...  sadness
3  i left with my bouquet of red and yellow tulip...      joy
4    i was feeling a little vain when i did this one  sadness


In [None]:

# Hold out 10% of the rows for validation; the fixed random_state keeps the
# split the same from run to run.
train_df, val_df = train_test_split(df, random_state=42, test_size=0.1)

# Initialize BERT tokenizer
# Loaded from the same 'bert-base-uncased' checkpoint name the model cell uses.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize the ``'text'`` entry of ``examples`` with the module-level tokenizer.

    The tokenizer is called with ``padding='max_length'`` and truncation
    enabled, and whatever it returns is passed straight back to the caller.

    NOTE(review): nothing in the visible cells calls this helper; the
    encodings used for training are produced by calling ``tokenizer`` directly.
    """
    texts = examples['text']
    encoded = tokenizer(texts, truncation=True, padding='max_length')
    return encoded

# Emotion name -> integer class id used as the training target.
label_mapping = {'sadness': 0, 'joy': 1, 'fear': 2, 'anger': 3, 'love': 4, 'surprise': 5}

# Pull the raw sentences out of each split and tokenize each split in one call.
train_texts = train_df['text'].tolist()
val_texts = val_df['text'].tolist()
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)

# Convert labels to numerical format
# An emotion name that is missing from label_mapping raises KeyError here.
train_labels = [label_mapping[name] for name in train_df['emotion'].tolist()]
val_labels = [label_mapping[name] for name in val_df['emotion'].tolist()]


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

# Fine-Tune the BERT Model

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertForSequenceClassification, Trainer, TrainingArguments

In [None]:

class EmotionDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with per-example labels.

    Args:
        encodings: Mapping from field name to an indexable collection holding
            one entry per example.
        labels: Indexable collection with one label per example; its length
            defines the dataset length.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The label collection decides how many examples exist.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, with the target under ``'labels'``."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        # Added last so it sits alongside the encoded fields in the same dict.
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Load the pre-trained BERT model
# num_labels=6 matches the six emotion classes listed in label_mapping.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=6)

# Wrap the tokenized splits so they can be indexed example by example.
train_dataset = EmotionDataset(train_encodings, train_labels)
val_dataset = EmotionDataset(val_encodings, val_labels)

# Define training arguments
training_args = TrainingArguments(
    # Output and logging locations (relative to the working directory).
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
    # Schedule: three epochs, with an evaluation pass per epoch.
    num_train_epochs=3,
    evaluation_strategy="epoch",
    # Per-device batch sizes for training and evaluation.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Optimisation settings.
    warmup_steps=500,
    weight_decay=0.01,
)

# Initialize the Trainer
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)

# Train the model
# Kept as the last expression so the notebook displays the returned TrainOutput.
trainer.train()

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.1391,0.212513
2,0.1429,0.148039
3,0.1471,0.195181


TrainOutput(global_step=3375, training_loss=0.26118540142200614, metrics={'train_runtime': 1065.4872, 'train_samples_per_second': 50.681, 'train_steps_per_second': 3.168, 'total_flos': 2414336194728000.0, 'train_loss': 0.26118540142200614, 'epoch': 3.0})

# Evaluate the Model

In [None]:

# Evaluate with the Trainer (it was given val_dataset as its eval set) and
# print the returned metrics dict.
results = trainer.evaluate()
print(results)

# Save the model
# The model and the tokenizer are written to the same folder so that both
# can later be reloaded from that one path.
save_dir = './emotion-analysis-bert'
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)


{'eval_loss': 0.1951814591884613, 'eval_runtime': 9.3378, 'eval_samples_per_second': 214.182, 'eval_steps_per_second': 13.386, 'epoch': 3.0}


('./emotion-analysis-bert/tokenizer_config.json',
 './emotion-analysis-bert/special_tokens_map.json',
 './emotion-analysis-bert/vocab.txt',
 './emotion-analysis-bert/added_tokens.json')

# Reloading and Testing the Trained Model

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch

# Load the trained model and tokenizer
# Both come from the folder written by the save step.
tokenizer = BertTokenizer.from_pretrained('./emotion-analysis-bert')
model = BertForSequenceClassification.from_pretrained('./emotion-analysis-bert')

# Ensure the model is in evaluation mode
model.eval()

def preprocess_text(text):
    """Tokenize ``text`` with the module-level tokenizer, requesting PyTorch tensors.

    Truncation is enabled with ``max_length=512``. The tokenizer's return
    value is handed back unchanged so it can be unpacked into the model call.
    """
    return tokenizer(
        text,
        max_length=512,
        truncation=True,
        padding=True,
        return_tensors="pt",
    )

# Function to predict emotions
def predict_emotions(inputs):
    """Predict a class index for every example contained in ``inputs``.

    Args:
        inputs: Iterable of keyword-argument mappings accepted by the
            module-level ``model`` (for example the values returned by
            ``preprocess_text``). An element may encode one text or several.

    Returns:
        list[int]: Flat list of argmax class indices, in input order. Each
        element of ``inputs`` contributes one index per row of its logits.
    """
    predictions = []
    # Inference only, so gradient tracking is switched off for the whole loop.
    with torch.no_grad():
        # Named `encoded` rather than `input` to avoid shadowing the built-in.
        for encoded in inputs:
            logits = model(**encoded).logits
            # argmax over dim=1 yields one index per row. tolist() copes with
            # any number of rows, whereas .item() raises for more than one.
            predictions.extend(torch.argmax(logits, dim=1).tolist())
    return predictions

# Class id -> emotion name, the inverse of the name -> id table used when the
# training labels were encoded; the two must stay in sync.
# NOTE(review): this rebinds `label_mapping`, which earlier held the
# name -> id direction.
label_mapping = {0: 'sadness', 1: 'joy', 2: 'fear', 3: 'anger', 4: 'love', 5: 'surprise'}

texts = ["I am so happy today!", "I feel very sad and lonely.", "This is amazing!"]

# Preprocess and tokenize the texts
# Each sentence is encoded on its own, giving one tokenizer output per text.
inputs = list(map(preprocess_text, texts))

# Predict emotions for the example texts
predicted_classes = predict_emotions(inputs)

# Map the predicted class indices back to emotion labels
predicted_emotions = [label_mapping[idx] for idx in predicted_classes]

# Report each sentence next to its predicted emotion.
for text, emotion in zip(texts, predicted_emotions):
    print(f"Text: {text}\nPredicted Emotion: {emotion}\n")


Text: I am so happy today!
Predicted Emotion: joy

Text: I feel very sad and lonely.
Predicted Emotion: sadness

Text: This is amazing!
Predicted Emotion: surprise

