In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load dataset
data = pd.read_csv("train.txt", sep=';')
data.columns = ["Text", "Emotions"]

# Extract texts and labels
texts = data["Text"].tolist()
labels = data["Emotions"].tolist()

# Encode labels to numerical values
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(labels)


In [2]:
from transformers import RobertaTokenizer

# Load RoBERTa tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Tokenize the text inputs
max_length = 128  # Maximum length of a sequence
inputs = tokenizer(texts, padding=True, truncation=True, max_length=max_length, return_tensors='pt')


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [3]:
import torch

# Prepare inputs and labels
input_ids = inputs['input_ids']
attention_masks = inputs['attention_mask']
labels = torch.tensor(labels)

# Split data
xtrain_ids, xtest_ids, xtrain_masks, xtest_masks, ytrain, ytest = train_test_split(
    input_ids, attention_masks, labels, test_size=0.45, random_state=42
)

# Create PyTorch Datasets
train_data = torch.utils.data.TensorDataset(xtrain_ids, xtrain_masks, ytrain)
test_data = torch.utils.data.TensorDataset(xtest_ids, xtest_masks, ytest)

# DataLoader for batching
train_dataloader = torch.utils.data.DataLoader(train_data, batch_size=16, shuffle=True)
test_dataloader = torch.utils.data.DataLoader(test_data, batch_size=16, shuffle=False)


In [4]:
from transformers import RobertaForSequenceClassification, AdamW

# Load pre-trained RoBERTa model for classification
model = RobertaForSequenceClassification.from_pretrained(
    "roberta-base", num_labels=len(set(labels))
)

# Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
from torch.nn import CrossEntropyLoss
from tqdm import tqdm  # For displaying progress bars

# Training loop
epochs = 4

for epoch in range(epochs):
    model.train()
    total_loss = 0
    
    for batch in tqdm(train_dataloader):
        b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)
        
        # Zero out the gradients
        model.zero_grad()
        
        # Forward pass
        outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs.loss
        total_loss += loss.item()
        
        # Backward pass
        loss.backward()
        
        # Optimize
        optimizer.step()
    
    avg_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1}/{epochs} | Loss: {avg_loss:.4f}")


100%|██████████| 550/550 [42:52<00:00,  4.68s/it]


Epoch 1/4 | Loss: 2.3495


100%|██████████| 550/550 [37:28<00:00,  4.09s/it]  


Epoch 2/4 | Loss: 0.3538


100%|██████████| 550/550 [31:03<00:00,  3.39s/it]


Epoch 3/4 | Loss: 0.1909


100%|██████████| 550/550 [35:41<00:00,  3.89s/it]

Epoch 4/4 | Loss: 0.1319





In [6]:
from sklearn.metrics import classification_report

model.eval()  # Set the model to evaluation mode
predictions, true_labels = [], []

with torch.no_grad():
    for batch in test_dataloader:
        b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)
        
        # Forward pass
        outputs = model(b_input_ids, attention_mask=b_input_mask)
        logits = outputs.logits
        
        # Get predictions
        predictions.append(torch.argmax(logits, dim=1).cpu().numpy())
        true_labels.append(b_labels.cpu().numpy())

# Flatten the lists
predictions = [item for sublist in predictions for item in sublist]
true_labels = [item for sublist in true_labels for item in sublist]

# Print classification report
print(classification_report(true_labels, predictions, target_names=label_encoder.classes_))


              precision    recall  f1-score   support

       anger       0.88      0.96      0.92       973
        fear       0.85      0.94      0.89       878
         joy       0.99      0.91      0.94      2426
        love       0.76      0.98      0.86       595
     sadness       0.99      0.94      0.96      2082
    surprise       0.87      0.69      0.77       246

    accuracy                           0.93      7200
   macro avg       0.89      0.90      0.89      7200
weighted avg       0.93      0.93      0.93      7200



In [7]:
# Save the fine-tuned model and tokenizer
model.save_pretrained("roberta_emotion_model")
tokenizer.save_pretrained("roberta_emotion_model")

# Save the label encoder using pickle
import pickle
with open("roberta_label_encoder.pkl", "wb") as f:
    pickle.dump(label_encoder, f)
