In [1]:
# Install necessary libraries
!pip install transformers pandas torch scikit-learn numpy emoji demoji shap -q

# Import all required packages
import pandas as pd
import numpy as np
import re
import emoji
import torch
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

# Corrected Imports: AdamW is imported from torch.optim, not transformers
from transformers import RobertaTokenizer, RobertaModel, get_linear_schedule_with_warmup
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from torch.nn import BCEWithLogitsLoss, MSELoss
import json

print("‚úÖ Block 1 Complete: Libraries installed and imported correctly.")

[?25l   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m0.0/608.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[91m‚ï∏[0m [32m604.2/608.4 kB[0m [31m46.7 MB/s[0m eta [36m0:00:01[0m[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m608.4/608.4 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m0.0/42.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m42.9/42.9 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m


In [3]:
# Read the emotion corpus from disk. When the Parquet file is absent, fall back
# to a three-row toy frame so the remaining cells can still be executed.
try:
    df = pd.read_parquet('emotions_dataset.parquet')
except FileNotFoundError:
    print("‚ùå ERROR: 'emotions_dataset.parquet' not found. Please upload it to Colab.")
    # Toy stand-in exposing the same two columns ('Sentence', 'Label') that the
    # preprocessing code below reads; labels are ';'-separated tags.
    data = {
        'Sentence': [
            "I am so happy, it's sarcastic though",
            "This is just the worst, I feel angry and sad",
            "Feeling neutral about this event",
        ],
        'Label': ["joy;sarcasm", "anger;sadness", "neutral"],
    }
    df = pd.DataFrame(data)
    print("Using a dummy dataset for demonstration purposes.")
else:
    # Only reached when read_parquet returned without raising.
    print("Dataset 'emotions_dataset.parquet' loaded successfully.")

# --- Preprocessing Function ---
def preprocess_text(text):
    """Normalise a raw tweet into lowercase, letters-only text.

    Steps, in order: emojis are replaced by their textual names, the text is
    lower-cased, @mentions and URLs are removed, underscores become spaces,
    every character that is not a-z or whitespace is dropped, and runs of
    whitespace are collapsed to a single space.

    Args:
        text: The raw sentence. Any non-string value (e.g. None or NaN) is
            treated as missing.

    Returns:
        The cleaned string, or "" when `text` is not a string.
    """
    if not isinstance(text, str):
        return ""
    # Surround emoji names with spaces instead of the default ':' delimiters.
    # With ':name:' the letters-only filter below strips ':' and '_' and glues
    # the name onto the neighbouring word ("happy:grinning_face:" ->
    # "happygrinningface"); a trailing emoji would also be swallowed by the
    # URL pattern because there is no whitespace before it.
    text = emoji.demojize(text, delimiters=(" ", " "))
    text = text.lower()
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Done after mention/URL removal so handles and links containing '_' are
    # still removed whole; turns 'grinning_face' into 'grinning face'.
    text = text.replace('_', ' ')
    text = re.sub(r'[^a-z\s]', '', text) # Keep only letters and spaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df['cleaned_sentence'] = df['Sentence'].apply(preprocess_text)

# --- Label Preparation ---
# One seed for everything stochastic in this cell (placeholder intensities and
# the train/validation split) so a fresh run reproduces the same arrays.
SEED = 42


def _parse_labels(raw):
    """Split a ';'-separated label string into a list of stripped, non-empty tags.

    Non-string values (e.g. NaN/None) yield an empty list instead of raising
    AttributeError on `.split`.
    """
    if not isinstance(raw, str):
        return []
    return [part.strip() for part in raw.split(';') if part.strip()]


df['labels_list'] = df['Label'].apply(_parse_labels)
all_labels = sorted({label for sublist in df['labels_list'] for label in sublist})

# 'sarcasm' is modelled as its own binary target, not as an emotion class.
is_sarcasm_present = 'sarcasm' in all_labels
unique_emotions = [label for label in all_labels if label != 'sarcasm']
df['sarcasm'] = df['labels_list'].apply(lambda x: 1 if 'sarcasm' in x else 0)

# One-Hot Encode the emotion labels.
# 'sarcasm' is removed from each row first: it is not in `classes`, so passing
# it through would only make MultiLabelBinarizer emit an unknown-class warning.
mlb = MultiLabelBinarizer(classes=unique_emotions)
emotion_lists = df['labels_list'].apply(lambda labels: [label for label in labels if label != 'sarcasm'])
one_hot_labels = mlb.fit_transform(emotion_lists)

# Generate placeholder intensity targets (since they aren't in the dataset):
# each active emotion gets a value drawn uniformly from [0.5, 0.9), inactive
# emotions stay at 0. Drawn from a seeded generator so targets are reproducible.
rng = np.random.default_rng(SEED)
emotion_intensity = one_hot_labels.astype(np.float32) * rng.uniform(low=0.5, high=0.9, size=one_hot_labels.shape)

# --- Split the Data ---
# All four arrays are split together so rows stay aligned across targets.
X_train, X_val, y_train_emotions, y_val_emotions, y_train_intensity, y_val_intensity, y_train_sarcasm, y_val_sarcasm = train_test_split(
    df['cleaned_sentence'].to_list(),
    one_hot_labels,
    emotion_intensity,
    df['sarcasm'].to_numpy(),
    test_size=0.2,
    random_state=SEED
)

print(f"Total training samples: {len(X_train)}")
print(f"Total validation samples: {len(X_val)}")
print(f"Emotion classes being trained: {mlb.classes_}")
print("‚úÖ Block 2 Complete: Data loaded and preprocessed.")

Dataset 'emotions_dataset.parquet' loaded successfully.
Total training samples: 105044
Total validation samples: 26262
Emotion classes being trained: ['anger' 'confusion' 'desire' 'disgust' 'fear' 'guilt' 'happiness' 'love'
 'neutral' 'sadness' 'shame' 'surprise']
‚úÖ Block 2 Complete: Data loaded and preprocessed.




In [4]:
class EmotionDataset(Dataset):
    """Map-style dataset pairing tokenised sentences with three label targets.

    Args:
        texts: Sequence of sentences (list, array or pandas Series).
        emotion_labels: Per-sample emotion indicator rows.
        intensity_labels: Per-sample emotion intensity rows.
        sarcasm_labels: Per-sample sarcasm flag.
        tokenizer: Callable tokenizer accepting the keyword arguments used in
            `__getitem__` and returning a mapping with 'input_ids' and
            'attention_mask' tensors that carry a leading batch dimension of 1.
        max_len: Length every sample is padded/truncated to.

    Raises:
        ValueError: If any label container's length differs from `texts`.
    """

    def __init__(self, texts, emotion_labels, intensity_labels, sarcasm_labels, tokenizer, max_len=128):
        # Materialise positional containers: a pandas Series coming out of a
        # shuffled split keeps its original index, so `series[item]` would be a
        # label lookup (KeyError or the wrong row) rather than position `item`.
        self.texts = list(texts)
        self.emotion_labels = np.asarray(emotion_labels)
        self.intensity_labels = np.asarray(intensity_labels)
        self.sarcasm_labels = np.asarray(sarcasm_labels)
        self.tokenizer = tokenizer
        self.max_len = max_len

        # Fail at construction time instead of with an IndexError mid-epoch.
        expected = len(self.texts)
        for label_name, labels in (
            ('emotion_labels', self.emotion_labels),
            ('intensity_labels', self.intensity_labels),
            ('sarcasm_labels', self.sarcasm_labels),
        ):
            if len(labels) != expected:
                raise ValueError(
                    f"{label_name} has {len(labels)} rows but texts has {expected}"
                )

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, item):
        """Tokenise sample `item` and return it with its labels as float tensors.

        Returns:
            dict with keys 'input_ids', 'attention_mask', 'emotion_labels',
            'intensity_labels' and 'sarcasm_labels'.
        """
        text = str(self.texts[item])
        # Calling the tokenizer directly is the supported entry point;
        # `encode_plus` is documented as deprecated in favour of `__call__`.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            # flatten() drops the batch dimension added by return_tensors='pt'.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'emotion_labels': torch.tensor(self.emotion_labels[item], dtype=torch.float),
            'intensity_labels': torch.tensor(self.intensity_labels[item], dtype=torch.float),
            'sarcasm_labels': torch.tensor(self.sarcasm_labels[item], dtype=torch.float)
        }

class EmoRoBERTa(torch.nn.Module):
    """RoBERTa encoder feeding three parallel linear heads.

    The heads share the encoder's pooled output and produce, per sample:
    emotion logits (`n_emotions` values), intensity scores (`n_emotions`
    values) and a single sarcasm logit. No activation is applied here; the
    heads return raw linear outputs.

    Args:
        n_emotions: Output width of the emotion and intensity heads.
    """

    def __init__(self, n_emotions):
        super().__init__()
        # Shared encoder loaded from the 'roberta-base' checkpoint.
        self.roberta = RobertaModel.from_pretrained('roberta-base')
        hidden_size = self.roberta.config.hidden_size
        # One linear layer per task, all reading the same pooled vector.
        self.emotion_classifier = torch.nn.Linear(hidden_size, n_emotions)
        self.intensity_regressor = torch.nn.Linear(hidden_size, n_emotions)
        self.sarcasm_detector = torch.nn.Linear(hidden_size, 1)

    def forward(self, input_ids, attention_mask):
        """Encode the batch and run every head on the pooled representation.

        Returns:
            Tuple `(emotion_logits, intensity_scores, sarcasm_logits)`.
        """
        # The heads consume `pooler_output`, not the raw first-token hidden state.
        # NOTE(review): the training log reports the pooler weights as newly
        # initialised for this checkpoint, so they are learned during fine-tuning.
        encoded = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        sentence_repr = encoded.pooler_output

        task_heads = (self.emotion_classifier, self.intensity_regressor, self.sarcasm_detector)
        emotion_logits, intensity_scores, sarcasm_logits = (head(sentence_repr) for head in task_heads)
        return emotion_logits, intensity_scores, sarcasm_logits

print("‚úÖ Block 3 Complete: PyTorch Dataset and Model classes defined.")

‚úÖ Block 3 Complete: PyTorch Dataset and Model classes defined.


In [5]:
# --- Configuration ---
TOKENIZER = RobertaTokenizer.from_pretrained('roberta-base')
MAX_LEN = 128
BATCH_SIZE = 16
EPOCHS = 3 # You can increase this for better performance, but 3 is good for a start
TRAIN_SEED = 42
LOG_EVERY = 100  # print the running batch loss every this many batches
# Relative weight of each task in the combined loss (tunable).
LOSS_WEIGHTS = {'emotion': 1.0, 'intensity': 0.5, 'sarcasm': 0.8}
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed torch before the model heads are created and before the shuffling
# DataLoader draws its first permutation, so runs start from the same state.
torch.manual_seed(TRAIN_SEED)

# --- Create DataLoaders ---
train_dataset = EmotionDataset(X_train, y_train_emotions, y_train_intensity, y_train_sarcasm, TOKENIZER, MAX_LEN)
train_data_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
# The validation split is evaluated after every epoch; order is irrelevant there.
val_dataset = EmotionDataset(X_val, y_val_emotions, y_val_intensity, y_val_sarcasm, TOKENIZER, MAX_LEN)
val_data_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

# --- Initialize Model and Optimizer ---
# The 'Some weights of RobertaModel were not initialized...' warning printed
# here is expected: those weights are trained below.
model = EmoRoBERTa(n_emotions=len(mlb.classes_)).to(device)
optimizer = AdamW(model.parameters(), lr=2e-5)
# The linear schedule decays the learning rate over every optimizer step taken.
total_steps = len(train_data_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# --- Loss Functions ---
loss_fns = {
    'emotion': BCEWithLogitsLoss().to(device),
    'intensity': MSELoss().to(device),
    'sarcasm': BCEWithLogitsLoss().to(device)
}


def _combined_loss(batch):
    """Run `model` on one batch and return the weighted sum of the three task losses."""
    input_ids = batch["input_ids"].to(device)
    attention_mask = batch["attention_mask"].to(device)
    labels = {
        'emotion': batch["emotion_labels"].to(device),
        'intensity': batch["intensity_labels"].to(device),
        # The sarcasm head outputs shape (batch, 1); add the matching trailing axis.
        'sarcasm': batch["sarcasm_labels"].to(device).unsqueeze(1)
    }

    emotion_logits, intensity_scores, sarcasm_logits = model(input_ids, attention_mask)

    loss_e = loss_fns['emotion'](emotion_logits, labels['emotion'])
    loss_i = loss_fns['intensity'](intensity_scores, labels['intensity'])
    loss_s = loss_fns['sarcasm'](sarcasm_logits, labels['sarcasm'])

    return (
        LOSS_WEIGHTS['emotion'] * loss_e
        + LOSS_WEIGHTS['intensity'] * loss_i
        + LOSS_WEIGHTS['sarcasm'] * loss_s
    )


def _mean_loss(data_loader):
    """Return the average combined loss over `data_loader` without updating weights."""
    model.eval()
    running = 0.0
    with torch.no_grad():
        for batch in data_loader:
            running += _combined_loss(batch).item()
    # max(..., 1) keeps an empty loader from raising ZeroDivisionError.
    return running / max(len(data_loader), 1)


# --- Training Loop ---
print(f"üöÄ Starting training on {device} for {EPOCHS} epochs...")
for epoch in range(EPOCHS):
    model.train()  # _mean_loss() leaves the model in eval mode
    total_loss = 0
    for batch_idx, batch in enumerate(train_data_loader):
        optimizer.zero_grad()

        combined_loss = _combined_loss(batch)
        total_loss += combined_loss.item()

        combined_loss.backward()
        # Cap the global gradient norm before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()

        if batch_idx % LOG_EVERY == 0:
            print(f"  Epoch {epoch+1}/{EPOCHS}, Batch {batch_idx}/{len(train_data_loader)}, Loss: {combined_loss.item():.4f}")

    avg_train_loss = total_loss / len(train_data_loader)
    print(f"Epoch {epoch + 1}/{EPOCHS} | Average Training Loss: {avg_train_loss:.4f}")

    # Held-out loss makes over-fitting visible; the split was previously unused.
    avg_val_loss = _mean_loss(val_data_loader)
    print(f"Epoch {epoch + 1}/{EPOCHS} | Average Validation Loss: {avg_val_loss:.4f}")

# --- Save the trained model ---
torch.save(model.state_dict(), 'emoberta_model.bin')
print("‚úÖ Block 4 Complete: Model training finished and weights saved to 'emoberta_model.bin'.")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


üöÄ Starting training on cuda for 3 epochs...
  Epoch 1/3, Batch 0/6566, Loss: 1.1843
  Epoch 1/3, Batch 100/6566, Loss: 0.2929
  Epoch 1/3, Batch 200/6566, Loss: 0.2571
  Epoch 1/3, Batch 300/6566, Loss: 0.4557
  Epoch 1/3, Batch 400/6566, Loss: 0.2476
  Epoch 1/3, Batch 500/6566, Loss: 0.2257
  Epoch 1/3, Batch 600/6566, Loss: 0.1853
  Epoch 1/3, Batch 700/6566, Loss: 0.2071
  Epoch 1/3, Batch 800/6566, Loss: 0.4880
  Epoch 1/3, Batch 900/6566, Loss: 0.2151
  Epoch 1/3, Batch 1000/6566, Loss: 0.1696
  Epoch 1/3, Batch 1100/6566, Loss: 0.2092
  Epoch 1/3, Batch 1200/6566, Loss: 0.1666
  Epoch 1/3, Batch 1300/6566, Loss: 0.1587
  Epoch 1/3, Batch 1400/6566, Loss: 0.2114
  Epoch 1/3, Batch 1500/6566, Loss: 0.1584
  Epoch 1/3, Batch 1600/6566, Loss: 0.1483
  Epoch 1/3, Batch 1700/6566, Loss: 0.1550
  Epoch 1/3, Batch 1800/6566, Loss: 0.1878
  Epoch 1/3, Batch 1900/6566, Loss: 0.2109
  Epoch 1/3, Batch 2000/6566, Loss: 0.1678
  Epoch 1/3, Batch 2100/6566, Loss: 0.1004
  Epoch 1/3, Batch 