In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader, Dataset
from transformers import RobertaTokenizer, RobertaForSequenceClassification, RobertaModel, AdamW
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, classification_report
import ast

2025-06-26 12:28:55.491273: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-06-26 12:28:55.504571: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750940935.519534  998596 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750940935.524102  998596 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1750940935.536545  998596 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

In [2]:
# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cpu


In [3]:
# Load datasets with custom embeddings
training = pd.read_csv("data/NLP_features_Train.csv")
test = pd.read_csv("data/NLP_features_Test.csv")

print(f"Training data shape: {training.shape}")
print(f"Test data shape: {test.shape}")
print(f"Training columns: {training.columns.tolist()}")

Training data shape: (5894, 25)
Test data shape: (722, 25)
Training columns: ['Sentence', 'Emotion', 'POS_CCONJ', 'POS_VERB', 'POS_PRON', 'POS_SCONJ', 'POS_DET', 'POS_NOUN', 'POS_PUNCT', 'POS_ADJ', 'POS_ADV', 'POS_ADP', 'POS_PROPN', 'POS_AUX', 'POS_INTJ', 'POS_NUM', 'POS_SPACE', 'POS_SYM', 'POS_X', 'POS_tags', 'TF-IDF', 'Pretrained_Embeddings', 'Custom_Embeddings', 'Sentiment_Score', 'NounChunkCount']


In [4]:
# Function to parse custom embeddings (assuming they're stored as strings)
def parse_embeddings(embedding_str):
    """Parse embedding string to numpy array"""
    if pd.isna(embedding_str):
        return np.zeros(768)  # Default size, adjust if needed
    
    try:
        # Try to parse as list/array string
        if isinstance(embedding_str, str):
            # Remove brackets and split by comma
            embedding_str = embedding_str.strip('[]')
            embedding = np.array([float(x.strip()) for x in embedding_str.split(',')])
        else:
            embedding = np.array(embedding_str)
        return embedding
    except:
        # If parsing fails, return zeros
        return np.zeros(768)

# Parse custom embeddings
print("Parsing custom embeddings...")
train_custom_embeddings = np.array([parse_embeddings(emb) for emb in training['Custom_Embeddings']])
test_custom_embeddings = np.array([parse_embeddings(emb) for emb in test['Custom_Embeddings']])

print(f"Custom embeddings shape - Train: {train_custom_embeddings.shape}, Test: {test_custom_embeddings.shape}")

# Normalize custom embeddings
scaler = StandardScaler()
train_custom_embeddings = scaler.fit_transform(train_custom_embeddings)
test_custom_embeddings = scaler.transform(test_custom_embeddings)

Parsing custom embeddings...
Custom embeddings shape - Train: (5894, 768), Test: (722, 768)


In [5]:
# Define features and target
X = training["Sentence"].astype(str)
y = training["Emotion"]

# Encode target labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

print(f"Number of classes: {len(label_encoder.classes_)}")
print(f"Classes: {label_encoder.classes_}")

Number of classes: 7
Classes: ['anger' 'disgust' 'fear' 'happiness' 'neutral' 'sadness' 'surprise']


In [6]:
from sklearn.model_selection import train_test_split

# Split into train and validation (80% train, 20% validation)
X_train, X_val, y_train, y_val, train_emb_train, train_emb_val = train_test_split(
    X, y_encoded, train_custom_embeddings, test_size=0.2, random_state=42, stratify=y_encoded
)

# Further reduce dataset size to 30%
reduce_fraction = 0.5

X_train, _, y_train, _, train_emb_train, _ = train_test_split(
    X_train, y_train, train_emb_train, test_size=(1 - reduce_fraction), random_state=42, stratify=y_train
)

X_val, _, y_val, _, train_emb_val, _ = train_test_split(
    X_val, y_val, train_emb_val, test_size=(1 - reduce_fraction), random_state=42, stratify=y_val
)

# Load test data
X_test = test['Sentence'].astype(str)
y_test = test['Emotion']
y_test_encoded = label_encoder.transform(y_test)

# Reduce test set size as well
X_test, _, y_test_encoded, _ = train_test_split(
    X_test, y_test_encoded, test_size=(1 - reduce_fraction), random_state=42, stratify=y_test_encoded
)

# Final set sizes
print(f"Train set size: {len(X_train)}")
print(f"Validation set size: {len(X_val)}")
print(f"Test set size: {len(X_test)}")


Train set size: 2357
Validation set size: 589
Test set size: 361


In [7]:
# Load Roberta tokenizer
model_name = "FacebookAI/roberta-base"
tokenizer = RobertaTokenizer.from_pretrained(model_name)

In [8]:
# Enhanced dataset class with custom embeddings
class EnhancedEmotionDataset(Dataset):
    def __init__(self, texts, labels, custom_embeddings, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.custom_embeddings = custom_embeddings
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        custom_emb = self.custom_embeddings[idx]
        
        encoding = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "custom_embedding": torch.tensor(custom_emb, dtype=torch.float32),
            "label": torch.tensor(label, dtype=torch.long),
        }

In [9]:
# Hybrid model combining Roberta and custom embeddings
class HybridRobertaModel(nn.Module):
    def __init__(self, model_name, num_labels, custom_embedding_dim, dropout_rate=0.3):
        super(HybridRobertaModel, self).__init__()
        self.roberta = RobertaModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout_rate)
        
        roberta_dim = self.roberta.config.hidden_size
        
        # Custom embedding processing layers
        self.custom_projection = nn.Linear(custom_embedding_dim, roberta_dim // 2)
        self.custom_activation = nn.ReLU()
        self.custom_dropout = nn.Dropout(dropout_rate)
        
        # Combined feature dimension
        combined_dim = roberta_dim + roberta_dim // 2
        
        # Classification layers
        self.classifier = nn.Sequential(
            nn.Linear(combined_dim, combined_dim // 2),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(combined_dim // 2, num_labels)
        )
        
    def forward(self, input_ids, attention_mask, custom_embedding):
        # Roberta forward pass
        Roberta_outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        Roberta_pooled = Roberta_outputs.pooler_output
        Roberta_features = self.dropout(Roberta_pooled)
        
        # Custom embedding processing
        custom_features = self.custom_projection(custom_embedding)
        custom_features = self.custom_activation(custom_features)
        custom_features = self.custom_dropout(custom_features)
        
        # Combine features
        combined_features = torch.cat([Roberta_features, custom_features], dim=1)
        
        # Classification
        logits = self.classifier(combined_features)
        
        return logits

In [10]:
# Create enhanced datasets
train_dataset = EnhancedEmotionDataset(X_train.tolist(), y_train, train_emb_train, tokenizer)
val_dataset = EnhancedEmotionDataset(X_val.tolist(), y_val, train_emb_val, tokenizer)
test_dataset = EnhancedEmotionDataset(X_test.tolist(), y_test_encoded, test_custom_embeddings, tokenizer)

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

print(f"Created datasets and dataloaders successfully")

Created datasets and dataloaders successfully


In [11]:
# Initialize hybrid model
custom_embedding_dim = train_custom_embeddings.shape[1]
model = HybridRobertaModel(
    model_name=model_name, 
    num_labels=len(label_encoder.classes_),
    custom_embedding_dim=custom_embedding_dim
)
model.to(device)

print(f"Model initialized with custom embedding dimension: {custom_embedding_dim}")

# Training setup
optimizer = optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
criterion = nn.CrossEntropyLoss()

# Learning rate scheduler
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=2)

def train_epoch(model, train_loader):
    model.train()
    total_loss, total_correct = 0, 0
    predictions, true_labels = [], []
    
    for batch in train_loader:
        optimizer.zero_grad()
        
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        custom_embedding = batch['custom_embedding'].to(device)
        labels = batch['label'].to(device)
        
        logits = model(input_ids, attention_mask, custom_embedding)
        loss = criterion(logits, labels)
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # Gradient clipping
        optimizer.step()
        
        total_loss += loss.item()
        preds = torch.argmax(logits, dim=1)
        predictions.extend(preds.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())
    
    return total_loss / len(train_loader), accuracy_score(true_labels, predictions)

def evaluate_model(model, data_loader):
    model.eval()
    predictions, true_labels = [], []
    
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            custom_embedding = batch['custom_embedding'].to(device)
            labels = batch['label'].to(device)
            
            logits = model(input_ids, attention_mask, custom_embedding)
            preds = torch.argmax(logits, dim=1)
            
            predictions.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())
    
    acc = accuracy_score(true_labels, predictions)
    f1 = f1_score(true_labels, predictions, average='weighted')
    return acc, f1, classification_report(true_labels, predictions), predictions, true_labels

Some weights of RobertaModel were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model initialized with custom embedding dimension: 768


In [12]:
# Training loop with early stopping
best_val_acc = 0
patience_counter = 0
max_patience = 3
num_epochs = 3

print("Starting training...")
for epoch in range(num_epochs):
    print(f"\nEpoch {epoch+1}/{num_epochs}")
    print("-" * 50)
    
    # Training
    train_loss, train_acc = train_epoch(model, train_loader)
    
    # Validation
    val_acc, val_f1, val_report, _, _ = evaluate_model(model, val_loader)
    
    # Learning rate scheduling
    scheduler.step(val_acc)
    
    print(f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}")
    print(f"Val Acc: {val_acc:.4f}, Val F1: {val_f1:.4f}")
    print(f"Learning Rate: {optimizer.param_groups[0]['lr']:.2e}")
    
    # Early stopping
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        patience_counter = 0
        # Save best model
        torch.save(model.state_dict(), 'best_hybrid_model.pth')
        print(f"New best validation accuracy: {best_val_acc:.4f}")
    else:
        patience_counter += 1
        print(f"No improvement. Patience: {patience_counter}/{max_patience}")
        
        if patience_counter >= max_patience:
            print("Early stopping triggered!")
            break

# Load best model for final evaluation
model.load_state_dict(torch.load('best_hybrid_model.pth'))
print(f"\nLoaded best model with validation accuracy: {best_val_acc:.4f}")

Starting training...

Epoch 1/3
--------------------------------------------------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Train Loss: 1.8393, Train Acc: 0.2401
Val Acc: 0.3158, Val F1: 0.2349
Learning Rate: 2.00e-05
New best validation accuracy: 0.3158

Epoch 2/3
--------------------------------------------------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Train Loss: 1.6709, Train Acc: 0.3394
Val Acc: 0.3735, Val F1: 0.3140
Learning Rate: 2.00e-05
New best validation accuracy: 0.3735

Epoch 3/3
--------------------------------------------------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Train Loss: 1.5225, Train Acc: 0.4018
Val Acc: 0.3871, Val F1: 0.3373
Learning Rate: 2.00e-05
New best validation accuracy: 0.3871

Loaded best model with validation accuracy: 0.3871


In [13]:
# Final evaluation on test set
print("\n" + "="*60)
print("FINAL TEST EVALUATION")
print("="*60)

test_acc, test_f1, test_report, test_predictions, test_true_labels = evaluate_model(model, test_loader)

print(f"Final Test Accuracy: {test_acc:.4f}")
print(f"Final Test F1-Score: {test_f1:.4f}")
print("\nDetailed Classification Report:")
print(test_report)

# Compare with baseline (if you want to show improvement)
print("\n" + "="*60)
print("MODEL PERFORMANCE SUMMARY")
print("="*60)
print(f"Best Validation Accuracy: {best_val_acc:.4f}")
print(f"Final Test Accuracy: {test_acc:.4f}")
print(f"Final Test F1-Score: {test_f1:.4f}")
print(f"Number of Classes: {len(label_encoder.classes_)}")
print(f"Custom Embedding Dimension: {custom_embedding_dim}")


FINAL TEST EVALUATION
Final Test Accuracy: 0.3075
Final Test F1-Score: 0.3113

Detailed Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        23
           1       0.00      0.00      0.00         1
           2       0.07      0.78      0.13         9
           3       0.51      0.20      0.28       122
           4       0.44      0.53      0.48       118
           5       0.14      0.20      0.16        15
           6       0.29      0.21      0.24        73

    accuracy                           0.31       361
   macro avg       0.21      0.27      0.19       361
weighted avg       0.38      0.31      0.31       361


MODEL PERFORMANCE SUMMARY
Best Validation Accuracy: 0.3871
Final Test Accuracy: 0.3075
Final Test F1-Score: 0.3113
Number of Classes: 7
Custom Embedding Dimension: 768


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
