In [1]:
!pip install transformers torch pandas scikit-learn numpy tqdm

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [2]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from transformers import XLMRobertaTokenizer, XLMRobertaModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

2025-07-23 02:16:20.251319: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753236980.448089      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753236980.505242      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
# Seed NumPy and PyTorch so that later cells start from a fixed RNG state.
np.random.seed(42)
torch.manual_seed(42)

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(42)  # seed every visible CUDA device as well
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

Using device: cuda


In [4]:
class BanglaTextDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for classification.

    Args:
        texts: Sequence of raw texts (list, NumPy array or pandas Series).
        labels: Sequence of integer class ids aligned one-to-one with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding='max_length', max_length=..., return_tensors='pt')`` that
            returns a mapping with ``'input_ids'`` and ``'attention_mask'``
            tensors.
        max_length: Length each encoded sequence is padded/truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` do not have the same length.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        # Materialise as plain lists so __getitem__ always indexes by position.
        # A pandas Series is indexed by *label*, so a Series whose index is no
        # longer 0..n-1 (e.g. after a shuffled split) would raise KeyError or
        # return the wrong row.
        self.texts = list(texts)
        self.labels = list(labels)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(self.texts)} and {len(self.labels)}"
            )
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return its tensors.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` (leading batch
            dimension removed) and ``'label'`` as a ``torch.long`` scalar.
        """
        # str() makes sure the tokenizer always receives a string.
        encoding = self.tokenizer(
            str(self.texts[idx]),
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        # The tokenizer returns a batch of one; drop that leading dimension so
        # the DataLoader can stack items into (batch, max_length).
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': torch.tensor(self.labels[idx], dtype=torch.long)
        }

In [5]:
class XLMRobertaForTextClassification(nn.Module):
    """XLM-RoBERTa encoder topped with a two-layer classification head.

    The head takes the embedding of the first token of each sequence, applies
    a hidden-size linear layer with tanh, then projects to ``num_classes``
    logits. (The original author's note attributes this head design to the
    RoBERTa paper.)

    Args:
        num_classes: Number of output classes.
        model_name: Checkpoint name passed to ``XLMRobertaModel.from_pretrained``.
    """

    def __init__(self, num_classes=3, model_name='xlm-roberta-large'):
        super().__init__()

        # Pre-trained encoder; its config supplies the embedding width.
        self.roberta = XLMRobertaModel.from_pretrained(model_name)
        width = self.roberta.config.hidden_size

        # dense -> tanh -> dense, producing one logit per class
        head_layers = [
            nn.Linear(width, width),
            nn.Tanh(),
            nn.Linear(width, num_classes),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, input_ids, attention_mask):
        """Return the classifier logits for a batch of token ids."""
        encoded = self.roberta(input_ids=input_ids, attention_mask=attention_mask)

        # Position 0 of every sequence is used as the sentence representation.
        first_token = encoded.last_hidden_state[:, 0]
        return self.classifier(first_token)

In [6]:
# Load the dataset; the cell reads the 'Text' and 'Polarity' columns.
# Replace the path below with your actual file path.
df = pd.read_csv('/kaggle/input/final-dataset/final-dataset.csv')

# Map sentiment labels to integer class ids
label_map = {'positive': 0, 'negative': 1, 'neutral': 2}
df['label_encoded'] = df['Polarity'].map(label_map)

# If any raw label failed to map, retry after normalising case and whitespace
if df['label_encoded'].isnull().any():
    print("Warning: Some labels couldn't be mapped. Unique values in Polarity column:")
    print(df['Polarity'].unique())
    # Handle any case sensitivity issues
    df['Polarity'] = df['Polarity'].str.lower().str.strip()
    df['label_encoded'] = df['Polarity'].map(label_map)

# Fail fast on labels that are still unmapped: a NaN class id would otherwise
# flow into the stratified split and the label tensors.
unmapped = df['label_encoded'].isnull()
if unmapped.any():
    raise ValueError(
        f"{int(unmapped.sum())} row(s) have a Polarity value outside {sorted(label_map)}: "
        f"{df.loc[unmapped, 'Polarity'].unique().tolist()}"
    )
# Guarantee integer class ids (they are later converted to torch.long labels)
df['label_encoded'] = df['label_encoded'].astype('int64')

# Split the data (~80% train, ~10% validation, ~10% test)
texts = df['Text'].values
labels = df['label_encoded'].values

# First split: 90% train+val, 10% test
X_temp, X_test, y_temp, y_test = train_test_split(
    texts, labels, test_size=0.1, random_state=42, stratify=labels
)

# Second split: 11.1% of the remaining 90% (~10% of all rows) becomes validation
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.111, random_state=42, stratify=y_temp
)

print(f"Train size: {len(X_train)}")
print(f"Validation size: {len(X_val)}")
print(f"Test size: {len(X_test)}")

# Print label distribution
print("\nLabel distribution in training set:")
id_to_label = {class_id: name for name, class_id in label_map.items()}
unique, counts = np.unique(y_train, return_counts=True)
for label, count in zip(unique, counts):
    label_name = id_to_label[label]
    print(f"{label_name}: {count} ({count/len(y_train)*100:.2f}%)")

Train size: 4944
Validation size: 618
Test size: 618

Label distribution in training set:
positive: 1638 (33.13%)
negative: 1581 (31.98%)
neutral: 1725 (34.89%)


In [7]:
# Tokenizer matching the 'xlm-roberta-large' checkpoint
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-large')

# Every text is padded/truncated to this many tokens; adjust to your text
# length (the original author's note cites 30-100 tokens for short-text
# sentiment analysis in the reference paper).
max_length = 100

# One dataset per split, all sharing the same tokenizer and max_length
train_dataset, val_dataset, test_dataset = (
    BanglaTextDataset(split_texts, split_labels, tokenizer, max_length)
    for split_texts, split_labels in (
        (X_train, y_train),
        (X_val, y_val),
        (X_test, y_test),
    )
)

# Batch size of 32 (per the original author's note on the paper);
# reduce it if you run into memory issues.
batch_size = 32

# Only the training loader shuffles; validation/test keep a fixed order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader, test_loader = (
    DataLoader(split_dataset, batch_size=batch_size, shuffle=False)
    for split_dataset in (val_dataset, test_dataset)
)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

In [8]:
# Number of passes over the training set (the original note cites 10 epochs from the paper)
num_epochs = 10

# Cross-entropy over the three sentiment classes
criterion = nn.CrossEntropyLoss()

# Build the classifier and move it onto the selected device
model = XLMRobertaForTextClassification(num_classes=3).to(device)

# Adam with learning rate 1e-5 (value attributed to the paper in the original note)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

In [9]:
def train_epoch(model, data_loader, criterion, optimizer, device):
    """Run one optimisation pass over ``data_loader``.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` returning logits.
        data_loader: Iterable of batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'label'`` tensors; must support ``len()``.
        criterion: Loss called as ``criterion(logits, labels)``.
        optimizer: Optimizer stepping the model's parameters.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(avg_loss, accuracy, f1)``: mean of the per-batch losses,
        accuracy, and weighted F1 over every sample seen.
    """
    model.train()
    running_loss = 0
    epoch_preds, epoch_targets = [], []

    for batch in tqdm(data_loader, desc="Training"):
        # Move the three batch tensors onto the training device
        input_ids, attention_mask, labels = (
            batch[key].to(device)
            for key in ('input_ids', 'attention_mask', 'label')
        )

        # Standard step: clear grads, forward, loss, backward, update
        optimizer.zero_grad()
        logits = model(input_ids, attention_mask)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        # Highest-scoring class per sample is the prediction
        batch_preds = torch.max(logits, dim=1).indices
        epoch_preds.extend(batch_preds.cpu().numpy())
        epoch_targets.extend(labels.cpu().numpy())

    return (
        running_loss / len(data_loader),
        accuracy_score(epoch_targets, epoch_preds),
        f1_score(epoch_targets, epoch_preds, average='weighted'),
    )

def evaluate(model, data_loader, criterion, device):
    """Score ``model`` on ``data_loader`` without updating any weights.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` returning logits.
        data_loader: Iterable of batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'label'`` tensors; must support ``len()``.
        criterion: Loss called as ``criterion(logits, labels)``.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(avg_loss, accuracy, f1, predictions, actual_labels)``: mean of
        the per-batch losses, accuracy, weighted F1, and the per-sample
        predicted and true class ids in loader order.
    """
    model.eval()
    running_loss = 0
    all_preds, all_targets = [], []

    # Gradients are not needed for scoring
    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Evaluating"):
            input_ids, attention_mask, labels = (
                batch[key].to(device)
                for key in ('input_ids', 'attention_mask', 'label')
            )

            logits = model(input_ids, attention_mask)
            running_loss += criterion(logits, labels).item()

            # Highest-scoring class per sample is the prediction
            batch_preds = torch.max(logits, dim=1).indices
            all_preds.extend(batch_preds.cpu().numpy())
            all_targets.extend(labels.cpu().numpy())

    mean_loss = running_loss / len(data_loader)
    acc = accuracy_score(all_targets, all_preds)
    weighted_f1 = f1_score(all_targets, all_preds, average='weighted')

    return mean_loss, acc, weighted_f1, all_preds, all_targets

In [10]:
# Fine-tune for num_epochs, checkpointing whenever validation accuracy improves
best_model_path = 'best_xlm_roberta_bangla.pt'
best_val_accuracy = 0

for epoch in range(num_epochs):
    print(f"\nEpoch {epoch + 1}/{num_epochs}")
    print("-" * 50)

    # One optimisation pass over the training split
    train_loss, train_acc, train_f1 = train_epoch(
        model, train_loader, criterion, optimizer, device
    )
    print(f"Train Loss: {train_loss:.4f}, Accuracy: {train_acc:.4f}, F1: {train_f1:.4f}")

    # Score on the validation split (per-sample outputs are not needed here)
    val_loss, val_acc, val_f1 = evaluate(
        model, val_loader, criterion, device
    )[:3]
    print(f"Val Loss: {val_loss:.4f}, Accuracy: {val_acc:.4f}, F1: {val_f1:.4f}")

    # Only a strict improvement replaces the checkpoint; ties keep the earlier one
    if not val_acc > best_val_accuracy:
        continue
    best_val_accuracy = val_acc
    torch.save(model.state_dict(), best_model_path)
    print(f"Best model saved with validation accuracy: {val_acc:.4f}")


Epoch 1/10
--------------------------------------------------


Training: 100%|██████████| 155/155 [04:52<00:00,  1.89s/it]


Train Loss: 1.0416, Accuracy: 0.4581, F1: 0.4443


Evaluating: 100%|██████████| 20/20 [00:11<00:00,  1.74it/s]


Val Loss: 0.8489, Accuracy: 0.6084, F1: 0.5781
Best model saved with validation accuracy: 0.6084

Epoch 2/10
--------------------------------------------------


Training: 100%|██████████| 155/155 [05:02<00:00,  1.95s/it]


Train Loss: 0.7763, Accuracy: 0.6640, F1: 0.6627


Evaluating: 100%|██████████| 20/20 [00:11<00:00,  1.71it/s]


Val Loss: 0.6867, Accuracy: 0.7023, F1: 0.6936
Best model saved with validation accuracy: 0.7023

Epoch 3/10
--------------------------------------------------


Training: 100%|██████████| 155/155 [05:03<00:00,  1.96s/it]


Train Loss: 0.6036, Accuracy: 0.7520, F1: 0.7514


Evaluating: 100%|██████████| 20/20 [00:11<00:00,  1.73it/s]


Val Loss: 0.6421, Accuracy: 0.7443, F1: 0.7412
Best model saved with validation accuracy: 0.7443

Epoch 4/10
--------------------------------------------------


Training: 100%|██████████| 155/155 [05:03<00:00,  1.96s/it]


Train Loss: 0.5142, Accuracy: 0.7933, F1: 0.7929


Evaluating: 100%|██████████| 20/20 [00:11<00:00,  1.73it/s]


Val Loss: 0.6941, Accuracy: 0.7168, F1: 0.7111

Epoch 5/10
--------------------------------------------------


Training: 100%|██████████| 155/155 [05:02<00:00,  1.95s/it]


Train Loss: 0.4133, Accuracy: 0.8398, F1: 0.8396


Evaluating: 100%|██████████| 20/20 [00:11<00:00,  1.74it/s]


Val Loss: 0.7129, Accuracy: 0.7379, F1: 0.7377

Epoch 6/10
--------------------------------------------------


Training: 100%|██████████| 155/155 [05:02<00:00,  1.95s/it]


Train Loss: 0.3254, Accuracy: 0.8803, F1: 0.8801


Evaluating: 100%|██████████| 20/20 [00:11<00:00,  1.74it/s]


Val Loss: 0.6863, Accuracy: 0.7670, F1: 0.7665
Best model saved with validation accuracy: 0.7670

Epoch 7/10
--------------------------------------------------


Training: 100%|██████████| 155/155 [05:03<00:00,  1.96s/it]


Train Loss: 0.2565, Accuracy: 0.9072, F1: 0.9071


Evaluating: 100%|██████████| 20/20 [00:11<00:00,  1.74it/s]


Val Loss: 0.6919, Accuracy: 0.7557, F1: 0.7549

Epoch 8/10
--------------------------------------------------


Training: 100%|██████████| 155/155 [05:03<00:00,  1.96s/it]


Train Loss: 0.2269, Accuracy: 0.9187, F1: 0.9187


Evaluating: 100%|██████████| 20/20 [00:11<00:00,  1.74it/s]


Val Loss: 0.7505, Accuracy: 0.7848, F1: 0.7844
Best model saved with validation accuracy: 0.7848

Epoch 9/10
--------------------------------------------------


Training: 100%|██████████| 155/155 [05:03<00:00,  1.96s/it]


Train Loss: 0.1753, Accuracy: 0.9391, F1: 0.9391


Evaluating: 100%|██████████| 20/20 [00:11<00:00,  1.73it/s]


Val Loss: 0.9524, Accuracy: 0.7411, F1: 0.7397

Epoch 10/10
--------------------------------------------------


Training: 100%|██████████| 155/155 [05:02<00:00,  1.95s/it]


Train Loss: 0.1302, Accuracy: 0.9541, F1: 0.9541


Evaluating: 100%|██████████| 20/20 [00:11<00:00,  1.73it/s]

Val Loss: 0.8336, Accuracy: 0.7718, F1: 0.7717





In [11]:
# Restore the checkpoint with the best validation accuracy.
# map_location lets a checkpoint saved from a GPU run load onto whatever
# `device` is in use now (e.g. a CPU-only session); weights_only=True limits
# unpickling to tensors and plain containers, which is all a state_dict needs.
model.load_state_dict(
    torch.load(best_model_path, map_location=device, weights_only=True)
)

# Evaluate on test set
test_loss, test_acc, test_f1, predictions, actual_labels = evaluate(
    model, test_loader, criterion, device
)

print(f"\nTest Results:")
print(f"Test Loss: {test_loss:.4f}")
print(f"Test Accuracy: {test_acc:.4f}")
print(f"Test F1 Score: {test_f1:.4f}")

# Detailed classification report.
# Class names are derived from label_map (ordered by class id) so they cannot
# drift from the encoding used for training.
label_names = [name for name, _ in sorted(label_map.items(), key=lambda item: item[1])]
print("\nClassification Report:")
# Passing labels explicitly keeps the report aligned with target_names even
# when a class is absent from both the test labels and the predictions.
print(classification_report(
    actual_labels,
    predictions,
    labels=list(range(len(label_names))),
    target_names=label_names,
))

Evaluating: 100%|██████████| 20/20 [00:11<00:00,  1.73it/s]


Test Results:
Test Loss: 0.9057
Test Accuracy: 0.7411
Test F1 Score: 0.7407

Classification Report:
              precision    recall  f1-score   support

    positive       0.77      0.80      0.79       205
    negative       0.76      0.68      0.72       198
     neutral       0.70      0.73      0.71       215

    accuracy                           0.74       618
   macro avg       0.74      0.74      0.74       618
weighted avg       0.74      0.74      0.74       618






In [12]:
def predict_sentiment(text, model, tokenizer, device, max_length=100):
    """Classify a single text and report the class probabilities.

    Args:
        text: Text to classify.
        model: Module called as ``model(input_ids, attention_mask)`` returning logits.
        tokenizer: Callable producing ``'input_ids'`` and ``'attention_mask'`` tensors.
        device: Device the encoded tensors are moved to.
        max_length: Length the text is padded/truncated to.

    Returns:
        Tuple ``(predicted_label, probabilities)``: the label string for the
        highest-scoring class, and a NumPy array of softmax probabilities
        ordered by class id (0: positive, 1: negative, 2: neutral).
    """
    # Class id -> label string (inverse of the training label encoding)
    id_to_label = {0: 'positive', 1: 'negative', 2: 'neutral'}

    model.eval()

    encoded = tokenizer(
        text,
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_tensors='pt'
    )
    input_ids = encoded['input_ids'].to(device)
    attention_mask = encoded['attention_mask'].to(device)

    # Inference only: no gradient tracking needed
    with torch.no_grad():
        outputs = model(input_ids, attention_mask)
        winner = torch.max(outputs, dim=1).indices
        probabilities = torch.nn.functional.softmax(outputs, dim=1)

    predicted_label = id_to_label[winner.item()]
    return predicted_label, probabilities.cpu().numpy()[0]

# Demo: classify one Bangla sentence and show the per-class probabilities
sample_text = "এই মডেলটি খুব ভালো কাজ করছে"  # "This model is working very well"
predicted_label, probs = predict_sentiment(sample_text, model, tokenizer, device)
print(
    f"Text: {sample_text}",
    f"Predicted: {predicted_label}",
    f"Probabilities - Positive: {probs[0]:.4f}, Negative: {probs[1]:.4f}, Neutral: {probs[2]:.4f}",
    sep="\n",
)

Text: এই মডেলটি খুব ভালো কাজ করছে
Predicted: positive
Probabilities - Positive: 0.9863, Negative: 0.0011, Neutral: 0.0126
