In [None]:
!pip install transformers torch pandas scikit-learn numpy tqdm

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from transformers import XLMRobertaTokenizer, XLMRobertaModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

In [None]:
# One seed value shared by every random number generator used below
SEED = 42

# Prefer the GPU when PyTorch can see one; otherwise run on the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

# Seed PyTorch and NumPy, then every CUDA device when running on GPU
torch.manual_seed(SEED)
np.random.seed(SEED)
if device.type == 'cuda':
    torch.cuda.manual_seed_all(SEED)

In [None]:
class BanglaTextDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for classification.

    Args:
        texts: Sequence of raw texts (list, NumPy array, pandas Series, ...).
            Each element is converted with ``str()`` before tokenization.
        labels: Sequence of class ids aligned one-to-one with ``texts``.
        tokenizer: Callable invoked as
            ``tokenizer(text, truncation=True, padding='max_length',
            max_length=..., return_tensors='pt')`` and returning a mapping
            with ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_length: Length passed to the tokenizer for truncation/padding.

    Raises:
        ValueError: If ``texts`` and ``labels`` do not have the same length.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        # Materialize both inputs as plain lists so __getitem__ is strictly
        # positional. Indexing a pandas Series with an integer looks the value
        # up by index *label*, which raises KeyError (or returns the wrong
        # row) as soon as the index is not 0..n-1, e.g. after a shuffle/split.
        self.texts = list(texts)
        self.labels = list(labels)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(self.texts)} and {len(self.labels)}"
            )
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized text and label stored at position ``idx``.

        Returns:
            dict: ``'input_ids'`` and ``'attention_mask'`` flattened to 1-D,
            and ``'label'`` as a ``torch.long`` tensor.
        """
        text = str(self.texts[idx])
        label = self.labels[idx]

        # Tokenize a single text; the tokenizer returns tensors with a
        # leading batch dimension, which flatten() removes below.
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

In [None]:
class XLMRobertaForTextClassification(nn.Module):
    """XLM-RoBERTa encoder followed by a small feed-forward classification head.

    Args:
        num_classes: Number of output classes (size of the logit vector).
        model_name: Checkpoint name or path handed to
            ``XLMRobertaModel.from_pretrained``.
    """

    def __init__(self, num_classes=3, model_name='xlm-roberta-large'):
        super().__init__()

        # Pre-trained encoder; its config supplies the hidden width
        self.roberta = XLMRobertaModel.from_pretrained(model_name)
        width = self.roberta.config.hidden_size

        # Head: same-width dense layer -> tanh -> projection to class logits
        dense = nn.Linear(width, width)
        out_proj = nn.Linear(width, num_classes)
        self.classifier = nn.Sequential(dense, nn.Tanh(), out_proj)

    def forward(self, input_ids, attention_mask):
        """Return the class logits computed from the first token's hidden state."""
        encoded = self.roberta(input_ids=input_ids, attention_mask=attention_mask)

        # Use the hidden state at sequence position 0 as the sentence summary
        first_token_state = encoded.last_hidden_state[:, 0, :]

        return self.classifier(first_token_state)

In [None]:
# Load the dataset; the code below reads its 'Text' and 'Polarity' columns
df = pd.read_csv('/kaggle/input/final-dataset/final-dataset.csv')

# Map sentiment labels to integer class ids
label_map = {'positive': 0, 'negative': 1, 'neutral': 2}
df['label_encoded'] = df['Polarity'].map(label_map)

# If anything failed to map, retry after normalizing case and whitespace
if df['label_encoded'].isnull().any():
    print("Warning: Some labels couldn't be mapped. Unique values in Polarity column:")
    print(df['Polarity'].unique())
    # Handle any case sensitivity issues
    df['Polarity'] = df['Polarity'].str.lower().str.strip()
    df['label_encoded'] = df['Polarity'].map(label_map)

# Rows that still have no class id, or no text at all, cannot be trained on:
# a NaN label would be cast to a meaningless integer when converted to a long
# tensor, and a missing text would be tokenized as the literal string "nan".
unusable_rows = df['label_encoded'].isnull() | df['Text'].isnull()
if unusable_rows.any():
    print(f"Dropping {int(unusable_rows.sum())} row(s) with a missing text or an unmapped label.")
    df = df.loc[~unusable_rows].reset_index(drop=True)

# A column that ever contained NaN is float; class ids must be integers
df['label_encoded'] = df['label_encoded'].astype('int64')

# Split the data (roughly 80% train, 10% validation, 10% test)
texts = df['Text'].values
labels = df['label_encoded'].values

# First split: 90% train+val, 10% test
X_temp, X_test, y_temp, y_test = train_test_split(
    texts, labels, test_size=0.1, random_state=42, stratify=labels
)

# Second split: 0.111 (about 1/9) of the remaining 90% goes to validation,
# which is roughly 10% of the full dataset
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.111, random_state=42, stratify=y_temp
)

print(f"Train size: {len(X_train)}")
print(f"Validation size: {len(X_val)}")
print(f"Test size: {len(X_test)}")

# Print label distribution (reverse lookup is built once, not per class)
id_to_label = {class_id: name for name, class_id in label_map.items()}
print("\nLabel distribution in training set:")
unique, counts = np.unique(y_train, return_counts=True)
for label, count in zip(unique, counts):
    label_name = id_to_label[int(label)]
    print(f"{label_name}: {count} ({count/len(y_train)*100:.2f}%)")

In [None]:
# Tokenizer for the 'xlm-roberta-large' checkpoint
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-large')

# Every text is truncated/padded to this many tokens.
# Original note: the paper uses different lengths per dataset, and 30-100
# tokens for sentiment analysis on short texts. Adjust to your text length.
max_length = 100

# One dataset per split, all sharing the tokenizer and sequence length
train_dataset, val_dataset, test_dataset = (
    BanglaTextDataset(split_texts, split_labels, tokenizer, max_length)
    for split_texts, split_labels in (
        (X_train, y_train),
        (X_val, y_val),
        (X_test, y_test),
    )
)

# Original note: the paper mentions a batch size of 32.
batch_size = 32  # Reduce if you run into memory issues

# Only the training split is shuffled; validation and test keep their order
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader, test_loader = (
    DataLoader(split_dataset, batch_size=batch_size, shuffle=False)
    for split_dataset in (val_dataset, test_dataset)
)

In [None]:
# Build the 3-class classifier and move it to the selected device in one step
model = XLMRobertaForTextClassification(num_classes=3).to(device)

# Cross-entropy over the class logits
criterion = nn.CrossEntropyLoss()

# Adam over every model parameter
# (original note: learning rate 1e-5 as mentioned in the paper)
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-5)

# Number of passes over the training set (original note: the paper uses 10)
num_epochs = 10

In [None]:
def train_epoch(model, data_loader, criterion, optimizer, device):
    """Run one optimization pass over ``data_loader`` and report training metrics.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that
            returns one row of class logits per sample.
        data_loader: Iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'label'`` tensors.
        criterion: Loss callable taking ``(logits, labels)``.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.

    Returns:
        tuple: ``(avg_loss, accuracy, f1)`` where ``avg_loss`` is the
        sample-weighted mean loss and ``f1`` is the weighted F1 score.

    Raises:
        ValueError: If ``data_loader`` yields no samples.
    """
    model.train()
    loss_sum = 0.0
    n_samples = 0
    predictions = []
    actual_labels = []

    for batch in tqdm(data_loader, desc="Training"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)

        loss.backward()
        optimizer.step()

        # Weight each batch by its size so a smaller final batch does not
        # count as much as a full one.
        # NOTE(review): assumes criterion returns the batch mean (the default
        # reduction of nn.CrossEntropyLoss) - confirm if a custom loss is used.
        batch_count = labels.size(0)
        loss_sum += loss.item() * batch_count
        n_samples += batch_count

        # Predicted class = index of the largest logit in each row
        predictions.extend(outputs.argmax(dim=1).tolist())
        actual_labels.extend(labels.tolist())

    if n_samples == 0:
        # Fail with a clear message instead of an opaque ZeroDivisionError
        raise ValueError("data_loader yielded no samples; cannot compute training metrics")

    avg_loss = loss_sum / n_samples
    accuracy = accuracy_score(actual_labels, predictions)
    f1 = f1_score(actual_labels, predictions, average='weighted')

    return avg_loss, accuracy, f1

def evaluate(model, data_loader, criterion, device):
    """Score ``model`` on ``data_loader`` without updating any weights.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that
            returns one row of class logits per sample.
        data_loader: Iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'label'`` tensors.
        criterion: Loss callable taking ``(logits, labels)``.
        device: Device the batch tensors are moved to.

    Returns:
        tuple: ``(avg_loss, accuracy, f1, predictions, actual_labels)`` where
        ``avg_loss`` is the sample-weighted mean loss, ``f1`` is the weighted
        F1 score, and the last two items are lists of integer class ids in
        iteration order.

    Raises:
        ValueError: If ``data_loader`` yields no samples.
    """
    model.eval()
    loss_sum = 0.0
    n_samples = 0
    predictions = []
    actual_labels = []

    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)

            # Weight each batch by its size so a smaller final batch does not
            # count as much as a full one.
            # NOTE(review): assumes criterion returns the batch mean (the
            # default reduction of nn.CrossEntropyLoss) - confirm if a custom
            # loss is used.
            batch_count = labels.size(0)
            loss_sum += loss.item() * batch_count
            n_samples += batch_count

            # Predicted class = index of the largest logit in each row
            predictions.extend(outputs.argmax(dim=1).tolist())
            actual_labels.extend(labels.tolist())

    if n_samples == 0:
        # Fail with a clear message instead of an opaque ZeroDivisionError
        raise ValueError("data_loader yielded no samples; cannot compute evaluation metrics")

    avg_loss = loss_sum / n_samples
    accuracy = accuracy_score(actual_labels, predictions)
    f1 = f1_score(actual_labels, predictions, average='weighted')

    return avg_loss, accuracy, f1, predictions, actual_labels

In [None]:
# Training loop: train for num_epochs, validate after each epoch, and keep
# the weights of the epoch with the highest validation accuracy.
#
# The baseline starts below any reachable accuracy so that the first epoch
# always writes a checkpoint. With a baseline of 0 and a strict ">" test, a
# run whose validation accuracy never rises above 0.0 would write no file at
# all, leaving a later load to fail or to pick up a stale checkpoint.
best_val_accuracy = float('-inf')
best_model_path = 'best_xlm_roberta_bangla.pt'

for epoch in range(num_epochs):
    print(f"\nEpoch {epoch + 1}/{num_epochs}")
    print("-" * 50)

    # Train
    train_loss, train_acc, train_f1 = train_epoch(
        model, train_loader, criterion, optimizer, device
    )
    print(f"Train Loss: {train_loss:.4f}, Accuracy: {train_acc:.4f}, F1: {train_f1:.4f}")

    # Validate (the returned predictions and labels are not needed here)
    val_loss, val_acc, val_f1, _, _ = evaluate(
        model, val_loader, criterion, device
    )
    print(f"Val Loss: {val_loss:.4f}, Accuracy: {val_acc:.4f}, F1: {val_f1:.4f}")

    # Save best model based on validation accuracy
    if val_acc > best_val_accuracy:
        best_val_accuracy = val_acc
        torch.save(model.state_dict(), best_model_path)
        print(f"Best model saved with validation accuracy: {val_acc:.4f}")

In [None]:
# Load best model.
# map_location remaps the saved tensors onto the device in use now, so a
# checkpoint written on a GPU can still be loaded in a CPU-only session.
model.load_state_dict(torch.load(best_model_path, map_location=device))

# Evaluate on test set
test_loss, test_acc, test_f1, predictions, actual_labels = evaluate(
    model, test_loader, criterion, device
)

print("\nTest Results:")
print(f"Test Loss: {test_loss:.4f}")
print(f"Test Accuracy: {test_acc:.4f}")
print(f"Test F1 Score: {test_f1:.4f}")

# Detailed classification report.
# Class names are derived from label_map, ordered by class id, so they cannot
# drift from the encoding used for training. Passing `labels` explicitly keeps
# the report working when a class is absent from the test labels/predictions;
# without it the number of target names would not match and the call fails.
label_names = sorted(label_map, key=label_map.get)
print("\nClassification Report:")
print(classification_report(
    actual_labels,
    predictions,
    labels=[label_map[name] for name in label_names],
    target_names=label_names,
))

In [None]:
def predict_sentiment(text, model, tokenizer, device, max_length=100):
    """Classify the sentiment of one text.

    Args:
        text: The text to classify.
        model: Module called as ``model(input_ids, attention_mask)`` that
            returns class logits; it is switched to eval mode.
        tokenizer: Callable that turns ``text`` into ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        device: Device the encoded tensors are moved to.
        max_length: Length passed to the tokenizer for truncation/padding.

    Returns:
        tuple: The predicted label (``'positive'``, ``'negative'`` or
        ``'neutral'``) and a NumPy array holding the softmax probability of
        each class for this text.
    """
    # Class id -> label name (0: positive, 1: negative, 2: neutral)
    id_to_name = dict(enumerate(('positive', 'negative', 'neutral')))

    model.eval()

    encoded = tokenizer(
        text,
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_tensors='pt'
    )
    ids = encoded['input_ids'].to(device)
    mask = encoded['attention_mask'].to(device)

    # Forward pass without gradient tracking
    with torch.no_grad():
        logits = model(ids, mask)
        winner = torch.argmax(logits, dim=1)

    # Name of the highest-scoring class
    predicted_label = id_to_name[winner.item()]

    # Normalize the logits into per-class probabilities
    class_probs = torch.softmax(logits, dim=1)

    return predicted_label, class_probs.cpu().numpy()[0]

# Demo: classify one Bangla sentence and show the per-class probabilities
sample_text = "এই মডেলটি খুব ভালো কাজ করছে"  # "This model is working very well"
predicted_label, probs = predict_sentiment(sample_text, model, tokenizer, device)

# A single print call; sep="\n" puts each report line on its own line
print(
    f"Text: {sample_text}",
    f"Predicted: {predicted_label}",
    f"Probabilities - Positive: {probs[0]:.4f}, Negative: {probs[1]:.4f}, Neutral: {probs[2]:.4f}",
    sep="\n",
)