# MARBERT Fine-Tuning for Arabic Sentiment Analysis

This notebook demonstrates the complete process of fine-tuning MARBERT on the ASTD dataset for Arabic sentiment analysis.

## Overview
- Load and preprocess ASTD dataset
- Setup MARBERT model and tokenizer
- Fine-tune using Hugging Face Trainer
- Evaluate with Macro-F1 metric
- Save the fine-tuned model

## 1. Install Required Dependencies

In [None]:
# Install required packages
!pip install transformers datasets torch pandas numpy scikit-learn tqdm matplotlib seaborn

## 2. Import Libraries and Setup

In [None]:
import inspect
import json
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from sklearn.metrics import classification_report, f1_score, accuracy_score
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification, 
    TrainingArguments, 
    Trainer, 
    DataCollatorWithPadding
)
warnings.filterwarnings('ignore')

# Choose the compute device: CUDA when torch reports it available, CPU otherwise
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

# Fix the torch and NumPy global RNG seeds so reruns are repeatable
torch.manual_seed(42)
np.random.seed(42)

## 3. Data Loading and Preprocessing

Load the ASTD dataset and preprocess the text data.

In [None]:
# Sentiment tag -> integer class id used as the training target
label_map = dict(
    POS=1,      # Positive
    NEG=0,      # Negative
    NEUTRAL=2,  # Neutral
    OBJ=3,      # Objective
)

# Inverse lookup: integer class id -> sentiment tag
reverse_label_map = dict(zip(label_map.values(), label_map.keys()))
print("Label mapping:", label_map)

In [None]:
def load_astd_data(data_dir='data'):
    """Read the ASTD tweets file into a DataFrame.

    The file ``<data_dir>/raw/Tweets.txt`` is read as UTF-8, one record per
    line, with tab-separated fields: the first field is the tweet text and
    the second is the sentiment tag. Blank lines, lines with fewer than two
    fields, and lines whose tag is not a key of ``label_map`` are skipped.

    Args:
        data_dir: Directory that contains the ``raw`` sub-directory.

    Returns:
        pandas.DataFrame with a ``text`` column (stripped tweet text) and a
        ``label`` column (integer id taken from ``label_map``).
    """
    tweets_path = os.path.join(data_dir, 'raw', 'Tweets.txt')

    texts, label_ids = [], []

    with open(tweets_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            stripped = raw_line.strip()
            if not stripped:
                continue

            # Fields are tab-separated; only the first two are used
            fields = stripped.split('\t')
            if len(fields) < 2:
                continue

            tag = fields[1].strip()
            if tag not in label_map:
                continue

            texts.append(fields[0].strip())
            label_ids.append(label_map[tag])

    return pd.DataFrame({'text': texts, 'label': label_ids})

# Load data
# Reads <data_dir>/raw/Tweets.txt using the default data_dir of 'data'.
df = load_astd_data()
print(f"Dataset shape: {df.shape}")
# Counts per integer class id, ordered by id (see label_map for the tag names)
print(f"Label distribution:\n{df['label'].value_counts().sort_index()}")
df.head()

## 4. Text Preprocessing

Clean and normalize Arabic text data.

In [None]:
import re

def clean_arabic_text(text):
    """Clean and normalize a tweet for tokenization.

    Steps, in order: drop URLs, drop @mentions and #hashtags (including the
    tag text), drop every character that is neither whitespace, a Unicode
    word character, nor inside the listed Arabic blocks, then collapse runs
    of whitespace to a single space and trim both ends.

    Args:
        text: Raw tweet. Any non-string value (None, NaN, numbers, lists)
            is treated as missing.

    Returns:
        The cleaned string, or "" when the input is not a string.
    """
    # A plain type check covers None/NaN as well, and unlike pd.isna it cannot
    # raise an ambiguous-truth-value error on list-like input.
    if not isinstance(text, str):
        return ""

    # Remove URLs (`http\S+` already covers the https case)
    text = re.sub(r'http\S+|www\S+', '', text)

    # Remove mentions and hashtags
    text = re.sub(r'@\w+|#\w+', '', text)

    # Remove special characters but keep Arabic
    text = re.sub(r'[^\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF\s\w]', '', text)

    # Collapse whitespace last: the removals above can leave adjacent spaces
    # behind (e.g. "a ! b" -> "a  b"), so normalizing earlier would miss them.
    text = re.sub(r'\s+', ' ', text)

    return text.strip()

# Apply preprocessing
# Keeps the raw tweet in 'text' and stores the normalized version alongside it.
df['cleaned_text'] = df['text'].apply(clean_arabic_text)

# Remove empty texts
# Rows whose cleaned text is empty are dropped and the index is renumbered
# from zero.
df = df[df['cleaned_text'].str.len() > 0].reset_index(drop=True)
print(f"After preprocessing: {df.shape}")
df.head()

## 5. Model and Tokenizer Setup

Load MARBERT model and tokenizer for sequence classification.

In [None]:
# Load MARBERT tokenizer and model
model_name = "UBC-NLP/MARBERT"
num_labels = len(label_map)

print(f"Loading tokenizer from {model_name}...")
tokenizer = AutoTokenizer.from_pretrained(model_name)

print(f"Loading model from {model_name}...")
# Storing the label names in the model config means the saved checkpoint
# carries POS/NEG/NEUTRAL/OBJ instead of generic LABEL_n names, so it can be
# decoded without this notebook's label_map.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    id2label=reverse_label_map,
    label2id=label_map,
    ignore_mismatched_sizes=True
)

print(f"Model loaded with {num_labels} labels")
print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")

In [None]:
# Test tokenization
# Sanity check on the first cleaned tweet: encode it, then decode it again to
# inspect what the tokenizer produces.
sample_text = df['cleaned_text'].iloc[0]
print(f"Sample text: {sample_text}")

# Single-example encoding returned as PyTorch tensors, truncated at 128 tokens
encoded = tokenizer(
    sample_text,
    truncation=True,
    padding=True,
    max_length=128,
    return_tensors='pt'
)

print(f"Input IDs shape: {encoded['input_ids'].shape}")
print(f"Attention mask shape: {encoded['attention_mask'].shape}")
print(f"Decoded: {tokenizer.decode(encoded['input_ids'][0])}")

## 6. Dataset Preparation

Create the PyTorch datasets for the train, validation, and test splits. The `Trainer` builds the dataloaders from them.

In [None]:
class ASTDDataset(Dataset):
    """Map-style dataset that tokenizes one text per access.

    Args:
        texts: Indexable collection of texts; each item is passed through
            ``str()`` before tokenization.
        labels: Indexable collection of integer class ids, aligned with
            ``texts``.
        tokenizer: Callable accepting ``truncation``, ``padding``,
            ``max_length`` and ``return_tensors`` keyword arguments and
            returning a mapping with ``input_ids`` and ``attention_mask``.
        max_length: Length every example is truncated or padded to.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of examples, taken from ``texts``."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return flattened ``input_ids``, ``attention_mask`` and a long ``labels`` tensor."""
        sample_text = str(self.texts[idx])
        sample_label = self.labels[idx]

        # Every example is padded to max_length here, independent of any
        # padding a downstream collator may apply.
        tokenized = self.tokenizer(
            sample_text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

        # Flatten the tokenizer output to one-dimensional tensors
        item = {
            field: tokenized[field].flatten()
            for field in ('input_ids', 'attention_mask')
        }
        item['labels'] = torch.tensor(sample_label, dtype=torch.long)
        return item

In [None]:
# Split data
# First cut: 70% train / 30% held out, stratified on the label column so each
# part keeps the class proportions of the full frame.
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    df['cleaned_text'].values, 
    df['label'].values, 
    test_size=0.3, 
    random_state=42, 
    stratify=df['label']
)

# Second cut: the held-out 30% is halved into validation and test
# (roughly 15% of the data each), again stratified.
val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts, 
    temp_labels, 
    test_size=0.5, 
    random_state=42, 
    stratify=temp_labels
)

print(f"Train: {len(train_texts)}")
print(f"Validation: {len(val_texts)}")
print(f"Test: {len(test_texts)}")

# Create datasets
# All three use ASTDDataset's default max_length.
train_dataset = ASTDDataset(train_texts, train_labels, tokenizer)
val_dataset = ASTDDataset(val_texts, val_labels, tokenizer)
test_dataset = ASTDDataset(test_texts, test_labels, tokenizer)

## 7. Training Setup and Execution

Configure training arguments and start fine-tuning.

In [None]:
# Metrics callback handed to the Trainer
def compute_metrics(eval_pred):
    """Turn raw model outputs into accuracy and macro-averaged F1.

    Args:
        eval_pred: Pair ``(predictions, labels)``; ``predictions`` holds
            per-class scores, with classes along axis 1.

    Returns:
        dict with the keys ``accuracy`` and ``macro_f1``.
    """
    scores, gold = eval_pred
    # The predicted class is the index of the highest score in each row
    predicted = np.argmax(scores, axis=1)

    return {
        'accuracy': accuracy_score(gold, predicted),
        'macro_f1': f1_score(gold, predicted, average='macro'),
    }

In [None]:
# Training arguments
# transformers renamed `evaluation_strategy` to `eval_strategy`, and newer
# releases reject the old spelling. Use whichever keyword the installed
# version accepts, so the cell works with an unpinned install.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./marbert-sentiment-model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",  # must match the evaluation strategy for load_best_model_at_end
    load_best_model_at_end=True,
    metric_for_best_model="macro_f1",  # key returned by compute_metrics
    greater_is_better=True,
    # NOTE(review): 500 warmup steps may be a large share of a 3-epoch run on
    # a small dataset - confirm against the actual number of training steps.
    warmup_steps=500,
    logging_steps=100,
    save_total_limit=2,
    dataloader_num_workers=4,
    remove_unused_columns=False,
    # The string "none" is what turns off wandb and the other logging
    # integrations; passing None leaves the library default in place.
    report_to="none",
    **{eval_strategy_key: "epoch"},
)

print("Training arguments:")
for key, value in training_args.__dict__.items():
    if not key.startswith('_'):
        print(f"  {key}: {value}")

In [None]:
# Initialize trainer
# Newer transformers releases take the tokenizer through `processing_class`
# and deprecate the `tokenizer` keyword. Pass it under whichever name the
# installed Trainer accepts.
_trainer_arg_names = inspect.signature(Trainer.__init__).parameters
tokenizer_key = "processing_class" if "processing_class" in _trainer_arg_names else "tokenizer"

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    **{tokenizer_key: tokenizer},
)

print("Trainer initialized successfully!")

In [None]:
# Start training
# Runs fine-tuning with the settings in training_args. Validation runs once
# per epoch, and the checkpoint with the best macro_f1 is loaded at the end.
print("Starting training...")
trainer.train()

print("Training completed!")

## 8. Model Evaluation

Evaluate the fine-tuned model on test data.

In [None]:
# Evaluate on test set
# test_results is read again in the final summary cell, which looks up the
# 'eval_accuracy' and 'eval_macro_f1' keys.
print("Evaluating on test set...")
test_results = trainer.evaluate(test_dataset)
print(f"Test results: {test_results}")

In [None]:
# Get predictions
test_predictions = trainer.predict(test_dataset)
# Predicted class = index of the highest score in each row
predicted_labels = np.argmax(test_predictions.predictions, axis=1)
true_labels = test_predictions.label_ids

# Detailed classification report
# Passing `labels` explicitly pins each name to its class id. Without it,
# scikit-learn infers the classes from the data, so a class missing from both
# arrays would make `target_names` the wrong length.
report_class_ids = sorted(reverse_label_map)
print("Classification Report:")
print(classification_report(
    true_labels,
    predicted_labels,
    labels=report_class_ids,
    target_names=[reverse_label_map[i] for i in report_class_ids]
))

In [None]:
# Plot confusion matrix
from sklearn.metrics import confusion_matrix

# Fix the class order explicitly so the matrix always has one row and column
# per label and the tick names line up, even if a class never occurs.
cm_class_ids = sorted(reverse_label_map)
cm_class_names = [reverse_label_map[i] for i in cm_class_ids]

cm = confusion_matrix(true_labels, predicted_labels, labels=cm_class_ids)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=cm_class_names,
            yticklabels=cm_class_names)
plt.title('Confusion Matrix')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()

## 9. Save and Test Model

Save the fine-tuned model and test it on new examples.

In [None]:
# Save the model
# The model and the tokenizer are written to the same directory.
output_dir = "./marbert-sentiment-final"
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

print(f"Model saved to {output_dir}")

In [None]:
# Test the saved model
def predict_sentiment(text, model, tokenizer):
    """Predict the sentiment tag for a single text.

    Args:
        text: Raw input text; it is cleaned with ``clean_arabic_text`` first.
        model: Sequence-classification model whose output exposes ``logits``.
            It is switched to eval mode as a side effect.
        tokenizer: Tokenizer matching ``model``.

    Returns:
        Tuple ``(tag, confidence)``: the tag looked up in
        ``reverse_label_map`` and the softmax probability of that class.
    """
    # Training used cleaned text, so inference must see the same normalization
    text = clean_arabic_text(text)

    # Tokenize
    inputs = tokenizer(
        text,
        truncation=True,
        padding=True,
        max_length=128,
        return_tensors='pt'
    )

    # The Trainer leaves the model on the GPU when one is available, while the
    # tokenizer returns CPU tensors. Move the inputs to the model's device to
    # avoid a device-mismatch error.
    model_device = next(model.parameters()).device
    inputs = {name: tensor.to(model_device) for name, tensor in inputs.items()}

    # Predict
    model.eval()  # disable dropout so the prediction is deterministic
    with torch.no_grad():
        outputs = model(**inputs)
        predictions = torch.softmax(outputs.logits, dim=-1)
        predicted_class = torch.argmax(predictions, dim=-1).item()
        confidence = predictions[0][predicted_class].item()

    return reverse_label_map[predicted_class], confidence

# Test examples
test_examples = [
    "أنا سعيد جداً اليوم",  # I am very happy today
    "هذا سيء للغاية",      # This is very bad
    "الطقس عادي اليوم"      # Weather is normal today
]

# Reload from disk so this cell really exercises the saved files rather than
# the in-memory objects left over from training.
saved_tokenizer = AutoTokenizer.from_pretrained(output_dir)
saved_model = AutoModelForSequenceClassification.from_pretrained(output_dir)

print("Testing saved model:")
for example in test_examples:
    sentiment, conf = predict_sentiment(example, saved_model, saved_tokenizer)
    print(f"Text: {example}")
    print(f"Sentiment: {sentiment} (confidence: {conf:.3f})")
    print("-" * 50)

## 10. Summary and Next Steps

This notebook has demonstrated:
1. Loading and preprocessing the ASTD dataset
2. Setting up the MARBERT model and tokenizer
3. Creating a custom dataset class
4. Training with the Hugging Face Trainer
5. Evaluating with the Macro-F1 metric
6. Saving the fine-tuned model
7. Testing on new examples

### Next Steps:
- Try different hyperparameters
- Experiment with data augmentation
- Implement custom training loop for more control
- Use the model in production applications

In [None]:
# Model info
print("Final Model Information:")
print(f"Model type: MARBERT")
print(f"Number of labels: {num_labels}")
print(f"Labels: {list(label_map.keys())}")
print(f"Model saved to: {output_dir}")

# Format each metric only when it is present: applying ':.4f' to the 'N/A'
# fallback string raises ValueError.
for metric_title, metric_key in (
    ("Test accuracy", "eval_accuracy"),
    ("Test Macro-F1", "eval_macro_f1"),
):
    metric_value = test_results.get(metric_key)
    metric_text = f"{metric_value:.4f}" if metric_value is not None else "N/A"
    print(f"{metric_title}: {metric_text}")