In [None]:
!pip install transformers datasets evaluate scikit-learn pandas numpy

In [15]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
import evaluate
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)
from datasets import Dataset
from typing import Dict, List, Union
import logging

# Set up logging
logging.basicConfig(level=logging.INFO)

In [16]:
# Load data
train_df = pd.read_csv('training-data.csv')
test_df = pd.read_csv('testing-data.csv')

# Display a few samples to understand the data
train_df.head()

Unnamed: 0,tweet,sarcasm,sentiment,dialect
0,د محمود_العلايلي أرى أن الفريق أحمد_شفيق رقم م...,False,NEU,msa
1,مع فيدرر يا آجا والكبار,False,NEU,msa
2,الداعون لمبدأ الاختلاط بين الجنسين؛ كالداعين ل...,True,NEG,msa
3,_94 78 5 202 مساكين من الصبح و هوما رايحين راج...,True,NEG,gulf
4,قل شرق حلب ولا تقل حلب الشرقية وقل غرب حلب ولا...,False,NEU,msa


In [17]:
train_df = train_df.dropna()
test_df = test_df.dropna()

In [18]:
# Check dataset information
print(f"Training data shape: {train_df.shape}")
print(f"Testing data shape: {test_df.shape}")
print("\nColumns in dataset:")
print(train_df.columns.tolist())

# Check for missing values
print("\nMissing values in training data:")
print(train_df.isnull().sum())

# Check distribution of labels for each task
print("\nDialect distribution:")
print(train_df['dialect'].value_counts())

print("\nSarcasm distribution:")
print(train_df['sarcasm'].value_counts())

print("\nSentiment distribution:")
print(train_df['sentiment'].value_counts())

Training data shape: (12546, 4)
Testing data shape: (2999, 4)

Columns in dataset:
['tweet', 'sarcasm', 'sentiment', 'dialect']

Missing values in training data:
tweet        0
sarcasm      0
sentiment    0
dialect      0
dtype: int64

Dialect distribution:
dialect
msa       8560
egypt     2675
gulf       644
levant     624
magreb      43
Name: count, dtype: int64

Sarcasm distribution:
sarcasm
False    10379
True      2167
Name: count, dtype: int64

Sentiment distribution:
sentiment
NEU    5746
NEG    4620
POS    2180
Name: count, dtype: int64


In [19]:
def preprocess_and_encode_data(df: pd.DataFrame, task: str, tokenizer, max_length: int = 128):
    """Preprocess text data and encode labels for a specific task
    
    Args:
        df: DataFrame containing the data
        task: One of 'dialect', 'sarcasm', or 'sentiment'
        tokenizer: BERT tokenizer
        max_length: Maximum sequence length for tokenization
        
    Returns:
        Processed Dataset object with encoded inputs and labels
    """
    # Get label mapping based on the task
    if task == 'dialect':
        unique_labels = sorted(df['dialect'].unique())
        label_column = 'dialect'
    elif task == 'sarcasm':
        unique_labels = sorted(df['sarcasm'].unique())
        label_column = 'sarcasm'
    elif task == 'sentiment':
        unique_labels = sorted(df['sentiment'].unique())
        label_column = 'sentiment'
    else:
        raise ValueError(f"Unsupported task: {task}")
    
    label_mapping = {label: i for i, label in enumerate(unique_labels)}
    print(f"Label mapping for {task}: {label_mapping}")
    
    # Encode labels
    labels = [label_mapping[label] for label in df[label_column]]
    
    # Create dataset
    dataset_dict = {
        "text": df["tweet"].tolist(),
        "label": labels
    }
    dataset = Dataset.from_dict(dataset_dict)
    
    # Tokenize function
    def tokenize_function(examples):
        return tokenizer(
            examples["text"],
            padding="max_length",
            truncation=True,
            max_length=max_length,
        )
    
    # Tokenize all examples
    tokenized_dataset = dataset.map(tokenize_function, batched=True)
    
    return tokenized_dataset, label_mapping

In [20]:
def compute_metrics(eval_pred):
    """Compute metrics for evaluation"""
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    
    accuracy = evaluate.load("accuracy")
    f1 = evaluate.load("f1")
    precision = evaluate.load("precision")
    recall = evaluate.load("recall")
    
    accuracy_score = accuracy.compute(predictions=predictions, references=labels)["accuracy"]
    # For multiclass, we use macro averaging
    f1_score = f1.compute(predictions=predictions, references=labels, average="macro")["f1"]
    precision_score = precision.compute(predictions=predictions, references=labels, average="macro")["precision"]
    recall_score = recall.compute(predictions=predictions, references=labels, average="macro")["recall"]
    
    return {
        "accuracy": accuracy_score,
        "f1": f1_score,
        "precision": precision_score,
        "recall": recall_score
    }

In [13]:
def fine_tune_model(task: str, 
                 train_dataset, 
                 eval_dataset, 
                 model, 
                 num_labels: int,
                 output_dir: str,
                 epochs: int = 3,
                 batch_size: int = 16):
    """Fine-tune MARBERT for a specific task
    
    Args:
        task: The name of the task ('dialect', 'sarcasm', or 'sentiment')
        train_dataset: Training dataset
        eval_dataset: Evaluation dataset
        model: Pre-trained or previously fine-tuned model
        num_labels: Number of labels for the task
        output_dir: Directory to save the model
        epochs: Number of training epochs
        batch_size: Training batch size
        
    Returns:
        Fine-tuned model
    """
    logging.info(f"Fine-tuning model for task: {task}")
    
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    
    # Set up training arguments
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=epochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        learning_rate=2e-5,
        weight_decay=0.01,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        report_to="none",  # Disable reporting to avoid wandb or other integrations
    )
    
    # Set up trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
    )
    
    # Train the model
    trainer.train()
    
    # Evaluate the model
    eval_results = trainer.evaluate()
    logging.info(f"Evaluation results for {task}: {eval_results}")
    
    # Save the model
    trainer.save_model(output_dir)
    logging.info(f"Model saved to {output_dir}")
    
    return model

In [14]:
# Define paths for saving models
dialect_model_path = "marbert_dialect"
sarcasm_model_path = "marbert_sarcasm"
sentiment_model_path = "marbert_sentiment"

# Define hyperparameters
model_name = "UBC-NLP/MARBERT"
max_length = 128
batch_size = 16
epochs = 3
random_state = 42

In [21]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [13]:
# Preprocess data for dialect detection task
train_dialect_dataset, dialect_label_mapping = preprocess_and_encode_data(
    train_df, 'dialect', tokenizer, max_length
)
eval_dialect_dataset, _ = preprocess_and_encode_data(
    test_df, 'dialect', tokenizer, max_length
)

# Load base MARBERT model for dialect detection
dialect_model = AutoModelForSequenceClassification.from_pretrained(
    model_name, 
    num_labels=len(dialect_label_mapping),
    ignore_mismatched_sizes=True
)

# Fine-tune for dialect detection
dialect_model = fine_tune_model(
    task='dialect',
    train_dataset=train_dialect_dataset,
    eval_dataset=eval_dialect_dataset,
    model=dialect_model,
    num_labels=len(dialect_label_mapping),
    output_dir=dialect_model_path,
    epochs=epochs,
    batch_size=batch_size
)

Label mapping for dialect: {'egypt': 0, 'gulf': 1, 'levant': 2, 'magreb': 3, 'msa': 4}


Map:   0%|          | 0/12546 [00:00<?, ? examples/s]

Label mapping for dialect: {'egypt': 0, 'gulf': 1, 'levant': 2, 'magreb': 3, 'msa': 4}


Map:   0%|          | 0/2999 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at UBC-NLP/MARBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
INFO:root:Fine-tuning model for task: dialect


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.6919,0.769323,0.660887,0.389893,0.367127,0.4716
2,0.4976,0.760063,0.702901,0.395953,0.36912,0.481044
3,0.3654,0.903752,0.700567,0.385057,0.365866,0.457671


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.79k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.56k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.38k [00:00<?, ?B/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  _warn_prf(average, modifier, msg_start, len(result))
INFO:root:Evaluation results for dialect: {'eval_loss': 0.7600627541542053, 'eval_accuracy': 0.7029009669889963, 'eval_f1': 0.3959532000665623, 'eval_precision': 0.3691197775084424, 'eval_recall': 0.48104365588404663, 'eval_runtime': 10.6426, 'eval_samples_per_second': 281.792, 'eval_steps_per_second': 17.665, 'epoch': 3.0}
INFO:root:Model saved to marbert_dialect


In [26]:
# Preprocess data for dialect detection task
train_dialect_dataset, dialect_label_mapping = preprocess_and_encode_data(
    train_df, 'dialect', tokenizer, max_length
)
eval_dialect_dataset, _ = preprocess_and_encode_data(
    test_df, 'dialect', tokenizer, max_length
)

# Preprocess data for sarcasm detection task
train_sarcasm_dataset, sarcasm_label_mapping = preprocess_and_encode_data(
    train_df, 'sarcasm', tokenizer, max_length
)
eval_sarcasm_dataset, _ = preprocess_and_encode_data(
    test_df, 'sarcasm', tokenizer, max_length
)

# Preprocess data for sentiment classification task
train_sentiment_dataset, sentiment_label_mapping = preprocess_and_encode_data(
    train_df, 'sentiment', tokenizer, max_length
)
eval_sentiment_dataset, _ = preprocess_and_encode_data(
    test_df, 'sentiment', tokenizer, max_length
)


Label mapping for dialect: {'egypt': 0, 'gulf': 1, 'levant': 2, 'magreb': 3, 'msa': 4}


Map:   0%|          | 0/12546 [00:00<?, ? examples/s]

Label mapping for dialect: {'egypt': 0, 'gulf': 1, 'levant': 2, 'magreb': 3, 'msa': 4}


Map:   0%|          | 0/2999 [00:00<?, ? examples/s]

Label mapping for sarcasm: {False: 0, True: 1}


Map:   0%|          | 0/12546 [00:00<?, ? examples/s]

Label mapping for sarcasm: {False: 0, True: 1}


Map:   0%|          | 0/2999 [00:00<?, ? examples/s]

Label mapping for sentiment: {'NEG': 0, 'NEU': 1, 'POS': 2}


Map:   0%|          | 0/12546 [00:00<?, ? examples/s]

Label mapping for sentiment: {'NEG': 0, 'NEU': 1, 'POS': 2}


Map:   0%|          | 0/2999 [00:00<?, ? examples/s]

In [14]:
# Preprocess data for sarcasm detection task
train_sarcasm_dataset, sarcasm_label_mapping = preprocess_and_encode_data(
    train_df, 'sarcasm', tokenizer, max_length
)
eval_sarcasm_dataset, _ = preprocess_and_encode_data(
    test_df, 'sarcasm', tokenizer, max_length
)

# Load fine-tuned dialect model for sarcasm detection
sarcasm_model = AutoModelForSequenceClassification.from_pretrained(
    dialect_model_path,
    num_labels=len(sarcasm_label_mapping),
    ignore_mismatched_sizes=True
)

# Fine-tune for sarcasm detection
sarcasm_model = fine_tune_model(
    task='sarcasm',
    train_dataset=train_sarcasm_dataset,
    eval_dataset=eval_sarcasm_dataset,
    model=sarcasm_model,
    num_labels=len(sarcasm_label_mapping),
    output_dir=sarcasm_model_path,
    epochs=epochs,
    batch_size=batch_size
)

Label mapping for sarcasm: {False: 0, True: 1}


Map:   0%|          | 0/12546 [00:00<?, ? examples/s]

Label mapping for sarcasm: {False: 0, True: 1}


Map:   0%|          | 0/2999 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at marbert_dialect and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([5]) in the checkpoint and torch.Size([2]) in the model instantiated
- classifier.weight: found shape torch.Size([5, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
INFO:root:Fine-tuning model for task: sarcasm


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.3455,0.46051,0.761921,0.720289,0.711235,0.738571
2,0.2098,0.688361,0.776259,0.707258,0.717398,0.699873
3,0.1425,1.035206,0.773258,0.701167,0.713252,0.692874


INFO:root:Evaluation results for sarcasm: {'eval_loss': 0.46050962805747986, 'eval_accuracy': 0.7619206402134044, 'eval_f1': 0.7202890213278239, 'eval_precision': 0.7112351933957173, 'eval_recall': 0.7385707926345729, 'eval_runtime': 10.7348, 'eval_samples_per_second': 279.372, 'eval_steps_per_second': 17.513, 'epoch': 3.0}
INFO:root:Model saved to marbert_sarcasm


In [15]:
# Preprocess data for sentiment classification task
train_sentiment_dataset, sentiment_label_mapping = preprocess_and_encode_data(
    train_df, 'sentiment', tokenizer, max_length
)
eval_sentiment_dataset, _ = preprocess_and_encode_data(
    test_df, 'sentiment', tokenizer, max_length
)

# Load fine-tuned sarcasm model for sentiment classification
sentiment_model = AutoModelForSequenceClassification.from_pretrained(
    sarcasm_model_path,
    num_labels=len(sentiment_label_mapping),
    ignore_mismatched_sizes=True
)

# Fine-tune for sentiment classification
sentiment_model = fine_tune_model(
    task='sentiment',
    train_dataset=train_sentiment_dataset,
    eval_dataset=eval_sentiment_dataset,
    model=sentiment_model,
    num_labels=len(sentiment_label_mapping),
    output_dir=sentiment_model_path,
    epochs=epochs,
    batch_size=batch_size
)

Label mapping for sentiment: {'NEG': 0, 'NEU': 1, 'POS': 2}


Map:   0%|          | 0/12546 [00:00<?, ? examples/s]

Label mapping for sentiment: {'NEG': 0, 'NEU': 1, 'POS': 2}


Map:   0%|          | 0/2999 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at marbert_sarcasm and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([2]) in the checkpoint and torch.Size([3]) in the model instantiated
- classifier.weight: found shape torch.Size([2, 768]) in the checkpoint and torch.Size([3, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
INFO:root:Fine-tuning model for task: sentiment


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.6759,0.754171,0.689897,0.643402,0.641646,0.658053
2,0.3882,0.880604,0.684895,0.649936,0.645133,0.655671
3,0.2505,1.19989,0.670557,0.633926,0.625838,0.64545


INFO:root:Evaluation results for sentiment: {'eval_loss': 0.880604088306427, 'eval_accuracy': 0.6848949649883295, 'eval_f1': 0.6499363937577665, 'eval_precision': 0.645132868559123, 'eval_recall': 0.6556710365352415, 'eval_runtime': 10.7769, 'eval_samples_per_second': 278.281, 'eval_steps_per_second': 17.445, 'epoch': 3.0}
INFO:root:Model saved to marbert_sentiment


In [22]:
def predict_with_model(text, model_path, label_mapping):
    """Make a prediction using a fine-tuned model
    
    Args:
        text: The input Arabic text
        model_path: Path to the fine-tuned model
        label_mapping: Dictionary mapping numerical labels to text labels
        
    Returns:
        Predicted label and confidence score
    """
    # Load model and tokenizer
    model = AutoModelForSequenceClassification.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    
    # Tokenize input
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=128, padding="max_length")
    
    # Make prediction
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        probabilities = torch.nn.functional.softmax(logits, dim=1)
        prediction = torch.argmax(probabilities, dim=1).item()
        confidence = probabilities[0][prediction].item()
    
    # Map numerical label back to text label
    inverse_mapping = {v: k for k, v in label_mapping.items()}
    predicted_label = inverse_mapping[prediction]
    print(predicted_label)
    return predicted_label, confidence

In [23]:
# Example of how to use the fine-tuned models for inference
def analyze_arabic_text(text):
    """Analyze Arabic text using all three fine-tuned models
    
    Args:
        text: Arabic text input
        
    Returns:
        Dictionary with predictions for all three tasks
    """
    # Predict dialect
    dialect, dialect_confidence = predict_with_model(
        text, dialect_model_path, dialect_label_mapping
    )
    
    # Predict sarcasm
    sarcasm, sarcasm_confidence = predict_with_model(
        text, sarcasm_model_path, sarcasm_label_mapping
    )
    
    # Predict sentiment
    sentiment, sentiment_confidence = predict_with_model(
        text, sentiment_model_path, sentiment_label_mapping
    )
    
    return {
        "text": text,
        "dialect": {
            "prediction": dialect,
            "confidence": f"{dialect_confidence:.4f}"
        },
        "sarcasm": {
            "prediction": sarcasm,
            "confidence": f"{sarcasm_confidence:.4f}"
        },
        "sentiment": {
            "prediction": sentiment,
            "confidence": f"{sentiment_confidence:.4f}"
        }
    }

In [38]:
tweets = train_df["tweet"].tolist()


# Uncomment to run predictions after training is complete

for i, example in enumerate(tweets):
    print(f"\nExample {i+1}: {example}")
    results = analyze_arabic_text(example)
    print(f"Dialect: {results['dialect']['prediction']} (confidence: {results['dialect']['confidence']})")
    print(f"Sarcasm: {results['sarcasm']['prediction']} (confidence: {results['sarcasm']['confidence']})")
    print(f"Sentiment: {results['sentiment']['prediction']} (confidence: {results['sentiment']['confidence']})")



Example 1: أنا سعيد جدا بهذا الخبر العظيم
msa
False
POS
Dialect: msa (confidence: 0.8563)
Sarcasm: False (confidence: 0.9931)
Sentiment: POS (confidence: 0.9524)

Example 2: هههههه والله انك مسخرة يا رجل
egypt
True
NEG
Dialect: egypt (confidence: 0.8838)
Sarcasm: True (confidence: 0.7588)
Sentiment: NEG (confidence: 0.9769)

Example 3: الطقس حار جدا اليوم في القاهرة
msa
False
NEU
Dialect: msa (confidence: 0.9409)
Sarcasm: False (confidence: 0.9927)
Sentiment: NEU (confidence: 0.9103)


In [46]:
true_dialects = []
pred_dialects = []

true_sarcasm = []
pred_sarcasm = []

true_sentiments = []
pred_sentiments = []

for i, row in train_df.iterrows():
    text = row["tweet"]
    
    # Avoid overwriting list variables by using different names for per-row values
    true_dialect_value = row["dialect"]
    true_sarcasm_value = row["sarcasm"]
    true_sentiment_value = row["sentiment"]
    
    result = analyze_arabic_text(text)
    
    # Append ground truths
    true_dialects.append(true_dialect_value)
    true_sarcasm.append(str(true_sarcasm_value).strip().lower())  # Standardize
    true_sentiments.append(true_sentiment_value)
    
    # Append predictions
    pred_dialects.append(result["dialect"]["prediction"])
    pred_sarcasm.append(str(result["sarcasm"]["prediction"]).strip().lower())  # Standardize
    pred_sentiments.append(result["sentiment"]["prediction"])


msa
False
NEU
msa
False
POS
msa
True
NEG
gulf
True
NEG
msa
False
NEU
msa
False
NEG
egypt
True
NEG
egypt
True
NEG
msa
False
NEU
msa
False
NEU
msa
False
NEU
gulf
False
POS
msa
False
NEG
msa
False
NEU
egypt
True
NEG
msa
False
NEU
msa
False
NEG
egypt
False
NEU
msa
True
NEG
msa
False
NEU
levant
False
POS
msa
True
NEG
msa
False
NEU
egypt
False
NEU
msa
False
NEG
msa
False
NEG
msa
True
NEG
levant
True
NEG
msa
False
NEG
msa
False
NEG
gulf
False
NEG
msa
False
NEU
levant
False
NEG
egypt
True
NEG
msa
False
NEU
msa
False
NEU
msa
False
POS
msa
False
NEU
egypt
False
NEG
egypt
False
NEG
msa
False
NEU
levant
False
NEG
msa
False
POS
msa
False
NEU
gulf
True
NEG
msa
False
NEU
msa
False
NEG
msa
False
NEG
msa
False
POS
msa
False
POS
msa
False
NEU
msa
False
NEU
msa
False
NEG
msa
False
NEG
gulf
False
NEG
msa
False
NEU
msa
False
POS
egypt
False
POS
msa
False
NEU
egypt
True
NEG
msa
False
NEU
msa
False
NEU
egypt
True
NEG
msa
False
NEU
msa
True
NEG
gulf
False
NEU
msa
False
POS
msa
False
NEU
msa
False
NEG
msa
True

In [47]:
from sklearn.metrics import accuracy_score

print(f"Dialect Accuracy: {accuracy_score(true_dialects, pred_dialects):.4f}")
print(f"Sarcasm Accuracy: {accuracy_score(true_sarcasm, pred_sarcasm):.4f}")
print(f"Sentiment Accuracy: {accuracy_score(true_sentiments, pred_sentiments):.4f}")


Dialect Accuracy: 0.8893
Sarcasm Accuracy: 0.9183
Sentiment Accuracy: 0.9379
