# Fine Tuning Arabertv2
We will fine-tune AraBERTv2 on our dataset (MADAR) so that it can identify Arabic dialects by region.

#### Tasks to do:
- Load Dataset ✔️
- Prepare the dataset ✔️
- Convert to Hugging Face Dataset ✔️
- Tokenization ✔️
- Model setup
- Training loop
- Run training
- Inference

### Imports

In [104]:
import evaluate
import numpy as np
import pandas as pd
import torch
from datasets import Dataset, DatasetDict
from transformers import (
    AutoModelForMaskedLM,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)

### Dataset
1. Loading dataset
2. Mapping each country to region
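3. Limiting each region to 11,000 sentences to balance it (6 regions and MSA, so 77,000 sentences)
4. Sampling those sentences at random within each region (random_state=42 for reproducibility); the rows stay grouped by region here and are shuffled later by the train/test split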

In [84]:
# Read the raw corpus from disk into a pandas DataFrame.
dataset = pd.read_csv("Dataset/data_v0.1.0.csv")

# Distinct dialect codes present in the file (used to check the region mapping).
pd.unique(dataset["dialect"])

array(['AE', 'BH', 'DZ', 'EG', 'IQ', 'JO', 'KW', 'LB', 'LY', 'MA', 'OM',
       'PL', 'QA', 'SA', 'SD', 'SY', 'TN', 'YE', 'MSA'], dtype=object)

In [85]:
# Map each country-level dialect code to the broader region used as the class label.
country_to_region = {
    "AE": "GULF",
    "BH": "GULF",
    "DZ": "NA",
    "EG": "NILE",
    "IQ": "IRAQ",
    "JO": "LEV",
    "KW": "GULF",
    "LB": "LEV",
    "LY": "NA",
    "MA": "NA",
    "OM": "GULF",
    "PL": "LEV",
    "QA": "GULF",
    "SA": "GULF",
    "SD": "NILE",
    "SY": "LEV",
    "TN": "NA",
    "YE": "YEM",
    "MSA": "MSA",
}

# Fail loudly on dialect codes that have no region: `.map` would turn them into NaN
# and `groupby` would then drop those rows without any warning.
unmapped = set(dataset["dialect"].dropna().unique()) - set(country_to_region)
if unmapped:
    raise ValueError(f"No region defined for dialect codes: {sorted(unmapped)}")

# Build the balanced frame in one chain instead of mutating `dataset` in place:
#   1. add the `regions` column,
#   2. draw 11,000 random rows per region (rows stay grouped by region; the
#      cross-region shuffle happens later in `train_test_split`),
#   3. drop the country-level `dialect` column, which is no longer needed.
# NOTE: `dataset` is rebound here, so re-running this cell requires re-running the
# load cell first (the `dialect` column is gone afterwards).
dataset = (
    dataset.assign(regions=dataset["dialect"].map(country_to_region))
    .groupby("regions", group_keys=False)
    .sample(n=11000, random_state=42)
    .drop(columns="dialect")
)
dataset["regions"].value_counts()  # rows per region

regions
GULF    11000
IRAQ    11000
LEV     11000
MSA     11000
NA      11000
NILE    11000
YEM     11000
Name: count, dtype: int64

In [86]:
# Preview the first rows of `dataset` (text plus its region label).
dataset.head()

Unnamed: 0,text,regions
193840,في العالم كله ماشي حرب صارت ما تندمو عليها راج...,GULF
189271,براءتهم قبل كل شي وش ذنبهم اله يغفرلهم,GULF
125177,حبيب قلبي انت عيل اعتذر لك لأني قرأت التغريده ...,GULF
7168,افخم جزراويه على وجه الارض,GULF
29618,دام يدفع مافي مشكله,GULF


Collect the region labels and build the label-to-id and id-to-label dictionaries.

In [87]:
# Use one fixed label order. `dataset["regions"].unique()` returns labels in order of
# first appearance, which gives different ids from the explicit list used by the
# label-encoding cell further down; re-running this cell afterwards would then
# silently change the ids handed to the model.
labels = ["GULF", "LEV", "NA", "NILE", "IRAQ", "YEM", "MSA"]

# Make sure the data contains no region that the fixed list does not cover.
unknown_regions = set(dataset["regions"].unique()) - set(labels)
if unknown_regions:
    raise ValueError(f"Regions missing from `labels`: {sorted(unknown_regions)}")

label2id = {name: idx for idx, name in enumerate(labels)}
id2label = {idx: name for name, idx in label2id.items()}
len(labels)

7

#### Converting Dataset to Hugging face Dataset
- converting it from pandas to Hugging Face Dataset
- splitting it 80% training and 20% testing

In [88]:
# Wrap the pandas frame as a Hugging Face Dataset (without the pandas index column)
# and split it 80/20 into train/test with a fixed seed. Despite its name, `df` ends
# up holding the resulting train/test splits, not a DataFrame.
df = (
    Dataset.from_pandas(dataset, preserve_index=False)
    .train_test_split(test_size=0.2, seed=42)
)

### Tokenizer
A tokenizer ✂️ is a tool that breaks down sentences into smaller pieces called tokens such as words or parts of words. For example, "I love dogs" can be turned into ["I", "love", "dogs"] to help computers understand text.
Then it turns the tokens into numbers (IDs) that a machine learning model can use. There are different types of tokenizers, like word, subword, or character tokenizers, depending on the task.
We will use the tokenizer that ships with the AraBERT checkpoint loaded below (`aubmindlab/bert-base-arabert`), which was designed for Arabic text.

In [89]:
# NOTE(review): the notebook title says AraBERTv2, but this checkpoint id has no `v2`
# suffix - confirm which checkpoint is intended. Whatever is chosen, the model cell
# must load the same id so that vocabulary and weights match.
tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabert")


def tokenize(batch):
    """Tokenize the ``text`` entries of ``batch`` with truncation enabled.

    No padding is applied here; padding is left to the data collator.
    """
    texts = batch["text"]
    return tokenizer(texts, truncation=True)


# batched=True passes many rows to `tokenize` per call instead of one at a time.
tokenized = df.map(tokenize, batched=True)

Map: 100%|██████████| 61600/61600 [00:42<00:00, 1447.76 examples/s] 
Map: 100%|██████████| 15400/15400 [00:00<00:00, 29720.05 examples/s]


In [90]:
# Show the splits, their columns and their row counts after tokenization.
print(tokenized)

DatasetDict({
    train: Dataset({
        features: ['text', 'regions', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 61600
    })
    test: Dataset({
        features: ['text', 'regions', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 15400
    })
})


In [96]:
# `tokenized` is a DatasetDict (train/test) with columns:
# ['text', 'regions', 'input_ids', 'token_type_ids', 'attention_mask']

# 1) Define a STABLE label order (don't rely on set/unique order)
labels = ["GULF", "LEV", "NA", "NILE", "IRAQ", "YEM", "MSA"]
label2id = {l: i for i, l in enumerate(labels)}
id2label = {i: l for l, i in label2id.items()}


# 2) Add integer labels to both splits (without re-tokenizing)
def add_labels(batch):
    """Translate region names into integer class ids.

    Accepts either a single row (``batch["regions"]`` is a string) or a batch of rows
    (``batch["regions"]`` is a list of strings) and returns ``{"labels": ...}`` with
    the matching shape.

    Raises:
        KeyError: if a region name is not present in ``label2id``.
    """
    regions = batch["regions"]
    try:
        if isinstance(regions, str):
            return {"labels": label2id[regions]}
        return {"labels": [label2id[region] for region in regions]}
    except KeyError as err:
        # Re-raise with the offending value and the accepted labels spelled out.
        raise KeyError(f"Unknown region {err.args[0]!r}; expected one of {labels}") from err


# Batched mapping avoids one Python call per row; applies to every split.
tokenized = tokenized.map(add_labels, batched=True)

# 3) Keep only the columns the model needs
keep_cols = ["input_ids", "attention_mask", "labels"]
if "token_type_ids" in tokenized["train"].column_names:
    keep_cols.append("token_type_ids")

# NOTE: `dataset` previously named the pandas frame; from here on it is the pruned
# DatasetDict. The comprehension covers every split instead of naming each one.
dataset = DatasetDict({
    split_name: split.remove_columns([c for c in split.column_names if c not in keep_cols])
    for split_name, split in tokenized.items()
})

# 4) Return PyTorch tensors for the kept columns
dataset.set_format(type="torch", columns=keep_cols)

# 5) Sanity checks
assert "labels" in dataset["train"].column_names, "label column was not created"
print(dataset)
print(dataset["train"].column_names)  # must include 'labels'
print(dataset["train"][0].keys())     # must include 'labels'


Map: 100%|██████████| 61600/61600 [00:02<00:00, 23761.86 examples/s]
Map: 100%|██████████| 15400/15400 [00:00<00:00, 25661.27 examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 61600
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 15400
    })
})
['input_ids', 'token_type_ids', 'attention_mask', 'labels']
dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'labels'])





## Model

In [98]:
# Load the pretrained encoder with a freshly initialised classification head sized to
# the number of region labels; the two mappings are stored in the model config so
# predictions can be reported by name.
model = AutoModelForSequenceClassification.from_pretrained(
    "aubmindlab/bert-base-arabert",
    label2id=label2id,
    id2label=id2label,
    num_labels=len(labels),
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Training Setup

1. **Data Collator**  
   Ensures that batches are padded dynamically to the longest sequence in each batch, making training efficient.

2. **Metrics**  
   We'll evaluate the model using **accuracy** and **macro-F1**, since our task is multi-class dialect classification.  


In [105]:
### Training
# Collator that pads the examples of a batch using the tokenizer's padding settings.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Metric objects, loaded once and reused on every evaluation pass.
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")


def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 from a ``(logits, labels)`` pair.

    Returns a dict with the keys ``"accuracy"`` and ``"f1_macro"``.
    """
    scores, gold = eval_pred
    # The predicted class is the index of the largest logit in each row.
    predicted = np.argmax(scores, axis=-1)
    acc_result = accuracy.compute(predictions=predicted, references=gold)
    f1_result = f1.compute(predictions=predicted, references=gold, average="macro")
    return {
        "accuracy": acc_result["accuracy"],
        "f1_macro": f1_result["f1"],
    }


### Training Arguments

Here we define the **hyperparameters** and **training configuration**:
- `eval_strategy="epoch"` → Evaluate after each epoch.  
- `save_strategy="epoch"` → Save model checkpoints every epoch.  
- `learning_rate=2e-5` → Standard fine-tuning learning rate.  
- `num_train_epochs=5` → Train for 5 epochs.  
- `load_best_model_at_end=True` → Keeps the best model according to our metric (**f1_macro**).  


In [106]:
# Hyperparameters and run configuration for the Trainer.
training_args = TrainingArguments(
    output_dir="./results",
    # Evaluate and checkpoint once per epoch; the two strategies are kept equal so
    # the best checkpoint can be restored at the end.
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    # Restore the checkpoint with the best macro-F1 once training finishes.
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    # Mixed precision targets CUDA GPUs; enabling it only when one is present lets
    # this cell also run on a CPU-only machine.
    fp16=torch.cuda.is_available(),
    lr_scheduler_type="linear",
    seed=42,
    dataloader_num_workers=4,
)

In [108]:
# Assemble the Trainer from the model, its configuration, the data and the metrics.
# NOTE: the "test" split is also what selects the best checkpoint, so the reported
# scores are not from data the model selection never saw.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["test"],
    # `tokenizer=` is deprecated on Trainer in favour of `processing_class=`; the
    # warning fragment captured under this cell presumably came from that argument.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [111]:
# Evaluate on the Trainer's eval dataset and print the metric dict.
# NOTE(review): in top-to-bottom order this cell runs before `trainer.train()`, so on
# a fresh kernel it scores the model before fine-tuning; the saved output reports
# epoch 5.0, so it was apparently produced after a training run.
metrics = trainer.evaluate()
print(metrics)

{'eval_loss': 1.3190721273422241, 'eval_accuracy': 0.692987012987013, 'eval_f1_macro': 0.693926644954301, 'eval_runtime': 34.2247, 'eval_samples_per_second': 449.968, 'eval_steps_per_second': 14.083, 'epoch': 5.0}


In [None]:
# Run fine-tuning with the configuration defined in `training_args`.
trainer.train()

Epoch,Training Loss,Validation Loss


In [None]:
# Score the model on the held-out split after training and print the metric dict.
metrics = trainer.evaluate(eval_dataset=tokenized["test"])
print(metrics)


In [None]:
def predict_one(text):
    """Classify one sentence and return ``(region_label, probability)``.

    Args:
        text: the sentence to classify.

    Returns:
        A tuple of the predicted region name (looked up in ``id2label``) and the
        softmax probability of that class as a float.
    """
    # preprocess if you use Arabic normalization
    # Switch to eval mode so dropout is disabled; otherwise repeated calls on the
    # same text can return different probabilities.
    model.eval()
    enc = tokenizer(text, return_tensors="pt", truncation=True).to(model.device)
    with torch.no_grad():  # no gradients are needed for inference
        logits = model(**enc).logits
    probs = torch.softmax(logits, dim=-1).cpu().numpy()[0]
    pred_id = int(np.argmax(probs))
    return id2label[pred_id], float(probs[pred_id])


# Example usage
print(predict_one("يا حبيبي كيفك؟"))   # should map to LEV
print(predict_one("إزيك عامل إيه؟"))   # should map to NILE
