In [1]:
!pip install transformers datasets torch conllu

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting conllu
  Downloading conllu-6.0.0-py3-none-any.whl.metadata (21 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecti

In [2]:
!pip install seqeval


Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=34bda962efcc92cb9c0f75321ae639e47e7becc0d129d70f969b02e51b556d22
  Stored in directory: /root/.cache/pip/wheels/bc/92/f0/243288f899c2eacdfa8c5f9aede4c71a9bad0ee26a01dc5ead
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


#### importing libraries

In [3]:
import torch
from datasets import Dataset
import pandas as pd
from conllu import parse
import numpy as np
from transformers import (
    AutoConfig,
    AutoTokenizer,
    AutoModel,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
)
from seqeval.metrics import accuracy_score, f1_score

###### # 1) Read CoNLL-U file into Python lists of tokens & UPOS tags

In [4]:
def read_conllu_file(file_path):
    from conllu import parse
    with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
        sentences = parse(f.read())
    texts, tags = [], []
    for sent in sentences:
        words, upos = [], []
        for token in sent:
            words.append(token["form"])
            upos.append(token["upos"])
        if words:                    # skip empty lines
            texts.append(words)
            tags.append(upos)
    return texts, tags

train_texts, train_tags = read_conllu_file("/content/Arabic_POS.conllu")

###### # 2) Build tag2id / id2tag mappings

In [5]:
unique_tags = sorted({tag for sent in train_tags for tag in sent})
tag2id = {tag: i for i, tag in enumerate(unique_tags)}
id2tag = {i: tag for tag, i in tag2id.items()}
num_labels = len(unique_tags)
print(f"Found {num_labels} unique POS tags.")

Found 18 unique POS tags.



##### 3) Create a raw Hugging‐Face Dataset

In [6]:
raw_ds = Dataset.from_dict({
    "tokens": train_texts,
    "tags":   [[tag2id[t] for t in seq] for seq in train_tags]
})

###### 4) Train/eval split

In [7]:
split = raw_ds.train_test_split(test_size=0.15, seed=42)
raw_train = split["train"]
raw_eval  = split["test"]
print(raw_train, raw_eval)


Dataset({
    features: ['tokens', 'tags'],
    num_rows: 5163
}) Dataset({
    features: ['tokens', 'tags'],
    num_rows: 912
})


###### 5) Load checkpoint, tokenizer and build a model with a fresh head
######    We use ignore_mismatched_sizes=True to drop the old 33-class head

In [8]:
checkpoint = "CAMeL-Lab/bert-base-arabic-camelbert-ca-pos-egy"

config = AutoConfig.from_pretrained(
    checkpoint,
    num_labels=num_labels,
    id2label=id2tag,
    label2id=tag2id,
)

model = AutoModelForTokenClassification.from_pretrained(
    checkpoint,
    config=config,
    ignore_mismatched_sizes=True
)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.85k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at CAMeL-Lab/bert-base-arabic-camelbert-ca-pos-egy were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at CAMeL-Lab/bert-base-arabic-camelbert-ca-pos-egy and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([33, 768]) in the checkpoint and torch.Size([18, 768]) in the mod

tokenizer_config.json:   0%|          | 0.00/86.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/305k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

##### 6) Tokenize & align labels function

In [9]:
label_all_tokens = False  # set to True if you want to label subword tokens

def tokenize_and_align_labels(examples):
    tokenized = tokenizer(
        examples["tokens"],
        is_split_into_words=True,
        padding="max_length",
        truncation=True,
        max_length=128,
    )
    all_labels = examples["tags"]
    new_labels = []

    for i, word_labels in enumerate(all_labels):
        word_ids = tokenized.word_ids(batch_index=i)
        prev_word = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != prev_word:
                label_ids.append(word_labels[word_idx])
            else:
                label_ids.append(word_labels[word_idx] if label_all_tokens else -100)
            prev_word = word_idx
        new_labels.append(label_ids)

    tokenized["labels"] = new_labels
    return tokenized

###### 7) Map over raw splits to get tokenized datasets

In [10]:
tokenized_train = raw_train.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=["tokens", "tags"]
)
tokenized_eval = raw_eval.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=["tokens", "tags"]
)


Map:   0%|          | 0/5163 [00:00<?, ? examples/s]

Map:   0%|          | 0/912 [00:00<?, ? examples/s]

###### 8) Define a metric function using seqeval

In [11]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)

    true_seqs, pred_seqs = [], []
    for true_row, pred_row in zip(labels, preds):
        t, p = [], []
        for t_id, p_id in zip(true_row, pred_row):
            if t_id == -100:
                continue
            t.append(id2tag[t_id])
            p.append(id2tag[p_id])
        true_seqs.append(t)
        pred_seqs.append(p)

    return {
        "accuracy": accuracy_score(true_seqs, pred_seqs),
        "f1":       f1_score(true_seqs, pred_seqs),
    }

###### 9) TrainingArguments & Trainer

In [12]:
training_args = TrainingArguments(
  "./results",            # output_dir
  do_train=True,
  do_eval=True,
  eval_steps=500,         # how often to run eval
  logging_steps=500,
  save_steps=500,
  learning_rate=2e-5,
  per_device_train_batch_size=8,
  per_device_eval_batch_size=8,
  num_train_epochs=5,
  weight_decay=0.01,
  report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [13]:
trainer.train()

Step,Training Loss
500,0.2835
1000,0.1123
1500,0.0907
2000,0.07
2500,0.0547
3000,0.0477


TrainOutput(global_step=3230, training_loss=0.10503674291604813, metrics={'train_runtime': 886.268, 'train_samples_per_second': 29.128, 'train_steps_per_second': 3.644, 'total_flos': 1686587881904640.0, 'train_loss': 0.10503674291604813, 'epoch': 5.0})

In [14]:
# 6) You can also do a final evaluation on the eval set:
metrics = trainer.evaluate()
print(metrics)




{'eval_loss': 0.11617836356163025, 'eval_accuracy': 0.9713546578997009, 'eval_f1': 0.9593679807915731, 'eval_runtime': 6.5789, 'eval_samples_per_second': 138.626, 'eval_steps_per_second': 17.328, 'epoch': 5.0}


In [15]:

# 7) Or predict on a hold-out / test set:
predictions, label_ids, test_metrics = trainer.predict(tokenized_eval)
print(test_metrics)

{'test_loss': 0.11617836356163025, 'test_accuracy': 0.9713546578997009, 'test_f1': 0.9593679807915731, 'test_runtime': 6.6765, 'test_samples_per_second': 136.599, 'test_steps_per_second': 17.075}


In [16]:
trainer.save_model()

In [18]:
from transformers import pipeline

nlp = pipeline(
    "token-classification",
    model     = trainer.model,
    tokenizer = trainer.tokenizer,
    aggregation_strategy="simple",   # merges subword tags into one
    device    = 0                    # or -1 for CPU
)

print(nlp("هل انت بخير؟"))

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Device set to use cuda:0
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[{'entity_group': 'PART', 'score': np.float32(0.95617265), 'word': 'هل', 'start': 0, 'end': 2}, {'entity_group': 'PRON', 'score': np.float32(0.9681617), 'word': 'انت', 'start': 3, 'end': 6}, {'entity_group': 'ADJ', 'score': np.float32(0.69105), 'word': 'بخير', 'start': 7, 'end': 11}, {'entity_group': 'PUNCT', 'score': np.float32(0.99983895), 'word': '؟', 'start': 11, 'end': 12}]
