تمام، نبدأ بـ 🧠 التعرف على الكيانات المسماة (Named Entity Recognition - NER) باستخدام نموذج متعدد اللغات مثل bert-base-multilingual-cased.

حنستخدم بيانات جاهزة من datasets مثل WikiAnn، ونبدأ نجهز الكود خطوة خطوة.

In [None]:
!pip install transformers datasets seqeval


In [None]:
from datasets import load_dataset

# Use the Arabic ("ar") configuration of the WikiAnn dataset as the example corpus.
dataset = load_dataset("wikiann", "ar")

# Inspect the first example of the training split.
print(dataset["train"][0])


In [None]:
from transformers import AutoTokenizer

# Load the tokenizer that belongs to mBERT.
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")

# Example: see how the tokenizer handles a sentence that is already split
# into words (hence is_split_into_words=True).
example = dataset["train"][0]["tokens"]
tokenized_input = tokenizer(example, is_split_into_words=True)
print(tokenized_input.tokens())


In [None]:
# Names of the NER tag classes, read from the dataset's "ner_tags" feature;
# the position of a name in this list is its integer tag id.
label_list = dataset["train"].features["ner_tags"].feature.names
print(label_list)


In [5]:
def tokenize_and_align_labels(examples, label_all_tokens=True):
    """Tokenize pre-split sentences and align word-level NER tags to sub-tokens.

    Args:
        examples: Batch mapping with "tokens" (one list of words per sentence)
            and "ner_tags" (one list of integer tag ids per sentence).
        label_all_tokens: When True (the default, which matches the previous
            behaviour of this function) every sub-token of a word receives the
            word's tag. When False only the first sub-token of each word is
            labelled and the remaining pieces get the ignore marker -100.

    Returns:
        The tokenizer output with an added "labels" entry holding one aligned
        label list per sentence.
    """
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True
    )

    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        # word_ids maps each produced token back to the index of the word it
        # came from (None for tokens that belong to no word).
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                # Token without a source word: mark it with -100.
                label_ids.append(-100)
            elif word_idx != previous_word_idx or label_all_tokens:
                # First piece of a word, or a continuation piece when every
                # piece should carry the word's tag.
                label_ids.append(label[word_idx])
            else:
                # Continuation piece under the first-piece-only strategy.
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs


In [None]:
# Apply the tokenize-and-align function to the datasets in batched mode.
tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)


In [None]:
from transformers import AutoModelForTokenClassification

# Build the token-classification model on top of mBERT. The id2label/label2id
# maps are stored in the model config so that saved checkpoints and inference
# pipelines report the dataset's tag names instead of generic "LABEL_n" names.
model = AutoModelForTokenClassification.from_pretrained(
    "bert-base-multilingual-cased",
    num_labels=len(label_list),
    id2label=dict(enumerate(label_list)),
    label2id={name: idx for idx, name in enumerate(label_list)},
)


In [8]:
from transformers import TrainingArguments, Trainer
import numpy as np
from seqeval.metrics import classification_report, f1_score

# دالة لتقييم النموذج
def compute_metrics(p):
    """Score model predictions with seqeval.

    Args:
        p: A ``(predictions, labels)`` pair. ``predictions`` holds per-class
            scores along axis 2; ``labels`` holds integer tag ids, with -100
            marking positions that must be excluded from scoring.

    Returns:
        A dict with the seqeval F1 score under "f1" and the seqeval
        classification report under "report".
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        predicted_tags = []
        gold_tags = []
        for predicted, gold in zip(predicted_row, gold_row):
            # Positions labelled -100 are not scored.
            if gold == -100:
                continue
            predicted_tags.append(label_list[predicted])
            gold_tags.append(label_list[gold])
        true_predictions.append(predicted_tags)
        true_labels.append(gold_tags)

    f1 = f1_score(true_labels, true_predictions)
    report = classification_report(true_labels, true_predictions)
    return {"f1": f1, "report": report}


In [None]:
!pip install -U transformers



In [None]:
# ✅ تدريب نموذج NER باللغة العربية باستخدام mBERT - كود كامل (نسخة معدلة)

from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from datasets import load_dataset
import numpy as np
from seqeval.metrics import f1_score
import os

# ✅ Disable wandb
os.environ["WANDB_DISABLED"] = "true"

# ✅ Load the data (Arabic configuration of WikiAnn) and read the tag names
# from the "ner_tags" feature metadata.
raw_datasets = load_dataset("wikiann", "ar")
label_list = raw_datasets["train"].features["ner_tags"].feature.names

# ✅ Load the tokenizer; the same checkpoint name is reused below for the model.
model_checkpoint = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# ✅ تجهيز البيانات

def tokenize_and_align_labels(examples):
    """Tokenize pre-split sentences to length 128 and align NER tags.

    Only the first sub-token of each word keeps the word's tag; every other
    position (tokens with no source word and continuation pieces) is set
    to -100.

    Args:
        examples: Batch mapping with "tokens" (lists of words) and "ner_tags"
            (lists of integer tag ids, one per word).

    Returns:
        The tokenizer output with an added "labels" entry, one list per sentence.
    """
    tokenized_inputs = tokenizer(
        examples["tokens"],
        is_split_into_words=True,
        truncation=True,
        max_length=128,
        padding="max_length",
    )

    aligned = []
    for batch_idx, word_tags in enumerate(examples["ner_tags"]):
        row = []
        last_word = None
        for word in tokenized_inputs.word_ids(batch_index=batch_idx):
            # A position is labelled only when it starts a new word.
            starts_word = word is not None and word != last_word
            row.append(word_tags[word] if starts_word else -100)
            last_word = word
        aligned.append(row)

    tokenized_inputs["labels"] = aligned
    return tokenized_inputs

# ✅ Apply the preprocessing to the raw datasets in batched mode
encoded_datasets = raw_datasets.map(tokenize_and_align_labels, batched=True)

# ✅ Load the model
# id2label/label2id are stored in the model config so that saved checkpoints
# and the NER pipelines built from this model report the dataset's tag names
# instead of generic "LABEL_n" names.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
    id2label=dict(enumerate(label_list)),
    label2id={name: idx for idx, name in enumerate(label_list)},
)

# ✅ Training settings
# Checkpoints go to ./ner_model and logs to ./logs; report_to="none" asks the
# Trainer not to report to any logging integration.
training_args = TrainingArguments(
    output_dir="./ner_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_dir="./logs",
    report_to="none"
)

# ✅ دالة التقييم

def compute_metrics(p):
    """Compute the seqeval F1 score for a batch of predictions.

    Args:
        p: A ``(predictions, labels)`` pair. ``predictions`` holds per-class
            scores along axis 2; ``labels`` holds integer tag ids, with -100
            marking positions that are excluded from scoring.

    Returns:
        A dict with the F1 score under the key "f1".
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions, true_labels = [], []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only the (prediction, gold) pairs whose gold label is scored.
        scored = [
            (predicted, gold)
            for predicted, gold in zip(predicted_row, gold_row)
            if gold != -100
        ]
        true_predictions.append([label_list[predicted] for predicted, _ in scored])
        true_labels.append([label_list[gold] for _, gold in scored])

    return {"f1": f1_score(true_labels, true_predictions)}

# ✅ Create the trainer and train the model
# Training uses a shuffled (seed 42) subset of 1000 training examples and
# evaluation uses the first 200 validation examples.
# NOTE(review): newer transformers releases rename the `tokenizer` argument to
# `processing_class` - confirm against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_datasets["train"].shuffle(seed=42).select(range(1000)),
    eval_dataset=encoded_datasets["validation"].select(range(200)),
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

# training_args sets no evaluation schedule, so eval_dataset and
# compute_metrics would otherwise never be used; evaluate once explicitly
# after training and show the resulting metrics.
eval_metrics = trainer.evaluate()
print(eval_metrics)

In [None]:
import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer
from transformers import pipeline

import os
import re


def _latest_checkpoint(output_dir):
    """Return the path of the highest-numbered ``checkpoint-<step>`` directory.

    Args:
        output_dir: Directory the Trainer wrote its checkpoints to.

    Returns:
        The path ``<output_dir>/checkpoint-<step>`` with the largest step.

    Raises:
        FileNotFoundError: If ``output_dir`` does not exist or holds no
            ``checkpoint-<step>`` directory.
    """
    steps = []
    if os.path.isdir(output_dir):
        for entry in os.listdir(output_dir):
            found = re.fullmatch(r"checkpoint-(\d+)", entry)
            if found and os.path.isdir(os.path.join(output_dir, entry)):
                steps.append(int(found.group(1)))
    if not steps:
        raise FileNotFoundError(
            f"No checkpoint-<step> directory found in {output_dir!r}; "
            "train the model first or set model_path manually."
        )
    return os.path.join(output_dir, f"checkpoint-{max(steps)}")


# Load the fine-tuned model from the most recent checkpoint, so the path no
# longer has to be edited by hand after every training run.
model_path = _latest_checkpoint("./ner_model")
ner_tokenizer = AutoTokenizer.from_pretrained(model_path)
ner_model = AutoModelForTokenClassification.from_pretrained(model_path)

# Build a pipeline for named-entity recognition; aggregation_strategy="simple"
# yields results keyed by 'entity_group', which the loop below prints.
ner_pipe = pipeline("ner", model=ner_model, tokenizer=ner_tokenizer, aggregation_strategy="simple")

# Test sentence
text = "ولد محمد صلاح في قرية نجريج في مصر."

results = ner_pipe(text)

# Print each detected entity as: word (group): score
for entity in results:
    print(f"{entity['word']} ({entity['entity_group']}): {entity['score']:.2f}")


In [None]:
from transformers import pipeline

# ✅ Build an NER pipeline from the in-memory `model` and `tokenizer`
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# ✅ Example Arabic sentence
text = "ذهب محمد إلى القاهرة لحضور مؤتمر الذكاء الاصطناعي في جامعة عين شمس."

# ✅ Run the model on the sentence
ner_results = ner_pipeline(text)

# ✅ Show the results: word, entity group and score for each detected entity
for entity in ner_results:
    print(f"الكلمة: {entity['word']}, التصنيف: {entity['entity_group']}, النتيجة: {entity['score']:.2f}")


In [None]:
# Save the model and the tokenizer to the "ner_model_arabic" directory so they
# can be reused later.
model.save_pretrained("ner_model_arabic")
tokenizer.save_pretrained("ner_model_arabic")


In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Reload the model and tokenizer from the "ner_model_arabic" directory,
# rebinding the notebook-level `model` and `tokenizer` names.
model = AutoModelForTokenClassification.from_pretrained("ner_model_arabic")
tokenizer = AutoTokenizer.from_pretrained("ner_model_arabic")
