In [173]:
pip install pyconll

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [174]:
pip install transformers datasets evaluate seqeval

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


# Token classification

In [175]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [176]:
#!unzip /content/archive.zip -d /content/extracted_data

## Load dataset

Start by loading the WNUT 17 dataset from the 🤗 Datasets library:

In [None]:
import pyconll
data = pyconll.load_from_file('/kaggle/input/data-set2/Arabic_POS.conllu')

Then take a look at an example:

#Preprocessing

In [178]:
sentences = []
labels = []
for sentence in data:
  sentences.append([token.form for token in sentence])
  labels.append([token.upos for token in sentence])

In [179]:
label_list = [
    "ADJ", "ADP", "ADV", "AUX", "CCONJ", "DET", "INTJ", "NOUN", 
    "NUM", "PART", "PRON", "PROPN", "PUNCT", "SCONJ", "SYM", "VERB", "X", "None"
]

In [180]:
labels = [
    [tag if tag is not None else "None" for tag in tag_sequence]
    for tag_sequence in labels
]
# then later you build
label_map = {label: i for i, label in enumerate(label_list)}
id2label = {i: l for i, l in enumerate(label_list)}


In [181]:
sentences[0]

['برلين',
 'ترفض',
 'حصول',
 'شركة',
 'اميركية',
 'على',
 'رخصة',
 'تصنيع',
 'دبابة',
 '"',
 'ليوبارد',
 '"',
 'الالمانية']

In [182]:
sentense_id = []
for i in range (len(sentences)):
  sentense_id.append(i)


In [183]:
labels[0]


['X',
 'VERB',
 'NOUN',
 'NOUN',
 'ADJ',
 'ADP',
 'NOUN',
 'NOUN',
 'NOUN',
 'PUNCT',
 'X',
 'PUNCT',
 'ADJ']

In [184]:
label_ids = [
    [label_map[tag] for tag in tag_sequence]
    for tag_sequence in labels
]
label_ids[0]

[16, 15, 7, 7, 0, 1, 7, 7, 7, 12, 16, 12, 0]

In [185]:
labels[0]

['X',
 'VERB',
 'NOUN',
 'NOUN',
 'ADJ',
 'ADP',
 'NOUN',
 'NOUN',
 'NOUN',
 'PUNCT',
 'X',
 'PUNCT',
 'ADJ']

In [186]:
data_dict = {
    'id': sentense_id,
    'ner_tags': label_ids,
    'tokens': sentences,
}

In [187]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True
    )
    labels = []
    for i, label in enumerate(examples[f"ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [188]:
from transformers import AutoTokenizer, DataCollatorForTokenClassification

# 1. Initialize tokenizer with correct Arabic model
tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabertv02")

# 2. Now create the data collator
data_collator = DataCollatorForTokenClassification(
    tokenizer=tokenizer,
    return_tensors="tf"  
)

In [189]:
from datasets import Dataset
data_dict = Dataset.from_dict(data_dict)

In [190]:
tokenized_data = data_dict.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/6075 [00:00<?, ? examples/s]

#Split data

In [191]:
from sklearn.model_selection import train_test_split
train_test_split = tokenized_data.train_test_split(test_size=0.2)

In [192]:
tokenized_data_train = train_test_split['train']
tokenized_data_test = train_test_split['test']


#Evaluate

In [193]:
import evaluate

seqeval = evaluate.load("seqeval")

In [194]:
import numpy as np
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

#Model

In [195]:
from transformers import create_optimizer

batch_size = 4
num_train_epochs = 3
num_train_steps = (len(train_test_split["train"]) // batch_size) * num_train_epochs
optimizer, lr_schedule = create_optimizer(
    init_lr=2e-5,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
    num_warmup_steps=0,
)

In [196]:
from transformers import TFAutoModelForTokenClassification

model = TFAutoModelForTokenClassification.from_pretrained(
    "aubmindlab/bert-base-arabertv02", 
    num_labels=len(label_list),
    id2label=id2label, 
    label2id=label_map
)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForTokenClassification: ['bert.embeddings.position_ids']
- This IS expected if you are initializing TFBertForTokenClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForTokenClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFBertForTokenClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [197]:
tf_train_set = model.prepare_tf_dataset(
    train_test_split["train"],
    shuffle=True,
    batch_size=4,
    collate_fn=data_collator,
)

tf_validation_set = model.prepare_tf_dataset(
    train_test_split["test"],
    shuffle=False,
    batch_size=4,
    collate_fn=data_collator,
)

In [198]:
import tensorflow as tf

model.compile(optimizer=optimizer)

In [199]:
from transformers.keras_callbacks import KerasMetricCallback

metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set)

In [200]:
# 1. Suppress seqeval warnings (optional)
import warnings
warnings.filterwarnings("ignore", message=".* seems not to be NE tag.")

# 2. Configure Git (required)
!git config --global user.email "your_email@example.com"
!git config --global user.name "Your Name"

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
from transformers.keras_callbacks import PushToHubCallback

push_to_hub_callback = PushToHubCallback(
    output_dir="Mariam_classifer2",
    hub_model_id="MariamOsama3/Mariam_classifer2",
    tokenizer=tokenizer,
)

For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.
Cloning https://huggingface.co/MariamOsama3/Mariam_classifer2 into local empty directory.


In [202]:
callbacks = [metric_callback, push_to_hub_callback]

In [205]:
model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=2, callbacks=callbacks)

Epoch 1/2
Epoch 2/2


<tf_keras.src.callbacks.History at 0x7de9c81c8b90>

#Inference

In [206]:
model.save_pretrained("Mariam_classifer2", id2label=id2label, label2id=label_map)

In [207]:
model.push_to_hub("Mariam_classifer2")
tokenizer.push_to_hub("Mariam_classifer2")

README.md:   0%|          | 0.00/1.97k [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/MariamOsama3/Mariam_classifer2/commit/47c89365a33a36621eff450e232e5cfffe81928f', commit_message='Upload tokenizer', commit_description='', oid='47c89365a33a36621eff450e232e5cfffe81928f', pr_url=None, repo_url=RepoUrl('https://huggingface.co/MariamOsama3/Mariam_classifer2', endpoint='https://huggingface.co', repo_type='model', repo_id='MariamOsama3/Mariam_classifer2'), pr_revision=None, pr_num=None)

In [219]:
text = "برلين ترفض حصول شركة امريكية على رخصة"


In [224]:
from transformers import pipeline

classifier = pipeline("ner", model="MariamOsama3/Mariam_classifer2")
classifier(text)

Some layers from the model checkpoint at MariamOsama3/Mariam_classifer2 were not used when initializing TFBertForTokenClassification: ['dropout_759']
- This IS expected if you are initializing TFBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertForTokenClassification were initialized from the model checkpoint at MariamOsama3/Mariam_classifer2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForTokenClassification for predictions without further training.
Device set to use 0


[{'entity': 'X',
  'score': 0.9966648,
  'index': 1,
  'word': 'برلين',
  'start': 0,
  'end': 5},
 {'entity': 'VERB',
  'score': 0.99800056,
  'index': 2,
  'word': 'ترفض',
  'start': 6,
  'end': 10},
 {'entity': 'NOUN',
  'score': 0.99793893,
  'index': 3,
  'word': 'حصول',
  'start': 11,
  'end': 15},
 {'entity': 'NOUN',
  'score': 0.99928254,
  'index': 4,
  'word': 'شركة',
  'start': 16,
  'end': 20},
 {'entity': 'ADJ',
  'score': 0.98166656,
  'index': 5,
  'word': 'امريكية',
  'start': 21,
  'end': 28},
 {'entity': 'ADP',
  'score': 0.99959654,
  'index': 6,
  'word': 'على',
  'start': 29,
  'end': 32},
 {'entity': 'NOUN',
  'score': 0.99877137,
  'index': 7,
  'word': 'رخصة',
  'start': 33,
  'end': 37}]

#Model2