In [None]:
# Verify that a GPU is visible to this runtime (prints the nvidia-smi device report)
!nvidia-smi

In [None]:
!pip install -U transformers==4.45.2
!pip install torch==2.5.1 torchvision==0.20.1
!pip install seqeval==1.2.2
!pip uninstall -y torchaudio



In [None]:
#######################

In [None]:
# `!export ...` runs in a throwaway subshell, so the variable never reached the kernel
# (and the original value was the literal text "PYTHONPATH" plus a placeholder path).
# Set it on the kernel process instead, so child processes inherit it, and point it at
# the directory the repository is cloned into.
import os

_pythonpath_entries = [p for p in os.environ.get("PYTHONPATH", "").split(os.pathsep) if p]
if "/content/ArabicNER" not in _pythonpath_entries:
    _pythonpath_entries.append("/content/ArabicNER")
os.environ["PYTHONPATH"] = os.pathsep.join(_pythonpath_entries)

# The following part of the code is adapted from:
Mustafa Jarrar, Mohammed Khalilia, and Sana Ghanem (2022). "Wojood: Nested Arabic Named Entity Corpus and Recognition using BERT." Proceedings of the International Conference on Language Resources and Evaluation (LREC 2022), Marseille, France.
# Source: https://github.com/SinaLab/ArabicNER


In [None]:
# Remove existing package and clone again from Github
!rm -rf /content/ArabicNER
!git clone https://github.com/SinaLab/ArabicNER.git

In [None]:
# Add the ArabicNER package to the system path
import sys
import argparse

# Guard the append so that re-running this cell does not pile up duplicate entries
if '/content/ArabicNER/' not in sys.path:
    sys.path.append('/content/ArabicNER/')

In [None]:
# Import train function
from arabiner.bin.train import main as train

# We changed the model name on each run to test different models

The model is selected by the `bert_model` entry in this line of the training arguments:
  `"kwargs": {"dropout": 0.1, "bert_model": "qarib/bert-base-qarib"}`


In [None]:
# Setup the model arguments

# Name of the pretrained BERT model handed to the tagger as `bert_model`.
# This is the one value we changed between runs to compare different models.
BERT_MODEL = "qarib/bert-base-qarib"

args_dict = {
    # Model output path to save artifacts and model predictions
    "output_path": "/content/output/",

    # train/test/validation data paths
    "train_path": "/content/ANERtrainC10.txt",
    "test_path": "/content/ANERtestC10.txt",
    "val_path": "/content/ANERvalC10.txt",

    # seed for randomization
    "seed": 1,

    "batch_size": 16,

    # Number of workers for the dataloader
    "num_workers": 1,

    # GPU/device Ids to train model on
    # For two GPUs use [0, 1]
    # For three GPUs use [0, 1, 2], etc.
    "gpus": [0],

    # Overwrite data in output_path directory specified above
    "overwrite": True,

    # How often to print the logs in terms of number of steps
    "log_interval": 10,

    # Data configuration
    # Here we specify the dataset class and there are two options:
    #  arabiner.data.datasets.DefaultDataset: for flat NER
    #  arabiner.data.datasets.NestedTagsDataset: for nested NER
    #
    # kwargs: keyword arguments to the dataset class
    # This notebook uses the DefaultDataset for flat NER
    "data_config": {
        "fn": "arabiner.data.datasets.DefaultDataset",
        "kwargs": {"max_seq_len": 256}
    },

    # Neural net configuration
    # There are two NNs:
    #   arabiner.nn.BertSeqTagger: flat NER tagger
    #   arabiner.nn.BertNestedTagger: nested NER tagger
    #
    # kwargs: keyword arguments to the NN
    # This notebook uses BertSeqTagger for flat NER tagging
    "network_config": {
        "fn": "arabiner.nn.BertSeqTagger",
        # Change BERT_MODEL at the top of this cell to test a different model
        "kwargs": {"dropout": 0.1, "bert_model": BERT_MODEL}
    },

    # Model trainer configuration
    #
    #  arabiner.trainers.BertTrainer: for flat NER training
    #  arabiner.trainers.BertNestedTrainer: for nested NER training
    #
    # kwargs: keyword arguments to arabiner.trainers.BertTrainer
    #         additional arguments you can pass include
    #           - clip: for gradient clipping
    #           - patience: number of epochs for early termination
    # This notebook uses BertTrainer for flat NER training
    "trainer_config": {
        "fn": "arabiner.trainers.BertTrainer",
        "kwargs": {"max_epochs": 50}
    },

    # Optimizer configuration
    # Our experiments use torch.optim.AdamW, however, you are free to pass
    # any other optimizers such as torch.optim.Adam or torch.optim.SGD
    # lr: learning rate
    # kwargs: keyword arguments to torch.optim.AdamW or whatever optimizer you use
    #
    # Additional optimizers can be found here:
    # https://pytorch.org/docs/stable/optim.html
    "optimizer": {
        "fn": "torch.optim.AdamW",
        "kwargs": {"lr": 1e-5}
    },

    # Learning rate scheduler configuration
    # You can pass a learning scheduler such as torch.optim.lr_scheduler.StepLR
    # kwargs: keyword arguments to torch.optim.lr_scheduler.ExponentialLR
    #         or whatever scheduler you use
    # NOTE: ExponentialLR multiplies the learning rate by gamma every epoch,
    #       so gamma=1 keeps the learning rate constant
    #
    # Additional schedulers can be found here:
    # https://pytorch.org/docs/stable/optim.html
    "lr_scheduler": {
        "fn": "torch.optim.lr_scheduler.ExponentialLR",
        "kwargs": {"gamma": 1}
    },

    # Loss function configuration
    # We use cross entropy loss
    # kwargs: keyword arguments to torch.nn.CrossEntropyLoss or whatever loss function you use
    "loss": {
        "fn": "torch.nn.CrossEntropyLoss",
        "kwargs": {}
    }
}

# Convert the args dictionary to an argparse namespace.
# Namespace(**d) sets the attributes through the public constructor instead of
# swapping out the instance's __dict__, so `args` does not alias `args_dict`.
args = argparse.Namespace(**args_dict)

In [None]:
# Start training the model with the settings held in `args`
train(args)

In [None]:

################ To Test the Model ####################

In [None]:
# Remove the output directory left by a previous validation evaluation run
!rm -rf /content/outputval
# Disabled: clone a model repo from Hugging Face instead (presumably a published
# checkpoint to evaluate in place of the model trained in this notebook — confirm before enabling)
#!git clone --branch flat https://huggingface.co/SinaLab/ArabicNER-Wojood

In [None]:
# Import the evaluation entry point
# NOTE: the alias `eval` shadows Python's built-in eval() for the rest of this session
from arabiner.bin.eval import main as eval

Test on the validation data

In [None]:
# Setup the evaluation arguments (validation split)
args_dict = {
    # Output path to save logs, metrics and predictions
    "output_path": "/content/outputval/",

    # Dataset(s) to evaluate on
    # data_paths takes a list of data paths in case you need to evaluate multiple datasets
    "data_paths": ["/content/ANERvalC10.txt"],

    # Path to the model, this corresponds to the "output_path" you specified
    # during training the model
    "model_path": "/content/output/",

    "batch_size": 16
}

# Convert the args dictionary to an argparse namespace.
# Namespace(**d) sets the attributes through the public constructor instead of
# swapping out the instance's __dict__.
args = argparse.Namespace(**args_dict)

# `eval` here is arabiner.bin.eval.main (imported under that alias), not the built-in
eval(args)

In [None]:
# Test on the test data

In [None]:
# Setup the evaluation arguments (test split)
args_dict = {
    # Output path to save logs, metrics and predictions
    "output_path": "/content/outputtest/",

    # Dataset(s) to evaluate on
    # data_paths takes a list of data paths in case you need to evaluate multiple datasets
    "data_paths": ["/content/ANERtestC10.txt"],

    # Path to the model, this corresponds to the "output_path" you specified
    # during training the model
    "model_path": "/content/output/",

    "batch_size": 16
}

# Convert the args dictionary to an argparse namespace.
# Namespace(**d) sets the attributes through the public constructor instead of
# swapping out the instance's __dict__.
args = argparse.Namespace(**args_dict)

# `eval` here is arabiner.bin.eval.main (imported under that alias), not the built-in
eval(args)