# ParsBERT (v3.0)
## BertNER
This model fine-tuned for the Named Entity Recognition (NER) task on a mixed NER dataset collected from ARMAN, PEYMA, and WikiANN that covered ten types of entities:

* Date (DAT)
* Event (EVE)
* Facility (FAC)
* Location (LOC)
* Money (MON)
* Organization (ORG)
* Percent (PCT)
* Person (PER)
* Product (PRO)
* Time (TIM)

In [1]:
!nvidia-smi
!lscpu

Mon Aug 16 15:01:36 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.42.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   70C    P8    32W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
!pip install hazm==0.7.0
!pip install seqeval==1.2.2
!pip install sentencepiece==0.1.96
!pip install transformers==4.7.0

Collecting hazm==0.7.0
  Downloading hazm-0.7.0-py3-none-any.whl (316 kB)
[?25l[K     |█                               | 10 kB 16.8 MB/s eta 0:00:01[K     |██                              | 20 kB 14.5 MB/s eta 0:00:01[K     |███                             | 30 kB 13.4 MB/s eta 0:00:01[K     |████▏                           | 40 kB 10.7 MB/s eta 0:00:01[K     |█████▏                          | 51 kB 10.4 MB/s eta 0:00:01[K     |██████▏                         | 61 kB 11.6 MB/s eta 0:00:01[K     |███████▎                        | 71 kB 9.2 MB/s eta 0:00:01[K     |████████▎                       | 81 kB 10.1 MB/s eta 0:00:01[K     |█████████▎                      | 92 kB 9.8 MB/s eta 0:00:01[K     |██████████▍                     | 102 kB 10.4 MB/s eta 0:00:01[K     |███████████▍                    | 112 kB 10.4 MB/s eta 0:00:01[K     |████████████▍                   | 122 kB 10.4 MB/s eta 0:00:01[K     |█████████████▌                  | 133 kB 10.4 MB/s eta 0:

In [3]:
!pip install PyDrive
import os
import IPython.display as ipd
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)



In [4]:
import os
import gc
import ast
import time
import hazm
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

import transformers
from transformers import AutoTokenizer, AutoConfig
from transformers import AutoModelForTokenClassification

from IPython.display import display, HTML, clear_output
from ipywidgets import widgets, Layout

from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from seqeval.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

print()
print('numpy', np.__version__)
print('pandas', pd.__version__)
print('transformers', transformers.__version__)
print('torch', torch.__version__)
print()

# If there's a GPU available...
if torch.cuda.is_available():    
    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")


numpy 1.19.5
pandas 1.1.5
transformers 4.7.0
torch 1.9.0+cu102

There are 1 GPU(s) available.
We will use the GPU: Tesla K80


In [5]:
class NER:
    def __init__(self, model_name):
        self.normalizer = hazm.Normalizer()
        self.model_name = model_name
        self.config = AutoConfig.from_pretrained(self.model_name)
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model = AutoModelForTokenClassification.from_pretrained(self.model_name)
        # self.labels = list(self.config.label2id.keys())
        self.id2label = self.config.id2label

    @staticmethod
    def load_ner_data(file_path, word_index, tag_index, delimiter, join=False):
        dataset, labels = [], []
        with open(file_path, encoding="utf8") as infile:
            sample_text, sample_label = [], []
            for line in infile:
                parts = line.strip().split(delimiter)
                if len(parts) > 1:
                    word, tag = parts[word_index], parts[tag_index]
                    if not word:
                        continue
                    sample_text.append(word)
                    sample_label.append(tag)
                else:
                    # end of sample
                    if sample_text and sample_label:
                        if join:
                            dataset.append(' '.join(sample_text))
                            labels.append(' '.join(sample_label))
                        else:
                            dataset.append(sample_text)
                            labels.append(sample_label)
                    sample_text, sample_label = [], []
        if sample_text and sample_label:
            if join:
                dataset.append(' '.join(sample_text))
                labels.append(' '.join(sample_label))
            else:
                dataset.append(sample_text)
                labels.append(sample_label)
        return dataset, labels

    def load_test_datasets(self, dataset_name, dataset_dir, **kwargs):
        if dataset_name.lower() == "peyma":
            ner_file_path = dataset_dir + 'test.txt'
            if not os.path.exists(ner_file_path):
                print(ner_file_path)
                exit(1)
            return self.load_ner_data(ner_file_path, word_index=0, tag_index=1, delimiter='|',
                                      join=kwargs.get('join', False))
        elif dataset_name.lower() == "arman":
            dataset, labels = [], []
            for i in range(1, 4):
                ner_file_path = dataset_dir + f'test_fold{i}.txt'
                if not os.path.exists(ner_file_path):
                    print(ner_file_path)
                dataset_part, labels_part = self.load_ner_data(ner_file_path, word_index=0, tag_index=1, delimiter=' ',
                                                               join=kwargs.get('join', False))
                dataset += dataset_part
                labels += labels_part
            return dataset, labels
        elif dataset_name.lower() == "hooshvare-peyman+arman+wikiann":
            ner_file_path = dataset_dir + 'test.csv'
            if not os.path.exists(ner_file_path):
                print(ner_file_path)
                exit(1)
            data = pd.read_csv(ner_file_path, delimiter="\t")
            sentences, sentences_tags = data['tokens'].values.tolist(), data['ner_tags'].values.tolist()
            sentences = [ast.literal_eval(ss) for ss in sentences]
            sentences_tags = [ast.literal_eval(ss) for ss in sentences_tags]
            print(f'test part:\n #sentences: {len(sentences)}, #sentences_tags: {len(sentences_tags)}')
            return sentences, sentences_tags

    def load_datasets(self, dataset_name, dataset_dir, **kwargs):
        if dataset_name.lower() == "farsiyar":
            dataset, labels = [], []
            for i in range(1, 6):
                ner_file_path = dataset_dir + 'Persian-NER-part{i}.txt'
                if not os.path.exists(ner_file_path):
                    print(ner_file_path)
                dataset_part, labels_part = self.load_ner_data(ner_file_path, word_index=0, tag_index=1, delimiter='\t',
                                                               join=kwargs.get('join', False))
                dataset += dataset_part
                labels += labels_part
            return dataset, labels
        elif dataset_name.lower() == "wikiann":
            ner_file_path = dataset_dir + 'wikiann-fa.bio'
            if not os.path.exists(ner_file_path):
                print(ner_file_path)
                exit(1)
            dataset_all, labels_all = self.load_ner_data(ner_file_path, word_index=0, tag_index=-1, delimiter=' ',
                                                         join=kwargs.get('join', False))
            print(f'all data: #data: {len(dataset_all)}, #labels: {len(labels_all)}')

            try:
                _, data_test, _, label_test = train_test_split(dataset_all, labels_all, test_size=0.1, random_state=1,
                                                               stratify=labels_all)
                print("with stratify")
            except:
                _, data_test, _, label_test = train_test_split(dataset_all, labels_all, test_size=0.1, random_state=1)
                print("without stratify")
            print(f'test part:\n #data: {len(data_test)}, #labels: {len(label_test)}')
            return dataset_all, labels_all, data_test, label_test

    def ner_inference(self, input_text, device, max_length):
        if not self.model or not self.tokenizer or not self.id2label:
            print('Something wrong has been happened!')
            return

        pt_batch = self.tokenizer(
            [self.normalizer.normalize(sequence) for sequence in input_text],
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors="pt"
        )

        gc.collect()
        torch.cuda.empty_cache()
        # Tell pytorch to run this model on the GPU.
        if device.type != 'cpu':
            self.model.cuda()

        pt_batch = pt_batch.to(device)
        pt_outputs = self.model(**pt_batch)
        pt_predictions = torch.argmax(pt_outputs.logits, dim=-1)
        pt_predictions = pt_predictions.cpu().detach().numpy().tolist()

        output_predictions = []
        for i, sequence in enumerate(input_text):
            tokens = self.tokenizer.tokenize(self.tokenizer.decode(self.tokenizer.encode(sequence)))
            predictions = [(token, self.id2label[prediction]) for token, prediction in
                           zip(tokens, pt_predictions[i])]
            output_predictions.append(predictions)
        return output_predictions

    def ner_evaluation(self, input_text, input_labels, device, batch_size=4):
        if not self.model or not self.tokenizer or not self.id2label:
            print('Something wrong has been happened!')
            return

        max_len = 0
        tokenized_texts, new_labels = [], []
        for sentence, sentence_label in zip(input_text, input_labels):
            if type(sentence) == str:
                sentence = sentence.strip().split()
            if len(sentence) != len(sentence_label):
                print('Something wrong has been happened! Length of a sentence and its label is not equal!')
                return
            tokenized_sentence, new_sentence_label = [], []
            for word, label in zip(sentence, sentence_label):
                # Tokenize the word and count # of subwords the word is broken into
                tokenized_word = self.tokenizer.tokenize(word)
                n_subwords = len(tokenized_word)

                # Add the tokenized word to the final tokenized word list
                tokenized_sentence.extend(tokenized_word)
                # Add the same label to the new list of labels `n_subwords` times
                new_sentence_label.extend([label] * n_subwords)

            max_len = max(max_len, len(tokenized_sentence))
            tokenized_texts.append(tokenized_sentence)
            new_labels.append(new_sentence_label)

        max_len = min(max_len, self.config.max_position_embeddings)
        print("max_len:", max_len)
        input_ids = pad_sequences([self.tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                                  maxlen=max_len, dtype="long", value=self.config.pad_token_id,
                                  truncating="post", padding="post")
        del tokenized_texts
        input_labels = pad_sequences([[self.config.label2id.get(l) for l in lab] for lab in new_labels],
                                     maxlen=max_len, value=self.config.label2id.get('O'), padding="post",
                                     dtype="long", truncating="post")
        del new_labels

        train_data = TensorDataset(torch.tensor(input_ids), torch.tensor(input_labels))
        data_loader = DataLoader(train_data, batch_size=batch_size)
        # data_loader = DataLoader(train_data, sampler=RandomSampler(train_data), batch_size=batch_size)
        print("#samples:", len(input_ids))
        print("#batch:", len(data_loader))

        gc.collect()
        torch.cuda.empty_cache()
        # Tell pytorch to run this model on the GPU.
        if device.type != 'cpu':
            self.model.cuda()

        total_loss, total_time = 0, 0
        output_predictions = []
        print("Start to evaluate test data ...")
        for step, batch in enumerate(data_loader):
            b_input_ids, b_labels = batch

            # move tensors to GPU if CUDA is available
            b_input_ids = b_input_ids.to(device)
            b_labels = b_labels.to(device)

            # This will return the loss (rather than the model output) because we have provided the `labels`.
            with torch.no_grad():
                start = time.monotonic()
                outputs = self.model(b_input_ids, labels=b_labels)
                end = time.monotonic()
                total_time += end - start
                print(f'inference time for step {step}: {end - start}')
            # get the loss
            total_loss += outputs.loss.item()

            b_predictions = torch.argmax(outputs.logits, dim=2)
            b_predictions = b_predictions.cpu().detach().numpy().tolist()
            b_labels = b_labels.cpu().detach().numpy().tolist()

            for i, sample in enumerate(b_input_ids):
                sample_input = list(sample)
                # remove pad tokens
                while sample_input[-1] == self.config.pad_token_id:
                    sample_input.pop()
                # tokens = self.tokenizer.tokenize(self.tokenizer.decode(sample_input))
                tokens = [self.tokenizer.decode([t]) for t in sample_input]
                sample_true_labels = [self.id2label[e] for e in b_labels[i][:len(sample_input)]]
                sample_predictions = [self.id2label[e] for e in b_predictions[i][:len(sample_input)]]
                output_predictions.append(
                    [(t, sample_true_labels[j], sample_predictions[j]) for j, t in enumerate(tokens)])

        # Calculate the average loss over the training data.
        avg_train_loss = total_loss / len(data_loader)
        print("average loss:", avg_train_loss)
        print("total inference time:", total_time)
        print("total inference time / #samples:", total_time / len(input_ids))

        return output_predictions

    def ner_evaluation_2(self, input_text, input_labels, device, batch_size=4):
        if not self.model or not self.tokenizer or not self.id2label:
            print('Something wrong has been happened!')
            return

        print("len(input_text):", len(input_text))
        print("len(input_labels):", len(input_labels))
        c = 0
        max_len = 0
        tokenized_texts, new_labels = [], []
        for sentence, sentence_label in zip(input_text, input_labels):
            if type(sentence) == str:
                sentence = sentence.strip().split()
            if len(sentence) != len(sentence_label):
                print('Something wrong has been happened! Length of a sentence and its label is not equal!')
                return
            tokenized_words = self.tokenizer(sentence, padding=False, add_special_tokens=False).input_ids
            tokenized_sentence_ids, new_sentence_label = [], []
            for i, tokenized_word in enumerate(tokenized_words):
                # Add the tokenized word to the final tokenized word list
                tokenized_sentence_ids += tokenized_word
                # Add the same label to the new list of labels `number of subwords` times
                new_sentence_label.extend([self.config.label2id.get(sentence_label[i])] * len(tokenized_word))

            max_len = max(max_len, len(tokenized_sentence_ids))
            tokenized_texts.append(tokenized_sentence_ids)
            new_labels.append(new_sentence_label)
            c += 1
            if c % 10000 == 0:
                print("c:", c)
        max_len = min(max_len, self.config.max_position_embeddings)
        print("max_len:", max_len)
        input_ids = pad_sequences(tokenized_texts, maxlen=max_len, dtype="long", value=self.config.pad_token_id,
                                  truncating="post", padding="post")
        del tokenized_texts
        input_labels = pad_sequences(new_labels, maxlen=max_len, value=self.config.label2id.get('O'), padding="post",
                                     dtype="long", truncating="post")
        del new_labels

        train_data = TensorDataset(torch.tensor(input_ids), torch.tensor(input_labels))
        data_loader = DataLoader(train_data, batch_size=batch_size)
        # data_loader = DataLoader(train_data, sampler=RandomSampler(train_data), batch_size=batch_size)
        print("#samples:", len(input_ids))
        print("#batch:", len(data_loader))

        gc.collect()
        torch.cuda.empty_cache()
        # Tell pytorch to run this model on the GPU.
        if device.type != 'cpu':
            self.model.cuda()

        total_time = 0
        output_predictions = []
        print("Start to evaluate test data ...")
        for step, batch in enumerate(data_loader):
            b_input_ids, b_labels = batch

            # move tensors to GPU if CUDA is available
            b_input_ids = b_input_ids.to(device)
            b_labels = b_labels.to(device)

            # This will return the loss (rather than the model output) because we have provided the `labels`.
            with torch.no_grad():
                start = time.monotonic()
                outputs = self.model(b_input_ids, labels=b_labels)
                end = time.monotonic()
                total_time += end - start
                print(f'inference time for step {step}: {end - start}')

            b_predictions = torch.argmax(outputs.logits, dim=2)
            b_predictions = b_predictions.cpu().detach().numpy().tolist()
            b_labels = b_labels.cpu().detach().numpy().tolist()

            for i, sample in enumerate(b_input_ids):
                sample_input = list(sample)
                # remove pad tokens
                while sample_input[-1] == self.config.pad_token_id:
                    sample_input.pop()
                # tokens = self.tokenizer.tokenize(self.tokenizer.decode(sample_input))
                tokens = [self.tokenizer.decode([t]) for t in sample_input]
                sample_true_labels = [self.id2label[e] for e in b_labels[i][:len(sample_input)]]
                sample_predictions = [self.id2label[e] for e in b_predictions[i][:len(sample_input)]]
                output_predictions.append(
                    [(t, sample_true_labels[j], sample_predictions[j]) for j, t in enumerate(tokens)])

        print("total inference time:", total_time)
        print("total inference time / #samples:", total_time / len(input_ids))

        return output_predictions

    def check_input_label_consistency(self, labels):
        model_labels = self.config.label2id.keys()
        dataset_labels = set()
        for l in labels:
            dataset_labels.update(set(l))
        print("model labels:", model_labels)
        print("dataset labels:", dataset_labels)
        print("intersection:", set(model_labels).intersection(dataset_labels))
        print("model_labels-dataset_labels:", list(set(model_labels) - set(dataset_labels)))
        print("dataset_labels-model_labels:", list(set(dataset_labels) - set(model_labels)))
        if list(set(dataset_labels) - set(model_labels)):
            return False
        return True

    @staticmethod
    def resolve_input_label_consistency(labels, label_translation_map):
        for i, sentence_labels in enumerate(labels):
            for j, label in enumerate(sentence_labels):
                labels[i][j] = label_translation_map.get(label)
        return labels

    @staticmethod
    def evaluate_prediction_results(labels, output_predictions):
        dataset_labels = set()
        for label in labels:
            dataset_labels.update(set(label))

        true_labels, predictions = [], []
        for sample_output in output_predictions:
            sample_true_labels = []
            sample_predicted_labels = []
            for token, true_label, predicted_label in sample_output:
                sample_true_labels.append(true_label)
                if predicted_label in dataset_labels:
                    sample_predicted_labels.append(predicted_label)
                else:
                    sample_predicted_labels.append('O')
            true_labels.append(sample_true_labels)
            predictions.append(sample_predicted_labels)

        print("Test Accuracy: {}".format(accuracy_score(true_labels, predictions)))
        print("Test Precision: {}".format(precision_score(true_labels, predictions)))
        print("Test Recall: {}".format(recall_score(true_labels, predictions)))
        print("Test F1-Score: {}".format(f1_score(true_labels, predictions)))
        print("Test classification Report:\n{}".format(classification_report(true_labels, predictions, digits=10)))


In [6]:
model_name='HooshvareLab/bert-fa-zwnj-base-ner'
ner_model = NER(model_name)

Downloading:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/426k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.11M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/134 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/346 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/471M [00:00<?, ?B/s]

In [7]:
print(ner_model.config)

BertConfig {
  "architectures": [
    "BertForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": "ner",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "O",
    "1": "B-DAT",
    "2": "B-EVE",
    "3": "B-FAC",
    "4": "B-LOC",
    "5": "B-MON",
    "6": "B-ORG",
    "7": "B-PCT",
    "8": "B-PER",
    "9": "B-PRO",
    "10": "B-TIM",
    "11": "I-DAT",
    "12": "I-EVE",
    "13": "I-FAC",
    "14": "I-LOC",
    "15": "I-MON",
    "16": "I-ORG",
    "17": "I-PCT",
    "18": "I-PER",
    "19": "I-PRO",
    "20": "I-TIM"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "B-DAT": 1,
    "B-EVE": 2,
    "B-FAC": 3,
    "B-LOC": 4,
    "B-MON": 5,
    "B-ORG": 6,
    "B-PCT": 7,
    "B-PER": 8,
    "B-PRO": 9,
    "B-TIM": 10,
    "I-DAT": 11,
    "I-EVE": 12,
    "I-FAC": 13,
    "I-LOC": 14,
    "I-MON": 15,
    "I-ORG": 16,
    "I-P

#### Sample Inference:

In [8]:
texts = [
    "مدیرکل محیط زیست استان البرز با بیان اینکه با بیان اینکه موضوع شیرابه‌های زباله‌های انتقال یافته در منطقه حلقه دره خطری برای این استان است، گفت: در این مورد گزارشاتی در ۲۵ مرداد ۱۳۹۷ تقدیم مدیران استان شده است.",
    "به گزارش خبرگزاری تسنیم از کرج، حسین محمدی در نشست خبری مشترک با معاون خدمات شهری شهرداری کرج که با حضور مدیرعامل سازمان‌های پسماند، پارک‌ها و فضای سبز و نماینده منابع طبیعی در سالن کنفرانس شهرداری کرج برگزار شد، اظهار داشت: ۸۰٪  جمعیت استان البرز در کلانشهر کرج زندگی می‌کنند.",
    "وی افزود: با همکاری‌های مشترک بین اداره کل محیط زیست و شهرداری کرج برنامه‌های مشترکی برای حفاظت از محیط زیست در شهر کرج در دستور کار قرار گرفته که این اقدامات آثار مثبتی داشته و تاکنون نزدیک به ۱۰۰ میلیارد هزینه جهت خریداری اکس-ریس صورت گرفته است.",
]

In [9]:
inference_output = ner_model.ner_inference(texts, device, ner_model.config.max_position_embeddings)

In [10]:
print(inference_output)

[[('[CLS]', 'O'), ('مدیرکل', 'O'), ('محیط', 'B-ORG'), ('زیست', 'I-ORG'), ('استان', 'I-ORG'), ('البرز', 'I-ORG'), ('با', 'O'), ('بیان', 'O'), ('اینکه', 'O'), ('با', 'O'), ('بیان', 'O'), ('اینکه', 'O'), ('موضوع', 'O'), ('شیر', 'O'), ('##ابه', 'O'), ('[ZWNJ]', 'O'), ('های', 'O'), ('زباله', 'O'), ('[ZWNJ]', 'O'), ('های', 'O'), ('انتقال', 'O'), ('یافته', 'O'), ('در', 'O'), ('منطقه', 'B-LOC'), ('حلقه', 'I-LOC'), ('دره', 'I-LOC'), ('خطری', 'O'), ('برای', 'O'), ('این', 'O'), ('استان', 'O'), ('است', 'O'), ('،', 'O'), ('گفت', 'O'), (':', 'O'), ('در', 'O'), ('این', 'O'), ('مورد', 'O'), ('گزارشاتی', 'O'), ('در', 'O'), ('۲۵', 'B-DAT'), ('مرداد', 'I-DAT'), ('۱۳۹۷', 'I-DAT'), ('تقدیم', 'O'), ('مدیران', 'O'), ('استان', 'O'), ('شده', 'O'), ('است', 'O'), ('.', 'O'), ('[SEP]', 'O')], [('[CLS]', 'O'), ('به', 'O'), ('گزارش', 'O'), ('خبرگزاری', 'B-ORG'), ('تسنیم', 'I-ORG'), ('از', 'O'), ('کرج', 'B-LOC'), ('،', 'O'), ('حسین', 'B-PER'), ('محمدی', 'I-PER'), ('در', 'O'), ('نشست', 'O'), ('خبری', 'O'), ('مشترک', 

In [11]:
#@title Live Playground { display-mode: "form" }

css_is_load = False
css = """<style>
.ner-box {
    direction: rtl;
    font-size: 18px !important;
    line-height: 20px !important;
    margin: 0 0 15px;
    padding: 10px;
    text-align: justify;
    color: #343434 !important;
}
.token, .token span {
    display: inline-block !important;
    padding: 2px;
    margin: 2px 0;
}
.token.token-ner {
    background-color: #f6cd61;
    font-weight: bold;
    color: #000;
}
.token.token-ner .ner-label {
    color: #9a1f40;
    margin: 0px 2px;
}
</style>"""

if not css_is_load:
    display(HTML(css))
    css_is_load = True

submit_wd = widgets.Button(description='Send', disabled=False, button_style='success', tooltip='Submit')
text_wd = widgets.Textarea(placeholder='Please enter you text ...', rows=5, layout=Layout(width='90%'))
output_wd = widgets.Output()

display(HTML("""
<h2>Test NER model</h2>
<p style="padding: 2px 20px; margin: 0 0 20px;">
</p>
<br /><br />
"""))

display(text_wd)
display(submit_wd)
display(output_wd)

def submit_text(sender):
    with output_wd:
        clear_output(wait=True)
        text = text_wd.value
        _output = ner_model.ner_inference([text], device, ner_model.config.max_position_embeddings)
        # print(_output)
        pred_sequence = []
        for token, label in _output[0]:
            if token not in ['[CLS]', '[SEP]']:
                if label != 'O':
                    pred_sequence.append(
                        '<span class="token token-ner">%s<span class="ner-label">%s</span></span>' 
                        % (token, label))
                else:
                    pred_sequence.append(
                        '<span class="token">%s</span>' 
                        % token)
            
        html = '<p class="ner-box">%s</p>' % ' '.join(pred_sequence) 
        display(HTML(html))

submit_wd.on_click(submit_text)

Textarea(value='', layout=Layout(width='90%'), placeholder='Please enter you text ...', rows=5)

Button(button_style='success', description='Send', style=ButtonStyle(), tooltip='Submit')

Output()

#### PEYMA dataset:
PEYMA dataset includes 7,145 sentences with a total of 302,530 tokens from which 41,148 tokens are tagged with seven different classes: 

- Organization
- Money
- Location
- Date
- Time
- Person
- Percent

|     Label    |   #   |
|:------------:|:-----:|
| Organization | 16964 |
|     Money    |  2037 |
|   Location   |  8782 |
|     Date     |  4259 |
|     Time     |  732  |
|    Person    |  7675 |
|    Percent   |  699  |

Download
You can download the dataset from [here](https://hooshvare.github.io/docs/datasets/ner) with leads to following google drive file of HooshvareLab:

In [12]:
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
download = drive.CreateFile({'id': '1WZxpFRtEs5HZWyWQ2Pyg9CCuIBs1Kmvx'})
download.GetContentFile('peyma.zip')
!ls

adc.json  peyma.zip  sample_data


In [13]:
!unzip peyma.zip
!ls
!ls peyma

Archive:  peyma.zip
   creating: peyma/
  inflating: peyma/dev.txt           
  inflating: peyma/test.txt          
  inflating: peyma/train.txt         
adc.json  peyma  peyma.zip  sample_data
dev.txt  test.txt  train.txt


In [14]:
sentences, labels = ner_model.load_test_datasets(dataset_name="peyma", dataset_dir="./peyma/")
print(len(sentences), len(labels))
print(sentences[0])
print(labels[0])

1026 1026
['کنایه', 'سرلشگر', 'فیروزآبادی', 'به', 'پادشاه', 'عربستان', 'و', 'پسرش']
['O', 'O', 'B_ORG', 'O', 'O', 'B_LOC', 'O', 'O']


In [15]:
is_consistent = ner_model.check_input_label_consistency(labels)
print(is_consistent)

model labels: dict_keys(['B-DAT', 'B-EVE', 'B-FAC', 'B-LOC', 'B-MON', 'B-ORG', 'B-PCT', 'B-PER', 'B-PRO', 'B-TIM', 'I-DAT', 'I-EVE', 'I-FAC', 'I-LOC', 'I-MON', 'I-ORG', 'I-PCT', 'I-PER', 'I-PRO', 'I-TIM', 'O'])
dataset labels: {'I_MON', 'I_PCT', 'I_LOC', 'I_PER', 'I_DAT', 'B_TIM', 'B_MON', 'B_PER', 'B_ORG', 'I_ORG', 'I_TIM', 'B_DAT', 'B_PCT', 'B_LOC', 'O'}
intersection: {'O'}
model_labels-dataset_labels: ['I-MON', 'I-PRO', 'I-PER', 'B-PER', 'I-LOC', 'B-LOC', 'B-ORG', 'I-EVE', 'B-MON', 'I-DAT', 'B-TIM', 'B-PRO', 'I-ORG', 'I-TIM', 'B-FAC', 'B-EVE', 'I-FAC', 'B-PCT', 'B-DAT', 'I-PCT']
dataset_labels-model_labels: ['I_MON', 'B_TIM', 'I_PCT', 'B_MON', 'B_PER', 'B_ORG', 'I_ORG', 'I_TIM', 'B_DAT', 'I_LOC', 'I_PER', 'B_PCT', 'B_LOC', 'I_DAT']
False


In [16]:
label_translate = {
    'B_ORG': 'B-ORG', 
    'I_ORG': 'I-ORG',
    'B_LOC': 'B-LOC',
    'I_LOC': 'I-LOC',
    'B_PER': 'B-PER', 
    'I_PER': 'I-PER',
    'B_DAT': 'B-DAT', 
    'I_DAT': 'I-DAT', 
    'B_PCT': 'B-PCT', 
    'I_PCT': 'I-PCT', 
    'B_TIM': 'B-TIM', 
    'I_TIM': 'I-TIM', 
    'B_MON': 'B-MON', 
    'I_MON': 'I-MON',
    'O': 'O'
}
labels = ner_model.resolve_input_label_consistency(labels, label_translate)
is_consistent = ner_model.check_input_label_consistency(labels)
print(is_consistent)

model labels: dict_keys(['B-DAT', 'B-EVE', 'B-FAC', 'B-LOC', 'B-MON', 'B-ORG', 'B-PCT', 'B-PER', 'B-PRO', 'B-TIM', 'I-DAT', 'I-EVE', 'I-FAC', 'I-LOC', 'I-MON', 'I-ORG', 'I-PCT', 'I-PER', 'I-PRO', 'I-TIM', 'O'])
dataset labels: {'I-PER', 'I-MON', 'B-PER', 'I-LOC', 'B-LOC', 'B-ORG', 'B-PCT', 'B-MON', 'B-DAT', 'I-DAT', 'B-TIM', 'I-ORG', 'O', 'I-PCT', 'I-TIM'}
intersection: {'I-PER', 'I-MON', 'B-DAT', 'B-PER', 'B-MON', 'I-DAT', 'B-TIM', 'I-LOC', 'B-LOC', 'I-TIM', 'B-ORG', 'I-ORG', 'O', 'I-PCT', 'B-PCT'}
model_labels-dataset_labels: ['I-PRO', 'B-PRO', 'B-EVE', 'I-EVE', 'I-FAC', 'B-FAC']
dataset_labels-model_labels: []
True


In [17]:
!nvidia-smi
!lscpu

Mon Aug 16 15:04:43 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.42.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   73C    P0    72W / 149W |    971MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [18]:
inference_output_peyma = ner_model.ner_evaluation(sentences, labels, device, batch_size=512)

max_len: 155
#samples: 1026
#batch: 3
Start to evaluate test data ...
inference time for step 0: 0.03489479800003892
inference time for step 1: 0.014750266999953965
inference time for step 2: 0.013240209000002778
average loss: 1.9469637870788574
total inference time: 0.06288527399999566
total inference time / #samples: 6.12916900584753e-05


In [19]:
for sample_output in inference_output_peyma[:5]:
  for token, true_label, predicted_label in sample_output:
    print('{}\t{}\t{}'.format(token, true_label, predicted_label))
  print()

کنایه	O	O
سرل	O	O
##شگر	O	O
[UNK]	B-ORG	B-PER
به	O	O
پادشاه	O	O
عربستان	B-LOC	B-LOC
و	O	O
پسرش	O	O

[UNK]	O	O
سابق	O	O
ستاد	B-ORG	O
کل	I-ORG	O
نیروهای	I-ORG	I-ORG
مسلح	I-ORG	I-ORG
با	O	O
بیان	O	O
اینکه	O	O
آ	O	B-LOC
##ل	O	O
سعود	O	I-PER
با	O	O
حمایت	O	O
همه	O	O
جانبه	O	O
غرب	O	O
بر	O	O
سرزمین	B-LOC	O
حجاز	I-LOC	B-LOC
حاکم	O	O
شد	O	O
گفت	O	O
:	O	O
غرب	O	O
با	O	O
حاکم	O	O
کرد	O	O
##د	O	O
آ	O	O
##ل	O	O
سعود	O	O
بر	O	O
حجاز	B-LOC	B-LOC
هدفی	O	O
جز	O	O
##نا	O	O
##بود	O	O
##ی	O	O
اسلام	O	O
نداشته	O	O
و	O	O
این	O	O
نقشه	O	O
انگلیس	B-LOC	B-LOC
بود	O	O
.	O	O

سرل	O	O
##شگر	O	O
حسن	B-PER	B-PER
[UNK]	I-PER	I-PER
روز	O	O
دوشنبه	O	O
درح	O	O
##اشیه	O	O
[UNK]	O	O
ختم	O	O
مادر	O	O
حیدر	B-PER	B-PER
مصلح	I-PER	I-PER
##ی	I-PER	I-PER
درج	O	O
##مع	O	O
خبرنگاران	O	O
درباره	O	O
موضوع	O	O
یمن	B-LOC	B-LOC
افزود	O	O
:	O	O
ماهیت	O	O
آ	O	O
##ن	O	O
##چه	O	O
در	O	O
یمن	B-LOC	B-LOC
اتفاق	O	O
می	O	O
افتد	O	O
وهاب	O	O
##یت	O	O
است	O	O
وهاب	O	O
##یت	O	O
یک	O	O
مذهب	O	O
انگلیسی	O	B-ORG
است	O	O
.	O	O

وی	O	O
ادامه	O	O
دا

In [20]:
ner_model.evaluate_prediction_results(labels, inference_output_peyma)

Test Accuracy: 0.9513963851753439
Test Precision: 0.6586444007858546
Test Recall: 0.5636822194199244
Test F1-Score: 0.6074745186862968
Test classification Report:
              precision    recall  f1-score   support

         DAT  0.5519480519 0.3794642857 0.4497354497       224
         LOC  0.7132352941 0.6918687589 0.7023895728       701
         MON  0.8148148148 0.7586206897 0.7857142857        29
         ORG  0.6754658385 0.5478589421 0.6050069541       794
         PCT  0.8157894737 0.6200000000 0.7045454545        50
         PER  0.5881104034 0.4955277281 0.5378640777       559
         TIM  0.2727272727 0.2727272727 0.2727272727        22

   micro avg  0.6586444008 0.5636822194 0.6074745187      2379
   macro avg  0.6331558785 0.5380096682 0.5797118668      2379
weighted avg  0.6553623369 0.5636822194 0.6045272632      2379



In [21]:
output_file_name = "ner_peyma_{}_outputs.txt".format(model_name.replace('/','-'))
with open(output_file_name, "w", encoding='utf8') as output_file:
  for sample_output in inference_output_peyma:
    for token, true_label, predicted_label in sample_output:
      output_file.write('{}\t{}\t{}\n'.format(token, true_label, predicted_label))
    output_file.write('\n')
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
upload = drive.CreateFile({'title': output_file_name})
upload.SetContentFile(output_file_name)
upload.Upload()

#### Arman dataset:
ARMAN dataset holds 7,682 sentences with 250,015 sentences tagged over six different classes.

1. Organization
2. Location
3. Facility
4. Event
5. Product
6. Person


|     Label    |   #   |
|:------------:|:-----:|
| Organization | 30108 |
|   Location   | 12924 |
|   Facility   |  4458 |
|     Event    |  7557 |
|    Product   |  4389 |
|    Person    | 15645 |

**Download**
You can download the dataset from [here](https://github.com/HaniehP/PersianNER)


In [22]:
!wget https://github.com/HaniehP/PersianNER/raw/master/ArmanPersoNERCorpus.zip
!ls

--2021-08-16 15:05:15--  https://github.com/HaniehP/PersianNER/raw/master/ArmanPersoNERCorpus.zip
Resolving github.com (github.com)... 140.82.121.3
Connecting to github.com (github.com)|140.82.121.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/HaniehP/PersianNER/master/ArmanPersoNERCorpus.zip [following]
--2021-08-16 15:05:15--  https://raw.githubusercontent.com/HaniehP/PersianNER/master/ArmanPersoNERCorpus.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1931170 (1.8M) [application/zip]
Saving to: ‘ArmanPersoNERCorpus.zip’


2021-08-16 15:05:16 (19.4 MB/s) - ‘ArmanPersoNERCorpus.zip’ saved [1931170/1931170]

adc.json						  peyma
ArmanPersoNERCorpus.zip					  peyma.zip
ner_peyma_Hooshvar

In [23]:
!unzip ArmanPersoNERCorpus.zip -d arman
!ls

Archive:  ArmanPersoNERCorpus.zip
  inflating: arman/test_fold1.txt    
  inflating: arman/ReadMe.txt        
  inflating: arman/train_fold3.txt   
  inflating: arman/train_fold2.txt   
  inflating: arman/train_fold1.txt   
  inflating: arman/test_fold3.txt    
  inflating: arman/test_fold2.txt    
adc.json						  peyma
arman							  peyma.zip
ArmanPersoNERCorpus.zip					  sample_data
ner_peyma_HooshvareLab-bert-fa-zwnj-base-ner_outputs.txt


In [24]:
sentences, labels = ner_model.load_test_datasets(dataset_name="arman", dataset_dir="./arman/")
print(len(sentences), len(labels))
print(sentences[0])
print(labels[0])

7681 7681
['افقی', ':', '0', 'ـ', 'از', 'عوامل', 'دوران', 'پهلوی', 'و', 'نخست\u200cوزیر', 'ایران', 'در', 'سالهای', 'ابتدائی', 'دهه', 'چهل', 'خورشیدی', 'كه', 'جلد', 'سوم', 'یادداشتهایش', 'هم', 'چندی', 'پیش', 'در', 'تهران', 'منتشر', 'شد', '0', 'ـ', 'پرستاری', 'از', 'ناخوش\u200cاحوال', 'ـ', 'پوشاک', 'و', 'جامه', 'ـ', 'فانتزی', 'و', 'شیک', '0', 'ـ', 'در', 'حال', 'وزیدن', 'ـ', 'اطلاعیه', 'ـ', 'پایتخت', 'جمهوری', 'استونی', 'در', 'حوضه', 'بالتیک', '0', 'ـ', 'علم', 'راهبرد', 'مؤسسه', 'و', 'سازمان', 'ـ', 'نوعی', 'شمع', '0', 'ـ', 'حرف', 'جمع', 'مؤنث', 'ـ', 'در', 'ایران', 'به', 'تولیدکننده', 'کتاب', 'اطلاق', 'می\u200cشود', 'ـ', 'از', 'شهرهای', 'باختری', 'افغانستان', 'كه', 'تا', 'عصر', 'ناصرالدین\u200cشاه', 'جزئی', 'از', 'خراسان', 'بود', 'ـ', 'ویتامین', 'انعقاد', '0', 'ـ', 'سبزی', 'غده\u200cای', 'ـ', 'دوستی', 'و', 'محبت', 'ـ', 'داستان', 'بلند', 'ـ', 'شهری', 'در', 'آلمان', '0', 'ـ', 'سلول', 'بدن', 'موجودات', 'ـ', 'از', 'انواع', 'کالباس', '0', 'ـ', 'حاشیه', 'و', 'هامش', 'ـ', 'پیدا', 'نشدنی', 'ـ', 'خ

In [25]:
is_consistent = ner_model.check_input_label_consistency(labels)
print(is_consistent)

model labels: dict_keys(['B-DAT', 'B-EVE', 'B-FAC', 'B-LOC', 'B-MON', 'B-ORG', 'B-PCT', 'B-PER', 'B-PRO', 'B-TIM', 'I-DAT', 'I-EVE', 'I-FAC', 'I-LOC', 'I-MON', 'I-ORG', 'I-PCT', 'I-PER', 'I-PRO', 'I-TIM', 'O'])
dataset labels: {'I-pers', 'B-pers', 'I-loc', 'I-pro', 'B-fac', 'B-event', 'I-event', 'I-fac', 'B-pro', 'B-loc', 'I-org', 'O', 'B-org'}
intersection: {'O'}
model_labels-dataset_labels: ['I-MON', 'I-PRO', 'I-PER', 'B-PER', 'I-LOC', 'B-LOC', 'B-ORG', 'I-EVE', 'B-MON', 'I-DAT', 'B-TIM', 'B-PRO', 'I-ORG', 'I-TIM', 'B-FAC', 'B-EVE', 'I-FAC', 'B-PCT', 'B-DAT', 'I-PCT']
dataset_labels-model_labels: ['I-pers', 'I-fac', 'B-pro', 'B-pers', 'I-loc', 'B-event', 'B-loc', 'I-org', 'I-pro', 'B-fac', 'B-org', 'I-event']
False


In [26]:
label_translate = {
    'B-org': 'B-ORG', 
    'I-org': 'I-ORG',
    'B-loc': 'B-LOC',
    'I-loc': 'I-LOC',
    'B-pers': 'B-PER', 
    'I-pers': 'I-PER',
    'B-event': 'B-EVE',
    'I-event': 'I-EVE',
    'B-pro': 'B-PRO',
    'I-pro': 'I-PRO',
    'B-fac': 'B-FAC', 
    'I-fac': 'I-FAC',
    'O': 'O'
}
labels = ner_model.resolve_input_label_consistency(labels, label_translate)
is_consistent = ner_model.check_input_label_consistency(labels)
print(is_consistent)

model labels: dict_keys(['B-DAT', 'B-EVE', 'B-FAC', 'B-LOC', 'B-MON', 'B-ORG', 'B-PCT', 'B-PER', 'B-PRO', 'B-TIM', 'I-DAT', 'I-EVE', 'I-FAC', 'I-LOC', 'I-MON', 'I-ORG', 'I-PCT', 'I-PER', 'I-PRO', 'I-TIM', 'O'])
dataset labels: {'I-PER', 'I-PRO', 'B-PER', 'B-EVE', 'I-LOC', 'B-LOC', 'B-ORG', 'I-EVE', 'I-FAC', 'B-PRO', 'I-ORG', 'O', 'B-FAC'}
intersection: {'I-PER', 'I-PRO', 'B-PER', 'B-EVE', 'I-LOC', 'B-PRO', 'B-LOC', 'B-ORG', 'I-EVE', 'I-ORG', 'O', 'I-FAC', 'B-FAC'}
model_labels-dataset_labels: ['I-MON', 'I-DAT', 'B-DAT', 'B-TIM', 'B-MON', 'B-PCT', 'I-PCT', 'I-TIM']
dataset_labels-model_labels: []
True


batch size=256 -> inference time for one batch is about 205 s

batch size=512 -> inference time for one batch is about 410 s

batch size=1024 -> crach

In [27]:
!nvidia-smi
!lscpu

Mon Aug 16 15:05:17 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.42.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   71C    P0    73W / 149W |   5341MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [28]:
inference_output_arman = ner_model.ner_evaluation(sentences, labels, device, batch_size=512)

max_len: 287
#samples: 7681
#batch: 16
Start to evaluate test data ...
inference time for step 0: 0.02611925299993345
inference time for step 1: 0.014217516000030628
inference time for step 2: 0.012593388000027517
inference time for step 3: 0.012391326999932062
inference time for step 4: 0.012748277000014241
inference time for step 5: 0.012259845000016867
inference time for step 6: 0.01247171599993635
inference time for step 7: 0.012546586999974352
inference time for step 8: 0.018778814000029342
inference time for step 9: 0.012737905999983923
inference time for step 10: 0.01191931799996837
inference time for step 11: 0.013217306000001372
inference time for step 12: 0.012866875999975491
inference time for step 13: 0.012629958000047736
inference time for step 14: 0.012363497000023926
inference time for step 15: 0.01237762499999917
average loss: 2.8188908994197845
total inference time: 0.2222392089998948
total inference time / #samples: 2.8933629605506418e-05


In [29]:
for sample_output in inference_output_arman[:5]:
  for token, true_label, predicted_label in sample_output:
    print('{}\t{}\t{}'.format(token, true_label, predicted_label))
  print()

افقی	O	O
:	O	O
0	O	O
[UNK]	O	O
از	O	O
عوامل	O	O
دوران	O	O
پهلوی	O	O
و	O	O
نخست	O	O
[ZWNJ]	O	O
وزیر	O	O
ایران	B-LOC	B-LOC
در	O	O
سالهای	O	O
[UNK]	O	O
دهه	O	O
چهل	O	O
خورشیدی	O	O
[UNK]	O	O
جلد	O	O
سوم	O	O
یادداشت	O	O
##هایش	O	O
هم	O	O
چندی	O	O
پیش	O	O
در	O	O
تهران	B-LOC	B-LOC
منتشر	O	O
شد	O	O
0	O	O
[UNK]	O	O
پرستاری	O	O
از	O	O
ناخوش	O	O
[ZWNJ]	O	O
احوال	O	O
[UNK]	O	O
پوشاک	O	O
و	O	O
جامه	O	O
[UNK]	O	O
فانتزی	O	O
و	O	O
شیک	O	O
0	O	O
[UNK]	O	O
در	O	O
حال	O	O
وزی	O	O
##دن	O	O
[UNK]	O	O
اطلاعیه	O	O
[UNK]	O	O
پایتخت	O	O
جمهوری	O	B-LOC
استونی	B-LOC	B-LOC
در	I-LOC	I-LOC
حوضه	I-LOC	B-LOC
بالتیک	I-LOC	I-LOC
0	O	O
[UNK]	O	O
علم	O	O
راهبرد	O	O
[UNK]	O	O
و	O	O
سازمان	O	O
[UNK]	O	O
نوعی	O	O
شمع	O	O
0	O	O
[UNK]	O	O
حرف	O	O
جمع	O	O
[UNK]	O	O
[UNK]	O	O
در	O	O
ایران	B-LOC	B-LOC
به	O	O
تولیدکننده	O	O
کتاب	O	O
اطلاق	O	O
می	O	O
[ZWNJ]	O	O
شود	O	O
[UNK]	O	O
از	O	O
شهرهای	O	O
باختری	O	O
افغانستان	B-LOC	B-LOC
[UNK]	O	O
تا	O	O
عصر	O	O
ناصرالدین	B-PER	B-PER
[ZWNJ]	B-PER	I-PER
شاه	B-PER	I-PER
[UNK]	O	O
از	O	O
خرا

In [30]:
ner_model.evaluate_prediction_results(labels, inference_output_arman)

Test Accuracy: 0.9264036159796806
Test Precision: 0.32943808091326315
Test Recall: 0.5123560460652591
Test F1-Score: 0.40102345015375224
Test classification Report:
              precision    recall  f1-score   support

         EVE  0.0335703357 0.5550595238 0.0633115505       672
         FAC  0.4622222222 0.3371150729 0.3898781631       617
         LOC  0.5040885860 0.6353875886 0.5621734587      4657
         ORG  0.6627332602 0.4467258602 0.5337016575      5406
         PER  0.5285072155 0.5133272059 0.5208066208      4352
         PRO  0.5629984051 0.3646694215 0.4426332288       968

   micro avg  0.3294380809 0.5123560461 0.4010234502     16672
   macro avg  0.4590200041 0.4753807788 0.4187507799     16672
weighted avg  0.5448100279 0.5123560461 0.5087183616     16672



In [31]:
output_file_name = "ner_arman_{}_outputs.txt".format(model_name.replace('/','-'))
with open(output_file_name, "w", encoding='utf8') as output_file:
  for sample_output in inference_output_arman:
    for token, true_label, predicted_label in sample_output:
      output_file.write('{}\t{}\t{}\n'.format(token, true_label, predicted_label))
    output_file.write('\n')
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
upload = drive.CreateFile({'title': output_file_name})
upload.SetContentFile(output_file_name)
upload.Upload()

#### Arman+Peyma

In [None]:
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
download = drive.CreateFile({'id': '1WZxpFRtEs5HZWyWQ2Pyg9CCuIBs1Kmvx'})
download.GetContentFile('peyma.zip')
!ls

In [None]:
!unzip peyma.zip
!ls
!ls peyma

In [32]:
sentences_peyma, labels_peyma = ner_model.load_test_datasets(dataset_name="peyma", dataset_dir="./peyma/")
print(len(sentences_peyma), len(labels_peyma))
print(sentences_peyma[0])
print(labels_peyma[0])

1026 1026
['کنایه', 'سرلشگر', 'فیروزآبادی', 'به', 'پادشاه', 'عربستان', 'و', 'پسرش']
['O', 'O', 'B_ORG', 'O', 'O', 'B_LOC', 'O', 'O']


In [33]:
is_consistent = ner_model.check_input_label_consistency(labels_peyma)
print(is_consistent)

model labels: dict_keys(['B-DAT', 'B-EVE', 'B-FAC', 'B-LOC', 'B-MON', 'B-ORG', 'B-PCT', 'B-PER', 'B-PRO', 'B-TIM', 'I-DAT', 'I-EVE', 'I-FAC', 'I-LOC', 'I-MON', 'I-ORG', 'I-PCT', 'I-PER', 'I-PRO', 'I-TIM', 'O'])
dataset labels: {'I_MON', 'I_PCT', 'I_LOC', 'I_PER', 'I_DAT', 'B_TIM', 'B_MON', 'B_PER', 'B_ORG', 'I_ORG', 'I_TIM', 'B_DAT', 'B_PCT', 'B_LOC', 'O'}
intersection: {'O'}
model_labels-dataset_labels: ['I-MON', 'I-PRO', 'I-PER', 'B-PER', 'I-LOC', 'B-LOC', 'B-ORG', 'I-EVE', 'B-MON', 'I-DAT', 'B-TIM', 'B-PRO', 'I-ORG', 'I-TIM', 'B-FAC', 'B-EVE', 'I-FAC', 'B-PCT', 'B-DAT', 'I-PCT']
dataset_labels-model_labels: ['I_MON', 'B_TIM', 'I_PCT', 'B_MON', 'B_PER', 'B_ORG', 'I_ORG', 'I_TIM', 'B_DAT', 'I_LOC', 'I_PER', 'B_PCT', 'B_LOC', 'I_DAT']
False


In [34]:
label_translate = {
    'B_ORG': 'B-ORG', 
    'I_ORG': 'I-ORG',
    'B_LOC': 'B-LOC',
    'I_LOC': 'I-LOC',
    'B_PER': 'B-PER', 
    'I_PER': 'I-PER',
    'B_DAT': 'B-DAT', 
    'I_DAT': 'I-DAT', 
    'B_PCT': 'B-PCT', 
    'I_PCT': 'I-PCT', 
    'B_TIM': 'B-TIM', 
    'I_TIM': 'I-TIM', 
    'B_MON': 'B-MON', 
    'I_MON': 'I-MON',
    'O': 'O'
}
labels_peyma = ner_model.resolve_input_label_consistency(labels_peyma, label_translate)
is_consistent = ner_model.check_input_label_consistency(labels_peyma)
print(is_consistent)

model labels: dict_keys(['B-DAT', 'B-EVE', 'B-FAC', 'B-LOC', 'B-MON', 'B-ORG', 'B-PCT', 'B-PER', 'B-PRO', 'B-TIM', 'I-DAT', 'I-EVE', 'I-FAC', 'I-LOC', 'I-MON', 'I-ORG', 'I-PCT', 'I-PER', 'I-PRO', 'I-TIM', 'O'])
dataset labels: {'I-PER', 'I-MON', 'B-PER', 'I-LOC', 'B-LOC', 'B-ORG', 'B-PCT', 'B-MON', 'B-DAT', 'I-DAT', 'B-TIM', 'I-ORG', 'O', 'I-PCT', 'I-TIM'}
intersection: {'I-PER', 'I-MON', 'B-DAT', 'B-PER', 'B-MON', 'I-DAT', 'B-TIM', 'I-LOC', 'B-LOC', 'I-TIM', 'B-ORG', 'I-ORG', 'O', 'I-PCT', 'B-PCT'}
model_labels-dataset_labels: ['I-PRO', 'B-PRO', 'B-EVE', 'I-EVE', 'I-FAC', 'B-FAC']
dataset_labels-model_labels: []
True


In [None]:
!wget https://github.com/HaniehP/PersianNER/raw/master/ArmanPersoNERCorpus.zip
!ls

In [None]:
!unzip ArmanPersoNERCorpus.zip -d arman
!ls

In [35]:
sentences_arman, labels_arman = ner_model.load_test_datasets(dataset_name="arman", dataset_dir="./arman/")
print(len(sentences_arman), len(labels_arman))
print(sentences_arman[0])
print(labels_arman[0])

7681 7681
['افقی', ':', '0', 'ـ', 'از', 'عوامل', 'دوران', 'پهلوی', 'و', 'نخست\u200cوزیر', 'ایران', 'در', 'سالهای', 'ابتدائی', 'دهه', 'چهل', 'خورشیدی', 'كه', 'جلد', 'سوم', 'یادداشتهایش', 'هم', 'چندی', 'پیش', 'در', 'تهران', 'منتشر', 'شد', '0', 'ـ', 'پرستاری', 'از', 'ناخوش\u200cاحوال', 'ـ', 'پوشاک', 'و', 'جامه', 'ـ', 'فانتزی', 'و', 'شیک', '0', 'ـ', 'در', 'حال', 'وزیدن', 'ـ', 'اطلاعیه', 'ـ', 'پایتخت', 'جمهوری', 'استونی', 'در', 'حوضه', 'بالتیک', '0', 'ـ', 'علم', 'راهبرد', 'مؤسسه', 'و', 'سازمان', 'ـ', 'نوعی', 'شمع', '0', 'ـ', 'حرف', 'جمع', 'مؤنث', 'ـ', 'در', 'ایران', 'به', 'تولیدکننده', 'کتاب', 'اطلاق', 'می\u200cشود', 'ـ', 'از', 'شهرهای', 'باختری', 'افغانستان', 'كه', 'تا', 'عصر', 'ناصرالدین\u200cشاه', 'جزئی', 'از', 'خراسان', 'بود', 'ـ', 'ویتامین', 'انعقاد', '0', 'ـ', 'سبزی', 'غده\u200cای', 'ـ', 'دوستی', 'و', 'محبت', 'ـ', 'داستان', 'بلند', 'ـ', 'شهری', 'در', 'آلمان', '0', 'ـ', 'سلول', 'بدن', 'موجودات', 'ـ', 'از', 'انواع', 'کالباس', '0', 'ـ', 'حاشیه', 'و', 'هامش', 'ـ', 'پیدا', 'نشدنی', 'ـ', 'خ

In [36]:
is_consistent = ner_model.check_input_label_consistency(labels_arman)
print(is_consistent)

model labels: dict_keys(['B-DAT', 'B-EVE', 'B-FAC', 'B-LOC', 'B-MON', 'B-ORG', 'B-PCT', 'B-PER', 'B-PRO', 'B-TIM', 'I-DAT', 'I-EVE', 'I-FAC', 'I-LOC', 'I-MON', 'I-ORG', 'I-PCT', 'I-PER', 'I-PRO', 'I-TIM', 'O'])
dataset labels: {'I-pers', 'B-pers', 'I-loc', 'I-pro', 'B-fac', 'B-event', 'I-event', 'I-fac', 'B-pro', 'B-loc', 'I-org', 'O', 'B-org'}
intersection: {'O'}
model_labels-dataset_labels: ['I-MON', 'I-PRO', 'I-PER', 'B-PER', 'I-LOC', 'B-LOC', 'B-ORG', 'I-EVE', 'B-MON', 'I-DAT', 'B-TIM', 'B-PRO', 'I-ORG', 'I-TIM', 'B-FAC', 'B-EVE', 'I-FAC', 'B-PCT', 'B-DAT', 'I-PCT']
dataset_labels-model_labels: ['I-pers', 'I-fac', 'B-pro', 'B-pers', 'I-loc', 'B-event', 'B-loc', 'I-org', 'I-pro', 'B-fac', 'B-org', 'I-event']
False


In [37]:
label_translate = {
    'B-org': 'B-ORG', 
    'I-org': 'I-ORG',
    'B-loc': 'B-LOC',
    'I-loc': 'I-LOC',
    'B-pers': 'B-PER', 
    'I-pers': 'I-PER',
    'B-event': 'B-EVE',
    'I-event': 'I-EVE',
    'B-pro': 'B-PRO',
    'I-pro': 'I-PRO',
    'B-fac': 'B-FAC', 
    'I-fac': 'I-FAC',
    'O': 'O'
}
labels_arman = ner_model.resolve_input_label_consistency(labels_arman, label_translate)
is_consistent = ner_model.check_input_label_consistency(labels_arman)
print(is_consistent)

model labels: dict_keys(['B-DAT', 'B-EVE', 'B-FAC', 'B-LOC', 'B-MON', 'B-ORG', 'B-PCT', 'B-PER', 'B-PRO', 'B-TIM', 'I-DAT', 'I-EVE', 'I-FAC', 'I-LOC', 'I-MON', 'I-ORG', 'I-PCT', 'I-PER', 'I-PRO', 'I-TIM', 'O'])
dataset labels: {'I-PER', 'I-PRO', 'B-PER', 'B-EVE', 'I-LOC', 'B-LOC', 'B-ORG', 'I-EVE', 'I-FAC', 'B-PRO', 'I-ORG', 'O', 'B-FAC'}
intersection: {'I-PER', 'I-PRO', 'B-PER', 'B-EVE', 'I-LOC', 'B-PRO', 'B-LOC', 'B-ORG', 'I-EVE', 'I-ORG', 'O', 'I-FAC', 'B-FAC'}
model_labels-dataset_labels: ['I-MON', 'I-DAT', 'B-DAT', 'B-TIM', 'B-MON', 'B-PCT', 'I-PCT', 'I-TIM']
dataset_labels-model_labels: []
True


In [38]:
sentences = sentences_arman + sentences_peyma
labels = labels_arman + labels_peyma
print(len(sentences), len(labels))

8707 8707


In [39]:
is_consistent = ner_model.check_input_label_consistency(labels)
print(is_consistent)

model labels: dict_keys(['B-DAT', 'B-EVE', 'B-FAC', 'B-LOC', 'B-MON', 'B-ORG', 'B-PCT', 'B-PER', 'B-PRO', 'B-TIM', 'I-DAT', 'I-EVE', 'I-FAC', 'I-LOC', 'I-MON', 'I-ORG', 'I-PCT', 'I-PER', 'I-PRO', 'I-TIM', 'O'])
dataset labels: {'I-PER', 'I-PRO', 'I-MON', 'B-PER', 'B-EVE', 'I-LOC', 'B-LOC', 'I-TIM', 'B-ORG', 'I-EVE', 'I-FAC', 'B-PCT', 'B-MON', 'B-DAT', 'I-DAT', 'B-TIM', 'B-PRO', 'I-ORG', 'O', 'I-PCT', 'B-FAC'}
intersection: {'I-PER', 'I-MON', 'I-PRO', 'B-PER', 'I-LOC', 'B-LOC', 'B-ORG', 'I-EVE', 'B-MON', 'I-DAT', 'B-TIM', 'B-PRO', 'I-ORG', 'I-TIM', 'B-FAC', 'B-EVE', 'I-FAC', 'B-PCT', 'B-DAT', 'O', 'I-PCT'}
model_labels-dataset_labels: []
dataset_labels-model_labels: []
True


In [40]:
!nvidia-smi
!lscpu

Mon Aug 16 15:11:55 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.42.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   69C    P0    70W / 149W |   8267MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [41]:
inference_output = ner_model.ner_evaluation(sentences, labels, device, batch_size=512)

max_len: 287
#samples: 8707
#batch: 18
Start to evaluate test data ...
inference time for step 0: 0.02602576100002807
inference time for step 1: 0.013520275999894693
inference time for step 2: 0.011762119000081839
inference time for step 3: 0.012068204999991394
inference time for step 4: 0.012613455999940015
inference time for step 5: 0.01238092200014762
inference time for step 6: 0.012779242000078739
inference time for step 7: 0.011930809999967096
inference time for step 8: 0.013097048999952676
inference time for step 9: 0.012064399000109916
inference time for step 10: 0.013062319000027856
inference time for step 11: 0.013134028999957081
inference time for step 12: 0.012224622000076124
inference time for step 13: 0.012272584999891478
inference time for step 14: 0.012511819000110336
inference time for step 15: 0.012153899000168167
inference time for step 16: 0.021451541000033103
inference time for step 17: 0.012851439000087339
average loss: 2.85923969745636
total inference time: 0.2479

In [42]:
for sample_output in inference_output[:5]:
  for token, true_label, predicted_label in sample_output:
    print('{}\t{}\t{}'.format(token, true_label, predicted_label))
  print()

افقی	O	O
:	O	O
0	O	O
[UNK]	O	O
از	O	O
عوامل	O	O
دوران	O	O
پهلوی	O	O
و	O	O
نخست	O	O
[ZWNJ]	O	O
وزیر	O	O
ایران	B-LOC	B-LOC
در	O	O
سالهای	O	O
[UNK]	O	O
دهه	O	O
چهل	O	O
خورشیدی	O	O
[UNK]	O	O
جلد	O	O
سوم	O	O
یادداشت	O	O
##هایش	O	O
هم	O	O
چندی	O	O
پیش	O	O
در	O	O
تهران	B-LOC	B-LOC
منتشر	O	O
شد	O	O
0	O	O
[UNK]	O	O
پرستاری	O	O
از	O	O
ناخوش	O	O
[ZWNJ]	O	O
احوال	O	O
[UNK]	O	O
پوشاک	O	O
و	O	O
جامه	O	O
[UNK]	O	O
فانتزی	O	O
و	O	O
شیک	O	O
0	O	O
[UNK]	O	O
در	O	O
حال	O	O
وزی	O	O
##دن	O	O
[UNK]	O	O
اطلاعیه	O	O
[UNK]	O	O
پایتخت	O	O
جمهوری	O	B-LOC
استونی	B-LOC	B-LOC
در	I-LOC	I-LOC
حوضه	I-LOC	B-LOC
بالتیک	I-LOC	I-LOC
0	O	O
[UNK]	O	O
علم	O	O
راهبرد	O	O
[UNK]	O	O
و	O	O
سازمان	O	O
[UNK]	O	O
نوعی	O	O
شمع	O	O
0	O	O
[UNK]	O	O
حرف	O	O
جمع	O	O
[UNK]	O	O
[UNK]	O	O
در	O	O
ایران	B-LOC	B-LOC
به	O	O
تولیدکننده	O	O
کتاب	O	O
اطلاق	O	O
می	O	O
[ZWNJ]	O	O
شود	O	O
[UNK]	O	O
از	O	O
شهرهای	O	O
باختری	O	O
افغانستان	B-LOC	B-LOC
[UNK]	O	O
تا	O	O
عصر	O	O
ناصرالدین	B-PER	B-PER
[ZWNJ]	B-PER	I-PER
شاه	B-PER	I-PER
[UNK]	O	O
از	O	O
خرا

In [43]:
ner_model.evaluate_prediction_results(labels, inference_output)

Test Accuracy: 0.9262576317956791
Test Precision: 0.34243331708336233
Test Recall: 0.5161933756758176
Test F1-Score: 0.41173145763989194
Test classification Report:
              precision    recall  f1-score   support

         DAT  0.1782945736 0.4107142857 0.2486486486       224
         EVE  0.0328518584 0.5550595238 0.0620322634       672
         FAC  0.4360587002 0.3371150729 0.3802559415       617
         LOC  0.5226069862 0.6450167973 0.5773953721      5358
         MON  0.4230769231 0.7586206897 0.5432098765        29
         ORG  0.6617231638 0.4533870968 0.5380934150      6200
         PCT  0.7500000000 0.6600000000 0.7021276596        50
         PER  0.5259919372 0.5047851761 0.5151704073      4911
         PRO  0.5372907154 0.3646694215 0.4344615385       968
         TIM  0.1590909091 0.3181818182 0.2121212121        22

   micro avg  0.3424333171 0.5161933757 0.4117314576     19051
   macro avg  0.4226985767 0.5007549882 0.4213516335     19051
weighted avg  0.5453985

In [44]:
output_file_name = "ner_arman-and-peyma_{}_outputs.txt".format(model_name.replace('/','-'))
with open(output_file_name, "w", encoding='utf8') as output_file:
  for sample_output in inference_output:
    for token, true_label, predicted_label in sample_output:
      output_file.write('{}\t{}\t{}\n'.format(token, true_label, predicted_label))
    output_file.write('\n')
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
upload = drive.CreateFile({'title': output_file_name})
upload.SetContentFile(output_file_name)
upload.Upload()

#### WikiAnn dataset:

In [45]:
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
download = drive.CreateFile({'id': '1QOG15HU8VfZvJUNKos024xI-OGm0zhEX'})
download.GetContentFile('fa.tar.gz')
!ls

adc.json
arman
ArmanPersoNERCorpus.zip
fa.tar.gz
ner_arman-and-peyma_HooshvareLab-bert-fa-zwnj-base-ner_outputs.txt
ner_arman_HooshvareLab-bert-fa-zwnj-base-ner_outputs.txt
ner_peyma_HooshvareLab-bert-fa-zwnj-base-ner_outputs.txt
peyma
peyma.zip
sample_data


In [46]:
!tar -zxvf fa.tar.gz
!ls

README.txt
wikiann-fa.bio
adc.json
arman
ArmanPersoNERCorpus.zip
fa.tar.gz
ner_arman-and-peyma_HooshvareLab-bert-fa-zwnj-base-ner_outputs.txt
ner_arman_HooshvareLab-bert-fa-zwnj-base-ner_outputs.txt
ner_peyma_HooshvareLab-bert-fa-zwnj-base-ner_outputs.txt
peyma
peyma.zip
README.txt
sample_data
wikiann-fa.bio


In [47]:
sentences_all, labels_all, sentences_test, labels_test = ner_model.load_datasets(dataset_name="wikiann", dataset_dir="./")
print(len(sentences_all), len(sentences_all))
print(len(sentences_test), len(labels_test))
print(sentences_test[0])
print(labels_test[0])

all data: #data: 272266, #labels: 272266


  return array(a, dtype, copy=False, order=order)


without stratify
test part:
 #data: 27227, #labels: 27227
272266 272266
27227 27227
['**', 'زاغی', 'نوک\u200cزرد', ',', "''Pica", 'nuttalli', "''"]
['O', 'B-LOC', 'I-LOC', 'O', 'O', 'O', 'O']


In [48]:
is_consistent = ner_model.check_input_label_consistency(labels_test)
print(is_consistent)

model labels: dict_keys(['B-DAT', 'B-EVE', 'B-FAC', 'B-LOC', 'B-MON', 'B-ORG', 'B-PCT', 'B-PER', 'B-PRO', 'B-TIM', 'I-DAT', 'I-EVE', 'I-FAC', 'I-LOC', 'I-MON', 'I-ORG', 'I-PCT', 'I-PER', 'I-PRO', 'I-TIM', 'O'])
dataset labels: {'I-PER', 'B-PER', 'I-LOC', 'B-LOC', 'B-ORG', 'I-ORG', 'O'}
intersection: {'I-PER', 'B-PER', 'I-LOC', 'B-LOC', 'B-ORG', 'I-ORG', 'O'}
model_labels-dataset_labels: ['I-MON', 'I-DAT', 'I-PRO', 'B-TIM', 'B-DAT', 'B-PRO', 'B-EVE', 'I-TIM', 'I-EVE', 'B-PCT', 'B-FAC', 'I-PCT', 'I-FAC', 'B-MON']
dataset_labels-model_labels: []
True


In [49]:
!nvidia-smi
!lscpu

Mon Aug 16 15:19:36 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.42.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   71C    P0    73W / 149W |   8695MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [50]:
inference_output_wikiann = ner_model.ner_evaluation_2(sentences_test, labels_test, device, batch_size=512)

len(input_text): 27227
len(input_labels): 27227
c: 10000
c: 20000
max_len: 115
#samples: 27227
#batch: 54
Start to evaluate test data ...
inference time for step 0: 0.022613880999870162
inference time for step 1: 0.012594886999977462
inference time for step 2: 0.01224773500007359
inference time for step 3: 0.01186484400000154
inference time for step 4: 0.01445890199988753
inference time for step 5: 0.011645285999975385
inference time for step 6: 0.012215476999926977
inference time for step 7: 0.011736841999891112
inference time for step 8: 0.011957014000017807
inference time for step 9: 0.01142766499992831
inference time for step 10: 0.01171900800000003
inference time for step 11: 0.012453273000119225
inference time for step 12: 0.011754164999956629
inference time for step 13: 0.0119812609998462
inference time for step 14: 0.012299060000032114
inference time for step 15: 0.012623833000134255
inference time for step 16: 0.013493278999931135
inference time for step 17: 0.0125128849999782

In [51]:
for sample_output in inference_output_wikiann[:5]:
  for token, true_label, predicted_label in sample_output:
    print('{}\t{}\t{}'.format(token, true_label, predicted_label))
  print()

*	O	O
*	O	O
زاغ	B-LOC	B-PER
##ی	B-LOC	I-PER
نوک	I-LOC	I-PER
[ZWNJ]	I-LOC	I-PER
زرد	I-LOC	I-PER
,	O	O
'	O	O
'	O	O
Pic	O	O
##a	O	O
n	O	O
##ut	O	O
##ta	O	O
##ll	O	O
##i	O	O
'	O	O
'	O	O

تغییر	O	B-PER
##مس	O	I-PER
##یر	O	I-PER
مک	B-LOC	I-PER
[ZWNJ]	B-LOC	I-PER
ویل	B-LOC	I-PER
،	B-LOC	O
داکوتای	I-LOC	B-LOC
شمالی	I-LOC	I-LOC

وست	B-LOC	B-EVE
یونیور	I-LOC	B-ORG
##سیتی	I-LOC	I-ORG
پلیس	I-LOC	I-EVE
،	I-LOC	O
تگزاس	I-LOC	B-LOC

تغییر	O	B-EVE
##مس	O	I-PER
##یر	O	O
دلت	B-PER	I-PER
##ف	B-PER	I-PER
فون	I-PER	I-PER
لیل	I-PER	I-PER
##نس	I-PER	I-PER
##رون	I-PER	I-PER

تغییر	O	O
##مس	O	O
##یر	O	O
نیروگاه	B-ORG	O
[ZWNJ]	B-ORG	B-EVE
های	B-ORG	O
زنجیره	I-ORG	B-LOC
[ZWNJ]	I-ORG	B-EVE
ای	I-ORG	I-LOC
یاسوج	I-ORG	B-LOC



In [52]:
ner_model.evaluate_prediction_results(labels_test, inference_output_wikiann)

Test Accuracy: 0.5181010902794272
Test Precision: 0.15501633224451702
Test Recall: 0.1765518707482993
Test F1-Score: 0.16508472891715947
Test classification Report:
              precision    recall  f1-score   support

         LOC  0.1265558195 0.1000187723 0.1117332494     26635
         ORG  0.3456561922 0.3246282738 0.3348123930     13249
         PER  0.0667264217 0.1872554500 0.0983919524      7156

   micro avg  0.1550163322 0.1765518707 0.1650847289     47040
   macro avg  0.1796461445 0.2039674987 0.1816458649     47040
weighted avg  0.1791646985 0.1765518707 0.1725348067     47040



In [53]:
output_file_name = "ner_wikiann_{}_outputs.txt".format(model_name.replace('/','-'))
with open(output_file_name, "w", encoding='utf8') as output_file:
  for sample_output in inference_output_wikiann:
    for token, true_label, predicted_label in sample_output:
      output_file.write('{}\t{}\t{}\n'.format(token, true_label, predicted_label))
    output_file.write('\n')
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
upload = drive.CreateFile({'title': output_file_name})
upload.SetContentFile(output_file_name)
upload.Upload()

#### Hooshvare - Arman+Peyma+WikiAnn

https://github.com/hooshvare/parsner

In [54]:
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
download = drive.CreateFile({'id': '1fC2WGlpqumUTaT9Dr_U1jO2no3YMKFJ4'})
download.GetContentFile('ner-v1.zip')
!ls

adc.json
arman
ArmanPersoNERCorpus.zip
fa.tar.gz
ner_arman-and-peyma_HooshvareLab-bert-fa-zwnj-base-ner_outputs.txt
ner_arman_HooshvareLab-bert-fa-zwnj-base-ner_outputs.txt
ner_peyma_HooshvareLab-bert-fa-zwnj-base-ner_outputs.txt
ner-v1.zip
ner_wikiann_HooshvareLab-bert-fa-zwnj-base-ner_outputs.txt
peyma
peyma.zip
README.txt
sample_data
wikiann-fa.bio


In [55]:
!unzip ner-v1.zip
!ls
!ls ner

Archive:  ner-v1.zip
   creating: ner/
  inflating: ner/valid.csv           
  inflating: ner/ner.csv             
  inflating: ner/test.csv            
  inflating: ner/train.csv           
adc.json
arman
ArmanPersoNERCorpus.zip
fa.tar.gz
ner
ner_arman-and-peyma_HooshvareLab-bert-fa-zwnj-base-ner_outputs.txt
ner_arman_HooshvareLab-bert-fa-zwnj-base-ner_outputs.txt
ner_peyma_HooshvareLab-bert-fa-zwnj-base-ner_outputs.txt
ner-v1.zip
ner_wikiann_HooshvareLab-bert-fa-zwnj-base-ner_outputs.txt
peyma
peyma.zip
README.txt
sample_data
wikiann-fa.bio
ner.csv  test.csv  train.csv  valid.csv


In [56]:
sentences_paw, labels_paw = ner_model.load_test_datasets(dataset_name="hooshvare-peyman+arman+wikiann", dataset_dir="./ner/")
print(len(sentences_paw), len(labels_paw))
print(sentences_paw[0])
print(labels_paw[0])

test part:
 #sentences: 6049, #sentences_tags: 6049
6049 6049
['همچنین', 'عملیات', 'لرزه\u200cنگاری', 'دوبعدی', 'نیز', 'با', 'فعالیت', 'مستمر', 'چهار', 'گروه', 'کاری', 'در', 'مناطقی', 'که', 'از', 'نظر', 'اکتشافی', 'مورد', 'نظر', 'بود', '،', 'به', 'پایان', 'رسید', 'که', 'نتایج', 'آن', 'در', 'حال', 'بررسی', 'است', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [57]:
is_consistent = ner_model.check_input_label_consistency(labels_paw)
print(is_consistent)

model labels: dict_keys(['B-DAT', 'B-EVE', 'B-FAC', 'B-LOC', 'B-MON', 'B-ORG', 'B-PCT', 'B-PER', 'B-PRO', 'B-TIM', 'I-DAT', 'I-EVE', 'I-FAC', 'I-LOC', 'I-MON', 'I-ORG', 'I-PCT', 'I-PER', 'I-PRO', 'I-TIM', 'O'])
dataset labels: {'I-PER', 'I-PRO', 'I-MON', 'B-PER', 'B-EVE', 'I-LOC', 'B-LOC', 'I-TIM', 'B-ORG', 'I-EVE', 'I-FAC', 'B-MON', 'B-PCT', 'B-DAT', 'I-DAT', 'B-TIM', 'B-PRO', 'I-ORG', 'O', 'I-PCT', 'B-FAC'}
intersection: {'I-PER', 'I-MON', 'I-PRO', 'B-PER', 'I-LOC', 'B-LOC', 'B-ORG', 'I-EVE', 'B-MON', 'I-DAT', 'B-TIM', 'B-PRO', 'I-ORG', 'I-TIM', 'B-FAC', 'B-EVE', 'I-FAC', 'B-PCT', 'B-DAT', 'O', 'I-PCT'}
model_labels-dataset_labels: []
dataset_labels-model_labels: []
True


In [58]:
!nvidia-smi
!lscpu

Mon Aug 16 15:28:12 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.42.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   70C    P0    71W / 149W |   3987MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [59]:
inference_output = ner_model.ner_evaluation_2(sentences_paw, labels_paw, device, batch_size=256)

len(input_text): 6049
len(input_labels): 6049
max_len: 512
#samples: 6049
#batch: 24
Start to evaluate test data ...
inference time for step 0: 0.04267346700021335
inference time for step 1: 0.013330374999895866
inference time for step 2: 0.012311772999964887
inference time for step 3: 0.012299098000084996
inference time for step 4: 0.014045640999938769
inference time for step 5: 0.013255491000109032
inference time for step 6: 0.01241337300007217
inference time for step 7: 0.012583361999986664
inference time for step 8: 0.011883570999998483
inference time for step 9: 0.012661411999943084
inference time for step 10: 0.013035381000008783
inference time for step 11: 0.012665896999806137
inference time for step 12: 0.012081637999926897
inference time for step 13: 0.012153827000020101
inference time for step 14: 0.011760656999740604
inference time for step 15: 0.011664711000321404
inference time for step 16: 0.013023749000240059
inference time for step 17: 0.01228988799994113
inference time

In [60]:
for sample_output in inference_output[:5]:
  for token, true_label, predicted_label in sample_output:
    print('{}\t{}\t{}'.format(token, true_label, predicted_label))
  print()

همچنین	O	O
عملیات	O	O
لرزه	O	O
[ZWNJ]	O	B-EVE
نگاری	O	O
دوبعدی	O	O
نیز	O	O
با	O	O
فعالیت	O	O
مستمر	O	O
چهار	O	O
گروه	O	O
کاری	O	O
در	O	O
مناطقی	O	O
که	O	O
از	O	O
نظر	O	O
اکتشافی	O	O
مورد	O	O
نظر	O	O
بود	O	O
،	O	O
به	O	O
پایان	O	O
رسید	O	O
که	O	O
نتایج	O	O
آ	O	O
##ن	O	O
در	O	O
حال	O	O
بررسی	O	O
است	O	O
.	O	O

محدث	B-PER	B-PER
در	O	O
مورد	O	O
مشارکت	O	O
شرکتهای	O	O
خارجی	O	O
در	O	O
فعالیتهای	O	O
اکتشافی	O	O
کشور	O	O
گفت	O	O
:	O	O
تاکنون	O	O
چند	O	O
منطقه	O	O
اکتشافی	O	O
را	O	O
برای	O	O
مشارکت	O	O
و	O	O
سرمایه	O	O
[ZWNJ]	O	B-EVE
گذاری	O	O
شرکتهای	O	O
خارجی	O	O
اعلام	O	O
کرده	O	O
[ZWNJ]	O	B-EVE
ایم	O	O
و	O	O
در	O	O
حال	O	O
مذاکره	O	O
با	O	O
طرفه	O	O
##ای	O	O
خارجی	O	O
هستیم	O	O
و	O	O
انتظار	O	O
می	O	O
[ZWNJ]	O	B-EVE
رود	O	O
تا	O	O
آ	O	O
##خر	O	O
امسال	O	O
بتوانیم	O	O
چند	O	O
قرارداد	O	O
را	O	O
نهایی	O	O
کنیم	O	O
.	O	O

مدیر	O	O
امور	B-ORG	O
اکتشاف	I-ORG	O
شرکت	I-ORG	B-ORG
ملی	I-ORG	I-ORG
نفت	I-ORG	I-ORG
فرو	O	O
##افتادگی	O	O
دزفول	B-LOC	B-LOC
و	O	O
منطقه	B-LOC	B-LOC
گسل	I-LOC	I-LOC
کازرون	

In [61]:
ner_model.evaluate_prediction_results(labels_paw, inference_output)

Test Accuracy: 0.9079543658675141
Test Precision: 0.2709653060368335
Test Recall: 0.50547816559712
Test F1-Score: 0.3528062269561655
Test classification Report:
              precision    recall  f1-score   support

         DAT  0.3449664430 0.5788288288 0.4322960471       444
         EVE  0.0121011253 0.5268456376 0.0236588306       298
         FAC  0.2941176471 0.2105263158 0.2453987730       285
         LOC  0.6128872367 0.6516469038 0.6316730524      3795
         MON  0.5652173913 0.5284552846 0.5462184874       123
         ORG  0.6054971706 0.3890909091 0.4737507906      3850
         PCT  0.7281553398 0.7575757576 0.7425742574        99
         PER  0.6118188252 0.5080811049 0.5551452882      3403
         PRO  0.4089456869 0.2990654206 0.3454790823       428
         TIM  0.3541666667 0.3207547170 0.3366336634        53

   micro avg  0.2709653060 0.5054781656 0.3528062270     12778
   macro avg  0.4537873532 0.4770870880 0.4332828272     12778
weighted avg  0.5724756693 

In [62]:
output_file_name = "ner_arman-and-peyma-and-wikiann_{}_outputs.txt".format(model_name.replace('/','-'))
with open(output_file_name, "w", encoding='utf8') as output_file:
  for sample_output in inference_output:
    for token, true_label, predicted_label in sample_output:
      output_file.write('{}\t{}\t{}\n'.format(token, true_label, predicted_label))
    output_file.write('\n')
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
upload = drive.CreateFile({'title': output_file_name})
upload.SetContentFile(output_file_name)
upload.Upload()

### New Test

In [None]:
from transformers import pipeline
nlp = pipeline("ner", model=ner_model.model, tokenizer=ner_model.tokenizer)
example = "کنایه سرلشگر فیروزآبادی به پادشاه عربستان و پسرش"

ner_results = nlp(example)
for ent in ner_results:
  print(ent)