# AlbertNER
This model is fine-tuned for the Named Entity Recognition (NER) task on a mixed NER dataset collected from ARMAN, PEYMA, and WikiANN that covers ten types of entities:

* Date (DAT)
* Event (EVE)
* Facility (FAC)
* Location (LOC)
* Money (MON)
* Organization (ORG)
* Percent (PCT)
* Person (PER)
* Product (PRO)
* Time (TIM)


In [1]:
# Record the GPU and CPU of this runtime so the inference timings printed
# later in the notebook can be interpreted.
!nvidia-smi
!lscpu

Mon Aug 16 13:12:45 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.42.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   49C    P8    32W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
!pip install hazm==0.7.0
!pip install seqeval==1.2.2
!pip install sentencepiece==0.1.96
!pip install transformers==4.7.0

Collecting hazm==0.7.0
  Downloading hazm-0.7.0-py3-none-any.whl (316 kB)
[?25l[K     |█                               | 10 kB 27.0 MB/s eta 0:00:01[K     |██                              | 20 kB 32.8 MB/s eta 0:00:01[K     |███                             | 30 kB 23.6 MB/s eta 0:00:01[K     |████▏                           | 40 kB 19.0 MB/s eta 0:00:01[K     |█████▏                          | 51 kB 14.1 MB/s eta 0:00:01[K     |██████▏                         | 61 kB 11.1 MB/s eta 0:00:01[K     |███████▎                        | 71 kB 11.8 MB/s eta 0:00:01[K     |████████▎                       | 81 kB 13.1 MB/s eta 0:00:01[K     |█████████▎                      | 92 kB 13.8 MB/s eta 0:00:01[K     |██████████▍                     | 102 kB 10.1 MB/s eta 0:00:01[K     |███████████▍                    | 112 kB 10.1 MB/s eta 0:00:01[K     |████████████▍                   | 122 kB 10.1 MB/s eta 0:00:01[K     |█████████████▌                  | 133 kB 10.1 MB/s eta 

In [3]:
# NOTE(review): PyDrive is installed without a version pin, unlike the
# packages installed in the previous cell.
!pip install PyDrive
import os
import IPython.display as ipd
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate the Colab user, then build a Google Drive client that uses the
# default application credentials.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)



In [4]:
import os
import gc
import ast
import time
import hazm
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

import transformers
from transformers import AutoTokenizer, AutoConfig
from transformers import AutoModelForTokenClassification

from IPython.display import display, HTML, clear_output
from ipywidgets import widgets, Layout

from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from seqeval.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Report the versions of the core libraries used by this notebook.
print()
print('numpy', np.__version__)
print('pandas', pd.__version__)
print('transformers', transformers.__version__)
print('torch', torch.__version__)
print()

# Pick the device once: the GPU when CUDA is usable, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')


numpy 1.19.5
pandas 1.1.5
transformers 4.7.0
torch 1.9.0+cu102

There are 1 GPU(s) available.
We will use the GPU: Tesla K80


In [5]:
class NER:
    def __init__(self, model_name):
        """Load everything needed to run the NER model called `model_name`.

        Creates a hazm normalizer and loads the config, tokenizer and
        token-classification model through their `from_pretrained` loaders.
        The config's `id2label` mapping is exposed as `self.id2label`.
        """
        self.model_name = model_name
        self.normalizer = hazm.Normalizer()

        # Same loading order as before: config, then tokenizer, then model.
        self.config = AutoConfig.from_pretrained(model_name)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForTokenClassification.from_pretrained(model_name)

        # Label-id -> tag-name mapping used to decode predictions.
        self.id2label = self.config.id2label

    @staticmethod
    def load_ner_data(file_path, word_index, tag_index, delimiter, join=False):
        dataset, labels = [], []
        with open(file_path, encoding="utf8") as infile:
            sample_text, sample_label = [], []
            for line in infile:
                parts = line.strip().split(delimiter)
                if len(parts) > 1:
                    word, tag = parts[word_index], parts[tag_index]
                    if not word:
                        continue
                    sample_text.append(word)
                    sample_label.append(tag)
                else:
                    # end of sample
                    if sample_text and sample_label:
                        if join:
                            dataset.append(' '.join(sample_text))
                            labels.append(' '.join(sample_label))
                        else:
                            dataset.append(sample_text)
                            labels.append(sample_label)
                    sample_text, sample_label = [], []
        if sample_text and sample_label:
            if join:
                dataset.append(' '.join(sample_text))
                labels.append(' '.join(sample_label))
            else:
                dataset.append(sample_text)
                labels.append(sample_label)
        return dataset, labels

    def load_test_datasets(self, dataset_name, dataset_dir, **kwargs):
        if dataset_name.lower() == "peyma":
            ner_file_path = dataset_dir + 'test.txt'
            if not os.path.exists(ner_file_path):
                print(ner_file_path)
                exit(1)
            return self.load_ner_data(ner_file_path, word_index=0, tag_index=1, delimiter='|',
                                      join=kwargs.get('join', False))
        elif dataset_name.lower() == "arman":
            dataset, labels = [], []
            for i in range(1, 4):
                ner_file_path = dataset_dir + f'test_fold{i}.txt'
                if not os.path.exists(ner_file_path):
                    print(ner_file_path)
                dataset_part, labels_part = self.load_ner_data(ner_file_path, word_index=0, tag_index=1, delimiter=' ',
                                                               join=kwargs.get('join', False))
                dataset += dataset_part
                labels += labels_part
            return dataset, labels
        elif dataset_name.lower() == "hooshvare-peyman+arman+wikiann":
            ner_file_path = dataset_dir + 'test.csv'
            if not os.path.exists(ner_file_path):
                print(ner_file_path)
                exit(1)
            data = pd.read_csv(ner_file_path, delimiter="\t")
            sentences, sentences_tags = data['tokens'].values.tolist(), data['ner_tags'].values.tolist()
            sentences = [ast.literal_eval(ss) for ss in sentences]
            sentences_tags = [ast.literal_eval(ss) for ss in sentences_tags]
            print(f'test part:\n #sentences: {len(sentences)}, #sentences_tags: {len(sentences_tags)}')
            return sentences, sentences_tags

    def load_datasets(self, dataset_name, dataset_dir, **kwargs):
        if dataset_name.lower() == "farsiyar":
            dataset, labels = [], []
            for i in range(1, 6):
                ner_file_path = dataset_dir + 'Persian-NER-part{i}.txt'
                if not os.path.exists(ner_file_path):
                    print(ner_file_path)
                dataset_part, labels_part = self.load_ner_data(ner_file_path, word_index=0, tag_index=1, delimiter='\t',
                                                               join=kwargs.get('join', False))
                dataset += dataset_part
                labels += labels_part
            return dataset, labels
        elif dataset_name.lower() == "wikiann":
            ner_file_path = dataset_dir + 'wikiann-fa.bio'
            if not os.path.exists(ner_file_path):
                print(ner_file_path)
                exit(1)
            dataset_all, labels_all = self.load_ner_data(ner_file_path, word_index=0, tag_index=-1, delimiter=' ',
                                                         join=kwargs.get('join', False))
            print(f'all data: #data: {len(dataset_all)}, #labels: {len(labels_all)}')

            try:
                _, data_test, _, label_test = train_test_split(dataset_all, labels_all, test_size=0.1, random_state=1,
                                                               stratify=labels_all)
                print("with stratify")
            except:
                _, data_test, _, label_test = train_test_split(dataset_all, labels_all, test_size=0.1, random_state=1)
                print("without stratify")
            print(f'test part:\n #data: {len(data_test)}, #labels: {len(label_test)}')
            return dataset_all, labels_all, data_test, label_test

    def ner_inference(self, input_text, device, max_length):
        if not self.model or not self.tokenizer or not self.id2label:
            print('Something wrong has been happened!')
            return

        pt_batch = self.tokenizer(
            [self.normalizer.normalize(sequence) for sequence in input_text],
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors="pt"
        )

        gc.collect()
        torch.cuda.empty_cache()
        # Tell pytorch to run this model on the GPU.
        if device.type != 'cpu':
            self.model.cuda()

        pt_batch = pt_batch.to(device)
        pt_outputs = self.model(**pt_batch)
        pt_predictions = torch.argmax(pt_outputs.logits, dim=-1)
        pt_predictions = pt_predictions.cpu().detach().numpy().tolist()

        output_predictions = []
        for i, sequence in enumerate(input_text):
            tokens = self.tokenizer.tokenize(self.tokenizer.decode(self.tokenizer.encode(sequence)))
            predictions = [(token, self.id2label[prediction]) for token, prediction in
                           zip(tokens, pt_predictions[i])]
            output_predictions.append(predictions)
        return output_predictions

    def ner_evaluation(self, input_text, input_labels, device, batch_size=4):
        if not self.model or not self.tokenizer or not self.id2label:
            print('Something wrong has been happened!')
            return

        max_len = 0
        tokenized_texts, new_labels = [], []
        for sentence, sentence_label in zip(input_text, input_labels):
            if type(sentence) == str:
                sentence = sentence.strip().split()
            if len(sentence) != len(sentence_label):
                print('Something wrong has been happened! Length of a sentence and its label is not equal!')
                return
            tokenized_sentence, new_sentence_label = [], []
            for word, label in zip(sentence, sentence_label):
                # Tokenize the word and count # of subwords the word is broken into
                tokenized_word = self.tokenizer.tokenize(word)
                n_subwords = len(tokenized_word)

                # Add the tokenized word to the final tokenized word list
                tokenized_sentence.extend(tokenized_word)
                # Add the same label to the new list of labels `n_subwords` times
                new_sentence_label.extend([label] * n_subwords)

            max_len = max(max_len, len(tokenized_sentence))
            tokenized_texts.append(tokenized_sentence)
            new_labels.append(new_sentence_label)

        max_len = min(max_len, self.config.max_position_embeddings)
        print("max_len:", max_len)
        input_ids = pad_sequences([self.tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                                  maxlen=max_len, dtype="long", value=self.config.pad_token_id,
                                  truncating="post", padding="post")
        del tokenized_texts
        input_labels = pad_sequences([[self.config.label2id.get(l) for l in lab] for lab in new_labels],
                                     maxlen=max_len, value=self.config.label2id.get('O'), padding="post",
                                     dtype="long", truncating="post")
        del new_labels

        train_data = TensorDataset(torch.tensor(input_ids), torch.tensor(input_labels))
        data_loader = DataLoader(train_data, batch_size=batch_size)
        # data_loader = DataLoader(train_data, sampler=RandomSampler(train_data), batch_size=batch_size)
        print("#samples:", len(input_ids))
        print("#batch:", len(data_loader))

        gc.collect()
        torch.cuda.empty_cache()
        # Tell pytorch to run this model on the GPU.
        if device.type != 'cpu':
            self.model.cuda()

        total_loss, total_time = 0, 0
        output_predictions = []
        print("Start to evaluate test data ...")
        for step, batch in enumerate(data_loader):
            b_input_ids, b_labels = batch

            # move tensors to GPU if CUDA is available
            b_input_ids = b_input_ids.to(device)
            b_labels = b_labels.to(device)

            # This will return the loss (rather than the model output) because we have provided the `labels`.
            with torch.no_grad():
                start = time.monotonic()
                outputs = self.model(b_input_ids, labels=b_labels)
                end = time.monotonic()
                total_time += end - start
                print(f'inference time for step {step}: {end - start}')
            # get the loss
            total_loss += outputs.loss.item()

            b_predictions = torch.argmax(outputs.logits, dim=2)
            b_predictions = b_predictions.cpu().detach().numpy().tolist()
            b_labels = b_labels.cpu().detach().numpy().tolist()

            for i, sample in enumerate(b_input_ids):
                sample_input = list(sample)
                # remove pad tokens
                while sample_input[-1] == self.config.pad_token_id:
                    sample_input.pop()
                # tokens = self.tokenizer.tokenize(self.tokenizer.decode(sample_input))
                tokens = [self.tokenizer.decode([t]) for t in sample_input]
                sample_true_labels = [self.id2label[e] for e in b_labels[i][:len(sample_input)]]
                sample_predictions = [self.id2label[e] for e in b_predictions[i][:len(sample_input)]]
                output_predictions.append(
                    [(t, sample_true_labels[j], sample_predictions[j]) for j, t in enumerate(tokens)])

        # Calculate the average loss over the training data.
        avg_train_loss = total_loss / len(data_loader)
        print("average loss:", avg_train_loss)
        print("total inference time:", total_time)
        print("total inference time / #samples:", total_time / len(input_ids))

        return output_predictions

    def ner_evaluation_2(self, input_text, input_labels, device, batch_size=4):
        if not self.model or not self.tokenizer or not self.id2label:
            print('Something wrong has been happened!')
            return

        print("len(input_text):", len(input_text))
        print("len(input_labels):", len(input_labels))
        c = 0
        max_len = 0
        tokenized_texts, new_labels = [], []
        for sentence, sentence_label in zip(input_text, input_labels):
            if type(sentence) == str:
                sentence = sentence.strip().split()
            if len(sentence) != len(sentence_label):
                print('Something wrong has been happened! Length of a sentence and its label is not equal!')
                return
            tokenized_words = self.tokenizer(sentence, padding=False, add_special_tokens=False).input_ids
            tokenized_sentence_ids, new_sentence_label = [], []
            for i, tokenized_word in enumerate(tokenized_words):
                # Add the tokenized word to the final tokenized word list
                tokenized_sentence_ids += tokenized_word
                # Add the same label to the new list of labels `number of subwords` times
                new_sentence_label.extend([self.config.label2id.get(sentence_label[i])] * len(tokenized_word))

            max_len = max(max_len, len(tokenized_sentence_ids))
            tokenized_texts.append(tokenized_sentence_ids)
            new_labels.append(new_sentence_label)
            c += 1
            if c % 10000 == 0:
                print("c:", c)
        max_len = min(max_len, self.config.max_position_embeddings)
        print("max_len:", max_len)
        input_ids = pad_sequences(tokenized_texts, maxlen=max_len, dtype="long", value=self.config.pad_token_id,
                                  truncating="post", padding="post")
        del tokenized_texts
        input_labels = pad_sequences(new_labels, maxlen=max_len, value=self.config.label2id.get('O'), padding="post",
                                     dtype="long", truncating="post")
        del new_labels

        train_data = TensorDataset(torch.tensor(input_ids), torch.tensor(input_labels))
        data_loader = DataLoader(train_data, batch_size=batch_size)
        # data_loader = DataLoader(train_data, sampler=RandomSampler(train_data), batch_size=batch_size)
        print("#samples:", len(input_ids))
        print("#batch:", len(data_loader))

        gc.collect()
        torch.cuda.empty_cache()
        # Tell pytorch to run this model on the GPU.
        if device.type != 'cpu':
            self.model.cuda()

        total_time = 0
        output_predictions = []
        print("Start to evaluate test data ...")
        for step, batch in enumerate(data_loader):
            b_input_ids, b_labels = batch

            # move tensors to GPU if CUDA is available
            b_input_ids = b_input_ids.to(device)
            b_labels = b_labels.to(device)

            # This will return the loss (rather than the model output) because we have provided the `labels`.
            with torch.no_grad():
                start = time.monotonic()
                outputs = self.model(b_input_ids, labels=b_labels)
                end = time.monotonic()
                total_time += end - start
                print(f'inference time for step {step}: {end - start}')

            b_predictions = torch.argmax(outputs.logits, dim=2)
            b_predictions = b_predictions.cpu().detach().numpy().tolist()
            b_labels = b_labels.cpu().detach().numpy().tolist()

            for i, sample in enumerate(b_input_ids):
                sample_input = list(sample)
                # remove pad tokens
                while sample_input[-1] == self.config.pad_token_id:
                    sample_input.pop()
                # tokens = self.tokenizer.tokenize(self.tokenizer.decode(sample_input))
                tokens = [self.tokenizer.decode([t]) for t in sample_input]
                sample_true_labels = [self.id2label[e] for e in b_labels[i][:len(sample_input)]]
                sample_predictions = [self.id2label[e] for e in b_predictions[i][:len(sample_input)]]
                output_predictions.append(
                    [(t, sample_true_labels[j], sample_predictions[j]) for j, t in enumerate(tokens)])

        print("total inference time:", total_time)
        print("total inference time / #samples:", total_time / len(input_ids))

        return output_predictions

    def check_input_label_consistency(self, labels):
        model_labels = self.config.label2id.keys()
        dataset_labels = set()
        for l in labels:
            dataset_labels.update(set(l))
        print("model labels:", model_labels)
        print("dataset labels:", dataset_labels)
        print("intersection:", set(model_labels).intersection(dataset_labels))
        print("model_labels-dataset_labels:", list(set(model_labels) - set(dataset_labels)))
        print("dataset_labels-model_labels:", list(set(dataset_labels) - set(model_labels)))
        if list(set(dataset_labels) - set(model_labels)):
            return False
        return True

    @staticmethod
    def resolve_input_label_consistency(labels, label_translation_map):
        for i, sentence_labels in enumerate(labels):
            for j, label in enumerate(sentence_labels):
                labels[i][j] = label_translation_map.get(label)
        return labels

    @staticmethod
    def evaluate_prediction_results(labels, output_predictions):
        """Print accuracy, precision, recall, F1 and a classification report.

        Args:
            labels: per-sentence label lists of the dataset; only used to
                collect the set of labels that occur in the dataset.
            output_predictions: per-sentence lists of
                `(token, true_label, predicted_label)` tuples.

        A predicted label that never occurs in `labels` is counted as 'O'.
        """
        dataset_labels = set()
        for sentence_labels in labels:
            dataset_labels.update(set(sentence_labels))

        true_labels = [
            [gold for _, gold, _ in sample_output]
            for sample_output in output_predictions
        ]
        predictions = [
            [predicted if predicted in dataset_labels else 'O' for _, _, predicted in sample_output]
            for sample_output in output_predictions
        ]

        summary_metrics = (
            ("Test Accuracy: {}", accuracy_score),
            ("Test Precision: {}", precision_score),
            ("Test Recall: {}", recall_score),
            ("Test F1-Score: {}", f1_score),
        )
        for template, metric in summary_metrics:
            print(template.format(metric(true_labels, predictions)))
        print("Test classification Report:\n{}".format(classification_report(true_labels, predictions, digits=10)))


In [6]:
# Model identifier handed to the `from_pretrained` loaders inside NER.__init__.
model_name='HooshvareLab/albert-fa-zwnj-base-v2-ner'
ner_model = NER(model_name)

Downloading:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/857k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.23M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/186 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/499 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/44.5M [00:00<?, ?B/s]

In [7]:
# Show the loaded configuration, including the id2label / label2id tag mapping.
print(ner_model.config)

AlbertConfig {
  "architectures": [
    "AlbertForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0,
  "bos_token_id": 2,
  "classifier_dropout_prob": 0.1,
  "down_scale_factor": 1,
  "embedding_size": 128,
  "eos_token_id": 3,
  "finetuning_task": "ner",
  "gap_size": 0,
  "hidden_act": "gelu_new",
  "hidden_dropout_prob": 0,
  "hidden_size": 768,
  "id2label": {
    "0": "O",
    "1": "B-DAT",
    "2": "B-EVE",
    "3": "B-FAC",
    "4": "B-LOC",
    "5": "B-MON",
    "6": "B-ORG",
    "7": "B-PCT",
    "8": "B-PER",
    "9": "B-PRO",
    "10": "B-TIM",
    "11": "I-DAT",
    "12": "I-EVE",
    "13": "I-FAC",
    "14": "I-LOC",
    "15": "I-MON",
    "16": "I-ORG",
    "17": "I-PCT",
    "18": "I-PER",
    "19": "I-PRO",
    "20": "I-TIM"
  },
  "initializer_range": 0.02,
  "inner_group_num": 1,
  "intermediate_size": 3072,
  "label2id": {
    "B-DAT": 1,
    "B-EVE": 2,
    "B-FAC": 3,
    "B-LOC": 4,
    "B-MON": 5,
    "B-ORG": 6,
    "B-PCT": 7,
    "B-PER": 8,
    "B

#### Sample Inference:

In [8]:
texts = [
    "مدیرکل محیط زیست استان البرز با بیان اینکه با بیان اینکه موضوع شیرابه‌های زباله‌های انتقال یافته در منطقه حلقه دره خطری برای این استان است، گفت: در این مورد گزارشاتی در ۲۵ مرداد ۱۳۹۷ تقدیم مدیران استان شده است.",
    "به گزارش خبرگزاری تسنیم از کرج، حسین محمدی در نشست خبری مشترک با معاون خدمات شهری شهرداری کرج که با حضور مدیرعامل سازمان‌های پسماند، پارک‌ها و فضای سبز و نماینده منابع طبیعی در سالن کنفرانس شهرداری کرج برگزار شد، اظهار داشت: ۸۰٪  جمعیت استان البرز در کلانشهر کرج زندگی می‌کنند.",
    "وی افزود: با همکاری‌های مشترک بین اداره کل محیط زیست و شهرداری کرج برنامه‌های مشترکی برای حفاظت از محیط زیست در شهر کرج در دستور کار قرار گرفته که این اقدامات آثار مثبتی داشته و تاکنون نزدیک به ۱۰۰ میلیارد هزینه جهت خریداری اکس-ریس صورت گرفته است.",
]

In [9]:
# Tag the sample texts; inputs are truncated to the model's maximum position count.
inference_output = ner_model.ner_inference(texts, device, ner_model.config.max_position_embeddings)

In [10]:
# One list of (token, label) pairs per input text.
print(inference_output)

[[('[CLS]', 'B-ORG'), ('▁مدیرکل', 'O'), ('▁محیط', 'B-ORG'), ('▁زیست', 'I-ORG'), ('▁استان', 'I-ORG'), ('▁البرز', 'I-ORG'), ('▁با', 'O'), ('▁بیان', 'O'), ('▁اینکه', 'O'), ('▁با', 'O'), ('▁بیان', 'O'), ('▁اینکه', 'O'), ('▁موضوع', 'O'), ('▁شیر', 'O'), ('ابه', 'O'), ('[ZWNJ]', 'O'), ('▁های', 'O'), ('▁زباله', 'O'), ('[ZWNJ]', 'O'), ('▁های', 'O'), ('▁انتقال', 'O'), ('▁یافته', 'O'), ('▁در', 'O'), ('▁منطقه', 'O'), ('▁حلقه', 'O'), ('▁در', 'O'), ('ه', 'O'), ('▁خطری', 'O'), ('▁برای', 'O'), ('▁این', 'O'), ('▁استان', 'O'), ('▁است', 'O'), ('،', 'O'), ('▁گفت', 'O'), (':', 'O'), ('▁در', 'O'), ('▁این', 'O'), ('▁مورد', 'O'), ('▁گزارشات', 'O'), ('ی', 'O'), ('▁در', 'O'), ('▁۲۵', 'B-DAT'), ('▁مرداد', 'I-DAT'), ('▁۱۳۹۷', 'I-DAT'), ('▁تقدیم', 'O'), ('▁مدیران', 'O'), ('▁استان', 'O'), ('▁شده', 'O'), ('▁است', 'O'), ('.', 'O'), ('[SEP]', 'O')], [('[CLS]', 'O'), ('▁به', 'O'), ('▁گزارش', 'O'), ('▁خبرگزاری', 'B-ORG'), ('▁تسنیم', 'I-ORG'), ('▁از', 'O'), ('▁کرج', 'B-LOC'), ('،', 'O'), ('▁حسین', 'B-PER'), ('▁محمدی', 'I

In [11]:
#@title Live Playground { display-mode: "form" }

# NOTE(review): this flag is reset to False every time the cell runs, so the
# guard below always injects the stylesheet.
css_is_load = False
# Stylesheet for the rendered predictions: right-to-left text box, with tagged
# tokens highlighted and their label shown in a contrasting colour.
css = """<style>
.ner-box {
    direction: rtl;
    font-size: 18px !important;
    line-height: 20px !important;
    margin: 0 0 15px;
    padding: 10px;
    text-align: justify;
    color: #343434 !important;
}
.token, .token span {
    display: inline-block !important;
    padding: 2px;
    margin: 2px 0;
}
.token.token-ner {
    background-color: #f6cd61;
    font-weight: bold;
    color: #000;
}
.token.token-ner .ner-label {
    color: #9a1f40;
    margin: 0px 2px;
}
</style>"""

if not css_is_load:
    display(HTML(css))
    css_is_load = True

# Widgets of the playground: a submit button, a text area for the input and an
# output area that the click handler renders into.
submit_wd = widgets.Button(description='Send', disabled=False, button_style='success', tooltip='Submit')
text_wd = widgets.Textarea(placeholder='Please enter you text ...', rows=5, layout=Layout(width='90%'))
output_wd = widgets.Output()

display(HTML("""
<h2>Test NER model</h2>
<p style="padding: 2px 20px; margin: 0 0 20px;">
</p>
<br /><br />
"""))

display(text_wd)
display(submit_wd)
display(output_wd)

def submit_text(sender):
    """Button callback: tag the text in `text_wd` and render it in `output_wd`.

    Each token becomes a `<span>`; tokens whose label is not 'O' are
    highlighted and followed by their label. The '[CLS]' and '[SEP]' tokens
    are not shown.

    Args:
        sender: the widget that fired the click event (unused).
    """
    # Local import keeps this cell self-contained.
    from html import escape

    with output_wd:
        clear_output(wait=True)
        text = text_wd.value
        _output = ner_model.ner_inference([text], device, ner_model.config.max_position_embeddings)
        if not _output:
            # ner_inference returns None when the model is not loaded.
            return

        pred_sequence = []
        for token, label in _output[0]:
            if token in ('[CLS]', '[SEP]'):
                continue
            # Tokens derive from user-typed text; escape them so characters
            # such as '<' or '&' are shown literally instead of being parsed
            # as markup.
            safe_token = escape(token)
            if label != 'O':
                pred_sequence.append(
                    '<span class="token token-ner">%s<span class="ner-label">%s</span></span>'
                    % (safe_token, escape(label)))
            else:
                pred_sequence.append('<span class="token">%s</span>' % safe_token)

        html = '<p class="ner-box">%s</p>' % ' '.join(pred_sequence)
        display(HTML(html))

# Run submit_text whenever the button is clicked.
submit_wd.on_click(submit_text)

Textarea(value='', layout=Layout(width='90%'), placeholder='Please enter you text ...', rows=5)

Button(button_style='success', description='Send', style=ButtonStyle(), tooltip='Submit')

Output()

#### PEYMA dataset:
The PEYMA dataset includes 7,145 sentences with a total of 302,530 tokens, of which 41,148 tokens are tagged with seven different classes:

- Organization
- Money
- Location
- Date
- Time
- Person
- Percent

|     Label    |   #   |
|:------------:|:-----:|
| Organization | 16964 |
|     Money    |  2037 |
|   Location   |  8782 |
|     Date     |  4259 |
|     Time     |  732  |
|    Person    |  7675 |
|    Percent   |  699  |

Download
You can download the dataset from [here](https://hooshvare.github.io/docs/datasets/ner), which leads to the following Google Drive file of HooshvareLab:

In [12]:
# Authenticate again and rebuild the Drive client (same steps as the setup cell).
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
# Fetch the Drive file with the id below and store it locally as peyma.zip.
download = drive.CreateFile({'id': '1WZxpFRtEs5HZWyWQ2Pyg9CCuIBs1Kmvx'})
download.GetContentFile('peyma.zip')
!ls

adc.json  peyma.zip  sample_data


In [13]:
!unzip peyma.zip
!ls
!ls peyma

Archive:  peyma.zip
   creating: peyma/
  inflating: peyma/dev.txt           
  inflating: peyma/test.txt          
  inflating: peyma/train.txt         
adc.json  peyma  peyma.zip  sample_data
dev.txt  test.txt  train.txt


In [14]:
# Load the PEYMA test split and peek at the first sentence and its labels.
sentences, labels = ner_model.load_test_datasets(dataset_name="peyma", dataset_dir="./peyma/")
print(len(sentences), len(labels))
print(sentences[0])
print(labels[0])

1026 1026
['کنایه', 'سرلشگر', 'فیروزآبادی', 'به', 'پادشاه', 'عربستان', 'و', 'پسرش']
['O', 'O', 'B_ORG', 'O', 'O', 'B_LOC', 'O', 'O']


In [15]:
# Compare the dataset's label names with the label names the model knows.
is_consistent = ner_model.check_input_label_consistency(labels)
print(is_consistent)

model labels: dict_keys(['B-DAT', 'B-EVE', 'B-FAC', 'B-LOC', 'B-MON', 'B-ORG', 'B-PCT', 'B-PER', 'B-PRO', 'B-TIM', 'I-DAT', 'I-EVE', 'I-FAC', 'I-LOC', 'I-MON', 'I-ORG', 'I-PCT', 'I-PER', 'I-PRO', 'I-TIM', 'O'])
dataset labels: {'I_PCT', 'B_PER', 'B_PCT', 'I_LOC', 'I_DAT', 'B_MON', 'B_TIM', 'I_TIM', 'I_PER', 'B_ORG', 'I_ORG', 'O', 'B_DAT', 'I_MON', 'B_LOC'}
intersection: {'O'}
model_labels-dataset_labels: ['B-PRO', 'B-LOC', 'I-DAT', 'B-PER', 'I-PER', 'I-PRO', 'I-LOC', 'I-PCT', 'I-TIM', 'I-FAC', 'B-EVE', 'I-MON', 'B-PCT', 'B-ORG', 'B-FAC', 'B-TIM', 'I-EVE', 'I-ORG', 'B-DAT', 'B-MON']
dataset_labels-model_labels: ['I_PCT', 'B_PER', 'B_DAT', 'B_PCT', 'I_LOC', 'I_MON', 'I_DAT', 'B_MON', 'B_TIM', 'I_TIM', 'I_PER', 'B_ORG', 'B_LOC', 'I_ORG']
False


In [16]:
# PEYMA writes its tags with an underscore (e.g. 'B_PER') while the model uses
# a hyphen ('B-PER'); build the mapping for every entity type and both prefixes.
label_translate = {
    f'{prefix}_{entity}': f'{prefix}-{entity}'
    for entity in ('PER', 'LOC', 'ORG', 'MON', 'DAT', 'TIM', 'PCT')
    for prefix in ('B', 'I')
}
label_translate['O'] = 'O'

# Rename the dataset labels, then verify that the model knows all of them.
labels = ner_model.resolve_input_label_consistency(labels, label_translate)
is_consistent = ner_model.check_input_label_consistency(labels)
print(is_consistent)

model labels: dict_keys(['B-DAT', 'B-EVE', 'B-FAC', 'B-LOC', 'B-MON', 'B-ORG', 'B-PCT', 'B-PER', 'B-PRO', 'B-TIM', 'I-DAT', 'I-EVE', 'I-FAC', 'I-LOC', 'I-MON', 'I-ORG', 'I-PCT', 'I-PER', 'I-PRO', 'I-TIM', 'O'])
dataset labels: {'I-DAT', 'B-LOC', 'I-MON', 'B-PER', 'I-PER', 'B-PCT', 'B-ORG', 'I-LOC', 'I-PCT', 'O', 'B-TIM', 'I-ORG', 'I-TIM', 'B-DAT', 'B-MON'}
intersection: {'I-DAT', 'B-LOC', 'I-PCT', 'O', 'B-TIM', 'I-MON', 'I-ORG', 'I-TIM', 'B-ORG', 'B-PER', 'I-PER', 'B-DAT', 'B-PCT', 'B-MON', 'I-LOC'}
model_labels-dataset_labels: ['B-PRO', 'B-FAC', 'I-EVE', 'B-EVE', 'I-FAC', 'I-PRO']
dataset_labels-model_labels: []
True


In [17]:
# Hardware state right before the evaluation run.
!nvidia-smi
!lscpu

Mon Aug 16 13:14:30 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.42.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   53C    P0    76W / 149W |    589MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [18]:
# Run the model over the PEYMA test sentences in batches of 512.
inference_output_peyma = ner_model.ner_evaluation(sentences, labels, device, batch_size=512)

max_len: 167
#samples: 1026
#batch: 3
Start to evaluate test data ...
inference time for step 0: 0.045081542999980684
inference time for step 1: 0.01587190999998711
inference time for step 2: 0.016097095999953126
average loss: 2.3410960833231607
total inference time: 0.07705054899992092
total inference time / #samples: 7.509800097458179e-05


In [19]:
# Preview the first five PEYMA sentences as token / gold tag / predicted tag rows.
for sentence_rows in inference_output_peyma[:5]:
  for row in sentence_rows:
    token, gold_tag, predicted_tag = row
    print('{}\t{}\t{}'.format(token, gold_tag, predicted_tag))
  print()  # blank line between sentences

کنایه	O	I-ORG
سر	O	I-ORG
ل	O	I-ORG
ش	O	I-ORG
گر	O	I-ORG
فیروز	B-ORG	I-ORG
اباد	B-ORG	I-ORG
ی	B-ORG	I-ORG
به	O	I-ORG
پادشاه	O	I-ORG
عربستان	B-LOC	I-ORG
و	O	I-ORG
پسرش	O	I-ORG

	O	I-ORG
ر	O	I-ORG
<unk>	O	I-ORG
یس	O	I-ORG
سابق	O	I-ORG
ستاد	B-ORG	B-ORG
کل	I-ORG	I-ORG
نیروهای	I-ORG	I-ORG
مسلح	I-ORG	I-ORG
با	O	I-ORG
بیان	O	I-ORG
اینکه	O	O
ال	O	B-ORG
سعود	O	I-ORG
با	O	O
حمایت	O	O
همه	O	O
جانبه	O	O
غرب	O	I-ORG
بر	O	O
سرزمین	B-LOC	I-ORG
حجاز	I-LOC	I-ORG
حاکم	O	I-ORG
شد	O	O
گفت	O	O
	O	I-ORG
:	O	I-ORG
غرب	O	I-ORG
با	O	O
حاکم	O	O
کرد	O	O
د	O	I-ORG
ال	O	B-ORG
سعود	O	I-ORG
بر	O	O
حجاز	B-LOC	I-ORG
هدفی	O	I-ORG
جز	O	I-ORG
نا	O	I-ORG
بود	O	I-ORG
ی	O	I-ORG
اسلام	O	O
نداشته	O	I-ORG
و	O	I-ORG
این	O	I-ORG
نقشه	O	O
انگلیس	B-LOC	I-ORG
بود	O	I-ORG
	O	I-ORG
.	O	I-ORG

سر	O	I-ORG
ل	O	I-ORG
ش	O	I-ORG
گر	O	I-ORG
حسن	B-PER	I-ORG
فیروز	I-PER	I-ORG
اباد	I-PER	I-ORG
ی	I-PER	I-ORG
روز	O	I-ORG
دوشنبه	O	I-ORG
در	O	O
حاشیه	O	O
	O	B-ORG
ا	O	O
<unk>	O	O
ین	O	O
ختم	O	O
مادر	O	O
حیدر	B-PER	I-ORG
مصلح	I-PER	I-ORG
ی	I-PER	I-OR

In [20]:
# Print accuracy, precision, recall, F1 and the per-entity-type report for PEYMA.
# NOTE(review): the scores printed below are very low and the sample rows above
# are dominated by I-ORG predictions; gold B- tags also appear repeated on every
# sub-token of a word. Presumably the label alignment / id-to-label mapping in
# ner_model needs checking — TODO confirm.
ner_model.evaluate_prediction_results(labels, inference_output_peyma)

Test Accuracy: 0.5722360041942661
Test Precision: 0.062427071178529754
Test Recall: 0.13161131611316113
Test F1-Score: 0.08468539770478829
Test classification Report:
              precision    recall  f1-score   support

         DAT  0.3333333333 0.0315315315 0.0576131687       222
         LOC  0.4712643678 0.0598540146 0.1062176166       685
         MON  0.1875000000 0.1071428571 0.1363636364        28
         ORG  0.0516064257 0.3048635824 0.0882706509       843
         PCT  0.3500000000 0.1186440678 0.1772151899        59
         PER  0.3750000000 0.0103448276 0.0201342282       580
         TIM  0.0000000000 0.0000000000 0.0000000000        22

   micro avg  0.0624270712 0.1316113161 0.0846853977      2439
   macro avg  0.2526720181 0.0903401259 0.0836877844      2439
weighted avg  0.2803281299 0.1316113161 0.0762251250      2439



In [21]:
# Dump every (token, gold tag, predicted tag) triple as one TSV line, with a
# blank line after each sentence.
output_file_name = "ner_peyma_{}_outputs.txt".format(model_name.replace('/','-'))
with open(output_file_name, "w", encoding='utf8') as output_file:
  for sentence_rows in inference_output_peyma:
    output_file.writelines(
        '{}\t{}\t{}\n'.format(token, gold_tag, predicted_tag)
        for token, gold_tag, predicted_tag in sentence_rows
    )
    output_file.write('\n')

# Authenticate, then upload the file to Google Drive under the same name.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
upload = drive.CreateFile({'title': output_file_name})
upload.SetContentFile(output_file_name)
upload.Upload()

#### Arman dataset:
ARMAN dataset holds 7,682 sentences with 250,015 tokens tagged over six different classes.

1. Organization
2. Location
3. Facility
4. Event
5. Product
6. Person


|     Label    |   #   |
|:------------:|:-----:|
| Organization | 30108 |
|   Location   | 12924 |
|   Facility   |  4458 |
|     Event    |  7557 |
|    Product   |  4389 |
|    Person    | 15645 |

**Download**
You can download the dataset from [here](https://github.com/HaniehP/PersianNER)


In [22]:
# Fetch the ARMAN corpus archive from the PersianNER GitHub repository and list the working directory.
!wget https://github.com/HaniehP/PersianNER/raw/master/ArmanPersoNERCorpus.zip
!ls

--2021-08-16 13:15:06--  https://github.com/HaniehP/PersianNER/raw/master/ArmanPersoNERCorpus.zip
Resolving github.com (github.com)... 140.82.121.4
Connecting to github.com (github.com)|140.82.121.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/HaniehP/PersianNER/master/ArmanPersoNERCorpus.zip [following]
--2021-08-16 13:15:06--  https://raw.githubusercontent.com/HaniehP/PersianNER/master/ArmanPersoNERCorpus.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1931170 (1.8M) [application/zip]
Saving to: ‘ArmanPersoNERCorpus.zip’


2021-08-16 13:15:06 (26.0 MB/s) - ‘ArmanPersoNERCorpus.zip’ saved [1931170/1931170]

adc.json						       peyma
ArmanPersoNERCorpus.zip					       peyma.zip
ner_peym

In [23]:
# Extract the ARMAN folds into ./arman and list the working directory.
!unzip ArmanPersoNERCorpus.zip -d arman
!ls

Archive:  ArmanPersoNERCorpus.zip
  inflating: arman/test_fold1.txt    
  inflating: arman/ReadMe.txt        
  inflating: arman/train_fold3.txt   
  inflating: arman/train_fold2.txt   
  inflating: arman/train_fold1.txt   
  inflating: arman/test_fold3.txt    
  inflating: arman/test_fold2.txt    
adc.json						       peyma
arman							       peyma.zip
ArmanPersoNERCorpus.zip					       sample_data
ner_peyma_HooshvareLab-albert-fa-zwnj-base-v2-ner_outputs.txt


In [24]:
# Load the ARMAN sentences and their tags, then sanity-check the sizes and the first sample.
# Note: this rebinds `sentences` / `labels`, which held the PEYMA data in the previous section.
sentences, labels = ner_model.load_test_datasets(dataset_name="arman", dataset_dir="./arman/")
print(len(sentences), len(labels))
print(sentences[0])
print(labels[0])

7681 7681
['افقی', ':', '0', 'ـ', 'از', 'عوامل', 'دوران', 'پهلوی', 'و', 'نخست\u200cوزیر', 'ایران', 'در', 'سالهای', 'ابتدائی', 'دهه', 'چهل', 'خورشیدی', 'كه', 'جلد', 'سوم', 'یادداشتهایش', 'هم', 'چندی', 'پیش', 'در', 'تهران', 'منتشر', 'شد', '0', 'ـ', 'پرستاری', 'از', 'ناخوش\u200cاحوال', 'ـ', 'پوشاک', 'و', 'جامه', 'ـ', 'فانتزی', 'و', 'شیک', '0', 'ـ', 'در', 'حال', 'وزیدن', 'ـ', 'اطلاعیه', 'ـ', 'پایتخت', 'جمهوری', 'استونی', 'در', 'حوضه', 'بالتیک', '0', 'ـ', 'علم', 'راهبرد', 'مؤسسه', 'و', 'سازمان', 'ـ', 'نوعی', 'شمع', '0', 'ـ', 'حرف', 'جمع', 'مؤنث', 'ـ', 'در', 'ایران', 'به', 'تولیدکننده', 'کتاب', 'اطلاق', 'می\u200cشود', 'ـ', 'از', 'شهرهای', 'باختری', 'افغانستان', 'كه', 'تا', 'عصر', 'ناصرالدین\u200cشاه', 'جزئی', 'از', 'خراسان', 'بود', 'ـ', 'ویتامین', 'انعقاد', '0', 'ـ', 'سبزی', 'غده\u200cای', 'ـ', 'دوستی', 'و', 'محبت', 'ـ', 'داستان', 'بلند', 'ـ', 'شهری', 'در', 'آلمان', '0', 'ـ', 'سلول', 'بدن', 'موجودات', 'ـ', 'از', 'انواع', 'کالباس', '0', 'ـ', 'حاشیه', 'و', 'هامش', 'ـ', 'پیدا', 'نشدنی', 'ـ', 'خ

In [25]:
# Compare the raw ARMAN tag set against the model's label set (expected to differ before translation).
is_consistent = ner_model.check_input_label_consistency(labels)
print(is_consistent)

model labels: dict_keys(['B-DAT', 'B-EVE', 'B-FAC', 'B-LOC', 'B-MON', 'B-ORG', 'B-PCT', 'B-PER', 'B-PRO', 'B-TIM', 'I-DAT', 'I-EVE', 'I-FAC', 'I-LOC', 'I-MON', 'I-ORG', 'I-PCT', 'I-PER', 'I-PRO', 'I-TIM', 'O'])
dataset labels: {'I-pers', 'B-pro', 'B-event', 'B-fac', 'I-loc', 'B-loc', 'I-fac', 'B-org', 'O', 'B-pers', 'I-pro', 'I-event', 'I-org'}
intersection: {'O'}
model_labels-dataset_labels: ['B-PRO', 'B-LOC', 'I-DAT', 'B-PER', 'I-PER', 'I-PRO', 'I-LOC', 'I-PCT', 'I-TIM', 'I-FAC', 'B-EVE', 'I-MON', 'B-PCT', 'B-ORG', 'B-FAC', 'B-TIM', 'I-EVE', 'I-ORG', 'B-DAT', 'B-MON']
dataset_labels-model_labels: ['B-org', 'I-pers', 'B-pro', 'B-event', 'B-pers', 'B-fac', 'I-pro', 'I-event', 'I-loc', 'B-loc', 'I-fac', 'I-org']
False


In [26]:
# ARMAN uses lowercase, longer tag names (e.g. B-pers, I-event); map each
# B-/I- pair onto the model's upper-case three-letter tags.
label_translate = {
    'B-org': 'B-ORG', 'I-org': 'I-ORG',
    'B-loc': 'B-LOC', 'I-loc': 'I-LOC',
    'B-pers': 'B-PER', 'I-pers': 'I-PER',
    'B-event': 'B-EVE', 'I-event': 'I-EVE',
    'B-pro': 'B-PRO', 'I-pro': 'I-PRO',
    'B-fac': 'B-FAC', 'I-fac': 'I-FAC',
    'O': 'O',
}
# Apply the mapping, then re-run the consistency check on the translated tags.
labels = ner_model.resolve_input_label_consistency(labels, label_translate)
is_consistent = ner_model.check_input_label_consistency(labels)
print(is_consistent)

model labels: dict_keys(['B-DAT', 'B-EVE', 'B-FAC', 'B-LOC', 'B-MON', 'B-ORG', 'B-PCT', 'B-PER', 'B-PRO', 'B-TIM', 'I-DAT', 'I-EVE', 'I-FAC', 'I-LOC', 'I-MON', 'I-ORG', 'I-PCT', 'I-PER', 'I-PRO', 'I-TIM', 'O'])
dataset labels: {'B-PRO', 'B-LOC', 'B-EVE', 'B-PER', 'I-PER', 'I-PRO', 'B-ORG', 'I-LOC', 'B-FAC', 'O', 'I-EVE', 'I-ORG', 'I-FAC'}
intersection: {'B-PRO', 'B-LOC', 'B-FAC', 'O', 'I-EVE', 'B-EVE', 'I-ORG', 'B-ORG', 'I-FAC', 'B-PER', 'I-PER', 'I-PRO', 'I-LOC'}
model_labels-dataset_labels: ['I-DAT', 'I-PCT', 'B-TIM', 'I-MON', 'I-TIM', 'B-DAT', 'B-PCT', 'B-MON']
dataset_labels-model_labels: []
True


batch size=256 -> inference time for one batch is about 205 s

batch size=512 -> inference time for one batch is about 410 s

batch size=1024 -> crash

In [27]:
# Snapshot GPU and CPU state right before the ARMAN inference cell.
!nvidia-smi
!lscpu

Mon Aug 16 13:15:07 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.42.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   64C    P0    78W / 149W |   8023MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [28]:
# Run the model over the ARMAN sentences in batches of 256.
# NOTE(review): the per-step times logged below (~0.015 s) disagree with the
# ~205 s per batch mentioned in the note above; presumably the timer does not
# wait for the GPU to finish — confirm inside ner_model.ner_evaluation.
inference_output_arman = ner_model.ner_evaluation(sentences, labels, device, batch_size=256)

max_len: 343
#samples: 7681
#batch: 31
Start to evaluate test data ...
inference time for step 0: 0.030384463000018513
inference time for step 1: 0.015388011999959872
inference time for step 2: 0.017534320000038406
inference time for step 3: 0.014890434999983881
inference time for step 4: 0.014860805000012078
inference time for step 5: 0.024457469000026322
inference time for step 6: 0.014809802000002037
inference time for step 7: 0.016128756999989946
inference time for step 8: 0.014580296999952225
inference time for step 9: 0.014843377999909535
inference time for step 10: 0.014998206999962349
inference time for step 11: 0.021414721000041936
inference time for step 12: 0.015230474999953003
inference time for step 13: 0.015563865999979498
inference time for step 14: 0.017451900999958525
inference time for step 15: 0.014979799000002458
inference time for step 16: 0.014994709000006878
inference time for step 17: 0.015278211000008923
inference time for step 18: 0.015186240000048201
inferenc

In [29]:
# Preview the first five ARMAN sentences as token / gold tag / predicted tag rows.
for sentence_rows in inference_output_arman[:5]:
  for row in sentence_rows:
    token, gold_tag, predicted_tag = row
    print('{}\t{}\t{}'.format(token, gold_tag, predicted_tag))
  print()  # blank line between sentences

افقی	O	O
	O	O
:	O	O
0	O	O
	O	O
<unk>	O	O
از	O	O
عوامل	O	O
دوران	O	O
پهلوی	O	O
و	O	O
نخست	O	O
[ZWNJ]	O	O
وزیر	O	O
ایران	B-LOC	B-LOC
در	O	O
سالهای	O	O
ابتدا	O	O
<unk>	O	O
ی	O	O
دهه	O	O
چهل	O	O
خورشیدی	O	O
	O	O
<unk>	O	O
ه	O	O
جلد	O	O
سوم	O	O
یادداشت	O	O
هایش	O	O
هم	O	O
چندی	O	O
پیش	O	O
در	O	O
تهران	B-LOC	B-LOC
منتشر	O	O
شد	O	O
0	O	O
	O	O
<unk>	O	O
پرستاری	O	O
از	O	O
ناخو	O	O
ش	O	O
[ZWNJ]	O	O
اح	O	O
وال	O	O
	O	O
<unk>	O	O
پوشاک	O	O
و	O	O
جامه	O	O
	O	O
<unk>	O	O
فانتزی	O	O
و	O	O
شیک	O	O
0	O	O
	O	O
<unk>	O	O
در	O	O
حال	O	O
و	O	O
زید	O	O
ن	O	O
	O	O
<unk>	O	O
اطلاعیه	O	O
	O	O
<unk>	O	O
پایتخت	O	O
جمهوری	O	O
استونی	B-LOC	B-LOC
در	I-LOC	I-LOC
حوضه	I-LOC	I-LOC
بالتیک	I-LOC	I-LOC
0	O	O
	O	O
<unk>	O	O
علم	O	O
راهبرد	O	O
موسسه	O	O
و	O	O
سازمان	O	O
	O	O
<unk>	O	O
نوعی	O	O
شمع	O	O
0	O	O
	O	O
<unk>	O	O
حرف	O	O
جمع	O	O
مونث	O	O
	O	O
<unk>	O	O
در	O	O
ایران	B-LOC	B-LOC
به	O	O
تولیدکننده	O	O
کتاب	O	O
اطلاق	O	O
می	O	O
[ZWNJ]	O	O
شود	O	O
	O	O
<unk>	O	O
از	O	O
شهرهای	O	O
باختری	O	O
افغانستان	B-LOC	B-LOC
	O	O

In [30]:
# Print accuracy, precision, recall, F1 and the per-entity-type report for ARMAN.
# NOTE(review): the overall scores below are very low even though the previewed
# sample above is tagged correctly — TODO confirm the evaluation pipeline.
ner_model.evaluate_prediction_results(labels, inference_output_arman)

Test Accuracy: 0.1834939937713184
Test Precision: 0.018716899374097255
Test Recall: 0.03560185450174575
Test F1-Score: 0.02453503737451433
Test classification Report:
              precision    recall  f1-score   support

         EVE  0.0818181818 0.0120967742 0.0210772834       744
         FAC  0.1600000000 0.0119402985 0.0222222222       670
         LOC  0.3242009132 0.0160018030 0.0304982818      4437
         ORG  0.0128781810 0.0739168877 0.0219347729      5655
         PER  0.3585657371 0.0184350676 0.0350672122      4882
         PRO  0.1805555556 0.0240073869 0.0423797881      1083

   micro avg  0.0187168994 0.0356018545 0.0245350374     17471
   macro avg  0.1863364281 0.0260663697 0.0288632601     17471
weighted avg  0.2075117559 0.0356018545 0.0290211118     17471



In [31]:
# Dump every (token, gold tag, predicted tag) triple as one TSV line, with a
# blank line after each sentence.
output_file_name = "ner_arman_{}_outputs.txt".format(model_name.replace('/','-'))
with open(output_file_name, "w", encoding='utf8') as output_file:
  for sentence_rows in inference_output_arman:
    output_file.writelines(
        '{}\t{}\t{}\n'.format(token, gold_tag, predicted_tag)
        for token, gold_tag, predicted_tag in sentence_rows
    )
    output_file.write('\n')

# Authenticate, then upload the file to Google Drive under the same name.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
upload = drive.CreateFile({'title': output_file_name})
upload.SetContentFile(output_file_name)
upload.Upload()

#### Arman+Peyma

In [None]:
# Download peyma.zip from Google Drive by file id.
# Not executed in this run (no execution count): peyma.zip is already listed in
# the working directory, presumably left from the earlier PEYMA section.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
download = drive.CreateFile({'id': '1WZxpFRtEs5HZWyWQ2Pyg9CCuIBs1Kmvx'})
download.GetContentFile('peyma.zip')
!ls

In [None]:
# Extract peyma.zip and list the result.
# Not executed in this run (no execution count): ./peyma already exists.
!unzip peyma.zip
!ls
!ls peyma

In [32]:
# Load PEYMA into its own variables so it can be concatenated with ARMAN below.
sentences_peyma, labels_peyma = ner_model.load_test_datasets(dataset_name="peyma", dataset_dir="./peyma/")
print(len(sentences_peyma), len(labels_peyma))
print(sentences_peyma[0])
print(labels_peyma[0])

1026 1026
['کنایه', 'سرلشگر', 'فیروزآبادی', 'به', 'پادشاه', 'عربستان', 'و', 'پسرش']
['O', 'O', 'B_ORG', 'O', 'O', 'B_LOC', 'O', 'O']


In [33]:
# Compare the raw PEYMA tag set against the model's label set (expected to differ before translation).
is_consistent = ner_model.check_input_label_consistency(labels_peyma)
print(is_consistent)

model labels: dict_keys(['B-DAT', 'B-EVE', 'B-FAC', 'B-LOC', 'B-MON', 'B-ORG', 'B-PCT', 'B-PER', 'B-PRO', 'B-TIM', 'I-DAT', 'I-EVE', 'I-FAC', 'I-LOC', 'I-MON', 'I-ORG', 'I-PCT', 'I-PER', 'I-PRO', 'I-TIM', 'O'])
dataset labels: {'I_PCT', 'B_PER', 'B_PCT', 'I_LOC', 'I_DAT', 'B_MON', 'B_TIM', 'I_TIM', 'I_PER', 'B_ORG', 'I_ORG', 'O', 'B_DAT', 'I_MON', 'B_LOC'}
intersection: {'O'}
model_labels-dataset_labels: ['B-PRO', 'B-LOC', 'I-DAT', 'B-PER', 'I-PER', 'I-PRO', 'I-LOC', 'I-PCT', 'I-TIM', 'I-FAC', 'B-EVE', 'I-MON', 'B-PCT', 'B-ORG', 'B-FAC', 'B-TIM', 'I-EVE', 'I-ORG', 'B-DAT', 'B-MON']
dataset_labels-model_labels: ['I_PCT', 'B_PER', 'B_DAT', 'B_PCT', 'I_LOC', 'I_MON', 'I_DAT', 'B_MON', 'B_TIM', 'I_TIM', 'I_PER', 'B_ORG', 'B_LOC', 'I_ORG']
False


In [34]:
# PEYMA writes tags with an underscore (B_PER); the model's labels use a
# hyphen (B-PER). Map each B_/I_ pair accordingly.
label_translate = {
    'B_PER': 'B-PER', 'I_PER': 'I-PER',
    'B_LOC': 'B-LOC', 'I_LOC': 'I-LOC',
    'B_ORG': 'B-ORG', 'I_ORG': 'I-ORG',
    'B_MON': 'B-MON', 'I_MON': 'I-MON',
    'B_DAT': 'B-DAT', 'I_DAT': 'I-DAT',
    'B_TIM': 'B-TIM', 'I_TIM': 'I-TIM',
    'B_PCT': 'B-PCT', 'I_PCT': 'I-PCT',
    'O': 'O',
}
# Apply the mapping, then re-run the consistency check on the translated tags.
labels_peyma = ner_model.resolve_input_label_consistency(labels_peyma, label_translate)
is_consistent = ner_model.check_input_label_consistency(labels_peyma)
print(is_consistent)

model labels: dict_keys(['B-DAT', 'B-EVE', 'B-FAC', 'B-LOC', 'B-MON', 'B-ORG', 'B-PCT', 'B-PER', 'B-PRO', 'B-TIM', 'I-DAT', 'I-EVE', 'I-FAC', 'I-LOC', 'I-MON', 'I-ORG', 'I-PCT', 'I-PER', 'I-PRO', 'I-TIM', 'O'])
dataset labels: {'I-DAT', 'B-LOC', 'I-MON', 'B-PER', 'I-PER', 'B-PCT', 'B-ORG', 'I-LOC', 'I-PCT', 'O', 'B-TIM', 'I-ORG', 'I-TIM', 'B-DAT', 'B-MON'}
intersection: {'I-DAT', 'B-LOC', 'I-PCT', 'O', 'B-TIM', 'I-MON', 'I-ORG', 'I-TIM', 'B-ORG', 'B-PER', 'I-PER', 'B-DAT', 'B-PCT', 'B-MON', 'I-LOC'}
model_labels-dataset_labels: ['B-PRO', 'B-FAC', 'I-EVE', 'B-EVE', 'I-FAC', 'I-PRO']
dataset_labels-model_labels: []
True


In [None]:
# Fetch the ARMAN corpus archive.
# Not executed in this run (no execution count): the archive was already downloaded in the ARMAN section.
!wget https://github.com/HaniehP/PersianNER/raw/master/ArmanPersoNERCorpus.zip
!ls

In [None]:
# Extract the ARMAN folds into ./arman.
# Not executed in this run (no execution count): ./arman already exists.
!unzip ArmanPersoNERCorpus.zip -d arman
!ls

In [35]:
# Load ARMAN into its own variables so it can be concatenated with PEYMA below.
sentences_arman, labels_arman = ner_model.load_test_datasets(dataset_name="arman", dataset_dir="./arman/")
print(len(sentences_arman), len(labels_arman))
print(sentences_arman[0])
print(labels_arman[0])

7681 7681
['افقی', ':', '0', 'ـ', 'از', 'عوامل', 'دوران', 'پهلوی', 'و', 'نخست\u200cوزیر', 'ایران', 'در', 'سالهای', 'ابتدائی', 'دهه', 'چهل', 'خورشیدی', 'كه', 'جلد', 'سوم', 'یادداشتهایش', 'هم', 'چندی', 'پیش', 'در', 'تهران', 'منتشر', 'شد', '0', 'ـ', 'پرستاری', 'از', 'ناخوش\u200cاحوال', 'ـ', 'پوشاک', 'و', 'جامه', 'ـ', 'فانتزی', 'و', 'شیک', '0', 'ـ', 'در', 'حال', 'وزیدن', 'ـ', 'اطلاعیه', 'ـ', 'پایتخت', 'جمهوری', 'استونی', 'در', 'حوضه', 'بالتیک', '0', 'ـ', 'علم', 'راهبرد', 'مؤسسه', 'و', 'سازمان', 'ـ', 'نوعی', 'شمع', '0', 'ـ', 'حرف', 'جمع', 'مؤنث', 'ـ', 'در', 'ایران', 'به', 'تولیدکننده', 'کتاب', 'اطلاق', 'می\u200cشود', 'ـ', 'از', 'شهرهای', 'باختری', 'افغانستان', 'كه', 'تا', 'عصر', 'ناصرالدین\u200cشاه', 'جزئی', 'از', 'خراسان', 'بود', 'ـ', 'ویتامین', 'انعقاد', '0', 'ـ', 'سبزی', 'غده\u200cای', 'ـ', 'دوستی', 'و', 'محبت', 'ـ', 'داستان', 'بلند', 'ـ', 'شهری', 'در', 'آلمان', '0', 'ـ', 'سلول', 'بدن', 'موجودات', 'ـ', 'از', 'انواع', 'کالباس', '0', 'ـ', 'حاشیه', 'و', 'هامش', 'ـ', 'پیدا', 'نشدنی', 'ـ', 'خ

In [36]:
# Compare the raw ARMAN tag set against the model's label set (expected to differ before translation).
is_consistent = ner_model.check_input_label_consistency(labels_arman)
print(is_consistent)

model labels: dict_keys(['B-DAT', 'B-EVE', 'B-FAC', 'B-LOC', 'B-MON', 'B-ORG', 'B-PCT', 'B-PER', 'B-PRO', 'B-TIM', 'I-DAT', 'I-EVE', 'I-FAC', 'I-LOC', 'I-MON', 'I-ORG', 'I-PCT', 'I-PER', 'I-PRO', 'I-TIM', 'O'])
dataset labels: {'I-pers', 'B-pro', 'B-event', 'B-fac', 'I-loc', 'B-loc', 'I-fac', 'B-org', 'O', 'B-pers', 'I-pro', 'I-event', 'I-org'}
intersection: {'O'}
model_labels-dataset_labels: ['B-PRO', 'B-LOC', 'I-DAT', 'B-PER', 'I-PER', 'I-PRO', 'I-LOC', 'I-PCT', 'I-TIM', 'I-FAC', 'B-EVE', 'I-MON', 'B-PCT', 'B-ORG', 'B-FAC', 'B-TIM', 'I-EVE', 'I-ORG', 'B-DAT', 'B-MON']
dataset_labels-model_labels: ['B-org', 'I-pers', 'B-pro', 'B-event', 'B-pers', 'B-fac', 'I-pro', 'I-event', 'I-loc', 'B-loc', 'I-fac', 'I-org']
False


In [37]:
# ARMAN uses lowercase, longer tag names (e.g. B-pers, I-event); map each
# B-/I- pair onto the model's upper-case three-letter tags.
label_translate = {
    'B-org': 'B-ORG', 'I-org': 'I-ORG',
    'B-loc': 'B-LOC', 'I-loc': 'I-LOC',
    'B-pers': 'B-PER', 'I-pers': 'I-PER',
    'B-event': 'B-EVE', 'I-event': 'I-EVE',
    'B-pro': 'B-PRO', 'I-pro': 'I-PRO',
    'B-fac': 'B-FAC', 'I-fac': 'I-FAC',
    'O': 'O',
}
# Apply the mapping, then re-run the consistency check on the translated tags.
labels_arman = ner_model.resolve_input_label_consistency(labels_arman, label_translate)
is_consistent = ner_model.check_input_label_consistency(labels_arman)
print(is_consistent)

model labels: dict_keys(['B-DAT', 'B-EVE', 'B-FAC', 'B-LOC', 'B-MON', 'B-ORG', 'B-PCT', 'B-PER', 'B-PRO', 'B-TIM', 'I-DAT', 'I-EVE', 'I-FAC', 'I-LOC', 'I-MON', 'I-ORG', 'I-PCT', 'I-PER', 'I-PRO', 'I-TIM', 'O'])
dataset labels: {'B-PRO', 'B-LOC', 'B-EVE', 'B-PER', 'I-PER', 'I-PRO', 'B-ORG', 'I-LOC', 'B-FAC', 'O', 'I-EVE', 'I-ORG', 'I-FAC'}
intersection: {'B-PRO', 'B-LOC', 'B-FAC', 'O', 'I-EVE', 'B-EVE', 'I-ORG', 'B-ORG', 'I-FAC', 'B-PER', 'I-PER', 'I-PRO', 'I-LOC'}
model_labels-dataset_labels: ['I-DAT', 'I-PCT', 'B-TIM', 'I-MON', 'I-TIM', 'B-DAT', 'B-PCT', 'B-MON']
dataset_labels-model_labels: []
True


In [38]:
# Concatenate ARMAN followed by PEYMA (order matters for reading the previews below).
# Note: this rebinds `sentences` / `labels`, which held ARMAN-only data in the previous section.
sentences = sentences_arman + sentences_peyma
labels = labels_arman + labels_peyma
print(len(sentences), len(labels))

8707 8707


In [39]:
# The merged tag set should now match the model's label set.
is_consistent = ner_model.check_input_label_consistency(labels)
print(is_consistent)

model labels: dict_keys(['B-DAT', 'B-EVE', 'B-FAC', 'B-LOC', 'B-MON', 'B-ORG', 'B-PCT', 'B-PER', 'B-PRO', 'B-TIM', 'I-DAT', 'I-EVE', 'I-FAC', 'I-LOC', 'I-MON', 'I-ORG', 'I-PCT', 'I-PER', 'I-PRO', 'I-TIM', 'O'])
dataset labels: {'B-PRO', 'B-LOC', 'I-DAT', 'B-EVE', 'I-MON', 'B-PER', 'I-PER', 'I-PRO', 'B-ORG', 'B-PCT', 'I-LOC', 'I-PCT', 'B-FAC', 'O', 'B-TIM', 'I-EVE', 'I-ORG', 'I-TIM', 'I-FAC', 'B-DAT', 'B-MON'}
intersection: {'B-PRO', 'B-LOC', 'I-DAT', 'B-PER', 'I-PER', 'I-PRO', 'I-LOC', 'I-PCT', 'O', 'I-TIM', 'I-FAC', 'B-EVE', 'I-MON', 'B-ORG', 'B-PCT', 'B-FAC', 'B-TIM', 'I-EVE', 'I-ORG', 'B-DAT', 'B-MON'}
model_labels-dataset_labels: []
dataset_labels-model_labels: []
True


In [40]:
# Snapshot GPU and CPU state right before the ARMAN+PEYMA inference cell.
!nvidia-smi
!lscpu

Mon Aug 16 13:23:57 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.42.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   70C    P0    77W / 149W |   9639MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [41]:
# Run the model over the merged ARMAN+PEYMA sentences in batches of 256.
inference_output = ner_model.ner_evaluation(sentences, labels, device, batch_size=256)

max_len: 343
#samples: 8707
#batch: 35
Start to evaluate test data ...
inference time for step 0: 0.030784146999963014
inference time for step 1: 0.015310986999907072
inference time for step 2: 0.016548491999969883
inference time for step 3: 0.015697609000085322
inference time for step 4: 0.014840446999983214
inference time for step 5: 0.01476733200001945
inference time for step 6: 0.014927071000101932
inference time for step 7: 0.014240316000041275
inference time for step 8: 0.015230921000011222
inference time for step 9: 0.015423871999928451
inference time for step 10: 0.015870541999902343
inference time for step 11: 0.01548267099997247
inference time for step 12: 0.014560278999852017
inference time for step 13: 0.014719055999876218
inference time for step 14: 0.015067217000023447
inference time for step 15: 0.015021690999901693
inference time for step 16: 0.016800516000103016
inference time for step 17: 0.018599319000031755
inference time for step 18: 0.014718133000087619
inference 

In [42]:
# Preview the first five merged sentences as token / gold tag / predicted tag rows.
for sentence_rows in inference_output[:5]:
  for row in sentence_rows:
    token, gold_tag, predicted_tag = row
    print('{}\t{}\t{}'.format(token, gold_tag, predicted_tag))
  print()  # blank line between sentences

افقی	O	O
	O	O
:	O	O
0	O	O
	O	O
<unk>	O	O
از	O	O
عوامل	O	O
دوران	O	O
پهلوی	O	O
و	O	O
نخست	O	O
[ZWNJ]	O	O
وزیر	O	O
ایران	B-LOC	B-LOC
در	O	O
سالهای	O	O
ابتدا	O	O
<unk>	O	O
ی	O	O
دهه	O	O
چهل	O	O
خورشیدی	O	O
	O	O
<unk>	O	O
ه	O	O
جلد	O	O
سوم	O	O
یادداشت	O	O
هایش	O	O
هم	O	O
چندی	O	O
پیش	O	O
در	O	O
تهران	B-LOC	B-LOC
منتشر	O	O
شد	O	O
0	O	O
	O	O
<unk>	O	O
پرستاری	O	O
از	O	O
ناخو	O	O
ش	O	O
[ZWNJ]	O	O
اح	O	O
وال	O	O
	O	O
<unk>	O	O
پوشاک	O	O
و	O	O
جامه	O	O
	O	O
<unk>	O	O
فانتزی	O	O
و	O	O
شیک	O	O
0	O	O
	O	O
<unk>	O	O
در	O	O
حال	O	O
و	O	O
زید	O	O
ن	O	O
	O	O
<unk>	O	O
اطلاعیه	O	O
	O	O
<unk>	O	O
پایتخت	O	O
جمهوری	O	O
استونی	B-LOC	B-LOC
در	I-LOC	I-LOC
حوضه	I-LOC	I-LOC
بالتیک	I-LOC	I-LOC
0	O	O
	O	O
<unk>	O	O
علم	O	O
راهبرد	O	O
موسسه	O	O
و	O	O
سازمان	O	O
	O	O
<unk>	O	O
نوعی	O	O
شمع	O	O
0	O	O
	O	O
<unk>	O	O
حرف	O	O
جمع	O	O
مونث	O	O
	O	O
<unk>	O	O
در	O	O
ایران	B-LOC	B-LOC
به	O	O
تولیدکننده	O	O
کتاب	O	O
اطلاق	O	O
می	O	O
[ZWNJ]	O	O
شود	O	O
	O	O
<unk>	O	O
از	O	O
شهرهای	O	O
باختری	O	O
افغانستان	B-LOC	B-LOC
	O	O

In [43]:
# Print accuracy, precision, recall, F1 and the per-entity-type report for ARMAN+PEYMA.
# NOTE(review): DAT/MON/PCT/TIM score exactly 0 here although they were non-zero
# in the PEYMA-only run; presumably predictions depend on batch composition or
# padding length — TODO confirm inside ner_model.
ner_model.evaluate_prediction_results(labels, inference_output)

Test Accuracy: 0.18735897047449707
Test Precision: 0.018871444944958286
Test Recall: 0.03556002009040683
Test F1-Score: 0.02465739112264266


  _warn_prf(average, modifier, msg_start, len(result))


Test classification Report:
              precision    recall  f1-score   support

         DAT  0.0000000000 0.0000000000 0.0000000000       222
         EVE  0.0796460177 0.0120967742 0.0210035006       744
         FAC  0.1600000000 0.0119402985 0.0222222222       670
         LOC  0.3200000000 0.0140570090 0.0269309893      5122
         MON  0.0000000000 0.0000000000 0.0000000000        28
         ORG  0.0136993763 0.0774084334 0.0232789541      6498
         PCT  0.0000000000 0.0000000000 0.0000000000        59
         PER  0.3474903475 0.0164774808 0.0314630309      5462
         PRO  0.1805555556 0.0240073869 0.0423797881      1083
         TIM  0.0000000000 0.0000000000 0.0000000000        22

   micro avg  0.0188714449 0.0355600201 0.0246573911     19910
   macro avg  0.1101391297 0.0155987383 0.0167278485     19910
weighted avg  0.2003038237 0.0355600201 0.0269950301     19910



In [44]:
# Dump every (token, gold tag, predicted tag) triple as one TSV line, with a
# blank line after each sentence.
output_file_name = "ner_arman-and-peyma_{}_outputs.txt".format(model_name.replace('/','-'))
with open(output_file_name, "w", encoding='utf8') as output_file:
  for sentence_rows in inference_output:
    output_file.writelines(
        '{}\t{}\t{}\n'.format(token, gold_tag, predicted_tag)
        for token, gold_tag, predicted_tag in sentence_rows
    )
    output_file.write('\n')

# Authenticate, then upload the file to Google Drive under the same name.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
upload = drive.CreateFile({'title': output_file_name})
upload.SetContentFile(output_file_name)
upload.Upload()

#### WikiAnn

https://elisa-ie.github.io/wikiann/

In [45]:
# Download the WikiANN Persian archive (fa.tar.gz) from Google Drive by file id,
# then list the working directory.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
download = drive.CreateFile({'id': '1QOG15HU8VfZvJUNKos024xI-OGm0zhEX'})
download.GetContentFile('fa.tar.gz')
!ls

adc.json
arman
ArmanPersoNERCorpus.zip
fa.tar.gz
ner_arman-and-peyma_HooshvareLab-albert-fa-zwnj-base-v2-ner_outputs.txt
ner_arman_HooshvareLab-albert-fa-zwnj-base-v2-ner_outputs.txt
ner_peyma_HooshvareLab-albert-fa-zwnj-base-v2-ner_outputs.txt
peyma
peyma.zip
sample_data


In [46]:
# Extract the WikiANN archive into the working directory (yields README.txt and wikiann-fa.bio).
!tar -zxvf fa.tar.gz
!ls

README.txt
wikiann-fa.bio
adc.json
arman
ArmanPersoNERCorpus.zip
fa.tar.gz
ner_arman-and-peyma_HooshvareLab-albert-fa-zwnj-base-v2-ner_outputs.txt
ner_arman_HooshvareLab-albert-fa-zwnj-base-v2-ner_outputs.txt
ner_peyma_HooshvareLab-albert-fa-zwnj-base-v2-ner_outputs.txt
peyma
peyma.zip
README.txt
sample_data
wikiann-fa.bio


In [47]:
# Load WikiANN; the loader returns the full corpus plus a test part.
sentences_all, labels_all, sentences_test, labels_test = ner_model.load_datasets(dataset_name="wikiann", dataset_dir="./")
# Sanity check: sentence and label counts must match for the full set and for
# the test part. (The first print previously showed len(sentences_all) twice,
# so a sentence/label size mismatch could never show up.)
print(len(sentences_all), len(labels_all))
print(len(sentences_test), len(labels_test))
print(sentences_test[0])
print(labels_test[0])

all data: #data: 272266, #labels: 272266


  return array(a, dtype, copy=False, order=order)


without stratify
test part:
 #data: 27227, #labels: 27227
272266 272266
27227 27227
['**', 'زاغی', 'نوک\u200cزرد', ',', "''Pica", 'nuttalli', "''"]
['O', 'B-LOC', 'I-LOC', 'O', 'O', 'O', 'O']


In [48]:
# WikiANN tags already use the model's naming, so no translation step follows this check.
is_consistent = ner_model.check_input_label_consistency(labels_test)
print(is_consistent)

model labels: dict_keys(['B-DAT', 'B-EVE', 'B-FAC', 'B-LOC', 'B-MON', 'B-ORG', 'B-PCT', 'B-PER', 'B-PRO', 'B-TIM', 'I-DAT', 'I-EVE', 'I-FAC', 'I-LOC', 'I-MON', 'I-ORG', 'I-PCT', 'I-PER', 'I-PRO', 'I-TIM', 'O'])
dataset labels: {'B-LOC', 'O', 'I-ORG', 'B-PER', 'I-PER', 'B-ORG', 'I-LOC'}
intersection: {'B-LOC', 'O', 'I-ORG', 'B-PER', 'I-PER', 'B-ORG', 'I-LOC'}
model_labels-dataset_labels: ['B-PRO', 'I-DAT', 'I-PCT', 'B-FAC', 'B-TIM', 'I-EVE', 'B-EVE', 'I-MON', 'I-TIM', 'B-PCT', 'I-FAC', 'B-DAT', 'I-PRO', 'B-MON']
dataset_labels-model_labels: []
True


In [49]:
# Snapshot GPU and CPU state right before the WikiANN inference cell.
!nvidia-smi
!lscpu

Mon Aug 16 13:34:03 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.42.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   70C    P0    78W / 149W |   8607MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [50]:
# Run the model over the WikiANN test part in batches of 512.
# NOTE(review): this uses ner_evaluation_2 rather than the ner_evaluation used
# for PEYMA/ARMAN; how the two differ is not visible here — TODO confirm the
# results are comparable.
inference_output_wikiann = ner_model.ner_evaluation_2(sentences_test, labels_test, device, batch_size=512)

len(input_text): 27227
len(input_labels): 27227
c: 10000
c: 20000
max_len: 144
#samples: 27227
#batch: 54
Start to evaluate test data ...
inference time for step 0: 0.03113492500006032
inference time for step 1: 0.014784042999963276
inference time for step 2: 0.016757637999944563
inference time for step 3: 0.01454860099988764
inference time for step 4: 0.014294115999973656
inference time for step 5: 0.015484579000030863
inference time for step 6: 0.01401005200000327
inference time for step 7: 0.014825750999989395
inference time for step 8: 0.015161127999817836
inference time for step 9: 0.014270001000113552
inference time for step 10: 0.01436139399993408
inference time for step 11: 0.014094546000023911
inference time for step 12: 0.014244909000126427
inference time for step 13: 0.014801188999854276
inference time for step 14: 0.013884763000078237
inference time for step 15: 0.014361991999976453
inference time for step 16: 0.014980329999843889
inference time for step 17: 0.0150310799999

In [51]:
# Preview the first five WikiANN sentences as token / gold tag / predicted tag rows.
for sentence_rows in inference_output_wikiann[:5]:
  for row in sentence_rows:
    token, gold_tag, predicted_tag = row
    print('{}\t{}\t{}'.format(token, gold_tag, predicted_tag))
  print()  # blank line between sentences

*	O	I-ORG
*	O	I-ORG
زاغ	B-LOC	I-ORG
ی	B-LOC	I-ORG
نوک	I-LOC	I-ORG
[ZWNJ]	I-LOC	I-ORG
زرد	I-LOC	I-ORG
	O	I-ORG
,	O	I-ORG
	O	I-ORG
"	O	I-ORG
P	O	I-ORG
ica	O	I-ORG
n	O	I-ORG
ut	O	I-ORG
tal	O	I-ORG
li	O	I-ORG
	O	I-ORG
"	O	I-ORG

تغییر	O	I-ORG
مسیر	O	I-ORG
مک	B-LOC	I-ORG
[ZWNJ]	B-LOC	I-ORG
ویل	B-LOC	I-ORG
،	B-LOC	I-ORG
داکوتا	I-LOC	I-ORG
ی	I-LOC	I-ORG
شمالی	I-LOC	I-ORG

و	B-LOC	I-ORG
ست	B-LOC	I-ORG
یونی	I-LOC	O
ور	I-LOC	I-ORG
سیتی	I-LOC	I-ORG
پلیس	I-LOC	O
،	I-LOC	I-ORG
تگزاس	I-LOC	I-ORG

تغییر	O	I-ORG
مسیر	O	I-ORG
دل	B-PER	I-ORG
تف	B-PER	I-ORG
فون	I-PER	I-ORG
	I-PER	I-ORG
لیل	I-PER	I-ORG
نس	I-PER	I-ORG
رون	I-PER	I-ORG

تغییر	O	I-ORG
مسیر	O	I-ORG
نیروگاه	B-ORG	I-ORG
[ZWNJ]	B-ORG	I-ORG
های	B-ORG	I-ORG
زنجیره	I-ORG	I-ORG
[ZWNJ]	I-ORG	I-ORG
ای	I-ORG	I-ORG
یاسوج	I-ORG	I-ORG



In [52]:
# Print accuracy, precision, recall, F1 and the per-entity-type report for WikiANN.
# NOTE(review): scores below are very low and the previewed rows above are almost
# all predicted as I-ORG — TODO confirm the model/label mapping before trusting them.
ner_model.evaluate_prediction_results(labels_test, inference_output_wikiann)

Test Accuracy: 0.2269050249505068
Test Precision: 0.10408163265306122
Test Recall: 0.06974286720676294
Test F1-Score: 0.08352046449883999
Test classification Report:
              precision    recall  f1-score   support

         LOC  0.2052238806 0.0020503262 0.0040600893     26825
         ORG  0.1032792005 0.2421291551 0.1447961820     13658
         PER  0.0769230769 0.0005141388 0.0010214505      7780

   micro avg  0.1040816327 0.0697428672 0.0835204645     48263
   macro avg  0.1284753860 0.0815645400 0.0499592406     48263
weighted avg  0.1556923410 0.0697428672 0.0433973237     48263



In [53]:
# Dump every (token, gold tag, predicted tag) triple as one TSV line, with a
# blank line after each sentence.
output_file_name = "ner_wikiann_{}_outputs.txt".format(model_name.replace('/','-'))
with open(output_file_name, "w", encoding='utf8') as output_file:
  for sentence_rows in inference_output_wikiann:
    output_file.writelines(
        '{}\t{}\t{}\n'.format(token, gold_tag, predicted_tag)
        for token, gold_tag, predicted_tag in sentence_rows
    )
    output_file.write('\n')

# Authenticate, then upload the file to Google Drive under the same name.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
upload = drive.CreateFile({'title': output_file_name})
upload.SetContentFile(output_file_name)
upload.Upload()

#### Hooshvare - Arman+Peyma+WikiAnn

https://github.com/hooshvare/parsner

In [54]:
# Download the Hooshvare combined NER archive (ner-v1.zip) from Google Drive by
# file id, then list the working directory.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
download = drive.CreateFile({'id': '1fC2WGlpqumUTaT9Dr_U1jO2no3YMKFJ4'})
download.GetContentFile('ner-v1.zip')
!ls

adc.json
arman
ArmanPersoNERCorpus.zip
fa.tar.gz
ner_arman-and-peyma_HooshvareLab-albert-fa-zwnj-base-v2-ner_outputs.txt
ner_arman_HooshvareLab-albert-fa-zwnj-base-v2-ner_outputs.txt
ner_peyma_HooshvareLab-albert-fa-zwnj-base-v2-ner_outputs.txt
ner-v1.zip
ner_wikiann_HooshvareLab-albert-fa-zwnj-base-v2-ner_outputs.txt
peyma
peyma.zip
README.txt
sample_data
wikiann-fa.bio


In [55]:
# Extract ner-v1.zip (creates ./ner with train/valid/test/ner CSV files) and list the result.
!unzip ner-v1.zip
!ls
!ls ner

Archive:  ner-v1.zip
   creating: ner/
  inflating: ner/valid.csv           
  inflating: ner/ner.csv             
  inflating: ner/test.csv            
  inflating: ner/train.csv           
adc.json
arman
ArmanPersoNERCorpus.zip
fa.tar.gz
ner
ner_arman-and-peyma_HooshvareLab-albert-fa-zwnj-base-v2-ner_outputs.txt
ner_arman_HooshvareLab-albert-fa-zwnj-base-v2-ner_outputs.txt
ner_peyma_HooshvareLab-albert-fa-zwnj-base-v2-ner_outputs.txt
ner-v1.zip
ner_wikiann_HooshvareLab-albert-fa-zwnj-base-v2-ner_outputs.txt
peyma
peyma.zip
README.txt
sample_data
wikiann-fa.bio
ner.csv  test.csv  train.csv  valid.csv


In [56]:
# Load the test part of the combined Hooshvare dataset from ./ner and peek at the first sample.
# The dataset_name string is a key interpreted by ner_model.load_test_datasets;
# keep it exactly as written.
sentences_paw, labels_paw = ner_model.load_test_datasets(dataset_name="hooshvare-peyman+arman+wikiann", dataset_dir="./ner/")
print(len(sentences_paw), len(labels_paw))
print(sentences_paw[0])
print(labels_paw[0])

test part:
 #sentences: 6049, #sentences_tags: 6049
6049 6049
['همچنین', 'عملیات', 'لرزه\u200cنگاری', 'دوبعدی', 'نیز', 'با', 'فعالیت', 'مستمر', 'چهار', 'گروه', 'کاری', 'در', 'مناطقی', 'که', 'از', 'نظر', 'اکتشافی', 'مورد', 'نظر', 'بود', '،', 'به', 'پایان', 'رسید', 'که', 'نتایج', 'آن', 'در', 'حال', 'بررسی', 'است', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [57]:
# The combined dataset already uses the model's tag names, so the check should pass without translation.
is_consistent = ner_model.check_input_label_consistency(labels_paw)
print(is_consistent)

model labels: dict_keys(['B-DAT', 'B-EVE', 'B-FAC', 'B-LOC', 'B-MON', 'B-ORG', 'B-PCT', 'B-PER', 'B-PRO', 'B-TIM', 'I-DAT', 'I-EVE', 'I-FAC', 'I-LOC', 'I-MON', 'I-ORG', 'I-PCT', 'I-PER', 'I-PRO', 'I-TIM', 'O'])
dataset labels: {'B-PRO', 'B-LOC', 'I-DAT', 'B-EVE', 'I-MON', 'B-PER', 'I-PER', 'B-ORG', 'I-PRO', 'B-PCT', 'I-LOC', 'I-PCT', 'B-FAC', 'O', 'B-TIM', 'I-EVE', 'I-ORG', 'I-TIM', 'I-FAC', 'B-DAT', 'B-MON'}
intersection: {'B-PRO', 'B-LOC', 'I-DAT', 'B-PER', 'I-PER', 'I-PRO', 'I-LOC', 'I-PCT', 'O', 'I-TIM', 'I-FAC', 'B-EVE', 'I-MON', 'B-PCT', 'B-ORG', 'B-FAC', 'B-TIM', 'I-EVE', 'I-ORG', 'B-DAT', 'B-MON'}
model_labels-dataset_labels: []
dataset_labels-model_labels: []
True


In [58]:
# Snapshot GPU and CPU state right before the combined-dataset inference cell.
!nvidia-smi
!lscpu

Mon Aug 16 13:46:00 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.42.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   70C    P0    78W / 149W |   7681MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [59]:
# Run the model over the combined Hooshvare test part in batches of 256.
# Note: this rebinds `inference_output`, which previously held the ARMAN+PEYMA
# results; re-running earlier cells after this one would see the new value.
inference_output = ner_model.ner_evaluation_2(sentences_paw, labels_paw, device, batch_size=256)

len(input_text): 6049
len(input_labels): 6049
max_len: 512
#samples: 6049
#batch: 24
Start to evaluate test data ...
inference time for step 0: 0.036710098999719776
inference time for step 1: 0.015747501000078046
inference time for step 2: 0.015774784999848634
inference time for step 3: 0.015400015000068379
inference time for step 4: 0.015320683000027202
inference time for step 5: 0.014486323000255652
inference time for step 6: 0.01574393000009877
inference time for step 7: 0.01594740400014416
inference time for step 8: 0.014879582000048686
inference time for step 9: 0.014527852000355779
inference time for step 10: 0.015269529999841325
inference time for step 11: 0.01529612400008773
inference time for step 12: 0.015790452999681293
inference time for step 13: 0.015261627999734628
inference time for step 14: 0.014638084000125673
inference time for step 15: 0.014629793999574758
inference time for step 16: 0.016195525000057387
inference time for step 17: 0.015210816000035265
inference time

In [60]:
# Eyeball the first five evaluated sentences: one row per token, formatted as
# token<TAB>true label<TAB>predicted label, with a blank line closing each sentence.
preview_samples = inference_output[:5]
row_template = '{}\t{}\t{}'

for sample_output in preview_samples:
  for token, true_label, predicted_label in sample_output:
    row = row_template.format(token, true_label, predicted_label)
    print(row)
  # Sentence separator.
  print()

همچنین	O	I-ORG
عملیات	O	I-ORG
لرزه	O	I-ORG
[ZWNJ]	O	I-ORG
نگاری	O	I-ORG
دوبعدی	O	I-ORG
نیز	O	I-ORG
با	O	I-ORG
فعالیت	O	I-ORG
مستمر	O	I-ORG
چهار	O	I-ORG
گروه	O	I-ORG
کاری	O	I-ORG
در	O	I-ORG
مناطقی	O	I-ORG
که	O	I-ORG
از	O	I-ORG
نظر	O	I-ORG
اکتشافی	O	I-ORG
مورد	O	I-ORG
نظر	O	I-ORG
بود	O	I-ORG
	O	I-ORG
،	O	I-ORG
به	O	I-ORG
پایان	O	I-ORG
رسید	O	I-ORG
که	O	I-ORG
نتایج	O	I-ORG
ان	O	O
در	O	I-ORG
حال	O	I-ORG
بررسی	O	O
است	O	O
	O	I-ORG
.	O	I-ORG

محدث	B-PER	I-ORG
در	O	I-ORG
مورد	O	I-ORG
مشارکت	O	I-ORG
شرکتهای	O	I-ORG
خارجی	O	I-ORG
در	O	I-ORG
فعالیتهای	O	I-ORG
اکتشافی	O	I-ORG
کشور	O	I-ORG
گفت	O	I-ORG
	O	I-ORG
:	O	I-ORG
تاکنون	O	I-ORG
چند	O	I-ORG
منطقه	O	I-ORG
اکتشافی	O	O
را	O	I-ORG
برای	O	O
مشارکت	O	I-ORG
و	O	I-ORG
سرمایه	O	I-ORG
[ZWNJ]	O	I-ORG
گذاری	O	I-ORG
شرکتهای	O	I-ORG
خارجی	O	I-ORG
اعلام	O	I-ORG
کرده	O	I-ORG
[ZWNJ]	O	I-ORG
ایم	O	I-ORG
و	O	I-ORG
در	O	I-ORG
حال	O	I-ORG
مذاکره	O	I-ORG
با	O	I-ORG
طرف	O	I-ORG
های	O	I-ORG
خارجی	O	I-ORG
هستیم	O	O
و	O	I-ORG
انتظار	O	I-ORG
می	O	I-ORG
[ZWNJ]	O	I-ORG


In [61]:
# Score the predictions against the gold labels. Per the recorded output, the helper
# prints accuracy, precision, recall, F1 and a per-entity classification report.
# NOTE(review): the recorded run shows F1 ≈ 0.015 with almost every token tagged
# I-ORG, even though the label-set consistency check passed. That pattern usually
# points to a label-id mapping or checkpoint mismatch rather than real model
# quality -- verify before quoting these numbers.
ner_model.evaluate_prediction_results(labels_paw, inference_output)

Test Accuracy: 0.08108407167076342
Test Precision: 0.013866850638151086
Test Recall: 0.015889328063241108
Test F1-Score: 0.014809357156013997


  _warn_prf(average, modifier, msg_start, len(result))


Test classification Report:
              precision    recall  f1-score   support

         DAT  0.3750000000 0.0132158590 0.0255319149       454
         EVE  0.0000000000 0.0000000000 0.0000000000       331
         FAC  0.1818181818 0.0064102564 0.0123839009       312
         LOC  0.3108108108 0.0132183908 0.0253583241      3480
         MON  0.0000000000 0.0000000000 0.0000000000       128
         ORG  0.0084620267 0.0317544324 0.0133630290      3779
         PCT  0.0000000000 0.0000000000 0.0000000000       108
         PER  0.2574257426 0.0072972214 0.0141921397      3563
         PRO  0.1666666667 0.0022727273 0.0044843049       440
         TIM  0.0000000000 0.0000000000 0.0000000000        55

   micro avg  0.0138668506 0.0158893281 0.0148093572     12650
   macro avg  0.1300183429 0.0074168887 0.0095313614     12650
weighted avg  0.1842781144 0.0158893281 0.0163431470     12650



In [62]:
# --- 1) Persist the evaluation triples to a local TSV-like text file -----------------
# The model name may contain '/' (e.g. a hub id), which is not valid in a file name,
# so it is replaced by '-'.
safe_model_name = model_name.replace('/','-')
output_file_name = "ner_arman-and-peyma-and-wikiann_{}_outputs.txt".format(safe_model_name)

with open(output_file_name, "w", encoding='utf8') as output_file:
  for sample_output in inference_output:
    for token, true_label, predicted_label in sample_output:
      row = '{}\t{}\t{}\n'.format(token, true_label, predicted_label)
      output_file.write(row)
    # Blank line marks the end of a sentence.
    output_file.write('\n')

# --- 2) Upload the finished file to Google Drive --------------------------------------
# The file is closed at this point, so its full content is on disk before the upload.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

upload = drive.CreateFile({'title': output_file_name})
upload.SetContentFile(output_file_name)
upload.Upload()

### New Test

In [None]:
from transformers import pipeline

# Smoke-test the fine-tuned model on a single sentence through the Hugging Face
# "ner" pipeline and print the raw result.
# NOTE(review): pipeline() runs on CPU unless `device` is passed. If ner_model.model
# is still on the GPU used for the evaluation, this call may fail with a device
# mismatch -- confirm, and pass device=0 if so.
example = "در سال ۲۰۱۳ درگذشت و آندرتیکر و کین برای او مراسم یادبود گرفتند."
nlp = pipeline(
    "ner",
    model=ner_model.model,
    tokenizer=ner_model.tokenizer,
)

ner_results = nlp(example)
print(ner_results)

In [None]:
from transformers import pipeline

# Second smoke test on a different sentence; each entry returned by the pipeline is
# printed on its own line. The import and pipeline construction are repeated here so
# this cell can be run on its own.
# NOTE(review): pipeline() runs on CPU unless `device` is passed. If ner_model.model
# is still on the GPU used for the evaluation, this call may fail with a device
# mismatch -- confirm, and pass device=0 if so.
example = "کنایه سرلشگر فیروزآبادی به پادشاه عربستان و پسرش"
nlp = pipeline(
    "ner",
    model=ner_model.model,
    tokenizer=ner_model.tokenizer,
)

ner_results = nlp(example)
for ent in ner_results:
  print(ent)