# DistilbertNER
This model is fine-tuned for the Named Entity Recognition (NER) task on a mixed NER dataset collected from ARMAN, PEYMA, and WikiANN, and covers ten types of entities:

* Date (DAT)
* Event (EVE)
* Facility (FAC)
* Location (LOC)
* Money (MON)
* Organization (ORG)
* Percent (PCT)
* Person (PER)
* Product (PRO)
* Time (TIM)


In [1]:
# Record the GPU and CPU of the runtime so the inference timings printed later can be put in context.
!nvidia-smi
!lscpu

Mon Aug 16 15:01:35 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.42.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   42C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
!pip install hazm==0.7.0
!pip install seqeval==1.2.2
!pip install sentencepiece==0.1.96
!pip install transformers==4.7.0

Collecting hazm==0.7.0
  Downloading hazm-0.7.0-py3-none-any.whl (316 kB)
[K     |████████████████████████████████| 316 kB 7.0 MB/s 
[?25hCollecting libwapiti>=0.2.1
  Downloading libwapiti-0.2.1.tar.gz (233 kB)
[K     |████████████████████████████████| 233 kB 60.2 MB/s 
[?25hCollecting nltk==3.3
  Downloading nltk-3.3.0.zip (1.4 MB)
[K     |████████████████████████████████| 1.4 MB 60.4 MB/s 
Building wheels for collected packages: nltk, libwapiti
  Building wheel for nltk (setup.py) ... [?25l[?25hdone
  Created wheel for nltk: filename=nltk-3.3-py3-none-any.whl size=1394486 sha256=c240346373173665a29e83209275daa8223a8221a83e857c5bc9f7a06c74c19a
  Stored in directory: /root/.cache/pip/wheels/9b/fd/0c/d92302c876e5de87ebd7fc0979d82edb93e2d8d768bf71fac4
  Building wheel for libwapiti (setup.py) ... [?25l[?25hdone
  Created wheel for libwapiti: filename=libwapiti-0.2.1-cp37-cp37m-linux_x86_64.whl size=154599 sha256=007b3574ef3026f88d9e72a63f10353f8344b45cd9465481df618770ef824873
 

In [3]:
# Install PyDrive and build an authenticated Google Drive client for this Colab session.
!pip install PyDrive
import os
import IPython.display as ipd
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate the Colab user, then hand the application-default credentials to PyDrive;
# `drive` is used further down to download the dataset archive.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)



In [4]:
import os
import gc
import ast
import time
import hazm
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

import transformers
from transformers import AutoTokenizer, AutoConfig
from transformers import AutoModelForTokenClassification

from IPython.display import display, HTML, clear_output
from ipywidgets import widgets, Layout

from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from seqeval.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Report the versions of the core libraries, framed by blank lines.
print()
for _lib_name, _lib in (('numpy', np), ('pandas', pd), ('transformers', transformers), ('torch', torch)):
    print(_lib_name, _lib.__version__)
print()

# Choose the compute device once; later cells pass `device` to the NER helpers.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))


numpy 1.19.5
pandas 1.1.5
transformers 4.7.0
torch 1.9.0+cu102

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [5]:
class NER:
    def __init__(self, model_name):
        """Load everything needed to run the token-classification model `model_name`.

        Creates a hazm normalizer and loads the configuration, tokenizer and model
        for `model_name` through the transformers `Auto*` classes. `id2label` is
        taken from the loaded configuration.
        """
        self.model_name = model_name
        self.normalizer = hazm.Normalizer()
        # Config, tokenizer and weights are all resolved from the same identifier.
        self.config = AutoConfig.from_pretrained(model_name)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForTokenClassification.from_pretrained(model_name)
        # Class index -> tag string, as stored in the configuration.
        self.id2label = self.config.id2label

    @staticmethod
    def load_ner_data(file_path, word_index, tag_index, delimiter, join=False):
        dataset, labels = [], []
        with open(file_path, encoding="utf8") as infile:
            sample_text, sample_label = [], []
            for line in infile:
                parts = line.strip().split(delimiter)
                if len(parts) > 1:
                    word, tag = parts[word_index], parts[tag_index]
                    if not word:
                        continue
                    sample_text.append(word)
                    sample_label.append(tag)
                else:
                    # end of sample
                    if sample_text and sample_label:
                        if join:
                            dataset.append(' '.join(sample_text))
                            labels.append(' '.join(sample_label))
                        else:
                            dataset.append(sample_text)
                            labels.append(sample_label)
                    sample_text, sample_label = [], []
        if sample_text and sample_label:
            if join:
                dataset.append(' '.join(sample_text))
                labels.append(' '.join(sample_label))
            else:
                dataset.append(sample_text)
                labels.append(sample_label)
        return dataset, labels

    def load_test_datasets(self, dataset_name, dataset_dir, **kwargs):
        if dataset_name.lower() == "peyma":
            ner_file_path = dataset_dir + 'test.txt'
            if not os.path.exists(ner_file_path):
                print(ner_file_path)
                exit(1)
            return self.load_ner_data(ner_file_path, word_index=0, tag_index=1, delimiter='|',
                                      join=kwargs.get('join', False))
        elif dataset_name.lower() == "arman":
            dataset, labels = [], []
            for i in range(1, 4):
                ner_file_path = dataset_dir + f'test_fold{i}.txt'
                if not os.path.exists(ner_file_path):
                    print(ner_file_path)
                dataset_part, labels_part = self.load_ner_data(ner_file_path, word_index=0, tag_index=1, delimiter=' ',
                                                               join=kwargs.get('join', False))
                dataset += dataset_part
                labels += labels_part
            return dataset, labels
        elif dataset_name.lower() == "hooshvare-peyman+arman+wikiann":
            ner_file_path = dataset_dir + 'test.csv'
            if not os.path.exists(ner_file_path):
                print(ner_file_path)
                exit(1)
            data = pd.read_csv(ner_file_path, delimiter="\t")
            sentences, sentences_tags = data['tokens'].values.tolist(), data['ner_tags'].values.tolist()
            sentences = [ast.literal_eval(ss) for ss in sentences]
            sentences_tags = [ast.literal_eval(ss) for ss in sentences_tags]
            print(f'test part:\n #sentences: {len(sentences)}, #sentences_tags: {len(sentences_tags)}')
            return sentences, sentences_tags

    def load_datasets(self, dataset_name, dataset_dir, **kwargs):
        if dataset_name.lower() == "farsiyar":
            dataset, labels = [], []
            for i in range(1, 6):
                ner_file_path = dataset_dir + 'Persian-NER-part{i}.txt'
                if not os.path.exists(ner_file_path):
                    print(ner_file_path)
                dataset_part, labels_part = self.load_ner_data(ner_file_path, word_index=0, tag_index=1, delimiter='\t',
                                                               join=kwargs.get('join', False))
                dataset += dataset_part
                labels += labels_part
            return dataset, labels
        elif dataset_name.lower() == "wikiann":
            ner_file_path = dataset_dir + 'wikiann-fa.bio'
            if not os.path.exists(ner_file_path):
                print(ner_file_path)
                exit(1)
            dataset_all, labels_all = self.load_ner_data(ner_file_path, word_index=0, tag_index=-1, delimiter=' ',
                                                         join=kwargs.get('join', False))
            print(f'all data: #data: {len(dataset_all)}, #labels: {len(labels_all)}')

            try:
                _, data_test, _, label_test = train_test_split(dataset_all, labels_all, test_size=0.1, random_state=1,
                                                               stratify=labels_all)
                print("with stratify")
            except:
                _, data_test, _, label_test = train_test_split(dataset_all, labels_all, test_size=0.1, random_state=1)
                print("without stratify")
            print(f'test part:\n #data: {len(data_test)}, #labels: {len(label_test)}')
            return dataset_all, labels_all, data_test, label_test

    def ner_inference(self, input_text, device, max_length):
        if not self.model or not self.tokenizer or not self.id2label:
            print('Something wrong has been happened!')
            return

        pt_batch = self.tokenizer(
            [self.normalizer.normalize(sequence) for sequence in input_text],
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors="pt"
        )

        gc.collect()
        torch.cuda.empty_cache()
        # Tell pytorch to run this model on the GPU.
        if device.type != 'cpu':
            self.model.cuda()

        pt_batch = pt_batch.to(device)
        pt_outputs = self.model(**pt_batch)
        pt_predictions = torch.argmax(pt_outputs.logits, dim=-1)
        pt_predictions = pt_predictions.cpu().detach().numpy().tolist()

        output_predictions = []
        for i, sequence in enumerate(input_text):
            tokens = self.tokenizer.tokenize(self.tokenizer.decode(self.tokenizer.encode(sequence)))
            predictions = [(token, self.id2label[prediction]) for token, prediction in
                           zip(tokens, pt_predictions[i])]
            output_predictions.append(predictions)
        return output_predictions

    def ner_evaluation(self, input_text, input_labels, device, batch_size=4):
        if not self.model or not self.tokenizer or not self.id2label:
            print('Something wrong has been happened!')
            return

        max_len = 0
        tokenized_texts, new_labels = [], []
        for sentence, sentence_label in zip(input_text, input_labels):
            if type(sentence) == str:
                sentence = sentence.strip().split()
            if len(sentence) != len(sentence_label):
                print('Something wrong has been happened! Length of a sentence and its label is not equal!')
                return
            tokenized_sentence, new_sentence_label = [], []
            for word, label in zip(sentence, sentence_label):
                # Tokenize the word and count # of subwords the word is broken into
                tokenized_word = self.tokenizer.tokenize(word)
                n_subwords = len(tokenized_word)

                # Add the tokenized word to the final tokenized word list
                tokenized_sentence.extend(tokenized_word)
                # Add the same label to the new list of labels `n_subwords` times
                new_sentence_label.extend([label] * n_subwords)

            max_len = max(max_len, len(tokenized_sentence))
            tokenized_texts.append(tokenized_sentence)
            new_labels.append(new_sentence_label)

        max_len = min(max_len, self.config.max_position_embeddings)
        print("max_len:", max_len)
        input_ids = pad_sequences([self.tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                                  maxlen=max_len, dtype="long", value=self.config.pad_token_id,
                                  truncating="post", padding="post")
        del tokenized_texts
        input_labels = pad_sequences([[self.config.label2id.get(l) for l in lab] for lab in new_labels],
                                     maxlen=max_len, value=self.config.label2id.get('O'), padding="post",
                                     dtype="long", truncating="post")
        del new_labels

        train_data = TensorDataset(torch.tensor(input_ids), torch.tensor(input_labels))
        data_loader = DataLoader(train_data, batch_size=batch_size)
        # data_loader = DataLoader(train_data, sampler=RandomSampler(train_data), batch_size=batch_size)
        print("#samples:", len(input_ids))
        print("#batch:", len(data_loader))

        gc.collect()
        torch.cuda.empty_cache()
        # Tell pytorch to run this model on the GPU.
        if device.type != 'cpu':
            self.model.cuda()

        total_loss, total_time = 0, 0
        output_predictions = []
        print("Start to evaluate test data ...")
        for step, batch in enumerate(data_loader):
            b_input_ids, b_labels = batch

            # move tensors to GPU if CUDA is available
            b_input_ids = b_input_ids.to(device)
            b_labels = b_labels.to(device)

            # This will return the loss (rather than the model output) because we have provided the `labels`.
            with torch.no_grad():
                start = time.monotonic()
                outputs = self.model(b_input_ids, labels=b_labels)
                end = time.monotonic()
                total_time += end - start
                print(f'inference time for step {step}: {end - start}')
            # get the loss
            total_loss += outputs.loss.item()

            b_predictions = torch.argmax(outputs.logits, dim=2)
            b_predictions = b_predictions.cpu().detach().numpy().tolist()
            b_labels = b_labels.cpu().detach().numpy().tolist()

            for i, sample in enumerate(b_input_ids):
                sample_input = list(sample)
                # remove pad tokens
                while sample_input[-1] == self.config.pad_token_id:
                    sample_input.pop()
                # tokens = self.tokenizer.tokenize(self.tokenizer.decode(sample_input))
                tokens = [self.tokenizer.decode([t]) for t in sample_input]
                sample_true_labels = [self.id2label[e] for e in b_labels[i][:len(sample_input)]]
                sample_predictions = [self.id2label[e] for e in b_predictions[i][:len(sample_input)]]
                output_predictions.append(
                    [(t, sample_true_labels[j], sample_predictions[j]) for j, t in enumerate(tokens)])

        # Calculate the average loss over the training data.
        avg_train_loss = total_loss / len(data_loader)
        print("average loss:", avg_train_loss)
        print("total inference time:", total_time)
        print("total inference time / #samples:", total_time / len(input_ids))

        return output_predictions

    def ner_evaluation_2(self, input_text, input_labels, device, batch_size=4):
        if not self.model or not self.tokenizer or not self.id2label:
            print('Something wrong has been happened!')
            return

        print("len(input_text):", len(input_text))
        print("len(input_labels):", len(input_labels))
        c = 0
        max_len = 0
        tokenized_texts, new_labels = [], []
        for sentence, sentence_label in zip(input_text, input_labels):
            if type(sentence) == str:
                sentence = sentence.strip().split()
            if len(sentence) != len(sentence_label):
                print('Something wrong has been happened! Length of a sentence and its label is not equal!')
                return
            tokenized_words = self.tokenizer(sentence, padding=False, add_special_tokens=False).input_ids
            tokenized_sentence_ids, new_sentence_label = [], []
            for i, tokenized_word in enumerate(tokenized_words):
                # Add the tokenized word to the final tokenized word list
                tokenized_sentence_ids += tokenized_word
                # Add the same label to the new list of labels `number of subwords` times
                new_sentence_label.extend([self.config.label2id.get(sentence_label[i])] * len(tokenized_word))

            max_len = max(max_len, len(tokenized_sentence_ids))
            tokenized_texts.append(tokenized_sentence_ids)
            new_labels.append(new_sentence_label)
            c += 1
            if c % 10000 == 0:
                print("c:", c)
        max_len = min(max_len, self.config.max_position_embeddings)
        print("max_len:", max_len)
        input_ids = pad_sequences(tokenized_texts, maxlen=max_len, dtype="long", value=self.config.pad_token_id,
                                  truncating="post", padding="post")
        del tokenized_texts
        input_labels = pad_sequences(new_labels, maxlen=max_len, value=self.config.label2id.get('O'), padding="post",
                                     dtype="long", truncating="post")
        del new_labels

        train_data = TensorDataset(torch.tensor(input_ids), torch.tensor(input_labels))
        data_loader = DataLoader(train_data, batch_size=batch_size)
        # data_loader = DataLoader(train_data, sampler=RandomSampler(train_data), batch_size=batch_size)
        print("#samples:", len(input_ids))
        print("#batch:", len(data_loader))

        gc.collect()
        torch.cuda.empty_cache()
        # Tell pytorch to run this model on the GPU.
        if device.type != 'cpu':
            self.model.cuda()

        total_time = 0
        output_predictions = []
        print("Start to evaluate test data ...")
        for step, batch in enumerate(data_loader):
            b_input_ids, b_labels = batch

            # move tensors to GPU if CUDA is available
            b_input_ids = b_input_ids.to(device)
            b_labels = b_labels.to(device)

            # This will return the loss (rather than the model output) because we have provided the `labels`.
            with torch.no_grad():
                start = time.monotonic()
                outputs = self.model(b_input_ids, labels=b_labels)
                end = time.monotonic()
                total_time += end - start
                print(f'inference time for step {step}: {end - start}')

            b_predictions = torch.argmax(outputs.logits, dim=2)
            b_predictions = b_predictions.cpu().detach().numpy().tolist()
            b_labels = b_labels.cpu().detach().numpy().tolist()

            for i, sample in enumerate(b_input_ids):
                sample_input = list(sample)
                # remove pad tokens
                while sample_input[-1] == self.config.pad_token_id:
                    sample_input.pop()
                # tokens = self.tokenizer.tokenize(self.tokenizer.decode(sample_input))
                tokens = [self.tokenizer.decode([t]) for t in sample_input]
                sample_true_labels = [self.id2label[e] for e in b_labels[i][:len(sample_input)]]
                sample_predictions = [self.id2label[e] for e in b_predictions[i][:len(sample_input)]]
                output_predictions.append(
                    [(t, sample_true_labels[j], sample_predictions[j]) for j, t in enumerate(tokens)])

        print("total inference time:", total_time)
        print("total inference time / #samples:", total_time / len(input_ids))

        return output_predictions

    def check_input_label_consistency(self, labels):
        model_labels = self.config.label2id.keys()
        dataset_labels = set()
        for l in labels:
            dataset_labels.update(set(l))
        print("model labels:", model_labels)
        print("dataset labels:", dataset_labels)
        print("intersection:", set(model_labels).intersection(dataset_labels))
        print("model_labels-dataset_labels:", list(set(model_labels) - set(dataset_labels)))
        print("dataset_labels-model_labels:", list(set(dataset_labels) - set(model_labels)))
        if list(set(dataset_labels) - set(model_labels)):
            return False
        return True

    @staticmethod
    def resolve_input_label_consistency(labels, label_translation_map):
        for i, sentence_labels in enumerate(labels):
            for j, label in enumerate(sentence_labels):
                labels[i][j] = label_translation_map.get(label)
        return labels

    @staticmethod
    def evaluate_prediction_results(labels, output_predictions):
        """Print seqeval metrics for the `(token, true, predicted)` triples of an evaluation run.

        Predicted tags that never occur in `labels` (the dataset's own tag
        sequences) are counted as 'O' before scoring.

        Args:
            labels: iterable of per-sentence tag sequences of the dataset.
            output_predictions: list with one list of
                `(token, true_label, predicted_label)` tuples per sentence.
        """
        dataset_labels = set().union(*labels)

        true_labels, predictions = [], []
        for sample_output in output_predictions:
            true_labels.append([gold for _, gold, _ in sample_output])
            predictions.append(
                [predicted if predicted in dataset_labels else 'O' for _, _, predicted in sample_output])

        print("Test Accuracy: {}".format(accuracy_score(true_labels, predictions)))
        print("Test Precision: {}".format(precision_score(true_labels, predictions)))
        print("Test Recall: {}".format(recall_score(true_labels, predictions)))
        print("Test F1-Score: {}".format(f1_score(true_labels, predictions)))
        print("Test classification Report:\n{}".format(classification_report(true_labels, predictions, digits=10)))


In [6]:
# Identifier of the pretrained checkpoint to evaluate.
model_name ='HooshvareLab/distilbert-fa-zwnj-base-ner'
# NER.__init__ loads the config, tokenizer and model for `model_name`.
ner_model = NER(model_name)

Downloading:   0%|          | 0.00/1.31k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/426k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.11M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/134 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/352 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/301M [00:00<?, ?B/s]

In [7]:
# Show the loaded model configuration, including the id2label / label2id tag maps.
print(ner_model.config)

DistilBertConfig {
  "activation": "gelu",
  "architectures": [
    "DistilBertForTokenClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "finetuning_task": "ner",
  "hidden_dim": 3072,
  "id2label": {
    "0": "O",
    "1": "B-DAT",
    "2": "B-EVE",
    "3": "B-FAC",
    "4": "B-LOC",
    "5": "B-MON",
    "6": "B-ORG",
    "7": "B-PCT",
    "8": "B-PER",
    "9": "B-PRO",
    "10": "B-TIM",
    "11": "I-DAT",
    "12": "I-EVE",
    "13": "I-FAC",
    "14": "I-LOC",
    "15": "I-MON",
    "16": "I-ORG",
    "17": "I-PCT",
    "18": "I-PER",
    "19": "I-PRO",
    "20": "I-TIM"
  },
  "initializer_range": 0.02,
  "label2id": {
    "B-DAT": 1,
    "B-EVE": 2,
    "B-FAC": 3,
    "B-LOC": 4,
    "B-MON": 5,
    "B-ORG": 6,
    "B-PCT": 7,
    "B-PER": 8,
    "B-PRO": 9,
    "B-TIM": 10,
    "I-DAT": 11,
    "I-EVE": 12,
    "I-FAC": 13,
    "I-LOC": 14,
    "I-MON": 15,
    "I-ORG": 16,
    "I-PCT": 17,
    "I-PER": 18,
    "I-PRO": 19,
    "I-TIM": 20,
 

#### Sample Inference:

In [8]:
# Persian sample sentences used for a quick qualitative check of the model.
texts = [
    "مدیرکل محیط زیست استان البرز با بیان اینکه با بیان اینکه موضوع شیرابه‌های زباله‌های انتقال یافته در منطقه حلقه دره خطری برای این استان است، گفت: در این مورد گزارشاتی در ۲۵ مرداد ۱۳۹۷ تقدیم مدیران استان شده است.",
    "به گزارش خبرگزاری تسنیم از کرج، حسین محمدی در نشست خبری مشترک با معاون خدمات شهری شهرداری کرج که با حضور مدیرعامل سازمان‌های پسماند، پارک‌ها و فضای سبز و نماینده منابع طبیعی در سالن کنفرانس شهرداری کرج برگزار شد، اظهار داشت: ۸۰٪  جمعیت استان البرز در کلانشهر کرج زندگی می‌کنند.",
    "وی افزود: با همکاری‌های مشترک بین اداره کل محیط زیست و شهرداری کرج برنامه‌های مشترکی برای حفاظت از محیط زیست در شهر کرج در دستور کار قرار گرفته که این اقدامات آثار مثبتی داشته و تاکنون نزدیک به ۱۰۰ میلیارد هزینه جهت خریداری اکس-ریس صورت گرفته است.",
]

In [9]:
# Tag the sample sentences; the model's maximum number of positions is used as the truncation length.
inference_output = ner_model.ner_inference(texts, device, ner_model.config.max_position_embeddings)

In [10]:
# One list of (token, label) pairs per input sentence.
print(inference_output)

[[('[CLS]', 'O'), ('مدیرکل', 'O'), ('محیط', 'B-ORG'), ('زیست', 'I-ORG'), ('استان', 'I-ORG'), ('البرز', 'I-ORG'), ('با', 'O'), ('بیان', 'O'), ('اینکه', 'O'), ('با', 'O'), ('بیان', 'O'), ('اینکه', 'O'), ('موضوع', 'O'), ('شیر', 'O'), ('##ابه', 'O'), ('[ZWNJ]', 'O'), ('های', 'O'), ('زباله', 'O'), ('[ZWNJ]', 'O'), ('های', 'O'), ('انتقال', 'O'), ('یافته', 'O'), ('در', 'O'), ('منطقه', 'O'), ('حلقه', 'O'), ('دره', 'O'), ('خطری', 'O'), ('برای', 'O'), ('این', 'O'), ('استان', 'O'), ('است', 'O'), ('،', 'O'), ('گفت', 'O'), (':', 'O'), ('در', 'O'), ('این', 'O'), ('مورد', 'O'), ('گزارشاتی', 'O'), ('در', 'O'), ('۲۵', 'B-DAT'), ('مرداد', 'I-DAT'), ('۱۳۹۷', 'I-DAT'), ('تقدیم', 'O'), ('مدیران', 'O'), ('استان', 'O'), ('شده', 'O'), ('است', 'O'), ('.', 'O'), ('[SEP]', 'O')], [('[CLS]', 'O'), ('به', 'O'), ('گزارش', 'O'), ('خبرگزاری', 'B-ORG'), ('تسنیم', 'I-ORG'), ('از', 'O'), ('کرج', 'B-LOC'), ('،', 'O'), ('حسین', 'B-PER'), ('محمدی', 'I-PER'), ('در', 'O'), ('نشست', 'O'), ('خبری', 'O'), ('مشترک', 'O'), ('با',

In [11]:
#@title Live Playground { display-mode: "form" }

# Stylesheet for the rendered predictions: right-to-left text box, with tagged
# tokens highlighted and their label shown next to them.
css_is_load = False
css = """<style>
.ner-box {
    direction: rtl;
    font-size: 18px !important;
    line-height: 20px !important;
    margin: 0 0 15px;
    padding: 10px;
    text-align: justify;
    color: #343434 !important;
}
.token, .token span {
    display: inline-block !important;
    padding: 2px;
    margin: 2px 0;
}
.token.token-ner {
    background-color: #f6cd61;
    font-weight: bold;
    color: #000;
}
.token.token-ner .ner-label {
    color: #9a1f40;
    margin: 0px 2px;
}
</style>"""

# NOTE(review): `css_is_load` is reset to False at the top of this cell, so this
# guard is always true whenever the cell runs.
if not css_is_load:
    display(HTML(css))
    css_is_load = True

# Input widgets: a submit button, a textarea for the sentence and an output area
# that receives the rendered result.
submit_wd = widgets.Button(description='Send', disabled=False, button_style='success', tooltip='Submit')
text_wd = widgets.Textarea(placeholder='Please enter you text ...', rows=5, layout=Layout(width='90%'))
output_wd = widgets.Output()

display(HTML("""
<h2>Test NER model</h2>
<p style="padding: 2px 20px; margin: 0 0 20px;">
</p>
<br /><br />
"""))

display(text_wd)
display(submit_wd)
display(output_wd)

def submit_text(sender):
    """Button callback: tag the textarea content and render it as highlighted HTML.

    The text in `text_wd` is sent to `ner_model.ner_inference`; every token except
    `[CLS]` / `[SEP]` is wrapped in a `<span>`, and tokens whose label is not 'O'
    additionally show that label. The result replaces the content of `output_wd`.

    Args:
        sender: the widget that triggered the callback (unused).
    """
    # Local import: `escape` is only needed here. Tokens come from user-typed
    # text, so they are escaped before being placed into the markup.
    from html import escape

    with output_wd:
        clear_output(wait=True)
        text = text_wd.value
        _output = ner_model.ner_inference([text], device, ner_model.config.max_position_embeddings)
        if not _output:
            # Nothing to render (ner_inference already reported the problem, if any).
            return

        pred_sequence = []
        for token, label in _output[0]:
            if token in ('[CLS]', '[SEP]'):
                continue
            safe_token = escape(token)
            if label != 'O':
                pred_sequence.append(
                    '<span class="token token-ner">%s<span class="ner-label">%s</span></span>'
                    % (safe_token, escape(label)))
            else:
                pred_sequence.append('<span class="token">%s</span>' % safe_token)

        markup = '<p class="ner-box">%s</p>' % ' '.join(pred_sequence)
        display(HTML(markup))

# Run submit_text whenever the "Send" button is clicked.
submit_wd.on_click(submit_text)

Textarea(value='', layout=Layout(width='90%'), placeholder='Please enter you text ...', rows=5)

Button(button_style='success', description='Send', style=ButtonStyle(), tooltip='Submit')

Output()

#### PEYMA dataset:
The PEYMA dataset includes 7,145 sentences with a total of 302,530 tokens, of which 41,148 tokens are tagged with seven different classes:

- Organization
- Money
- Location
- Date
- Time
- Person
- Percent

|     Label    |   #   |
|:------------:|:-----:|
| Organization | 16964 |
|     Money    |  2037 |
|   Location   |  8782 |
|     Date     |  4259 |
|     Time     |  732  |
|    Person    |  7675 |
|    Percent   |  699  |

Download
You can download the dataset from [here](https://hooshvare.github.io/docs/datasets/ner), which leads to the following Google Drive file of HooshvareLab:

In [12]:
# Authenticate against Google Drive again and rebuild the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
# Fetch the Drive file with this id and store it locally as peyma.zip.
download = drive.CreateFile({'id': '1WZxpFRtEs5HZWyWQ2Pyg9CCuIBs1Kmvx'})
download.GetContentFile('peyma.zip')
!ls

adc.json  peyma.zip  sample_data


In [13]:
# Extract the archive and list the working directory and the extracted peyma/ folder.
!unzip peyma.zip
!ls
!ls peyma

Archive:  peyma.zip
   creating: peyma/
  inflating: peyma/dev.txt           
  inflating: peyma/test.txt          
  inflating: peyma/train.txt         
adc.json  peyma  peyma.zip  sample_data
dev.txt  test.txt  train.txt


In [14]:
# Load the PEYMA test split and show the number of samples plus the first sentence with its tags.
sentences, labels = ner_model.load_test_datasets(dataset_name="peyma", dataset_dir="./peyma/")
print(len(sentences), len(labels))
print(sentences[0])
print(labels[0])

1026 1026
['کنایه', 'سرلشگر', 'فیروزآبادی', 'به', 'پادشاه', 'عربستان', 'و', 'پسرش']
['O', 'O', 'B_ORG', 'O', 'O', 'B_LOC', 'O', 'O']


In [15]:
# Check whether every tag used by the dataset is known to the model.
is_consistent = ner_model.check_input_label_consistency(labels)
print(is_consistent)

model labels: dict_keys(['B-DAT', 'B-EVE', 'B-FAC', 'B-LOC', 'B-MON', 'B-ORG', 'B-PCT', 'B-PER', 'B-PRO', 'B-TIM', 'I-DAT', 'I-EVE', 'I-FAC', 'I-LOC', 'I-MON', 'I-ORG', 'I-PCT', 'I-PER', 'I-PRO', 'I-TIM', 'O'])
dataset labels: {'B_PER', 'B_LOC', 'O', 'I_PER', 'B_TIM', 'I_TIM', 'B_PCT', 'B_DAT', 'B_ORG', 'I_LOC', 'I_MON', 'B_MON', 'I_DAT', 'I_ORG', 'I_PCT'}
intersection: {'O'}
model_labels-dataset_labels: ['I-DAT', 'B-PCT', 'I-TIM', 'I-FAC', 'I-PRO', 'B-PRO', 'B-FAC', 'B-MON', 'B-ORG', 'B-DAT', 'I-PER', 'I-MON', 'B-LOC', 'I-ORG', 'I-EVE', 'B-TIM', 'I-LOC', 'I-PCT', 'B-PER', 'B-EVE']
dataset_labels-model_labels: ['B_PER', 'B_ORG', 'I_MON', 'I_LOC', 'B_DAT', 'B_LOC', 'B_MON', 'I_PER', 'I_DAT', 'B_TIM', 'I_ORG', 'I_TIM', 'I_PCT', 'B_PCT']
False


In [16]:
# PEYMA writes its tags with an underscore (B_PER) whereas the model uses a hyphen (B-PER).
label_translate = {
    f'{prefix}_{entity}': f'{prefix}-{entity}'
    for entity in ('PER', 'LOC', 'ORG', 'MON', 'DAT', 'TIM', 'PCT')
    for prefix in ('B', 'I')
}
label_translate['O'] = 'O'

# Rename the dataset tags, then verify that they now match the model's tag set.
labels = ner_model.resolve_input_label_consistency(labels, label_translate)
is_consistent = ner_model.check_input_label_consistency(labels)
print(is_consistent)

model labels: dict_keys(['B-DAT', 'B-EVE', 'B-FAC', 'B-LOC', 'B-MON', 'B-ORG', 'B-PCT', 'B-PER', 'B-PRO', 'B-TIM', 'I-DAT', 'I-EVE', 'I-FAC', 'I-LOC', 'I-MON', 'I-ORG', 'I-PCT', 'I-PER', 'I-PRO', 'I-TIM', 'O'])
dataset labels: {'B-ORG', 'B-DAT', 'I-PER', 'I-DAT', 'B-PCT', 'I-MON', 'O', 'B-LOC', 'I-TIM', 'I-ORG', 'B-TIM', 'I-LOC', 'I-PCT', 'B-PER', 'B-MON'}
intersection: {'B-ORG', 'B-DAT', 'I-PER', 'I-DAT', 'B-PCT', 'I-MON', 'O', 'B-LOC', 'I-PCT', 'I-TIM', 'I-ORG', 'B-TIM', 'B-MON', 'I-LOC', 'B-PER'}
model_labels-dataset_labels: ['B-PRO', 'B-FAC', 'I-EVE', 'I-FAC', 'B-EVE', 'I-PRO']
dataset_labels-model_labels: []
True


In [17]:
# Report the GPU and CPU state again, right before the evaluation run.
!nvidia-smi
!lscpu

Mon Aug 16 15:03:34 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.42.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   47C    P0    27W /  70W |   1398MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [18]:
# Evaluate the PEYMA test sentences in batches of 512.
inference_output_peyma = ner_model.ner_evaluation(sentences, labels, device, batch_size=512)

max_len: 155
#samples: 1026
#batch: 3
Start to evaluate test data ...
inference time for step 0: 0.025755564000064624
inference time for step 1: 0.006958781999856001
inference time for step 2: 0.006614872000000105
average loss: 0.4362834294637044
total inference time: 0.03932921799992073
total inference time / #samples: 3.83325711500202e-05


In [19]:
# Preview the first five evaluated PEYMA sentences: one tab-separated row per token
# (token, gold tag, predicted tag) and an empty line after each sentence.
for sample_output in inference_output_peyma[:5]:
  for token, true_label, predicted_label in sample_output:
    print(token, true_label, predicted_label, sep='\t')
  print()

کنایه	O	O
سرل	O	O
##شگر	O	O
[UNK]	B-ORG	O
به	O	O
پادشاه	O	O
عربستان	B-LOC	B-LOC
و	O	O
پسرش	O	O

[UNK]	O	O
سابق	O	O
ستاد	B-ORG	O
کل	I-ORG	O
نیروهای	I-ORG	I-ORG
مسلح	I-ORG	I-ORG
با	O	O
بیان	O	O
اینکه	O	O
آ	O	O
##ل	O	O
سعود	O	B-PER
با	O	O
حمایت	O	O
همه	O	O
جانبه	O	O
غرب	O	O
بر	O	O
سرزمین	B-LOC	O
حجاز	I-LOC	O
حاکم	O	O
شد	O	O
گفت	O	O
:	O	O
غرب	O	O
با	O	O
حاکم	O	O
کرد	O	O
##د	O	O
آ	O	O
##ل	O	O
سعود	O	O
بر	O	O
حجاز	B-LOC	O
هدفی	O	O
جز	O	O
##نا	O	O
##بود	O	O
##ی	O	O
اسلام	O	O
نداشته	O	O
و	O	O
این	O	O
نقشه	O	O
انگلیس	B-LOC	B-LOC
بود	O	O
.	O	O

سرل	O	O
##شگر	O	O
حسن	B-PER	B-PER
[UNK]	I-PER	O
روز	O	O
دوشنبه	O	O
درح	O	O
##اشیه	O	O
[UNK]	O	O
ختم	O	O
مادر	O	O
حیدر	B-PER	B-PER
مصلح	I-PER	I-PER
##ی	I-PER	I-PER
درج	O	O
##مع	O	O
خبرنگاران	O	O
درباره	O	O
موضوع	O	O
یمن	B-LOC	B-LOC
افزود	O	O
:	O	O
ماهیت	O	O
آ	O	O
##ن	O	O
##چه	O	O
در	O	O
یمن	B-LOC	B-LOC
اتفاق	O	O
می	O	O
افتد	O	O
وهاب	O	O
##یت	O	O
است	O	O
وهاب	O	O
##یت	O	O
یک	O	O
مذهب	O	O
انگلیسی	O	O
است	O	O
.	O	O

وی	O	O
ادامه	O	O
داد	O	O
:	O	O
وقتی	O	O
که	

In [20]:
# Score the PEYMA predictions against the gold tags; the helper prints accuracy, precision,
# recall, F1 and a per-entity classification report (see output).
ner_model.evaluate_prediction_results(labels, inference_output_peyma)

Test Accuracy: 0.9471615599656785
Test Precision: 0.6385729058945191
Test Recall: 0.5191256830601093
Test F1-Score: 0.5726872246696034
Test classification Report:
              precision    recall  f1-score   support

         DAT  0.6285714286 0.3928571429 0.4835164835       224
         LOC  0.7747747748 0.6134094151 0.6847133758       701
         MON  0.8400000000 0.7241379310 0.7777777778        29
         ORG  0.5980113636 0.5302267003 0.5620827770       794
         PCT  0.7777777778 0.5600000000 0.6511627907        50
         PER  0.5204301075 0.4329159213 0.4726562500       559
         TIM  0.5555555556 0.2272727273 0.3225806452        22

   micro avg  0.6385729059 0.5191256831 0.5726872247      2379
   macro avg  0.6707315725 0.4972599768 0.5649271571      2379
weighted avg  0.6410801518 0.5191256831 0.5720936556      2379



In [21]:
# Persist the PEYMA predictions as a TSV-like text file: one "token<TAB>gold<TAB>predicted"
# line per token, with an empty line separating sentences. '/' in the model name is replaced
# so the name is a valid single file name.
output_file_name = "ner_peyma_{}_outputs.txt".format(model_name.replace('/','-'))
with open(output_file_name, "w", encoding='utf8') as output_file:
  for sample_output in inference_output_peyma:
    for token, true_label, predicted_label in sample_output:
      print(token, true_label, predicted_label, sep='\t', file=output_file)
    print(file=output_file)

# Authenticate the current user, build a Drive client and upload the file under the same title.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
upload = drive.CreateFile({'title': output_file_name})
upload.SetContentFile(output_file_name)
upload.Upload()

#### Arman dataset:
The ARMAN dataset holds 7,682 sentences with 250,015 tokens tagged over six different classes.

1. Organization
2. Location
3. Facility
4. Event
5. Product
6. Person


|     Label    |   #   |
|:------------:|:-----:|
| Organization | 30108 |
|   Location   | 12924 |
|   Facility   |  4458 |
|     Event    |  7557 |
|    Product   |  4389 |
|    Person    | 15645 |

**Download**
You can download the dataset from [here](https://github.com/HaniehP/PersianNER)


In [22]:
# Download the ARMAN corpus archive from the PersianNER GitHub repository, then list the working directory.
!wget https://github.com/HaniehP/PersianNER/raw/master/ArmanPersoNERCorpus.zip
!ls

--2021-08-16 15:03:48--  https://github.com/HaniehP/PersianNER/raw/master/ArmanPersoNERCorpus.zip
Resolving github.com (github.com)... 140.82.112.4
Connecting to github.com (github.com)|140.82.112.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/HaniehP/PersianNER/master/ArmanPersoNERCorpus.zip [following]
--2021-08-16 15:03:49--  https://raw.githubusercontent.com/HaniehP/PersianNER/master/ArmanPersoNERCorpus.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1931170 (1.8M) [application/zip]
Saving to: ‘ArmanPersoNERCorpus.zip’


2021-08-16 15:03:49 (49.4 MB/s) - ‘ArmanPersoNERCorpus.zip’ saved [1931170/1931170]

adc.json							peyma
ArmanPersoNERCorpus.zip						peyma.zip
ner_peyma_HooshvareL

In [23]:
# Extract the archive into ./arman (three train/test fold files plus a ReadMe, per the output below).
!unzip ArmanPersoNERCorpus.zip -d arman
!ls

Archive:  ArmanPersoNERCorpus.zip
  inflating: arman/test_fold1.txt    
  inflating: arman/ReadMe.txt        
  inflating: arman/train_fold3.txt   
  inflating: arman/train_fold2.txt   
  inflating: arman/train_fold1.txt   
  inflating: arman/test_fold3.txt    
  inflating: arman/test_fold2.txt    
adc.json							peyma
arman								peyma.zip
ArmanPersoNERCorpus.zip						sample_data
ner_peyma_HooshvareLab-distilbert-fa-zwnj-base-ner_outputs.txt


In [24]:
# Load the ARMAN evaluation data from ./arman/ as parallel lists of token lists and tag lists,
# then show the sizes and the first sample as a sanity check.
sentences, labels = ner_model.load_test_datasets(
    dataset_name="arman",
    dataset_dir="./arman/",
)
print(len(sentences), len(labels))
print(sentences[0], labels[0], sep='\n')

7681 7681
['افقی', ':', '0', 'ـ', 'از', 'عوامل', 'دوران', 'پهلوی', 'و', 'نخست\u200cوزیر', 'ایران', 'در', 'سالهای', 'ابتدائی', 'دهه', 'چهل', 'خورشیدی', 'كه', 'جلد', 'سوم', 'یادداشتهایش', 'هم', 'چندی', 'پیش', 'در', 'تهران', 'منتشر', 'شد', '0', 'ـ', 'پرستاری', 'از', 'ناخوش\u200cاحوال', 'ـ', 'پوشاک', 'و', 'جامه', 'ـ', 'فانتزی', 'و', 'شیک', '0', 'ـ', 'در', 'حال', 'وزیدن', 'ـ', 'اطلاعیه', 'ـ', 'پایتخت', 'جمهوری', 'استونی', 'در', 'حوضه', 'بالتیک', '0', 'ـ', 'علم', 'راهبرد', 'مؤسسه', 'و', 'سازمان', 'ـ', 'نوعی', 'شمع', '0', 'ـ', 'حرف', 'جمع', 'مؤنث', 'ـ', 'در', 'ایران', 'به', 'تولیدکننده', 'کتاب', 'اطلاق', 'می\u200cشود', 'ـ', 'از', 'شهرهای', 'باختری', 'افغانستان', 'كه', 'تا', 'عصر', 'ناصرالدین\u200cشاه', 'جزئی', 'از', 'خراسان', 'بود', 'ـ', 'ویتامین', 'انعقاد', '0', 'ـ', 'سبزی', 'غده\u200cای', 'ـ', 'دوستی', 'و', 'محبت', 'ـ', 'داستان', 'بلند', 'ـ', 'شهری', 'در', 'آلمان', '0', 'ـ', 'سلول', 'بدن', 'موجودات', 'ـ', 'از', 'انواع', 'کالباس', '0', 'ـ', 'حاشیه', 'و', 'هامش', 'ـ', 'پیدا', 'نشدنی', 'ـ', 'خ

In [25]:
# Compare ARMAN's tag vocabulary with the model's label set. The helper prints both sets, their
# intersection and both differences; judging from the recorded runs, the returned flag is True only
# when the dataset uses no tag unknown to the model. ARMAN's raw tags are lower-case (B-org, B-pers, ...),
# hence False here.
is_consistent = ner_model.check_input_label_consistency(labels)
print(is_consistent)

model labels: dict_keys(['B-DAT', 'B-EVE', 'B-FAC', 'B-LOC', 'B-MON', 'B-ORG', 'B-PCT', 'B-PER', 'B-PRO', 'B-TIM', 'I-DAT', 'I-EVE', 'I-FAC', 'I-LOC', 'I-MON', 'I-ORG', 'I-PCT', 'I-PER', 'I-PRO', 'I-TIM', 'O'])
dataset labels: {'I-pro', 'B-pro', 'O', 'B-loc', 'I-pers', 'B-org', 'B-event', 'B-fac', 'I-org', 'B-pers', 'I-event', 'I-fac', 'I-loc'}
intersection: {'O'}
model_labels-dataset_labels: ['I-DAT', 'B-PCT', 'I-TIM', 'I-FAC', 'I-PRO', 'B-PRO', 'B-FAC', 'B-MON', 'B-ORG', 'B-DAT', 'I-PER', 'I-MON', 'B-LOC', 'I-ORG', 'I-EVE', 'B-TIM', 'I-LOC', 'I-PCT', 'B-PER', 'B-EVE']
dataset_labels-model_labels: ['B-org', 'I-pro', 'B-pro', 'B-loc', 'B-event', 'B-fac', 'I-org', 'B-pers', 'I-event', 'I-fac', 'I-pers', 'I-loc']
False


In [26]:
# Map ARMAN's lower-case entity names onto the model's upper-case three-letter codes,
# for both the B- (begin) and I- (inside) variants; 'O' maps to itself.
label_translate = {
    '{}-{}'.format(prefix, source): '{}-{}'.format(prefix, target)
    for source, target in (
        ('org', 'ORG'),
        ('loc', 'LOC'),
        ('pers', 'PER'),
        ('event', 'EVE'),
        ('pro', 'PRO'),
        ('fac', 'FAC'),
    )
    for prefix in ('B', 'I')
}
label_translate['O'] = 'O'

# NOTE: `labels` is overwritten with its translated version, so this cell is not meant to be run twice
# on the same kernel state.
labels = ner_model.resolve_input_label_consistency(labels, label_translate)
# Re-check: after translation every dataset tag should be a model label.
is_consistent = ner_model.check_input_label_consistency(labels)
print(is_consistent)

model labels: dict_keys(['B-DAT', 'B-EVE', 'B-FAC', 'B-LOC', 'B-MON', 'B-ORG', 'B-PCT', 'B-PER', 'B-PRO', 'B-TIM', 'I-DAT', 'I-EVE', 'I-FAC', 'I-LOC', 'I-MON', 'I-ORG', 'I-PCT', 'I-PER', 'I-PRO', 'I-TIM', 'O'])
dataset labels: {'B-ORG', 'I-PER', 'O', 'B-LOC', 'I-ORG', 'I-EVE', 'I-FAC', 'I-LOC', 'I-PRO', 'B-PRO', 'B-FAC', 'B-PER', 'B-EVE'}
intersection: {'B-ORG', 'B-PRO', 'I-PER', 'B-FAC', 'B-PER', 'O', 'B-LOC', 'I-ORG', 'I-EVE', 'I-FAC', 'I-LOC', 'B-EVE', 'I-PRO'}
model_labels-dataset_labels: ['B-DAT', 'I-DAT', 'B-PCT', 'I-MON', 'I-PCT', 'I-TIM', 'B-TIM', 'B-MON']
dataset_labels-model_labels: []
True


In [27]:
# Record GPU state (nvidia-smi) and CPU details (lscpu) right before the timed ARMAN evaluation.
!nvidia-smi
!lscpu

Mon Aug 16 15:03:50 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.42.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   51C    P0    27W /  70W |   5782MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [28]:
# Run batched inference (512 sentences per batch) over the ARMAN data loaded above.
# The helper logs per-step inference time and the average loss; the result is consumed below
# as one list of (token, true_label, predicted_label) triples per sentence.
inference_output_arman = ner_model.ner_evaluation(sentences, labels, device, batch_size=512)

max_len: 287
#samples: 7681
#batch: 16
Start to evaluate test data ...
inference time for step 0: 0.02669194199984304
inference time for step 1: 0.007592595000005531
inference time for step 2: 0.004633436999938567
inference time for step 3: 0.005185672999914459
inference time for step 4: 0.0045541719998709596
inference time for step 5: 0.004972495999936655
inference time for step 6: 0.004717387000027884
inference time for step 7: 0.004596398999865414
inference time for step 8: 0.004604427999993277
inference time for step 9: 0.004650617999914175
inference time for step 10: 0.005160434999879726
inference time for step 11: 0.004705661000116379
inference time for step 12: 0.005041816999892035
inference time for step 13: 0.007667059999903358
inference time for step 14: 0.004545687000018006
inference time for step 15: 0.005548156000031668
average loss: 0.28428298234939575
total inference time: 0.10486796299915113
total inference time / #samples: 1.3652904960181113e-05


In [29]:
# Preview the first five evaluated ARMAN sentences: one tab-separated row per token
# (token, gold tag, predicted tag) and an empty line after each sentence.
for sample_output in inference_output_arman[:5]:
  for token, true_label, predicted_label in sample_output:
    print(token, true_label, predicted_label, sep='\t')
  print()

افقی	O	O
:	O	O
0	O	O
[UNK]	O	O
از	O	O
عوامل	O	O
دوران	O	O
پهلوی	O	O
و	O	O
نخست	O	O
[ZWNJ]	O	O
وزیر	O	O
ایران	B-LOC	B-LOC
در	O	O
سالهای	O	O
[UNK]	O	O
دهه	O	O
چهل	O	O
خورشیدی	O	O
[UNK]	O	O
جلد	O	O
سوم	O	O
یادداشت	O	O
##هایش	O	O
هم	O	O
چندی	O	O
پیش	O	O
در	O	O
تهران	B-LOC	B-LOC
منتشر	O	O
شد	O	O
0	O	O
[UNK]	O	O
پرستاری	O	O
از	O	O
ناخوش	O	O
[ZWNJ]	O	O
احوال	O	O
[UNK]	O	O
پوشاک	O	O
و	O	O
جامه	O	O
[UNK]	O	O
فانتزی	O	O
و	O	O
شیک	O	O
0	O	O
[UNK]	O	O
در	O	O
حال	O	O
وزی	O	O
##دن	O	O
[UNK]	O	O
اطلاعیه	O	O
[UNK]	O	O
پایتخت	O	O
جمهوری	O	O
استونی	B-LOC	B-LOC
در	I-LOC	I-LOC
حوضه	I-LOC	I-LOC
بالتیک	I-LOC	I-LOC
0	O	O
[UNK]	O	O
علم	O	O
راهبرد	O	O
[UNK]	O	O
و	O	O
سازمان	O	O
[UNK]	O	O
نوعی	O	O
شمع	O	O
0	O	O
[UNK]	O	O
حرف	O	O
جمع	O	O
[UNK]	O	O
[UNK]	O	O
در	O	O
ایران	B-LOC	B-LOC
به	O	O
تولیدکننده	O	O
کتاب	O	O
اطلاق	O	O
می	O	O
[ZWNJ]	O	O
شود	O	O
[UNK]	O	O
از	O	O
شهرهای	O	O
باختری	O	O
افغانستان	B-LOC	B-LOC
[UNK]	O	O
تا	O	O
عصر	O	O
ناصرالدین	B-PER	B-PER
[ZWNJ]	B-PER	O
شاه	B-PER	O
[UNK]	O	O
از	O	O
خراسان	B-LOC	B-

In [30]:
# Score the ARMAN predictions against the gold tags; the helper prints accuracy, precision,
# recall, F1 and a per-entity classification report (see output).
ner_model.evaluate_prediction_results(labels, inference_output_arman)

Test Accuracy: 0.9658612190275909
Test Precision: 0.6247908533184606
Test Recall: 0.5375479846449136
Test F1-Score: 0.5778952798555583
Test classification Report:
              precision    recall  f1-score   support

         EVE  0.3751451800 0.4806547619 0.4213959556       672
         FAC  0.5924596050 0.5348460292 0.5621805792       617
         LOC  0.6892712551 0.5849259180 0.6328261122      4657
         ORG  0.6605166052 0.5628930818 0.6078098472      5406
         PER  0.5900128041 0.5294117647 0.5580719390      4352
         PRO  0.5151515152 0.2458677686 0.3328671329       968

   micro avg  0.6247908533 0.5375479846 0.5778952799     16672
   macro avg  0.5704261608 0.4897665540 0.5191919277     16672
weighted avg  0.6276833331 0.5375479846 0.5766482246     16672



In [31]:
# Persist the ARMAN predictions as a TSV-like text file: one "token<TAB>gold<TAB>predicted"
# line per token, with an empty line separating sentences. '/' in the model name is replaced
# so the name is a valid single file name.
output_file_name = "ner_arman_{}_outputs.txt".format(model_name.replace('/','-'))
with open(output_file_name, "w", encoding='utf8') as output_file:
  for sample_output in inference_output_arman:
    for token, true_label, predicted_label in sample_output:
      print(token, true_label, predicted_label, sep='\t', file=output_file)
    print(file=output_file)

# Authenticate the current user, build a Drive client and upload the file under the same title.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
upload = drive.CreateFile({'title': output_file_name})
upload.SetContentFile(output_file_name)
upload.Upload()

#### Arman+Peyma

In [None]:
# Authenticate the current user and build a Google Drive client
# (auth, GoogleAuth, GoogleCredentials and GoogleDrive are defined outside this cell).
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
# Fetch the Drive file with this hard-coded id and store it locally as peyma.zip.
# This cell has no execution count: peyma.zip was already present from the earlier PEYMA section.
download = drive.CreateFile({'id': '1WZxpFRtEs5HZWyWQ2Pyg9CCuIBs1Kmvx'})
download.GetContentFile('peyma.zip')
!ls

In [None]:
!unzip peyma.zip
!ls
!ls peyma

In [32]:
# Load the PEYMA evaluation data from ./peyma/ as parallel lists of token lists and tag lists,
# under dataset-specific names so it can later be merged with ARMAN.
sentences_peyma, labels_peyma = ner_model.load_test_datasets(
    dataset_name="peyma",
    dataset_dir="./peyma/",
)
print(len(sentences_peyma), len(labels_peyma))
print(sentences_peyma[0], labels_peyma[0], sep='\n')

1026 1026
['کنایه', 'سرلشگر', 'فیروزآبادی', 'به', 'پادشاه', 'عربستان', 'و', 'پسرش']
['O', 'O', 'B_ORG', 'O', 'O', 'B_LOC', 'O', 'O']


In [33]:
# Compare PEYMA's tag vocabulary with the model's label set. The helper prints both sets, their
# intersection and both differences. PEYMA's raw tags use an underscore (B_PER, I_LOC, ...) where
# the model uses a hyphen, hence False here.
is_consistent = ner_model.check_input_label_consistency(labels_peyma)
print(is_consistent)

model labels: dict_keys(['B-DAT', 'B-EVE', 'B-FAC', 'B-LOC', 'B-MON', 'B-ORG', 'B-PCT', 'B-PER', 'B-PRO', 'B-TIM', 'I-DAT', 'I-EVE', 'I-FAC', 'I-LOC', 'I-MON', 'I-ORG', 'I-PCT', 'I-PER', 'I-PRO', 'I-TIM', 'O'])
dataset labels: {'B_PER', 'B_LOC', 'O', 'I_PER', 'B_TIM', 'I_TIM', 'B_PCT', 'B_DAT', 'B_ORG', 'I_LOC', 'I_MON', 'B_MON', 'I_DAT', 'I_ORG', 'I_PCT'}
intersection: {'O'}
model_labels-dataset_labels: ['I-DAT', 'B-PCT', 'I-TIM', 'I-FAC', 'I-PRO', 'B-PRO', 'B-FAC', 'B-MON', 'B-ORG', 'B-DAT', 'I-PER', 'I-MON', 'B-LOC', 'I-ORG', 'I-EVE', 'B-TIM', 'I-LOC', 'I-PCT', 'B-PER', 'B-EVE']
dataset_labels-model_labels: ['B_PER', 'B_ORG', 'I_MON', 'I_LOC', 'B_DAT', 'B_LOC', 'B_MON', 'I_PER', 'I_DAT', 'B_TIM', 'I_ORG', 'I_TIM', 'I_PCT', 'B_PCT']
False


In [34]:
# PEYMA tags differ from the model's only in the separator: B_XXX / I_XXX become B-XXX / I-XXX
# for each of PEYMA's seven entity types; 'O' maps to itself.
label_translate = {
    '{}_{}'.format(prefix, entity): '{}-{}'.format(prefix, entity)
    for entity in ('PER', 'LOC', 'ORG', 'MON', 'DAT', 'TIM', 'PCT')
    for prefix in ('B', 'I')
}
label_translate['O'] = 'O'

# NOTE: `labels_peyma` is overwritten with its translated version, so this cell is not meant
# to be run twice on the same kernel state.
labels_peyma = ner_model.resolve_input_label_consistency(labels_peyma, label_translate)
# Re-check: after translation every dataset tag should be a model label.
is_consistent = ner_model.check_input_label_consistency(labels_peyma)
print(is_consistent)

model labels: dict_keys(['B-DAT', 'B-EVE', 'B-FAC', 'B-LOC', 'B-MON', 'B-ORG', 'B-PCT', 'B-PER', 'B-PRO', 'B-TIM', 'I-DAT', 'I-EVE', 'I-FAC', 'I-LOC', 'I-MON', 'I-ORG', 'I-PCT', 'I-PER', 'I-PRO', 'I-TIM', 'O'])
dataset labels: {'B-ORG', 'B-DAT', 'I-PER', 'I-DAT', 'B-PCT', 'I-MON', 'O', 'B-LOC', 'I-TIM', 'I-ORG', 'B-TIM', 'I-LOC', 'I-PCT', 'B-PER', 'B-MON'}
intersection: {'B-ORG', 'B-DAT', 'I-PER', 'I-DAT', 'B-PCT', 'I-MON', 'O', 'B-LOC', 'I-PCT', 'I-TIM', 'I-ORG', 'B-TIM', 'B-MON', 'I-LOC', 'B-PER'}
model_labels-dataset_labels: ['B-PRO', 'B-FAC', 'I-EVE', 'I-FAC', 'B-EVE', 'I-PRO']
dataset_labels-model_labels: []
True


In [None]:
!wget https://github.com/HaniehP/PersianNER/raw/master/ArmanPersoNERCorpus.zip
!ls

In [None]:
!unzip ArmanPersoNERCorpus.zip -d arman
!ls

In [35]:
# Load the ARMAN evaluation data from ./arman/ as parallel lists of token lists and tag lists,
# under dataset-specific names so it can later be merged with PEYMA.
sentences_arman, labels_arman = ner_model.load_test_datasets(
    dataset_name="arman",
    dataset_dir="./arman/",
)
print(len(sentences_arman), len(labels_arman))
print(sentences_arman[0], labels_arman[0], sep='\n')

7681 7681
['افقی', ':', '0', 'ـ', 'از', 'عوامل', 'دوران', 'پهلوی', 'و', 'نخست\u200cوزیر', 'ایران', 'در', 'سالهای', 'ابتدائی', 'دهه', 'چهل', 'خورشیدی', 'كه', 'جلد', 'سوم', 'یادداشتهایش', 'هم', 'چندی', 'پیش', 'در', 'تهران', 'منتشر', 'شد', '0', 'ـ', 'پرستاری', 'از', 'ناخوش\u200cاحوال', 'ـ', 'پوشاک', 'و', 'جامه', 'ـ', 'فانتزی', 'و', 'شیک', '0', 'ـ', 'در', 'حال', 'وزیدن', 'ـ', 'اطلاعیه', 'ـ', 'پایتخت', 'جمهوری', 'استونی', 'در', 'حوضه', 'بالتیک', '0', 'ـ', 'علم', 'راهبرد', 'مؤسسه', 'و', 'سازمان', 'ـ', 'نوعی', 'شمع', '0', 'ـ', 'حرف', 'جمع', 'مؤنث', 'ـ', 'در', 'ایران', 'به', 'تولیدکننده', 'کتاب', 'اطلاق', 'می\u200cشود', 'ـ', 'از', 'شهرهای', 'باختری', 'افغانستان', 'كه', 'تا', 'عصر', 'ناصرالدین\u200cشاه', 'جزئی', 'از', 'خراسان', 'بود', 'ـ', 'ویتامین', 'انعقاد', '0', 'ـ', 'سبزی', 'غده\u200cای', 'ـ', 'دوستی', 'و', 'محبت', 'ـ', 'داستان', 'بلند', 'ـ', 'شهری', 'در', 'آلمان', '0', 'ـ', 'سلول', 'بدن', 'موجودات', 'ـ', 'از', 'انواع', 'کالباس', '0', 'ـ', 'حاشیه', 'و', 'هامش', 'ـ', 'پیدا', 'نشدنی', 'ـ', 'خ

In [36]:
# Compare ARMAN's tag vocabulary with the model's label set. The helper prints both sets, their
# intersection and both differences. ARMAN's raw tags are lower-case (B-org, B-pers, ...), hence False here.
is_consistent = ner_model.check_input_label_consistency(labels_arman)
print(is_consistent)

model labels: dict_keys(['B-DAT', 'B-EVE', 'B-FAC', 'B-LOC', 'B-MON', 'B-ORG', 'B-PCT', 'B-PER', 'B-PRO', 'B-TIM', 'I-DAT', 'I-EVE', 'I-FAC', 'I-LOC', 'I-MON', 'I-ORG', 'I-PCT', 'I-PER', 'I-PRO', 'I-TIM', 'O'])
dataset labels: {'I-pro', 'B-pro', 'O', 'B-loc', 'I-pers', 'B-org', 'B-event', 'B-fac', 'I-org', 'B-pers', 'I-event', 'I-fac', 'I-loc'}
intersection: {'O'}
model_labels-dataset_labels: ['I-DAT', 'B-PCT', 'I-TIM', 'I-FAC', 'I-PRO', 'B-PRO', 'B-FAC', 'B-MON', 'B-ORG', 'B-DAT', 'I-PER', 'I-MON', 'B-LOC', 'I-ORG', 'I-EVE', 'B-TIM', 'I-LOC', 'I-PCT', 'B-PER', 'B-EVE']
dataset_labels-model_labels: ['B-org', 'I-pro', 'B-pro', 'B-loc', 'B-event', 'B-fac', 'I-org', 'B-pers', 'I-event', 'I-fac', 'I-pers', 'I-loc']
False


In [37]:
# Map ARMAN's lower-case entity names onto the model's upper-case three-letter codes,
# for both the B- (begin) and I- (inside) variants; 'O' maps to itself.
label_translate = {
    '{}-{}'.format(prefix, source): '{}-{}'.format(prefix, target)
    for source, target in (
        ('org', 'ORG'),
        ('loc', 'LOC'),
        ('pers', 'PER'),
        ('event', 'EVE'),
        ('pro', 'PRO'),
        ('fac', 'FAC'),
    )
    for prefix in ('B', 'I')
}
label_translate['O'] = 'O'

# NOTE: `labels_arman` is overwritten with its translated version, so this cell is not meant
# to be run twice on the same kernel state.
labels_arman = ner_model.resolve_input_label_consistency(labels_arman, label_translate)
# Re-check: after translation every dataset tag should be a model label.
is_consistent = ner_model.check_input_label_consistency(labels_arman)
print(is_consistent)

model labels: dict_keys(['B-DAT', 'B-EVE', 'B-FAC', 'B-LOC', 'B-MON', 'B-ORG', 'B-PCT', 'B-PER', 'B-PRO', 'B-TIM', 'I-DAT', 'I-EVE', 'I-FAC', 'I-LOC', 'I-MON', 'I-ORG', 'I-PCT', 'I-PER', 'I-PRO', 'I-TIM', 'O'])
dataset labels: {'B-ORG', 'I-PER', 'O', 'B-LOC', 'I-ORG', 'I-EVE', 'I-FAC', 'I-LOC', 'I-PRO', 'B-PRO', 'B-FAC', 'B-PER', 'B-EVE'}
intersection: {'B-ORG', 'B-PRO', 'I-PER', 'B-FAC', 'B-PER', 'O', 'B-LOC', 'I-ORG', 'I-EVE', 'I-FAC', 'I-LOC', 'B-EVE', 'I-PRO'}
model_labels-dataset_labels: ['B-DAT', 'I-DAT', 'B-PCT', 'I-MON', 'I-PCT', 'I-TIM', 'B-TIM', 'B-MON']
dataset_labels-model_labels: []
True


In [38]:
# Pool both corpora (ARMAN first, then PEYMA) so they can be scored as a single test set;
# sentences and tag sequences are concatenated in the same order to stay aligned.
sentences, labels = sentences_arman + sentences_peyma, labels_arman + labels_peyma
print(len(sentences), len(labels))

8707 8707


In [39]:
# Verify the merged ARMAN+PEYMA tag vocabulary against the model's label set. Both inputs were
# translated beforehand, and per the recorded output the union covers every model label.
is_consistent = ner_model.check_input_label_consistency(labels)
print(is_consistent)

model labels: dict_keys(['B-DAT', 'B-EVE', 'B-FAC', 'B-LOC', 'B-MON', 'B-ORG', 'B-PCT', 'B-PER', 'B-PRO', 'B-TIM', 'I-DAT', 'I-EVE', 'I-FAC', 'I-LOC', 'I-MON', 'I-ORG', 'I-PCT', 'I-PER', 'I-PRO', 'I-TIM', 'O'])
dataset labels: {'B-ORG', 'B-DAT', 'I-PER', 'I-DAT', 'B-PCT', 'I-MON', 'O', 'B-LOC', 'I-TIM', 'I-ORG', 'I-EVE', 'I-FAC', 'I-LOC', 'B-TIM', 'I-PRO', 'B-PRO', 'B-FAC', 'I-PCT', 'B-PER', 'B-MON', 'B-EVE'}
intersection: {'I-DAT', 'B-PCT', 'I-TIM', 'I-FAC', 'I-PRO', 'B-PRO', 'B-FAC', 'B-MON', 'B-ORG', 'B-DAT', 'I-PER', 'I-MON', 'O', 'B-LOC', 'I-ORG', 'B-TIM', 'I-EVE', 'I-LOC', 'I-PCT', 'B-PER', 'B-EVE'}
model_labels-dataset_labels: []
dataset_labels-model_labels: []
True


In [40]:
# Record GPU state (nvidia-smi) and CPU details (lscpu) right before the timed ARMAN+PEYMA evaluation.
!nvidia-smi
!lscpu

Mon Aug 16 15:06:27 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.42.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   75C    P0    32W /  70W |   8708MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [41]:
# Run batched inference (512 sentences per batch) over the pooled ARMAN+PEYMA test set.
# The result is consumed below as one list of (token, true_label, predicted_label) triples per sentence.
inference_output = ner_model.ner_evaluation(sentences, labels, device, batch_size=512)

max_len: 287
#samples: 8707
#batch: 18
Start to evaluate test data ...
inference time for step 0: 0.10638825399996676
inference time for step 1: 0.009052853000184768
inference time for step 2: 0.0047055109998837
inference time for step 3: 0.004536938000001101
inference time for step 4: 0.004735155999924245
inference time for step 5: 0.00460266999994019
inference time for step 6: 0.004774518999965949
inference time for step 7: 0.004520519999914541
inference time for step 8: 0.00525290100017628
inference time for step 9: 0.0057256889999734994
inference time for step 10: 0.004818182000008164
inference time for step 11: 0.005073764000144365
inference time for step 12: 0.004949151000118945
inference time for step 13: 0.004720280999890747
inference time for step 14: 0.007451193999941097
inference time for step 15: 0.004615144999888798
inference time for step 16: 0.005036668999991889
inference time for step 17: 0.005368672000031438
average loss: 0.27665317555268604
total inference time: 0.196

In [42]:
# Preview the first five evaluated ARMAN+PEYMA sentences: one tab-separated row per token
# (token, gold tag, predicted tag) and an empty line after each sentence.
for sample_output in inference_output[:5]:
  for token, true_label, predicted_label in sample_output:
    print(token, true_label, predicted_label, sep='\t')
  print()

افقی	O	O
:	O	O
0	O	O
[UNK]	O	O
از	O	O
عوامل	O	O
دوران	O	O
پهلوی	O	O
و	O	O
نخست	O	O
[ZWNJ]	O	O
وزیر	O	O
ایران	B-LOC	B-LOC
در	O	O
سالهای	O	O
[UNK]	O	O
دهه	O	O
چهل	O	O
خورشیدی	O	O
[UNK]	O	O
جلد	O	O
سوم	O	O
یادداشت	O	O
##هایش	O	O
هم	O	O
چندی	O	O
پیش	O	O
در	O	O
تهران	B-LOC	B-LOC
منتشر	O	O
شد	O	O
0	O	O
[UNK]	O	O
پرستاری	O	O
از	O	O
ناخوش	O	O
[ZWNJ]	O	O
احوال	O	O
[UNK]	O	O
پوشاک	O	O
و	O	O
جامه	O	O
[UNK]	O	O
فانتزی	O	O
و	O	O
شیک	O	O
0	O	O
[UNK]	O	O
در	O	O
حال	O	O
وزی	O	O
##دن	O	O
[UNK]	O	O
اطلاعیه	O	O
[UNK]	O	O
پایتخت	O	O
جمهوری	O	O
استونی	B-LOC	B-LOC
در	I-LOC	I-LOC
حوضه	I-LOC	I-LOC
بالتیک	I-LOC	I-LOC
0	O	O
[UNK]	O	O
علم	O	O
راهبرد	O	O
[UNK]	O	O
و	O	O
سازمان	O	O
[UNK]	O	O
نوعی	O	O
شمع	O	O
0	O	O
[UNK]	O	O
حرف	O	O
جمع	O	O
[UNK]	O	O
[UNK]	O	O
در	O	O
ایران	B-LOC	B-LOC
به	O	O
تولیدکننده	O	O
کتاب	O	O
اطلاق	O	O
می	O	O
[ZWNJ]	O	O
شود	O	O
[UNK]	O	O
از	O	O
شهرهای	O	O
باختری	O	O
افغانستان	B-LOC	B-LOC
[UNK]	O	O
تا	O	O
عصر	O	O
ناصرالدین	B-PER	B-PER
[ZWNJ]	B-PER	O
شاه	B-PER	O
[UNK]	O	O
از	O	O
خراسان	B-LOC	B-

In [43]:
# Score the pooled ARMAN+PEYMA predictions against the gold tags; the helper prints accuracy,
# precision, recall, F1 and a per-entity classification report (see output).
ner_model.evaluate_prediction_results(labels, inference_output)

Test Accuracy: 0.9628902510553076
Test Precision: 0.6189484005392818
Test Recall: 0.5301558973282242
Test F1-Score: 0.5711216036642256
Test classification Report:
              precision    recall  f1-score   support

         DAT  0.5140845070 0.3258928571 0.3989071038       224
         EVE  0.3621076233 0.4806547619 0.4130434783       672
         FAC  0.5593220339 0.5348460292 0.5468102734       617
         LOC  0.6980418336 0.5854796566 0.6368250102      5358
         MON  0.8000000000 0.6896551724 0.7407407407        29
         ORG  0.6468366383 0.5524193548 0.5959112658      6200
         PCT  0.7714285714 0.5400000000 0.6352941176        50
         PER  0.5779354543 0.5141519039 0.5441810345      4911
         PRO  0.5096359743 0.2458677686 0.3317073171       968
         TIM  0.2222222222 0.0909090909 0.1290322581        22

   micro avg  0.6189484005 0.5301558973 0.5711216037     19051
   macro avg  0.5661614858 0.4559876595 0.4972452599     19051
weighted avg  0.622136224

In [44]:
# Persist the ARMAN+PEYMA predictions as a TSV-like text file: one "token<TAB>gold<TAB>predicted"
# line per token, with an empty line separating sentences. '/' in the model name is replaced
# so the name is a valid single file name.
output_file_name = "ner_arman-and-peyma_{}_outputs.txt".format(model_name.replace('/','-'))
with open(output_file_name, "w", encoding='utf8') as output_file:
  for sample_output in inference_output:
    for token, true_label, predicted_label in sample_output:
      print(token, true_label, predicted_label, sep='\t', file=output_file)
    print(file=output_file)

# Authenticate the current user, build a Drive client and upload the file under the same title.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
upload = drive.CreateFile({'title': output_file_name})
upload.SetContentFile(output_file_name)
upload.Upload()

#### WikiAnn dataset:

In [45]:
# Authenticate the current user and build a Google Drive client
# (auth, GoogleAuth, GoogleCredentials and GoogleDrive are defined outside this cell).
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
# Fetch the Drive file with this hard-coded id and store it locally as fa.tar.gz
# (the WikiANN archive, per this section's heading).
download = drive.CreateFile({'id': '1QOG15HU8VfZvJUNKos024xI-OGm0zhEX'})
download.GetContentFile('fa.tar.gz')
!ls

adc.json
arman
ArmanPersoNERCorpus.zip
fa.tar.gz
ner_arman-and-peyma_HooshvareLab-distilbert-fa-zwnj-base-ner_outputs.txt
ner_arman_HooshvareLab-distilbert-fa-zwnj-base-ner_outputs.txt
ner_peyma_HooshvareLab-distilbert-fa-zwnj-base-ner_outputs.txt
peyma
peyma.zip
sample_data


In [46]:
# Unpack the gzip-compressed archive into the working directory (yields README.txt and
# wikiann-fa.bio, per the output), then list the directory.
!tar -zxvf fa.tar.gz
!ls

README.txt
wikiann-fa.bio
adc.json
arman
ArmanPersoNERCorpus.zip
fa.tar.gz
ner_arman-and-peyma_HooshvareLab-distilbert-fa-zwnj-base-ner_outputs.txt
ner_arman_HooshvareLab-distilbert-fa-zwnj-base-ner_outputs.txt
ner_peyma_HooshvareLab-distilbert-fa-zwnj-base-ner_outputs.txt
peyma
peyma.zip
README.txt
sample_data
wikiann-fa.bio


In [47]:
# Load the WikiANN data from the working directory. The helper returns the full corpus plus a
# held-out test part (27,227 of 272,266 sentences in the recorded run, split "without stratify").
sentences_all, labels_all, sentences_test, labels_test = ner_model.load_datasets(dataset_name="wikiann", dataset_dir="./")
# Sanity check that sentences and tag sequences are parallel, for the full corpus and for the test part.
# The first line previously printed len(sentences_all) twice, so a mismatch could never show up.
print(len(sentences_all), len(labels_all))
print(len(sentences_test), len(labels_test))
# Show the first test sample and its tags.
print(sentences_test[0])
print(labels_test[0])

all data: #data: 272266, #labels: 272266


  return array(a, dtype, copy=False, order=order)


without stratify
test part:
 #data: 27227, #labels: 27227
272266 272266
27227 27227
['**', 'زاغی', 'نوک\u200cزرد', ',', "''Pica", 'nuttalli', "''"]
['O', 'B-LOC', 'I-LOC', 'O', 'O', 'O', 'O']


In [48]:
# Compare the WikiANN test tags with the model's label set. WikiANN only uses LOC/ORG/PER tags,
# already in the model's B-/I- spelling, so no translation step is needed (flag is True in the recorded run).
is_consistent = ner_model.check_input_label_consistency(labels_test)
print(is_consistent)

model labels: dict_keys(['B-DAT', 'B-EVE', 'B-FAC', 'B-LOC', 'B-MON', 'B-ORG', 'B-PCT', 'B-PER', 'B-PRO', 'B-TIM', 'I-DAT', 'I-EVE', 'I-FAC', 'I-LOC', 'I-MON', 'I-ORG', 'I-PCT', 'I-PER', 'I-PRO', 'I-TIM', 'O'])
dataset labels: {'B-ORG', 'I-PER', 'O', 'B-LOC', 'I-ORG', 'B-PER', 'I-LOC'}
intersection: {'B-ORG', 'I-PER', 'O', 'B-LOC', 'I-ORG', 'B-PER', 'I-LOC'}
model_labels-dataset_labels: ['B-PRO', 'B-MON', 'B-DAT', 'B-FAC', 'I-DAT', 'B-PCT', 'I-MON', 'I-PCT', 'I-EVE', 'I-TIM', 'B-TIM', 'I-FAC', 'B-EVE', 'I-PRO']
dataset_labels-model_labels: []
True


In [49]:
# Record GPU state (nvidia-smi) and CPU details (lscpu) right before the timed WikiANN evaluation.
!nvidia-smi
!lscpu

Mon Aug 16 15:09:40 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.42.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   73C    P0    31W /  70W |   8702MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [50]:
# Run batched inference (512 sentences per batch) over the WikiANN test part using the
# ner_evaluation_2 variant, which additionally logs input sizes and a progress counter every
# 10,000 samples (see output). How it differs internally from ner_evaluation is not visible in this notebook.
inference_output_wikiann = ner_model.ner_evaluation_2(sentences_test, labels_test, device, batch_size=512)

len(input_text): 27227
len(input_labels): 27227
c: 10000
c: 20000
max_len: 115
#samples: 27227
#batch: 54
Start to evaluate test data ...
inference time for step 0: 0.021685804000071585
inference time for step 1: 0.010413172999960807
inference time for step 2: 0.004601234999881854
inference time for step 3: 0.004526743000042188
inference time for step 4: 0.004527381000116293
inference time for step 5: 0.004652103999887913
inference time for step 6: 0.004454561000102331
inference time for step 7: 0.005328787000053126
inference time for step 8: 0.005011921000004804
inference time for step 9: 0.0049179090001416625
inference time for step 10: 0.00449420200015993
inference time for step 11: 0.004747830000042086
inference time for step 12: 0.005163613999911831
inference time for step 13: 0.005239515000084793
inference time for step 14: 0.0045624020001469034
inference time for step 15: 0.005154333000064071
inference time for step 16: 0.004911175999950501
inference time for step 17: 0.00546013

In [51]:
# Preview the first five evaluated WikiANN sentences: one tab-separated row per token
# (token, gold tag, predicted tag) and an empty line after each sentence.
for sample_output in inference_output_wikiann[:5]:
  for token, true_label, predicted_label in sample_output:
    print(token, true_label, predicted_label, sep='\t')
  print()

*	O	O
*	O	O
زاغ	B-LOC	B-PER
##ی	B-LOC	I-PER
نوک	I-LOC	I-PER
[ZWNJ]	I-LOC	O
زرد	I-LOC	I-PER
,	O	O
'	O	O
'	O	O
Pic	O	O
##a	O	O
n	O	O
##ut	O	I-PER
##ta	O	O
##ll	O	O
##i	O	O
'	O	O
'	O	O

تغییر	O	O
##مس	O	B-PER
##یر	O	I-PER
مک	B-LOC	B-PER
[ZWNJ]	B-LOC	I-PER
ویل	B-LOC	I-PER
،	B-LOC	O
داکوتای	I-LOC	O
شمالی	I-LOC	I-LOC

وست	B-LOC	B-ORG
یونیور	I-LOC	I-ORG
##سیتی	I-LOC	I-ORG
پلیس	I-LOC	O
،	I-LOC	O
تگزاس	I-LOC	B-LOC

تغییر	O	O
##مس	O	B-PER
##یر	O	I-PER
دلت	B-PER	B-PER
##ف	B-PER	I-PER
فون	I-PER	B-PER
لیل	I-PER	I-PER
##نس	I-PER	I-PER
##رون	I-PER	I-PER

تغییر	O	O
##مس	O	O
##یر	O	O
نیروگاه	B-ORG	O
[ZWNJ]	B-ORG	O
های	B-ORG	O
زنجیره	I-ORG	O
[ZWNJ]	I-ORG	O
ای	I-ORG	O
یاسوج	I-ORG	O



In [52]:
# Score the WikiANN predictions against the gold tags; the helper prints accuracy, precision,
# recall, F1 and a per-entity classification report (see output).
# NOTE(review): the scores here are far below the other datasets. In the preview above the gold tag
# of a word is repeated on each of its word pieces (e.g. several consecutive B-LOC rows for one word);
# confirm whether the scorer counts those as separate entities before reading too much into these numbers.
ner_model.evaluate_prediction_results(labels_test, inference_output_wikiann)

Test Accuracy: 0.5251423186251599
Test Precision: 0.14692179339935424
Test Recall: 0.1441326530612245
Test F1-Score: 0.14551385922929164
Test classification Report:
              precision    recall  f1-score   support

         LOC  0.1309402971 0.0532757650 0.0757365500     26635
         ORG  0.5346691653 0.2799456563 0.3674824136     13249
         PER  0.0582243682 0.2308552264 0.0929944552      7156

   micro avg  0.1469217934 0.1441326531 0.1455138592     47040
   macro avg  0.2412779436 0.1880255492 0.1787378062     47040
weighted avg  0.2335901396 0.1441326531 0.1605332871     47040



In [53]:
# Persist the WikiANN predictions as a TSV-like text file: one "token<TAB>gold<TAB>predicted"
# line per token, with an empty line separating sentences. '/' in the model name is replaced
# so the name is a valid single file name.
output_file_name = "ner_wikiann_{}_outputs.txt".format(model_name.replace('/','-'))
with open(output_file_name, "w", encoding='utf8') as output_file:
  for sample_output in inference_output_wikiann:
    for token, true_label, predicted_label in sample_output:
      print(token, true_label, predicted_label, sep='\t', file=output_file)
    print(file=output_file)

# Authenticate the current user, build a Drive client and upload the file under the same title.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
upload = drive.CreateFile({'title': output_file_name})
upload.SetContentFile(output_file_name)
upload.Upload()

#### Hooshvare - Arman+Peyma+WikiAnn

https://github.com/hooshvare/parsner

In [54]:
# Authenticate the current user and build a Google Drive client
# (auth, GoogleAuth, GoogleCredentials and GoogleDrive are defined outside this cell).
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
# Fetch the Drive file with this hard-coded id and store it locally as ner-v1.zip
# (the combined Arman+Peyma+WikiAnn data set referenced in this section's heading).
download = drive.CreateFile({'id': '1fC2WGlpqumUTaT9Dr_U1jO2no3YMKFJ4'})
download.GetContentFile('ner-v1.zip')
!ls

adc.json
arman
ArmanPersoNERCorpus.zip
fa.tar.gz
ner_arman-and-peyma_HooshvareLab-distilbert-fa-zwnj-base-ner_outputs.txt
ner_arman_HooshvareLab-distilbert-fa-zwnj-base-ner_outputs.txt
ner_peyma_HooshvareLab-distilbert-fa-zwnj-base-ner_outputs.txt
ner-v1.zip
ner_wikiann_HooshvareLab-distilbert-fa-zwnj-base-ner_outputs.txt
peyma
peyma.zip
README.txt
sample_data
wikiann-fa.bio


In [55]:
# Extract the archive; it creates ./ner with ner.csv and the train/valid/test CSV splits (per the output).
!unzip ner-v1.zip
!ls
!ls ner

Archive:  ner-v1.zip
   creating: ner/
  inflating: ner/valid.csv           
  inflating: ner/ner.csv             
  inflating: ner/test.csv            
  inflating: ner/train.csv           
adc.json
arman
ArmanPersoNERCorpus.zip
fa.tar.gz
ner
ner_arman-and-peyma_HooshvareLab-distilbert-fa-zwnj-base-ner_outputs.txt
ner_arman_HooshvareLab-distilbert-fa-zwnj-base-ner_outputs.txt
ner_peyma_HooshvareLab-distilbert-fa-zwnj-base-ner_outputs.txt
ner-v1.zip
ner_wikiann_HooshvareLab-distilbert-fa-zwnj-base-ner_outputs.txt
peyma
peyma.zip
README.txt
sample_data
wikiann-fa.bio
ner.csv  test.csv  train.csv  valid.csv


In [56]:
# Load the test split of the combined Peyma+Arman+WikiAnn data from ./ner/ as parallel lists
# of token lists and tag lists.
# NOTE(review): "peyman" in dataset_name looks like a misspelling of PEYMA, but the string is presumably
# matched inside load_test_datasets, so it is kept verbatim — confirm against the helper.
sentences_paw, labels_paw = ner_model.load_test_datasets(
    dataset_name="hooshvare-peyman+arman+wikiann",
    dataset_dir="./ner/",
)
print(len(sentences_paw), len(labels_paw))
print(sentences_paw[0], labels_paw[0], sep='\n')

test part:
 #sentences: 6049, #sentences_tags: 6049
6049 6049
['همچنین', 'عملیات', 'لرزه\u200cنگاری', 'دوبعدی', 'نیز', 'با', 'فعالیت', 'مستمر', 'چهار', 'گروه', 'کاری', 'در', 'مناطقی', 'که', 'از', 'نظر', 'اکتشافی', 'مورد', 'نظر', 'بود', '،', 'به', 'پایان', 'رسید', 'که', 'نتایج', 'آن', 'در', 'حال', 'بررسی', 'است', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [57]:
# Compare the combined data set's tags with the model's label set. Per the recorded output the two
# vocabularies are identical, so no translation step is needed.
is_consistent = ner_model.check_input_label_consistency(labels_paw)
print(is_consistent)

model labels: dict_keys(['B-DAT', 'B-EVE', 'B-FAC', 'B-LOC', 'B-MON', 'B-ORG', 'B-PCT', 'B-PER', 'B-PRO', 'B-TIM', 'I-DAT', 'I-EVE', 'I-FAC', 'I-LOC', 'I-MON', 'I-ORG', 'I-PCT', 'I-PER', 'I-PRO', 'I-TIM', 'O'])
dataset labels: {'B-ORG', 'B-DAT', 'I-PER', 'I-DAT', 'B-PCT', 'I-MON', 'O', 'B-LOC', 'I-TIM', 'I-ORG', 'I-EVE', 'I-FAC', 'I-LOC', 'B-TIM', 'I-PRO', 'B-PRO', 'B-FAC', 'I-PCT', 'B-PER', 'B-MON', 'B-EVE'}
intersection: {'I-DAT', 'B-PCT', 'I-TIM', 'I-FAC', 'I-PRO', 'B-PRO', 'B-FAC', 'B-MON', 'B-ORG', 'B-DAT', 'I-PER', 'I-MON', 'O', 'B-LOC', 'I-ORG', 'B-TIM', 'I-EVE', 'I-LOC', 'I-PCT', 'B-PER', 'B-EVE'}
model_labels-dataset_labels: []
dataset_labels-model_labels: []
True


In [58]:
# Record GPU state (nvidia-smi) and CPU details (lscpu) right before the timed evaluation on the combined test split.
!nvidia-smi
!lscpu

Mon Aug 16 15:13:12 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.42.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   73C    P0    31W /  70W |   4428MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [59]:
# Run batched inference over the combined test split with the ner_evaluation_2 variant. The batch
# size is 256 here rather than 512; the recorded max_len for this data set is 512.
# NOTE: this rebinds `inference_output`, replacing the ARMAN+PEYMA predictions stored under the
# same name earlier in the notebook.
inference_output = ner_model.ner_evaluation_2(sentences_paw, labels_paw, device, batch_size=256)

len(input_text): 6049
len(input_labels): 6049
max_len: 512
#samples: 6049
#batch: 24
Start to evaluate test data ...
inference time for step 0: 0.029701506000037625
inference time for step 1: 0.006894465000186756
inference time for step 2: 0.004744762999962404
inference time for step 3: 0.00460168200015687
inference time for step 4: 0.00465232500005186
inference time for step 5: 0.005133420999982263
inference time for step 6: 0.0049072850001721235
inference time for step 7: 0.004835834000004979
inference time for step 8: 0.004677837999906842
inference time for step 9: 0.004840032999936739
inference time for step 10: 0.0049352129999533645
inference time for step 11: 0.004636635999986538
inference time for step 12: 0.00465452200000982
inference time for step 13: 0.00466565100009575
inference time for step 14: 0.004829979999840361
inference time for step 15: 0.004546470000150293
inference time for step 16: 0.005176254000161862
inference time for step 17: 0.004620509999767819
inference tim

In [60]:
# Preview the first five evaluated samples: one tab-separated
# "token / true label / predicted label" row per token, followed by an empty
# line that separates consecutive samples.
ROW_TEMPLATE = '{}\t{}\t{}'
for sample_rows in inference_output[:5]:
  for tok, gold, pred in sample_rows:
    print(ROW_TEMPLATE.format(tok, gold, pred))
  print()

همچنین	O	O
عملیات	O	O
لرزه	O	O
[ZWNJ]	O	O
نگاری	O	O
دوبعدی	O	O
نیز	O	O
با	O	O
فعالیت	O	O
مستمر	O	O
چهار	O	O
گروه	O	O
کاری	O	O
در	O	O
مناطقی	O	O
که	O	O
از	O	O
نظر	O	O
اکتشافی	O	O
مورد	O	O
نظر	O	O
بود	O	O
،	O	O
به	O	O
پایان	O	O
رسید	O	O
که	O	O
نتایج	O	O
آ	O	O
##ن	O	O
در	O	O
حال	O	O
بررسی	O	O
است	O	O
.	O	O

محدث	B-PER	O
در	O	O
مورد	O	O
مشارکت	O	O
شرکتهای	O	O
خارجی	O	O
در	O	O
فعالیتهای	O	O
اکتشافی	O	O
کشور	O	O
گفت	O	O
:	O	O
تاکنون	O	O
چند	O	O
منطقه	O	O
اکتشافی	O	O
را	O	O
برای	O	O
مشارکت	O	O
و	O	O
سرمایه	O	O
[ZWNJ]	O	O
گذاری	O	O
شرکتهای	O	O
خارجی	O	O
اعلام	O	O
کرده	O	O
[ZWNJ]	O	O
ایم	O	O
و	O	O
در	O	O
حال	O	O
مذاکره	O	O
با	O	O
طرفه	O	O
##ای	O	O
خارجی	O	O
هستیم	O	O
و	O	O
انتظار	O	O
می	O	O
[ZWNJ]	O	O
رود	O	O
تا	O	O
آ	O	O
##خر	O	O
امسال	O	O
بتوانیم	O	O
چند	O	O
قرارداد	O	O
را	O	O
نهایی	O	O
کنیم	O	O
.	O	O

مدیر	O	O
امور	B-ORG	O
اکتشاف	I-ORG	O
شرکت	I-ORG	B-ORG
ملی	I-ORG	I-ORG
نفت	I-ORG	I-ORG
فرو	O	O
##افتادگی	O	O
دزفول	B-LOC	B-LOC
و	O	O
منطقه	B-LOC	B-LOC
گسل	I-LOC	I-LOC
کازرون	I-LOC	I-LOC
تا	O	O
ب

In [61]:
# Score the predictions against the gold labels. Judging by the cell output,
# this prints accuracy, precision, recall, F1 and a per-entity-type
# classification report.
ner_model.evaluate_prediction_results(labels_paw, inference_output)

Test Accuracy: 0.9577871460727908
Test Precision: 0.599314742552584
Test Recall: 0.49280012521521366
Test F1-Score: 0.5408632166630879
Test classification Report:
              precision    recall  f1-score   support

         DAT  0.5678670360 0.4617117117 0.5093167702       444
         EVE  0.2464183381 0.2885906040 0.2658423493       298
         FAC  0.4844444444 0.3824561404 0.4274509804       285
         LOC  0.7304534095 0.5391304348 0.6203759854      3795
         MON  0.5894736842 0.4552845528 0.5137614679       123
         ORG  0.5410447761 0.4896103896 0.5140441778      3850
         PCT  0.8068181818 0.7171717172 0.7593582888        99
         PER  0.5994650619 0.5268880400 0.5608382859      3403
         PRO  0.3814432990 0.0864485981 0.1409523810       428
         TIM  0.5625000000 0.1698113208 0.2608695652        53

   micro avg  0.5993147426 0.4928001252 0.5408632167     12778
   macro avg  0.5509928231 0.4117103509 0.4572810252     12778
weighted avg  0.602923421

In [62]:
# Dump the full evaluation output to a text file named after the model
# ('/' in the model name is replaced so it is a valid file name). Each token is
# written as "token<TAB>true label<TAB>predicted label"; an empty line closes
# every sample.
model_tag = model_name.replace('/', '-')
output_file_name = "ner_arman-and-peyma-and-wikiann_{}_outputs.txt".format(model_tag)
with open(output_file_name, "w", encoding='utf8') as output_file:
  for sample_rows in inference_output:
    output_file.writelines(
      '{}\t{}\t{}\n'.format(tok, gold, pred) for tok, gold, pred in sample_rows
    )
    output_file.write('\n')

# Authenticate the user, build a Drive client from the application-default
# credentials, and upload the file under the same title as its local name.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
upload = drive.CreateFile({'title': output_file_name})
upload.SetContentFile(output_file_name)
upload.Upload()

### New Test: single-sentence inference with the `transformers` NER pipeline

In [None]:
from transformers import pipeline

# Build the pipeline on the device that currently holds the model.
# `pipeline` defaults to device=-1 (CPU) and only relocates the model when a
# CUDA device is requested. The same model object was just used for evaluation
# on `device`, so it is presumably still on the GPU (TODO confirm); in that
# case the default would feed CPU tensors to a CUDA model and fail.
# Falls back to -1 (the original behaviour) when the model is on the CPU or
# does not expose a `device` attribute.
model_device = getattr(ner_model.model, "device", None)
if model_device is not None and model_device.type == "cuda":
  pipeline_device = model_device.index or 0
else:
  pipeline_device = -1

nlp = pipeline(
  "ner",
  model=ner_model.model,
  tokenizer=ner_model.tokenizer,
  device=pipeline_device,
)
example = "کنایه سرلشگر فیروزآبادی به پادشاه عربستان و پسرش"

# Each result describes one tagged token; print them one per line.
ner_results = nlp(example)
for ent in ner_results:
  print(ent)