# ParsBERT (v3.0)
## BertNER
This model is fine-tuned for the Named Entity Recognition (NER) task on a mixed NER dataset collected from ARMAN, PEYMA, and WikiANN, covering ten types of entities:

* Date (DAT)
* Event (EVE)
* Facility (FAC)
* Location (LOC)
* Money (MON)
* Organization (ORG)
* Percent (PCT)
* Person (PER)
* Product (PRO)
* Time (TIM)

In [None]:
!nvidia-smi
!lscpu

In [None]:
!pip install hazm==0.7.0
!pip install seqeval==1.2.2
!pip install sentencepiece==0.1.96
!pip install transformers==4.7.0

In [None]:
!pip install PyDrive
import os
import IPython.display as ipd
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [4]:
import os
import gc
import ast
import time
import hazm
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

import transformers
from tokenizers.decoders import ByteLevel
from transformers import AutoTokenizer, AutoConfig
from transformers import AutoModelForTokenClassification

from IPython.display import display, HTML, clear_output
from ipywidgets import widgets, Layout

from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from seqeval.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

print()
print('numpy', np.__version__)
print('pandas', pd.__version__)
print('transformers', transformers.__version__)
print('torch', torch.__version__)
print()

# Choose the compute device once: CUDA when PyTorch can see a GPU, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')


numpy 1.19.5
pandas 1.1.5
transformers 4.7.0
torch 1.9.0+cu102

There are 1 GPU(s) available.
We will use the GPU: Tesla K80


In [5]:
class NER:
    def __init__(self, model_name):
        """Load the config, tokenizer and token-classification model for ``model_name``.

        Args:
            model_name: identifier handed unchanged to the ``from_pretrained``
                loaders of ``AutoConfig``, ``AutoTokenizer`` and
                ``AutoModelForTokenClassification``.
        """
        self.model_name = model_name
        # hazm normalizer; the inference methods run raw text through it before tokenizing.
        self.normalizer = hazm.Normalizer()
        self.config = AutoConfig.from_pretrained(model_name)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForTokenClassification.from_pretrained(model_name)
        # Class-index -> tag-string mapping taken from the model config.
        self.id2label = self.config.id2label

    @staticmethod
    def load_ner_data(file_path, word_index, tag_index, delimiter, join=False):
        dataset, labels = [], []
        with open(file_path, encoding="utf8") as infile:
            sample_text, sample_label = [], []
            for line in infile:
                parts = line.strip().split(delimiter)
                if len(parts) > 1:
                    word, tag = parts[word_index], parts[tag_index]
                    if not word:
                        continue
                    sample_text.append(word)
                    sample_label.append(tag)
                else:
                    # end of sample
                    if sample_text and sample_label:
                        if join:
                            dataset.append(' '.join(sample_text))
                            labels.append(' '.join(sample_label))
                        else:
                            dataset.append(sample_text)
                            labels.append(sample_label)
                    sample_text, sample_label = [], []
        if sample_text and sample_label:
            if join:
                dataset.append(' '.join(sample_text))
                labels.append(' '.join(sample_label))
            else:
                dataset.append(sample_text)
                labels.append(sample_label)
        return dataset, labels

    def load_test_datasets(self, dataset_name, dataset_dir, **kwargs):
        if dataset_name.lower() == "peyma":
            ner_file_path = dataset_dir + 'test.txt'
            if not os.path.exists(ner_file_path):
                print(ner_file_path)
                exit(1)
            return self.load_ner_data(ner_file_path, word_index=0, tag_index=1, delimiter='|',
                                      join=kwargs.get('join', False))
        elif dataset_name.lower() == "arman":
            dataset, labels = [], []
            for i in range(1, 4):
                ner_file_path = dataset_dir + f'test_fold{i}.txt'
                if not os.path.exists(ner_file_path):
                    print(ner_file_path)
                dataset_part, labels_part = self.load_ner_data(ner_file_path, word_index=0, tag_index=1, delimiter=' ',
                                                               join=kwargs.get('join', False))
                dataset += dataset_part
                labels += labels_part
            return dataset, labels
        elif dataset_name.lower() == "hooshvare-peyman+arman+wikiann":
            ner_file_path = dataset_dir + 'test.csv'
            if not os.path.exists(ner_file_path):
                print(ner_file_path)
                exit(1)
            data = pd.read_csv(ner_file_path, delimiter="\t")
            sentences, sentences_tags = data['tokens'].values.tolist(), data['ner_tags'].values.tolist()
            sentences = [ast.literal_eval(ss) for ss in sentences]
            sentences_tags = [ast.literal_eval(ss) for ss in sentences_tags]
            print(f'test part:\n #sentences: {len(sentences)}, #sentences_tags: {len(sentences_tags)}')
            return sentences, sentences_tags

    def load_datasets(self, dataset_name, dataset_dir, **kwargs):
        if dataset_name.lower() == "farsiyar":
            dataset, labels = [], []
            for i in range(1, 6):
                ner_file_path = dataset_dir + 'Persian-NER-part{i}.txt'
                if not os.path.exists(ner_file_path):
                    print(ner_file_path)
                dataset_part, labels_part = self.load_ner_data(ner_file_path, word_index=0, tag_index=1, delimiter='\t',
                                                               join=kwargs.get('join', False))
                dataset += dataset_part
                labels += labels_part
            return dataset, labels
        elif dataset_name.lower() == "wikiann":
            ner_file_path = dataset_dir + 'wikiann-fa.bio'
            if not os.path.exists(ner_file_path):
                print(ner_file_path)
                exit(1)
            dataset_all, labels_all = self.load_ner_data(ner_file_path, word_index=0, tag_index=-1, delimiter=' ',
                                                         join=kwargs.get('join', False))
            print(f'all data: #data: {len(dataset_all)}, #labels: {len(labels_all)}')

            try:
                _, data_test, _, label_test = train_test_split(dataset_all, labels_all, test_size=0.1, random_state=1,
                                                               stratify=labels_all)
                print("with stratify")
            except:
                _, data_test, _, label_test = train_test_split(dataset_all, labels_all, test_size=0.1, random_state=1)
                print("without stratify")
            print(f'test part:\n #data: {len(data_test)}, #labels: {len(label_test)}')
            return dataset_all, labels_all, data_test, label_test

    def ner_inference(self, input_text, device, max_length):
        if not self.model or not self.tokenizer or not self.id2label:
            print('Something wrong has been happened!')
            return

        pt_batch = self.tokenizer(
            [self.normalizer.normalize(sequence) for sequence in input_text],
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors="pt"
        )

        gc.collect()
        torch.cuda.empty_cache()
        # Tell pytorch to run this model on the GPU.
        if device.type != 'cpu':
            self.model.cuda()

        pt_batch = pt_batch.to(device)
        pt_outputs = self.model(**pt_batch)
        pt_predictions = torch.argmax(pt_outputs.logits, dim=-1)
        pt_predictions = pt_predictions.cpu().detach().numpy().tolist()

        output_predictions = []
        for i, sequence in enumerate(input_text):
            tokens = self.tokenizer.tokenize(self.tokenizer.decode(self.tokenizer.encode(sequence)))
            predictions = [(token, self.id2label[prediction]) for token, prediction in
                           zip(tokens, pt_predictions[i])]
            output_predictions.append(predictions)
        return output_predictions

    def roberta_ner_inference(self, input_text, device, max_length):
        if not self.model or not self.tokenizer or not self.id2label:
            print('Something wrong has been happened!')
            return

        byte_level_decoder = ByteLevel()
        pt_batch = self.tokenizer(
            [self.normalizer.normalize(sequence) for sequence in input_text],
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors="pt"
        )

        gc.collect()
        torch.cuda.empty_cache()
        # Tell pytorch to run this model on the GPU.
        if device.type != 'cpu':
            self.model.cuda()

        pt_batch = pt_batch.to(device)
        pt_outputs = self.model(**pt_batch)
        pt_predictions = torch.argmax(pt_outputs.logits, dim=-1)
        pt_predictions = pt_predictions.cpu().detach().numpy().tolist()

        output_predictions = []
        for i, sequence in enumerate(input_text):
            tokens = self.tokenizer.tokenize(self.tokenizer.decode(self.tokenizer.encode(sequence)))
            predictions = [(byte_level_decoder.decode(token), self.id2label[prediction]) for token, prediction in
                           zip(tokens, pt_predictions[i])]
            output_predictions.append(predictions)
        return output_predictions

    def ner_evaluation(self, input_text, input_labels, device, batch_size=4):
        if not self.model or not self.tokenizer or not self.id2label:
            print('Something wrong has been happened!')
            return

        max_len = 0
        tokenized_texts, new_labels = [], []
        for sentence, sentence_label in zip(input_text, input_labels):
            if type(sentence) == str:
                sentence = sentence.strip().split()
            if len(sentence) != len(sentence_label):
                print('Something wrong has been happened! Length of a sentence and its label is not equal!')
                return
            tokenized_sentence, new_sentence_label = [], []
            for word, label in zip(sentence, sentence_label):
                # Tokenize the word and count # of subwords the word is broken into
                tokenized_word = self.tokenizer.tokenize(word)
                n_subwords = len(tokenized_word)

                # Add the tokenized word to the final tokenized word list
                tokenized_sentence.extend(tokenized_word)
                # Add the same label to the new list of labels `n_subwords` times
                new_sentence_label.extend([label] * n_subwords)

            max_len = max(max_len, len(tokenized_sentence))
            tokenized_texts.append(tokenized_sentence)
            new_labels.append(new_sentence_label)

        max_len = min(max_len, self.config.max_position_embeddings)
        print("max_len:", max_len)
        input_ids = pad_sequences([self.tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                                  maxlen=max_len, dtype="long", value=self.config.pad_token_id,
                                  truncating="post", padding="post")
        del tokenized_texts
        input_labels = pad_sequences([[self.config.label2id.get(l) for l in lab] for lab in new_labels],
                                     maxlen=max_len, value=self.config.label2id.get('O'), padding="post",
                                     dtype="long", truncating="post")
        del new_labels

        train_data = TensorDataset(torch.tensor(input_ids), torch.tensor(input_labels))
        data_loader = DataLoader(train_data, batch_size=batch_size)
        # data_loader = DataLoader(train_data, sampler=RandomSampler(train_data), batch_size=batch_size)
        print("#samples:", len(input_ids))
        print("#batch:", len(data_loader))

        gc.collect()
        torch.cuda.empty_cache()
        # Tell pytorch to run this model on the GPU.
        if device.type != 'cpu':
            self.model.cuda()

        total_loss, total_time = 0, 0
        output_predictions = []
        print("Start to evaluate test data ...")
        for step, batch in enumerate(data_loader):
            b_input_ids, b_labels = batch

            # move tensors to GPU if CUDA is available
            b_input_ids = b_input_ids.to(device)
            b_labels = b_labels.to(device)

            # This will return the loss (rather than the model output) because we have provided the `labels`.
            with torch.no_grad():
                start = time.monotonic()
                outputs = self.model(b_input_ids, labels=b_labels)
                end = time.monotonic()
                total_time += end - start
                print(f'inference time for step {step}: {end - start}')
            # get the loss
            total_loss += outputs.loss.item()

            b_predictions = torch.argmax(outputs.logits, dim=2)
            b_predictions = b_predictions.cpu().detach().numpy().tolist()
            b_labels = b_labels.cpu().detach().numpy().tolist()

            for i, sample in enumerate(b_input_ids):
                sample_input = list(sample)
                # remove pad tokens
                while sample_input[-1] == self.config.pad_token_id:
                    sample_input.pop()
                # tokens = self.tokenizer.tokenize(self.tokenizer.decode(sample_input))
                tokens = [self.tokenizer.decode([t]) for t in sample_input]
                sample_true_labels = [self.id2label[e] for e in b_labels[i][:len(sample_input)]]
                sample_predictions = [self.id2label[e] for e in b_predictions[i][:len(sample_input)]]
                output_predictions.append(
                    [(t, sample_true_labels[j], sample_predictions[j]) for j, t in enumerate(tokens)])

        # Calculate the average loss over the training data.
        avg_train_loss = total_loss / len(data_loader)
        print("average loss:", avg_train_loss)
        print("total inference time:", total_time)
        print("total inference time / #samples:", total_time / len(input_ids))

        return output_predictions

    def ner_evaluation_2(self, input_text, input_labels, device, max_position_embeddings=None, batch_size=4):
        if not self.model or not self.tokenizer or not self.id2label:
            print('Something wrong has been happened!')
            return

        print("len(input_text):", len(input_text))
        print("len(input_labels):", len(input_labels))
        c = 0
        max_len = 0
        tokenized_texts, new_labels = [], []
        for sentence, sentence_label in zip(input_text, input_labels):
            if type(sentence) == str:
                sentence = sentence.strip().split()
            if len(sentence) != len(sentence_label):
                print('Something wrong has been happened! Length of a sentence and its label is not equal!')
                return
            tokenized_words = self.tokenizer(sentence, padding=False, add_special_tokens=False).input_ids
            tokenized_sentence_ids, new_sentence_label = [], []
            for i, tokenized_word in enumerate(tokenized_words):
                # Add the tokenized word to the final tokenized word list
                tokenized_sentence_ids += tokenized_word
                # Add the same label to the new list of labels `number of subwords` times
                new_sentence_label.extend([self.config.label2id.get(sentence_label[i])] * len(tokenized_word))

            max_len = max(max_len, len(tokenized_sentence_ids))
            tokenized_texts.append(tokenized_sentence_ids)
            new_labels.append(new_sentence_label)
            c += 1
            if c % 10000 == 0:
                print("c:", c)
        if max_position_embeddings is None:
            max_len = min(max_len, self.config.max_position_embeddings)
        else:
            max_len = min(max_len, max_position_embeddings)
        print("max_len:", max_len)
        input_ids = pad_sequences(tokenized_texts, maxlen=max_len, dtype="long", value=self.config.pad_token_id,
                                  truncating="post", padding="post")
        del tokenized_texts
        input_labels = pad_sequences(new_labels, maxlen=max_len, value=self.config.label2id.get('O'), padding="post",
                                     dtype="long", truncating="post")
        del new_labels

        train_data = TensorDataset(torch.tensor(input_ids), torch.tensor(input_labels))
        data_loader = DataLoader(train_data, batch_size=batch_size)
        # data_loader = DataLoader(train_data, sampler=RandomSampler(train_data), batch_size=batch_size)
        print("#samples:", len(input_ids))
        print("#batch:", len(data_loader))

        gc.collect()
        torch.cuda.empty_cache()
        # Tell pytorch to run this model on the GPU.
        if device.type != 'cpu':
            self.model.cuda()

        total_time = 0
        output_predictions = []
        print("Start to evaluate test data ...")
        for step, batch in enumerate(data_loader):
            b_input_ids, b_labels = batch

            # move tensors to GPU if CUDA is available
            b_input_ids = b_input_ids.to(device)
            b_labels = b_labels.to(device)

            # This will return the loss (rather than the model output) because we have provided the `labels`.
            with torch.no_grad():
                start = time.monotonic()
                outputs = self.model(b_input_ids, labels=b_labels)
                end = time.monotonic()
                total_time += end - start
                print(f'inference time for step {step}: {end - start}')

            b_predictions = torch.argmax(outputs.logits, dim=2)
            b_predictions = b_predictions.cpu().detach().numpy().tolist()
            b_labels = b_labels.cpu().detach().numpy().tolist()

            for i, sample in enumerate(b_input_ids):
                sample_input = list(sample)
                # remove pad tokens
                while sample_input[-1] == self.config.pad_token_id:
                    sample_input.pop()
                # tokens = self.tokenizer.tokenize(self.tokenizer.decode(sample_input))
                tokens = [self.tokenizer.decode([t]) for t in sample_input]
                sample_true_labels = [self.id2label[e] for e in b_labels[i][:len(sample_input)]]
                sample_predictions = [self.id2label[e] for e in b_predictions[i][:len(sample_input)]]
                output_predictions.append(
                    [(t, sample_true_labels[j], sample_predictions[j]) for j, t in enumerate(tokens)])

        print("total inference time:", total_time)
        print("total inference time / #samples:", total_time / len(input_ids))

        return output_predictions

    def check_input_label_consistency(self, labels):
        model_labels = self.config.label2id.keys()
        dataset_labels = set()
        for l in labels:
            dataset_labels.update(set(l))
        print("model labels:", model_labels)
        print("dataset labels:", dataset_labels)
        print("intersection:", set(model_labels).intersection(dataset_labels))
        print("model_labels-dataset_labels:", list(set(model_labels) - set(dataset_labels)))
        print("dataset_labels-model_labels:", list(set(dataset_labels) - set(model_labels)))
        if list(set(dataset_labels) - set(model_labels)):
            return False
        return True

    @staticmethod
    def resolve_input_label_consistency(labels, label_translation_map):
        for i, sentence_labels in enumerate(labels):
            for j, label in enumerate(sentence_labels):
                labels[i][j] = label_translation_map.get(label)
        return labels

    @staticmethod
    def evaluate_prediction_results(labels, output_predictions):
        """Print seqeval accuracy, precision, recall, F1 and the classification report.

        Args:
            labels: per-sentence label sequences of the dataset; their union
                defines which predicted tags are kept.
            output_predictions: per-sentence lists of
                ``(token, true_label, predicted_label)`` tuples.

        A predicted tag that never occurs in ``labels`` is counted as ``'O'``.
        """
        dataset_labels = set()
        for sentence_labels in labels:
            dataset_labels.update(sentence_labels)

        true_labels = [[gold for _, gold, _ in sample] for sample in output_predictions]
        predictions = [
            [guess if guess in dataset_labels else 'O' for _, _, guess in sample]
            for sample in output_predictions
        ]

        print(f"Test Accuracy: {accuracy_score(true_labels, predictions)}")
        print(f"Test Precision: {precision_score(true_labels, predictions)}")
        print(f"Test Recall: {recall_score(true_labels, predictions)}")
        print(f"Test F1-Score: {f1_score(true_labels, predictions)}")
        report = classification_report(true_labels, predictions, digits=10)
        print(f"Test classification Report:\n{report}")


In [6]:
model_name='HooshvareLab/roberta-fa-zwnj-base-ner'
ner_model = NER(model_name)

Downloading:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/875k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.75M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/315 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/358 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/471M [00:00<?, ?B/s]

In [7]:
print(ner_model.config)

RobertaConfig {
  "architectures": [
    "RobertaForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "finetuning_task": "ner",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "O",
    "1": "B-DAT",
    "2": "B-EVE",
    "3": "B-FAC",
    "4": "B-LOC",
    "5": "B-MON",
    "6": "B-ORG",
    "7": "B-PCT",
    "8": "B-PER",
    "9": "B-PRO",
    "10": "B-TIM",
    "11": "I-DAT",
    "12": "I-EVE",
    "13": "I-FAC",
    "14": "I-LOC",
    "15": "I-MON",
    "16": "I-ORG",
    "17": "I-PCT",
    "18": "I-PER",
    "19": "I-PRO",
    "20": "I-TIM"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "B-DAT": 1,
    "B-EVE": 2,
    "B-FAC": 3,
    "B-LOC": 4,
    "B-MON": 5,
    "B-ORG": 6,
    "B-PCT": 7,
    "B-PER": 8,
    "B-PRO": 9,
    "B-TIM": 10,
    "I-DAT": 11,
    "I-EVE": 12,
    "I-FAC": 13,
    "I-LOC"

#### Sample Inference:

In [None]:
texts = [
    "مدیرکل محیط زیست استان البرز با بیان اینکه با بیان اینکه موضوع شیرابه‌های زباله‌های انتقال یافته در منطقه حلقه دره خطری برای این استان است، گفت: در این مورد گزارشاتی در ۲۵ مرداد ۱۳۹۷ تقدیم مدیران استان شده است.",
    "به گزارش خبرگزاری تسنیم از کرج، حسین محمدی در نشست خبری مشترک با معاون خدمات شهری شهرداری کرج که با حضور مدیرعامل سازمان‌های پسماند، پارک‌ها و فضای سبز و نماینده منابع طبیعی در سالن کنفرانس شهرداری کرج برگزار شد، اظهار داشت: ۸۰٪  جمعیت استان البرز در کلانشهر کرج زندگی می‌کنند.",
    "وی افزود: با همکاری‌های مشترک بین اداره کل محیط زیست و شهرداری کرج برنامه‌های مشترکی برای حفاظت از محیط زیست در شهر کرج در دستور کار قرار گرفته که این اقدامات آثار مثبتی داشته و تاکنون نزدیک به ۱۰۰ میلیارد هزینه جهت خریداری اکس-ریس صورت گرفته است.",
]

In [None]:
# RoBERTa-style models reserve two position slots for the padding offset, so the
# longest usable sequence is max_position_embeddings - 2 (the PEYMA evaluation
# further down applies the same margin).
inference_output = ner_model.roberta_ner_inference(texts, device, ner_model.config.max_position_embeddings - 2)

In [None]:
print(inference_output)

[[('<s>', 'O'), (' مدیرکل', 'O'), (' محیط', 'B-ORG'), (' زیست', 'I-ORG'), (' استان', 'I-ORG'), (' البرز', 'I-ORG'), (' با', 'O'), (' بیان', 'O'), (' اینکه', 'O'), (' با', 'O'), (' بیان', 'O'), (' اینکه', 'O'), (' موضوع', 'O'), (' شیر', 'O'), ('ابه', 'O'), ('\u200c', 'O'), ('های', 'O'), (' زباله', 'O'), ('\u200c', 'O'), ('های', 'O'), (' انتقال', 'O'), (' یافته', 'O'), (' در', 'O'), (' منطقه', 'B-LOC'), (' حلقه', 'I-LOC'), (' دره', 'I-LOC'), (' خطری', 'O'), (' برای', 'O'), (' این', 'O'), (' استان', 'O'), (' است', 'O'), ('،', 'O'), (' گفت', 'O'), (':', 'O'), (' در', 'O'), (' این', 'O'), (' مورد', 'O'), (' گزارشاتی', 'O'), (' در', 'O'), (' ۲۵', 'B-DAT'), (' مرداد', 'I-DAT'), (' ۱۳۹۷', 'I-DAT'), (' تقدیم', 'O'), (' مدیران', 'O'), (' استان', 'O'), (' شده', 'O'), (' است', 'O'), ('.', 'O'), ('</s>', 'O')], [('<s>', 'O'), (' به', 'O'), (' گزارش', 'O'), (' خبرگزاری', 'B-ORG'), (' تسنیم', 'I-ORG'), (' از', 'O'), (' کرج', 'B-LOC'), ('،', 'O'), (' حسین', 'B-PER'), (' محمدی', 'I-PER'), (' در', 'O'),

In [None]:
#@title Live Playground { display-mode: "form" }

css_is_load = False
css = """<style>
.ner-box {
    direction: rtl;
    font-size: 18px !important;
    line-height: 20px !important;
    margin: 0 0 15px;
    padding: 10px;
    text-align: justify;
    color: #343434 !important;
}
.token, .token span {
    display: inline-block !important;
    padding: 2px;
    margin: 2px 0;
}
.token.token-ner {
    background-color: #f6cd61;
    font-weight: bold;
    color: #000;
}
.token.token-ner .ner-label {
    color: #9a1f40;
    margin: 0px 2px;
}
</style>"""

if not css_is_load:
    display(HTML(css))
    css_is_load = True

submit_wd = widgets.Button(description='Send', disabled=False, button_style='success', tooltip='Submit')
text_wd = widgets.Textarea(placeholder='Please enter you text ...', rows=5, layout=Layout(width='90%'))
output_wd = widgets.Output()

display(HTML("""
<h2>Test NER model</h2>
<p style="padding: 2px 20px; margin: 0 0 20px;">
</p>
<br /><br />
"""))

display(text_wd)
display(submit_wd)
display(output_wd)

def submit_text(sender):
    """Button callback: tag the textarea content and render the entities as HTML.

    Args:
        sender: the widget that triggered the click (unused).
    """
    from html import escape  # local import; tokens come from user input and go into markup

    with output_wd:
        clear_output(wait=True)
        text = text_wd.value
        # RoBERTa-style models reserve two position slots, hence the "- 2".
        _output = ner_model.roberta_ner_inference([text], device, ner_model.config.max_position_embeddings - 2)
        # Hide the tokenizer's own special tokens (e.g. '<s>', '</s>') as well as the BERT-style markers.
        special_tokens = set(ner_model.tokenizer.all_special_tokens) | {'[CLS]', '[SEP]'}
        pred_sequence = []
        for token, label in _output[0]:
            if token in special_tokens:
                continue
            # Escape so that text such as '<b>' is shown literally instead of being parsed as a tag.
            safe_token = escape(token)
            if label != 'O':
                pred_sequence.append(
                    '<span class="token token-ner">%s<span class="ner-label">%s</span></span>'
                    % (safe_token, escape(label)))
            else:
                pred_sequence.append(
                    '<span class="token">%s</span>'
                    % safe_token)

        markup = '<p class="ner-box">%s</p>' % ' '.join(pred_sequence)
        display(HTML(markup))

submit_wd.on_click(submit_text)

Textarea(value='', layout=Layout(width='90%'), placeholder='Please enter you text ...', rows=5)

Button(button_style='success', description='Send', style=ButtonStyle(), tooltip='Submit')

Output()

#### PEYMA dataset:
The PEYMA dataset includes 7,145 sentences with a total of 302,530 tokens, of which 41,148 tokens are tagged with seven different classes:

- Organization
- Money
- Location
- Date
- Time
- Person
- Percent

|     Label    |   #   |
|:------------:|:-----:|
| Organization | 16964 |
|     Money    |  2037 |
|   Location   |  8782 |
|     Date     |  4259 |
|     Time     |  732  |
|    Person    |  7675 |
|    Percent   |  699  |

Download
You can download the dataset from [here](https://hooshvare.github.io/docs/datasets/ner), which leads to the following Google Drive file of HooshvareLab:

In [None]:
# Authenticate the Colab user and build a PyDrive client from the default credentials.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
# Fetch the PEYMA archive by its Google Drive file id and save it as peyma.zip.
download = drive.CreateFile({'id': '1WZxpFRtEs5HZWyWQ2Pyg9CCuIBs1Kmvx'})
download.GetContentFile('peyma.zip')
# List the working directory to confirm the download.
!ls

adc.json  peyma.zip  sample_data


In [None]:
!unzip peyma.zip
!ls
!ls peyma

Archive:  peyma.zip
   creating: peyma/
  inflating: peyma/dev.txt           
  inflating: peyma/test.txt          
  inflating: peyma/train.txt         
adc.json  peyma  peyma.zip  sample_data
dev.txt  test.txt  train.txt


In [None]:
sentences, labels = ner_model.load_test_datasets(dataset_name="peyma", dataset_dir="./peyma/")
print(len(sentences), len(labels))
print(sentences[0])
print(labels[0])

1026 1026
['کنایه', 'سرلشگر', 'فیروزآبادی', 'به', 'پادشاه', 'عربستان', 'و', 'پسرش']
['O', 'O', 'B_ORG', 'O', 'O', 'B_LOC', 'O', 'O']


In [None]:
is_consistent = ner_model.check_input_label_consistency(labels)
print(is_consistent)

model labels: dict_keys(['B-DAT', 'B-EVE', 'B-FAC', 'B-LOC', 'B-MON', 'B-ORG', 'B-PCT', 'B-PER', 'B-PRO', 'B-TIM', 'I-DAT', 'I-EVE', 'I-FAC', 'I-LOC', 'I-MON', 'I-ORG', 'I-PCT', 'I-PER', 'I-PRO', 'I-TIM', 'O'])
dataset labels: {'I_LOC', 'B_TIM', 'I_MON', 'I_ORG', 'I_PER', 'O', 'B_ORG', 'B_PER', 'I_TIM', 'B_LOC', 'B_PCT', 'I_DAT', 'I_PCT', 'B_MON', 'B_DAT'}
intersection: {'O'}
model_labels-dataset_labels: ['I-ORG', 'B-PCT', 'B-MON', 'B-PRO', 'I-MON', 'B-DAT', 'B-TIM', 'I-DAT', 'I-LOC', 'B-ORG', 'I-PER', 'B-EVE', 'B-FAC', 'I-TIM', 'I-PRO', 'I-PCT', 'B-PER', 'I-FAC', 'I-EVE', 'B-LOC']
dataset_labels-model_labels: ['I_LOC', 'B_TIM', 'I_MON', 'I_ORG', 'I_PER', 'I_TIM', 'B_LOC', 'B_ORG', 'B_PCT', 'I_DAT', 'I_PCT', 'B_MON', 'B_DAT', 'B_PER']
False


In [None]:
# PEYMA writes its tags with an underscore ("B_LOC"); the model config uses a hyphen ("B-LOC").
label_translate = {
    f'{prefix}_{entity}': f'{prefix}-{entity}'
    for entity in ('LOC', 'DAT', 'ORG', 'PER', 'MON', 'PCT', 'TIM')
    for prefix in ('B', 'I')
}
label_translate['O'] = 'O'

labels = ner_model.resolve_input_label_consistency(labels, label_translate)
# Re-run the check: after translation every dataset label should be known to the model.
is_consistent = ner_model.check_input_label_consistency(labels)
print(is_consistent)

model labels: dict_keys(['B-DAT', 'B-EVE', 'B-FAC', 'B-LOC', 'B-MON', 'B-ORG', 'B-PCT', 'B-PER', 'B-PRO', 'B-TIM', 'I-DAT', 'I-EVE', 'I-FAC', 'I-LOC', 'I-MON', 'I-ORG', 'I-PCT', 'I-PER', 'I-PRO', 'I-TIM', 'O'])
dataset labels: {'I-ORG', 'B-PCT', 'I-TIM', 'B-MON', 'I-MON', 'B-LOC', 'O', 'I-PCT', 'B-DAT', 'B-TIM', 'I-DAT', 'I-LOC', 'B-ORG', 'I-PER', 'B-PER'}
intersection: {'I-LOC', 'I-ORG', 'B-PCT', 'I-TIM', 'B-ORG', 'B-MON', 'I-MON', 'I-PER', 'O', 'I-PCT', 'B-DAT', 'B-PER', 'B-TIM', 'B-LOC', 'I-DAT'}
model_labels-dataset_labels: ['B-FAC', 'I-PRO', 'B-PRO', 'I-FAC', 'I-EVE', 'B-EVE']
dataset_labels-model_labels: []
True


In [None]:
!nvidia-smi
!lscpu

Fri Sep  3 14:25:29 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.63.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   39C    P0    56W / 149W |    971MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
inference_output_peyma = ner_model.ner_evaluation_2(sentences, labels, device, max_position_embeddings=ner_model.config.max_position_embeddings-2, batch_size=512)

len(input_text): 1026
len(input_labels): 1026
max_len: 155
#samples: 1026
#batch: 3
Start to evaluate test data ...
inference time for step 0: 0.030369700999983706
inference time for step 1: 0.013852599000017563
inference time for step 2: 0.013612318999946638
total inference time: 0.05783461899994791
total inference time / #samples: 5.636902436642096e-05


In [None]:
# Preview the first five evaluated samples: one tab-separated
# "token, true label, predicted label" row per token, blank line between samples.
row_template = '{}\t{}\t{}'
for sample_rows in inference_output_peyma[:5]:
  for token, gold_label, guessed_label in sample_rows:
    print(row_template.format(token, gold_label, guessed_label))
  print()

 کنایه	O	B-PRO
 سرل	O	B-PER
شگر	O	I-PER
 فیروزآبادی	B-ORG	I-PER
 به	O	I-PER
 پادشاه	O	O
 عربستان	B-LOC	B-LOC
 و	O	O
 پسرش	O	O

 رئیس	O	O
 سابق	O	O
 ستاد	B-ORG	B-ORG
 کل	I-ORG	I-ORG
 نیروهای	I-ORG	I-ORG
 مسلح	I-ORG	I-ORG
 با	O	O
 بیان	O	O
 اینکه	O	O
 آل	O	O
 سعود	O	I-PER
 با	O	O
 حمایت	O	O
 همه	O	O
 جانبه	O	O
 غرب	O	O
 بر	O	O
 سرزمین	B-LOC	O
 حجاز	I-LOC	B-LOC
 حاکم	O	O
 شد	O	O
 گفت	O	O
 :	O	O
 غرب	O	O
 با	O	O
 حاکم	O	O
 کرد	O	O
د	O	O
 آل	O	O
 سعود	O	O
 بر	O	O
 حجاز	B-LOC	B-LOC
 هدفی	O	O
 جز	O	O
ناب	O	O
ودی	O	O
 اسلام	O	O
 نداشته	O	O
 و	O	O
 این	O	O
 نقشه	O	O
 انگلیس	B-LOC	B-LOC
 بود	O	O
.	O	O

 سرل	O	O
شگر	O	O
 حسن	B-PER	B-PER
 فیروزآبادی	I-PER	I-PER
 روز	O	O
 دوشنبه	O	O
 درح	O	O
اشیه	O	O
 آئین	O	O
 ختم	O	O
 مادر	O	O
 حیدر	B-PER	B-PER
 مص	I-PER	I-PER
ل	I-PER	I-PER
حی	I-PER	I-PER
 در	O	O
جمع	O	O
 خبرنگاران	O	O
 درباره	O	O
 موضوع	O	O
 یمن	B-LOC	B-LOC
 افزود	O	O
 :	O	O
 ماهیت	O	O
 آنچه	O	O
 در	O	O
 یمن	B-LOC	B-LOC
 اتفاق	O	O
 می	O	O
 افتد	O	O
 و	O	O
ها	O	O
بیت	O	O
 است	O	O
 و	O	O
ها	O	O
بی

In [None]:
# Print accuracy, precision, recall, F1 and the per-entity classification
# report for the PEYMA predictions.
ner_model.evaluate_prediction_results(labels, inference_output_peyma)

Test Accuracy: 0.9394649868647779
Test Precision: 0.653317393903168
Test Recall: 0.47791867074770444
Test F1-Score: 0.552020202020202
Test classification Report:
              precision    recall  f1-score   support

         DAT  0.6835443038 0.2454545455 0.3612040134       220
         LOC  0.8350515464 0.6532258065 0.7330316742       620
         MON  0.4565217391 0.7777777778 0.5753424658        27
         ORG  0.6104746318 0.4751592357 0.5343839542       785
         PCT  0.7777777778 0.5185185185 0.6222222222        54
         PER  0.5172413793 0.3756708408 0.4352331606       559
         TIM  0.2000000000 0.0909090909 0.1250000000        22

   micro avg  0.6533173939 0.4779186707 0.5520202020      2287
   macro avg  0.5829444826 0.4481022594 0.4837739272      2287
weighted avg  0.6537815083 0.4779186707 0.5459621812      2287



In [None]:
# Dump every (token, true label, predicted label) triple to a text file --
# one tab-separated token per line, blank line after each sample -- and then
# upload that file to Google Drive.
# '/' in the model name is replaced with '-' before it is used in the file name.
output_file_name = "ner_peyma_{}_outputs.txt".format(model_name.replace('/','-'))
line_template = '{}\t{}\t{}\n'
with open(output_file_name, "w", encoding='utf8') as output_file:
  for sample_rows in inference_output_peyma:
    output_file.writelines(
        line_template.format(token, gold_label, guessed_label)
        for token, gold_label, guessed_label in sample_rows
    )
    output_file.write('\n')

# Authenticate again and rebuild the Drive client before uploading.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
upload = drive.CreateFile({'title': output_file_name})
upload.SetContentFile(output_file_name)
upload.Upload()

#### Arman dataset:
The ARMAN dataset holds 7,682 sentences with 250,015 tokens, tagged over six different classes.

1. Organization
2. Location
3. Facility
4. Event
5. Product
6. Person


|     Label    |   #   |
|:------------:|:-----:|
| Organization | 30108 |
|   Location   | 12924 |
|   Facility   |  4458 |
|     Event    |  7557 |
|    Product   |  4389 |
|    Person    | 15645 |

**Download**
You can download the dataset from [here](https://github.com/HaniehP/PersianNER)


In [None]:
# Fetch the ARMAN corpus archive from the PersianNER GitHub repository,
# then list the working directory.
!wget https://github.com/HaniehP/PersianNER/raw/master/ArmanPersoNERCorpus.zip
!ls

--2021-09-03 14:26:17--  https://github.com/HaniehP/PersianNER/raw/master/ArmanPersoNERCorpus.zip
Resolving github.com (github.com)... 140.82.121.3
Connecting to github.com (github.com)|140.82.121.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/HaniehP/PersianNER/master/ArmanPersoNERCorpus.zip [following]
--2021-09-03 14:26:17--  https://raw.githubusercontent.com/HaniehP/PersianNER/master/ArmanPersoNERCorpus.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1931170 (1.8M) [application/zip]
Saving to: ‘ArmanPersoNERCorpus.zip’


2021-09-03 14:26:18 (32.1 MB/s) - ‘ArmanPersoNERCorpus.zip’ saved [1931170/1931170]

adc.json						     peyma
ArmanPersoNERCorpus.zip					     peyma.zip
ner_peyma_Ho

In [None]:
# Extract the ARMAN archive into ./arman and list the working directory.
!unzip ArmanPersoNERCorpus.zip -d arman
!ls

Archive:  ArmanPersoNERCorpus.zip
  inflating: arman/test_fold1.txt    
  inflating: arman/ReadMe.txt        
  inflating: arman/train_fold3.txt   
  inflating: arman/train_fold2.txt   
  inflating: arman/train_fold1.txt   
  inflating: arman/test_fold3.txt    
  inflating: arman/test_fold2.txt    
adc.json						     peyma
arman							     peyma.zip
ArmanPersoNERCorpus.zip					     sample_data
ner_peyma_HooshvareLab-roberta-fa-zwnj-base-ner_outputs.txt


In [None]:
# Load ARMAN through load_test_datasets, then show the number of
# sentences / label sequences and the first sample of each.
sentences, labels = ner_model.load_test_datasets(
    dataset_name="arman",
    dataset_dir="./arman/",
)
print(len(sentences), len(labels))
print(sentences[0])
print(labels[0])

7681 7681
['افقی', ':', '0', 'ـ', 'از', 'عوامل', 'دوران', 'پهلوی', 'و', 'نخست\u200cوزیر', 'ایران', 'در', 'سالهای', 'ابتدائی', 'دهه', 'چهل', 'خورشیدی', 'كه', 'جلد', 'سوم', 'یادداشتهایش', 'هم', 'چندی', 'پیش', 'در', 'تهران', 'منتشر', 'شد', '0', 'ـ', 'پرستاری', 'از', 'ناخوش\u200cاحوال', 'ـ', 'پوشاک', 'و', 'جامه', 'ـ', 'فانتزی', 'و', 'شیک', '0', 'ـ', 'در', 'حال', 'وزیدن', 'ـ', 'اطلاعیه', 'ـ', 'پایتخت', 'جمهوری', 'استونی', 'در', 'حوضه', 'بالتیک', '0', 'ـ', 'علم', 'راهبرد', 'مؤسسه', 'و', 'سازمان', 'ـ', 'نوعی', 'شمع', '0', 'ـ', 'حرف', 'جمع', 'مؤنث', 'ـ', 'در', 'ایران', 'به', 'تولیدکننده', 'کتاب', 'اطلاق', 'می\u200cشود', 'ـ', 'از', 'شهرهای', 'باختری', 'افغانستان', 'كه', 'تا', 'عصر', 'ناصرالدین\u200cشاه', 'جزئی', 'از', 'خراسان', 'بود', 'ـ', 'ویتامین', 'انعقاد', '0', 'ـ', 'سبزی', 'غده\u200cای', 'ـ', 'دوستی', 'و', 'محبت', 'ـ', 'داستان', 'بلند', 'ـ', 'شهری', 'در', 'آلمان', '0', 'ـ', 'سلول', 'بدن', 'موجودات', 'ـ', 'از', 'انواع', 'کالباس', '0', 'ـ', 'حاشیه', 'و', 'هامش', 'ـ', 'پیدا', 'نشدنی', 'ـ', 'خ

In [None]:
# Compare the raw ARMAN tag set with the model's tag set; the helper prints
# both sets, their intersection and both differences, and returns a boolean.
is_consistent = ner_model.check_input_label_consistency(labels)
print(is_consistent)

model labels: dict_keys(['B-DAT', 'B-EVE', 'B-FAC', 'B-LOC', 'B-MON', 'B-ORG', 'B-PCT', 'B-PER', 'B-PRO', 'B-TIM', 'I-DAT', 'I-EVE', 'I-FAC', 'I-LOC', 'I-MON', 'I-ORG', 'I-PCT', 'I-PER', 'I-PRO', 'I-TIM', 'O'])
dataset labels: {'B-pro', 'I-event', 'O', 'I-pro', 'I-pers', 'B-org', 'B-loc', 'I-loc', 'B-fac', 'B-pers', 'I-org', 'B-event', 'I-fac'}
intersection: {'O'}
model_labels-dataset_labels: ['I-ORG', 'B-PCT', 'B-MON', 'B-PRO', 'I-MON', 'B-DAT', 'B-TIM', 'I-DAT', 'I-LOC', 'B-ORG', 'I-PER', 'B-EVE', 'B-FAC', 'I-TIM', 'I-PRO', 'I-PCT', 'B-PER', 'I-FAC', 'I-EVE', 'B-LOC']
dataset_labels-model_labels: ['B-fac', 'B-pro', 'B-pers', 'I-event', 'I-org', 'I-pro', 'I-pers', 'B-org', 'B-event', 'B-loc', 'I-loc', 'I-fac']
False


In [None]:
# ARMAN uses lower-case entity names ("pers", "event", ...); map each B-/I- tag
# onto the model's upper-case three-letter tag. Insertion order matches the
# hand-written table this replaces.
arman_to_model_entity = {
    'pers': 'PER',
    'org': 'ORG',
    'pro': 'PRO',
    'loc': 'LOC',
    'event': 'EVE',
    'fac': 'FAC',
}
label_translate = {
    '{}-{}'.format(prefix, arman_entity): '{}-{}'.format(prefix, model_entity)
    for arman_entity, model_entity in arman_to_model_entity.items()
    for prefix in ('B', 'I')
}
label_translate['O'] = 'O'

# Rewrite the labels with the mapping, then re-run the consistency check.
labels = ner_model.resolve_input_label_consistency(labels, label_translate)
is_consistent = ner_model.check_input_label_consistency(labels)
print(is_consistent)

model labels: dict_keys(['B-DAT', 'B-EVE', 'B-FAC', 'B-LOC', 'B-MON', 'B-ORG', 'B-PCT', 'B-PER', 'B-PRO', 'B-TIM', 'I-DAT', 'I-EVE', 'I-FAC', 'I-LOC', 'I-MON', 'I-ORG', 'I-PCT', 'I-PER', 'I-PRO', 'I-TIM', 'O'])
dataset labels: {'I-ORG', 'B-FAC', 'I-PRO', 'B-PRO', 'B-LOC', 'O', 'I-LOC', 'B-ORG', 'I-PER', 'I-FAC', 'I-EVE', 'B-EVE', 'B-PER'}
intersection: {'I-LOC', 'I-ORG', 'B-FAC', 'B-ORG', 'I-PRO', 'B-PRO', 'I-PER', 'O', 'I-FAC', 'I-EVE', 'B-EVE', 'B-PER', 'B-LOC'}
model_labels-dataset_labels: ['B-PCT', 'I-TIM', 'B-MON', 'I-MON', 'I-PCT', 'B-DAT', 'B-TIM', 'I-DAT']
dataset_labels-model_labels: []
True


batch size=256 -> inference time for one batch is about 205 s

batch size=512 -> inference time for one batch is about 410 s

batch size=1024 -> crash

In [None]:
# Show the current GPU status (nvidia-smi) and CPU details (lscpu) of the runtime.
!nvidia-smi
!lscpu

Fri Sep  3 14:26:38 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.63.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   46C    P0    57W / 149W |   5341MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Run batched inference on the ARMAN sentences; the result holds, per sample,
# the (token, true label, predicted label) triples consumed by the cells below.
# NOTE(review): the "- 2" presumably reserves room for two special tokens -- confirm.
inference_output_arman = ner_model.ner_evaluation_2(
    sentences,
    labels,
    device,
    max_position_embeddings=ner_model.config.max_position_embeddings - 2,
    batch_size=512,
)

len(input_text): 7681
len(input_labels): 7681
max_len: 321
#samples: 7681
#batch: 16
Start to evaluate test data ...
inference time for step 0: 0.0365022119999594
inference time for step 1: 0.015493517999971118
inference time for step 2: 0.013421192000009796
inference time for step 3: 0.013632655999913368
inference time for step 4: 0.01367522700002155
inference time for step 5: 0.01335916499999712
inference time for step 6: 0.01348359399992205
inference time for step 7: 0.013063488999932815
inference time for step 8: 0.01329828500001895
inference time for step 9: 0.013712101999999504
inference time for step 10: 0.013781611000013072
inference time for step 11: 0.0175598229999423
inference time for step 12: 0.013130260000025373
inference time for step 13: 0.013480453999932251
inference time for step 14: 0.013490558000057717
inference time for step 15: 0.01343107099989993
total inference time: 0.2445152169996163
total inference time / #samples: 3.183377385752068e-05


In [None]:
# Preview the first five evaluated samples: one tab-separated
# "token, true label, predicted label" row per token, blank line between samples.
row_template = '{}\t{}\t{}'
for sample_rows in inference_output_arman[:5]:
  for token, gold_label, guessed_label in sample_rows:
    print(row_template.format(token, gold_label, guessed_label))
  print()

 افقی	O	O
 :	O	O
 0	O	O
 �	O	O
�	O	O
 از	O	O
 عوامل	O	O
 دوران	O	O
 پهلوی	O	O
 و	O	O
 نخست	O	O
‌	O	O
وزیر	O	O
 ایران	B-LOC	B-LOC
 در	O	O
 سالهای	O	O
 ابتدائی	O	O
 دهه	O	O
 چهل	O	O
 خورشیدی	O	O
 �	O	O
�	O	O
ه	O	O
 جلد	O	O
 سوم	O	O
 یادداشت	O	O
هایش	O	O
 هم	O	O
 چندی	O	O
 پیش	O	O
 در	O	O
 تهران	B-LOC	B-LOC
 منتشر	O	O
 شد	O	O
 0	O	O
 �	O	O
�	O	O
 پرستاری	O	O
 از	O	O
 ناخوش	O	O
‌	O	O
اح	O	O
وال	O	O
 �	O	O
�	O	O
 پوشاک	O	O
 و	O	O
 جامه	O	O
 �	O	O
�	O	O
 فانتزی	O	O
 و	O	O
 شیک	O	O
 0	O	O
 �	O	O
�	O	O
 در	O	O
 حال	O	O
 وز	O	O
یدن	O	O
 �	O	O
�	O	O
 اطلاعیه	O	O
 �	O	O
�	O	O
 پایتخت	O	O
 جمهوری	O	O
 استونی	B-LOC	B-LOC
 در	I-LOC	O
 حوضه	I-LOC	B-LOC
 بالتیک	I-LOC	I-LOC
 0	O	O
 �	O	O
�	O	O
 علم	O	O
 راهبرد	O	O
 مؤسسه	O	O
 و	O	O
 سازمان	O	O
 �	O	I-ORG
�	O	I-ORG
 نوعی	O	O
 شمع	O	O
 0	O	O
 �	O	O
�	O	O
 حرف	O	O
 جمع	O	O
 مؤنث	O	O
 �	O	O
�	O	O
 در	O	O
 ایران	B-LOC	B-LOC
 به	O	O
 تولیدکننده	O	O
 کتاب	O	O
 اطلاق	O	O
 می	O	O
‌	O	O
شود	O	O
 �	O	O
�	O	O
 از	O	O
 شهرهای	O	O
 باختری	O	O
 افغانستان	B-LOC	B-LOC


In [None]:
# Print accuracy, precision, recall, F1 and the per-entity classification
# report for the ARMAN predictions.
ner_model.evaluate_prediction_results(labels, inference_output_arman)

Test Accuracy: 0.943627524283298
Test Precision: 0.41597479243773133
Test Recall: 0.527694943214263
Test F1-Score: 0.4652216473220529
Test classification Report:
              precision    recall  f1-score   support

         EVE  0.4880000000 0.4511834320 0.4688700999       676
         FAC  0.1205673759 0.6612641815 0.2039490127       617
         LOC  0.7665615142 0.6040268456 0.6756568886      4023
         ORG  0.7325725626 0.5799293009 0.6473747671      5092
         PER  0.4993408911 0.4298683613 0.4620075619      4406
         PRO  0.0655179323 0.3453009504 0.1101380936       947

   micro avg  0.4159747924 0.5276949432 0.4652216473     15761
   macro avg  0.4454267127 0.5119288453 0.4279994040     15761
weighted avg  0.6015199530 0.5276949432 0.5454792715     15761



In [None]:
# Dump every (token, true label, predicted label) triple to a text file --
# one tab-separated token per line, blank line after each sample -- and then
# upload that file to Google Drive.
# '/' in the model name is replaced with '-' before it is used in the file name.
output_file_name = "ner_arman_{}_outputs.txt".format(model_name.replace('/','-'))
line_template = '{}\t{}\t{}\n'
with open(output_file_name, "w", encoding='utf8') as output_file:
  for sample_rows in inference_output_arman:
    output_file.writelines(
        line_template.format(token, gold_label, guessed_label)
        for token, gold_label, guessed_label in sample_rows
    )
    output_file.write('\n')

# Authenticate again and rebuild the Drive client before uploading.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
upload = drive.CreateFile({'title': output_file_name})
upload.SetContentFile(output_file_name)
upload.Upload()

#### Arman+Peyma

In [None]:
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
download = drive.CreateFile({'id': '1WZxpFRtEs5HZWyWQ2Pyg9CCuIBs1Kmvx'})
download.GetContentFile('peyma.zip')
!ls

In [None]:
# Extract the PEYMA archive, then list the working directory and the peyma folder.
!unzip peyma.zip
!ls
!ls peyma

In [None]:
# Load PEYMA through load_test_datasets, then show the number of
# sentences / label sequences and the first sample of each.
sentences_peyma, labels_peyma = ner_model.load_test_datasets(
    dataset_name="peyma",
    dataset_dir="./peyma/",
)
print(len(sentences_peyma), len(labels_peyma))
print(sentences_peyma[0])
print(labels_peyma[0])

1026 1026
['کنایه', 'سرلشگر', 'فیروزآبادی', 'به', 'پادشاه', 'عربستان', 'و', 'پسرش']
['O', 'O', 'B_ORG', 'O', 'O', 'B_LOC', 'O', 'O']


In [None]:
# Compare the raw PEYMA tag set with the model's tag set; the helper prints
# both sets, their intersection and both differences, and returns a boolean.
is_consistent = ner_model.check_input_label_consistency(labels_peyma)
print(is_consistent)

model labels: dict_keys(['B-DAT', 'B-EVE', 'B-FAC', 'B-LOC', 'B-MON', 'B-ORG', 'B-PCT', 'B-PER', 'B-PRO', 'B-TIM', 'I-DAT', 'I-EVE', 'I-FAC', 'I-LOC', 'I-MON', 'I-ORG', 'I-PCT', 'I-PER', 'I-PRO', 'I-TIM', 'O'])
dataset labels: {'I_LOC', 'B_TIM', 'I_MON', 'I_ORG', 'I_PER', 'O', 'B_ORG', 'B_PER', 'I_TIM', 'B_LOC', 'B_PCT', 'I_DAT', 'I_PCT', 'B_MON', 'B_DAT'}
intersection: {'O'}
model_labels-dataset_labels: ['I-ORG', 'B-PCT', 'B-MON', 'B-PRO', 'I-MON', 'B-DAT', 'B-TIM', 'I-DAT', 'I-LOC', 'B-ORG', 'I-PER', 'B-EVE', 'B-FAC', 'I-TIM', 'I-PRO', 'I-PCT', 'B-PER', 'I-FAC', 'I-EVE', 'B-LOC']
dataset_labels-model_labels: ['I_LOC', 'B_TIM', 'I_MON', 'I_ORG', 'I_PER', 'I_TIM', 'B_LOC', 'B_ORG', 'B_PCT', 'I_DAT', 'I_PCT', 'B_MON', 'B_DAT', 'B_PER']
False


In [None]:
# PEYMA writes its tags with an underscore ("B_LOC") while the model uses a
# hyphen ("B-LOC"); build the B/I mapping for every PEYMA entity type.
# Insertion order matches the hand-written table this replaces.
peyma_entity_types = ('LOC', 'DAT', 'ORG', 'PER', 'MON', 'PCT', 'TIM')
label_translate = {
    '{}_{}'.format(prefix, entity): '{}-{}'.format(prefix, entity)
    for entity in peyma_entity_types
    for prefix in ('B', 'I')
}
label_translate['O'] = 'O'

# Rewrite the labels with the mapping, then re-run the consistency check.
labels_peyma = ner_model.resolve_input_label_consistency(labels_peyma, label_translate)
is_consistent = ner_model.check_input_label_consistency(labels_peyma)
print(is_consistent)

model labels: dict_keys(['B-DAT', 'B-EVE', 'B-FAC', 'B-LOC', 'B-MON', 'B-ORG', 'B-PCT', 'B-PER', 'B-PRO', 'B-TIM', 'I-DAT', 'I-EVE', 'I-FAC', 'I-LOC', 'I-MON', 'I-ORG', 'I-PCT', 'I-PER', 'I-PRO', 'I-TIM', 'O'])
dataset labels: {'I-ORG', 'B-PCT', 'I-TIM', 'B-MON', 'I-MON', 'B-LOC', 'O', 'I-PCT', 'B-DAT', 'B-TIM', 'I-DAT', 'I-LOC', 'B-ORG', 'I-PER', 'B-PER'}
intersection: {'I-LOC', 'I-ORG', 'B-PCT', 'I-TIM', 'B-ORG', 'B-MON', 'I-MON', 'I-PER', 'O', 'I-PCT', 'B-DAT', 'B-PER', 'B-TIM', 'B-LOC', 'I-DAT'}
model_labels-dataset_labels: ['B-FAC', 'I-PRO', 'B-PRO', 'I-FAC', 'I-EVE', 'B-EVE']
dataset_labels-model_labels: []
True


In [None]:
# Fetch the ARMAN corpus archive from the PersianNER GitHub repository,
# then list the working directory.
!wget https://github.com/HaniehP/PersianNER/raw/master/ArmanPersoNERCorpus.zip
!ls

In [None]:
# Extract the ARMAN archive into ./arman and list the working directory.
!unzip ArmanPersoNERCorpus.zip -d arman
!ls

In [None]:
# Load ARMAN through load_test_datasets, then show the number of
# sentences / label sequences and the first sample of each.
sentences_arman, labels_arman = ner_model.load_test_datasets(
    dataset_name="arman",
    dataset_dir="./arman/",
)
print(len(sentences_arman), len(labels_arman))
print(sentences_arman[0])
print(labels_arman[0])

7681 7681
['افقی', ':', '0', 'ـ', 'از', 'عوامل', 'دوران', 'پهلوی', 'و', 'نخست\u200cوزیر', 'ایران', 'در', 'سالهای', 'ابتدائی', 'دهه', 'چهل', 'خورشیدی', 'كه', 'جلد', 'سوم', 'یادداشتهایش', 'هم', 'چندی', 'پیش', 'در', 'تهران', 'منتشر', 'شد', '0', 'ـ', 'پرستاری', 'از', 'ناخوش\u200cاحوال', 'ـ', 'پوشاک', 'و', 'جامه', 'ـ', 'فانتزی', 'و', 'شیک', '0', 'ـ', 'در', 'حال', 'وزیدن', 'ـ', 'اطلاعیه', 'ـ', 'پایتخت', 'جمهوری', 'استونی', 'در', 'حوضه', 'بالتیک', '0', 'ـ', 'علم', 'راهبرد', 'مؤسسه', 'و', 'سازمان', 'ـ', 'نوعی', 'شمع', '0', 'ـ', 'حرف', 'جمع', 'مؤنث', 'ـ', 'در', 'ایران', 'به', 'تولیدکننده', 'کتاب', 'اطلاق', 'می\u200cشود', 'ـ', 'از', 'شهرهای', 'باختری', 'افغانستان', 'كه', 'تا', 'عصر', 'ناصرالدین\u200cشاه', 'جزئی', 'از', 'خراسان', 'بود', 'ـ', 'ویتامین', 'انعقاد', '0', 'ـ', 'سبزی', 'غده\u200cای', 'ـ', 'دوستی', 'و', 'محبت', 'ـ', 'داستان', 'بلند', 'ـ', 'شهری', 'در', 'آلمان', '0', 'ـ', 'سلول', 'بدن', 'موجودات', 'ـ', 'از', 'انواع', 'کالباس', '0', 'ـ', 'حاشیه', 'و', 'هامش', 'ـ', 'پیدا', 'نشدنی', 'ـ', 'خ

In [None]:
# Compare the raw ARMAN tag set with the model's tag set; the helper prints
# both sets, their intersection and both differences, and returns a boolean.
is_consistent = ner_model.check_input_label_consistency(labels_arman)
print(is_consistent)

model labels: dict_keys(['B-DAT', 'B-EVE', 'B-FAC', 'B-LOC', 'B-MON', 'B-ORG', 'B-PCT', 'B-PER', 'B-PRO', 'B-TIM', 'I-DAT', 'I-EVE', 'I-FAC', 'I-LOC', 'I-MON', 'I-ORG', 'I-PCT', 'I-PER', 'I-PRO', 'I-TIM', 'O'])
dataset labels: {'B-pro', 'I-event', 'O', 'I-pro', 'I-pers', 'B-org', 'B-loc', 'I-loc', 'B-fac', 'B-pers', 'I-org', 'B-event', 'I-fac'}
intersection: {'O'}
model_labels-dataset_labels: ['I-ORG', 'B-PCT', 'B-MON', 'B-PRO', 'I-MON', 'B-DAT', 'B-TIM', 'I-DAT', 'I-LOC', 'B-ORG', 'I-PER', 'B-EVE', 'B-FAC', 'I-TIM', 'I-PRO', 'I-PCT', 'B-PER', 'I-FAC', 'I-EVE', 'B-LOC']
dataset_labels-model_labels: ['B-fac', 'B-pro', 'B-pers', 'I-event', 'I-org', 'I-pro', 'I-pers', 'B-org', 'B-event', 'B-loc', 'I-loc', 'I-fac']
False


In [None]:
# ARMAN uses lower-case entity names ("pers", "event", ...); map each B-/I- tag
# onto the model's upper-case three-letter tag. Insertion order matches the
# hand-written table this replaces.
arman_to_model_entity = {
    'pers': 'PER',
    'org': 'ORG',
    'pro': 'PRO',
    'loc': 'LOC',
    'event': 'EVE',
    'fac': 'FAC',
}
label_translate = {
    '{}-{}'.format(prefix, arman_entity): '{}-{}'.format(prefix, model_entity)
    for arman_entity, model_entity in arman_to_model_entity.items()
    for prefix in ('B', 'I')
}
label_translate['O'] = 'O'

# Rewrite the labels with the mapping, then re-run the consistency check.
labels_arman = ner_model.resolve_input_label_consistency(labels_arman, label_translate)
is_consistent = ner_model.check_input_label_consistency(labels_arman)
print(is_consistent)

model labels: dict_keys(['B-DAT', 'B-EVE', 'B-FAC', 'B-LOC', 'B-MON', 'B-ORG', 'B-PCT', 'B-PER', 'B-PRO', 'B-TIM', 'I-DAT', 'I-EVE', 'I-FAC', 'I-LOC', 'I-MON', 'I-ORG', 'I-PCT', 'I-PER', 'I-PRO', 'I-TIM', 'O'])
dataset labels: {'I-ORG', 'B-FAC', 'I-PRO', 'B-PRO', 'B-LOC', 'O', 'I-LOC', 'B-ORG', 'I-PER', 'I-FAC', 'I-EVE', 'B-EVE', 'B-PER'}
intersection: {'I-LOC', 'I-ORG', 'B-FAC', 'B-ORG', 'I-PRO', 'B-PRO', 'I-PER', 'O', 'I-FAC', 'I-EVE', 'B-EVE', 'B-PER', 'B-LOC'}
model_labels-dataset_labels: ['B-PCT', 'I-TIM', 'B-MON', 'I-MON', 'I-PCT', 'B-DAT', 'B-TIM', 'I-DAT']
dataset_labels-model_labels: []
True


In [None]:
# Concatenate the two corpora (ARMAN first, then PEYMA) into one evaluation
# set and print the resulting sentence / label-sequence counts.
sentences, labels = (
    sentences_arman + sentences_peyma,
    labels_arman + labels_peyma,
)
print(len(sentences), len(labels))

8707 8707


In [None]:
# Compare the merged ARMAN+PEYMA tag set with the model's tag set; the helper
# prints both sets, their intersection and both differences, and returns a boolean.
is_consistent = ner_model.check_input_label_consistency(labels)
print(is_consistent)

model labels: dict_keys(['B-DAT', 'B-EVE', 'B-FAC', 'B-LOC', 'B-MON', 'B-ORG', 'B-PCT', 'B-PER', 'B-PRO', 'B-TIM', 'I-DAT', 'I-EVE', 'I-FAC', 'I-LOC', 'I-MON', 'I-ORG', 'I-PCT', 'I-PER', 'I-PRO', 'I-TIM', 'O'])
dataset labels: {'I-ORG', 'B-FAC', 'B-PCT', 'I-PRO', 'I-TIM', 'B-MON', 'B-PRO', 'B-LOC', 'O', 'I-MON', 'I-PCT', 'B-DAT', 'B-TIM', 'I-DAT', 'I-LOC', 'B-ORG', 'I-PER', 'I-FAC', 'I-EVE', 'B-EVE', 'B-PER'}
intersection: {'I-ORG', 'B-PCT', 'B-MON', 'B-PRO', 'I-MON', 'O', 'B-DAT', 'B-TIM', 'I-DAT', 'I-LOC', 'B-ORG', 'I-PER', 'B-EVE', 'B-FAC', 'I-TIM', 'I-PRO', 'I-PCT', 'B-PER', 'I-FAC', 'I-EVE', 'B-LOC'}
model_labels-dataset_labels: []
dataset_labels-model_labels: []
True


In [None]:
# Show the current GPU status (nvidia-smi) and CPU details (lscpu) of the runtime.
!nvidia-smi
!lscpu

Fri Sep  3 14:34:27 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.63.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   47C    P0    58W / 149W |   9637MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Run batched inference on the merged ARMAN+PEYMA sentences; the result holds,
# per sample, the (token, true label, predicted label) triples used below.
# NOTE(review): the "- 2" presumably reserves room for two special tokens -- confirm.
inference_output = ner_model.ner_evaluation_2(
    sentences,
    labels,
    device,
    max_position_embeddings=ner_model.config.max_position_embeddings - 2,
    batch_size=512,
)

len(input_text): 8707
len(input_labels): 8707
max_len: 321
#samples: 8707
#batch: 18
Start to evaluate test data ...
inference time for step 0: 0.03019422800002758
inference time for step 1: 0.01498562699998729
inference time for step 2: 0.013617829999930109
inference time for step 3: 0.013293451999970785
inference time for step 4: 0.013147649000075035
inference time for step 5: 0.013058023999974466
inference time for step 6: 0.013822092000054909
inference time for step 7: 0.013356275999967693
inference time for step 8: 0.013037110999903234
inference time for step 9: 0.013204780000023675
inference time for step 10: 0.012983474000066053
inference time for step 11: 0.01423021300001892
inference time for step 12: 0.013413059999948018
inference time for step 13: 0.013336793999997099
inference time for step 14: 0.013504065000006449
inference time for step 15: 0.013487273999999161
inference time for step 16: 0.01366225099991425
inference time for step 17: 0.01360733599995001
total inference 

In [None]:
# Preview the first five evaluated samples: one tab-separated
# "token, true label, predicted label" row per token, blank line between samples.
row_template = '{}\t{}\t{}'
for sample_rows in inference_output[:5]:
  for token, gold_label, guessed_label in sample_rows:
    print(row_template.format(token, gold_label, guessed_label))
  print()

 افقی	O	O
 :	O	O
 0	O	O
 �	O	O
�	O	O
 از	O	O
 عوامل	O	O
 دوران	O	O
 پهلوی	O	O
 و	O	O
 نخست	O	O
‌	O	O
وزیر	O	O
 ایران	B-LOC	B-LOC
 در	O	O
 سالهای	O	O
 ابتدائی	O	O
 دهه	O	O
 چهل	O	O
 خورشیدی	O	O
 �	O	O
�	O	O
ه	O	O
 جلد	O	O
 سوم	O	O
 یادداشت	O	O
هایش	O	O
 هم	O	O
 چندی	O	O
 پیش	O	O
 در	O	O
 تهران	B-LOC	B-LOC
 منتشر	O	O
 شد	O	O
 0	O	O
 �	O	O
�	O	O
 پرستاری	O	O
 از	O	O
 ناخوش	O	O
‌	O	O
اح	O	O
وال	O	O
 �	O	O
�	O	O
 پوشاک	O	O
 و	O	O
 جامه	O	O
 �	O	O
�	O	O
 فانتزی	O	O
 و	O	O
 شیک	O	O
 0	O	O
 �	O	O
�	O	O
 در	O	O
 حال	O	O
 وز	O	O
یدن	O	O
 �	O	O
�	O	O
 اطلاعیه	O	O
 �	O	O
�	O	O
 پایتخت	O	O
 جمهوری	O	O
 استونی	B-LOC	B-LOC
 در	I-LOC	O
 حوضه	I-LOC	B-LOC
 بالتیک	I-LOC	I-LOC
 0	O	O
 �	O	O
�	O	O
 علم	O	O
 راهبرد	O	O
 مؤسسه	O	O
 و	O	O
 سازمان	O	O
 �	O	I-ORG
�	O	I-ORG
 نوعی	O	O
 شمع	O	O
 0	O	O
 �	O	O
�	O	O
 حرف	O	O
 جمع	O	O
 مؤنث	O	O
 �	O	O
�	O	O
 در	O	O
 ایران	B-LOC	B-LOC
 به	O	O
 تولیدکننده	O	O
 کتاب	O	O
 اطلاق	O	O
 می	O	O
‌	O	O
شود	O	O
 �	O	O
�	O	O
 از	O	O
 شهرهای	O	O
 باختری	O	O
 افغانستان	B-LOC	B-LOC


In [None]:
# Print accuracy, precision, recall, F1 and the per-entity classification
# report for the merged ARMAN+PEYMA predictions.
ner_model.evaluate_prediction_results(labels, inference_output)

Test Accuracy: 0.9393687114959206
Test Precision: 0.40725609485323505
Test Recall: 0.5081449468085106
Test F1-Score: 0.4521409027041684
Test classification Report:
              precision    recall  f1-score   support

         DAT  0.4590163934 0.1272727273 0.1992882562       220
         EVE  0.4663608563 0.4511834320 0.4586466165       676
         FAC  0.1068622315 0.6612641815 0.1839909808       617
         LOC  0.7727914197 0.5897049322 0.6689469827      4643
         MON  0.0380313199 0.6296296296 0.0717299578        27
         ORG  0.7133187773 0.5558958652 0.6248446017      5877
         PCT  0.8064516129 0.4629629630 0.5882352941        54
         PER  0.4967367658 0.4138972810 0.4515491101      4965
         PRO  0.0623569794 0.3453009504 0.1056372153       947
         TIM  0.2500000000 0.0454545455 0.0769230769        22

   micro avg  0.4072560949 0.5081449468 0.4521409027     18048
   macro avg  0.4171926356 0.4282566508 0.3429792092     18048
weighted avg  0.60050132

In [None]:
# Dump every (token, true label, predicted label) triple to a text file --
# one tab-separated token per line, blank line after each sample -- and then
# upload that file to Google Drive.
# '/' in the model name is replaced with '-' before it is used in the file name.
output_file_name = "ner_arman-and-peyma_{}_outputs.txt".format(model_name.replace('/','-'))
line_template = '{}\t{}\t{}\n'
with open(output_file_name, "w", encoding='utf8') as output_file:
  for sample_rows in inference_output:
    output_file.writelines(
        line_template.format(token, gold_label, guessed_label)
        for token, gold_label, guessed_label in sample_rows
    )
    output_file.write('\n')

# Authenticate again and rebuild the Drive client before uploading.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
upload = drive.CreateFile({'title': output_file_name})
upload.SetContentFile(output_file_name)
upload.Upload()

#### WikiAnn dataset:

In [8]:
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
download = drive.CreateFile({'id': '1QOG15HU8VfZvJUNKos024xI-OGm0zhEX'})
download.GetContentFile('fa.tar.gz')
!ls

adc.json  fa.tar.gz  sample_data


In [9]:
# Unpack the WikiANN archive (verbose listing) and list the working directory.
!tar -zxvf fa.tar.gz
!ls

README.txt
wikiann-fa.bio
adc.json  fa.tar.gz  README.txt  sample_data  wikiann-fa.bio


In [10]:
# Load WikiANN via load_datasets, which returns the full data plus a test part
# (both as sentences and label sequences).
sentences_all, labels_all, sentences_test, labels_test = ner_model.load_datasets(dataset_name="wikiann", dataset_dir="./")
# Report sentence and label counts side by side so a length mismatch is visible.
# (Previously len(sentences_all) was printed twice and labels_all was never checked.)
print(len(sentences_all), len(labels_all))
print(len(sentences_test), len(labels_test))
# Show the first test sample and its labels.
print(sentences_test[0])
print(labels_test[0])

all data: #data: 272266, #labels: 272266


  return array(a, dtype, copy=False, order=order)


without stratify
test part:
 #data: 27227, #labels: 27227
272266 272266
27227 27227
['**', 'زاغی', 'نوک\u200cزرد', ',', "''Pica", 'nuttalli', "''"]
['O', 'B-LOC', 'I-LOC', 'O', 'O', 'O', 'O']


In [11]:
# Compare the WikiANN test tag set with the model's tag set; the helper prints
# both sets, their intersection and both differences, and returns a boolean.
is_consistent = ner_model.check_input_label_consistency(labels_test)
print(is_consistent)

model labels: dict_keys(['B-DAT', 'B-EVE', 'B-FAC', 'B-LOC', 'B-MON', 'B-ORG', 'B-PCT', 'B-PER', 'B-PRO', 'B-TIM', 'I-DAT', 'I-EVE', 'I-FAC', 'I-LOC', 'I-MON', 'I-ORG', 'I-PCT', 'I-PER', 'I-PRO', 'I-TIM', 'O'])
dataset labels: {'O', 'I-PER', 'B-ORG', 'B-PER', 'B-LOC', 'I-LOC', 'I-ORG'}
intersection: {'O', 'I-PER', 'B-ORG', 'B-PER', 'B-LOC', 'I-LOC', 'I-ORG'}
model_labels-dataset_labels: ['B-MON', 'B-TIM', 'B-EVE', 'B-FAC', 'B-PRO', 'I-PRO', 'I-TIM', 'B-PCT', 'B-DAT', 'I-MON', 'I-DAT', 'I-PCT', 'I-FAC', 'I-EVE']
dataset_labels-model_labels: []
True


In [12]:
# Show the current GPU status (nvidia-smi) and CPU details (lscpu) of the runtime.
!nvidia-smi
!lscpu

Wed Sep  8 05:06:39 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.63.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   39C    P8    27W / 149W |      3MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [19]:
# Run batched inference on the WikiANN test part; the result holds, per sample,
# the (token, true label, predicted label) triples consumed by the cells below.
# NOTE(review): the "- 2" presumably reserves room for two special tokens -- confirm.
inference_output_wikiann = ner_model.ner_evaluation_2(
    sentences_test,
    labels_test,
    device,
    max_position_embeddings=ner_model.config.max_position_embeddings - 2,
    batch_size=128,
)

len(input_text): 27227
len(input_labels): 27227
c: 10000
c: 20000
max_len: 512
#samples: 27227
#batch: 213
Start to evaluate test data ...
inference time for step 0: 0.02430991799997173
inference time for step 1: 0.013518256999986988
inference time for step 2: 0.013963026000055834
inference time for step 3: 0.014115537000179756
inference time for step 4: 0.01281885800017335
inference time for step 5: 0.013168571999813139
inference time for step 6: 0.014181066999753966
inference time for step 7: 0.013499936000243906
inference time for step 8: 0.013417993000075512
inference time for step 9: 0.013854730000275595
inference time for step 10: 0.014428329000111262
inference time for step 11: 0.014202121999915107
inference time for step 12: 0.013590466000096058
inference time for step 13: 0.014163651999751892
inference time for step 14: 0.01366932100017948
inference time for step 15: 0.013797142999919743
inference time for step 16: 0.013796338000247488
inference time for step 17: 0.01308827200

In [20]:
# Preview the first five evaluated samples: one tab-separated
# "token, true label, predicted label" row per token, blank line between samples.
row_template = '{}\t{}\t{}'
for sample_rows in inference_output_wikiann[:5]:
  for token, gold_label, guessed_label in sample_rows:
    print(row_template.format(token, gold_label, guessed_label))
  print()

 **	O	B-FAC
 زا	B-LOC	O
غی	B-LOC	O
 نوک	I-LOC	O
‌	I-LOC	I-PRO
زرد	I-LOC	O
,	O	O
 '	O	O
'	O	O
P	O	O
ica	O	O
 n	O	O
ut	O	O
t	O	O
all	O	O
i	O	O
 '	O	O
'	O	O

 تغییر	O	B-FAC
مسیر	O	O
 مک	B-LOC	O
‌	B-LOC	I-PRO
ویل	B-LOC	I-PRO
،	B-LOC	O
 داکوتای	I-LOC	I-PRO
 شمالی	I-LOC	I-PRO

 وست	B-LOC	B-FAC
 یونیور	I-LOC	O
سیتی	I-LOC	O
 پلیس	I-LOC	O
،	I-LOC	O
 تگزاس	I-LOC	O

 تغییر	O	B-FAC
مسیر	O	O
 دلت	B-PER	O
ف	B-PER	O
 فون	I-PER	O
 لیل	I-PER	O
نسر	I-PER	O
ون	I-PER	O

 تغییر	O	B-PRO
مسیر	O	O
 نیروگاه	B-ORG	O
‌	B-ORG	I-PRO
های	B-ORG	O
 زنجیره	I-ORG	O
‌	I-ORG	I-PRO
ای	I-ORG	O
 یاسوج	I-ORG	O



In [21]:
# Print accuracy, precision, recall, F1 and the per-entity classification
# report for the WikiANN test predictions.
ner_model.evaluate_prediction_results(labels_test, inference_output_wikiann)

Test Accuracy: 0.40432582564477837
Test Precision: 0.1296986798347274
Test Recall: 0.028802255841016918
Test F1-Score: 0.04713681396158002
Test classification Report:
              precision    recall  f1-score   support

         LOC  0.3298902518 0.0203205154 0.0382828888     25147
         ORG  0.2053956835 0.0447281842 0.0734594108     12766
         PER  0.0366464069 0.0302761778 0.0331581076      6771

   micro avg  0.1296986798 0.0288022558 0.0471368140     44684
   macro avg  0.1906441140 0.0317749591 0.0483001357     44684
weighted avg  0.2498873037 0.0288022558 0.0475560870     44684



In [22]:
# Dump every (token, true label, predicted label) triple to a text file --
# one tab-separated token per line, blank line after each sample -- and then
# upload that file to Google Drive.
# '/' in the model name is replaced with '-' before it is used in the file name.
output_file_name = "ner_wikiann_{}_outputs.txt".format(model_name.replace('/','-'))
line_template = '{}\t{}\t{}\n'
with open(output_file_name, "w", encoding='utf8') as output_file:
  for sample_rows in inference_output_wikiann:
    output_file.writelines(
        line_template.format(token, gold_label, guessed_label)
        for token, gold_label, guessed_label in sample_rows
    )
    output_file.write('\n')

# Authenticate again and rebuild the Drive client before uploading.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
upload = drive.CreateFile({'title': output_file_name})
upload.SetContentFile(output_file_name)
upload.Upload()

#### Hooshvare - Arman+Peyma+WikiAnn

https://github.com/hooshvare/parsner

In [23]:
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
download = drive.CreateFile({'id': '1fC2WGlpqumUTaT9Dr_U1jO2no3YMKFJ4'})
download.GetContentFile('ner-v1.zip')
!ls

adc.json						       README.txt
fa.tar.gz						       sample_data
ner-v1.zip						       wikiann-fa.bio
ner_wikiann_HooshvareLab-roberta-fa-zwnj-base-ner_outputs.txt


In [24]:
# Extract the combined NER archive, then list the working directory and the ner folder.
!unzip ner-v1.zip
!ls
!ls ner

Archive:  ner-v1.zip
   creating: ner/
  inflating: ner/valid.csv           
  inflating: ner/ner.csv             
  inflating: ner/test.csv            
  inflating: ner/train.csv           
adc.json    ner_wikiann_HooshvareLab-roberta-fa-zwnj-base-ner_outputs.txt
fa.tar.gz   README.txt
ner	    sample_data
ner-v1.zip  wikiann-fa.bio
ner.csv  test.csv  train.csv  valid.csv


In [25]:
# Load the combined Peyma+Arman+WikiAnn data from ./ner/ through
# load_test_datasets, then show the counts and the first sample of each list.
sentences_paw, labels_paw = ner_model.load_test_datasets(
    dataset_name="hooshvare-peyman+arman+wikiann",
    dataset_dir="./ner/",
)
print(len(sentences_paw), len(labels_paw))
print(sentences_paw[0])
print(labels_paw[0])

test part:
 #sentences: 6049, #sentences_tags: 6049
6049 6049
['همچنین', 'عملیات', 'لرزه\u200cنگاری', 'دوبعدی', 'نیز', 'با', 'فعالیت', 'مستمر', 'چهار', 'گروه', 'کاری', 'در', 'مناطقی', 'که', 'از', 'نظر', 'اکتشافی', 'مورد', 'نظر', 'بود', '،', 'به', 'پایان', 'رسید', 'که', 'نتایج', 'آن', 'در', 'حال', 'بررسی', 'است', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [26]:
# Compare the combined dataset's tag set with the model's tag set; the helper
# prints both sets, their intersection and both differences, and returns a boolean.
is_consistent = ner_model.check_input_label_consistency(labels_paw)
print(is_consistent)

model labels: dict_keys(['B-DAT', 'B-EVE', 'B-FAC', 'B-LOC', 'B-MON', 'B-ORG', 'B-PCT', 'B-PER', 'B-PRO', 'B-TIM', 'I-DAT', 'I-EVE', 'I-FAC', 'I-LOC', 'I-MON', 'I-ORG', 'I-PCT', 'I-PER', 'I-PRO', 'I-TIM', 'O'])
dataset labels: {'O', 'B-EVE', 'B-ORG', 'I-PRO', 'B-PCT', 'B-PER', 'I-LOC', 'I-PCT', 'B-MON', 'I-PER', 'B-TIM', 'B-FAC', 'B-PRO', 'B-DAT', 'B-LOC', 'I-MON', 'I-DAT', 'I-ORG', 'I-TIM', 'I-FAC', 'I-EVE'}
intersection: {'I-PRO', 'I-LOC', 'I-PCT', 'B-MON', 'B-PRO', 'B-LOC', 'I-MON', 'I-ORG', 'I-TIM', 'I-FAC', 'I-EVE', 'O', 'B-EVE', 'B-ORG', 'B-PCT', 'B-PER', 'I-PER', 'B-TIM', 'B-FAC', 'B-DAT', 'I-DAT'}
model_labels-dataset_labels: []
dataset_labels-model_labels: []
True


In [27]:
# Show the current GPU status (nvidia-smi) and CPU details (lscpu) of the runtime.
!nvidia-smi
!lscpu

Wed Sep  8 06:29:59 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.63.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   53C    P0    59W / 149W |   5939MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [30]:
# Run batched inference on the combined Peyma+Arman+WikiAnn test sentences.
# NOTE(review): the "- 2" presumably reserves room for two special tokens -- confirm.
usable_positions = ner_model.config.max_position_embeddings - 2
inference_output = ner_model.ner_evaluation_2(
    sentences_paw, labels_paw, device,
    max_position_embeddings=usable_positions,
    batch_size=128,
)

len(input_text): 6049
len(input_labels): 6049
max_len: 512
#samples: 6049
#batch: 48
Start to evaluate test data ...
inference time for step 0: 0.02858871500029636
inference time for step 1: 0.0143164600003729
inference time for step 2: 0.015058347999911348
inference time for step 3: 0.013205195999944408
inference time for step 4: 0.014234317999580526
inference time for step 5: 0.015275054999619897
inference time for step 6: 0.015005913000095461
inference time for step 7: 0.014609536000534717
inference time for step 8: 0.0149786279998807
inference time for step 9: 0.014727455999491212
inference time for step 10: 0.014651565999884042
inference time for step 11: 0.015084055999977863
inference time for step 12: 0.01617027600059373
inference time for step 13: 0.022634406999713974
inference time for step 14: 0.013755793000200356
inference time for step 15: 0.014548514000125579
inference time for step 16: 0.013435355000183336
inference time for step 17: 0.014383870000528987
inference time fo

In [31]:
# Preview the first five evaluated samples: one row per token, written as
# token<TAB>true label<TAB>predicted label, with an empty line closing each sample.
for sample_output in inference_output[:5]:
  for token, true_label, predicted_label in sample_output:
    print(token, true_label, predicted_label, sep='\t')
  print()

 همچنین	O	B-MON
 عملیات	O	O
 لرزه	O	O
‌	O	O
نگاری	O	O
 دوبعدی	O	O
 نیز	O	O
 با	O	O
 فعالیت	O	O
 مستمر	O	O
 چهار	O	O
 گروه	O	O
 کاری	O	O
 در	O	O
 مناطقی	O	O
 که	O	O
 از	O	O
 نظر	O	O
 اکتشافی	O	O
 مورد	O	O
 نظر	O	O
 بود	O	O
 ،	O	O
 به	O	O
 پایان	O	O
 رسید	O	O
 که	O	O
 نتایج	O	O
 آن	O	O
 در	O	O
 حال	O	O
 بررسی	O	O
 است	O	O
.	O	O

 محدث	B-PER	B-PRO
 در	O	O
 مورد	O	O
 مشارکت	O	O
 شرکتهای	O	O
 خارجی	O	O
 در	O	O
 فعالیتهای	O	O
 اکتشافی	O	O
 کشور	O	O
 گفت	O	O
 :	O	O
 تاکنون	O	O
 چند	O	O
 منطقه	O	O
 اکتشافی	O	O
 را	O	O
 برای	O	O
 مشارکت	O	O
 و	O	O
 سرمایه	O	O
‌	O	O
گذاری	O	O
 شرکتهای	O	O
 خارجی	O	O
 اعلام	O	O
 کرده	O	O
‌	O	O
ایم	O	O
 و	O	O
 در	O	O
 حال	O	O
 مذاکره	O	O
 با	O	O
 طرف	O	O
های	O	O
 خارجی	O	O
 هستیم	O	O
 و	O	O
 انتظار	O	O
 می	O	O
‌	O	O
رود	O	O
 تا	O	O
 آخر	O	O
 امسال	O	O
 بتوانیم	O	O
 چند	O	O
 قرارداد	O	O
 را	O	O
 نهایی	O	O
 کنیم	O	O
.	O	O

 مدیر	O	B-FAC
 امور	B-ORG	O
 اکتشاف	I-ORG	O
 شرکت	I-ORG	B-ORG
 ملی	I-ORG	I-ORG
 نفت	I-ORG	I-ORG
 فرو	O	O
افتادگی	O	O
 دزفول	B-LOC	B-LOC
 و	O	O
 م

In [32]:
# Score the predictions held in inference_output against the reference labels
# (labels_paw). Per the recorded output, this reports accuracy, precision,
# recall, F1-score and a per-entity classification report.
ner_model.evaluate_prediction_results(labels_paw, inference_output)

Test Accuracy: 0.9123903408422346
Test Precision: 0.26904426716721414
Test Recall: 0.4319075918640663
Test F1-Score: 0.33155561267107886
Test classification Report:
              precision    recall  f1-score   support

         DAT  0.5666666667 0.2674157303 0.3633587786       445
         EVE  0.3865248227 0.3670033670 0.3765112263       297
         FAC  0.0625449317 0.6148409894 0.1135399674       283
         LOC  0.8053415511 0.4790711885 0.6007662835      3273
         MON  0.1020066890 0.4959349593 0.1692094313       123
         ORG  0.6184603886 0.4585757828 0.5266507558      3609
         PCT  0.7534246575 0.5555555556 0.6395348837        99
         PER  0.4042085427 0.3844086022 0.3940600122      3348
         PRO  0.0172623061 0.3076923077 0.0326905887       416
         TIM  0.3333333333 0.0740740741 0.1212121212        54

   micro avg  0.2690442672 0.4319075919 0.3315556127     11947
   macro avg  0.4049773889 0.4004572557 0.3337534049     11947
weighted avg  0.5623318

In [33]:
# Dump every evaluated sample to a text file, then push that file to Google Drive.
# The file name embeds the model name with '/' replaced by '-' so it stays a
# single path component.
output_file_name = "ner_arman-and-peyma-and-wikiann_{}_outputs.txt".format(model_name.replace('/','-'))

# File layout: one "token<TAB>true label<TAB>predicted label" row per token,
# and an empty line after each sample.
with open(output_file_name, "w", encoding='utf8') as output_file:
  for sample_output in inference_output:
    output_file.writelines(
      '{}\t{}\t{}\n'.format(token, true_label, predicted_label)
      for token, true_label, predicted_label in sample_output
    )
    output_file.write('\n')

# Authenticate the Colab user again and rebuild the Drive client before uploading.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Create a Drive file titled like the local file and upload the local content.
upload = drive.CreateFile({'title': output_file_name})
upload.SetContentFile(output_file_name)
upload.Upload()