# Named Entity Annotator

(C) 2024 by [Damir Cavar](http://damir.cavar.me/)

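This notebook annotates Arabic text files with named entities: a spaCy pipeline tokenizes each text and splits it into sentences, and a `PhraseMatcher` built from spreadsheet columns of entity terms marks the matching spans.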

In [1]:
import os
import spacy
from spacy.matcher import PhraseMatcher
import json
import logging
import pandas as pd
from tqdm.notebook import tqdm

In [None]:
from transformers import pipeline
# Build a Hugging Face `transformers` fill-mask pipeline for the AraBERT checkpoint.
# NOTE(review): `pipe` is not referenced by any other cell visible here; presumably
# this cell only pre-fetches / sanity-checks the model. Confirm before removing.
pipe = pipeline("fill-mask", model="aubmindlab/bert-large-arabertv02")

In [3]:
from spacy_transformers import Transformer
from spacy_transformers.pipeline_component import DEFAULT_CONFIG

In [4]:
def set_tokenizer_exceptions(nlp: spacy.language.Language, file_name: str):
    """Register tokenizer special cases read from a JSON file.

    The file is loaded as a JSON mapping: each key is a surface string and
    its value is the sequence of pieces that string is split into. Every
    piece becomes the ORTH of one token of the special case.

    Args:
        nlp: Pipeline whose tokenizer receives the special cases.
        file_name: Path to the UTF-8 encoded JSON file.
    """
    with open(file_name, mode='r', encoding='utf-8') as handle:
        special_cases = json.load(handle)

    for surface in special_cases:
        pieces = special_cases[surface]
        # One {ORTH: piece} dict per output token, in file order.
        substrings = [{spacy.attrs.ORTH: f"{piece}"} for piece in pieces]
        nlp.tokenizer.add_special_case(surface, substrings)

def set_lemmatization_exceptions(nlp: spacy.language.Language, file_name: str):
    """Add lemma overrides from a JSON file to the pipeline's attribute ruler.

    The file is loaded as a JSON mapping from token text to lemma. For every
    entry a single-token ``TEXT`` pattern is added to the ``attribute_ruler``
    component, assigning the (whitespace-stripped) lemma to matching tokens.

    Args:
        nlp: Pipeline that already contains an ``attribute_ruler`` component.
        file_name: Path to the UTF-8 encoded JSON file.
    """
    with open(file_name, mode='r', encoding='utf-8') as handle:
        lemma_table = json.load(handle)

    attribute_ruler = nlp.get_pipe("attribute_ruler")
    for surface in lemma_table:
        token_pattern = [{"TEXT": f"{surface.strip()}"}]
        lemma_attrs = {"LEMMA": f"{lemma_table[surface].strip()}"}
        attribute_ruler.add(patterns=[token_pattern], attrs=lemma_attrs)

def load_span_entities(nlp: spacy.language.Language, file_name: str):
    """Load entity rules from an Excel sheet or a JSON pattern file.

    Files whose name ends in ``.xlsx`` are compiled with ``compile_pattern``
    (which needs ``nlp`` for tokenization); every other file is read with
    ``compile_pattern_json``.

    Args:
        nlp: Pipeline used to tokenize spreadsheet entries.
        file_name: Path to the rule source.

    Returns:
        The rules produced by the selected loader.
    """
    if not file_name.endswith(".xlsx"):
        return compile_pattern_json(file_name)
    return compile_pattern(nlp, file_name)

def compile_pattern(nlp: spacy.language.Language, file_name: str) -> list:
    """Compile token-level entity rules from an Excel sheet.

    Every column of the sheet is read as one entity class: the column header
    is the label and each non-empty cell (after ``str()`` and ``strip()``) is
    an entity string. Each distinct entity is run through ``nlp`` once and
    turned into a list of exact ``TEXT`` token constraints; one rule is
    emitted per (entity, label) pair.

    Args:
        nlp: Pipeline used to tokenize the entity strings.
        file_name: Path to the Excel workbook.

    Returns:
        A list of ``{"label": <column header>, "pattern": [{"TEXT": tok}, ...]}``
        dicts, ordered by first appearance of the entity and then of the label.
        NOTE(review): this shape looks like spaCy entity-ruler input; confirm
        with the consumer of these rules.
    """
    df = pd.read_excel(file_name, index_col=None)
    df = df.fillna("")

    # entity string -> labels it is listed under. The inner dict serves as an
    # insertion-ordered set so the emitted rule order is reproducible between
    # runs (iterating a plain set of strings is not).
    labels_by_entity = {}
    for label in list(df):
        for cell in df[label].tolist():
            entity = str(cell).strip()
            if entity:
                labels_by_entity.setdefault(entity, {})[label] = None

    ne_rules = []
    for entity, labels in tqdm(labels_by_entity.items(), total=len(labels_by_entity)):
        tokens = [token.text for token in nlp(entity)]
        for label in labels:
            # Build a fresh pattern list per rule so that rules never share
            # (and cannot accidentally co-mutate) the same list object.
            pattern = [{"TEXT": token_text} for token_text in tokens]
            ne_rules.append({"label": label, "pattern": pattern})
    return ne_rules

def compile_pattern_json(file_name: str) -> list:
    """Return the ``"pattern"`` entry of a JSON rule file.

    Args:
        file_name: Path to a UTF-8 encoded JSON file whose top-level object
            has a ``"pattern"`` key.

    Returns:
        Whatever is stored under ``"pattern"``.

    Raises:
        KeyError: If the top-level object has no ``"pattern"`` key.
    """
    with open(file_name, mode='r', encoding='utf-8') as handle:
        return json.load(handle)["pattern"]

def load_nlp(
    ner_file: str = "NEWNER.xlsx",
    tokenizer_exceptions_file: str = "tokenizer_exceptions.json",
    lemmatization_exceptions_file: str = "lemmatization_exceptions.json",
    model_name: str = "aubmindlab/bert-large-arabertv02",
) -> "tuple[spacy.language.Language, PhraseMatcher]":
    """Build the Arabic pipeline and the phrase matcher for entity terms.

    The pipeline is a blank ``"ar"`` pipeline with a transformer, a
    sentencizer and an attribute ruler. The matcher gets one key per column
    of the spreadsheet; the column header is the key and every non-empty cell
    is a phrase to match.

    Args:
        ner_file: Excel workbook with one entity class per column.
        tokenizer_exceptions_file: JSON file passed to
            ``set_tokenizer_exceptions``.
        lemmatization_exceptions_file: JSON file passed to
            ``set_lemmatization_exceptions``.
        model_name: Transformer model name put into the transformer config.

    Returns:
        A ``(nlp, matcher)`` tuple.
    """
    nlp = spacy.blank("ar")
    config = {
        "model": {
            "@architectures": "spacy-transformers.TransformerModel.v3",
            "name": model_name,
        }
    }
    nlp.add_pipe("transformer", config=config)
    nlp.add_pipe("sentencizer")
    if "attribute_ruler" not in nlp.pipe_names:
        nlp.add_pipe("attribute_ruler")

    # Initialize BEFORE adding custom rules: nlp.initialize() initializes every
    # component, and per the spaCy API the attribute ruler starts from an
    # empty rule set when initialized, so rules added earlier would be lost.
    nlp.initialize()
    set_tokenizer_exceptions(nlp, tokenizer_exceptions_file)
    set_lemmatization_exceptions(nlp, lemmatization_exceptions_file)

    # Column header -> set of distinct, stripped, non-empty terms.
    df = pd.read_excel(ner_file, index_col=None)
    df = df.fillna("")
    terms_by_label = {}
    for label in list(df):
        for cell in df[label].tolist():
            term = str(cell).strip()
            if term:
                terms_by_label.setdefault(label, set()).add(term)

    # Patterns are tokenized only after the tokenizer special cases are in
    # place, so that patterns and processed texts are split the same way.
    matcher = PhraseMatcher(nlp.vocab)
    for label, terms in terms_by_label.items():
        matcher.add(label, [nlp.make_doc(term) for term in terms])
    return nlp, matcher

In [5]:
def process_file(text: str, nlp, matcher) -> dict:
    """Tokenize ``text`` and collect the phrase-matcher hits.

    Args:
        text: Raw text to process.
        nlp: Callable pipeline returning a document with ``sents``, slicing
            support, and a ``vocab.strings`` lookup on the pipeline itself.
        matcher: Callable returning ``(match_id, start, end)`` triples for a
            document.

    Returns:
        A dict with two keys:

        * ``"tokens"``: one ``[sentence_number, token_index, token_text, 'O']``
          row per token; sentence numbers start at 1. The last field is always
          ``'O'`` here; matches are reported separately and are not written
          back into the rows.
        * ``"entities"``: one ``(span_text, label, start, end)`` tuple per
          match, where ``label`` is the match id resolved through
          ``nlp.vocab.strings`` and ``start``/``end`` are token offsets.
    """
    doc = nlp(text)

    token_rows = [
        [sentence_number, token.i, token.text, 'O']
        for sentence_number, sentence in enumerate(doc.sents, start=1)
        for token in sentence
    ]

    entities = [
        (doc[start:end].text, nlp.vocab.strings[match_id], start, end)
        for match_id, start, end in matcher(doc)
    ]
    return {"tokens": token_rows, "entities": entities}

In [None]:
# Build the pipeline and the phrase matcher once; the processing cell below reuses both.
nlp, matcher = load_nlp()
# Bare expression: as the last statement of a notebook cell it shows the
# pipeline's components for a quick visual check.
nlp.pipeline

In [None]:
# Collect every .txt file below the corpus directory.
file_list = []
for folder, _subdirs, names in os.walk('Data/Jaz/txt'):
    file_list.extend(os.path.join(folder, name) for name in names if name.endswith(".txt"))

# Annotate each text and store the result next to it as <name>.json.
for txt_path in tqdm(file_list):
    ofname = txt_path[:-4] + ".json"
    if not os.path.exists(ofname):  # outputs from an earlier run are kept as-is
        with open(txt_path, mode='r', encoding='utf-8') as ifp:
            text = ifp.read()
        res = process_file(text, nlp, matcher)
        with open(ofname, mode='w', encoding='utf-8') as ofp:
            ofp.write(json.dumps(res, ensure_ascii=False))