# Utils

In [None]:
# | default_exp ner.utils

In [None]:
# | export

from typing import Optional

from langchain_ray.imports import *
from langchain_ray.chains import *
from langchain_ray.utils import *
from langchain_ray.pdf.utils import *
from pydantic import BaseModel, Field
from langchain.output_parsers import PydanticOutputParser

In [None]:
#| hide

%load_ext autoreload
%autoreload 2
%reload_ext autoreload

In [None]:
# | export


class EduNER(BaseModel):
    """Schema for one education entry: degree, major, school and date.

    Every field defaults to ``None``, so each is annotated ``Optional[str]``;
    a plain ``str`` annotation would contradict the default and can reject an
    explicit null value during validation.
    """

    degree: Optional[str] = Field(default=None, description="The degree of the education.")
    major: Optional[str] = Field(default=None, description="The major of the education.")
    school: Optional[str] = Field(default=None, description="The school of the education.")
    date: Optional[str] = Field(default=None, description="The date of the education.")


class JobNER(BaseModel):
    """Schema for one job entry: role, company and duration.

    Every field defaults to ``None``, so each is annotated ``Optional[str]``;
    a plain ``str`` annotation would contradict the default and can reject an
    explicit null value during validation.
    """

    role: Optional[str] = Field(default=None, description="The role of the job.")
    company: Optional[str] = Field(default=None, description="The company of the job.")
    duration: Optional[str] = Field(default=None, description="The duration of the job.")

In [None]:
#| export

def load_edu_model(model_name="tner/deberta-v3-large-ontonotes5", device="cpu"):
    """Build a "ner" pipeline from a pretrained token-classification checkpoint.

    Args:
        model_name: Checkpoint identifier passed to both ``from_pretrained`` calls.
        device: Device argument forwarded to ``pipeline``.

    Returns:
        The object produced by ``pipeline`` for the "ner" task with
        ``aggregation_strategy="simple"``.
    """
    ner_tokenizer = AutoTokenizer.from_pretrained(model_name)
    ner_model = AutoModelForTokenClassification.from_pretrained(model_name)
    pipe_kwargs = dict(
        model=ner_model,
        tokenizer=ner_tokenizer,
        aggregation_strategy="simple",
        device=device,
    )
    return pipeline("ner", **pipe_kwargs)


def load_job_model(model_name="ismail-lucifer011/autotrain-job_all-903929564", device="cpu"):
    """Build a "ner" pipeline from a pretrained token-classification checkpoint.

    Args:
        model_name: Checkpoint identifier passed to both ``from_pretrained`` calls.
        device: Device argument forwarded to ``pipeline``.

    Returns:
        The object produced by ``pipeline`` for the "ner" task with
        ``aggregation_strategy="simple"``.
    """
    job_tokenizer = AutoTokenizer.from_pretrained(model_name)
    job_model = AutoModelForTokenClassification.from_pretrained(model_name)
    pipe_kwargs = dict(
        model=job_model,
        tokenizer=job_tokenizer,
        aggregation_strategy="simple",
        device=device,
    )
    return pipeline("ner", **pipe_kwargs)


def proc_ners(ners, ner_dict={"institute": "", "date": ""}, thresh=3):
    """Collapse per-document entity lists into one field dict per document.

    Args:
        ners: Iterable with one entry per document; each entry is a list of
            dicts carrying at least the keys ``"entity_group"`` and ``"word"``.
        ner_dict: Template mapping output field names to their initial string
            value. Only fields present in the template are collected. The
            template is deep-copied for every document and never mutated, so
            the dict default is safe to share between calls.
        thresh: A collected field is kept only when its text is longer than
            this many characters.

    Returns:
        A list with one dict per input document. A document yields ``{}`` when
        it has no entities, when its organisation field (``"institute"`` if the
        template has that key, otherwise ``"company"``) did not survive the
        length filter, or when processing it raised an exception.
    """
    org_key = "institute" if "institute" in ner_dict else "company"
    mapper = {
        "ORG": org_key,
        "FAC": org_key,
        "GPE": org_key,
        "LOC": org_key,
        "Job": "role",
        "WORK_OF_ART": "degree",
        "DATE": "date",
    }
    ner_dicts = []
    for ner in ners:
        if not ner:
            ner_dicts.append({})
            continue
        # Fresh accumulator per document so fields never leak between documents.
        fields = copy.deepcopy(ner_dict)
        try:
            for d in ner:
                eg = d["entity_group"]
                word = d["word"].strip()
                # Test the bare word: once a separator is prepended the text can
                # never start with "##", so fragments would slip through.
                if word.startswith("##"):
                    continue
                k = mapper.get(eg, None)
                if k is not None and fields.get(k, None) is not None:
                    fields[k] = (fields[k] + " " + word).strip()
            res = {k: v for k, v in fields.items() if len(v) > thresh}
            # A document without a usable organisation is treated as empty.
            ner_dicts.append(res if org_key in res else {})
        except Exception as e:
            # Best effort: one malformed document must not abort the batch.
            msg.fail(f"proc_ners failed with error: {e}", spaced=True)
            ner_dicts.append({})
    return ner_dicts


def job_ner(docs, e_ner, j_ner):
    """Run both callables on `docs` and return their outputs as a pair.

    `j_ner` is invoked first, then `e_ner`; the returned tuple is
    ``(j_ner(docs), e_ner(docs))``.
    """
    job_entities = j_ner(docs)
    general_entities = e_ner(docs)
    return job_entities, general_entities


def edu_ner(docs, e_ner, ner_dict={"institute": "", "date": ""}):
    """Apply `e_ner` to `docs` and condense its output with `proc_ners`.

    `ner_dict` is handed to `proc_ners` unchanged as the field template.
    """
    return proc_ners(e_ner(docs), ner_dict)


def work_ner(docs, e_ner, j_ner, ner_dict={"company": "", "date": ""}):
    """Merge the outputs of both NER callables per document and condense them.

    The two result sequences from `job_ner` are paired element-wise (pairing
    stops at the shorter one), each pair is concatenated with ``+``, and the
    merged sequence is passed to `proc_ners` together with `ner_dict`.
    """
    first_pass, second_pass = job_ner(docs, e_ner, j_ner)
    merged = []
    for left, right in zip(first_pass, second_pass):
        merged.append(left + right)
    return proc_ners(merged, ner_dict)


def _select_by_category(docs, category):
    """Return (positions, page contents) of the docs whose metadata category equals `category`."""
    positions, texts = [], []
    for i, doc in enumerate(docs):
        if doc.metadata.get("category", None) == category:
            positions.append(i)
            texts.append(doc.page_content)
    return positions, texts


def docs_to_ners(docs, e_ner, j_ner):
    """Compute one NER result dict per document, based on its metadata category.

    Documents whose ``metadata["category"]`` is ``"Work Experience"`` are
    processed with `work_ner`, those with ``"Education"`` with `edu_ner`;
    every other document keeps an empty dict.

    Args:
        docs: Sequence of objects exposing ``page_content`` and a ``metadata`` dict.
        e_ner: Callable passed through to `work_ner` and `edu_ner`.
        j_ner: Callable passed through to `work_ner`.

    Returns:
        A list aligned with `docs`. If extraction for a category raises, the
        failure is reported through ``msg.fail`` and that category's documents
        get empty dicts.
    """
    # Independent dicts: `[{}] * n` would make every slot alias one object.
    ners = [{} for _ in docs]

    work_idx, work_docs = _select_by_category(docs, "Work Experience")
    # Skip empty categories entirely; there is nothing to extract.
    if work_docs:
        try:
            work_ners = work_ner(work_docs, e_ner, j_ner)
        except Exception as e:
            msg.fail(f"work_ner failed with error: {e}", spaced=True)
            work_ners = [{} for _ in work_docs]
        for pos, res in zip(work_idx, work_ners):
            ners[pos] = res

    edu_idx, edu_docs = _select_by_category(docs, "Education")
    if edu_docs:
        try:
            edu_ners = edu_ner(edu_docs, e_ner)
        except Exception as e:
            msg.fail(f"edu_ner failed with error: {e}", spaced=True)
            edu_ners = [{} for _ in edu_docs]
        for pos, res in zip(edu_idx, edu_ners):
            ners[pos] = res

    return ners

def add_ners_to_docs(docs, e_ner, j_ner, key="ner"):
    """Hand `docs` to `add_docs_metadata` with a NER-producing callable.

    The callable is `docs_to_ners` with `e_ner` and `j_ner` pre-bound; `key`
    is forwarded to `add_docs_metadata` as its third argument.
    """
    return add_docs_metadata(
        docs,
        partial(docs_to_ners, e_ner=e_ner, j_ner=j_ner),
        key,
    )

In [None]:
# #| hide
# #| eval: false


# examples = [{"text": "text 1", "ner": "ner 1"}, {"text": "text 2", "ner": "ner 2"}]

# job_parser = PydanticOutputParser(pydantic_object=JobNER)
# ex_temp = "text: {text}\n{ner}"
# ex_prompt = PromptTemplate.from_template(ex_temp)
# ex_prompt = FewShotPromptTemplate(
#     examples=examples,
#     example_prompt=ex_prompt,
#     prefix="Extract ner from this text:\n{format_instructions}\nHere are some examples:\n",
#     suffix="Now it's your turn:\ntext: {text}",
#     input_variables=["format_instructions", "text"],
# ).partial(format_instructions=job_parser.get_format_instructions())
# #| hide
# #| eval: false


# tt_pipe = HuggingFacePipeline(
#     pipeline=pipeline(
#         "text2text-generation",
#         model="google/flan-t5-large",
#         device_map=default_device(),
#         max_new_tokens=256,
#     )
# )
# #| hide
# #| eval: false

# gen_chain = LLMChain(prompt=ex_prompt, llm=tt_pipe)

In [None]:
# | hide
import nbdev

# Run nbdev's export step for this notebook.
# NOTE(review): presumably this writes the cells tagged `# | export` to the
# module named by the `default_exp` directive — confirm against nbdev docs.
nbdev.nbdev_export()