# Segment

In [None]:
#| default_exp segment

In [None]:
#| hide

%matplotlib inline
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

In [None]:
#| export

from dreamai_pdf.core import *
from dreamai_pdf.parse import *
from dreamai_pdf.imports import *

import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet

[nltk_data] Downloading package wordnet to /home/hamza/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
#| export

def get_lemmas(word):
    """Return `word` plus the WordNet lemma names found for it, as a list.

    For every lemma WordNet returns for `word`, the lemma's own name and the
    names of its derivationally related forms are added, all lowercased.
    `word` itself is included exactly as given (it is not lowercased).
    The result is de-duplicated through a set, so its order is not meaningful.
    """
    variants = {word}
    for lemma in wordnet.lemmas(word):
        variants.add(lemma.name().lower())
        variants.update(
            related.name().lower()
            for related in lemma.derivationally_related_forms()
        )
    return list(variants)

def get_lemma_dict(words):
    """Build a reverse lookup from each lemma form to the key it was derived from.

    words: either a list of keys, or a dict mapping each key to a list of
        additional seed words for that key.

    Every key also acts as a seed for itself. Each seed is expanded with
    `get_lemmas`, and every resulting form is mapped back to the key. When
    the same form is produced for more than one key, the later key wins.
    """
    if is_list(words):
        words = {word: [] for word in words}
    lemma_to_key = {}
    for key, seeds in words.items():
        # The key is appended after its seeds, so its own forms are added last.
        expanded = flatten_list([get_lemmas(seed) for seed in seeds + [key]])
        for lemma in expanded:
            lemma_to_key[lemma] = key
    return lemma_to_key

def text_to_segments(text, labeling_model, segments={'education':['bachelors'], 'work experience':['employment']}, keywords=[]):
    """Group pieces of text by predicted segment and by matching keywords.

    text: iterable of strings; each one is classified on its own.
    labeling_model: callable invoked as `labeling_model(lowercased_text, labels)`.
        `result[0][0]` is used as an index into `labels`
        (presumably the top-ranked label -- confirm against the model wrapper).
    segments: list of segment names, or dict of segment name -> extra seed
        words. Expanded with `get_lemma_dict` into candidate labels.
    keywords: list (or dict) of keywords, expanded the same way. A piece of
        text is also filed under a keyword when any lowercased form of that
        keyword occurs as a substring of the lowercased text.

    Returns a `defaultdict(list)` mapping segment/keyword name -> list of the
    original (not lowercased) strings, without duplicates per keyword.

    The default arguments are only read, never mutated.
    """
    seg_ld = get_lemma_dict(segments)
    kw_ld = get_lemma_dict(keywords)
    # Key the lookup by the *lowercased* form, because that is what the model
    # is given as a label. Looking the prediction up in `seg_ld` directly
    # raises KeyError for any form that contains uppercase characters.
    seg_lookup = {lem.lower(): seg for lem, seg in seg_ld.items()}
    seg_lems = list(seg_lookup)
    # Lowercase the keyword forms once instead of once per text per keyword.
    kw_forms = [(kw_lem.lower(), kw) for kw_lem, kw in kw_ld.items()]
    segs = defaultdict(list)
    for txt in text:
        txt_lower = txt.lower()
        pred = seg_lems[labeling_model(txt_lower, seg_lems)[0][0]]
        segs[seg_lookup[pred]].append(txt)
        for kw_lem, kw in kw_forms:
            if kw_lem in txt_lower and txt not in segs[kw]:
                segs[kw].append(txt)
    return segs

def segment_to_ners(text, tagger):
    """Run `tagger` over `text` and return the tagged `Sentence`.

    text: a string, or a list of strings that is joined with single spaces.
    tagger: object exposing `predict(sentence)`; it annotates the sentence
        in place.
    """
    joined = ' '.join(text) if is_list(text) else text
    sentence = Sentence(joined)
    tagger.predict(sentence)
    return sentence

def ners_to_dicts(s, search_tags=['ORG', 'DATE'], dict_keys=['COMPANY', 'DATE']):
    """Group the tagged entities of `s` into a list of record dicts.

    s: tagged sentence; `s.labels` is iterated and each label's
        `data_point.tag` / `data_point.text` are read.
    search_tags: entity tags to keep.
    dict_keys: output key for each entry of `search_tags` (paired by position).

    Entities are consumed in order. Each matching entity fills its key in the
    current record; when that key is already filled, the current record is
    closed and a new one is started with this entity. Entities whose tag is
    not in `search_tags` are ignored.

    Returns the list of records, including the last one being built when the
    labels run out (it is skipped only if it is empty).

    The default arguments are only read, never mutated.
    """
    tags_list = []
    tags_dict = {}
    for label in s.labels:
        dp = label.data_point
        tag = dp.tag
        # Distinct loop names: the parameter `s` must not be shadowed here.
        for search_tag, key in zip(search_tags, dict_keys):
            if tag != search_tag:
                continue
            if tags_dict.get(key):
                # Key already filled -> this entity belongs to the next record.
                tags_list.append(tags_dict)
                tags_dict = {}
            tags_dict[key] = dp.text.strip()
    # Flush the record still in progress; without this the final record
    # (or the only one) is silently lost.
    if tags_dict:
        tags_list.append(tags_dict)
    return tags_list

def get_edu_dicts(edu, tagger):
    """Extract education records from the education segment.

    edu: string or list of strings making up the segment.
    tagger: NER tagger passed through to `segment_to_ners`.

    'ORG' entities are stored under 'INSTITUTE' and 'DATE' entities under
    'DATE'. Records that have no 'INSTITUTE' entry are dropped.
    """
    tagged = segment_to_ners(edu, tagger)
    records = ners_to_dicts(tagged, search_tags=['ORG', 'DATE'], dict_keys=['INSTITUTE', 'DATE'])
    with_institute = []
    for record in records:
        if record.get('INSTITUTE') is not None:
            with_institute.append(record)
    return with_institute

def get_job_dicts(job, tagger):
    """Extract employment records from the work-experience segment.

    job: string or list of strings making up the segment.
    tagger: NER tagger passed through to `segment_to_ners`.

    'ORG' entities are stored under 'COMPANY' and 'DATE' entities under
    'DATE'. Records that have no 'COMPANY' entry are dropped.
    """
    tagged = segment_to_ners(job, tagger)
    records = ners_to_dicts(tagged, search_tags=['ORG', 'DATE'], dict_keys=['COMPANY', 'DATE'])
    with_company = []
    for record in records:
        if record.get('COMPANY') is not None:
            with_company.append(record)
    return with_company

def get_contact_dict(text):
    """Find email addresses and phone numbers in `text`.

    text: a string, or a list of strings that is joined with single spaces.

    Matching runs on the lowercased text, so emails are returned lowercased.
    Returns `{'EMAIL': [...], 'PHONE': [...]}` with every match in order of
    appearance (duplicates are kept).
    """
    if is_list(text):
        text = ' '.join(text)
    # The domain is matched as dot-separated labels so that a sentence-ending
    # period ("... at a@b.com.") is not captured as part of the address.
    mail_regex = re.compile(r'[\w.+-]+@[\w-]+(?:\.[\w-]+)+')
    # 3-3-4 digit groups, optionally separated by one whitespace char or hyphen.
    phone_regex = re.compile(r'[\d]{3}[\s-]?[\d]{3}[\s-]?[\d]{4}')
    lowered = text.lower()
    emails = mail_regex.findall(lowered)
    phones = phone_regex.findall(lowered)
    return {'EMAIL':emails, 'PHONE':phones}

def load_segs_model(model_name="roberta-large-mnli"):
    """Create the `Labels` model used to assign text to segments.

    model_name: identifier passed straight to `Labels`. Defaults to
        "roberta-large-mnli", the value this function previously hard-coded,
        so calls without arguments behave as before.
    """
    return Labels(model_name)

def load_ner_model(model_name='ner-ontonotes-large'):
    """Load the NER tagger via `Classifier.load`.

    model_name: identifier passed straight to `Classifier.load`. Defaults to
        'ner-ontonotes-large', the value this function previously hard-coded,
        so calls without arguments behave as before.
    """
    return Classifier.load(model_name)

In [None]:
# device = default_device()

# segs_model = load_segs_model()
# ner_model = load_ner_model()
# cols_model = load_cols_model('../model/best_model.pth', device=device)

In [None]:
# file = '../pdfs/test1.pdf'
# pdf_text = pdf_to_text(file, model=cols_model)

In [None]:
# pdf_text[file]

In [None]:
# segs = text_to_segments(pdf_text[file], segs_model, segments={'education':['bachelors', 'college'],
#                                                               'work experience':['employment']},
#                         keywords=['skills', 'client'])

In [None]:
# job_dicts = get_job_dicts(segs['work experience'], ner_model)
# edu_dicts = get_edu_dicts(segs['education'], ner_model)

In [None]:
# pprint(segs)

In [None]:
# pprint(job_dicts)

In [None]:
# pprint(edu_dicts)

In [None]:
#| hide
# Run nbdev's export step; this notebook declares `segment` as its default export target.
import nbdev; nbdev.nbdev_export()