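# Segment

> Split extracted PDF text into tagged segments (e.g. education, work experience) and pull organisation, date and contact details out of them.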

In [None]:
#| default_exp segment

In [None]:
#| hide

%matplotlib inline
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

In [None]:
#| export

from dreamai_pdf.core import *
from dreamai_pdf.parse import *
from dreamai_pdf.imports import *


In [None]:
#| export

def text_to_segments(text, labeling_model, tags=['education', 'work experience']):
    """Bucket each chunk of `text` under the tag chosen by `labeling_model`.

    Args:
        text: Iterable of text chunks; each chunk is labeled independently.
        labeling_model: Callable invoked as `labeling_model(chunk, tags)`.
            The `[0][0]` element of its result is used as an index into `tags`.
        tags: Candidate tag names. The default list is only read, never mutated.

    Returns:
        A `defaultdict(list)` mapping tag name -> list of chunks, in input order.
        Tags that were never predicted are absent until accessed.
    """
    grouped = defaultdict(list)
    for chunk in text:
        prediction = labeling_model(chunk, tags)
        # The first element of the first row is the index of the chosen tag.
        chosen_tag = tags[prediction[0][0]]
        grouped[chosen_tag].append(chunk)
    return grouped

def segment_to_ners(text, tagger):
    """Run `tagger` over a segment and return the tagged `Sentence`.

    Args:
        text: Either a single string or a list of strings; a list is joined
            with single spaces into one string before tagging.
        tagger: Object exposing `predict(sentence)`; it is called once and is
            expected to annotate the sentence in place (its return value is ignored).

    Returns:
        The `Sentence` built from the text, after `tagger.predict` has run on it.
    """
    joined = ' '.join(text) if is_list(text) else text
    sentence = Sentence(joined)
    tagger.predict(sentence)
    return sentence

def ners_to_dicts(s, search_tags=['ORG', 'DATE'], dict_keys=['COMPANY', 'DATE']):
    """Group the NER labels of a tagged sentence into a list of record dicts.

    Labels are walked in order. A label whose tag equals `search_tags[i]` is
    stored under `dict_keys[i]` in the record being built. When a key that is
    already filled shows up again, the current record is closed and a new one
    is started with that value. Labels with any other tag are ignored.

    Args:
        s: Tagged sentence. Only `s.labels` is read; each label must expose
            `data_point.tag` and `data_point.text`.
        search_tags: NER tags to collect. Only read, never mutated.
        dict_keys: Output keys, paired positionally with `search_tags`.

    Returns:
        List of dicts in order of appearance. Each dict holds a subset of
        `dict_keys` mapped to the stripped entity text.
    """
    tags_list = []
    tags_dict = {}
    for label in s.labels:
        dp = label.data_point
        tag = dp.tag
        # Loop names are distinct from the `s` parameter, which the original shadowed.
        for search_tag, key in zip(search_tags, dict_keys):
            if tag != search_tag:
                continue
            value = dp.text.strip()
            if not tags_dict.get(key):
                tags_dict[key] = value
            else:
                # Key already filled: the current record is complete.
                tags_list.append(tags_dict)
                tags_dict = {key: value}

    # Flush the record still under construction; previously it was silently
    # dropped, so the last entity group never reached the caller.
    if tags_dict:
        tags_list.append(tags_dict)
    return tags_list

def get_edu_dicts(edu, tagger):
    """Extract education records from an education segment.

    Args:
        edu: Segment text (string or list of strings), passed to `segment_to_ners`.
        tagger: NER tagger, passed to `segment_to_ners`.

    Returns:
        List of dicts with 'INSTITUTE' (from ORG entities) and optionally
        'DATE'. Records without an 'INSTITUTE' value are discarded.
    """
    tagged = segment_to_ners(edu, tagger)
    records = ners_to_dicts(tagged, search_tags=['ORG', 'DATE'], dict_keys=['INSTITUTE', 'DATE'])
    # A date on its own is not useful: keep only records that name an institute.
    return [record for record in records if record.get('INSTITUTE') is not None]

def get_job_dicts(job, tagger):
    """Extract job records from a work-experience segment.

    Args:
        job: Segment text (string or list of strings), passed to `segment_to_ners`.
        tagger: NER tagger, passed to `segment_to_ners`.

    Returns:
        List of dicts with 'COMPANY' (from ORG entities) and optionally
        'DATE'. Records without a 'COMPANY' value are discarded.
    """
    tagged = segment_to_ners(job, tagger)
    records = ners_to_dicts(tagged, search_tags=['ORG', 'DATE'], dict_keys=['COMPANY', 'DATE'])
    # A date on its own is not useful: keep only records that name a company.
    return [record for record in records if record.get('COMPANY') is not None]

def get_contact_dict(text):
    """Find e-mail addresses and phone numbers in `text` with regular expressions.

    Args:
        text: A string, or a list of strings that is joined with single spaces.

    Returns:
        `{'EMAIL': [...], 'PHONE': [...]}`. E-mails are lower-cased and have
        trailing '.'/'-' removed; phones are returned as they appear in the
        text. Both lists are in order of appearance and may be empty.
    """
    if is_list(text):
        text = ' '.join(text)
    mail_regex = re.compile(r'[\w.+-]+@[\w-]+\.[\w.-]+')
    # Ten digits in 3-3-4 groups, each group optionally split by one space or hyphen.
    phone_regex = re.compile(r'[\d]{3}[\s-]?[\d]{3}[\s-]?[\d]{4}')
    # The last character class of the e-mail pattern also accepts '.' and '-',
    # so an address that ends a sentence ("... at a@b.com.") is captured with
    # the punctuation attached. A domain cannot end with either, so strip them.
    emails = [match.rstrip('.-') for match in mail_regex.findall(text.lower())]
    # Lower-casing has no effect on digits, so search the text as given.
    phones = phone_regex.findall(text)
    return {'EMAIL': emails, 'PHONE': phones}


## Some Usage Examples

In [None]:
# Load the three models used by the examples below.
# Labeling model handed to `text_to_segments` to pick a tag for each text chunk.
labeling_model = Labels("roberta-large-mnli")
# NER tagger handed to `get_job_dicts` / `get_edu_dicts`.
tagger = Classifier.load('ner-ontonotes-large')
# Model passed to `pdf_to_text` as `model=`; the path is relative to this notebook.
model = load_cols_model('../model/best_model.pth')

Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


2023-05-03 03:32:22,130 SequenceTagger predicts: Dictionary with 76 tags: <unk>, O, B-CARDINAL, E-CARDINAL, S-PERSON, S-CARDINAL, S-PRODUCT, B-PRODUCT, I-PRODUCT, E-PRODUCT, B-WORK_OF_ART, I-WORK_OF_ART, E-WORK_OF_ART, B-PERSON, E-PERSON, S-GPE, B-DATE, I-DATE, E-DATE, S-ORDINAL, S-LANGUAGE, I-PERSON, S-EVENT, S-DATE, B-QUANTITY, E-QUANTITY, S-TIME, B-TIME, I-TIME, E-TIME, B-GPE, E-GPE, S-ORG, I-GPE, S-NORP, B-FAC, I-FAC, E-FAC, B-NORP, E-NORP, S-PERCENT, B-ORG, E-ORG, B-LANGUAGE, E-LANGUAGE, I-CARDINAL, I-ORG, S-WORK_OF_ART, I-QUANTITY, B-MONEY


In [None]:
# End-to-end example: PDF -> text chunks -> tagged segments -> entity dicts.
file = '../pdfs/test1.pdf'
pdf_text = pdf_to_text(file, model=model)
# Group the extracted text by predicted tag (default tags: 'education', 'work experience').
segs = text_to_segments(pdf_text, labeling_model)
# Pull COMPANY/DATE and INSTITUTE/DATE records out of the two segments.
job_dicts = get_job_dicts(segs['work experience'], tagger)
edu_dicts = get_edu_dicts(segs['education'], tagger)

DEBUG: FC_WEIGHT didn't match
DEBUG: FC_WEIGHT didn't match


In [None]:
# Show which text chunks were assigned to each tag.
pprint(segs)

defaultdict(<class 'list'>,
            {'education': ['CLIENTS LIST – 1) CELEKT 2) RAMKY',
                           '4) ROCKSTAR JEANS 5) MANJEERA 7) E- RICE 8) KAVURI '
                           'HILLS',
                           'CLIENTS LIST – 1) ADP 2) IIRM MBA – 2021 EDUCATION '
                           'PRIYADARSHINI COLLEGE OF BUSINESSES',
                           '80 DIGITAL TECHNOLOGY FOR DESIGNING SCHOOL OF '
                           'PLANNING AND ARCHITECTURE',
                           '71.6 INTERMEDIATE – 2015 NARAYAN JUNIOR COLLEGE '
                           '91.6 SSC - 2013 BRILLIANT GRAMMAR HIGH SCHOOL',
                           '8.7 INTERESTS Traveling   Sketching   DIY Arts   '
                           'Listing Music   Dancing   LANGUAGES English   '
                           'Telugu   Hindi  '],
             'work experience': ['DIKONDAWAR DEEKSHA To enhance my '
                                 'professional OBJECTIVE organization which '
        

In [None]:
# NOTE(review): DATE holds whatever the tagger labeled as DATE, so the saved
# output includes non-date values such as 'monthly'.
pprint(job_dicts)

[{'COMPANY': 'PVT LTD', 'DATE': 'monthly'},
 {'COMPANY': 'PVT LTD', 'DATE': '2 June 2019 - 28 Feb 2021'},
 {'COMPANY': 'SUNSHINE HEALTH CARE', 'DATE': '2019'}]


In [None]:
# NOTE(review): in the saved output the "CLIENTS LIST" chunks were grouped under
# 'education', so client names (CELEKT, RAMKY, ...) appear here as INSTITUTE.
pprint(edu_dicts)

[{'INSTITUTE': 'CELEKT'},
 {'INSTITUTE': 'RAMKY'},
 {'INSTITUTE': 'ROCKSTAR'},
 {'INSTITUTE': 'MANJEERA'},
 {'INSTITUTE': 'E- RICE'},
 {'INSTITUTE': 'KAVURI HILLS'},
 {'INSTITUTE': 'ADP'},
 {'INSTITUTE': 'IIRM'},
 {'INSTITUTE': 'PRIYADARSHINI COLLEGE OF BUSINESSES'},
 {'INSTITUTE': 'NARAYAN JUNIOR COLLEGE'}]


In [None]:
#| hide
# Run nbdev's export step for this project (this notebook's `#| export` cells
# target the module named by `#| default_exp segment`).
import nbdev; nbdev.nbdev_export()