# Resume Parsing

## 1. Customized entity rulers

In [1]:
import spacy
# Load the medium English pipeline; the entity ruler added later is attached to this `nlp` object.
nlp = spacy.load("en_core_web_md")

# Print a table of the pipeline components and what each one assigns, requires and scores.
analysis = nlp.analyze_pipes(pretty=True)

  from .autonotebook import tqdm as notebook_tqdm


[1m

#   Component         Assigns               Requires   Scores             Retokenizes
-   ---------------   -------------------   --------   ----------------   -----------
0   tok2vec           doc.tensor                                          False      
                                                                                     
1   tagger            token.tag                        tag_acc            False      
                                                                                     
2   parser            token.dep                        dep_uas            False      
                      token.head                       dep_las                       
                      token.is_sent_start              dep_las_per_type              
                      doc.sents                        sents_p                       
                                                       sents_r                       
                                                

### Skills

In [2]:
# Path (relative to the notebook) of the JSONL file that the entity ruler is loaded from.
skill_path = 'data/skill_patterns.jsonl'

In [3]:
# Append an entity ruler to the pipeline; the pipe list below shows it placed after `ner`.
ruler = nlp.add_pipe("entity_ruler")
ruler.from_disk(skill_path) # load the ruler from the file on disk
nlp.pipe_names

['tok2vec',
 'tagger',
 'parser',
 'attribute_ruler',
 'lemmatizer',
 'ner',
 'entity_ruler']

In [4]:
# Quick check of the loaded ruler on a sentence that mentions a skill.
doc = nlp("Chaky loves ajax.")
doc.ents

(Chaky, ajax)

In [5]:
# List every entity found in `doc` together with its label.
for ent in doc.ents:
    print(ent.text, ent.label_)

Chaky NORP
ajax SKILL|ajax


### Education

### Email 

In [6]:
# Entity-ruler pattern: a single token whose text looks like an e-mail address.
# The regex is written as a raw string so that `\.` reaches the regex engine as
# an escaped dot instead of being parsed as an (invalid) Python string escape.
email_pattern = [{'label': 'EMAIL',
                  'pattern': [{'TEXT': {'REGEX': r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'}}]}]

In [7]:
# Register the e-mail pattern on the existing entity ruler.
ruler.add_patterns(email_pattern)

In [26]:
# Run the pipeline on a lone e-mail address to exercise the EMAIL pattern.
doc = nlp("st124092@gmail.com")

In [27]:
# List every entity found in `doc` together with its label.
for ent in doc.ents:
    print(ent.text, ent.label_)

st124092@gmail.com EMAIL


### Phone number

In [28]:
# Entity-ruler pattern: a single token containing seven consecutive digits.
# The regex is a raw string so that `\d` is passed to the regex engine verbatim
# rather than being treated as an (invalid) Python string escape.
# The expression is not anchored, which is why the eight-digit sample further
# down is still labelled as a phone number.
phone_pattern = [
    {"label": "PHONE NUMBER",  "pattern": [{"TEXT": {"REGEX": r"((\d){7})"}}]}
]

In [29]:
# Register the phone-number pattern on the existing entity ruler.
ruler.add_patterns(phone_pattern)

In [32]:
# Sample sentence for the phone pattern; note that the number has eight digits.
text  = "This is a sample number 09123456."
doc = nlp(text)

In [33]:
# List every entity found in `doc` together with its label.
for ent in doc.ents:
    print(ent.text, ent.label_)

09123456 PHONE NUMBER


## 2. Load the PDF

In [41]:
from PyPDF2 import PdfReader

# Open the resume PDF and extract the raw text of its first page only.
reader   = PdfReader('./data/chaklam_resume.pdf')
page     = reader.pages[0]
text     = page.extract_text()
# Keep using the `nlp` pipeline configured earlier in the notebook: loading a
# fresh model here would replace it and discard the entity ruler together with
# its skill, EMAIL and PHONE NUMBER patterns before the resume is parsed.

In [42]:
# Clean the text extracted from the PDF
from spacy.lang.en.stop_words import STOP_WORDS

def preprocessing(sentence):
    """Normalise raw text into a space-separated string of lowercase lemmas.

    The text is run through the notebook-global ``nlp`` pipeline. Stop words
    and tokens tagged as punctuation, symbols or whitespace are dropped; each
    remaining token is replaced by its lowercased, stripped lemma.

    Args:
        sentence: Raw text to clean.

    Returns:
        The surviving lemmas joined by single spaces (an empty string when
        nothing survives).
    """
    skipped_pos = {'PUNCT', 'SYM', 'SPACE'}
    clean_tokens = []

    for token in nlp(sentence):
        # Compare in lowercase: spaCy's STOP_WORDS entries are lowercase, so a
        # capitalised stop word such as "The" would otherwise slip through and
        # then reappear, lowercased, in the output. Membership is tested on the
        # set directly instead of on a list copy rebuilt on every call.
        if token.text.lower() in STOP_WORDS or token.pos_ in skipped_pos:
            continue

        lemma = token.lemma_.lower().strip()
        if lemma:  # an empty lemma would only add a stray space to the result
            clean_tokens.append(lemma)

    return " ".join(clean_tokens)

In [43]:
# Replace the raw PDF text with its cleaned form.
# NOTE: this overwrites `text`, so re-running the cell cleans already-cleaned text.
text = preprocessing(text)

In [44]:
# Sanity check: the cleaned text is still a plain string.
type(text)

str

In [45]:
# Run the pipeline over the cleaned resume text to obtain its entities.
doc = nlp(text)

In [46]:
# Sanity check: the pipeline returned a spaCy Doc.
type(doc)

spacy.tokens.doc.Doc

In [47]:
# Custom gradient colour for the skill entities in the rendered output.
# NOTE(review): the ruler emitted the label "SKILL|ajax" in the earlier skill
# example; confirm that the "SKILL" key here matches the labels actually rendered.
colors  = {"SKILL": "linear-gradient(90deg, #aa9cfc, #fc9ce7)"}
options = {'colors': colors}

# Render the entities of `doc` inline using displaCy's entity visualiser.
from spacy import displacy
displacy.render(doc, style='ent', options=options)