### Linking Unstructured with Structured Text

## Import the necessary libraries.

### 1. SETUP
To prepare your environment, you need to install some packages and enter credentials for the Watson services.



In [None]:
#!pip install nltk

In [None]:
#!pip install watson-developer-cloud==1.5

In [None]:
#!pip install PyPDF2 

In [None]:
#!pip install mammoth

### Install textract
Please follow the instructions to install textract on your system:
https://textract.readthedocs.io/en/v1.2.0/installation.html

In [None]:
#!pip install textract

### 2. Import packages and libraries
Import the packages and libraries that you'll use:

In [None]:
import pandas as pd
import json
import re
import nltk

import os, sys, glob, mammoth

from watson_developer_cloud import NaturalLanguageUnderstandingV1
from watson_developer_cloud.natural_language_understanding_v1 \
  import Features, EntitiesOptions, SemanticRolesOptions, RelationsOptions, KeywordsOptions

import PyPDF2 
import textract

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords


### Load the Configuration file.

In [None]:
# Fill in the path to the configuration.json file and uncomment the next line.
#config_path = "< path to /configuration.json>"

# Read the classification configuration; the context manager makes sure the
# file handle is closed again once the JSON has been parsed.
with open(config_path, "rb") as input_file:
    config_classification_json = json.loads(input_file.read())
#print(config_classification_json)


### Load the structured data from Data.csv 

In [None]:
# Load the structured candidate data: replace the placeholder with the real
# path to Data.csv and uncomment the next line before running this cell.
#stu_df = pd.read_csv("<path to/Data.csv>")
stu_df
# Optional: list the column names of the structured data.
# stu_df_columns_list = list(stu_df.columns.values)
# stu_df_columns_list

### 3. Add your service credentials from Bluemix for the Watson services.
You must create a Watson Natural Language Understanding service on Bluemix. Create a service for Natural Language Understanding (NLU). Insert the username and password values for your NLU in the following cell. Do not change the values of the version fields.

Run the cell.

In [None]:
# Client for the Watson Natural Language Understanding service; it is used by
# analyze_using_NLU() below. Fill in the service credentials before running.
natural_language_understanding = NaturalLanguageUnderstandingV1(
   username= "",
  password= "",
  version='2017-02-27')


### 4. Functions for  Watson Text Classification
Write the classification-related utility functions in a modularized form with augmentation. 



In [None]:
def analyze_using_NLU(text_content):
    '''
    Send text_content to the Watson Natural Language Understanding service
    and return its analysis, requesting entities, relations and keywords.
    '''
    requested_features = Features(
        entities=EntitiesOptions(),
        relations=RelationsOptions(),
        keywords=KeywordsOptions(),
    )
    return natural_language_understanding.analyze(
        text=text_content,
        features=requested_features,
    )

In [None]:
def split_sentences(text):
    """ Break text apart at square brackets, newlines, '.', '!' and '?'.

    The delimiters themselves are dropped; empty fragments are kept.
    """
    return re.split(u'[\\[\\]\n.!?]', text)

def split_into_tokens(text):
    """ Tokenize text with nltk.word_tokenize and return the tokens.
    """
    return nltk.word_tokenize(text)
    
def POS_tagging(text):
    """ Return the part-of-speech tags nltk assigns to the given tokens.
    """
    return nltk.tag.pos_tag(text)

def keyword_tagging(tag,tagtext,text):
    """ Look for tagtext inside text, ignoring case.

    Returns the matching slice of text (in its original casing), or the
    string 'UNKNOWN' when tagtext does not occur. The tag argument is unused.
    """
    start = text.lower().find(tagtext.lower())
    if start == -1:
        return 'UNKNOWN'
    return text[start:start + len(tagtext)]
    
def regex_tagging(tag,regex,text):
    """ Return every case-insensitive match of regex in text as a list.

    The list is empty when nothing matches. The tag argument is unused.
    """
    return list(re.findall(regex, text, re.IGNORECASE))

def chunk_tagging(tag,chunk,text):
    """ Chunk the POS-tagged text with the grammar in chunk.

    Returns one string per top-level subtree labelled tag; each string is
    the subtree's words, every word preceded by a single space.
    """
    parse_tree = nltk.RegexpParser(chunk).parse(text)
    return [
        ''.join(' ' + leaf[0] for leaf in node)
        for node in parse_tree
        if isinstance(node, nltk.tree.Tree) and node.label() == tag
    ]
    
def augument_NLUResponse(responsejson,updateType,text,tag):
    """ Add text to the NLU response unless it is already listed.

    When updateType is 'keyword' the text goes into responsejson['keywords'];
    any other updateType adds an entity of type tag to responsejson['entities'].
    The response is modified in place and nothing is returned.
    """
    if updateType == 'keyword':
        section = 'keywords'
        new_entry = {"text":text,"relevance":0.5}
    else:
        section = 'entities'
        new_entry = {"type":tag,"text":text,"relevance":0.5,"count":1}

    already_listed = any(item.get('text', None) == text for item in responsejson[section])
    if not already_listed:
        responsejson[section].append(new_entry)
    

def classify_text(text, config):
    """ Analyze text with Watson NLU and augment the result with custom tags.

    The stages/steps under config['configuration']['classification'] are
    applied in order. Steps of type 'keywords' and 'd_regex' are matched
    against each sentence of the text, steps of type 'chunking' against the
    POS-tagged tokens; every hit is added to the response as an entity.
    Any other step type only prints 'UNKNOWN STEP'.
    Returns the (augmented) NLU response.
    """
    nlu_result = analyze_using_NLU(text)
    sentences = split_sentences(text)
    tagged_tokens = POS_tagging(split_into_tokens(text))

    for stage in config['configuration']['classification']['stages']:
        for step in stage['steps']:
            step_type = step['type']

            if step_type == 'keywords':
                for keyword in step['keywords']:
                    for sentence in sentences:
                        matched = keyword_tagging(keyword['tag'], keyword['text'], sentence)
                        if matched != 'UNKNOWN':
                            augument_NLUResponse(nlu_result, 'entities', matched, keyword['tag'])

            elif step_type == 'd_regex':
                for rule in step['d_regex']:
                    for sentence in sentences:
                        for hit in regex_tagging(rule['tag'], rule['pattern'], sentence):
                            augument_NLUResponse(nlu_result, 'entities', hit, rule['tag'])

            elif step_type == 'chunking':
                for rule in step['chunk']:
                    for phrase in chunk_tagging(rule['tag'], rule['pattern'], tagged_tokens):
                        augument_NLUResponse(nlu_result, 'entities', phrase, rule['tag'])

            else:
                print('UNKNOWN STEP')

    return nlu_result

def replace_unicode_strings(response):
    """ Convert dict with unicode strings to strings.

    Walks nested dicts and lists recursively. On Python 2 every ``unicode``
    value (and dict key) is encoded to a UTF-8 byte string. On Python 3,
    where ``str`` already is the native text type, strings are returned
    unchanged. Values of any other type are returned as they are.
    """
    try:
        unicode_type = unicode  # only defined on Python 2
    except NameError:
        unicode_type = None

    if isinstance(response, dict):
        # .items() exists on both Python 2 and 3 (iteritems() is Python 2 only).
        return {replace_unicode_strings(key): replace_unicode_strings(value)
                for key, value in response.items()}
    elif isinstance(response, list):
        return [replace_unicode_strings(element) for element in response]
    elif unicode_type is not None and isinstance(response, unicode_type):
        return response.encode('utf-8')
    else:
        return response

### 4. Extracting Requirements From the Job Description.

In [None]:
# Free-text job descriptions; each one is analyzed separately below.
job_description_1 = (
    "I need a candidate with User Experience Design skills and experience should be more than 24 months. "
)
job_description_2 = (
    "I need a candidate with Machine Learning Expert and experience should be more than 27 months."
)
job_description = [
    job_description_1,
    job_description_2,
]

In [None]:
def getrequirements(job_description):
    """Classify every job description and return the results as a list.

    Each description is passed to classify_text together with the
    module-level config_classification_json.
    """
    return [
        classify_text(description, config_classification_json)
        for description in job_description
    ]

def getRequiredCandidateEntityList(requirement_jd):
    '''
    Convert the entities of each classified job description to a dictionary.

    requirement_jd is a list of NLU results, each with an 'entities' list of
    dictionaries. For every result a {entity type: entity text} dictionary is
    built; when several entities share a type the last one wins. The
    dictionaries are returned in the same order as the input.
    '''
    RequiredCandidateEntityList = []
    # The last seen type/text are carried over to an entity that lacks the
    # corresponding key, exactly as in unstructuredTexttoadataframe.
    text_type = ''
    text_value = ''
    for result in requirement_jd:
        entity_dict = {}
        for entity in result['entities']:
            text_type = entity.get('type', text_type)
            text_value = entity.get('text', text_value)
            entity_dict[text_type] = text_value
        RequiredCandidateEntityList.append(entity_dict)
    return RequiredCandidateEntityList

def getskills_matching_candidates(RequiredCandidateEntityList, candidates_df=None):
    '''
    Filtering the Candidates matching with the required skills.

    RequiredCandidateEntityList is a list of dictionaries; the value stored
    under 'NAME' is the required skill. Entries without a 'NAME' are skipped.
    candidates_df is the structured candidate data with a 'Skills' column
    holding skills separated by '/' or ','; it defaults to the module-level
    stu_df. A candidate matches when one of its skills equals the required
    skill exactly (surrounding whitespace ignored, case-sensitive).

    Returns the list of matching rows; a candidate appears once for every
    requirement it satisfies.
    '''
    if candidates_df is None:
        candidates_df = stu_df

    row_list = []
    for requirement in RequiredCandidateEntityList:
        required_skill = requirement.get('NAME')
        if not required_skill:
            continue
        required_skill = required_skill.strip()

        for index, row in candidates_df.iterrows():
            skills = row['Skills']
            if pd.isnull(skills):
                # Nothing to compare against for candidates without skills.
                continue
            # Splitting a string without delimiters yields the string itself,
            # so single-skill rows need no special case.
            candidate_skills = [skill.strip() for skill in re.split(r'[/,]', str(skills))]
            if required_skill in candidate_skills:
                row_list.append(row)
    return row_list

In [None]:
# Classify every job description (Watson NLU plus the configured augmentation).
requirement_jd = getrequirements(job_description)

In [None]:
# Reduce each classification result to an {entity type: entity text} dictionary.
RequiredCandidateEntityList = getRequiredCandidateEntityList(requirement_jd)

In [None]:
# Keep only the rows of the structured data whose skills match a requirement
# and collect them in a dataframe (used later by Recommendation).
row_list = getskills_matching_candidates(RequiredCandidateEntityList)
filtered_dataframe = pd.DataFrame(row_list)
filtered_dataframe


### 5. Processing the resumes.

In [None]:
# Set `path` to the folder that contains the resumes (.pdf and .docx files).
# The assignment below is wrapped in a string literal, so it has no effect
# until the surrounding triple quotes are removed and a real path is filled in.
'''path = "<path to /CVs & Dataframe/>"
'''

def extractingTextfromresumes():
    '''Extracting Text from the pool of resumes(processing word docs and pdfs)

    Reads every .pdf and .docx file directly inside the module-level `path`
    folder and returns a list with the text of each file, PDFs first.
    PDFs are read with PyPDF2; when that yields no text (e.g. a scanned,
    image-only PDF) the file is run through textract's tesseract OCR instead.
    Word documents are read with mammoth.
    '''
    matching_candidates_text = []
    filenames = glob.glob(path+'/*.pdf')
    filenames_docx = glob.glob(path+'/*.docx')

    for filename in filenames:
        print(filename)
        # The context manager closes the file once all pages have been read.
        with open(filename, 'rb') as pdf_file:
            pdf_reader = PyPDF2.PdfFileReader(pdf_file)
            text = "".join(
                pdf_reader.getPage(page_number).extractText()
                for page_number in range(pdf_reader.numPages)
            )

        # Decide on the OCR fallback only after ALL pages were read, so that a
        # blank first page does not trigger OCR for an otherwise readable file.
        if not text.strip():
            text = textract.process(filename, method='tesseract', language='eng')
            if isinstance(text, bytes):
                # textract hands back encoded bytes; keep the result list
                # homogeneous by decoding to text.
                text = text.decode('utf-8', 'replace')

        matching_candidates_text.append(text)

    for filename in filenames_docx:
        print(filename)
        with open(filename, "rb") as docx_file:
            result = mammoth.extract_raw_text(docx_file)
        matching_candidates_text.append(result.value)  # the raw text

    return matching_candidates_text

def processTheTextWithWatsonNLU(matching_candidates_text):
    '''
    Run classify_text (with the module-level config_classification_json) on
    every text and return the results in the same order.
    '''
    return [
        classify_text(resume_text, config_classification_json)
        for resume_text in matching_candidates_text
    ]


def unstructuredTexttoadataframe(NLU_Results_Matched_Candidates):
    '''
    Turn the entities of every NLU result into a {type: text} dictionary.

    Returns one dictionary per result, in input order. If several entities
    share a type the last one wins; an entity without a 'type' or 'text' key
    reuses the most recently seen value for it.
    '''
    collected = []
    current_type = ''
    current_text = ''
    for nlu_result in NLU_Results_Matched_Candidates:
        by_type = {}
        for entity in nlu_result['entities']:
            if 'type' in entity:
                current_type = entity['type']
            if 'text' in entity:
                current_text = entity['text']
            by_type[current_type] = current_text
        collected.append(by_type)
    return collected

In [None]:
# Read the text of every resume (.pdf / .docx) found in the `path` folder.
matching_candidates_text = extractingTextfromresumes()

In [None]:
# Classify each resume text (Watson NLU plus the configured augmentation).
NLU_Results_Matched_Candidates = processTheTextWithWatsonNLU(matching_candidates_text)

In [None]:
# Reduce each resume's NLU result to an {entity type: entity text} dictionary.
matchedCandidateEntityList = unstructuredTexttoadataframe(NLU_Results_Matched_Candidates)

In [None]:
# Build a dataframe with one row per resume from the entity dictionaries.
resume_df = pd.DataFrame(matchedCandidateEntityList)
resume_df


### 6. Recommendation

In [None]:
def _recommendation_card(colour_class, message):
    """Display one W3.CSS panel of the given colour class containing message."""
    display(HTML('<!DOCTYPE html><html><title>W3.CSS</title><meta name="viewport" content="width=device-width, initial-scale=1"><link rel="stylesheet" href="https://www.w3schools.com/w3css/4/w3.css"><body><div class="w3-container"><div class="w3-panel w3-card ' + colour_class + '"><p>' + message + '</p></div></div></body></html>'))


def Recommendation(resume_df, requirement_2=None):
    """Display a recommendation card for every resume of a shortlisted candidate.

    Each row of resume_df is linked to the module-level filtered_dataframe
    through its phone number ('PhoneNumber' vs. 'Handphone'). Rows whose phone
    number is missing or not numeric, or that match no shortlisted candidate,
    are skipped.

    * Candidates who applied before get a red card with the stored comments.
    * Other candidates get a green card when their 'Experience in Months'
      is at least the number of months found in requirement_2.

    requirement_2 is the experience requirement text (e.g. '24 months'). When
    it is not passed, a module-level variable named requirement_2 is used if
    one exists; otherwise a NameError is raised as soon as the value is needed.

    NOTE(review): display and HTML are not imported in the visible cells;
    presumably they come from IPython.display - confirm.
    """
    if requirement_2 is None:
        # The original code read a module-level name; keep honouring it.
        requirement_2 = globals().get('requirement_2')

    display(HTML('<!DOCTYPE html><html><title>W3.CSS</title><meta name="viewport" content="width=device-width, initial-scale=1"><link rel="stylesheet" href="https://www.w3schools.com/w3css/4/w3.css"><h2>Recommendation</h2></html>'))

    for index, row in resume_df.iterrows():
        try:
            phone_number = int(row['PhoneNumber'])
        except (TypeError, ValueError):
            # No (usable) phone number was extracted from this resume.
            continue

        # Filter once and reuse the matching row for every field.
        matches = filtered_dataframe[filtered_dataframe['Handphone'] == phone_number]
        if matches.empty:
            continue
        candidate = matches.iloc[0]
        name = candidate['Name']

        if str(candidate['Applied Before']).strip().lower() == 'yes':
            _recommendation_card('w3-red', "Candidate "+ name +" "+ candidate['Comments'])
            continue

        if requirement_2 is None:
            raise NameError(
                "requirement_2 is not defined; pass the experience requirement "
                "as Recommendation(resume_df, requirement_2=...)")

        # Accept any number of digits anywhere in the requirement text.
        months_required = re.search(r'\d+', str(requirement_2))
        if months_required and int(months_required.group()) <= candidate['Experience in Months']:
            _recommendation_card('w3-green', "Candidate " + name + " matches both requirements")


In [None]:
# Display the recommendation cards for the processed resumes.
Recommendation(resume_df)