In [1]:
# Text Extraction
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
import io
from docx2python import docx2python
from bs4 import BeautifulSoup

# -------------------------------------------------

# NER
import spacy

# -------------------------------------------------

# O-NET
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

# --------------------------------------------------

# Project Description
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

import re
from typing import List, Any
import warnings
import os
import pickle
import time

warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(


In [3]:

try:
    nltk.download("averaged_perceptron_tagger")
    spacy.load("en_core_web_sm")
except:
    # !python -m spacy download en_core_web_md
    !python -m spacy download en_core_web_sm

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\tharu\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
def duration(func: Any) -> Any:
    @wraps(func)
    def wrapper(*args, **kwargs) -> Any:
        start_time = time.time()
        result = func(*args, **kwargs)
        end_time = time.time()
        duration = end_time - start_time
        print(f"Function {func.__name__} took {duration:.4f} seconds")
        return result

    return wrapper

# Text Extraction


In [4]:
class File2Text:
    """
    A class to extract text content from various types of files such as PDFs, DOCX, and HTML files.

    Attributes:
        np_of_pages (int): The number of pages in the file.
        text (str): The extracted text from the file.
        file_path (str): The path to the file.
        file_type (str): The type of the file (e.g., "pdf", "docx", "html").

    Methods:
        __init__(file_path):
            Initializes the File2Text object with the given file path.

        get_text():
            Extracts and returns the text content from the file.

        __extractPdfText():
            Private method to extract text from a PDF file.

        __extractDocx2Text():
            Private method to extract text from a DOCX file.

        __extractHtml2Text():
            Private method to extract text from an HTML file.

        __cleanText(text):
            Private method to clean and preprocess the extracted text.
    """

    def __init__(self, file_path: str) -> None:
        """
        Initializes the File2Text object with the given file path.

        Args:
            file_path (str): The path to the file.
        """
        self.file_path = file_path
        self.file_type = file_path.split(".")[-1]

    def get_text(self) -> List[str]:
        """
        Extracts and returns the text content from the file.

        Returns:
            List[str]: A list containing the extracted text.
        """
        __nlp = spacy.load("en_core_web_sm")

        if self.file_type == "pdf":
            self.__extractPdfText()
        elif self.file_type == "docx":
            self.__extractDocx2Text()
        elif self.file_type == "html":
            self.__extractHtml2Text()
        else:
            raise ValueError("File type not supported")
        return __nlp(self.__cleanText(self.text)).text

    def __extractPdfText(self) -> None:
        """
        Private method to extract text from a PDF file.
        """
        i_f = open(self.file_path, "rb")
        resMgr = PDFResourceManager()
        retData = io.StringIO()
        TxtConverter = TextConverter(resMgr, retData, laparams=LAParams())
        interpreter = PDFPageInterpreter(resMgr, TxtConverter)
        for page in PDFPage.get_pages(i_f):
            interpreter.process_page(page)
        self.text = retData.getvalue()

    def __extractDocx2Text(self) -> None:
        """
        Private method to extract text from a DOCX file.
        """
        self.text = docx2python(self.file_path).text

    def __extractHtml2Text(self) -> None:
        """
        Private method to extract text from an HTML file.
        """
        with open(self.file_path, "r", encoding="utf-8") as file:
            html_content = file.read()
        soup = BeautifulSoup(html_content, "html.parser")
        self.text = soup.get_text()

    def __cleanText(self, text: str) -> str:
        """
        Private method to clean and preprocess the extracted text.

        Args:
            text (str): The text to be cleaned.

        Returns:
            str: The cleaned text.
        """
        # text = text.replace("\n", " ")
        text = text.replace("\r", " ")
        # text = text.replace("\t", " ").replace("\xa0", " ")
        text = text.replace("\x0c", " ").replace("\x0b", " ")
        text = text.replace("\x0e", " ").replace("\x0f", " ").replace("\x1c", " ")

        text = re.sub("http\S+\s", " ", text)
        text = re.sub("RT|cc", " ", text)
        text = re.sub("#\S+\s", " ", text)
        text = re.sub("@\S+", "  ", text)
        text = re.sub(
            "[%s]" % re.escape("""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), " ", text
        )
        text = re.sub(r"[^\x00-\x7f]", " ", text)
        text = re.sub("\s+", " ", text)
        return text

In [5]:
f2t = File2Text("test/easy_resume_level1_a.pdf")

In [6]:
resumeText = f2t.get_text()
resumeText

'Prasham Sheth Data Scientist Phone 1 516 707 1668 Email p d sheth LinkedIn SUMMARY I currently work as a Data Scientist at the SLB Software Technology Innovation Center STIC in Menlo Park California My research interests include Machine Learning and Deep Learning based approaches for solving complex problems in the fields of Computer Vision Prognostic and Health Management and Time Series Analysis Further I am focusing on Hybrid modeling techniques involving Physics Informed Machine Learning EDUCATION Columbia University Master of Science in Data Science GPA 4 08 4 00 Coursework Machine Learning Applied Machine Learning Applied Deep Learning Statistical Inference Modeling Personalization Theory Natural Language Processing Algorithms for Data Science Computer Systems Exploratory Data Analysis and Visualization Nirma University Bachelor of Technology in Computer Engineering GPA 9 50 10 Rank 2 900 Coursework Machine Learning Deep Learning Artificial Intelligence Linear Algebra Algorithms

# Resume Section


In [7]:
def extract_project_section(text: str) -> Any:
    pattern = r"(?i)\b(?:project|developed|implemented|utilized)\b[\s\w,;:]*"
    return re.findall(pattern, resumeText)


def extract_education_section(text: str) -> Any:
    pattern = r"(?i)education(?:[\s\w,;:()-]*graduated[\s\w,;:()-]*:(.*?)(?=(?:[\s\w,;:()-]*\b\d{4}\b)|(?:[\s\w,;:()-]*$)))"
    education_sections = re.finditer(pattern, resumeText)
    return str(education_sections.group())


def extract_skills(resumeText):
    skills = []

    project_descriptions = extract_project_section(resumeText)

    for description in project_descriptions:
        # Tokenize words and tag parts of speech
        words = word_tokenize(description)
        tagged_words = pos_tag(words)

        # Define grammar for noun phrases
        grammar = "NP: {<JJ>*<NN|NNS>+}"

        # Create a chunk parser with the defined grammar
        chunk_parser = nltk.RegexpParser(grammar)
        chunked_words = chunk_parser.parse(tagged_words)

        # Extract the noun phrases (potential skills)
        for subtree in chunked_words.subtrees(filter=lambda t: t.label() == "NP"):
            skill = " ".join([word for word, tag in subtree.leaves()])
            skills.append(skill)

    return list(set(skills))

In [8]:
CUSTOM_NER_MODEL_PATH = os.path.join("models/ner/JdModel/output", "model-best")

In [9]:
resume_ner_model = spacy.load(CUSTOM_NER_MODEL_PATH)

In [10]:
resume_ner_model.pipe_labels

{'transformer': [],
 'ner': ['CERTIFICATION', 'DEGREE', 'EXPERIENCE', 'JOBPOST', 'SKILLS']}

In [11]:
resume_section = {
    label.capitalize(): [] for label in resume_ner_model.pipe_labels["ner"]
}
resume_section["Projects"] = (
    extract_project_section(resumeText)
    if len(extract_project_section(resumeText)) > 0
    else []
)
resume_section["Education"] = []

In [12]:
doc = resume_ner_model(resumeText)
for ent in doc.ents:
    resume_section[ent.label_.capitalize()].append(ent.text)

In [13]:
if len(extract_skills(resume_section["Projects"])) == 0:
    resume_section["Skills"] = extract_skills(resume_section["Projects"])

In [14]:
if len(resume_section["Education"]) != 0:
    resume_section["Education"] = extract_education_section(resumeText)

In [15]:
resume_section

{'Certification': [],
 'Degree': ['Bachelor of Technology in Computer Engineering GPA 9 50 10'],
 'Experience': [],
 'Jobpost': [],
 'Skills': ['Deep Learning',
  'Deep Learning',
  'TensorFlow',
  'CNN',
  'Python',
  'SQL',
  'R',
  'Java C C Tools',
  'Scikit',
  'NumPy',
  'PyTorch',
  'OpenCV',
  'Scipy',
  'BigQuery',
  'Oracle',
  'MongoDB',
  'LaTeX'],
 'Projects': ['Developed Physics Informed Machine Learning based Hybrid Framework to create an advisory system that identifies the Menlo Park CA Feb 2021 Present New York NY Dec 2020 Ahmedabad India May 2019 regions with risky Stick Slip conditions and outputs an optimal operating window for drilling the future stands Devised Hybrid framework using Physics Informed Machine Learning for digitally generating LWD logs Gamma Ray logs in real time to increase efficiency and robustness of log collection process Working towards implementation of Physics Inspired Machine Learning based toolbox for Time Series data Computer Vision Researc

In [16]:
for label, entities in resume_section.items():
    if len(entities) > 0:
        print(label.upper())
        print("-" * len(label))
        for ent in entities:
            print("* ", ent)
        print("------" * 10)

DEGREE
------
*  Bachelor of Technology in Computer Engineering GPA 9 50 10
------------------------------------------------------------
SKILLS
------
*  Deep Learning
*  Deep Learning
*  TensorFlow
*  CNN
*  Python
*  SQL
*  R
*  Java C C Tools
*  Scikit
*  NumPy
*  PyTorch
*  OpenCV
*  Scipy
*  BigQuery
*  Oracle
*  MongoDB
*  LaTeX
------------------------------------------------------------
PROJECTS
--------
*  Developed Physics Informed Machine Learning based Hybrid Framework to create an advisory system that identifies the Menlo Park CA Feb 2021 Present New York NY Dec 2020 Ahmedabad India May 2019 regions with risky Stick Slip conditions and outputs an optimal operating window for drilling the future stands Devised Hybrid framework using Physics Informed Machine Learning for digitally generating LWD logs Gamma Ray logs in real time to increase efficiency and robustness of log collection process Working towards implementation of Physics Inspired Machine Learning based toolbox for

# Find Job Title


In [17]:
NER_MODEL_PATH = os.path.join("models", "jobtitle")

In [18]:
def formatJTPayload(dct: dict) -> str:
    """
    Formats the dictionary into a string.
    """
    formatted_str = ""
    for key, value in dct.items():
        formatted_str += f"{key}: " + " ".join(str(i) for i in value) + "\n"
    return formatted_str


def findJobTitle(payload: str) -> str:
    classifier = pickle.load(
        open(os.path.join(NER_MODEL_PATH, "OneVsRestClassifier.pkl"), "rb")
    )
    vectorizer = pickle.load(
        open(os.path.join(NER_MODEL_PATH, "TfidfVectorizer.pkl"), "rb")
    )
    label_encoder = pickle.load(
        open(os.path.join(NER_MODEL_PATH, "LabelEncoder.pkl"), "rb")
    )

    return label_encoder.inverse_transform(
        classifier.predict(vectorizer.transform([payload]))
    )[0]

In [19]:
jtPayload = {
    "Projects": [],
    "Education": [],
    "Skills": [],
    "Experience": [],
}

for label, entities in resume_section.items():
    if len(entities) > 0:
        if label in jtPayload:
            jtPayload[label] = entities
    else:
        if label in jtPayload:
            del jtPayload[label]

jtPayload = formatJTPayload(jtPayload)

In [20]:
jobTitle = findJobTitle(jtPayload)
jobTitle

'Data Science'

# Find O-NET Occupation


In [21]:
O_NET_MODEL_NAME = "msmarco-distilbert-base-tas-b"

O_NET_DATASTORE_PATH = os.path.join("dataset/ONET", "2019_Occupations.csv")

In [22]:
print("Loading pretrained model...")
st_model = SentenceTransformer(O_NET_MODEL_NAME)

Loading pretrained model...


In [23]:
def encode_text(text: str) -> Any:
    return st_model.encode([text])[0]

In [24]:
def find_best_match_sentence_transformers(
    input_text: str, data_list: pd.DataFrame
) -> tuple:
    input_encoding = encode_text(input_text)

    best_similarity = -1
    best_match = None
    best_match_index = None

    for i, title in enumerate(data_list):
        title_encoding = encode_text(title)
        similarity = cosine_similarity([input_encoding], [title_encoding])[0, 0]

        if similarity > best_similarity:
            best_similarity = similarity
            best_match = title
            best_match_index = i

    return best_match, best_match_index

In [25]:
df = pd.read_csv(O_NET_DATASTORE_PATH)
df.head()

Unnamed: 0,O*NET-SOC 2019 Code,O*NET-SOC 2019 Title,O*NET-SOC 2019 Description
0,11-1011.00,Chief Executives,Determine and formulate policies and provide o...
1,11-1011.03,Chief Sustainability Officers,"Communicate and coordinate with management, sh..."
2,11-1021.00,General and Operations Managers,"Plan, direct, or coordinate the operations of ..."
3,11-1031.00,Legislators,"Develop, introduce, or enact laws and statutes..."
4,11-2011.00,Advertising and Promotions Managers,"Plan, direct, or coordinate advertising polici..."


In [26]:
codes = df["O*NET-SOC 2019 Code"]
titles = df["O*NET-SOC 2019 Title"]
description = df["O*NET-SOC 2019 Description"]

#### Find the best match


In [27]:
print("Finding best match...")
best_match, best_match_index = find_best_match_sentence_transformers(jobTitle, titles)
best_match, best_match_index

Finding best match...


('Data Scientists', 142)

In [28]:
if best_match is not None:
    best_match_code = codes[best_match_index]
    print(f"Job Title: {jobTitle}")
    print("-----" * 20)
    print(f"Occupation: {best_match}")
    print("-----" * 20)
    print(f"Corresponding Code: {best_match_code}")
    print("-----" * 20)
    print(f"Description: {description[best_match_index]}")
else:
    print("\nNo similar match found.")

Job Title: Data Science
----------------------------------------------------------------------------------------------------
Occupation: Data Scientists
----------------------------------------------------------------------------------------------------
Corresponding Code: 15-2051.00
----------------------------------------------------------------------------------------------------
Description: Develop and implement a set of techniques or analytics applications to transform raw data into meaningful information using data-oriented programming languages and visualization software. Apply data mining, data modeling, natural language processing, and machine learning to extract and analyze information from large structured and unstructured datasets. Visualize, interpret, and report data findings. May create dynamic data reports.


# Extract Skill from Project Description


In [29]:
def extract_skills(resumeText: str) -> List[str]:
    skills = []

    pattern = r"(?i)\b(?:project|developed|implemented|utilized)\b[\s\w,;:]*"

    project_descriptions = re.findall(pattern, resumeText)

    for description in project_descriptions:

        # Tokenize words and tag parts of speech
        words = word_tokenize(description)
        tagged_words = pos_tag(words)

        # Define grammar for noun phrases
        grammar = "NP: {<JJ>*<NN|NNS>+}"

        # Create a chunk parser with the defined grammar
        chunk_parser = nltk.RegexpParser(grammar)
        chunked_words = chunk_parser.parse(tagged_words)

        # Extract the noun phrases (potential skills)
        for subtree in chunked_words.subtrees(filter=lambda t: t.label() == "NP"):
            skill = " ".join([word for word, tag in subtree.leaves()])
            skills.append(skill)

    return list(i for i in set(skills) if len(i) > 1)


if len(resume_section["Projects"]) > 0:
    project_skills = extract_skills(resumeText)
    print("\t\tSkills from Project Description\n", "----" * 17)
    for idx, i in enumerate(project_skills):
        print(f"{idx + 1} * ", i.capitalize())
else:
    print("No project descriptions found.")

		Skills from Project Description
 --------------------------------------------------------------------
1 *  Multimedia services
2 *  End
3 *  Sentence formation
4 *  Log collection process
5 *  Optimal operating window
6 *  State
7 *  Health
8 *  Smartphones
9 *  Uracy drop
10 *  Production setting
11 *  Towards implementation
12 *  Attributes
13 *  Datasets
14 *  Image processing
15 *  Data
16 *  Combination
17 *  Pictures
18 *  Logs
19 *  Low end
20 *  Conditions
21 *  Footprint
22 *  Edge devices
23 *  Field datasets
24 *  Scores
25 *  Future
26 *  Technique
27 *  Presence
28 *  Sequence model
29 *  Toolbox
30 *  Association
31 *  Core team
32 *  Reduced utility
33 *  Insights
34 *  Sentence level lip
35 *  Turnaround time
36 *  Promising test results
37 *  Comparison
38 *  Submission
39 *  Training quantization
40 *  Fps
41 *  Engineering way
42 *  Way
43 *  Document
44 *  Information
45 *  Processing
46 *  Parameters
47 *  Approaches
48 *  Images
49 *  Formulated models
50 *  Pro