## Data Extraction
### This Notebook focuses on extracting skills and education from the PDFs and storing them as a csv


In [1]:
# Imports
import os
import re
import pdfplumber
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
import logging
logging.getLogger("pdfminer").setLevel(logging.ERROR)

In [3]:
# Define a function to extract information from a PDF
def extract_information(pdf_path):
    """
    Extracts text from a PDF file and returns it as a string.
    Args:
        pdf_path (str): The path to the PDF file.
    Returns:
        str: The extracted text from the PDF.
    """
    
    with pdfplumber.open(pdf_path) as pdf:
        resume_text = ""
        for page in pdf.pages:
            resume_text = " ".join([resume_text, page.extract_text()])
    resume_text = resume_text.strip()
    return resume_text

In [4]:
# Define a function to extract Skills, and Education
def extract_details(resume_text):
    """
    Extracts Skills and Education from the resume text.
    Args:
        resume_text (str): The text of the resume.
    Returns:
        dict: A dictionary containing the extracted Skills and Education.
    """
    
    # Define regular expressions to extract Skills & Education
    skills_pattern = r'Skills\n([\s\S]*?)(?=\n[A-Z]|$)' 
    education_pattern = r'Education\n([\s\S]*?)(?=\n[A-Z][a-z]*\n|$)'
    
    # Get Skills & Education
    skills_match = re.findall(skills_pattern, resume_text, re.DOTALL)
    education_match = re.findall(education_pattern, resume_text, re.DOTALL)
    
    # Skills & Education
    if len(skills_match)!=0:
        skills = skills_match[0]
    else:
        skills_pattern = r'skills\n((?:.*)*)' 
        skills_match = re.findall(skills_pattern, resume_text, re.DOTALL)
        if len(skills_match)!=0:
            skills = skills_match[0]
        else:
            skills = None
            
    if len(education_match)!=0:
        education = education_match[0]
    else:
        education = None
    
    return {
        'Skills': skills,
        'Education': education
    }

In [5]:
%%time

data_folder = 'data'
resume_data = []

# Iterate through sub-folders and PDF files
for category_folder in os.listdir(data_folder):
    category_path = os.path.join(data_folder, category_folder)
    if os.path.isdir(category_path):
        for pdf_file in os.listdir(category_path):
            if pdf_file.endswith('.pdf'):
                pdf_path = os.path.join(category_path, pdf_file)
                # print(pdf_path)
                text = extract_information(pdf_path)
                details = extract_details(text)
                
                # Adding Category & ID
                details['ID'] = pdf_file.replace('.pdf', '')
                details['Category'] = category_folder
                
                # print(f'File: [{pdf_path}]')
                # print(details, end='\n\n')
                resume_data.append(details)
                
print('PDF Extraction Done!')

PDF Extraction Done!
CPU times: user 9min 53s, sys: 3.61 s, total: 9min 56s
Wall time: 10min 4s


In [6]:
resume_df = pd.DataFrame(resume_data)
resume_df.to_csv('data/extracted_resume.csv', index=False)

In [7]:
resume_df.shape

(2484, 4)

### Checking Null Values

In [8]:
# Null values
resume_df.isna().sum()

Skills        91
Education    590
ID             0
Category       0
dtype: int64

Looks like Education was not extracted properly 

In [9]:
print(resume_df[(resume_df.Skills.isna() & resume_df.Education.isna())])

     Skills Education        ID                Category
141    None      None  14248724                    ARTS
261    None      None  16280971                   SALES
658    None      None  15499825              HEALTHCARE
1201   None      None  29051656  INFORMATION-TECHNOLOGY
1298   None      None  20237244  INFORMATION-TECHNOLOGY
1525   None      None  14849103            CONSTRUCTION
1554   None      None  14014749                DESIGNER
1607   None      None  90066849                DESIGNER
1758   None      None  17576030                 FITNESS
1760   None      None  21178545                 FITNESS
1828   None      None  77266989                 FITNESS
1934   None      None  12632728    BUSINESS-DEVELOPMENT
2082   None      None  31225895                 APPAREL
2183   None      None  17021141                ADVOCATE
2388   None      None  20566550             ENGINEERING


In [10]:
print(resume_df[~(resume_df['Skills'].isna() & resume_df['Education'].isna())].shape)

(2469, 4)


Removing PDFs with both Skills and Education as NA

In [11]:
df = resume_df[~(resume_df['Skills'].isna() & resume_df['Education'].isna())].reset_index(drop=True)

In [12]:
df.head()

Unnamed: 0,Skills,Education,ID,Category
0,Excellent classroom managementÂ,Subject Matter Authorization in Science: Scien...,37201447,AGRICULTURE
1,"Team mediation, Budget Management, Delegation ...","2009 Howard University ï¼​ City , State , USA ...",12674256,AGRICULTURE
2,"COMPUTER LITERACY, E-mail, English, government...","2011\nThe Universty of Zambia ï¼​ City , State...",29968330,AGRICULTURE
3,"C, C++, communication skills, designing, ELISA...","Masters of Science , Biotechnology 5 2013 Univ...",81042872,AGRICULTURE
4,"Data Entry, Printers, Clients, Loans, Tax Retu...",Wayne State University 2013 MBA : Linguistics ...,20006992,AGRICULTURE


In [13]:
df.isna().sum()

Skills        76
Education    575
ID             0
Category       0
dtype: int64

In [14]:
df[df.Skills.isna()]

Unnamed: 0,Skills,Education,ID,Category
34,,Bachelor of Arts : Criminal Justice California...,27689009,AGRICULTURE
51,,"Ph.D : Soil, Water and Environmental Science 2...",24001783,AGRICULTURE
61,,"Master : Accounting (GPA 3.7) , 2009 McCombs S...",56068028,AGRICULTURE
69,,2014\nBachelor of Science : Recording Arts Ful...,66226673,ARTS
80,,"Association Youth Art Month Exhibition, Young ...",11555549,ARTS
...,...,...,...,...
2331,,Bachelor of Science : Electrical and Computer ...,12748557,ENGINEERING
2346,,"Ph. D : Mechanical Engineering , April, 2015 U...",35172961,ENGINEERING
2362,,Graduate Certificate : Project Management 2010...,28631840,ENGINEERING
2371,,Master of Science : Mechanical Engineering Dec...,77828437,ENGINEERING


In [15]:
df[df.Education.isna()]

Unnamed: 0,Skills,Education,ID,Category
12,Time Management,,28165687,AGRICULTURE
16,"Deep expertise in designing,developing,\nimple...",,11813872,AGRICULTURE
22,"Customer service, art, clerical, doors, specia...",,79536879,AGRICULTURE
29,Production and Processing Â Basic computer skills,,16849128,AGRICULTURE
31,Strong interpersonal skills,,38216888,AGRICULTURE
...,...,...,...,...
2458,"vehicle repairs, database, inventory, marketin...",,16332293,AUTOMOBILE
2460,Professional and friendly.Careful and active l...,,23522150,AUTOMOBILE
2464,Claims file management processes,,11257723,AUTOMOBILE
2465,Quality control Solid communication skills,,11797122,AUTOMOBILE


In [16]:
df.Category.value_counts()

Category
BUSINESS-DEVELOPMENT      119
ACCOUNTANT                118
FINANCE                   118
CHEF                      118
INFORMATION-TECHNOLOGY    118
AVIATION                  117
ADVOCATE                  117
ENGINEERING               117
BANKING                   115
CONSULTANT                115
SALES                     115
HEALTHCARE                114
FITNESS                   114
PUBLIC-RELATIONS          111
CONSTRUCTION              111
HR                        110
DESIGNER                  105
TEACHER                   102
ARTS                      102
DIGITAL-MEDIA              96
APPAREL                    96
AGRICULTURE                63
AUTOMOBILE                 36
BPO                        22
Name: count, dtype: int64