## Data Extraction
### This notebook extracts the Skills and Education sections from the resume PDFs and stores them in a CSV file


In [1]:
# Imports
import os
import re
import pdfplumber
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
import logging
# Only let ERROR-and-above records through the "pdfminer" logger
# (presumably to quiet warnings emitted while the PDFs are parsed — confirm).
logging.getLogger("pdfminer").setLevel(logging.ERROR)

In [3]:
# Define a function to extract information from a PDF
def extract_information(pdf_path):
    """
    Extracts text from a PDF file and returns it as a string.

    The text of every page is joined with a single space and the result is
    stripped of leading/trailing whitespace. A page for which
    ``page.extract_text()`` yields ``None`` (no extractable text) is treated
    as an empty string instead of raising ``TypeError`` inside ``str.join``.

    Args:
        pdf_path (str): The path to the PDF file.
    Returns:
        str: The extracted text from the PDF ("" when no page yields text).
    """

    with pdfplumber.open(pdf_path) as pdf:
        # `or ""` guards against pages that have no extractable text.
        page_texts = [page.extract_text() or "" for page in pdf.pages]

    # One join over all pages instead of re-joining the accumulator per page.
    return " ".join(page_texts).strip()

In [4]:
# Define a function to extract Skills, and Education
def extract_details(resume_text):
    """
    Pulls the Skills and Education sections out of raw resume text.

    Each section is located with a regular expression anchored on its heading
    followed by a newline; only the first occurrence is used. When the
    capitalised ``Skills`` heading is absent, a lowercase ``skills`` heading
    is tried and everything after it is taken as the section.

    Args:
        resume_text (str): The text of the resume.
    Returns:
        dict: ``{'Skills': ..., 'Education': ...}`` where each value is the
        captured section text, or ``None`` if the heading was not found.
    """

    def _first_capture(pattern):
        # Captured group of the first match of `pattern`, or None if no match.
        hit = re.search(pattern, resume_text, re.DOTALL)
        return hit.group(1) if hit is not None else None

    # Primary heading: section runs until the next line that starts with an
    # uppercase letter (or the end of the text).
    skills = _first_capture(r'Skills\n([\s\S]*?)(?=\n[A-Z]|$)')
    if skills is None:
        # Fallback heading: take the remainder of the text.
        skills = _first_capture(r'skills\n((?:.*)*)')

    # Section runs until a line consisting of a single capitalised word
    # (or the end of the text).
    education = _first_capture(r'Education\n([\s\S]*?)(?=\n[A-Z][a-z]*\n|$)')

    return {'Skills': skills, 'Education': education}

In [5]:
%%time

data_folder = 'data'
resume_data = []

# Walk data/<category>/<id>.pdf and collect one record per PDF.
# os.listdir() returns entries in arbitrary order, so both listings are sorted
# to keep the row order of the resulting table reproducible across runs.
for category_folder in sorted(os.listdir(data_folder)):
    category_path = os.path.join(data_folder, category_folder)
    if not os.path.isdir(category_path):
        # Skip loose files sitting directly in the data folder.
        continue

    for pdf_file in sorted(os.listdir(category_path)):
        if not pdf_file.endswith('.pdf'):
            continue

        pdf_path = os.path.join(category_path, pdf_file)
        text = extract_information(pdf_path)
        details = extract_details(text)

        # Adding Category & ID
        # splitext removes only the trailing extension; str.replace would
        # also strip a '.pdf' that appears in the middle of a file name.
        details['ID'] = os.path.splitext(pdf_file)[0]
        details['Category'] = category_folder

        resume_data.append(details)

print('PDF Extraction Done!')

PDF Extraction Done!
CPU times: user 9min 53s, sys: 3.61 s, total: 9min 56s
Wall time: 10min 4s


In [6]:
# Build one table row per extracted resume record.
resume_df = pd.DataFrame(data=resume_data)

# Persist the table as CSV, without writing the positional index column.
resume_df.to_csv(path_or_buf='data/extracted_resume.csv', index=False)

In [7]:
# Dimensions of the extracted table as (rows, columns).
resume_df.shape

(2484, 4)