In [1]:
# ! pip install PyPDF2
# ! pip install PyMuPDF
# ! pip install Pillow
# ! pip install pdfquery

In [2]:
import numpy as np
import pandas as pd

from pypdf import PdfReader
from PIL import Image
import fitz
import pdfquery

import os
root_path = "D:\Capital Placement\data\data"

### Initial EDA on Resume

In [3]:
# Category

category = os.listdir(root_path)
print("No of category :", len(category))
print(category)

No of category : 24
['ACCOUNTANT', 'ADVOCATE', 'AGRICULTURE', 'APPAREL', 'ARTS', 'AUTOMOBILE', 'AVIATION', 'BANKING', 'BPO', 'BUSINESS-DEVELOPMENT', 'CHEF', 'CONSTRUCTION', 'CONSULTANT', 'DESIGNER', 'DIGITAL-MEDIA', 'ENGINEERING', 'FINANCE', 'FITNESS', 'HEALTHCARE', 'HR', 'INFORMATION-TECHNOLOGY', 'PUBLIC-RELATIONS', 'SALES', 'TEACHER']


In [4]:
# Resumes inside each category

pdf_count = 0
category_summary = {}

for cat in category:
    # Fetch all PDF Identifier
    records = os.listdir(root_path + "\\" + cat)
    
    page = []
    for pdf in records:
        pdf_obj = PdfReader(root_path + "\\" + cat + "\\" + pdf)
        page.append(len(pdf_obj.pages))
    
    page = np.array(page)
    
    summary = {}
    summary["count"] = int(len(page))
    summary["min"] = min(page)
    summary["max"] = max(page)
    summary["average"] = np.mean(page)
    
    category_summary[cat] = summary
    pdf_count += len(records)

print("Total no of PDF's :", pdf_count)    

category_df = pd.DataFrame(category_summary).T
category_df

Total no of PDF's : 2484


Unnamed: 0,count,min,max,average
ACCOUNTANT,118.0,1.0,5.0,2.042373
ADVOCATE,118.0,1.0,5.0,2.067797
AGRICULTURE,63.0,1.0,5.0,2.095238
APPAREL,97.0,1.0,4.0,1.938144
ARTS,103.0,1.0,3.0,1.932039
AUTOMOBILE,36.0,1.0,6.0,2.0
AVIATION,117.0,1.0,4.0,1.871795
BANKING,115.0,1.0,4.0,1.878261
BPO,22.0,1.0,4.0,2.136364
BUSINESS-DEVELOPMENT,120.0,1.0,4.0,1.908333


In [5]:
# Extract Images from Resumes

img_count = 0

for cat in category:
    # Fetch all PDF Identifier
    records = os.listdir(root_path + "\\" + cat)
    
    for pdf in records:
        pdf_file = fitz.open(root_path + "\\" + cat + "\\" + pdf)
        for page_num in range(len(pdf_file)):
            img_count += len(pdf_file[page_num].get_images()) # Extracts number of Images from a Page
    
print("Number of Images :", img_count)

Number of Images : 0


### Insights

- Total Number of Resumes: 2484
- Total Number of Categories: 24
- There are no images in the Resumes.
- The average number of pages is 2.
- There are Resumes with more than 4 pages; the highest is a 9-page resume in the field of public relations.
- IT and Business Development have the highest number of resumes (120), while the least is in the Automobile category with 36.

#### Note
- Tried to find the number of tables and the structure of the resume and assign a **score** to the resume based on **readability  and structure**.
- Since all PDFs follow the same template, there's no need for that. 
- However, in the future, if we are working with real world resumes, **tables and structure should be taken into account**.

## Dataset Creation

- Thought of using pdfquery, but it's very slow and not suitable for bulk extraction.

In [6]:
pdf = pdfquery.PDFQuery(r"D:\Capital Placement\data\data\INFORMATION-TECHNOLOGY\10553553.pdf")
pdf.load()
print(pdf.pq('LTTextLineHorizontal:contains("Experience")').text())

pdf = pdfquery.PDFQuery(r"D:\Capital Placement\data\data\INFORMATION-TECHNOLOGY\10553553.pdf")
print(pdf.extract([('with_formatter', 'text'), ('Summary', 'LTTextLineHorizontal:contains("Summary")'),]))

Experience
{'Summary': 'Summary'}


#### Extracting raw data using **PyMuPDF**, since it's faster than PyPDF2 and PDFMiner.

In [7]:
data = {}

for each_cat in category:
    
    # Fetch all PDF Identifier
    records = os.listdir(root_path + "\\" + each_cat)
    
    for each_pdf in records:
        
        # PDF Path
        pdf_path = root_path + "\\" + each_cat + "\\" + each_pdf
        
        # Document Object
        pdf_file = fitz.open(pdf_path)
        
        # Extract Text
        text = ""
        for page in range(len(pdf_file)):
            text += pdf_file[page].get_text()
        
        data[each_pdf.split(".")[0]] = {"Category" : each_cat, "Pages" : len(pdf_file), "Data" : text}
        
each_cat = None
each_pdf = None
pdf_path = None
pdf_file = None
text = None
page = None

In [8]:
df = pd.DataFrame(data).T
df

Unnamed: 0,Category,Pages,Data
10554236,ACCOUNTANT,5,ACCOUNTANT\nSummary\nFinancial Accountant spec...
10674770,ACCOUNTANT,2,STAFF ACCOUNTANT\nSummary\nHighly analytical a...
11163645,ACCOUNTANT,2,ACCOUNTANT\nProfessional Summary\nTo obtain a ...
11759079,ACCOUNTANT,2,SENIOR ACCOUNTANT\nExperience\nCompany Name Ju...
12065211,ACCOUNTANT,2,SENIOR ACCOUNTANT\nProfessional Summary\nSenio...
...,...,...,...
86322251,TEACHER,2,READING TEACHER\nSummary\nI am a highly motiva...
86597425,TEACHER,3,HISTORY TEACHER\nProfessional Summary\nTo be e...
90363254,TEACHER,2,"TEACHER\nSummary\nHighly ethical, dependable, ..."
96547039,TEACHER,2,TEACHER\nSummary\nTalented early education pro...


### Testing 

In [9]:
for _ in range(10):

    test_df = df.sample(n = 1)

    test_text = ""
    test_pdf = fitz.open(root_path + "\\" + test_df["Category"][0] + "\\" + test_df.index[0] + ".pdf")
    for page in range(test_df["Pages"][0]):
        test_text += test_pdf[page].get_text()

    print(test_df["Data"][0] == test_text)

    test_df = None
    test_pdf = None
    test_text = None

True
True
True
True
True
True
True
True
True
True


In [10]:
df.to_csv("Extracted_Data.csv")