## Importing Libraries

In [6]:
import spacy
import pdfminer
import re
import os
import pandas as pd
import pdf2txt
nlp = spacy.load("en_core_web_sm")

## Making a custom function to read pdf files

In [7]:
def convert_pdf(f):
    output_filename = os.path.basename(os.path.splitext(f)[0]) + ".txt"
    output_filepath = os.path.join("output/txt/", output_filename)
    pdf2txt.main(args=[f, "--outfile", output_filepath])
    print(output_filepath + " saved successfully!!!")
    return open(output_filepath).read()

## Creating a dictionary

In [8]:
result_dict = {'name': [], 'phone': [], 'email': [], 'skills': []} 
names = []
phones = []
emails = []
skills = []

## Parsing all the pdf text data

In [9]:
def parse_content(text):
    skillset = re.compile("python|java|sql|hadoop|tableau")
    phone_num = re.compile(
        "(\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4})"
    )
    doc = nlp(text)
    name = [entity.text for entity in doc.ents if entity.label_ == "PERSON"][0]
    print(name)
    email = [word for word in doc if word.like_email == True][0]
    print(email)
    phone = str(re.findall(phone_num, text.lower()))
    skills_list = re.findall(skillset, text.lower())
    unique_skills_list = str(set(skills_list))
    names.append(name)
    emails.append(email)
    phones.append(phone)
    skills.append(unique_skills_list)
    print("Extraction completed successfully!!!")

## Creating and writing the scraped data from pdf

In [10]:
for file in os.listdir('resumes/'):
    if file.endswith('.pdf'):
        print('Reading.....' + file)
        txt = convert_pdf(os.path.join('resumes/',file))
        parse_content(txt)

Reading.....Alisson ParkerCV.pdf
output/txt/Alisson ParkerCV.txt saved successfully!!!
Alisson Parker-Wright                                                                                                 
alli1414parks@mail.com
Extraction completed successfully!!!
Reading.....Angelica Astrom.pdf
output/txt/Angelica Astrom.txt saved successfully!!!
Lorem
someone@example.com
Extraction completed successfully!!!
Reading.....AshleyMilesCV.pdf
output/txt/AshleyMilesCV.txt saved successfully!!!
Ashley Miles
ashleymiles@memail.com
Extraction completed successfully!!!
Reading.....John DominicCV.pdf
output/txt/John DominicCV.txt saved successfully!!!
John Dominic
johndominic@mail.com
Extraction completed successfully!!!


## Creating Dataframes

In [11]:
result_dict['name'] = names
result_dict['phone'] = phones
result_dict['email'] = emails
result_dict['skills'] = skills

In [12]:
result_df = pd.DataFrame(result_dict)
result_df

Unnamed: 0,name,phone,email,skills
0,Alisson Parker-Wright ...,['8569878511'],alli1414parks@mail.com,"{'tableau', 'python', 'java'}"
1,Lorem,['(212) 555-1234'],someone@example.com,set()
2,Ashley Miles,['6592251422'],ashleymiles@memail.com,"{'tableau', 'sql'}"
3,John Dominic,['7877756411'],johndominic@mail.com,"{'python', 'java', 'hadoop'}"


## Creating a CSV file

In [13]:
result_df.to_csv("Parsed_Data.csv")

## Reading created CSV file

In [14]:
df = pd.read_csv("Parsed_Data.csv")
df

Unnamed: 0.1,Unnamed: 0,name,phone,email,skills
0,0,Alisson Parker-Wright ...,['8569878511'],alli1414parks@mail.com,"{'tableau', 'python', 'java'}"
1,1,Lorem,['(212) 555-1234'],someone@example.com,set()
2,2,Ashley Miles,['6592251422'],ashleymiles@memail.com,"{'tableau', 'sql'}"
3,3,John Dominic,['7877756411'],johndominic@mail.com,"{'python', 'java', 'hadoop'}"
