### Converting into pdfs

In [1]:
import os
from docx2pdf import convert
from PyPDF2 import PdfReader,PdfWriter#manipulating the pdf
from reportlab.lib.pagesizes import A4 #creating from scratch and manipulating the pdfs
from reportlab.pdfgen import canvas# canvas provides a flexible way to place text, images, graphics, and other elements at specific positions on the PDF page.

In [2]:
#convert docx to pdf
def convert_docx_to_pdf(doc_file,output_pdf):
    convert(doc_file,output_pdf)

In [3]:
#convert text to pdf
def convert_text_to_pdf(text_file,output_pdf):
    with open(text_file,'r') as file:
        text=file.read()
        #creating the outputpdf layout
        c = canvas.Canvas(output_pdf, pagesize=A4)
        width, height = A4
        top_margin=height-50
        left_margin=50
        lines=text.split('\n')
        for line in lines:
            c.drawString(left_margin,top_margin,line)
            top_margin-=15
        c.save()

In [4]:
#coping the pdf files to another folder
def copy_pdf_to_folder(pdf_file, output_folder):
    pdf_reader = PdfReader(pdf_file)
    pdf_writer = PdfWriter()

    for page in pdf_reader.pages:
        pdf_writer.add_page(page)

    output_pdf = os.path.join(output_folder, os.path.basename(pdf_file))
    with open(output_pdf, 'wb') as f:
        pdf_writer.write(f)


In [5]:
if __name__ == "__main__":
    input_folder = "C:/Users/user/Desktop/resumes"
    output_folder = "C:/Users/user/Desktop/pdfresumes"
    #creates a folder if it doesnot exits
    os.makedirs(output_folder, exist_ok=True)

    for filename in os.listdir(input_folder):
        file_path = os.path.join(input_folder, filename)

        if filename.lower().endswith(".txt"):
            output_pdf = os.path.join(output_folder, filename.replace(".txt", ".pdf"))
            convert_text_to_pdf(file_path, output_pdf)

        elif filename.lower().endswith(".docx"):
            output_pdf = os.path.join(output_folder, filename.replace(".docx", ".pdf"))
            convert_docx_to_pdf(file_path, output_pdf)

        elif filename.lower().endswith(".pdf"):
            copy_pdf_to_folder(file_path, output_folder)

    print("Conversion completed. PDFs are saved in the output folder.")


Conversion completed. PDFs are saved in the output folder.


### By using NLTK

### Extracting the information of name,email,skills,phonenumber from the resumes

In [6]:
import os
import re
import PyPDF2
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk

# Download missing NLTK resources
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [7]:
# Function to extract skills using NLTK
def extract_skills_nltk(text):
    sentences = sent_tokenize(text)
    skills = []

    for sentence in sentences:
        words = nltk.word_tokenize(sentence)
        tagged_words = pos_tag(words)
        named_entities = ne_chunk(tagged_words)
        
        for entity in named_entities:
            if isinstance(entity, nltk.Tree):
                skill = " ".join([word for word, tag in entity.leaves()])
                if any(tag == 'NNP' for word, tag in entity.leaves()):
                    skills.append(skill)
    
    return list(set(skills))

# Function to extract contact details using regular expressions
def extract_contact_info(text):
    name = re.findall(r'^[A-Z][a-zA-Z]+(?:\s[A-Z][a-zA-Z]+)*', text)
    email = re.findall(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', text)
    return name, email

if __name__ == "__main__":
    resume_folder = "C:/Users/user/Desktop/pdfresumes"
    resumes = []
    
    for filename in os.listdir(resume_folder):
        if filename.endswith(".pdf"):
            with open(os.path.join(resume_folder, filename), "rb") as resume_file:
                pdf_reader = PyPDF2.PdfReader(resume_file)
                text = ""
                for page in pdf_reader.pages:
                    text += page.extract_text()
                resumes.append({"filename": filename, "text": text})

    # Step 3: Text Analysis - Extract skills and details
    for resume in resumes:
        resume["skills"] = extract_skills_nltk(resume["text"])
        name, email = extract_contact_info(resume["text"])
        resume["name"] = name[0] if name else "Not Found"
        resume["email"] = email[0] if email else "Not Found"

    # Print details of all resumes
    for resume in resumes:
        print(f"Resume: {resume['filename']}")
        print("Name:", resume["name"])
        print("Email:", resume["email"])
        print("Skills (NLTK):", ", ".join(resume["skills"]))
        print("-" * 30)


Resume: Alice Johnson.pdf
Name: Alice Johnson
Email: alice.johnson@email.com
Skills (NLTK): Social Media Platforms, Twitter Analytics, Elm Street, BBA, Anytown, Google Analytics, Johnson, Facebook Insights Content, USA, ROI, Instagram, Campaign, Influencer, Business Administration, Email, Alice, Anytown University, Digital
------------------------------
Resume: Daniel Cooper.pdf
Name: Daniel Cooper
Email: daniel.cooper@email.com
Skills (NLTK): Microsoft Excel, Product Launch, Daniel, Salesforce Data Analysis, BBA, Sunflower Lane, Negotiation, Key, USA, Sales CRM, Meadowville University, Business Administration, Meadowville, Sales Manager Dynamic Solutions, New, Account Management
------------------------------
Resume: David Wilson.pdf
Name: David Wilson
Email: david.wilson@email.com
Skills (NLTK): Education, ESL, Oakwood Drive, Grades, USA, David, English, Suburbia College, School Teacher, ABC Elementary School, Suburbia, Objective, Wilson, Improvement Program, State Teaching Certifica

### Extracting the Resume score

In [8]:
from PyPDF2 import PdfReader
import os

In [9]:
def extract_text_from_pdf(pdf_file):
    pdf_reader = PdfReader(pdf_file)
    text_content = ""
    for page in pdf_reader.pages:
        text_content += page.extract_text()
    return text_content

In [10]:
if __name__ == "__main__":
    resume_folder = "C:/Users/user/Desktop/pdfresumes"
    resumes = []
    for filename in os.listdir(resume_folder):
        if filename.endswith(".pdf"):
            with open(os.path.join(resume_folder, filename), "rb") as resume_file:
                pdf_reader = PdfReader(resume_file)
                text = ""
                for page in range(len(pdf_reader.pages)):
                    text += pdf_reader.pages[page].extract_text()
                resumes.append({"filename": filename, "text": text})


In [11]:
# Define the skill sets you are looking for
desired_skills = input("Enter the desired skills (comma-separated): ").split(",")

Enter the desired skills (comma-separated): c,java,html,css,python


In [12]:
#Text Analysis and Score Calculation
for resume in resumes:
    skills_found = [skill for skill in desired_skills if skill.lower() in resume["text"].lower()]
    resume["score"] = len(skills_found)

In [13]:
 #Ranking
resumes.sort(key=lambda x: x["score"], reverse=True)

In [14]:
#Visualization
for resume in resumes:
    print(f"{resume['filename']} - Score: {resume['score']}")

Michael Smith.pdf - Score: 5
John Doe.pdf - Score: 3
Emily Chen.pdf - Score: 2
Alice Johnson.pdf - Score: 1
Daniel Cooper.pdf - Score: 1
David Wilson.pdf - Score: 1
Emma Adams.pdf - Score: 1
Olivia Martinez.pdf - Score: 1
Robert Davis.pdf - Score: 1
Sophia Lee.pdf - Score: 1
William Turner.pdf - Score: 1


### Extract the details of the perfect candidate for the job

In [15]:
# Function to extract contact details using regular expressions
def extract_contact_info(text):
    name = re.findall(r'^[A-Z][a-zA-Z]+(?:\s[A-Z][a-zA-Z]+)*', text)
    email = re.findall(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', text)
    phone_number = re.findall(r'\b\d{10}\b', text)
    address = re.findall(r'\b\d+\s+[^,\n]+,[^,\n]+,[^,\n]+\b', text)
    return name, email, phone_number, address

if __name__ == "__main__":
    resume_folder = "C:/Users/user/Desktop/pdfresumes"
    resumes = []
    
    for filename in os.listdir(resume_folder):
        if filename.endswith(".pdf"):
            with open(os.path.join(resume_folder, filename), "rb") as resume_file:
                pdf_reader = PyPDF2.PdfReader(resume_file)
                text = ""
                for page in pdf_reader.pages:
                    text += page.extract_text()
                resumes.append({"filename": filename, "text": text})

    #  Text Analysis and Score Calculation
    for resume in resumes:
        skills_found = [skill.strip().lower() for skill in desired_skills if skill.strip().lower() in resume["text"].lower()]
        resume["score"] = len(skills_found)

    #  Find the maximum score
    max_score = max(resume["score"] for resume in resumes)

    # Extract and Print details of all resumes with the highest score
    print("Details of the resumes with the highest score:")
    for resume in resumes:
        if resume["score"] == max_score:
            skills_nltk = extract_skills_nltk(resume["text"])
            name, email, phone_number, address = extract_contact_info(resume["text"])
            
            print(f"Resume: {resume['filename']}")
            print("Name:", name[0] if name else "Not Found")
            print("Address:", address[0] if address else "Not Found")
            print("Phone Number:", phone_number[0] if phone_number else "Not Found")
            print("Email:", email[0] if email else "Not Found")
            print("Skills (NLTK):", ", ".join(skills_nltk))
            print("-" * 30)

Details of the resumes with the highest score:
Resume: Michael Smith.pdf
Name: Michael Smith
Address: 456 Oak Avenue, Cityville, State
Phone Number: Not Found
Email: michael.smith@email.com
Skills (NLTK): MongoDB, Scrum Master Certification, MySQL, Python Web, CSS, APIs, Software Engineer Tech Innovators, Hibernate, Online, Objective, Java, Oak Avenue, State, CSM, Michael, Cityville, JavaScript Frameworks, Programmer, Computer Science City University, Science, Upgrade, Oracle Certified Professional, Java SE
------------------------------
