In [1]:
import fitz  # For PDF extraction
import spacy  # For NLP
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the spaCy pipeline named "en_core_web_sm" (the model package must
# already be installed in the kernel's environment for this call to succeed).
# NOTE(review): `nlp` is not referenced by any of the cells visible in this
# notebook — confirm it is still needed before keeping the load.
nlp = spacy.load("en_core_web_sm")


In [3]:
# Helper function to extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Return the plain text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Location of the PDF file; passed unchanged to ``fitz.open``.

    Returns:
        str: The "text"-mode extraction of each page, joined with no separator.
    """
    # Opening the document as a context manager guarantees it is closed (and
    # its file handle released) even if extraction fails part-way through.
    with fitz.open(pdf_path) as doc:
        # Iterating the document yields its pages in order; join avoids the
        # repeated string reallocation of `text += ...` on long PDFs.
        return "".join(page.get_text("text") for page in doc)

# Helper function to split resume text into sections (e.g., Education, Experience, Skills)
import re

def split_resume_into_sections(resume_text):
    """Split resume text into sections keyed by a canonical heading name.

    The text is lowercased, the first heading for each known section is
    located, and each section's content is the slice from its heading up to
    the next located heading (or the end of the text). The heading itself is
    kept as the first words of the section content.

    Heading lookup prefers, in order: a line consisting only of the heading
    (optionally followed by a colon), a line that starts with the heading,
    and finally the first occurrence of the heading phrase anywhere. This
    keeps a keyword inside a summary sentence from being mistaken for the
    real section heading when a proper heading line exists.

    Args:
        resume_text: Full resume text. Empty or ``None`` yields ``{}``.

    Returns:
        dict: ``{section name: lowercased, stripped section text}`` in the
        order the sections appear in the resume. Sections whose heading is
        not found are omitted.
    """
    # Every pattern has exactly one capture group: the heading phrase itself.
    section_patterns = {
        'Education': r'\b(education|academic background)\b',
        # The look-behinds stop the "experience" inside "additional experience"
        # or "other experience" from also being claimed as the Experience
        # heading, which would cut that section down to a single word.
        'Experience': r'\b(?<!additional\s)(?<!other\s)(experience|employment history|work history)\b',
        'Skills': r'\b(skills|abilities|expertise|technical skills)\b',
        'Certifications': r'\b(certifications|licenses|courses)\b',
        'Projects': r'\b(projects|side projects|personal projects)\b',
        'Additional Experience': r'\b(additional experience|other experience)\b'
    }

    if not resume_text:
        return {}

    # Normalize text to lowercase; all offsets below refer to this string.
    normalized = resume_text.lower()

    # (prefix, suffix, flags) wrappers tried from most to least heading-like.
    heading_tiers = (
        (r'^[ \t]*', r'[ \t]*:?[ \t]*$', re.MULTILINE),  # heading alone on its line
        (r'^[ \t]*', '', re.MULTILINE),                  # line starts with heading
        ('', '', 0),                                     # anywhere (original behavior)
    )

    # Store the location of each section heading found
    section_locations = {}
    for section, pattern in section_patterns.items():
        for prefix, suffix, flags in heading_tiers:
            match = re.search(prefix + pattern + suffix, normalized, flags)
            if match:
                # start(1) is the heading phrase, excluding any leading indent.
                section_locations[section] = match.start(1)
                break

    # Sort sections by the location they appear in the resume
    ordered = sorted(section_locations.items(), key=lambda item: item[1])

    # Each section ends where the next one starts; the last runs to the end.
    end_offsets = [start for _, start in ordered[1:]] + [len(normalized)]
    return {
        section: normalized[start:end].strip()
        for (section, start), end in zip(ordered, end_offsets)
    }


# Function to compare resume sections with job description sections
def compare_resume_with_job(resume_sections, job_description_sections):
    """Score each resume section against the same-named job-description section.

    Args:
        resume_sections: Mapping of section name to resume text.
        job_description_sections: Mapping of section name to job-description
            text; a section missing here is compared against an empty string.

    Returns:
        dict: ``{section name: similarity}`` with one entry per resume
        section, in the iteration order of ``resume_sections``.
    """
    return {
        name: compute_similarity(content, job_description_sections.get(name, ""))
        for name, content in resume_sections.items()
    }

# Helper function to compute similarity between two texts
def compute_similarity(text1, text2):
    """Return the TF-IDF cosine similarity between two texts.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        float: Cosine similarity of the two TF-IDF vectors. ``0.0`` when
        either text is empty/blank or when neither text yields any token
        for the vectorizer (nothing to compare).
    """
    # A blank side shares no terms with anything; answering directly also
    # avoids the vectorizer's "empty vocabulary" error when both are blank.
    if not (text1 and text1.strip() and text2 and text2.strip()):
        return 0.0

    try:
        tfidf_matrix = TfidfVectorizer().fit_transform([text1, text2])
    except ValueError:
        # Raised when no usable token remains in either text (for example,
        # only punctuation or single characters): treat as no similarity.
        return 0.0

    # cosine_similarity accepts the sparse matrix as-is, so there is no need
    # to densify it first. Entry [0, 1] is text1 versus text2.
    return float(cosine_similarity(tfidf_matrix)[0, 1])

# Sample job description, already split into the section names that the
# resume splitter produces, so sections can be compared like-for-like.
job_description_sections = dict(
    Education='BSc in Computer Science or related field',
    Experience='3+ years of software development experience, particularly in Python and Django',
    Skills='Proficiency in Python, Django, and front-end technologies like React',
    Certifications='Relevant certifications like AWS or Google Cloud Architect',
)

# Main function to process resume and compare it to the job description
def process_resume(pdf_path, job_description_sections):
    """Extract a resume PDF, split it into sections and score each section.

    Prints one ``<section>: Similarity Score = <score>`` line per resume
    section, with the score formatted to two decimals.

    Args:
        pdf_path: Path of the resume PDF.
        job_description_sections: Mapping of section name to job-description
            text used as the comparison target.

    Returns:
        tuple: ``(resume_sections, resume_text, comparison_results)`` — the
        split sections, the full extracted text, and the per-section scores.
    """
    raw_text = extract_text_from_pdf(pdf_path)
    sections = split_resume_into_sections(raw_text)
    scores = compare_resume_with_job(sections, job_description_sections)

    # Report the score of every section in the order it was computed.
    for name in scores:
        print(f"{name}: Similarity Score = {scores[name]:.2f}")

    return sections, raw_text, scores

In [4]:
# Example usage: score one resume PDF against the sample job description.

pdf_path = "resume.pdf"  # Provide the path to the resume PDF
resume_section, resume_text, comparison_results = process_resume(
    pdf_path, job_description_sections
)


Experience: Similarity Score = 0.36
Skills: Similarity Score = 0.32
Projects: Similarity Score = 0.00
Education: Similarity Score = 0.22
Certifications: Similarity Score = 0.14
Additional Experience: Similarity Score = 0.00


In [5]:
# Spot-check the extracted Education section. The text appears in lowercase
# because the splitter lowercases the resume before slicing it into sections.
print(resume_section['Education'])

education
bachelor of science (bsc) in computer science
university of xyz | graduation: may 2019
relevant coursework:
●
data structures and algorithms
●
web development
●
database management systems
●
cloud computing
