In [1]:
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import fitz 
# Load spaCy model
nlp = spacy.load("en_core_web_sm")


def parse_pdf(pdf_path):
    # Extract text from PDF using PyMuPDF
    with fitz.open(pdf_path) as pdf_document:
        text = ""
        for page_num in range(pdf_document.page_count):
            page = pdf_document[page_num]
            text += page.get_text("text")
    return text



def parse_text(text):
    
    # Tokenize and process the text using spaCy
    doc = nlp(text)
    
    # Extract named entities
    named_entities = [ent.text for ent in doc.ents]
    
    # Extract other relevant information based on patterns or rules
    # For example, extracting technical skills by identifying nouns and adjectives
    tech_skills = [token.text for token in doc if token.pos_ in ['NOUN', 'ADJ'] and 'tech' in token.text.lower()]
    
    degrees = [token.text for token in doc if token.pos_ == 'NOUN' and 'degree' in token.text.lower()]
    domains = [token.text for token in doc if token.pos_ == 'NOUN' and 'domain' in token.text.lower()]

    # Extract location information
    locations = [ent.text for ent in doc.ents if ent.label_ == 'GPE']

    # Create a dictionary or data structure to store the extracted information
    parsed_info = {
        'named_entities': named_entities,
        'tech_skills': tech_skills,
        'degrees': degrees,
        'domains': domains,
        'locations': locations
        # Add more fields as needed based on your requirements
    }

    return parsed_info

def calculate_similarity(job_description, resume):
    # Use TF-IDF for text vectorization
    vectorizer = TfidfVectorizer()
    job_vector = vectorizer.fit_transform([job_description])
    resume_vector = vectorizer.transform([resume])
    
     # Calculate cosine similarity
    similarity_score = cosine_similarity(job_vector, resume_vector)[0, 0]
    return similarity_score * 100  # Scale to a 0-100 range

In [2]:
def match_score(job_description,resume):
    # Parse job description and resume
    job_desc_info = parse_text(job_description)
    resume_info = parse_text(resume)

    # Create models for job description and resume
    job_desc_model = create_model(job_description)
    resume_model = create_model(resume)

    # Calculate similarity scores for different attributes
    title_similarity = calculate_similarity(job_desc_model, resume_model)
    location_similarity = calculate_similarity(job_desc_model, resume_model)
    industry_similarity = calculate_similarity(job_desc_model, resume_model)
    education_similarity = calculate_similarity(job_desc_model, resume_model)
    tech_skills_similarity = calculate_similarity(job_desc_model, resume_model)

    # Define weights for different attributes
    title_weight = 0.2
    location_weight = 0.1
    industry_weight = 0.15
    education_weight = 0.1
    tech_skills_weight = 0.45

    # Calculate overall match score
    match_score = (
        title_similarity * title_weight +
        location_similarity * location_weight +
        industry_similarity * industry_weight +
        education_similarity * education_weight +
        tech_skills_similarity * tech_skills_weight
    ) * 100

    return match_score

In [3]:
job_description_path = "sample-job-description.pdf"
resume_path = "Himanshu Manke cv F.pdf"

# Parse text from PDFs
job_description_text = parse_pdf(job_description_path)
resume_text = parse_pdf(resume_path)

# Calculate match score
score = calculate_similarity(job_description_text, resume_text)

# Print the match score
print(f"Match Score: {score:.2f}%")

Match Score: 66.24%
