In [None]:
import pdfplumber
import pytesseract
import fitz  
from PIL import Image
import io

def extract_text_from_pdf(pdf_path):
    """Extract the text of a PDF, falling back to OCR when no text is found.

    The text layer is read with pdfplumber first. If that produces nothing
    but whitespace (or fails), each page is rendered with PyMuPDF and passed
    to pytesseract instead.

    Extraction is best-effort: any exception raised by either stage is
    printed and swallowed, so this function does not raise and returns
    whatever text was gathered (possibly an empty string).

    Args:
        pdf_path: Location of the PDF; handed unchanged to
            ``pdfplumber.open`` and ``fitz.open``.

    Returns:
        The collected text, one newline appended after each page's text,
        with leading and trailing whitespace stripped.
    """
    text = ""
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                page_text = page.extract_text()
                # Pages without extractable text yield None / "" and are skipped.
                if page_text:
                    text += page_text + "\n"
    except Exception as e:
        print("Error using pdfplumber:", e)

    # If no text is found, try OCR with PyMuPDF and pytesseract
    if not text.strip():
        try:
            doc = fitz.open(pdf_path)
            try:
                for page in doc:
                    pix = page.get_pixmap()
                    # Round-trip the rendered page through an in-memory
                    # buffer so PIL can decode it; close the image as soon
                    # as OCR for this page is done.
                    with Image.open(io.BytesIO(pix.tobytes())) as img:
                        text += pytesseract.image_to_string(img) + "\n"
            finally:
                # Always release the document handle, even if rendering or
                # OCR fails part-way through.
                doc.close()
        except Exception as e:
            print("Error using OCR:", e)
    return text.strip()


In [10]:
import spacy
import re
from fuzzywuzzy import process

# Load the "en_core_web_sm" spaCy pipeline once at module level; extract_name()
# below calls this shared `nlp` object instead of reloading the model per call.
nlp = spacy.load("en_core_web_sm")

def extract_name(text):
    """Return the text of the first PERSON entity found in ``text``.

    The text is run through the module-level ``nlp`` pipeline and its
    entities are scanned in order. If none is labelled ``"PERSON"``, the
    string ``"Unknown"`` is returned.
    """
    person_mentions = (
        entity.text for entity in nlp(text).ents if entity.label_ == "PERSON"
    )
    return next(person_mentions, "Unknown")

def extract_contact_info(text):
    """Pull the first e-mail address and phone number out of ``text``.

    Args:
        text: Free-form text to search.

    Returns:
        A dict with the keys ``"email"`` and ``"phone"``. Each value is the
        first match found, or the string ``"Not found"`` when there is none.
    """
    # The domain must contain at least one dot and each label must end on a
    # letter/digit/hyphen, so sentence punctuation right after an address
    # ("... jane@example.com.") is not captured as part of it.
    emails = re.findall(
        r"[a-zA-Z0-9+_.-]+@[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)+", text
    )

    # The pattern alone also matches year ranges such as "2015 - 2020"
    # (digits, spaces and hyphens only), so additionally require at least
    # 9 digits before accepting a candidate as a phone number.
    phone_candidates = re.findall(r"\+?\d[\d -]{8,}\d", text)
    phone = next(
        (
            candidate
            for candidate in phone_candidates
            if sum(ch.isdigit() for ch in candidate) >= 9
        ),
        None,
    )

    return {
        "email": emails[0] if emails else "Not found",
        "phone": phone if phone is not None else "Not found",
    }

def extract_skills(text, skills_list):
    """Return the skills from ``skills_list`` that appear in ``text``.

    A skill counts as found when its lower-cased form is a substring of the
    lower-cased text, or otherwise when
    ``process.extractOne(skill, <whitespace-split tokens>, score_cutoff=80)``
    returns a match.

    Args:
        text: Text to search.
        skills_list: Iterable of skill names to look for.

    Returns:
        A list of the found skills without duplicates, in the order they
        first appear in ``skills_list`` (deterministic across runs).
    """
    # NOTE(review): plain substring matching means a short skill can match
    # inside a longer word (e.g. "Java" inside "JavaScript") — confirm
    # whether that is acceptable for the intended skills list.
    text_lower = text.lower()  # loop-invariant, computed once
    tokens = text.split()      # loop-invariant, computed once

    # A dict keeps insertion order, giving a stable de-duplicated result.
    found = {}
    for skill in skills_list:
        if skill in found:
            continue  # already matched; skip the redundant fuzzy search
        if skill.lower() in text_lower or process.extractOne(
            skill, tokens, score_cutoff=80
        ):
            found[skill] = None
    return list(found)


In [11]:
from dateutil import parser
import re

def extract_experience(text):
    """Estimate years of experience from the four-digit years in ``text``.

    All standalone years from 1900 to 2099 are collected and the span
    between the earliest and the latest one is returned. This is a rough
    heuristic: every year mentioned in the text counts, whatever it refers to.

    Args:
        text: Text to scan for years.

    Returns:
        ``max(year) - min(year)`` as an int, or 0 when fewer than two years
        are present.
    """
    # Non-capturing group: with a capturing "(19|20)" re.findall would return
    # only the century prefix ("19"/"20") instead of the full year.
    years = [int(year) for year in re.findall(r"\b(?:19|20)\d{2}\b", text)]
    if len(years) < 2:
        return 0
    # Use min/max rather than first/last so the result is independent of the
    # order in which years are listed (resumes are often newest-first).
    return max(years) - min(years)


In [15]:
def match_candidate(candidate, job_req, skill_weight=0.7, exp_weight=0.3):
    """Score how well a candidate fits a job requirement.

    Args:
        candidate: Mapping with ``"skills"`` (iterable of skill names) and
            ``"experience"`` (number compared against the minimum).
        job_req: Mapping with ``"skills"`` (iterable of required skill
            names) and ``"min_experience"``.
        skill_weight: Weight of the skill-overlap fraction (default 0.7).
        exp_weight: Weight of the experience check (default 0.3).

    Returns:
        ``skill_match * skill_weight + exp_match * exp_weight``, where
        ``skill_match`` is the fraction of distinct required skills the
        candidate has and ``exp_match`` is 1 if the candidate meets the
        minimum experience, else 0.
    """
    # De-duplicate the requirements so a repeated skill cannot inflate the
    # denominator and cap a perfect match below 1.
    required = set(job_req["skills"])
    if required:
        skill_match = len(required.intersection(candidate["skills"])) / len(required)
    else:
        # No skills required: treat the requirement as fully satisfied
        # instead of dividing by zero.
        skill_match = 1.0

    exp_match = 1 if candidate["experience"] >= job_req["min_experience"] else 0
    score = (skill_match * skill_weight) + (exp_match * exp_weight)
    return score

# Demo inputs: one sample candidate scored against one sample job requirement.
candidate = dict(skills=["Python", "SQL", "Data Science"], experience=10)
job_requirements = dict(
    skills=["Python", "Machine Learning", "SQL"],
    min_experience=3,
)

print("Match Score:", match_candidate(candidate, job_requirements))


Match Score: 0.7666666666666666


In [16]:
def rank_candidates(candidates, job_req, top_n=5):
    """Return the best-scoring candidates for a job, highest score first.

    Each candidate is scored with ``match_candidate(candidate, job_req)``.
    Candidates with equal scores keep their original relative order,
    because ``sorted`` is stable.

    Args:
        candidates: Iterable of candidate mappings accepted by
            ``match_candidate``.
        job_req: Job requirement mapping accepted by ``match_candidate``.
        top_n: Maximum number of candidates to return (default 5). Pass
            ``None`` to return the full ranking.

    Returns:
        A new list with at most ``top_n`` candidates, best match first.

    Raises:
        ValueError: If ``top_n`` is negative.
    """
    # A negative slice bound would silently drop candidates from the end of
    # the ranking, which is never what a "top N" caller wants.
    if top_n is not None and top_n < 0:
        raise ValueError("top_n must be non-negative or None")
    ranked = sorted(candidates, key=lambda c: match_candidate(c, job_req), reverse=True)
    return ranked[:top_n]

# Demo: rank three sample candidates against the job_requirements dict
# defined in the earlier example cell.
candidates_list = [
    dict(skills=["Python", "SQL"], experience=10),
    dict(skills=["Python", "Machine Learning", "SQL"], experience=5),
    dict(skills=["Java", "SQL"], experience=4),
]
top_candidates = rank_candidates(candidates_list, job_requirements)
print(top_candidates)


[{'skills': ['Python', 'Machine Learning', 'SQL'], 'experience': 5}, {'skills': ['Python', 'SQL'], 'experience': 10}, {'skills': ['Java', 'SQL'], 'experience': 4}]
