In [3]:
import os
import re
from docx import Document
import pandas as pd

# Define extraction functions
def extract_years_experience(text, max_years=70):
    """
    Extract the number of years of experience from the text.

    Looks for patterns like 'X years', 'X+ years', 'X-years', 'X year', etc.
    (case-insensitive) and returns the first mention whose value is plausible.

    Args:
        text: Resume text to scan. ``None`` or an empty string yields ``None``.
        max_years: Exclusive upper bound for a plausible value. Mentions at or
            above it (e.g. "founded 100 years ago") are skipped.

    Returns:
        The first matched number below ``max_years`` as an ``int``, or
        ``None`` when no plausible mention exists.
    """
    if not text:
        return None

    # Scan every mention rather than only the first one, so that an
    # implausible early number does not hide a valid later one.
    for match in re.finditer(r"(\d+)\s*(?:\+|-|to)?\s*years?", text, re.IGNORECASE):
        years = int(match.group(1))
        # CSamson: only accept a maximum years of experience below max_years (70).
        if years < max_years:
            return years
    return None

def extract_education(text):
    """
    Extract the highest level of education from the text.

    Degree patterns are tried from the highest level to the lowest
    (doctorate, master, bachelor, associate), so a resume that lists several
    degrees reports the highest one. Matching is case-insensitive.

    Args:
        text: Resume text to scan.

    Returns:
        The matched degree phrase with surrounding whitespace removed and
        passed through ``str.capitalize()`` (first letter upper-case, the rest
        lower-case, e.g. "Master's of arts"), or "Not Found" when no pattern
        matches.
    """
    # Ordered highest -> lowest; the first pattern that matches wins.
    education_patterns = [
        r"ph\.?d",                         # PhD / Ph.D
        r"doctorate",                      # Doctorate
        r"master'?s\s*(of\s*[a-z]*)?",     # Master's / Masters
        r"master\s+of\s+[a-z]+",           # Master of X ("of" required so a bare "master" is not a degree)
        r"bachelor'?s\s*(of\s*[a-z]*)?",   # Bachelor's / Bachelors
        r"bachelor\s*(of\s*[a-z]*)?",      # Bachelor  # CSamson: keyword bachelor without 's
        r"associate'?s\s*(of\s*[a-z]*)?",  # Associate's
    ]
    for pattern in education_patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            # The trailing \s* in the patterns can capture whitespace when the
            # optional "of ..." part is absent; strip it from the result.
            return match.group(0).strip().capitalize()
    return "Not Found"

# Skills recognised by extract_skills, in the order they are reported.
_SKILL_KEYWORDS = (
    "Python", "SQL", "Java", "C++", "Machine Learning", "Data Analysis",
    "Project Management", "Communication", "Leadership", "Excel",
    "Cloud Computing", "AWS", "Azure", "Big Data", "Problem-Solving",
)

# One compiled, case-insensitive pattern per keyword.
# - re.escape keeps regex metacharacters literal (the "++" in "C++").
# - (?<!\w) / (?!\w) replace \b, which cannot anchor next to a non-word
#   character such as "+"; for keywords made of word characters the two
#   forms are equivalent.
_SKILL_PATTERNS = [
    (skill, re.compile(rf"(?<!\w){re.escape(skill)}(?!\w)", re.IGNORECASE))
    for skill in _SKILL_KEYWORDS
]


def extract_skills(text):
    """
    Extract skills from the text.

    Skills are identified from a predefined keyword list. Each keyword is
    matched case-insensitively as a whole token, so "Java" does not match
    inside "JavaScript" and "C++" does not match a lone "C".

    Args:
        text: Resume text to scan.

    Returns:
        The matched keywords joined by ", " in keyword-list order, or
        "Not Found" when none are present.
    """
    skills_found = [skill for skill, pattern in _SKILL_PATTERNS if pattern.search(text)]
    return ", ".join(skills_found) if skills_found else "Not Found"

def extract_location(text):
    """
    Return the first location-like phrase found in the text.

    The heuristic is purely lexical and case-sensitive: one or more
    capitalized words, an optional comma and whitespace, followed by either
    two upper-case letters (a state-code shape) or any alphabetic word.

    Returns the matched phrase, or "Not Found" when nothing matches.
    """
    found = re.compile(
        r"\b(?:[A-Z][a-z]+(?:\s[A-Z][a-z]+)*),?\s*(?:[A-Z]{2}|\b[a-zA-Z]+\b)"
    ).search(text)
    return "Not Found" if found is None else found.group(0)

def extract_job_title(text):
    """
    Return the first job-title-like phrase found in the text.

    A title is one or more capitalized words followed by a single whitespace
    character and one of a fixed set of role words (Manager, Engineer, ...).
    Matching is case-sensitive.

    Returns the matched phrase, or "Not Found" when nothing matches.
    """
    role_words = (
        "Manager", "Engineer", "Analyst", "Developer", "Specialist",
        "Consultant", "Scientist", "Technician", "Officer", "Director",
        "Lead", "Architect", "Administrator",
    )
    # Capitalized word(s) that precede the role word.
    leading_words = r"[A-Z][a-z]+(?:\s[A-Z][a-z]+)*"
    title_regex = "(?:" + leading_words + r"\s(?:" + "|".join(role_words) + "))"

    hit = re.search(title_regex, text)
    if hit is None:
        return "Not Found"
    return hit.group(0)

# Process a single Word document
def process_resume(file_path):
    """
    Analyse one Word document and return its extracted fields.

    The text of every entry in the document's ``paragraphs`` list is joined
    with single spaces and passed to each extraction function.

    Returns a dict with the keys "File Name", "Years of Experience",
    "Education", "Skills", "Location" and "Job Title".
    """
    paragraphs = Document(file_path).paragraphs
    resume_text = " ".join(paragraph.text for paragraph in paragraphs)

    # Insertion order fixes the column order of the resulting table.
    record = {"File Name": os.path.basename(file_path)}
    record["Years of Experience"] = extract_years_experience(resume_text)
    record["Education"] = extract_education(resume_text)
    record["Skills"] = extract_skills(resume_text)
    record["Location"] = extract_location(resume_text)
    record["Job Title"] = extract_job_title(resume_text)
    return record

# Process all resumes in a directory
def process_resumes(directory):
    """
    Process every Word resume in a directory.

    Args:
        directory: Path of the folder to scan (not recursive).

    Returns:
        A pandas DataFrame with one row per processed ``.docx`` file, in
        file-name order. The frame is empty when no resume is found.
    """
    resume_data = []
    # os.listdir returns entries in arbitrary order; sort for reproducible rows.
    for file_name in sorted(os.listdir(directory)):
        # Word creates "~$<name>.docx" owner/lock files while a document is
        # open; they carry the .docx suffix but are not readable documents.
        if file_name.startswith("~$"):
            continue
        # Accept the extension regardless of case (e.g. "CV.DOCX").
        if not file_name.lower().endswith(".docx"):
            continue
        file_path = os.path.join(directory, file_name)
        # Skip sub-directories that happen to end in ".docx".
        if not os.path.isfile(file_path):
            continue
        resume_data.append(process_resume(file_path))
    return pd.DataFrame(resume_data)

# Directory containing resumes
# Every backslash is escaped: a lone "\R" is an invalid escape sequence that
# Python only tolerates with a warning. The resulting path is unchanged.
resume_directory = "C:\\Users\\samue\\OneDrive\\Desktop\\Job_Recommender\\Resumes"  # CSamson File path where resumes are saved

# Process resumes and save results
output_file = "resume_analysis_results.csv"  # written to the current working directory
df = process_resumes(resume_directory)
df.to_csv(output_file, index=False)  # index=False: do not write the row index as a column

print(f"Resume analysis completed! Results saved to {output_file}")


Resume analysis completed! Results saved to resume_analysis_results.csv
