# **Fully Optimized Resume Processing and Segmentation**
This notebook extracts, cleans, anonymizes, and segments resume content from various file formats (PDF, DOCX, TXT). It ensures accurate section detection and structured output.

In [22]:

import os
import re
import pdfminer.high_level
from docx import Document
from collections import defaultdict

# Define section headers for resume segmentation
SECTION_HEADERS = {
    "contact": ["contact", "personal details", "info", "email", "phone", "location"],
    "summary": ["summary", "profile", "about me", "objective"],
    "experience": ["experience", "work experience", "employment", "internships"],
    "education": ["education", "academic background"],
    "skills": ["skills", "technical skills", "core competencies", "language"],
    "certifications": ["certifications", "licenses", "certificates"],
    "projects": ["projects", "portfolio"],
    "languages": ["languages", "spoken languages"],
    "miscellaneous": ["hobbies", "volunteer work", "interests"]
}


In [23]:

def extract_text_from_pdf(pdf_path):
    """Extract text from a PDF file."""
    try:
        text = pdfminer.high_level.extract_text(pdf_path)
        return text.strip()
    except Exception as e:
        return f"Error extracting PDF text: {str(e)}"

def extract_text_from_docx(docx_path):
    """Extract text from a DOCX file."""
    try:
        doc = Document(docx_path)
        text = "\n".join([para.text for para in doc.paragraphs])
        return text.strip()
    except Exception as e:
        return f"Error extracting DOCX text: {str(e)}"

def extract_text_from_txt(txt_path):
    """Extract text from a TXT file."""
    try:
        with open(txt_path, "r", encoding="utf-8") as f:
            return f.read().strip()
    except Exception as e:
        return f"Error extracting TXT text: {str(e)}"

def extract_text_from_cv(file_path):
    """Identify the file format and extract text accordingly."""
    if not os.path.exists(file_path):
        return "File not found!"

    ext = os.path.splitext(file_path)[1].lower()
    if ext == ".pdf":
        return extract_text_from_pdf(file_path)
    elif ext == ".docx":
        return extract_text_from_docx(file_path)
    elif ext == ".txt":
        return extract_text_from_txt(file_path)
    else:
        return "Unsupported file format! Please use PDF, DOCX, or TXT."


In [24]:

def preprocess_text(text):
    """Fix extracted text formatting issues before segmentation."""
    text = re.sub(r"\s+", " ", text)  # Remove extra spaces
    text = re.sub(r"(?<=\b[A-Z])\s(?=[A-Z]+\b)", "", text)  # Remove spaces between capitalized words
    text = re.sub(r"\s([,.])", r"\1", text)  # Fix spaces before punctuation
    return text

def remove_sensitive_info(text):
    """Removes sensitive information such as emails, phone numbers, and URLs."""
    text = re.sub(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b", "[EMAIL]", text)
    text = re.sub(r"\b(\+?\d{1,3}[-.\s]?)?\(?\d{2,4}\)?[-.\s]?\d{2,4}[-.\s]?\d{2,9}\b", "[PHONE]", text)
    text = re.sub(r"http[s]?://\S+", "[URL]", text)
    return text

def clean_and_normalize_text(text):
    """Basic text cleaning: remove unnecessary spaces and normalize text."""
    text = re.sub(r"\s+", " ", text).strip()
    return text


In [25]:

def segment_resume_final(text):
    """Final optimized segmentation function ensuring accurate section detection."""
    text = preprocess_text(clean_and_normalize_text(text))
    sections = defaultdict(str)
    current_section = "summary"  # Default section for initial text

    # Ensure section headers are properly formatted and separated
    for section, keywords in SECTION_HEADERS.items():
        for kw in keywords:
            text = re.sub(rf"(\s*{kw}\s*)", r"\n\1\n", text, flags=re.IGNORECASE)

    # Split text into lines
    lines = text.split("\n")
    found_first_section = False

    for line in lines:
        line = line.strip()
        if not line:
            continue  # Skip empty lines

        # Identify section headers
        found_section = None
        for section, keywords in SECTION_HEADERS.items():
            if any(re.match(rf"^\s*{kw}\s*$", line, re.IGNORECASE) for kw in keywords):
                found_section = section
                found_first_section = True
                break

        if found_section:
            current_section = found_section
            sections[current_section] = ""
        else:
            if not found_first_section:
                sections["summary"] += line + " "
            elif "@" in line or "Phone" in line or "Location" in line:
                sections["contact"] += line + " "
            else:
                sections[current_section] += line + " "

    return sections


In [26]:

def process_resume(file_path):
    """Extracts, cleans, anonymizes, and segments resume from a given file."""
    extracted_text = extract_text_from_cv(file_path)
    clean_text = remove_sensitive_info(extracted_text)
    structured_resume = segment_resume_final(clean_text)
    return structured_resume

def display_segmented_resume(resume_data):
    """Displays segmented resume data in a readable format."""
    print("\n======================= SEGMENTED RESUME =======================")
    for section, content in resume_data.items():
        if content.strip():
            print(f"\n=== {section.upper()} ===\n{content.strip()}")
    print("\n================================================================")


In [27]:

# Test with uploaded sample CV
file_path = "sample_cv.pdf"  # Change this if needed
resume_data = process_resume(file_path)
display_segmented_resume(resume_data)




=== SUMMARY ===
I am a passionate developer with

=== EXPERIENCE ===
INTERN • FPT Software Da Nang. June 2023 - August 2023 I participated in developing a Java and Spring Boot-based sales management application using the MVC architecture. Followed the company’s rules, coding standards, and version control procedures to keep the project consistent. Improved teamwork and communication through regular feedback sessions, code reviews, and problem-solving discussions.

=== SKILLS ===
s: Java, C#, JavaScript. Frameworks: Java Spring,.NET Framework, Express. Web Development: HTML, CSS, JavaScript. Databases: MySQL, SQL Server, MongoDB. Tools & Platforms: Github, Swagger, Postman, Trello, Jira. English: Intermediate level in listening and translation. TOEIC: 650

=== CONTACT ===
rmation Systems

=== EDUCATION ===
Danang University of Science and Technology. 2020 - 2025 Bachelor of Engineering in

=== PROJECTS ===
E-Commerce Web Application The E-Commerce project is a web-based application de