<a href="https://colab.research.google.com/github/Gxwthmmmm/Resume-Optimizer/blob/main/resumeoptimizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pdfplumber mammoth markdownify weasyprint

import html
import os
import re

import mammoth
import pdfplumber
from markdown import markdown
from markdownify import markdownify as md
from weasyprint import HTML

def resume_to_markdown(input_path, md_path):
    """Extract a resume's text from a PDF or DOCX file and write it to ``md_path``.

    PDF input is read page by page with pdfplumber and the plain text of each
    page is concatenated (one trailing newline per page). DOCX input is
    converted with mammoth's Markdown converter.

    Args:
        input_path: Path to the source resume; the extension picks the reader.
        md_path: Destination path for the extracted text (written as UTF-8).

    Raises:
        ValueError: If the extension is neither ``.pdf`` nor ``.docx``.
    """
    _, suffix = os.path.splitext(input_path)
    suffix = suffix.lower()

    if suffix == ".pdf":
        with pdfplumber.open(input_path) as document:
            extracted = (page.extract_text() for page in document.pages)
            # Pages without extractable text come back empty and are skipped.
            markdown_text = "".join(f"{chunk}\n" for chunk in extracted if chunk)
    elif suffix == ".docx":
        with open(input_path, "rb") as source:
            markdown_text = mammoth.convert_to_markdown(source).value
    else:
        raise ValueError("Unsupported file format")

    with open(md_path, "w", encoding="utf-8") as sink:
        sink.write(markdown_text)

    print("Resume converted to Markdown / text")

def add_metrics_if_missing(bullet):
    """Append a generic impact clause to ``bullet`` unless it already has a number.

    Any digit in the text counts as an existing metric, in which case the
    bullet is returned untouched. Otherwise trailing periods are removed and a
    fixed "resulting in ..." clause is appended.
    """
    has_number = re.search(r"\d+%|\d+x|\d+", bullet) is not None
    if not has_number:
        trimmed = bullet.rstrip(".")
        return f"{trimmed} resulting in measurable performance improvements."
    return bullet


def rewrite_bullets(lines):
    """Normalize ``-`` bullet lines and add an impact clause where none exists.

    Lines whose stripped text starts with ``-`` are treated as bullets: the
    leading marker is removed, the remaining text is passed through
    ``add_metrics_if_missing`` and the line is re-emitted as ``"- <text>"``.
    All other lines are returned unchanged.

    Args:
        lines: Iterable of text lines (without trailing newlines).

    Returns:
        A new list with the same number of entries as ``lines``.
    """
    rewritten = []
    for line in lines:
        stripped = line.strip()
        if not stripped.startswith("-"):
            rewritten.append(line)
            continue

        # Remove only the leading bullet marker(s). Hyphens inside the text
        # ("full-stack", "2019-2021") are content and must be preserved.
        content = stripped.lstrip("-").strip()
        if not content:
            # A bare "-" or a "---" rule carries no text to rewrite.
            rewritten.append(line)
            continue

        rewritten.append(f"- {add_metrics_if_missing(content)}")
    return rewritten


def optimize_resume(input_md, output_md):
    """Rewrite the bullet lines of a text/Markdown resume and save the result.

    The file at ``input_md`` is read as UTF-8, split on newlines, passed
    through ``rewrite_bullets`` and written to ``output_md`` with leading and
    trailing whitespace removed.
    """
    with open(input_md, "r", encoding="utf-8") as src:
        original_lines = src.read().split("\n")

    optimized = "\n".join(rewrite_bullets(original_lines))

    with open(output_md, "w", encoding="utf-8") as dst:
        dst.write(optimized.strip())

    print("Resume content optimized")


def extract_contact_info(text):
    """Pull a display name, e-mail address and phone number out of resume text.

    Args:
        text: Full resume text.

    Returns:
        A ``(name, email, phone)`` tuple of strings.

        * ``name`` is the first non-blank line (stripped), or ``""``.
        * ``email`` is the first e-mail-looking token, or the placeholder
          ``"email@example.com"``.
        * ``phone`` is the first phone-looking run of digits, or the
          placeholder ``"+91-XXXXXXXXXX"``.
    """
    name = next((ln.strip() for ln in text.splitlines() if ln.strip()), "")

    # Each domain label must be followed by real label characters, so a
    # sentence-ending period after the address is not captured.
    email_pattern = r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)+"
    email_match = re.search(email_pattern, text)
    email = email_match.group(0) if email_match else "email@example.com"

    # Separators are limited to spaces, tabs and hyphens so a candidate never
    # spans a line break.
    phone_pattern = r"\+?\d[\d \t\-]{8,15}"
    # Year ranges such as "2019 - 2021" fit the pattern but contain only
    # 8 digits; requiring 9 filters them out.
    min_phone_digits = 9

    phone = "+91-XXXXXXXXXX"
    for candidate in re.finditer(phone_pattern, text):
        number = candidate.group(0).rstrip(" \t-")
        if sum(ch.isdigit() for ch in number) >= min_phone_digits:
            phone = number
            break

    return name, email, phone


def extract_gender(text):
    """Return the gender stated in ``text``, title-cased, or ``"Not specified"``.

    An explicit ``gender:`` / ``gender-`` label is preferred; failing that,
    the first standalone word "male" or "female" anywhere in the text is used.
    Matching is case-insensitive.
    """
    patterns = (
        r"gender[:\-]\s*(male|female|other|non-binary|nonbinary|prefer not to say)",
        r"\b(male|female)\b",
    )
    for pattern in patterns:
        found = re.search(pattern, text, re.IGNORECASE)
        if found is not None:
            return found.group(1).title()
    return "Not specified"


# Maps each output bucket used by split_into_logical_sections() to the
# lowercase heading phrases that select it. An alias matches when it occurs as
# a substring of the normalized heading, and buckets are tried in the order
# listed here, so the first bucket with a matching alias wins.
# NOTE(review): normalize_heading() replaces every non-letter with a space, so
# the hyphenated aliases below can never match on their own; such headings are
# still caught by the plain "activities" alias.
SECTION_ALIASES = {
    "profile": [
        "profile summary", "summary", "objective", "profile",
        "career objective", "professional summary",
    ],
    "education": [
        "education history", "education", "academic background",
        "academics", "qualifications",
    ],
    "internships": [
        "internships", "internship", "work experience",
        "experience", "professional experience", "employment",
    ],
    "projects": [
        "projects", "project work", "academic projects",
    ],
    "activities": [
        "extra-curricular activities", "extracurricular activities",
        "extra curricular activities", "activities", "co-curricular activities",
    ],
}




def is_heading_candidate(line):
    """Heuristically decide whether ``line`` could be a section heading.

    A non-blank line qualifies when it is entirely upper-case, or when it has
    at most five words and begins with an upper-case character.
    """
    text = line.strip()
    if not text:
        return False
    short_and_capitalized = len(text.split()) <= 5 and text[0].isupper()
    return text.isupper() or short_and_capitalized


def normalize_heading(line):
    """Lower-case ``line`` and blank out everything that is not ``a``-``z``.

    Each non-letter character becomes a single space (runs are not collapsed)
    and the result is stripped of leading/trailing whitespace.
    """
    lowered = line.lower()
    letters_only = "".join(ch if "a" <= ch <= "z" else " " for ch in lowered)
    return letters_only.strip()


def split_into_logical_sections(text):
    """Split resume text into the five known sections.

    Lines are scanned in order. A line that looks like a heading and whose
    normalized form contains one of the ``SECTION_ALIASES`` phrases switches
    the active section and is itself discarded. Every other line is appended
    to the active section; lines seen before the first recognized heading are
    dropped. Blank lines inside a section are preserved as paragraph breaks.

    Returns:
        Dict with keys ``profile``, ``education``, ``internships``,
        ``projects`` and ``activities``; each value is the section's text
        joined with newlines and stripped (``""`` when the section is absent).
    """
    buckets = {
        key: []
        for key in ("profile", "education", "internships", "projects", "activities")
    }
    active = None

    for raw in text.splitlines():
        candidate = raw.strip()

        if not candidate:
            if active is not None:
                buckets[active].append("")
            continue

        matched = None
        if is_heading_candidate(candidate):
            normalized = normalize_heading(candidate)
            # First bucket (in SECTION_ALIASES order) with any alias contained
            # in the normalized heading wins.
            matched = next(
                (
                    bucket
                    for bucket, aliases in SECTION_ALIASES.items()
                    if any(alias in normalized for alias in aliases)
                ),
                None,
            )

        if matched is not None:
            active = matched
            continue

        if active is not None:
            buckets[active].append(raw)

    return {key: "\n".join(collected).strip() for key, collected in buckets.items()}

_EDU_MARKERS = ("B.Tech", "B.E.", "M.Tech", "University")


def _is_course_title(line):
    """Return True if ``line`` looks like a course/institution title line."""
    return line[0].isupper() and (
        any(marker in line for marker in _EDU_MARKERS)
        or re.search(r"\d{4}", line) is not None
    )


def parse_education_section(text):
    """Render the education section as a nested HTML list.

    A line is a course title when it starts with an upper-case character and
    either mentions a degree/"University" marker or contains a four-digit
    number. Each title becomes a bold ``<li class='course'>``; the lines that
    follow it, up to the next title, are emitted as its description. Lines
    before the first title are ignored.

    All resume text is HTML-escaped so characters such as ``&`` and ``<``
    render literally instead of corrupting the markup.

    Args:
        text: Raw text of the education section.

    Returns:
        An HTML string of the form ``<ul class='nested'>...</ul>``.
    """
    lines = [ln.strip() for ln in text.splitlines() if ln.strip()]

    items = []
    i = 0
    while i < len(lines):
        if not _is_course_title(lines[i]):
            i += 1
            continue

        course_html = f"<li class='course'><strong>{html.escape(lines[i])}</strong>"
        i += 1

        # Everything up to the next title belongs to this course.
        desc_lines = []
        while i < len(lines) and not _is_course_title(lines[i]):
            desc_lines.append(lines[i])
            i += 1

        if desc_lines:
            desc_html = "".join(f"<div>{html.escape(d)}</div>" for d in desc_lines)
            course_html += f"<ul class='nested-desc'>{desc_html}</ul>"

        items.append(course_html + "</li>")

    return f"<ul class='nested'>{''.join(items)}</ul>"


_PROJECT_KEYWORDS = ("project", "built", "developed", "created", "designed")


def _is_project_title(line):
    """Return True if ``line`` looks like the title line of a project."""
    lowered = line.lower()
    return line[0].isupper() or any(keyword in lowered for keyword in _PROJECT_KEYWORDS)


def parse_projects_section(text):
    """Render the projects section as an HTML list of titled entries.

    A line is a project title when it starts with an upper-case character or
    contains one of the project keywords. Each title becomes a bold
    ``<li class='project'>``; the lines that follow, up to the next title,
    form its description body. Lines before the first title are skipped one
    at a time.

    All resume text is HTML-escaped so characters such as ``&`` and ``<``
    render literally instead of corrupting the markup.

    Args:
        text: Raw text of the projects section.

    Returns:
        An HTML string of the form ``<ul class='projects'>...</ul>``.
    """
    lines = [ln.strip() for ln in text.splitlines() if ln.strip()]

    items = []
    i = 0
    while i < len(lines):
        if not _is_project_title(lines[i]):
            # Advance by exactly one line; a larger step would jump over
            # real project titles that follow.
            i += 1
            continue

        project_html = f"<li class='project'><strong>{html.escape(lines[i])}</strong>"
        i += 1

        # Everything up to the next title belongs to this project.
        desc_lines = []
        while i < len(lines) and not _is_project_title(lines[i]):
            desc_lines.append(lines[i])
            i += 1

        if desc_lines:
            desc_html = "".join(
                f"<div class='project-desc'>{html.escape(d)}</div>" for d in desc_lines
            )
            project_html += f"<div class='project-body'>{desc_html}</div>"

        items.append(project_html + "</li>")

    return f"<ul class='projects'>{''.join(items)}</ul>"


_POSITION_KEYWORDS = ("intern", "software engineer", "developer", "analyst", "junior")


def _is_position_title(line):
    """Return True if ``line`` looks like the title line of a position."""
    lowered = line.lower()
    return line[0].isupper() or any(keyword in lowered for keyword in _POSITION_KEYWORDS)


def parse_internships_section(text):
    """Render the internships/experience section as an HTML list.

    A line is a position title when it starts with an upper-case character or
    contains one of the position keywords. Each title becomes a bold
    ``<li class='position'>``; the lines that follow, up to the next title,
    form its description body. Lines before the first title are ignored.

    All resume text is HTML-escaped so characters such as ``&`` and ``<``
    render literally instead of corrupting the markup.

    Args:
        text: Raw text of the internships section.

    Returns:
        An HTML string of the form ``<ul class='internships'>...</ul>``.
    """
    lines = [ln.strip() for ln in text.splitlines() if ln.strip()]

    items = []
    i = 0
    while i < len(lines):
        if not _is_position_title(lines[i]):
            i += 1
            continue

        position_html = f"<li class='position'><strong>{html.escape(lines[i])}</strong>"
        i += 1

        # Everything up to the next title belongs to this position.
        desc_lines = []
        while i < len(lines) and not _is_position_title(lines[i]):
            desc_lines.append(lines[i])
            i += 1

        if desc_lines:
            desc_html = "".join(
                f"<div class='intern-desc'>{html.escape(d)}</div>" for d in desc_lines
            )
            position_html += f"<div class='intern-body'>{desc_html}</div>"

        items.append(position_html + "</li>")

    return f"<ul class='internships'>{''.join(items)}</ul>"

def section_to_html(text):
    """Render a free-form section as a lead paragraph plus a bullet list.

    The first non-blank line becomes a ``<p>``; any remaining non-blank lines
    become ``<li>`` items of a ``<ul class='bullets'>``. Blank input yields an
    empty paragraph.

    All resume text is HTML-escaped so characters such as ``&`` and ``<``
    render literally instead of corrupting the markup.

    Args:
        text: Raw text of the section.

    Returns:
        An HTML fragment string.
    """
    lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
    if not lines:
        return "<p></p>"

    first, *rest = lines
    para_html = f"<p>{html.escape(first)}</p>"
    if not rest:
        return para_html

    bullet_items = "".join(f"<li>{html.escape(ln)}</li>" for ln in rest)
    return f"{para_html}<ul class='bullets'>{bullet_items}</ul>"

def markdown_to_pdf(md_path, pdf_path):
    """Lay out the resume text in a two-column HTML template and write a PDF.

    The text at ``md_path`` is read as UTF-8 and mined for contact details,
    gender and the five logical sections. The right column shows profile,
    education, experience, projects and activities; the left column shows
    contact/personal details plus the skills lists. The page is rendered with
    WeasyPrint to ``pdf_path``.

    Notes:
        * When no profile section is found, a generic profile sentence is used.
        * The skills, technical-skills and languages lists are fixed values
          defined in this function; they are not extracted from the resume.
        * Name, e-mail, phone and gender are HTML-escaped before being placed
          in the template so characters such as ``&`` or ``<`` in the source
          text cannot break the markup.

    Args:
        md_path: Path to the (optimized) resume text file.
        pdf_path: Destination path for the generated PDF.
    """
    with open(md_path, "r", encoding="utf-8") as f:
        raw_text = f.read()

    name, email, mobile = extract_contact_info(raw_text)
    gender = extract_gender(raw_text)
    sections = split_into_logical_sections(raw_text)

    # These values come straight from the resume text and are interpolated
    # into HTML below, so escape them first.
    name, email, mobile, gender = (
        html.escape(value) for value in (name, email, mobile, gender)
    )

    profile_md = sections["profile"]
    if not profile_md.strip():
        profile_md = (
            "Motivated software developer with a strong foundation in Python, "
            "data structures, and problem-solving."
        )

    profile_html = section_to_html(profile_md)
    education_html = parse_education_section(sections["education"])
    internships_html = parse_internships_section(sections["internships"])
    projects_html = parse_projects_section(sections["projects"])
    activities_html = section_to_html(sections["activities"])

    skills_list = [
        "Organisation Skills",
        "Software Engineering",
        "Database Administration",
        "Machine Learning",
        "Product Development",
        "Data Structures",
        "Data Mining",
    ]
    technical_skills = ["Java", "Python", "C"]
    languages = ["Hindi", "English"]

    skills_html = "".join(f"<li>{s}</li>" for s in skills_list)
    tech_html = "".join(f"<li>{s}</li>" for s in technical_skills)
    lang_html = "".join(f"<li>{s}</li>" for s in languages)

    # CSS braces are doubled because the template is an f-string.
    html_template = f"""
    <html>
    <head>
        <meta charset="utf-8">
        <style>
            @page {{
                size: A4;
                margin: 8mm;
            }}
            * {{
                box-sizing: border-box;
            }}
            html, body {{
                margin: 0;
                padding: 0;
                font-family: Calibri, Helvetica, Arial, sans-serif;
                line-height: 1.2;
                font-size: 9px;
            }}
            .page {{
                width: 100%;
                height: 100%;
            }}

            .name-bar {{
                padding: 6px 10px;
                border-bottom: 1px solid #000;
            }}
            .name-text {{
                font-size: 18px;
                font-weight: bold;
                text-transform: uppercase;
                letter-spacing: 0.06em;
                margin: 0;
            }}

            .content-wrapper {{
                display: flex;
                margin-top: 4px;
            }}
            .left-col {{
                width: 35%;
                padding: 4px 8px 4px 10px;
                border-right: 1px solid #000;
            }}
            .right-col {{
                width: 65%;
                padding: 4px 10px 4px 8px;
            }}

            .block-title {{
                font-size: 9px;
                font-weight: bold;
                text-transform: uppercase;
                margin: 5px 0 1px 0;
                border-bottom: 0.5px solid #000;
                padding-bottom: 1px;
            }}
            .block-body {{
                margin: 2px 0 0 0;
            }}
            .spaced-group {{
                margin-bottom: 5px;
            }}

            p {{
                margin: 0 0 2px 0;
                padding: 0;
            }}

            ul.clean {{
                list-style-type: none;
                padding-left: 0;
                margin: 0;
            }}
            ul.clean li {{
                margin-bottom: 1px;
                line-height: 1.1;
            }}

            ul.bullets {{
                margin-top: 2px;
                margin-bottom: 0;
                padding-left: 12px;
            }}
            ul.bullets li {{
                margin-bottom: 1px;
                line-height: 1.1;
            }}

            /* PROJECTS: Bold bullet title + indented body */
            ul.projects {{
                list-style-type: disc;
                padding-left: 14px;
                margin: 0;
            }}
            ul.projects li.project {{
                margin-bottom: 3px;
                line-height: 1.15;
            }}
            ul.projects li.project strong {{
                font-size: 9px;
                font-weight: bold;
            }}
            .project-body {{
                margin: 1px 0 0 8px;
                padding-left: 4px;
            }}
            .project-desc {{
                margin: 0 0 1px 0;
                line-height: 1.1;
                font-size: 8.5px;
            }}

            /* INTERNSHIPS: Bold position bullet + body */
            ul.internships {{
                list-style-type: disc;
                padding-left: 14px;
                margin: 0;
            }}
            ul.internships li.position {{
                margin-bottom: 3px;
                line-height: 1.15;
            }}
            ul.internships li.position strong {{
                font-size: 9px;
                font-weight: bold;
            }}
            .intern-body {{
                margin: 1px 0 0 8px;
                padding-left: 4px;
            }}
            .intern-desc {{
                margin: 0 0 1px 0;
                line-height: 1.1;
                font-size: 8.5px;
            }}

            /* EDUCATION: Nested structure */
            ul.nested {{
                list-style-type: disc;
                padding-left: 14px;
                margin: 0;
            }}
            ul.nested li.course {{
                margin-bottom: 3px;
                line-height: 1.15;
            }}
            ul.nested li.course strong {{
                font-weight: bold;
            }}
            ul.nested-desc {{
                list-style-type: circle;
                padding-left: 20px;
                margin: 1px 0 2px 0;
            }}
            ul.nested-desc div {{
                margin: 0;
                padding: 0;
                line-height: 1.1;
            }}
        </style>
    </head>
    <body>
        <div class="page">
            <div class="name-bar">
                <p class="name-text">{name}</p>
            </div>

            <div class="content-wrapper">
                <div class="left-col">
                    <div class="spaced-group">
                        <div class="block-title">Contact Details</div>
                        <div class="block-body">
                            <p><strong>Mobile:</strong> {mobile}<br><strong>Email:</strong> {email}</p>
                        </div>
                    </div>

                    <div class="spaced-group">
                        <div class="block-title">Personal Details</div>
                        <div class="block-body">
                            <p><strong>Gender:</strong> {gender}</p>
                        </div>
                    </div>

                    <div class="spaced-group">
                        <div class="block-title">Skills</div>
                        <div class="block-body">
                            <ul class="clean">{skills_html}</ul>
                        </div>
                    </div>

                    <div class="spaced-group">
                        <div class="block-title">Technical Skills</div>
                        <div class="block-body">
                            <ul class="clean">{tech_html}</ul>
                        </div>
                    </div>

                    <div class="spaced-group">
                        <div class="block-title">Languages Known</div>
                        <div class="block-body">
                            <ul class="clean">{lang_html}</ul>
                        </div>
                    </div>
                </div>

                <div class="right-col">
                    <div class="spaced-group">
                        <div class="block-title">Profile Summary</div>
                        <div class="block-body">
                            {profile_html}
                        </div>
                    </div>

                    <div class="spaced-group">
                        <div class="block-title">Education</div>
                        <div class="block-body">
                            {education_html}
                        </div>
                    </div>

                    <div class="spaced-group">
                        <div class="block-title">Experience</div>
                        <div class="block-body">
                            {internships_html}
                        </div>
                    </div>

                    <div class="spaced-group">
                        <div class="block-title">Projects</div>
                        <div class="block-body">
                            {projects_html}
                        </div>
                    </div>

                    <div class="spaced-group">
                        <div class="block-title">Activities</div>
                        <div class="block-body">
                            {activities_html}
                        </div>
                    </div>
                </div>
            </div>
        </div>
    </body>
    </html>
    """

    HTML(string=html_template).write_pdf(pdf_path)
    print("Single-page resume with ALL NESTED FORMATTING generated!")


def run_resume_optimizer(resume_path):
    """Run the full pipeline: extract text, rewrite bullets, render the PDF.

    Intermediate and final files are written next to ``resume_path`` using its
    name without the extension: ``<stem>.md``, ``<stem>_optimized.md`` and
    ``<stem>_final.pdf``.
    """
    stem, _ = os.path.splitext(resume_path)

    md_resume = f"{stem}.md"
    optimized_md = f"{stem}_optimized.md"
    final_pdf = f"{stem}_final.pdf"

    resume_to_markdown(resume_path, md_resume)
    optimize_resume(md_resume, optimized_md)
    markdown_to_pdf(optimized_md, final_pdf)

    print("✓ TOP-PERFORMER RESUME READY")
    print(f"✓ Final PDF: {final_pdf}")


# Script entry point: runs the whole pipeline on one hard-coded sample resume.
# NOTE(review): "/content/..." looks like a Colab upload location — change the
# path when running anywhere else.
if __name__ == "__main__":
    run_resume_optimizer("/content/Resume-Sample-1-Software-Engineer.pdf")


Collecting pdfplumber
  Downloading pdfplumber-0.11.8-py3-none-any.whl.metadata (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting mammoth
  Downloading mammoth-1.11.0-py2.py3-none-any.whl.metadata (26 kB)
Collecting markdownify
  Downloading markdownify-1.2.2-py3-none-any.whl.metadata (9.9 kB)
Collecting weasyprint
  Downloading weasyprint-67.0-py3-none-any.whl.metadata (3.7 kB)
Collecting pdfminer.six==20251107 (from pdfplumber)
  Downloading pdfminer_six-20251107-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-5.2.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (67 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.8/67.8 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
Collecting cobbl

DEBUG:fontTools.ttLib.ttFont:Reading 'maxp' table from disk
DEBUG:fontTools.ttLib.ttFont:Decompiling 'maxp' table
DEBUG:fontTools.subset.timer:Took 0.004s to load 'maxp'
DEBUG:fontTools.subset.timer:Took 0.000s to prune 'maxp'
INFO:fontTools.subset:maxp pruned
DEBUG:fontTools.ttLib.ttFont:Reading 'cmap' table from disk
DEBUG:fontTools.ttLib.ttFont:Decompiling 'cmap' table
DEBUG:fontTools.ttLib.ttFont:Reading 'post' table from disk
DEBUG:fontTools.ttLib.ttFont:Decompiling 'post' table
DEBUG:fontTools.subset.timer:Took 0.005s to load 'cmap'
DEBUG:fontTools.subset.timer:Took 0.000s to prune 'cmap'
INFO:fontTools.subset:cmap pruned
INFO:fontTools.subset:fpgm dropped
INFO:fontTools.subset:prep dropped
INFO:fontTools.subset:cvt  dropped
INFO:fontTools.subset:kern dropped
DEBUG:fontTools.subset.timer:Took 0.000s to load 'post'
DEBUG:fontTools.subset.timer:Took 0.000s to prune 'post'
INFO:fontTools.subset:post pruned
INFO:fontTools.subset:GPOS dropped
INFO:fontTools.subset:GSUB dropped
DEBUG:f

Single-page resume with ALL NESTED FORMATTING generated!
✓ TOP-PERFORMER RESUME READY
✓ Final PDF: /content/Resume-Sample-1-Software-Engineer_final.pdf
