In [56]:
import json
import os
import re
import textwrap
from datetime import datetime

import fitz
import google.generativeai as genai

# Resume PDF that is parsed when this file is executed as a script.
PDF_PATH = "/Users/harshpathak/Documents/Personal/Navtech_Python_with_ML_Coding_Assessment/sample_resume.pdf"
# Prefer a key supplied through the GOOGLE_API_KEY environment variable so a
# real secret never has to be pasted into (and committed with) the source.
# The literal is only a placeholder fallback and is not a working key.
GENAI_API_KEY = os.environ.get("GOOGLE_API_KEY", "IUsedMine")
genai.configure(api_key=GENAI_API_KEY)
# Shared model handle used by extract_resume_data().
model = genai.GenerativeModel("gemini-2.0-flash")

def extract_text_from_pdf(file_path: str) -> str:
    """Return the concatenated text of every page in the PDF at *file_path*.

    Page texts are joined without any separator and the combined string is
    stripped of leading and trailing whitespace.
    """
    with fitz.open(file_path) as pdf:
        # Collect the text while the document is still open.
        page_texts = [page.get_text() for page in pdf]
    combined = "".join(page_texts)
    return combined.strip()

def extract_resume_data(text: str) -> dict:
    """Ask the generative model to convert raw resume text into a dict.

    Args:
        text: Plain text of the resume (for example the output of
            ``extract_text_from_pdf``).

    Returns:
        The JSON object produced by the model, decoded into a ``dict``.
        An empty dict is returned when the reply is not valid JSON or when
        the decoded JSON is not an object.
    """
    # Dedent the template *before* substituting the resume. Interpolating
    # first (as an f-string) lets any unindented resume line reduce the common
    # margin to zero, which silently turns textwrap.dedent into a no-op.
    template = textwrap.dedent("""
        You are a resume parser. From the following resume text, extract and return JSON with the following fields:

        - first_name
        - last_name
        - email
        - phone
        - address (with city, state, country)
        - summary
        - education_history (list of {{ name, degree, from_date, to_date }} in yyyy-mm-dd format)
        - work_history (list of {{ company, title, description, from_date, to_date }} in yyyy-mm-dd format)
        - skills (list of skills)

        Dates must be in 'yyyy-mm-dd' format. Use 'null' if date is missing or marked 'Present'. Here's the resume:
        ```
        {text}
        ```
    """)
    prompt = template.format(text=text)

    response = model.generate_content(prompt)
    raw = response.text.strip()

    # Unwrap an optional Markdown code fence (``` or ```json). Only the fence
    # tag is removed; a blind replace("json", "") would also delete the first
    # "json" inside the payload when the reply carries no fence tag.
    fenced = re.search(r"```(?:json)?\s*(.*)```", raw, re.DOTALL | re.IGNORECASE)
    payload = fenced.group(1).strip() if fenced else raw

    try:
        parsed = json.loads(payload)
    except json.JSONDecodeError:
        # Best effort: an unparsable reply yields an empty result.
        return {}

    # Callers treat the result as a mapping, so reject lists/scalars.
    return parsed if isinstance(parsed, dict) else {}

def normalize_date(date_str):
    """Convert a loosely formatted date string to ``dd-mm-YYYY``.

    Args:
        date_str: Date text such as ``"2019-05-01"``, ``"May 2019"``,
            ``"05/2019"`` or ``"2019"``. The word "present" is ignored.

    Returns:
        The date formatted as ``dd-mm-YYYY``. When only a year can be
        recognised the result is ``01-01-<year>``. A single space (``" "``)
        is returned for missing, non-string or unrecognisable input.
    """
    if not date_str or not isinstance(date_str, str):
        return " "

    cleaned = date_str.strip().lower().replace("present", "").replace("–", "-")
    # Treat "to" as a range separator only when it is a whole word; a plain
    # substring replace would corrupt month names such as "october".
    cleaned = re.sub(r"\bto\b", "-", cleaned)
    # Collapse spacing around hyphens, then drop separators left dangling
    # after "present" was removed (e.g. "mar 2020 - present").
    cleaned = re.sub(r"\s*-\s*", "-", cleaned).strip(" -")

    formats = (
        "%Y-%m-%d", "%d-%m-%Y", "%Y/%m/%d", "%d/%m/%Y",
        "%B %Y", "%b %Y", "%m/%Y", "%Y-%m", "%Y",
    )

    for fmt in formats:
        try:
            return datetime.strptime(cleaned, fmt).strftime("%d-%m-%Y")
        except ValueError:
            # strptime signals a format mismatch with ValueError; try the next.
            continue

    # Fallback: pick the first four-digit year (19xx or 20xx) in the text.
    match = re.search(r"((?:19|20)\d{2})", cleaned)
    return f"01-01-{match.group(1)}" if match else " "

def _lower_text(value):
    """Return *value* lower-cased as a string; ``None`` becomes ``""``."""
    if value is None:
        return ""
    return str(value).lower()


def post_process(data):
    """Normalise the parsed resume dict in place and return it.

    Name, email, skill, school, degree, company and title strings are
    lower-cased, each skill is wrapped as ``{"skill": <name>}``, and the
    education/work dates are passed through ``normalize_date``. Fields the
    parser reported as ``null`` are handled as if they were missing instead
    of raising.

    Args:
        data: Dict produced by the resume parser; it is mutated.

    Returns:
        The same ``data`` object after normalisation.
    """
    if "address" in data and isinstance(data["address"], dict):
        # Only string parts are lower-cased; null parts are left untouched.
        data["address"] = {
            k: v.lower() if isinstance(v, str) else v
            for k, v in data["address"].items()
        }

    data["first_name"] = _lower_text(data.get("first_name"))
    data["last_name"] = _lower_text(data.get("last_name"))
    data["email"] = _lower_text(data.get("email"))
    data["phone"] = data.get("phone") or ""

    if "skills" in data:
        # A null skills list becomes empty; null entries are dropped.
        data["skills"] = [
            {"skill": _lower_text(s)}
            for s in (data["skills"] or [])
            if s is not None
        ]

    for edu in data.get("education_history") or []:
        if not isinstance(edu, dict):
            continue
        # Keep only the part of the institution name before the first comma.
        edu["name"] = _lower_text(edu.get("name")).split(",")[0].strip()
        edu["degree"] = _lower_text(edu.get("degree"))
        edu["from_date"] = normalize_date(edu.get("from_date", ""))
        edu["to_date"] = normalize_date(edu.get("to_date", ""))

    for job in data.get("work_history") or []:
        if not isinstance(job, dict):
            continue
        job["company"] = _lower_text(job.get("company"))
        job["title"] = _lower_text(job.get("title"))
        job["from_date"] = normalize_date(job.get("from_date", ""))
        job["to_date"] = normalize_date(job.get("to_date", ""))

    return data

# Script entry point: parse the resume at PDF_PATH and print the result.
if __name__ == "__main__":
    # 1) Pull the raw text out of the PDF.
    resume_text = extract_text_from_pdf(PDF_PATH)
    # 2) Ask the model for structured fields (an empty dict if parsing failed).
    parsed_data = extract_resume_data(resume_text)
    # 3) Lower-case the text fields and normalise the dates.
    processed_resume = post_process(parsed_data)
    # 4) Emit the final record as indented JSON on stdout.
    print(json.dumps(processed_resume, indent=2))


{
  "first_name": "vijay",
  "last_name": "pagare",
  "email": "xyz@gmail.com",
  "phone": "+91889XXXXX28",
  "address": {
    "city": "thane",
    "state": "mh",
    "country": "ind"
  },
  "summary": "A frontend-leaning software engineer who has 4.5+ years of experience in building and maintaining high-quality (B2B) saas products and web applications. Proven ability to work independently and as part of a team in fast-moving, resource-constraint environments where short turnaround times are a norm. Exceptional at leveraging interpersonal skills to facilitate a collaborative relationship among cross-functional teams to get the work done. Excellent problem-solver with an aptitude for troubleshooting and the ability to quickly master new skills, technology, or a role.",
  "education_history": [
    {
      "name": "rajiv gandhi institute of technology",
      "degree": "bachelor of engineering - computers",
      "from_date": " ",
      "to_date": "01-05-2019"
    }
  ],
  "work_history"