In [7]:
import os
import re
import fitz  # PyMuPDF
import pandas as pd


In [8]:
# Lowercase skill keywords that extract_skills() searches for in resume text.
# Order is preserved exactly; the grouping comments are only a reading aid.
COMMON_SKILLS = [
    # languages and markup
    "python", "java", "c++", "sql", "javascript", "html", "css",
    # data / ML topics
    "machine learning", "data science", "deep learning",
    # web frameworks and runtimes
    "flask", "django", "react", "node.js",
    # tooling, platforms and infrastructure
    "git", "linux", "aws", "azure", "docker", "kubernetes",
    # other
    "nlp",
]

def extract_email(text):
    """Return the first email address found in ``text``, or ``None``.

    Args:
        text: Raw text to scan. ``None`` or an empty string yields ``None``.

    Returns:
        The matched address as a string, or ``None`` when nothing matches.
    """
    if not text:
        return None
    # Each domain label must follow a dot and contain at least one character,
    # so a sentence-ending "." right after the address is not captured
    # ("mail me at a@b.com." -> "a@b.com").
    match = re.search(
        r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)+",
        text,
    )
    return match.group(0) if match else None

def extract_phone(text):
    """Return the first phone-number-like substring of ``text``, or ``None``.

    The pattern accepts an optional country code (``+`` and 1-3 digits)
    followed by a 3-3-4 digit number, with optional parentheses around the
    first group and optional ``-``, ``.`` or whitespace separators.

    Args:
        text: Raw text to scan. ``None`` or an empty string yields ``None``.

    Returns:
        The matched substring exactly as it appears in ``text``, or ``None``.
    """
    if not text:
        return None
    # The (?<!\d) / (?!\d) guards stop the pattern from matching a slice of a
    # longer digit run (IDs, account numbers) and reporting it as a phone.
    match = re.search(
        r"(?<!\d)(\+?\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}(?!\d)",
        text,
    )
    return match.group(0) if match else None

def extract_name(text):
    """Guess a person's name from the first suitable line of ``text``.

    Lines are scanned top to bottom. The first line whose first two
    whitespace-separated words are both title-cased (per ``str.istitle``)
    wins, and those two words are returned joined by a single space.

    Returns:
        The two-word name string, or ``None`` if no line qualifies.
    """
    for raw_line in text.strip().split("\n"):
        tokens = raw_line.split()
        if len(tokens) < 2:
            continue  # need at least a first and a last word
        first, second = tokens[0], tokens[1]
        if first.istitle() and second.istitle():
            return f"{first} {second}"
    return None

def extract_skills(text, skills=None):
    """Return the skills from ``skills`` that are mentioned in ``text``.

    Matching is case-insensitive and anchored on token boundaries rather than
    being a plain substring test, so "java" is not reported for "javascript"
    and "git" is not reported for "digital". A trailing digit is tolerated
    ("python3", "html5"). Whitespace inside a multi-word skill matches any
    run of whitespace, so a skill wrapped across lines is still found.

    Args:
        text: Raw text to scan. ``None`` or an empty string yields ``[]``.
        skills: Optional iterable of skill strings to look for. Defaults to
            the module-level ``COMMON_SKILLS``.

    Returns:
        A list of the matching skills, without duplicates, in the order they
        appear in ``skills``.
    """
    if skills is None:
        skills = COMMON_SKILLS
    if not text:
        return []

    haystack = text.lower()
    found_skills = []
    for skill in skills:
        parts = skill.lower().split()
        if not parts or skill in found_skills:
            continue  # ignore blank entries and repeated skills
        # Escape each word ("c++", "node.js" contain regex metacharacters) and
        # allow any whitespace run between the words of a multi-word skill.
        body = r"\s+".join(re.escape(part) for part in parts)
        pattern = r"(?<![a-z0-9])" + body + r"(?![a-z])"
        if re.search(pattern, haystack):
            found_skills.append(skill)
    return found_skills

def parse_resume(text):
    """Run every field extractor over ``text`` and collect the results.

    Returns:
        A dict with the keys ``"name"``, ``"email"``, ``"phone"`` and
        ``"skills"``, each holding the value returned by the matching
        ``extract_*`` function.
    """
    name = extract_name(text)
    email = extract_email(text)
    phone = extract_phone(text)
    skills = extract_skills(text)
    return dict(name=name, email=email, phone=phone, skills=skills)


In [9]:
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path``, concatenated.

    The document is opened with PyMuPDF (``fitz``) inside a ``with`` block, and
    each page's ``get_text()`` output is appended in page order with no
    separator added between pages.
    """
    with fitz.open(pdf_path) as document:
        page_texts = [page.get_text() for page in document]
    return "".join(page_texts)


In [10]:
# Parse a single resume PDF and persist the extracted fields as a one-row CSV.
pdf_path = "NayaabJindani_resume.pdf"  # Replace with your actual file name
text = extract_text_from_pdf(pdf_path)
parsed = parse_resume(text)

# pandas (pd) is already imported in the notebook's first cell, so it is not
# re-imported here; imports belong in that single top cell.
df = pd.DataFrame([parsed])
df["filename"] = pdf_path  # record which file the row was extracted from
df.to_csv("resume_extracted_data.csv", index=False)
print("✅ Parsed one resume and saved to resume_extracted_data.csv.")


✅ Parsed one resume and saved to resume_extracted_data.csv.
