In [None]:
!pip install pdfplumber spacy
!python -m spacy download en_core_web_sm


Collecting pdfplumber
  Downloading pdfplumber-0.11.2-py3-none-any.whl.metadata (40 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/40.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━[0m [32m30.7/40.1 kB[0m [31m774.2 kB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.1/40.1 kB[0m [31m727.9 kB/s[0m eta [36m0:00:00[0m
Collecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.2-py3-none-any.whl (58 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.0/58.0 kB[0m [31m3.0

In [3]:
import pdfplumber
import spacy
import json
import re

# Load the small English spaCy pipeline once, at module level.
# parse_resume() reads this global, so this line must run before it is called.
# Requires the "en_core_web_sm" model to be installed beforehand.
nlp = spacy.load("en_core_web_sm")

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as one string.

    Args:
        pdf_path: Location of the PDF; passed unchanged to ``pdfplumber.open``.

    Returns:
        str: Each page's extracted text followed by a newline, in page order.
        A page with no extractable text contributes only the newline.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() can come back as None (on some pdfplumber versions)
        # or "" for pages without a text layer, e.g. scanned images.
        # Fall back to "" so the concatenation cannot raise TypeError.
        page_chunks = [(page.extract_text() or "") + "\n" for page in pdf.pages]
    # Join once instead of growing a string with += inside the loop.
    return "".join(page_chunks)

def parse_resume(text):
    """Extract contact details and named entities from resume text.

    Args:
        text: Plain text of the resume.

    Returns:
        dict with the keys:
            "name":       text of the first PERSON entity, or "" if none is found.
            "email":      list of unique e-mail addresses, in first-seen order.
            "phone":      list of unique phone-number matches, in first-seen order.
            "education":  text of every ORG or GPE entity, in document order.
            "experience": text of every DATE entity, in document order.
            "skills":     always an empty list; nothing in this function fills it.
    """
    doc = nlp(text)
    parsed_data = {
        "name": "",
        # email/phone start as lists so the value type is the same
        # before and after extraction.
        "email": [],
        "phone": [],
        "education": [],
        "experience": [],
        "skills": []
    }

    # Bucket spaCy entities by label.
    # NOTE(review): treating every ORG/GPE as "education" and every DATE as
    # "experience" is a coarse heuristic; it is kept as-is so existing
    # consumers of these keys see the same values.
    for ent in doc.ents:
        label = ent.label_
        if label == "PERSON" and parsed_data["name"] == "":
            # Only the first PERSON entity is used as the candidate's name.
            parsed_data["name"] = ent.text
        elif label in ("ORG", "GPE"):
            parsed_data["education"].append(ent.text)
        elif label == "DATE":
            parsed_data["experience"].append(ent.text)

    # Match only address characters, so neighbouring labels and punctuation
    # ("Email:a@b.com," or "(a@b.com)") are not swallowed into the result.
    email_pattern = re.compile(r'[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}')
    phone_pattern = re.compile(r'\(?\d{3}\)?-?\s?\d{3}-?\s?\d{4}')

    # dict.fromkeys drops duplicates while keeping first-seen order, so the
    # output is deterministic (list(set(...)) returns an arbitrary order).
    parsed_data["email"] = list(dict.fromkeys(email_pattern.findall(text)))
    parsed_data["phone"] = list(dict.fromkeys(phone_pattern.findall(text)))

    return parsed_data

def resume_to_json(pdf_path):
    """Parse the resume PDF at ``pdf_path`` and return the result as JSON text.

    The PDF's text is extracted with ``extract_text_from_pdf``, turned into a
    dict by ``parse_resume``, and serialized with a 4-space indent.
    """
    resume_fields = parse_resume(extract_text_from_pdf(pdf_path))
    return json.dumps(resume_fields, indent=4)

# Example usage: parse one sample resume and print the resulting JSON.
# The path is relative, so the PDF must sit in the kernel's working directory.
# NOTE(review): the printed output contains personal contact details (name,
# e-mail, phone); clear the cell output before sharing the notebook.
if __name__ == "__main__":
    pdf_path = "Vedika_Bhat_resume.pdf"
    resume_json = resume_to_json(pdf_path)
    print(resume_json)


{
    "name": "Vedika Bhat",
    "email": [
        "vedikabhat2503@gmail.com"
    ],
    "phone": [
        "8261845212"
    ],
    "education": [
        "Vishwakarma Institute of Information Technology Pune",
        "MH",
        "Bachelor of Technology in Electronics",
        "Tele",
        "\u2022 Cumulative Grade Point",
        "CGPA",
        "Yashwantrao Mohite College Pune",
        "MH",
        "Physics, Chemistry & Mathematics",
        "Sinhgad City School",
        "Central",
        "\u2022 Secured",
        "Marathi, Mathematics, Science, Social",
        "Data Structures",
        "ML",
        "Github\n\u2022 Developed",
        "\u2022 Created",
        "GUI",
        "\u2022 Utilized",
        "Github\n\u2022 Implemented",
        "java",
        "Github\n\u2022",
        "Tkinter",
        "\u2022 Modular",
        "RC",
        "RL",
        "RLC",
        "Pandas",
        "\u2022 Created",
        "\u2022 Implemented",
        "\u2022 Created",
        "Line