In [1]:
# Step 1: Install dependencies
!pip install spacy pdfplumber
!python -m spacy download en_core_web_sm

Collecting pdfplumber
  Downloading pdfplumber-0.11.7-py3-none-any.whl.metadata (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Collecting pdfminer.six==20250506 (from pdfplumber)
  Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.7-py3-none-any.whl (60 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer_six-20250506-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m32.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pypdfium2-4.30.0-py3-non

In [2]:
# Step 2: Import libraries
import spacy
import pdfplumber
import re
import json

nlp = spacy.load("en_core_web_sm")

In [3]:
# Step 3: Upload resume (.pdf or .txt)
from google.colab import files
uploaded = files.upload()

for file_name in uploaded.keys():
    if file_name.endswith(".pdf"):
        with pdfplumber.open(file_name) as pdf:
            text = "\n".join(page.extract_text() for page in pdf.pages if page.extract_text())
    elif file_name.endswith(".txt"):
        with open(file_name, 'r', encoding='utf-8') as f:
            text = f.read()
    else:
        text = ""
        print("Unsupported file type.")

Saving cloud1.0.pdf to cloud1.0.pdf


In [4]:
# Step 4: NER-based extraction
def extract_entities(text):
    doc = nlp(text)
    name = ""
    skills = []
    degree = []
    institutions = []
    work_experience = []

    for ent in doc.ents:
        if ent.label_ == "PERSON" and not name:
            name = ent.text
        elif ent.label_ == "ORG":
            institutions.append(ent.text)

    skills = re.findall(r"\b(Python|Java|SQL|Excel|Machine Learning|NLP|C\+\+|React|Git)\b", text, re.I)
    degree = re.findall(r"\b(B\.Tech|M\.Tech|BSc|MSc|Ph\.D|MBA)\b", text)
    work_experience = re.findall(r"(\d+[\+]* years? of experience)", text)

    return {
        "Name": name,
        "Skills": list(set(skills)),
        "Degree": list(set(degree)),
        "Institutions": list(set(institutions)),
        "Work Experience": list(set(work_experience))
    }

results = extract_entities(text)
print(json.dumps(results, indent=4))

{
    "Name": "Email",
    "Skills": [
        "Excel",
        "Python",
        "Java"
    ],
    "Degree": [],
    "Institutions": [
        "St.Joseph\u2019sHighSchool Narasaraopet",
        "APIGateway",
        "NAGINENI",
        "VSCode",
        "Problem-Solving",
        "TN",
        "RMSE",
        "AP",
        "AnalyticalSkills",
        "MS-Excel\n\u25cf Platforms",
        "GitHub",
        "naginenirohith.829@gmail.com"
    ],
    "Work Experience": []
}


In [5]:
# Step 5: (Optional) Export to JSON
with open("resume_extracted.json", "w") as f:
    json.dump(results, f, indent=4)