In [2]:

!apt-get update -qq
!apt-get install -y -qq poppler-utils tesseract-ocr

!pip install -q transformers datasets accelerate evaluate sentencepiece pdfplumber PyMuPDF pytesseract pillow sentence-transformers spacy

!python -m spacy download en_core_web_sm


W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Selecting previously unselected package poppler-utils.
(Reading database ... 126718 files and directories currently installed.)
Preparing to unpack .../poppler-utils_22.02.0-2ubuntu0.11_amd64.deb ...
Unpacking poppler-utils (22.02.0-2ubuntu0.11) ...
Setting up poppler-utils (22.02.0-2ubuntu0.11) ...
Processing triggers for man-db (2.10.2-1) ...
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.9/67.9 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m4.1 MB/s[0m eta [36m

In [3]:
from google.colab import drive
drive.mount('/content/drive')

import os
DATA_DIR = "/content/drive/MyDrive/resume_data"
MODEL_DIR = "/content/drive/MyDrive/resume_qa_Model"
TEXT_DIR = "/content/resume_texts"
os.makedirs(TEXT_DIR, exist_ok=True)
os.makedirs(MODEL_DIR, exist_ok=True)

print("DATA_DIR:", DATA_DIR)
print("MODEL_DIR:", MODEL_DIR)


Mounted at /content/drive
DATA_DIR: /content/drive/MyDrive/resume_data
MODEL_DIR: /content/drive/MyDrive/resume_qa_Model


In [4]:
import pdfplumber, fitz, pytesseract
from PIL import Image
import io, re

def clean_text(t: str) -> str:
    if not t:
        return ""
    t = re.sub(r'\r\n?', '\n', t)
    t = re.sub(r'\n{2,}', '\n\n', t)
    t = re.sub(r'[ \t]+', ' ', t)
    return t.strip()

def extract_text_from_pdf(pdf_path: str) -> str:
    """
    Tries multiple extractors in order: pdfplumber -> PyMuPDF -> Tesseract OCR
    Returns cleaned text.
    """
    text = ""
    # pdfplumber
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for p in pdf.pages:
                t = p.extract_text()
                if t:
                    text += t + "\n"
    except Exception as e:
        print("pdfplumber error:", e)

    #  PyMuPDF
    if len(text.split()) < 50:
        try:
            doc = fitz.open(pdf_path)
            text = ""
            for p in doc:
                text += p.get_text("text") + "\n"
        except Exception as e:
            print("PyMuPDF error:", e)

    # OCR
    if len(text.split()) < 50:
        try:
            doc = fitz.open(pdf_path)
            ocr_text = ""
            for page in doc:
                pix = page.get_pixmap()
                img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                ocr_text += pytesseract.image_to_string(img) + "\n"
            text = ocr_text
        except Exception as e:
            print("Tesseract OCR error:", e)

    return clean_text(text)


In [5]:
import os, glob
pdf_paths = sorted([p for p in glob.glob(os.path.join(DATA_DIR, "*.pdf"))])
print("Found PDFs:", len(pdf_paths))

for p in pdf_paths:
    fname = os.path.basename(p)
    out_txt = os.path.join(TEXT_DIR, fname.replace(".pdf", ".txt"))
    if os.path.exists(out_txt):
        print("Skipping (exists):", out_txt)
        continue
    txt = extract_text_from_pdf(p)
    if txt and len(txt.split())>20:
        with open(out_txt, "w", encoding="utf-8") as f:
            f.write(txt)
        print("Saved:", out_txt)
    else:
        print("Extraction produced too little text for:", fname)


Found PDFs: 16
Saved: /content/resume_texts/17045017_Aniruddha_Sharma_Resume.txt
Saved: /content/resume_texts/20045029_Ayushi_Yadav _INTERVIEW.txt
Saved: /content/resume_texts/21035033_Kana _Yadav_Maths (10).txt
Saved: /content/resume_texts/21035036_Krishna_Tripathi_KRT_web.txt
Saved: /content/resume_texts/21045065_Jyoti__BA (10).txt
Saved: /content/resume_texts/Ajitesh _Pandey_Resume_IITBHU (1).txt
Saved: /content/resume_texts/Anurag_Tripathi_CV.txt
Saved: /content/resume_texts/JYOTI_resume (4).txt
Saved: /content/resume_texts/KUMAR SOURAV - RESUME (2)_compressed (2).txt
Saved: /content/resume_texts/Lipika-Chaudhary-sde (1).txt
Saved: /content/resume_texts/Off_Campus_Resume Akansha Upadhyay .txt
Saved: /content/resume_texts/Prithivi_R.txt
Saved: /content/resume_texts/Resume202506300110.txt
Saved: /content/resume_texts/Resume_offcampus.txt
Saved: /content/resume_texts/h.txt
Saved: /content/resume_texts/personal_resume-1-2-1.txt


In [6]:
#  auto-generates simple QA pairs using regex and heuristics.
#  fine-tuning and we  label a proper SQuAD dataset manually or augment the output.

import json, re
from pathlib import Path

def auto_generate_qas(text):
    qas = []
    # email
    em = re.findall(r'[\w\.-]+@[\w\.-]+\.\w+', text)
    if em:
        qas.append({"question":"What is the candidate's email address?", "answers":[{"text": em[0], "answer_start": text.find(em[0])}], "id":"email"})

    # phone
    ph = re.findall(r'(\+?\d[\d\s\-\(\)]{7,}\d)', text)
    if ph:
        qas.append({"question":"What is the candidate's phone number?", "answers":[{"text": ph[0], "answer_start": text.find(ph[0])}], "id":"phone"})


    first_block = text.strip().split("\n")[0:2]
    cand = " ".join(first_block).strip()

    nm = re.search(r'([A-Z][a-z]+(?:\s+[A-Z][a-z]+)+)', cand)
    if nm:
        qas.append({"question":"What is the candidate's full name?", "answers":[{"text": nm.group(0), "answer_start": text.find(nm.group(0))}], "id":"name"})

    # skills line heuristic
    skills = re.search(r'(Skills|Technical Skills|Skills:)\s*[:\-]?\s*(.+)', text, re.IGNORECASE)
    if skills:
        s = skills.group(2).split('\n')[0].strip()
        qas.append({"question":"What are the candidate's skills?", "answers":[{"text": s, "answer_start": text.find(s)}], "id":"skills"})

    # education heuristic to find degree keywords
    edu = re.search(r'(B\.?A\.?|B\.?Sc|B\.?E\.|M\.?A|M\.?S|M\.?Tech|MBA|Ph\.?D)[^\n]{0,120}', text, re.IGNORECASE)
    if edu:
        ed = edu.group(0).strip()
        qas.append({"question":"What is the candidate's education?", "answers":[{"text": ed, "answer_start": text.find(ed)}], "id":"education"})

    return qas

dataset = {"data": []}
for txt_file in sorted(Path(TEXT_DIR).glob("*.txt")):
    txt = txt_file.read_text(encoding="utf-8")
    qas = auto_generate_qas(txt)
    if qas:
        dataset["data"].append({"title": txt_file.name, "paragraphs": [{"context": txt, "qas": qas}]})

out_path = "/content/resume_squad_auto.json"
with open(out_path, "w", encoding="utf-8") as f:
    json.dump(dataset, f, indent=2)
print("Saved auto-generated SQuAD JSON to:", out_path)
print("Number of examples:", len(dataset["data"]))


Saved auto-generated SQuAD JSON to: /content/resume_squad_auto.json
Number of examples: 16


In [7]:
import json
j = json.load(open("/content/resume_squad_auto.json", "r", encoding="utf-8"))
print("Examples:", len(j["data"]))
if j["data"]:
    print("Sample title:", j["data"][0]["title"])
    print("Sample QAs:", j["data"][0]["paragraphs"][0]["qas"])


Examples: 16
Sample title: 17045017_Aniruddha_Sharma_Resume.txt
Sample QAs: [{'question': "What is the candidate's email address?", 'answers': [{'text': 'aniruddha.sharma0019@gmail.com', 'answer_start': 3573}], 'id': 'email'}, {'question': "What is the candidate's phone number?", 'answers': [{'text': '9589339000', 'answer_start': 3559}], 'id': 'phone'}, {'question': "What are the candidate's skills?", 'answers': [{'text': 'Technical Skills Python, SQL, MS-Excel, Tableau', 'answer_start': 259}], 'id': 'skills'}, {'question': "What is the candidate's education?", 'answers': [{'text': 'MA', 'answer_start': 14}], 'id': 'education'}]


In [10]:
!ls /content/drive/MyDrive/resume_qa_Model


checkpoint-228	merges.txt	   special_tokens_map.json  tokenizer.json
config.json	model.safetensors  tokenizer_config.json    vocab.json


In [11]:


from transformers import pipeline
from pathlib import Path
import torch

MODEL_DIR = "/content/drive/MyDrive/resume_qa_Model"


qa_ft = pipeline(
    "question-answering",
    model=MODEL_DIR,
    tokenizer=MODEL_DIR,
    device=0 if torch.cuda.is_available() else -1
)


TEXT_DIR = "/content/resume_texts"
sample_txts = sorted([p for p in Path(TEXT_DIR).glob("*.txt")])

if sample_txts:
    ctx = sample_txts[0].read_text(encoding="utf-8")
    q = "What is the candidate's email address?"
    print("Question:", q)
    answer = qa_ft(question=q, context=ctx)
    print("\nAnswer:", answer)
else:
    print(" No sample text found to test.")


Device set to use cpu


Question: What is the candidate's email address?

Answer: {'score': 1.9964377749674895, 'start': 3573, 'end': 3603, 'answer': 'aniruddha.sharma0019@gmail.com'}
