#HUY THANH LE SOLUTION - SOURCE DOE

# 0. IMPORT LIB

In [None]:
import os
import random 
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.docstore.document import Document as LangchainDocument
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_community.vectorstores.utils import DistanceStrategy
from sentence_transformers import SentenceTransformer, util
from langchain.text_splitter import RecursiveCharacterTextSplitter
from dateutil import parser
from typing import Optional, List, Tuple
import pandas as pd
import pdfplumber
import pytesseract
from transformers import  AutoTokenizer
from pypdf import PdfReader
from bs4 import BeautifulSoup
import re
import json
import kagglehub
from pdf2image import convert_from_path
import pytesseract
import html
import math
from dateutil import parser

# 1. IMPORT LIB & DATA SOUCE

In [None]:
path = kagglehub.dataset_download("snehaanbhawal/resume-dataset")

print("Path to dataset files:", path)

# 2. DATA PREPARATION

In [None]:
DATA_DIR = r"\resume\data\data\INFORMATION-TECHNOLOGY"

CSV_PATH = r"\resume\Resume\Resume.csv"  # Path to the CSV file with resume metadata

resume_df = pd.read_csv(CSV_PATH)

resume_df = resume_df.loc[resume_df['Category'] == 'INFORMATION-TECHNOLOGY']

pdf_files = [f for f in os.listdir(DATA_DIR) if f.lower().endswith('.pdf')]

print(f"Loaded CSV meta data for {len(resume_df)} INFORMATION-TECHNOLOGY resumes.")

print(f"Found {len(pdf_files)} PDF files in {DATA_DIR}.")


Loaded CSV meta data for 120 INFORMATION-TECHNOLOGY resumes.
Found 120 PDF files in C:\Users\huy.let3\Desktop\resume\data\data\INFORMATION-TECHNOLOGY.


# 3. FROM UNSTRUCTURE TO STRUCTURE

In [397]:
def is_scanned_pdf(pdf_path):
    """Check if the PDF is scanned by verifying if text can be extracted."""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            if page.extract_text():
                return False
    return True

def extract_text_from_digital_pdf(pdfPath): #digital pdf
    text = ""
    try:
        reader = PdfReader(pdfPath)
        for page in reader.pages:
            pageText = page.extract_text()
            if pageText:
                text += pageText + "\n"
    except Exception as e:
        print(f"Error reading {pdfPath} using PyPDF2: {e}")
    return text.strip()


def extract_text_from_scanned_pdf(pdf_path):
    text = ""
    try:
        # Convert PDF to a list of images (one per page)
        images = convert_from_path(pdf_path)
        for i, image in enumerate(images):
            page_text = pytesseract.image_to_string(image)
            if page_text:
                text += page_text + "\n"
    except Exception as e:
        print(f"Error processing {pdf_path} with OCR: {e}")
    return text.strip()


def clean_text(text):
    return text.replace("\n", " ").replace("  ", " ").strip()


def process_pdf(pdf_path):  #Split Flow digital (PRD Reader) or scan (OCR Engine)
    if is_scanned_pdf(pdf_path):
        print("Detected: Scanned PDF (image-based)")
        raw_text = extract_text_from_scanned_pdf(pdf_path)
    else:
        print("Detected: Text-based PDF (digitally generated)")
        raw_text = extract_text_from_digital_pdf(pdf_path)

    cleaned = clean_text(raw_text)
    return cleaned


# 4. LLM MODEL (OPENAI)

In [None]:
from openai import OpenAI
from langchain.chat_models import ChatOpenAI

client = OpenAI(api_key = "YOUR KEY")

def extract_entities_with_gpt(text):
    prompt = f"""
    
    You are an AI assistant that extracts exact structured candidate information from resumes.

    Please extract exactly words the following fields and return the result in valid JSON format:

    - Name: full name of the candidate if not label NA
    - Email: valid email address need contain @ in results;if not label NA
    - Phone: phone number ;if not label NA
    - Skills: a list of technical skills in IT fields and professional/soft skills in IT Fields
    - Education: list of degrees with institution name and graduation year  
    - Experience: for each job, include:
        - Job Title
        - Company Name
        - Years Worked: Each Jobs, calculate by using end year minus start year and Round up with 1 decimal 
        - Short Description of Responsibilities: extract exact words
    - Certifications: list of relevant certifications (if available)  ; if not label NA
    - Languages: languages the candidate can speak or write such as English or Spain or Vietnam; if not label NA 

    Text:

    {text}

    Respond in JSON format.
    """
    response = client.responses.create(
        model="gpt-4",
        input=[
        {"role": "user", "content": prompt}]

    )
    extract = response.output_text
    extracted_entities = json.loads(extract)
    return extracted_entities


In [None]:
structure_data = process_pdf(r"data\data\INFORMATION-TECHNOLOGY\33241454.pdf")
results_llm = extract_entities_with_gpt(structure_data)
results_llm

Detected: Text-based PDF (digitally generated)
Sending to GPT for entity extraction...


{'Name': 'NA',
 'Email': 'NA',
 'Phone': 'NA',
 'Skills': ['Excellent communication techniques',
  'Manufacturing systems integration',
  'Multidisciplinary exposure',
  'Design instruction creation',
  'Project management',
  'Complex problem solver',
  'Advanced critical thinking',
  'SharePoint',
  'Microsoft Excel, Project and Visio',
  'LAN/WAN protocols',
  'Army',
  'Cisco',
  'counseling',
  'customer assistance',
  'database',
  'documentation',
  'Information Technology',
  'inventory',
  'IP',
  'LAN',
  'Windows 7',
  'Network',
  'personnel',
  'policies',
  'protocols',
  'repairs',
  'Routing',
  'San',
  'supervisor',
  'test equipment',
  'troubleshoot',
  'WAN'],
 'Education': [{'Degree': 'Certification, Windows 7',
   'Institution': 'Microsoft, Fort Bragg, NC',
   'Year': '2012'},
  {'Degree': 'Certification, Security',
   'Institution': 'Comptia, Yong San, Korea',
   'Year': '2012'},
  {'Degree': 'Distinguished Graduate Certificate, Information Technology (Network C

# 5.Grouth Truth Via with REGEX HTML

In [433]:
def clean_text(text):
    if not isinstance(text, str):
        return "NA"
    text = html.unescape(text)
    text = text.replace("\xa0", " ")
    text = re.sub(r"<[^>]+>", "", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()

def extract_resume_entities(html_string):  ### BASED ON SECTION_*
    soup = BeautifulSoup(html_string, "html.parser")
    extracted = {
        "Name": "NA",
        "Email": "NA",
        "Phone": "NA",
        "Skills": [],
        "Education": [],
        "Experience": [],
        "Certifications": [],
        "Languages": []
    }

    # 1. Name
    name_section = soup.find("div", id=re.compile(r"SECTION_NAME1"))
    if name_section:
        name_text = clean_text(name_section.get_text(separator=" ", strip=True))
        if name_text:
            extracted['Name'] = name_text

    # 2. Email
    email_match = re.search(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+", html_string)
    if email_match:
        extracted['Email'] = email_match.group()

    # 3. Phone
    phone_match = re.search(r"\b(?:\+?1[\s\-]?)?(?:\(?\d{3}\)?[\s\-]?)?\d{3}[\s\-]?\d{4}\b", html_string)
    if phone_match:
        extracted['Phone'] = phone_match.group()

    # 4. Skills
    skill_sections = soup.find_all("div", id=re.compile(r"SECTION_SKLL"))
    for section in skill_sections:
        for block in section.find_all("div", class_="field"):
            text = clean_text(block.get_text(separator=" ", strip=True))
            if text:
                for skill in re.split(r"[,;•\-|\n]+", text):
                    skill_clean = clean_text(skill)
                    if skill_clean and skill_clean != "NA":
                        extracted["Skills"].append(skill_clean)

    # 5. Education
    education_section = soup.find("div", id=re.compile("SECTION_EDUC"))
    if education_section:
        entries = education_section.find_all("div", class_="singlecolumn")
        for entry in entries:
            degree = clean_text(entry.find("span", class_="degree").get_text()) if entry.find("span", class_="degree") else "NA"
            program = clean_text(entry.find("span", class_="programline").get_text()) if entry.find("span", class_="programline") else "NA"
            year = clean_text(entry.find("span", class_="jobdates").get_text()) if entry.find("span", class_="jobdates") else "NA"
            institution = clean_text(entry.find("span", class_="companyname").get_text()) if entry.find("span", class_="companyname") else "NA"

            edu_item = {
                "Degree": degree,
                "Program": program,
                "Year": year,
                "Institution": institution
            }
            extracted["Education"].append(edu_item)

    # 6. Experience
    experience_section = soup.find("div", id=re.compile("SECTION_EXPR"))
    if experience_section:
        jobs = experience_section.find_all("div", class_="paragraph")
        for job in jobs:
            title = clean_text(job.find("span", class_="jobtitle").get_text()) if job.find("span", class_="jobtitle") else "NA"
            company = clean_text(job.find("span", class_="companyname").get_text()) if job.find("span", class_="companyname") else "NA"
            description = clean_text(job.find("span", class_="jobline").get_text()) if job.find("span", class_="jobline") else "NA"

            # Extract 2 dates
            years_worked = "NA"
            dates = job.find_all("span", class_="jobdates", format=True)
            if len(dates) == 2:
                try:
                    start_date = parser.parse(clean_text(dates[0].get_text()), dayfirst=False)
                    end_date = parser.parse(clean_text(dates[1].get_text()), dayfirst=False)
                    delta_years = (end_date - start_date).days / 365.25
                    years_worked = math.ceil(delta_years * 10) / 10
                except:
                    years_worked = "NA"

            exp_item = {
                "Job Title": title,
                "Company Name": company,
                "Years Worked": years_worked,
                "Short Description of Responsibilities": description
            }
            extracted["Experience"].append(exp_item)

    # 7. Certifications
    cert_match = re.findall(r"(?:certified|certification|certificate)[^<\n]{0,100}", html_string, flags=re.IGNORECASE)
    if cert_match:
        certs_clean = [clean_text(c) for c in cert_match]
        extracted["Certifications"] = list(set(filter(lambda x: x != "NA", certs_clean)))

    # 8. Languages
    lang_keywords = re.findall(r"\b(English|Spanish|Vietnamese|Chinese|French|German|Japanese|Korean)\b", html_string, flags=re.IGNORECASE)
    if lang_keywords:
        extracted["Languages"] = list(set([clean_text(lang) for lang in lang_keywords]))
    else:
        extracted["Languages"] = "NA"

    return extracted


In [None]:
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
def compute_similarity(text1, text2):
    embeddings = model.encode([text1, text2], convert_to_tensor=True)
    return float(util.cos_sim(embeddings[0], embeddings[1]))

SIMILARITY_THRESHOLD = 0.7

ground_truth = extract_resume_entities(resume_df.loc[resume_df['ID'] == 16899268]['Resume_html'].values[0])

comparison = {}
for key in ground_truth:
    response_text = str(results_llm.get(key, "")).replace("\n", " ").strip().lower()
    expected_text = str(ground_truth.get(key, "")).replace("\n", " ").strip().lower()
    similarity = compute_similarity(response_text, expected_text)
    comparison[key] = similarity >= SIMILARITY_THRESHOLD
    print(f"{key}: {'✅ Match' if comparison[key] else '❌ Mismatch'} (Similarity: {similarity:.2f})")


Name: ✅ Match (Similarity: 1.00)
Email: ✅ Match (Similarity: 1.00)
Phone: ✅ Match (Similarity: 1.00)
Skills: ❌ Mismatch (Similarity: 0.05)
Education: ✅ Match (Similarity: 0.87)
Experience: ✅ Match (Similarity: 0.84)
Certifications: ❌ Mismatch (Similarity: 0.29)
Languages: ✅ Match (Similarity: 1.00)


In [527]:
comparison_records = [] 
similarity_records = [] 

sample_files = random.sample(pdf_files, min(3, len(pdf_files)))

model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

for x in sample_files:
    structure_data = process_pdf(r"data\data\INFORMATION-TECHNOLOGY" + "\\" + x)
    results_llm = extract_entities_with_gpt(structure_data)  # Uncomment when GPT is used
    id_csv = int(x[:-4])    
    ground_truth = extract_resume_entities(resume_df.loc[resume_df['ID'] == id_csv]['Resume_html'].values[0])
    comparison = {}
    similarity_row = {}
    print("-"*50)
    print("TESTING ID", id_csv)
    for key in ground_truth:
        response_text = str(results_llm.get(key, "")).replace("\n", " ").strip().lower()
        expected_text = str(ground_truth.get(key, "")).replace("\n", " ").strip().lower()
        similarity = compute_similarity(response_text, expected_text)
        comparison[key] = similarity >= SIMILARITY_THRESHOLD
        similarity_row[key] = similarity
    comparison["filename"] = x
    similarity_row["filename"] = x
    comparison_records.append(comparison)
    similarity_records.append(similarity_row)
comparison_df = pd.DataFrame(comparison_records)
similarity_df = pd.DataFrame(similarity_records)
avg_similarities = similarity_df.drop(columns=['filename']).mean()
print("\n🔍 Average Similarities per Field:")
print(avg_similarities)

Detected: Text-based PDF (digitally generated)
--------------------------------------------------
TESTING ID 27058381
Detected: Text-based PDF (digitally generated)
--------------------------------------------------
TESTING ID 15791766
Detected: Text-based PDF (digitally generated)
--------------------------------------------------
TESTING ID 66832845

🔍 Average Similarities per Field:
Name              0.715924
Email             1.000000
Phone             0.723672
Skills            0.860838
Education         0.722207
Experience        0.700774
Certifications    0.387761
Languages         1.000000
dtype: float64


# 6. SELF-EVALUATION LLM (Context vs Questions)

In [None]:
model = ChatOpenAI(
    model="gpt-4",  # or "gpt-3.5-turbo"
    temperature=0.5,
    openai_api_key= 'YOUR KEY' )

DATA_DIR = r"\resume\data\data\INFORMATION-TECHNOLOGY"

MARKDOWN_SEPARATORS = [
    "\n#{1,6} ", "```\n", "\n\\*\\*\\*+\n", "\n---+\n", "\n___+\n", "\n\n", "\n", " ", ""
]

def split_documents(chunk_size: int, knowledge_base: list):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=int(chunk_size / 10),
        add_start_index=True,
        strip_whitespace=True,
        separators=MARKDOWN_SEPARATORS,
    )

    docs_processed = []
    for doc in knowledge_base:
        pdf_name = doc.metadata.get("pdf_name") or doc.metadata.get("source", "Unknown PDF")
        chunks = text_splitter.split_documents([doc])
        for chunk in chunks:
            chunk.page_content = f"[{pdf_name[-12:-4]}]\n{chunk.page_content}"
            docs_processed.append(chunk)

    # Remove duplicates by content
    seen = set()
    return [doc for doc in docs_processed if doc.page_content not in seen and not seen.add(doc.page_content)]

all_documents = []

for filename in os.listdir(DATA_DIR):
    if filename.endswith(".pdf"):
        loader = PyPDFLoader(os.path.join(DATA_DIR, filename))
        docs = loader.load()
        for doc in docs:
            doc.metadata["pdf_name"] = filename
            doc.metadata["id"] = filename[:-4]  # e.g., 123.pdf -> id = '123'
            all_documents.append(doc)

docs_processed = split_documents(512, all_documents)

In [531]:
def retrieve_and_generate_openai(id: str, query: str, docs_processed) -> str:
    docs = [doc for doc in docs_processed if doc.metadata.get("id") == id]
    context = "\n".join([doc.page_content for doc in docs])
    prompt = f"""You are a helpful assistant.
    Context: {context}

    Pls extract these information from Context HTML:
    - Name: full name of the candidate; if not, label NA
    - Email: valid email address (must contain @); if not, label NA
    - Phone: phone number; if not, label NA
    - Skills: list of technical and soft skills relevant to IT
    - Education: list of degrees with institution name and graduation year  
    - Experience: for each job, include:
        - Job Title
        - Company Name
        - Years Worked: end year - start year, round up with 1 decimal
        - Short Description of Responsibilities (exact wording)
    
    - Certifications: list of relevant certifications; if none, label NA

    - Languages: list of languages spoken or written (e.g., English, Spanish, Vietnamese); if none, label NA

    Pls return only similarity score float from 0 to 1 each fields between query: {query}, and context as json files 

    Answer:
    """

    response = model.predict(prompt)
    return response.strip()


In [532]:
for x in sample_files:
    structure_data = process_pdf(r"data\data\INFORMATION-TECHNOLOGY" + "\\" + x)
    results_llm = extract_entities_with_gpt(structure_data)  # Uncomment when GPT is used
    print("-"*50)
    print("TESTING ID", id_csv)
    output = retrieve_and_generate_openai(x[:-4],results_llm,docs_processed)
    print(output)

Detected: Text-based PDF (digitally generated)
--------------------------------------------------
TESTING ID 66832845
{'Name': 0.0, 'Email': 0.0, 'Phone': 0.0, 'Skills': 1.0, 'Education': 1.0, 'Experience': 1.0, 'Certifications': 1.0, 'Languages': 1.0}
Detected: Text-based PDF (digitally generated)
--------------------------------------------------
TESTING ID 66832845
{'Name': 0.0, 'Email': 0.0, 'Phone': 0.0, 'Skills': 1.0, 'Education': 1.0, 'Experience': 1.0, 'Certifications': 1.0, 'Languages': 1.0}
Detected: Text-based PDF (digitally generated)
--------------------------------------------------
TESTING ID 66832845
{'Name': 0.0, 'Email': 0.0, 'Phone': 0.0, 'Skills': 1.0, 'Education': 1.0, 'Experience': 0.7, 'Certifications': 0.0, 'Languages': 0.0}
