# 0. IMPORT LIB

In [None]:
import os
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.docstore.document import Document as LangchainDocument
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_community.vectorstores.utils import DistanceStrategy
from sentence_transformers import SentenceTransformer, util
from langchain.text_splitter import RecursiveCharacterTextSplitter
from dateutil import parser
from typing import Optional, List, Tuple
import pandas as pd
import pdfplumber
import pytesseract
from transformers import  AutoTokenizer
from pypdf import PdfReader
from bs4 import BeautifulSoup
import re
import json
import kagglehub


# 1. IMPORT LIB & DATA SOURCE

In [None]:
# Fetch the resume dataset through kagglehub and show the local path it returns.
path = kagglehub.dataset_download("snehaanbhawal/resume-dataset")

print(f"Path to dataset files: {path}")

  from .autonotebook import tqdm as notebook_tqdm


Path to dataset files: C:\Users\huy.let3\.cache\kagglehub\datasets\snehaanbhawal\resume-dataset\versions\1


# 2. DATA PREPARATION

In [51]:
DATA_DIR = r"C:\Users\huy.let3\Desktop\resume\data\data"

CSV_PATH = r"C:\Users\huy.let3\Desktop\resume\Resume\Resume.csv"  # Path to the CSV file with resume metadata

resume_df = pd.read_csv(CSV_PATH)
print(f"Loaded metadata for {len(resume_df)} resumes")

# Keep only the INFORMATION-TECHNOLOGY rows of the metadata.
resume_df = resume_df.loc[resume_df['Category'] == 'INFORMATION-TECHNOLOGY']

# Document IDs below are strings taken from PDF file names, while the ID column
# is filtered with an integer elsewhere in this notebook. Comparing the raw
# column against a string would never match, so build a string view of it once.
resume_ids_as_text = resume_df["ID"].astype(str)

# List to store all loaded documents
all_documents = []

# Check if the data directory exists
if not os.path.exists(DATA_DIR):
    print(f"Warning: Data directory '{DATA_DIR}' not found. Please check the path.")
else:
    # Get all categories (subdirectories)
    categories = [d for d in os.listdir(DATA_DIR) if os.path.isdir(os.path.join(DATA_DIR, d))]

    for category in categories:
        category_path = os.path.join(DATA_DIR, category)
        print(f"Loading resumes from category: {category}")

        # Use DirectoryLoader to load all PDFs in the category directory
        loader = DirectoryLoader(
            category_path,
            glob="**/*.pdf",  # Load all PDFs, including in subdirectories
            loader_cls=PyPDFLoader
        )

        # Best-effort per category: a failure is reported and the loop moves on.
        try:
            docs = loader.load()
            # Add metadata: include the category and filename
            for doc in docs:
                doc.metadata["category"] = category
                filename = os.path.basename(doc.metadata["source"])
                doc.metadata["file_name"] = filename
                doc.metadata["id"] = os.path.splitext(filename)[0]  # Remove extension to get ID

                # Look up the matching CSV row (string-to-string comparison).
                resume_id = doc.metadata["id"]
                resume_info = resume_df[resume_ids_as_text == resume_id]
                if not resume_info.empty:
                    # TODO: copy the wanted CSV columns into doc.metadata
                    pass

            all_documents.extend(docs)
            print(f"  Loaded {len(docs)} resumes from {category}")
        except Exception as e:
            print(f"  Error loading documents from {category}: {e}")

print(f"Total resumes loaded: {len(all_documents)}")

Loaded metadata for 2484 resumes
Loading resumes from category: INFORMATION-TECHNOLOGY
  Loaded 247 resumes from INFORMATION-TECHNOLOGY
Total resumes loaded: 247


# 3. FROM UNSTRUCTURED TO STRUCTURED

In [None]:
def is_scanned_pdf(pdf_path):
    """Return True when no page of the PDF yields extractable text.

    Pages are checked in order with pdfplumber; checking stops at the first
    page that produces text.
    """
    with pdfplumber.open(pdf_path) as document:
        has_text_layer = any(page.extract_text() for page in document.pages)
    return not has_text_layer

def extract_text_from_scanned_pdf(pdf_path):
    """Extract text from a scanned (image-based) PDF, running OCR where needed.

    Each page is first asked for its embedded text. When a page has none, it
    is rendered to an image and passed to Tesseract through pytesseract.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages joined with newlines and stripped of
        surrounding whitespace; an empty string if nothing could be read.
    """
    page_texts = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if not page_text:
                # No text layer on this page: rasterize it and OCR the image.
                try:
                    page_image = page.to_image(resolution=300).original
                    page_text = pytesseract.image_to_string(page_image)
                except pytesseract.TesseractNotFoundError as e:
                    # Keep the previous best-effort behavior (no text for this
                    # page) instead of failing when Tesseract is not installed.
                    print(f"OCR unavailable for {pdf_path}: {e}")
                    page_text = ""
            if page_text:
                page_texts.append(page_text + "\n")
    return "".join(page_texts).strip()


def extract_text_from_pdf(pdfPath):
    """Extract the text layer of a PDF with pypdf.

    Args:
        pdfPath: Path of the PDF file to read.

    Returns:
        The text of all non-empty pages joined with newlines and stripped of
        surrounding whitespace. If reading fails, the error is printed and
        whatever was collected before the failure is returned (possibly "").
    """
    page_texts = []
    try:
        reader = PdfReader(pdfPath)
        for page in reader.pages:
            page_text = page.extract_text()
            if page_text:
                page_texts.append(page_text)
    except Exception as e:
        # Deliberately best-effort: report the problem and keep pages read so far.
        print(f"Error reading {pdfPath} using pypdf: {e}")
    return "\n".join(page_texts).strip()

def clean_text(text):
    """Flatten text to a single line with single spaces.

    Every run of whitespace (newlines, tabs, repeated spaces) is collapsed to
    one space, and leading/trailing whitespace is removed. A single
    ``replace("  ", " ")`` pass would leave longer runs only partly collapsed.

    Args:
        text: The raw text to normalize.

    Returns:
        The normalized single-line string.
    """
    return re.sub(r"\s+", " ", text).strip()


# 4. LLM MODEL (OPENAI)

In [None]:
from openai import OpenAI
from langchain.chat_models import ChatOpenAI

# Credentials come from the OPENAI_API_KEY environment variable so that no key
# is ever stored in the notebook source.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

model = ChatOpenAI(
    model="gpt-4",  # or "gpt-3.5-turbo"
    temperature=0.2,
    openai_api_key=os.environ.get("OPENAI_API_KEY"),
)

def extract_entities_with_gpt(text):
    """Ask the OpenAI Responses API to pull structured resume fields out of text.

    The resume text is embedded in a fixed instruction prompt and sent as a
    single user message to the module-level ``client``.

    Args:
        text: Resume text to analyse.

    Returns:
        The ``output_text`` of the API response (the prompt requests JSON).
    """
    instructions = f"""
    You are an AI assistant that extracts exact structured candidate information from resumes.

    Please extract exactly words the following fields and return the result in valid JSON format:

    - Name: full name of the candidate if not label NA
    - Email: valid email address need contain @ in results;if not label NA
    - Phone: phone number ;if not label NA
    - Skills: a list of technical skills in IT fields and professional/soft skills in IT Fields
    - Education: list of degrees with institution name and graduation year  
    - Experience: for each job, include:
        - Job Title
        - Company Name
        - Years Worked: Each jobs calculate by using End Year Minus Start Year and Round up with 1 decimal
        - Short Description of Responsibilities: extract exact words
    - Certifications: list of relevant certifications (if available)  ; if not label NA
    - Languages: languages the candidate can speak or write such as English or Spain or Vietnam; if not label NA 

    Text:

    {text}

    Respond in JSON format.
    """
    user_messages = [{"role": "user", "content": instructions}]
    completion = client.responses.create(model="gpt-4", input=user_messages)
    return completion.output_text


def process_pdf(pdf_path):
    """Run one PDF through text extraction, cleaning and GPT entity extraction.

    The extractor is chosen by ``is_scanned_pdf``; the extracted text is
    normalized with ``clean_text`` and passed to ``extract_entities_with_gpt``.

    Args:
        pdf_path: Path of the resume PDF.

    Returns:
        Whatever ``extract_entities_with_gpt`` returns for the cleaned text.
    """
    if is_scanned_pdf(pdf_path):
        print("Detected: Scanned PDF (image-based)")
        extractor = extract_text_from_scanned_pdf
    else:
        print("Detected: Text-based PDF (digitally generated)")
        extractor = extract_text_from_pdf

    normalized = clean_text(extractor(pdf_path))
    print("Sending to GPT for entity extraction...")
    return extract_entities_with_gpt(normalized)

In [260]:
# Run the extraction pipeline on one sample resume and parse the reply as JSON.
input_pdf = r"data\data\INFORMATION-TECHNOLOGY\33241454.pdf"
# json.loads raises if process_pdf's return value is not valid JSON.
results_llm = json.loads(process_pdf(input_pdf))
results_llm

Detected: Text-based PDF (digitally generated)
Sending to GPT for entity extraction...


{'Name': 'NA',
 'Email': 'NA',
 'Phone': 'NA',
 'Skills': ['Army',
  'Cisco',
  'counseling',
  'customer assistance',
  'database',
  'documentation',
  'Information Technology',
  'inventory',
  'IP',
  'LAN',
  'Windows 7',
  'Network',
  'policies',
  'protocols',
  'repairs',
  'Routing',
  'supervisor',
  'test equipment',
  'troubleshoot',
  'WAN'],
 'Education': [{'Degree': 'Certification (Windows 7)',
   'Institution': 'Microsoft, Fort Bragg, NC',
   'Year': '2012'},
  {'Degree': 'Certification (Security)',
   'Institution': 'Comptia, Yong San, Korea',
   'Year': '2012'},
  {'Degree': 'Distinguished Graduate Certificate, Information Technology (Network Communications) Course',
   'Institution': 'U.S. Army',
   'Year': '2009'},
  {'Degree': 'Certificate, IT Network and Cisco Routing',
   'Institution': 'IT Field Services Branch',
   'Year': '2009'},
  {'Degree': 'Associate of Science : Radiography',
   'Institution': 'Northwest Florida State College',
   'Year': '2008'},
  {'De

# 5. Ground Truth via Regex on HTML

In [None]:

def extract_resume_entities(html_string):
    """Build a ground-truth entity dictionary from a resume's HTML markup.

    Name, skills, education and experience are read from ``div`` sections
    located by their ``id`` (SECTION_NAME1, SECTION_SKLL, SECTION_EDUC,
    SECTION_EXPR). Email, phone, certifications and languages are found by
    regular expressions applied to the raw HTML string.

    Args:
        html_string: HTML source of one resume.

    Returns:
        A dict with the keys "Name", "Email", "Phone" (strings, "NA" when not
        found) and "Skills", "Education", "Experience", "Certifications",
        "Languages" (lists, empty when nothing was found). Certifications and
        languages are de-duplicated in first-seen order so repeated runs give
        the same result.
    """
    import math  # function-scope: math is not among the file's top-level imports

    soup = BeautifulSoup(html_string, "html.parser")
    extracted = {
        "Name": "NA",
        "Email": "NA",
        "Phone": "NA",
        "Skills": [],
        "Education": [],
        "Experience": [],
        "Certifications": [],
        "Languages": []
    }

    # 1. Name (using SECTION_NAME)
    name_section = soup.find("div", id=re.compile(r"SECTION_NAME1"))
    if name_section:
        name_text = name_section.get_text(separator=" ", strip=True)
        if name_text:
            extracted['Name'] = name_text

    # 2. Email
    email_match = re.search(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+", html_string)
    if email_match:
        extracted['Email'] = email_match.group()

    # 3. Phone
    phone_match = re.search(r"\b(?:\+?1[\s\-]?)?(?:\(?\d{3}\)?[\s\-]?)?\d{3}[\s\-]?\d{4}\b", html_string)
    if phone_match:
        extracted['Phone'] = phone_match.group()

    # 4. Skills
    # NOTE: "-" is one of the split characters, so hyphenated skills are split too.
    skill_sections = soup.find_all("div", id=re.compile(r"SECTION_SKLL"))
    for section in skill_sections:
        for block in section.find_all("div", class_="field"):
            text = block.get_text(separator=" ", strip=True)
            if text:
                for skill in re.split(r"[,;•\-|\n]+", text):
                    skill_clean = skill.strip()
                    if skill_clean:
                        extracted["Skills"].append(skill_clean)

    # 5. Education
    education_section = soup.find("div", id=re.compile("SECTION_EDUC"))
    if education_section:
        entries = education_section.find_all("div", class_="singlecolumn")
        for entry in entries:
            degree = entry.find("span", class_="degree")
            program = entry.find("span", class_="programline")
            year = entry.find("span", class_="jobdates")
            institution = entry.find("span", class_="companyname")

            edu_item = {
                "Degree": degree.get_text(strip=True) if degree else "NA",
                "Program": program.get_text(strip=True) if program else "NA",
                "Year": year.get_text(strip=True) if year else "NA",
                "Institution": institution.get_text(strip=True) if institution else "NA"
            }
            extracted["Education"].append(edu_item)

    # 6. Experience
    experience_section = soup.find("div", id=re.compile("SECTION_EXPR"))
    if experience_section:
        jobs = experience_section.find_all("div", class_="paragraph")
        for job in jobs:
            title = job.find("span", class_="jobtitle")
            company = job.find("span", class_="companyname")
            description = job.find("span", class_="jobline")

            # Only spans carrying a "format" attribute are treated as real dates;
            # a duration is computed only when exactly two (start, end) exist.
            dates = job.find_all("span", class_="jobdates", format=True)
            years_worked = "NA"

            if len(dates) == 2:
                try:
                    start_date = parser.parse(dates[0].get_text(strip=True), dayfirst=False)
                    end_date = parser.parse(dates[1].get_text(strip=True), dayfirst=False)
                    delta_years = (end_date - start_date).days / 365.25
                    # Round up to one decimal place.
                    years_worked = math.ceil(delta_years * 10) / 10
                except (ValueError, OverflowError, TypeError):
                    # Unparseable or incompatible dates: leave the duration unknown.
                    years_worked = "NA"

            exp_item = {
                "Job Title": title.get_text(strip=True) if title else "NA",
                "Company Name": company.get_text(strip=True) if company else "NA",
                "Years Worked": years_worked,
                "Short Description of Responsibilities": description.get_text(strip=True) if description else "NA"
            }
            extracted["Experience"].append(exp_item)

    # 7. Certifications (order-preserving de-duplication)
    cert_match = re.findall(r"(?:certified|certification|certificate)[^<\n]{0,100}", html_string, flags=re.IGNORECASE)
    extracted["Certifications"] = list(dict.fromkeys(cert.strip() for cert in cert_match))

    # 8. Languages
    # Matching ignores case, so normalize the casing before de-duplicating;
    # otherwise "English" and "english" would both be reported.
    lang_keywords = re.findall(r"\b(English|Spanish|Vietnamese|Chinese|French|German|Japanese|Korean)\b", html_string, flags=re.IGNORECASE)
    extracted["Languages"] = list(dict.fromkeys(lang.capitalize() for lang in lang_keywords))
    return extracted

In [None]:
# Build the HTML-based ground truth for resume ID 33241454 from its Resume_html column.
# .values[0] raises IndexError when the filtered frame has no row with that ID.
ground_truth = extract_resume_entities(resume_df.loc[resume_df['ID'] == 33241454]['Resume_html'].values[0])
ground_truth

{'Name': 'NA',
 'Email': 'NA',
 'Phone': 'NA',
 'Skills': ['Army',
  'Cisco',
  'counseling',
  'customer assistance',
  'database',
  'documentation',
  'Information Technology',
  'inventory',
  'IP',
  'LAN',
  'Windows 7',
  'Network',
  'personnel',
  'policies',
  'protocols',
  'repairs',
  'Routing',
  'San',
  'supervisor',
  'test equipment',
  'troubleshoot',
  'WAN'],
 'Education': [{'Degree': 'Certification, Windows 7, Microsoft, Fort Bragg, NC,',
   'Program': '',
   'Year': '2012',
   'Institution': ''},
  {'Degree': '*Certification, Security  , Comptia, Yong San, Korea,',
   'Program': '',
   'Year': '2012',
   'Institution': ''},
  {'Degree': '*Distinguished Graduate Certificate, Information Technology (Network Communications) Course',
   'Program': '',
   'Year': '2009',
   'Institution': 'U.S. Army'},
  {'Degree': 'Certificate, IT Network and Cisco Routing, IT Field Services Branch',
   'Program': '',
   'Year': '2009',
   'Institution': ''},
  {'Degree': 'Associate 

In [None]:

# Load small embedding model.
# It gets its own name: rebinding the global `model` here would replace the
# ChatOpenAI instance that retrieve_and_generate_openai calls via model.predict.
embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

def compute_similarity(text1, text2):
    """Return the cosine similarity of the sentence embeddings of two texts.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        The cosine similarity as a Python float.
    """
    embeddings = embedding_model.encode([text1, text2], convert_to_tensor=True)
    return float(util.cos_sim(embeddings[0], embeddings[1]))

# Minimum cosine similarity for a field to be reported as a match
SIMILARITY_THRESHOLD = 0.7

# Per-field verdict: True when the LLM value is close enough to the ground truth
comparison = {}
for key in ground_truth:
    # Flatten both values to lower-cased, single-line strings before embedding
    response_text, expected_text = (
        str(source.get(key, "")).replace("\n", " ").strip().lower()
        for source in (results_llm, ground_truth)
    )
    similarity = compute_similarity(response_text, expected_text)
    is_match = similarity >= SIMILARITY_THRESHOLD
    comparison[key] = is_match

    verdict = '✅ Match' if is_match else '❌ Mismatch'
    print(f"{key}: {verdict} (Similarity: {similarity:.2f})")


Name: ✅ Match (Similarity: 1.00)
Email: ✅ Match (Similarity: 1.00)
Phone: ✅ Match (Similarity: 1.00)
Skills: ✅ Match (Similarity: 1.00)
Education: ✅ Match (Similarity: 0.96)
Experience: ✅ Match (Similarity: 0.76)
Certifications: ❌ Mismatch (Similarity: 0.60)
Languages: ❌ Mismatch (Similarity: 0.26)


# 6. SELF-EVALUATION LLM (Context vs Questions)

In [None]:
# Separators handed to the splitter, in the order listed.
# NOTE(review): several entries look like regex patterns, but is_separator_regex
# is not passed to the splitter — confirm whether literal matching is intended.
MARKDOWN_SEPARATORS = [
    "\n#{1,6} ",
    "```\n",
    "\n\\*\\*\\*+\n",
    "\n---+\n",
    "\n___+\n",
    "\n\n",
    "\n",
    " ",
    "",
]

def split_documents(chunk_size: int, knowledge_base: list):
    """Chunk documents by characters, label each chunk, and drop repeated chunks.

    Every document is split with a RecursiveCharacterTextSplitter (overlap of
    one tenth of ``chunk_size``, MARKDOWN_SEPARATORS as separators). Each chunk
    is prefixed in place with ``[<label>]`` on its own line, where the label
    is characters ``[-12:-4]`` of the document's "pdf_name" metadata, falling
    back to "source" and then to "Unknown PDF".

    Args:
        chunk_size: Maximum chunk length in characters.
        knowledge_base: Documents to split.

    Returns:
        The labelled chunks in their original order, keeping only the first
        chunk for any repeated ``page_content``.
    """
    splitter = RecursiveCharacterTextSplitter(
        separators=MARKDOWN_SEPARATORS,
        chunk_size=chunk_size,
        chunk_overlap=int(chunk_size / 10),
        add_start_index=True,
        strip_whitespace=True,
    )

    unique_chunks = []
    seen_contents = set()
    for document in knowledge_base:
        label = document.metadata.get("pdf_name") or document.metadata.get("source", "Unknown PDF")
        for piece in splitter.split_documents([document]):
            piece.page_content = f"[{label[-12:-4]}]\n{piece.page_content}"
            if piece.page_content in seen_contents:
                continue
            seen_contents.add(piece.page_content)
            unique_chunks.append(piece)

    return unique_chunks


def retrieve_and_generate_openai(id: str, query: str) -> str:
    """Ask the chat model to compare a resume's text with an extraction result.

    The documents of the requested resume are selected from ``all_documents``
    and only those are chunked, rather than re-splitting the whole corpus on
    every call. The chunks form the context of a prompt that asks the model to
    score each field against ``query``.

    Args:
        id: Resume ID, as stored in each document's "id" metadata.
        query: The extraction result to compare against the context; it is
            interpolated into the prompt as text.

    Returns:
        The model's answer with surrounding whitespace removed.

    Raises:
        ValueError: If no loaded document has the given ID. An empty context
            would make the model's comparison meaningless.
    """
    # Step 1: Keep only this resume's documents, then chunk them
    matching_documents = [doc for doc in all_documents if doc.metadata.get("id") == id]
    if not matching_documents:
        raise ValueError(f"No loaded documents found for resume id {id!r}")
    docs = split_documents(512, matching_documents)

    # Step 2: Build the prompt
    context = "\n".join([doc.page_content for doc in docs])
    prompt = f"""You are a helpful assistant.
    Context:
    {context}
      Pls extract these information from Context:
    - Name: full name of the candidate if not label NA
    - Email: valid email address need contain @ in results;if not label NA
    - Phone: phone number ;if not label NA
    - Skills: a list of technical skills in IT fields and professional/soft skills in IT Fields
    - Education: list of degrees with institution name and graduation year  
    - Experience: for each job, include:
        - Job Title
        - Company Name
        - Years Worked (end year minus start year)
        - Short Description of Responsibilities: extract exact words
    - Certifications: list of relevant certifications (if available)  ; if not label NA
    - Languages: languages the candidate can speak or write such as English or Spain or Vietnam; if not label NA 

    COMPARE CONTEXT TO {query}
    
    Question: Return string similarity metrics score each fields between context and query

    Answer:"""

    # Step 3: Call OpenAI
    response = model.predict(prompt)
    return response.strip()

In [None]:
# Split the loaded documents into chunks (adjust chunk_size as needed)
# NOTE: retrieve_and_generate_openai calls split_documents itself, so it does not read this variable.
docs_processed = split_documents(512, all_documents)
# The parsed LLM extraction is the comparison target interpolated into the prompt.
query = results_llm
print(retrieve_and_generate_openai("33241454",query))

Total chunks after splitting: 1930
The string similarity metrics score for each field between the context and the query is as follows:

- Name: 100% (Both are "NA")
- Email: 100% (Both are "NA")
- Phone: 100% (Both are "NA")
- Skills: 100% (Both lists contain the same skills)
- Education: 100% (Both lists contain the same education details)
- Experience: 100% (Both lists contain the same job titles, company names, years worked, and responsibilities)
- Certifications: 100% (Both lists contain the same certifications)
- Languages: 100% (Both are "NA") 

Please note that these scores are based on exact string matches. Any slight variation in wording, punctuation, or order would result in a lower score.
