In [3]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from bs4 import BeautifulSoup
import time, json, re, random, requests
from typing import List, Optional
from pydantic import BaseModel


In [4]:
def scrape_resume_links(category_url, max_pages=200):
    """Scrape resume links from a given category (multiple pages)."""
    resume_links = []
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/116.0 Safari/537.36"
    }

    session = requests.Session()
    session.headers.update(headers)

    for page in range(1, max_pages + 1):
        url = f"{category_url}/page/{page}"
        print(f"[+] Scraping {url}")

        r = session.get(url)
        if r.status_code != 200:
            print(f"[-] Page {page} returned {r.status_code}, stopping.")
            break

        soup = BeautifulSoup(r.text, "html.parser")
        links = [a['href'] for a in soup.select("table.hit-table h4 a")]

        if not links:  # No more resumes
            print(f"[-] No resumes found on page {page}, stopping.")
            break

        resume_links.extend(links)

        # small random delay (just to avoid hammering)
        time.sleep(random.uniform(0.3, 0.7))

    return resume_links

In [5]:
categories = [
    "https://www.hireitpeople.com/resume-database/70-oracle-developers-resumes",
    "https://www.hireitpeople.com/resume-database/71-sap-resumes",
    "https://www.hireitpeople.com/resume-database/77-oracle-resumes",
    "https://www.hireitpeople.com/resume-database/78-oracle-dba-resumes",
]

categories = [
    "https://www.hireitpeople.com/resume-database/70-oracle-developers-resumes",
]


num_pages = 500    # each page contains 10 resumes

In [6]:
all_links = {}

with ThreadPoolExecutor(max_workers=len(categories)) as executor:
    futures = {executor.submit(scrape_resume_links, cat_url, max_pages=500): cat_url for cat_url in categories}
    for future in as_completed(futures):
        cat_url = futures[future]
        try:
            links = future.result()
            all_links[cat_url] = links
            print(f"[✓] Collected {len(links)} resumes from {cat_url}")
        except Exception as e:
            print(f"[!] Failed to scrape {cat_url}: {e}")

# flatten list
all_resume_links = [link for links in all_links.values() for link in links]

[+] Scraping https://www.hireitpeople.com/resume-database/70-oracle-developers-resumes/page/1
[+] Scraping https://www.hireitpeople.com/resume-database/70-oracle-developers-resumes/page/2
[+] Scraping https://www.hireitpeople.com/resume-database/70-oracle-developers-resumes/page/3
[+] Scraping https://www.hireitpeople.com/resume-database/70-oracle-developers-resumes/page/4
[+] Scraping https://www.hireitpeople.com/resume-database/70-oracle-developers-resumes/page/5
[+] Scraping https://www.hireitpeople.com/resume-database/70-oracle-developers-resumes/page/6
[+] Scraping https://www.hireitpeople.com/resume-database/70-oracle-developers-resumes/page/7
[+] Scraping https://www.hireitpeople.com/resume-database/70-oracle-developers-resumes/page/8
[+] Scraping https://www.hireitpeople.com/resume-database/70-oracle-developers-resumes/page/9
[+] Scraping https://www.hireitpeople.com/resume-database/70-oracle-developers-resumes/page/10
[+] Scraping https://www.hireitpeople.com/resume-database/7

In [17]:
all_resume_links

['https://www.hireitpeople.com/resume-database/70-oracle-developers-resumes/628272-teradata-dba-resume-hoffman-estates-il-2',
 'https://www.hireitpeople.com/resume-database/70-oracle-developers-resumes/628245-senior-oracle-developer-resume-nj-2',
 'https://www.hireitpeople.com/resume-database/70-oracle-developers-resumes/628214-oracle-developer-production-support-resume-st-louis-mo',
 'https://www.hireitpeople.com/resume-database/70-oracle-developers-resumes/628085-oracle-soa-bpel-consultant-resume-wilmington-delaware-2',
 'https://www.hireitpeople.com/resume-database/70-oracle-developers-resumes/628083-oracle-soa-bpel-consultant-resume-wilmington-delaware',
 'https://www.hireitpeople.com/resume-database/70-oracle-developers-resumes/628034-oracle-soa-consultant-resume-camarillo-ca-4',
 'https://www.hireitpeople.com/resume-database/70-oracle-developers-resumes/628032-oracle-soa-consultant-resume-camarillo-ca-2',
 'https://www.hireitpeople.com/resume-database/70-oracle-developers-resumes

In [18]:
import requests
from bs4 import BeautifulSoup, NavigableString, Tag

def fetch_and_print_dom(url, max_depth=3):
    
    """
    Fetches HTML content from a URL and prints a tree representation of the DOM.

    Args:
        url (str): The URL to fetch.
        max_depth (int): Maximum depth of the tree to print.
    """
    # Step 1: Fetch HTML content
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Failed to fetch URL: {response.status_code}")
        return

    html = response.text

    # Step 2: Parse HTML into DOM
    soup = BeautifulSoup(html, "html.parser")

    # Step 3: Recursive function to print tree
    def print_tree(node, indent=0, depth=0):
        if depth > max_depth:
            return
        prefix = " " * (indent * 2)
        if isinstance(node, Tag):
            print(f"{prefix}{node.name}")
            for child in node.children:
                print_tree(child, indent + 1, depth + 1)
        elif isinstance(node, NavigableString):
            text = node.strip()
            if text:
                print(f"{prefix}#text: {text[:30]}")  # Show first 30 chars of text

    # Step 4: Print the tree starting from <html>
    html_tag = soup.find("html")
    if html_tag:
        print_tree(html_tag)
    else:
        print_tree(soup)  # fallback if <html> is missing

# Example usage
url = all_resume_links[0]
fetch_and_print_dom(url)

html
  head
    base
    meta
    title
      #text: Teradata DBA Resume Hoffman Es
    meta
      meta
      meta
      meta
      meta
  body
    script
      #text: (function (i, s, o, g, r, a, m
    #text: Google tag (gtag.js)
    script
    script
      #text: window.dataLayer = window.data
    div
    script
      #text: (function (d, s, id) {
       
    div
      #text: <div class="page-loader page-l
      header
      section
      section
      #text: Modal
      #text: <div class="modal fade resume-
      #text: hubspot form
      #text: <div class="hubspot-resume-for
      div
      div
      div
      div
      div
      #text: Scripts
      noscript
      script
      script
      script
      script
      script
      script
      #text: Developed By Revo Interactive
  script
    #text: _linkedin_partner_id = "479588
  script
    #text: (function(l) { if (!l){window.
  noscript
    img


In [19]:

def extract_post_body(url, target_class="single-post-body"):
    """
    Extracts the textual content from the main post body div.

    Args:
        url (str): Web page URL to scrape.
        target_class (str): Class name of the main content container.

    Returns:
        str: Clean text from the post body.
    """
    # Fetch HTML
    response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # Find the main content container
    container = soup.find("div", class_=target_class)
    if not container:
        raise ValueError(f"No <div> found with class '{target_class}'")

    # Extract all paragraphs and optional elements
    paragraphs = container.find_all(["p", "h1", "h2", "h3", "li"])
    
    # Clean and join text
    text_blocks = []
    for p in paragraphs:
        txt = p.get_text(separator=" ", strip=True)
        if txt:
            text_blocks.append(txt)

    # Join paragraphs into one big text
    full_text = "\n\n".join(text_blocks)
    return full_text


# Example usage
url = all_resume_links[0]
text = extract_post_body(url)
print(text[:1000])  # Preview first 1000 chars
with open("resume.txt", "w") as f:
    f.write(text)

OBJECTIVE Position where Database Designing and DBA skills and experience in database designing, administration and Management will add value in today’s challenging business and Information Technology. SUMMARY 8+ years of experience in IT and Teradata administration, designing database solutions with Architecture. 5+ Years of exclusive Production support and deploy of Teradata Patches, fixtures and DBQL settings. Experienced in archiving, restoring and recovering data on Teradata using ARC utility and TARA GUI. Excellent knowledge in moving database objects between Teradata systems and Teradata Database. Very well trained and experienced in scheduling backups and recovery of the entire EDW databases across various geographical locations for the business continuity and response time. Track record in Archiving and Recovering Down Amps Highly experienced in designing and testing Node fail over tests Highly knowledgeable in designing table DDL mechanisms for Automatic Data Protection Skill

In [20]:
def extract_post_body(url, target_class="single-post-body"):
    """
    Extracts clean textual content from the main post body div, avoiding duplicates.
    """
    response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    container = soup.find("div", class_=target_class)
    if not container:
        raise ValueError(f"No <div> found with class '{target_class}'")

    # Extract only direct <p> tags and text nodes, not nested twice
    text_blocks = []
    for child in container.descendants:
        # Only consider direct visible text from paragraph-like tags
        if child.name in ["p", "h1", "h2", "h3", "li"]:
            txt = child.get_text(separator=" ", strip=True)
            if txt and txt not in text_blocks:  # avoid duplicate lines
                text_blocks.append(txt)

    # Join with line breaks
    full_text = "\n\n".join(text_blocks)
    return full_text


# Example usage
text = extract_post_body(url)
print(text)
with open("resume.txt", "w") as f:
    f.write(text)

OBJECTIVE Position where Database Designing and DBA skills and experience in database designing, administration and Management will add value in today’s challenging business and Information Technology. SUMMARY 8+ years of experience in IT and Teradata administration, designing database solutions with Architecture. 5+ Years of exclusive Production support and deploy of Teradata Patches, fixtures and DBQL settings. Experienced in archiving, restoring and recovering data on Teradata using ARC utility and TARA GUI. Excellent knowledge in moving database objects between Teradata systems and Teradata Database. Very well trained and experienced in scheduling backups and recovery of the entire EDW databases across various geographical locations for the business continuity and response time. Track record in Archiving and Recovering Down Amps Highly experienced in designing and testing Node fail over tests Highly knowledgeable in designing table DDL mechanisms for Automatic Data Protection Skill

In [11]:
def extract_post_body(url, target_class="single-post-body", min_word_threshold=200):
    """
    Extracts clean textual content from the main post body div,
    skipping oversized unstructured text blocks.
    """
    response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    container = soup.find("div", class_=target_class)
    if not container:
        raise ValueError(f"No <div> found with class '{target_class}'")

    text_blocks = []
    skipped_blocks = []

    for child in container.descendants:
        if child.name in ["p", "h1", "h2", "h3", "li"]:
            txt = child.get_text(separator=" ", strip=True)
            if not txt:
                continue

            word_count = len(txt.split())
            has_line_breaks = "\n" in txt

            # Skip overly long single-line text blobs
            if word_count > min_word_threshold and not has_line_breaks:
                skipped_blocks.append(txt[:120] + "...")
                continue

            # Avoid duplicates
            if txt not in text_blocks:
                text_blocks.append(txt)

    full_text = "\n\n".join(text_blocks)

    print(f"✅ Extracted {len(text_blocks)} valid blocks, 🧹 skipped {len(skipped_blocks)} noisy ones.")
    return full_text, skipped_blocks


# Example usage
text, skipped = extract_post_body(url)

print("\n--- CLEANED TEXT PREVIEW ---\n")
print(text[:600])

print("\n--- SKIPPED BLOCKS (for debugging) ---\n")
for s in skipped:
    print("-", s[:200])

with open("resume.txt", "w") as f:
    f.write(text)

✅ Extracted 141 valid blocks, 🧹 skipped 1 noisy ones.

--- CLEANED TEXT PREVIEW ---

OBJECTIVE

Position where Database Designing and DBA skills and experience in database designing, administration and Management will add value in today’s challenging business and Information Technology.

SUMMARY

8+ years of experience in IT and Teradata administration, designing database solutions with Architecture.

5+ Years of exclusive Production support and deploy of Teradata Patches, fixtures and DBQL settings.

Experienced in archiving, restoring and recovering data on Teradata using ARC utility and TARA GUI.

Excellent knowledge in moving database objects between Teradata systems and T

--- SKIPPED BLOCKS (for debugging) ---

- OBJECTIVE Position where Database Designing and DBA skills and experience in database designing, administration and Mana...


In [21]:
import re
import requests
import json
from bs4 import BeautifulSoup

def normalize_breaks(soup):
    """
    Convert <br> tags to newline text nodes so .get_text() uses them.
    """
    for br in soup.find_all("br"):
        br.replace_with("\n")

def clean_whitespace(text):
    # Collapse multiple spaces but keep line breaks
    # remove leading/trailing whitespace on each line
    lines = [ln.strip() for ln in text.splitlines()]
    # drop empty lines at ends
    while lines and lines[0] == "":
        lines.pop(0)
    while lines and lines[-1] == "":
        lines.pop()
    return "\n".join([re.sub(r'\s+', ' ', ln) for ln in lines])

def extract_post_body_safe(url,
                           target_class=None,
                           class_regex=None,
                           allow_fallback=True,
                           debug=False):
    """
    Robustly extract text from the main post container.

    Args:
        url (str)
        target_class (str): exact class name to search for (preferred)
        class_regex (str): regex to match classes like "post-content|entry-content"
        allow_fallback (bool): if True, try to auto-detect the largest text container
        debug (bool): prints debugging info

    Returns:
        dict: {
            "full_text": str,           # joined paragraph text (with double-newlines between p tags)
            "p_texts": [str,...],       # list of p-level texts captured
            "container_text": str,      # container.get_text() (cleaned)
            "missing_excerpt": str,     # excerpt of text present in container_text but not in concatenated p_texts (if any)
            "warnings": [str,...]       # list of warnings (e.g., JS-rendered suspicion)
        }
    """
    headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko)"}
    resp = requests.get(url, headers=headers, timeout=20)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    # Normalize <br> tags
    normalize_breaks(soup)

    container = None
    if target_class:
        container = soup.find("div", class_=target_class)
    if not container and class_regex:
        container = soup.find("div", class_=re.compile(class_regex))
    if not container and allow_fallback:
        # Fallback: choose the div with largest text content
        divs = soup.find_all("div")
        if divs:
            container = max(divs, key=lambda d: len(d.get_text(strip=True)))
    if not container:
        raise ValueError("Could not find a suitable container. Try specifying target_class or class_regex.")

    # Capture the raw container text and normalize whitespace
    raw_container_text = container.get_text(separator="\n", strip=True)
    container_text = clean_whitespace(raw_container_text)

    # Find all paragraph-like tags inside container
    paragraph_tags = container.find_all(["p", "article", "div"], recursive=True)
    # We use a filter: we keep those tags which have some textual content and are likely paragraphs.
    p_texts = []
    for idx, tag in enumerate(paragraph_tags):
        # Skip tags that are empty or just whitespace/newlines
        txt = " ".join(tag.stripped_strings)  # joins text pieces including nested spans
        txt = txt.strip()
        if not txt:
            continue

        # Heuristic: avoid grabbing container-level big dumps repeated (optional)
        # (But by default we keep everything; you can add thresholds if needed)
        p_texts.append(clean_whitespace(txt))

    # Deduplicate while preserving order
    seen = set()
    unique_p_texts = []
    all_p_texts = []
    for t in p_texts:
        if t not in seen:
            unique_p_texts.append(t)
            seen.add(t)
        all_p_texts.append(t)
    # Join paragraphs with double newlines
    joined_p = "\n\n".join(unique_p_texts)

    # Compare counts to detect missing parts
    warnings = []
    # If container_text is substantially longer than joined_p, maybe some text is outside p-tags or JS-rendered
    container_words = len(container_text.split())
    joined_words = len(re.sub(r'\s+', ' ', joined_p).split()) if joined_p else 0

    missing_excerpt = ""
    if container_words > joined_words + 20:  # allow small differences
        # compute missing by naive subtraction — find container text not in joined_p
        if joined_p:
            # remove matched paragraph fragments from container_text
            temp = container_text
            for piece in unique_p_texts:
                temp = temp.replace(piece, "")
            missing_excerpt = temp.strip()[:800]  # give first 800 chars of leftover
        else:
            missing_excerpt = container_text[:800]

        if missing_excerpt:
            warnings.append("Container has additional text not captured by paragraph-like tags. See missing_excerpt.")
        else:
            warnings.append("Detected difference in counts but couldn't compute missing excerpt.")

    # Detect likely JS-rendered content (very common cause)
    # If the original HTML contains obvious placeholders or empty content but the container_text is short, warn.
    if "<script" in resp.text.lower() and (container_words == 0 or joined_words == 0):
        warnings.append("Page might be JS-rendered. Try using Playwright or Selenium to render JS before extraction.")

    if debug:
        print("===== DEBUG INFO =====")
        print("URL:", url)
        print("Container chosen tag/classes:", container.name, container.get("class"))
        print("Container words:", container_words)
        print("Paragraph-like nodes found (count):", len(paragraph_tags))
        print("Captured paragraph count:", len(unique_p_texts))
        print("Joined paragraph words:", joined_words)
        print("Warnings:", warnings)
        if missing_excerpt:
            print("Missing excerpt (first 400 chars):")
            print(missing_excerpt[:400])
        print("======================")

    return {
        "full_text": joined_p,
        "p_texts": unique_p_texts,
        "all_p_texts": all_p_texts,
        "container_text": container_text,
        "missing_excerpt": missing_excerpt,
        "warnings": warnings
    }



result = extract_post_body_safe(
url,
class_regex=r"(single-post-body|post-content|entry-content|article-body)",
debug=True
)

print("=== Full text preview ===")
print(result["full_text"][:1200])

if result["missing_excerpt"]:
    print("\n=== Missing excerpt (possible lost text) ===")
    print(result["missing_excerpt"])

if result["warnings"]:
    print("\n=== Warnings ===")
    for w in result["warnings"]:
        print("-", w)

with open("resume.json", "w") as f:
    f.write(json.dumps(result, indent=4))

===== DEBUG INFO =====
URL: https://www.hireitpeople.com/resume-database/70-oracle-developers-resumes/628272-teradata-dba-resume-hoffman-estates-il-2
Container chosen tag/classes: div ['single-post-body']
Container words: 2475
Paragraph-like nodes found (count): 24
Captured paragraph count: 20
Joined paragraph words: 2606
=== Full text preview ===
OBJECTIVE Position where Database Designing and DBA skills and experience in database designing, administration and Management will add value in today’s challenging business and Information Technology. SUMMARY 8+ years of experience in IT and Teradata administration, designing database solutions with Architecture. 5+ Years of exclusive Production support and deploy of Teradata Patches, fixtures and DBQL settings. Experienced in archiving, restoring and recovering data on Teradata using ARC utility and TARA GUI. Excellent knowledge in moving database objects between Teradata systems and Teradata Database. Very well trained and experienced in s

In [69]:
def normalize_breaks(soup):
    """Convert <br> tags to newline text nodes so .get_text() uses them."""
    for br in soup.find_all("br"):
        br.replace_with("\n")

def clean_whitespace(text):
    lines = [ln.strip() for ln in text.splitlines()]
    while lines and lines[0] == "":
        lines.pop(0)
    while lines and lines[-1] == "":
        lines.pop()
    return "\n".join([re.sub(r'\s+', ' ', ln) for ln in lines])

def extract_post_body_safe(url,
                           target_class=None,
                           class_regex=None,
                           allow_fallback=True,
                           debug=False):
    """
    Extracts text and structure (<p>, <ul>, <li>) from the main post container.
    """
    headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64)"}
    resp = requests.get(url, headers=headers, timeout=20)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    normalize_breaks(soup)

    container = None
    if target_class:
        container = soup.find("div", class_=target_class)
    if not container and class_regex:
        container = soup.find("div", class_=re.compile(class_regex))
    if not container and allow_fallback:
        divs = soup.find_all("div")
        if divs:
            container = max(divs, key=lambda d: len(d.get_text(strip=True)))
    if not container:
        raise ValueError("Could not find a suitable container.")

    raw_container_text = container.get_text(separator="\n", strip=True)
    container_text = clean_whitespace(raw_container_text)

    # Preserve structure: paragraphs, unordered lists and list items
    structured_content = []
    for element in container.find_all(["p", "ul"], recursive=True):
        if element.name == "p":
            text = clean_whitespace(" ".join(element.stripped_strings))
            if text:
                structured_content.append({
                    "type": "p",
                    "text": text
                })
        elif element.name == "ul":
            items = []
            for li in element.find_all("li", recursive=False):
                li_text = clean_whitespace(" ".join(li.stripped_strings))
                if li_text:
                    items.append(li_text)
            if items:
                structured_content.append({
                    "type": "ul",
                    "items": items
                })

    # Join paragraphs for convenience
    joined_p = "\n\n".join(
        [block["text"] for block in structured_content if block["type"] == "p"]
    )

    container_words = len(container_text.split())
    joined_words = len(joined_p.split()) if joined_p else 0

    warnings = []
    missing_excerpt = ""
    if container_words > joined_words + 20:
        temp = container_text
        for block in structured_content:
            if block["type"] == "p":
                temp = temp.replace(block["text"], "")
            elif block["type"] == "ul":
                for item in block["items"]:
                    temp = temp.replace(item, "")
        missing_excerpt = temp.strip()[:800]
        if missing_excerpt:
            warnings.append("Container has additional text not captured by structured tags.")
    if "<script" in resp.text.lower() and (container_words == 0 or joined_words == 0):
        warnings.append("Page might be JS-rendered. Try using Playwright or Selenium.")

    if debug:
        print("===== DEBUG INFO =====")
        print("Container:", container.get("class"))
        print("Captured paragraphs:", sum(1 for b in structured_content if b["type"] == "p"))
        print("Captured lists:", sum(1 for b in structured_content if b["type"] == "ul"))
        print("Warnings:", warnings)
        print("======================")

    return {
        "url":url,
        "structured_content": structured_content,
        "full_text": joined_p,
        "container_text": container_text,
        "missing_excerpt": missing_excerpt,
        "warnings": warnings
    }

# Example usage
result = extract_post_body_safe(
    all_resume_links[79],
    class_regex=r"(single-post-body|post-content|entry-content|article-body)",
    debug=True
)

# Print preview
for block in result["structured_content"][:5]:
    if block["type"] == "p":
        print(f"PARA: {block['text'][:100]}")
    elif block["type"] == "ul":
        print(f"LIST: {block['items']}")

# Save to JSON
with open("post_content_79.json", "w") as f:
    f.write(json.dumps(result, indent=4, ensure_ascii=False))

===== DEBUG INFO =====
Container: ['single-post-body']
Captured paragraphs: 33
Captured lists: 6
PARA: SUMMARY Over 7 years of experience in the field of Information Technology, this includes pioneering 
PARA: SUMMARY
LIST: ['Over 7 years of experience in the field of Information Technology, this includes pioneering work in business Analysis, requirements, development, testing, implementing and support of Hyperion system applications and Client/Server projects using industry accepted methodologies and procedures.', 'Extensive experience in the development, design, analysis and Implementation of various applications using Hyperion System 9/11, Hyperion Essbase 11.x/9.X, Hyperion Planning 11.x/9.x', 'Strong experience in the design, development, and administration of Hyperion applications.', 'Experience in performing upgrade, migration. Production support and database backups, Performance tuning and Optimization of Hyperion applications.', 'Actively involved in interacting with the users

In [23]:
import json

### Important

In [None]:

class Experience(BaseModel):
    job_role: str
    responsibilities: List[str]
    environment: Optional[str] = None

class Resume(BaseModel):
    professional_summary: List[str]
    technical_skills: List[str]
    experiences: List[Experience]

    
def normalize_breaks(soup):
    """Convert <br> tags to newline text nodes so .get_text() uses them."""
    for br in soup.find_all("br"):
        br.replace_with("\n")

def clean_whitespace(text):
    lines = [ln.strip() for ln in text.splitlines()]
    while lines and lines[0] == "":
        lines.pop(0)
    while lines and lines[-1] == "":
        lines.pop()
    return "\n".join([re.sub(r'\s+', ' ', ln) for ln in lines])

def extract_post_body_safe(url,
                           target_class=None,
                           class_regex=None,
                           allow_fallback=True,
                           debug=False,
                           min_word_threshold=120):
    """
    Extracts structured content (<p>, <ul>, <li>, .media-body job roles)
    from the main post container, skipping oversized text blocks.
    """
    headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64)"}
    resp = requests.get(url, headers=headers, timeout=20)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    normalize_breaks(soup)

    # Identify container
    container = None
    if target_class:
        container = soup.find("div", class_=target_class)
    if not container and class_regex:
        container = soup.find("div", class_=re.compile(class_regex))
    if not container and allow_fallback:
        divs = soup.find_all("div")
        if divs:
            container = max(divs, key=lambda d: len(d.get_text(strip=True)))
    if not container:
        raise ValueError("Could not find a suitable container.")

    raw_container_text = container.get_text(separator="\n", strip=True)
    container_text = clean_whitespace(raw_container_text)

    structured_content = []
    skipped_blocks = []  # 🔹 Track skipped text blocks

    # 1️⃣ Handle .media-body blocks
    for media in container.find_all("div", class_=re.compile(r"media-body")):
        role_data = {}
        job_title_tag = media.find("h3")
        if job_title_tag:
            role_data["job_role"] = clean_whitespace(job_title_tag.get_text())

        sub_blocks = []
        for child in media.find_all(["p", "ul"], recursive=False):
            if child.name == "p":
                text = clean_whitespace(" ".join(child.stripped_strings))
                # 🚫 Skip overly long text blocks
                if len(text.split()) > min_word_threshold:
                    skipped_blocks.append(text[:120] + "...")
                    continue
                if text:
                    sub_blocks.append({"type": "p", "text": text})
            elif child.name == "ul":
                items = []
                for li in child.find_all("li", recursive=False):
                    li_text = clean_whitespace(" ".join(li.stripped_strings))
                    # 🚫 Skip overly long list items
                    if len(li_text.split()) > min_word_threshold:
                        skipped_blocks.append(li_text[:120] + "...")
                        continue
                    if li_text:
                        items.append(li_text)
                if items:
                    sub_blocks.append({"type": "ul", "items": items})
        if sub_blocks or "job_role" in role_data:
            role_data["content"] = sub_blocks
            structured_content.append(role_data)

    # 2️⃣ Handle normal paragraphs and lists not under media-body
    for element in container.find_all(["p", "ul"], recursive=True):
        if element.find_parent("div", class_=re.compile(r"media-body")):
            continue  # already captured above

        if element.name == "p":
            text = clean_whitespace(" ".join(element.stripped_strings))
            # 🚫 Skip overly long text blocks
            if len(text.split()) > min_word_threshold:
                skipped_blocks.append(text[:120] + "...")
                continue
            if text:
                structured_content.append({"type": "p", "text": text})

        elif element.name == "ul":
            items = []
            for li in element.find_all("li", recursive=False):
                li_text = clean_whitespace(" ".join(li.stripped_strings))
                # 🚫 Skip overly long list items
                if len(li_text.split()) > min_word_threshold:
                    skipped_blocks.append(li_text[:120] + "...")
                    continue
                if li_text:
                    items.append(li_text)
            if items:
                structured_content.append({"type": "ul", "items": items})

    # Join all paragraph text for convenience
    joined_p = "\n\n".join(
        [b["text"] for b in structured_content if b.get("type") == "p"]
    )

    container_words = len(container_text.split())
    joined_words = len(joined_p.split()) if joined_p else 0

    warnings = []
    missing_excerpt = ""
    if container_words > joined_words + 20:
        temp = container_text
        for block in structured_content:
            if block.get("type") == "p":
                temp = temp.replace(block["text"], "")
            elif block.get("type") == "ul":
                for item in block["items"]:
                    temp = temp.replace(item, "")
            elif "job_role" in block:
                temp = temp.replace(block["job_role"], "")
                for sub in block.get("content", []):
                    if sub.get("type") == "p":
                        temp = temp.replace(sub["text"], "")
                    elif sub.get("type") == "ul":
                        for item in sub["items"]:
                            temp = temp.replace(item, "")
        missing_excerpt = temp.strip()[:800]
        if missing_excerpt:
            warnings.append("Container has additional text not captured by structured tags.")

    if "<script" in resp.text.lower() and (container_words == 0 or joined_words == 0):
        warnings.append("Page might be JS-rendered.")

    if debug:
        print("===== DEBUG INFO =====")
        print("Container classes:", container.get("class"))
        print("Job roles found:", sum(1 for b in structured_content if "job_role" in b))
        print("Paragraphs:", sum(1 for b in structured_content if b.get("type") == "p"))
        print("Lists:", sum(1 for b in structured_content if b.get("type") == "ul"))
        print("Skipped blocks:", len(skipped_blocks))
        print("Warnings:", warnings)
        print("======================")

    return {
        "structured_content": structured_content,
        "full_text": joined_p,
        "container_text": container_text,
        "missing_excerpt": missing_excerpt,
        "skipped_blocks": skipped_blocks,
        "warnings": warnings
    }
url = "https://www.hireitpeople.com/resume-database/77-oracle-resumes/113390-sr-oracle-developer-data-architect-resume-ny"
# Example usage
result = extract_post_body_safe(
    url,
    class_regex=r"(single-post-body|post-content|entry-content|article-body)",
    debug=True
)

# --- Preview ---
for block in result["structured_content"]:
    if "job_role" in block:
        print(f"\nJOB ROLE: {block['job_role']}")
        for sub in block["content"]:
            if sub["type"] == "p":
                print("  DESC:", sub["text"][:80])
            elif sub["type"] == "ul":
                print("  BULLETS:", sub["items"])
    elif block.get("type") == "p":
        print("PARA:", block["text"][:100])
    elif block.get("type") == "ul":
        print("LIST:", block["items"])

with open("post_content_79_2.json", "w") as f:
    f.write(json.dumps(result, indent=4, ensure_ascii=False))

NameError: name 'BaseModel' is not defined

In [65]:
import requests
from bs4 import BeautifulSoup
import re
import json

def normalize_breaks(soup):
    """Convert <br> tags to newline text nodes so .get_text() uses them."""
    for br in soup.find_all("br"):
        br.replace_with("\n")

def clean_whitespace(text):
    lines = [ln.strip() for ln in text.splitlines()]
    while lines and lines[0] == "":
        lines.pop(0)
    while lines and lines[-1] == "":
        lines.pop()
    return "\n".join([re.sub(r'\s+', ' ', ln) for ln in lines])

def extract_post_body_safe(url,
                           target_class=None,
                           class_regex=None,
                           allow_fallback=True,
                           debug=False,
                           min_word_threshold=120):
    """
    Extracts structured content (<p>, <ul>, <li>, .media-body job roles)
    from the main post container and media-body elements from entire page.
    """
    headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64)"}
    resp = requests.get(url, headers=headers, timeout=20)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    normalize_breaks(soup)

    # Identify main content container
    container = None
    if target_class:
        container = soup.find("div", class_=target_class)
    if not container and class_regex:
        container = soup.find("div", class_=re.compile(class_regex))
    if not container and allow_fallback:
        divs = soup.find_all("div")
        if divs:
            container = max(divs, key=lambda d: len(d.get_text(strip=True)))
    if not container:
        raise ValueError("Could not find a suitable container.")

    raw_container_text = container.get_text(separator="\n", strip=True)
    container_text = clean_whitespace(raw_container_text)

    structured_content = []
    skipped_blocks = []
    processed_elements = set()

    # 🔥 KEY CHANGE: Search for media-body in entire soup, not just container
    for media in soup.find_all("div", class_=re.compile(r"media-body")):
        role_data = {}
        
        # Look for job title - check h3 first, then other headings
        job_title_tag = media.find("h3")
        if not job_title_tag:
            job_title_tag = media.find(["h1", "h2", "h4", "h5", "h6"])
        
        if job_title_tag:
            role_data["job_role"] = clean_whitespace(job_title_tag.get_text())
            processed_elements.add(job_title_tag)

        sub_blocks = []
        # Process all paragraphs and lists within this media-body
        for element in media.find_all(["p", "ul"]):
            if element in processed_elements:
                continue
                
            if element.name == "p":
                text = clean_whitespace(" ".join(element.stripped_strings))
                # Skip overly long text blocks
                if len(text.split()) > min_word_threshold:
                    skipped_blocks.append(text[:120] + "...")
                    continue
                if text:
                    sub_blocks.append({"type": "p", "text": text})
                    processed_elements.add(element)
            elif element.name == "ul":
                items = []
                for li in element.find_all("li", recursive=False):
                    li_text = clean_whitespace(" ".join(li.stripped_strings))
                    # Skip overly long list items
                    if len(li_text.split()) > min_word_threshold:
                        skipped_blocks.append(li_text[:120] + "...")
                        continue
                    if li_text:
                        items.append(li_text)
                if items:
                    sub_blocks.append({"type": "ul", "items": items})
                    processed_elements.add(element)

        # Only add if we have either a job role or content
        if sub_blocks or "job_role" in role_data:
            role_data["content"] = sub_blocks
            structured_content.append(role_data)
            # Mark all elements in this media-body as processed
            for element in media.find_all():
                processed_elements.add(element)

    # Now process content from the main container, skipping already processed elements
    for element in container.find_all(["p", "ul"]):
        if element in processed_elements:
            continue
            
        # Skip if this element is inside any media-body (already processed)
        if element.find_parent("div", class_=re.compile(r"media-body")):
            continue

        if element.name == "p":
            text = clean_whitespace(" ".join(element.stripped_strings))
            # Skip overly long text blocks
            if len(text.split()) > min_word_threshold:
                skipped_blocks.append(text[:120] + "...")
                continue
            if text:
                structured_content.append({"type": "p", "text": text})
                processed_elements.add(element)

        elif element.name == "ul":
            items = []
            for li in element.find_all("li", recursive=False):
                li_text = clean_whitespace(" ".join(li.stripped_strings))
                # Skip overly long list items
                if len(li_text.split()) > min_word_threshold:
                    skipped_blocks.append(li_text[:120] + "...")
                    continue
                if li_text:
                    items.append(li_text)
            if items:
                structured_content.append({"type": "ul", "items": items})
                processed_elements.add(element)

    # Join all paragraph text for convenience
    joined_p = "\n\n".join(
        [b["text"] for b in structured_content if b.get("type") == "p"]
    )

    container_words = len(container_text.split())
    joined_words = len(joined_p.split()) if joined_p else 0

    warnings = []
    missing_excerpt = ""
    if container_words > joined_words + 20:
        temp = container_text
        for block in structured_content:
            if block.get("type") == "p":
                temp = temp.replace(block["text"], "")
            elif block.get("type") == "ul":
                for item in block["items"]:
                    temp = temp.replace(item, "")
            elif "job_role" in block:
                temp = temp.replace(block["job_role"], "")
                for sub in block.get("content", []):
                    if sub.get("type") == "p":
                        temp = temp.replace(sub["text"], "")
                    elif sub.get("type") == "ul":
                        for item in sub["items"]:
                            temp = temp.replace(item, "")
        missing_excerpt = temp.strip()[:800]
        if missing_excerpt:
            warnings.append("Container has additional text not captured by structured tags.")

    if "<script" in resp.text.lower() and (container_words == 0 or joined_words == 0):
        warnings.append("Page might be JS-rendered.")

    if debug:
        print("===== DEBUG INFO =====")
        print("Container classes:", container.get("class"))
        print("Total media-body elements found:", len(soup.find_all("div", class_=re.compile(r"media-body"))))
        print("Job roles found:", sum(1 for b in structured_content if "job_role" in b))
        print("Paragraphs:", sum(1 for b in structured_content if b.get("type") == "p"))
        print("Lists:", sum(1 for b in structured_content if b.get("type") == "ul"))
        print("Skipped blocks:", len(skipped_blocks))
        print("Warnings:", warnings)
        
        # Debug media-body details
        media_bodies = soup.find_all("div", class_=re.compile(r"media-body"))
        for i, media in enumerate(media_bodies):
            h3 = media.find("h3")
            other_heading = media.find(["h1", "h2", "h4", "h5", "h6"])
            print(f"Media-body {i+1}:")
            print(f"  h3 found: {h3 is not None}")
            if h3:
                print(f"  h3 text: '{h3.get_text(strip=True)}'")
            elif other_heading:
                print(f"  Other heading found: '{other_heading.get_text(strip=True)}'")
            else:
                print(f"  No heading found")
        print("======================")

    return {
        "structured_content": structured_content,
        "full_text": joined_p,
        "container_text": container_text,
        "missing_excerpt": missing_excerpt,
        "skipped_blocks": skipped_blocks,
        "warnings": warnings
    }

# Example usage
url = "https://www.hireitpeople.com/resume-database/77-oracle-resumes/142742-oracle-pl-sql-developer-resume-portland-oregon"

result = extract_post_body_safe(
    url,
    class_regex=r"(single-post-body|post-content|entry-content|article-body)",
    debug=True
)

# --- Preview ---
print("\n" + "="*50)
print("EXTRACTED CONTENT PREVIEW")
print("="*50)
for i, block in enumerate(result["structured_content"]):
    if "job_role" in block:
        print(f"\n[{i}] JOB ROLE: {block['job_role']}")
        for sub in block.get("content", []):
            if sub["type"] == "p":
                print(f"    DESC: {sub['text'][:80]}{'...' if len(sub['text']) > 80 else ''}")
            elif sub["type"] == "ul":
                print(f"    BULLETS: {len(sub['items'])} items")
                for j, item in enumerate(sub['items'][:2]):  # Show first 2 bullets
                    print(f"      - {item[:60]}{'...' if len(item) > 60 else ''}")
    elif block.get("type") == "p":
        print(f"\n[{i}] PARA: {block['text'][:100]}{'...' if len(block['text']) > 100 else ''}")
    elif block.get("type") == "ul":
        print(f"\n[{i}] LIST: {len(block['items'])} items")
        for j, item in enumerate(block['items'][:2]):  # Show first 2 items
            print(f"      - {item[:60]}{'...' if len(item) > 60 else ''}")

# Save to file
with open("post_content_fixed.json", "w") as f:
    json.dumps(result, indent=4, ensure_ascii=False)

print(f"\nSaved results to post_content_fixed.json")
print(f"Total structured blocks extracted: {len(result['structured_content'])}")

===== DEBUG INFO =====
Container classes: ['single-post-body']
Total media-body elements found: 1
Job roles found: 1
Paragraphs: 27
Lists: 6
Skipped blocks: 1
Media-body 1:
  h3 found: True
  h3 text: 'Oracle Pl/sql Developer Resume'

EXTRACTED CONTENT PREVIEW

[0] JOB ROLE: Oracle Pl/sql Developer Resume

[1] PARA: SUMMARY:

[2] LIST: 20 items
      - 8+ years of profound working experience in SQL, PL/SQL progr...
      - Expertise in writing SQL queries, Views, Materialized views,...

[3] PARA: TECHNICAL SKILLS:

[4] PARA: Operating System: Windows 7/10, Linux, UNIX (Sun Solaris, HP-UX).

[5] PARA: Oracle 12c/11g/10g/9i, MS: SQL Server.

[6] PARA: Web Technologies: HTML, DHTML, Java Script, XML.

[7] PARA: Languages: C, C++, SQL, PL/ SQL.

[8] PARA: Scripting Languages: UNIX Shell Scripting

[9] PARA: Database Tools: SQL*PLUS, TOAD, SQL*Tool, PL/SQL Developer

[10] PARA: MS: Office (Access, Word, Excel, PowerPoint)

[11] PARA: Version Control: CVS, SVN, GITHUB

[12] PARA: Agile Tools

In [66]:
def extract_post_body_safe(url,
                           target_class=None,
                           class_regex=None,
                           allow_fallback=True,
                           debug=False,
                           min_word_threshold=120):
    """
    Extracts structured content (<p>, <ul>, <li>, .media-body job roles)
    from the main post container, skipping oversized text blocks.
    """
    headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64)"}
    resp = requests.get(url, headers=headers, timeout=20)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    normalize_breaks(soup)

    # Identify container
    container = None
    if target_class:
        container = soup.find("div", class_=target_class)
    if not container and class_regex:
        container = soup.find("div", class_=re.compile(class_regex))
    if not container and allow_fallback:
        divs = soup.find_all("div")
        if divs:
            container = max(divs, key=lambda d: len(d.get_text(strip=True)))
    if not container:
        raise ValueError("Could not find a suitable container.")

    raw_container_text = container.get_text(separator="\n", strip=True)
    container_text = clean_whitespace(raw_container_text)

    structured_content = []
    skipped_blocks = []  # 🔹 Track skipped text blocks
    processed_elements = set()  # 🔹 Track already processed elements to avoid duplicates

    # 1️⃣ Handle .media-body blocks and their content
    for media in container.find_all("div", class_=re.compile(r"media-body")):
        role_data = {}
        
        # FIX: Look for h3 anywhere within media-body, not just direct children
        job_title_tag = media.find("h3")
        if not job_title_tag:
            # Also check for other potential job title tags
            job_title_tag = media.find(["h1", "h2", "h3", "h4"])
        
        if job_title_tag:
            role_data["job_role"] = clean_whitespace(job_title_tag.get_text())
            # Mark this element as processed
            processed_elements.add(job_title_tag)

        sub_blocks = []
        # Process direct children paragraphs and lists
        for child in media.find_all(["p", "ul"], recursive=False):
            processed_elements.add(child)
            
            if child.name == "p":
                text = clean_whitespace(" ".join(child.stripped_strings))
                # 🚫 Skip overly long text blocks
                if len(text.split()) > min_word_threshold:
                    skipped_blocks.append(text[:120] + "...")
                    continue
                if text:
                    sub_blocks.append({"type": "p", "text": text})
            elif child.name == "ul":
                items = []
                for li in child.find_all("li", recursive=False):
                    li_text = clean_whitespace(" ".join(li.stripped_strings))
                    # 🚫 Skip overly long list items
                    if len(li_text.split()) > min_word_threshold:
                        skipped_blocks.append(li_text[:120] + "...")
                        continue
                    if li_text:
                        items.append(li_text)
                if items:
                    sub_blocks.append({"type": "ul", "items": items})
        
        # Also process any paragraphs/lists that might be in nested divs but still part of this media-body
        for element in media.find_all(["p", "ul"]):
            if element in processed_elements:
                continue
                
            if element.name == "p":
                text = clean_whitespace(" ".join(element.stripped_strings))
                if len(text.split()) > min_word_threshold:
                    skipped_blocks.append(text[:120] + "...")
                    continue
                if text and element not in processed_elements:
                    sub_blocks.append({"type": "p", "text": text})
                    processed_elements.add(element)
            elif element.name == "ul":
                items = []
                for li in element.find_all("li", recursive=False):
                    li_text = clean_whitespace(" ".join(li.stripped_strings))
                    if len(li_text.split()) > min_word_threshold:
                        skipped_blocks.append(li_text[:120] + "...")
                        continue
                    if li_text:
                        items.append(li_text)
                if items and element not in processed_elements:
                    sub_blocks.append({"type": "ul", "items": items})
                    processed_elements.add(element)

        if sub_blocks or "job_role" in role_data:
            role_data["content"] = sub_blocks
            structured_content.append(role_data)

    # 2️⃣ Handle normal paragraphs and lists not under media-body and not already processed
    for element in container.find_all(["p", "ul"]):
        if element in processed_elements:
            continue
            
        # Skip if this element is inside any media-body (should already be processed)
        if element.find_parent("div", class_=re.compile(r"media-body")):
            continue

        if element.name == "p":
            text = clean_whitespace(" ".join(element.stripped_strings))
            # 🚫 Skip overly long text blocks
            if len(text.split()) > min_word_threshold:
                skipped_blocks.append(text[:120] + "...")
                continue
            if text:
                structured_content.append({"type": "p", "text": text})
                processed_elements.add(element)

        elif element.name == "ul":
            items = []
            for li in element.find_all("li", recursive=False):
                li_text = clean_whitespace(" ".join(li.stripped_strings))
                # 🚫 Skip overly long list items
                if len(li_text.split()) > min_word_threshold:
                    skipped_blocks.append(li_text[:120] + "...")
                    continue
                if li_text:
                    items.append(li_text)
            if items:
                structured_content.append({"type": "ul", "items": items})
                processed_elements.add(element)

    # Rest of your code remains the same...
    joined_p = "\n\n".join(
        [b["text"] for b in structured_content if b.get("type") == "p"]
    )

    container_words = len(container_text.split())
    joined_words = len(joined_p.split()) if joined_p else 0

    warnings = []
    missing_excerpt = ""
    if container_words > joined_words + 20:
        temp = container_text
        for block in structured_content:
            if block.get("type") == "p":
                temp = temp.replace(block["text"], "")
            elif block.get("type") == "ul":
                for item in block["items"]:
                    temp = temp.replace(item, "")
            elif "job_role" in block:
                temp = temp.replace(block["job_role"], "")
                for sub in block.get("content", []):
                    if sub.get("type") == "p":
                        temp = temp.replace(sub["text"], "")
                    elif sub.get("type") == "ul":
                        for item in sub["items"]:
                            temp = temp.replace(item, "")
        missing_excerpt = temp.strip()[:800]
        if missing_excerpt:
            warnings.append("Container has additional text not captured by structured tags.")

    if "<script" in resp.text.lower() and (container_words == 0 or joined_words == 0):
        warnings.append("Page might be JS-rendered.")

    if debug:
        print("===== DEBUG INFO =====")
        print("Container classes:", container.get("class"))
        print("Job roles found:", sum(1 for b in structured_content if "job_role" in b))
        print("Paragraphs:", sum(1 for b in structured_content if b.get("type") == "p"))
        print("Lists:", sum(1 for b in structured_content if b.get("type") == "ul"))
        print("Skipped blocks:", len(skipped_blocks))
        print("Warnings:", warnings)
        print("======================")

    return {
        "structured_content": structured_content,
        "full_text": joined_p,
        "container_text": container_text,
        "missing_excerpt": missing_excerpt,
        "skipped_blocks": skipped_blocks,
        "warnings": warnings
    }



url = "https://www.hireitpeople.com/resume-database/77-oracle-resumes/142742-oracle-pl-sql-developer-resume-portland-oregon"
# Example usage
result = extract_post_body_safe(
    url,
    class_regex=r"(single-post-body|post-content|entry-content|article-body)",
    debug=True
)

===== DEBUG INFO =====
Container classes: ['single-post-body']
Job roles found: 0
Paragraphs: 27
Lists: 6
Skipped blocks: 1


In [67]:
print(json.dumps(result, indent=4))

{
    "structured_content": [
        {
            "type": "p",
            "text": "SUMMARY:"
        },
        {
            "type": "ul",
            "items": [
                "8+ years of profound working experience in SQL, PL/SQL programming, System Analysis, Design, Development, and Support of Oracle Database \u2026 in Production, Staging, Development, and Testing Environments.",
                "Expertise in writing SQL queries, Views, Materialized views, PL/SQL procedures, functions, packages, triggers, cursors, collections, Ref cursor, cursor variables, System reference cursor , Dynamic SQL.",
                "Experience in developing External Tables, Joins, Indexes, and Sequences.",
                "Experience in query optimization, performance, and tuning (PL/SQL) using SQL Trace , Explain Plan, Indexing, Hints, Bulk Binds, Bulk Collect, Creation of global temporary tables and table partitioning.",
                "Experience in using several built - in database packages 

In [68]:
r = result['structured_content']
content = r.get("content", [])
content

AttributeError: 'list' object has no attribute 'get'

In [None]:
content = result.get("content", result.get("structured_content", []))

In [None]:
from typing import List, Optional
from pydantic import BaseModel
import json

class Experience(BaseModel):
    job_role: str
    responsibilities: List[str]
    environment: Optional[str] = None

class Resume(BaseModel):
    professional_summary: List[str]
    technical_skills: List[str]
    experiences: List[Experience]


def parse_resume_json(input_path: str, output_path: str) -> None:
    """
    Parse and merge a raw JSON resume (with text and tags)
    into a structured Resume model.
    """

    # Load JSON
    with open(input_path, "r") as f:
        data = json.load(f)

    # If JSON has outer key
    content = data.get("content", data.get("structured_content", []))

    # Buffers
    professional_summary, technical_skills = [], []
    experiences = []

    current_section = None
    current_exp = None

    for block in content:
        text = block.get("text", "").strip()
        tag = block.get("type", "").lower()

        if not text:
            continue

        lower_text = text.lower()

        # --- Identify section headers ---
        if "summary" in lower_text:
            current_section = "summary"
            continue
        elif "technical skills" in lower_text:
            current_section = "technical"
            continue
        elif "professional experience" in lower_text or "experience" in lower_text:
            current_section = "experience"
            continue

        # --- Handle sections ---
        if current_section == "summary":
            professional_summary.append(text)

        elif current_section == "technical":
            technical_skills.append(text)

        elif current_section == "experience":
            # Detect start of new experience
            if lower_text.startswith("confidential"):
                # Save previous one
                if current_exp:
                    experiences.append(current_exp)
                current_exp = Experience(job_role="", responsibilities=[], environment=None)

            # Detect job role (next P tag after confidential)
            elif tag == "p" and current_exp and not current_exp.job_role:
                current_exp.job_role = text

            # UL → responsibilities
            elif tag == "ul" and current_exp:
                current_exp.responsibilities.append(text)

            # Detect environment line
            elif "environment" in lower_text and current_exp:
                current_exp.environment = text

    # Append the last experience
    if current_exp:
        experiences.append(current_exp)

    # --- Build and validate structured resume ---
    resume = Resume(
        professional_summary=professional_summary,
        technical_skills=technical_skills,
        experiences=experiences,
    )

    # Save structured output
    with open(output_path, "w") as f:
        json.dump(resume.dict(), f, indent=4)

    print(f"✅ Resume successfully parsed and saved to {output_path}")

In [None]:
import json

def split_sections(input_json_path, output_json_path):
    with open(input_json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    structured_content = data.get("structured_content", [])
    result = {}
    current_section = None
    buffer = []

    def flush_section():
        """Helper to flush current buffer into result dict."""
        nonlocal current_section, buffer
        if current_section:
            # Merge text + lists in a readable way
            merged_text = []
            for item in buffer:
                if item["type"] == "p" and "text" in item:
                    merged_text.append(item["text"].strip())
                elif item["type"] == "ul" and "items" in item:
                    merged_text.extend([f"- {i.strip()}" for i in item["items"]])
            result[current_section] = "\n".join(merged_text).strip()
        buffer = []

    # Loop through each block
    for block in structured_content:
        if block["type"] == "p":
            text = block["text"].strip().rstrip(":")
            # Section heading candidates (uppercase)
            if text.isupper() and len(text.split()) <= 3:
                # Flush previous section content
                flush_section()
                # Start new section
                current_section = text
            else:
                buffer.append(block)
        else:
            buffer.append(block)

    # Flush last section
    flush_section()

    with open(output_json_path, 'w', encoding='utf-8') as f:
        json.dump(result, f, indent=4, ensure_ascii=False)

    print(f"✅ Output JSON saved to: {output_json_path}")
    return result

In [None]:
split_sections("post_content_79_2.json", "structured_resume.json")

✅ Output JSON saved to: structured_resume.json


{'SUMMARY': '- Over 18 years of experience in Application design, development and data architecture on a wide variety of platforms, operating systems and databases mainly in Banking and Finance domain.\n- Extensive experience managing and supporting multiple projects involving design and development of OLTP & Data warehouse databases in Oracle 12c/11.2/10g , Teradata, Netezza, NoSQL databases - Cassandra , SQL Server across multiple industries such as Banking, Insurance, Financial, Taxation.\n- More than 8 years of experience as D ata architect involved in design and implementation of database solutions, data integration techniques, D ata modeling , Master Data management & data governance, data quality/data profiling techniques\n- Extensive experience in Oracle Sql & PLSQL code development, writing and reviewing procedures, functions, packages , managing indexes and partitions , reviewing execution plans for query tuning and optimization, OEM database monitoring, OBIEE repositories an

In [None]:
content

[]

In [None]:
ls

data_scraping.ipynb     post_content_79_2.json  scraping_utils.py
output.json             resume.json             scraping_v2.ipynb
post_content.json       resume.txt              test.ipynb
post_content_79.json    scraping.ipynb          test.py


In [1]:
import json
from typing import Dict, List, Any

def parse_resume_json(input_json_file: str, output_json_file: str) -> None:
    """
    Parse a resume JSON file and extract structured information based on the specified logic.
    
    Args:
        input_json_file: Path to the input JSON file
        output_json_file: Path to the output JSON file
    """
    
    # Load the input JSON file
    with open(input_json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
        data = data['structured_content']
    
    # Initialize the result structure
    result = {
        "summary": "",
        "technical_skills": "",
        "professional_experience": []
    }
    
    # Extract content (assuming the JSON has a structure with text content)
    # Adjust this based on your actual JSON structure
    content = data.get('content', []) or data.get('text', []) or data.get('elements', [])
    
    # Phase 1: Extract Summary (from summary to technical skills)
    summary_parts = []
    technical_skills_parts = []
    professional_summary_parts = []
    experiences = []
    current_experience = None
    
    in_summary = False
    in_technical_skills = False
    in_professional_summary = False
    in_experience = False
    found_technical_skills = False
    found_professional_summary = False
    
    for element in content:
        # Check if element is a dictionary with text/tag information
        if isinstance(element, dict):
            text = element.get('text', '').lower().strip()
            tag = element.get('tag', '').lower()
            
            # Start summary when we find "summary"
            if "summary" in text and not found_technical_skills:
                in_summary = True
                in_technical_skills = False
                in_professional_summary = False
                in_experience = False
                continue
            
            # Start technical skills when we find "technical skills"
            if "technical skills" in text:
                in_summary = False
                in_technical_skills = True
                in_professional_summary = False
                in_experience = False
                found_technical_skills = True
                continue
            
            # Start professional summary when we find "professional summary"
            if "professional summary" in text or "professional experience" in text:
                in_summary = False
                in_technical_skills = False
                in_professional_summary = True
                in_experience = False
                found_professional_summary = True
                continue
            
            # Start new experience when we find "confidential"
            if "confidential" in text and tag == 'p':
                # Save previous experience if exists
                if current_experience:
                    experiences.append(current_experience)
                
                # Start new experience
                current_experience = {
                    "job_role": "",
                    "responsibilities": []
                }
                in_summary = False
                in_technical_skills = False
                in_professional_summary = False
                in_experience = True
                continue
            
            # Extract job role (next P tag after confidential)
            if in_experience and tag == 'p' and not current_experience.get("job_role"):
                if "confidential" not in text:  # Skip the confidential tag itself
                    current_experience["job_role"] = element.get('text', '').strip()
                continue
            
            # Extract responsibilities (UL tags within experience)
            if in_experience and tag == 'ul':
                # Extract list items from UL
                list_items = element.get('items', [])
                for item in list_items:
                    if isinstance(item, str):
                        current_experience["responsibilities"].append(item.strip())
                    elif isinstance(item, dict):
                        current_experience["responsibilities"].append(item.get('text', '').strip())
                continue
            
            # Collect content based on current section
            if in_summary:
                summary_parts.append(element.get('text', '').strip())
            elif in_technical_skills:
                technical_skills_parts.append(element.get('text', '').strip())
            elif in_professional_summary and not in_experience:
                professional_summary_parts.append(element.get('text', '').strip())
        
        # Handle string elements directly
        elif isinstance(element, str):
            text = element.lower().strip()
            
            if "summary" in text and not found_technical_skills:
                in_summary = True
                in_technical_skills = False
                in_professional_summary = False
                in_experience = False
            elif "technical skills" in text:
                in_summary = False
                in_technical_skills = True
                in_professional_summary = False
                in_experience = False
                found_technical_skills = True
            elif "professional summary" in text or "professional experience" in text:
                in_summary = False
                in_technical_skills = False
                in_professional_summary = True
                in_experience = False
                found_professional_summary = True
            elif "confidential" in text:
                if current_experience:
                    experiences.append(current_experience)
                current_experience = {"job_role": "", "responsibilities": []}
                in_experience = True
            else:
                if in_summary:
                    summary_parts.append(element.strip())
                elif in_technical_skills:
                    technical_skills_parts.append(element.strip())
                elif in_professional_summary and not in_experience:
                    professional_summary_parts.append(element.strip())
    
    # Add the last experience if exists
    if current_experience:
        experiences.append(current_experience)
    
    # Build the result
    result["summary"] = " ".join(summary_parts).strip()
    result["technical_skills"] = " ".join(technical_skills_parts).strip()
    
    # If no technical skills found, use professional summary as fallback
    if not result["technical_skills"] and professional_summary_parts:
        result["technical_skills"] = " ".join(professional_summary_parts).strip()
    
    result["professional_experience"] = experiences
    
    # Save the result to output JSON file
    with open(output_json_file, 'w', encoding='utf-8') as f:
        json.dump(result, f, indent=2, ensure_ascii=False)

# Alternative version with more flexible element parsing
def parse_resume_json_advanced(input_json_file: str, output_json_file: str) -> None:
    """
    Advanced parser with more flexible element handling.
    """
    
    with open(input_json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    
    result = {
        "summary": "",
        "technical_skills": "",
        "professional_experience": []
    }
    
    content = data.get('content', []) or data.get('text', []) or data.get('elements', []) or []
    
    def extract_text(element):
        """Extract text from various element formats"""
        if isinstance(element, str):
            return element
        elif isinstance(element, dict):
            return element.get('text', '') or element.get('content', '') or ''
        return ''
    
    sections = {
        'summary': [],
        'technical_skills': [],
        'professional_experience': []
    }
    
    current_section = None
    current_experience = None
    
    for element in content:
        text = extract_text(element).lower().strip()
        original_text = extract_text(element).strip()
        
        # Determine current section
        if "summary" in text and current_section != 'technical_skills':
            current_section = 'summary'
        elif "technical skills" in text:
            current_section = 'technical_skills'
        elif "professional summary" in text or "professional experience" in text:
            current_section = 'professional_experience'
        elif "confidential" in text:
            # Start new experience
            if current_experience and current_experience.get('job_role') or current_experience.get('responsibilities'):
                sections['professional_experience'].append(current_experience)
            current_experience = {'job_role': '', 'responsibilities': []}
        
        # Collect content based on section
        if current_section and original_text:
            if current_section == 'professional_experience' and current_experience is not None:
                # Handle job role and responsibilities
                if not current_experience['job_role'] and "confidential" not in text:
                    current_experience['job_role'] = original_text
                elif isinstance(element, dict) and element.get('tag', '').lower() == 'ul':
                    # Extract list items
                    items = element.get('items', [])
                    for item in items:
                        item_text = extract_text(item)
                        if item_text:
                            current_experience['responsibilities'].append(item_text)
            else:
                # For summary and technical skills
                if current_section in sections and "confidential" not in text:
                    sections[current_section].append(original_text)
    
    # Add the last experience
    if current_experience and (current_experience.get('job_role') or current_experience.get('responsibilities')):
        sections['professional_experience'].append(current_experience)
    
    # Build final result
    result["summary"] = " ".join(sections['summary']).strip()
    result["technical_skills"] = " ".join(sections['technical_skills']).strip()
    result["professional_experience"] = sections['professional_experience']
    
    with open(output_json_file, 'w', encoding='utf-8') as f:
        json.dump(result, f, indent=2, ensure_ascii=False)


In [None]:
all_resume_links[0]

'https://www.hireitpeople.com/resume-database/70-oracle-developers-resumes/628272-teradata-dba-resume-hoffman-estates-il-2'

In [None]:
parse_resume_json('post_content_79_2.json', 'output.json')

AttributeError: 'list' object has no attribute 'get'

In [None]:
def naveen_parser(input_json_file: str, output_json_file: str) -> None:
    new_data = {}
    with open(input_json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    flag = None
    list_data = []
    data = data['structured_content']
    for block in data:
        if block.get("type") == "p":
            if(block.get("text").lower().strip() == "summary"):
                flag = "summary"
                new_data["summary"] = block.get("text")
        elif block.get("type") == "ul" and flag != None:
            
            


In [None]:
naveen_parser("post_content_79_2.json", "structured_resume.json")

SUMMARY:
['Over 18 years of experience in Application design, development and data architecture on a wide variety of platforms, operating systems and databases mainly in Banking and Finance domain.', 'Extensive experience managing and supporting multiple projects involving design and development of OLTP & Data warehouse databases in Oracle 12c/11.2/10g , Teradata, Netezza, NoSQL databases - Cassandra , SQL Server across multiple industries such as Banking, Insurance, Financial, Taxation.', 'More than 8 years of experience as D ata architect involved in design and implementation of database solutions, data integration techniques, D ata modeling , Master Data management & data governance, data quality/data profiling techniques', 'Extensive experience in Oracle Sql & PLSQL code development, writing and reviewing procedures, functions, packages , managing indexes and partitions , reviewing execution plans for query tuning and optimization, OEM database monitoring, OBIEE repositories and da

In [None]:
from typing import List, Optional, Dict, Any
from pydantic import BaseModel
from enum import Enum

class SectionType(Enum):
    SUMMARY = "summary"
    TECHNICAL_SKILLS = "technical_skills"
    PROFESSIONAL_EXPERIENCE = "professional_experience"
    UNKNOWN = "unknown"

class Experience(BaseModel):
    company: str
    job_role: str
    responsibilities: List[str]
    environment: Optional[str] = None

class Resume(BaseModel):
    job_role: str
    professional_summary: List[str]
    technical_skills: List[str]
    experiences: List[Experience]

class SectionTracker:
    def __init__(self):
        self.current_section = SectionType.UNKNOWN
        self.section_boundaries = {
            SectionType.SUMMARY: ("SUMMARY:", "TECHNICAL SKILLS:"),
            SectionType.TECHNICAL_SKILLS: ("TECHNICAL SKILLS:", "PROFESSIONAL EXPERIENCE:"),
            SectionType.PROFESSIONAL_EXPERIENCE: ("PROFESSIONAL EXPERIENCE:", None)
        }
    
    def update_section(self, text: str) -> bool:
        """Update current section based on text content"""
        text_upper = text.upper().strip()
        
        for section, (start_marker, end_marker) in self.section_boundaries.items():
            if start_marker and text_upper.startswith(start_marker):
                self.current_section = section
                return True
            elif end_marker and text_upper.startswith(end_marker):
                self.current_section = SectionType(section.value)
                return True
        
        return False

class ExperienceBuilder:
    def __init__(self):
        self.reset()
    
    def reset(self):
        self.current_company = None
        self.current_role = None
        self.current_responsibilities = []
        self.current_environment = None
        self.is_building = False
    
    def start_experience(self, company: str):
        """Start a new experience block"""
        self.reset()
        self.current_company = company
        self.is_building = True
    
    def set_role(self, role: str):
        """Set the job role for current experience"""
        self.current_role = role
    
    def add_responsibility(self, responsibility: str):
        """Add responsibility to current experience"""
        self.current_responsibilities.append(responsibility)
    
    def set_environment(self, environment: str):
        """Set environment for current experience"""
        self.current_environment = environment
    
    def build(self) -> Optional[Experience]:
        """Build and return the experience if valid"""
        if (self.is_building and self.current_company and 
            self.current_role and self.current_responsibilities):
            return Experience(
                company=self.current_company,
                job_role=self.current_role,
                responsibilities=self.current_responsibilities.copy(),
                environment=self.current_environment
            )
        return None

class JSONResumeMerger:
    def __init__(self):
        self.section_tracker = SectionTracker()
        self.experience_builder = ExperienceBuilder()
        self.resume_data = {
            'professional_summary': [],
            'technical_skills': [],
            'experiences': []
        }
    
    def is_confidential_company(self, text: str) -> bool:
        """Check if text indicates a confidential company"""
        return text.upper().startswith("CONFIDENTIAL")
    
    def is_environment_line(self, text: str) -> bool:
        """Check if text is an environment line"""
        return text.upper().startswith("ENVIRONMENT:")
    
    def extract_environment(self, text: str) -> str:
        """Extract environment from environment line"""
        if ":" in text:
            return text.split(":", 1)[1].strip()
        return text.strip()
    
    def process_structured_content(self, structured_content: List[Dict[str, Any]]) -> Resume:
        """Process structured content and build resume"""
        
        i = 0
        while i < len(structured_content):
            item = structured_content[i]
            
            # Update section tracker
            if item['type'] == 'p':
                self.section_tracker.update_section(item['text'])
            
            # Process based on current section
            if self.section_tracker.current_section == SectionType.SUMMARY:
                self._process_summary_section(item)
            
            elif self.section_tracker.current_section == SectionType.TECHNICAL_SKILLS:
                self._process_technical_skills_section(item)
            
            elif self.section_tracker.current_section == SectionType.PROFESSIONAL_EXPERIENCE:
                i = self._process_experience_section(structured_content, i)
            
            i += 1
        
        # Build any remaining experience
        self._finalize_current_experience()
        
        return Resume(**self.resume_data)
    
    def _process_summary_section(self, item: Dict[str, Any]):
        """Process items in summary section"""
        if item['type'] == 'ul':
            self.resume_data['professional_summary'].extend(item['items'])
    
    def _process_technical_skills_section(self, item: Dict[str, Any]):
        """Process items in technical skills section"""
        if item['type'] == 'p' and not item['text'].upper().startswith("TECHNICAL SKILLS:"):
            self.resume_data['technical_skills'].append(item['text'])
    
    def _process_experience_section(self, structured_content: List[Dict[str, Any]], current_index: int) -> int:
        """Process professional experience section"""
        item = structured_content[current_index]
        
        if item['type'] == 'p':
            text = item['text']
            
            # Check for confidential company
            if self.is_confidential_company(text):
                self._finalize_current_experience()
                self.experience_builder.start_experience(text)
            
            # Check for job role (next P tag after confidential)
            elif (self.experience_builder.is_building and 
                  not self.experience_builder.current_role and
                  not self.is_confidential_company(text) and
                  not self.is_environment_line(text)):
                self.experience_builder.set_role(text)
            
            # Check for environment
            elif self.is_environment_line(text):
                environment = self.extract_environment(text)
                self.experience_builder.set_environment(environment)
        
        # Process responsibilities from UL tags
        elif item['type'] == 'ul' and self.experience_builder.is_building:
            for responsibility in item['items']:
                self.experience_builder.add_responsibility(responsibility)
            
            # Check if next item is environment or new experience
            if current_index + 1 < len(structured_content):
                next_item = structured_content[current_index + 1]
                if (next_item['type'] == 'p' and 
                    (self.is_environment_line(next_item['text']) or 
                     self.is_confidential_company(next_item['text']))):
                    self._finalize_current_experience()
        
        return current_index
    
    def _finalize_current_experience(self):
        """Finalize and add current experience to resume data"""
        experience = self.experience_builder.build()
        if experience:
            self.resume_data['experiences'].append(experience)
        self.experience_builder.reset()

# Usage example
def merge_json_resume(json_data: Dict[str, Any]) -> Resume:
    """Main function to merge JSON resume data"""
    merger = JSONResumeMerger()
    
    if 'structured_content' in json_data:
        return merger.process_structured_content(json_data['structured_content'])
    else:
        raise ValueError("No structured_content found in JSON data")

# Example usage with your sample data
if __name__ == "__main__":
    # Your sample JSON data would be loaded here
    sample_json = result
    
    try:
        resume = merge_json_resume(sample_json)
        
        # Print results
        print("PROFESSIONAL SUMMARY:")
        for summary in resume.professional_summary:
            print(f"- {summary}")
        
        print("\nTECHNICAL SKILLS:")
        for skill in resume.technical_skills:
            print(f"- {skill}")
        
        print("\nEXPERIENCES:")
        for exp in resume.experiences:
            print(f"\nCompany: {exp.company}")
            print(f"Role: {exp.job_role}")
            print("Responsibilities:")
            for resp in exp.responsibilities:
                print(f"  - {resp}")
            if exp.environment:
                print(f"Environment: {exp.environment}")
                
    except Exception as e:
        print(f"Error processing resume: {e}")

PROFESSIONAL SUMMARY:
- Over 18 years of experience in Application design, development and data architecture on a wide variety of platforms, operating systems and databases mainly in Banking and Finance domain.
- Extensive experience managing and supporting multiple projects involving design and development of OLTP & Data warehouse databases in Oracle 12c/11.2/10g , Teradata, Netezza, NoSQL databases - Cassandra , SQL Server across multiple industries such as Banking, Insurance, Financial, Taxation.
- More than 8 years of experience as D ata architect involved in design and implementation of database solutions, data integration techniques, D ata modeling , Master Data management & data governance, data quality/data profiling techniques
- Extensive experience in Oracle Sql & PLSQL code development, writing and reviewing procedures, functions, packages , managing indexes and partitions , reviewing execution plans for query tuning and optimization, OEM database monitoring, OBIEE repositor

In [None]:
new_json = {}
section = None
for item in result['structured_content']:
    if item.get("type") == "p" and section != "TECHNICAL SKILLS:" and section != "PROFESSIONAL EXPERIENCE:":
        section = item.get("text").lower().strip()
        new_json[section] = []
    elif item.get("type") == "p" and section == "technical skills" and item.get("text").lower().strip() != "technical skills:":
        new_json[section].extend(item.get("text"))
    elif item.get("type") == "ul":
        new_json[section].extend(item.get("items"))
        


In [None]:
def parse_resume(json_data):
    # Initialize the resume structure
    resume = {
        "professional_summary": [],
        "technical_skills": [],
        "experiences": []
    }
    
    structured_content = json_data.get("structured_content", [])
    
    # Flags to track current section
    in_summary = False
    in_technical_skills = False
    in_professional_experience = False
    
    # Variables for experience parsing
    current_experience = None
    current_job_role = None
    current_responsibilities = []
    current_environment = None
    
    # Iterate through each element in structured_content
    i = 0
    while i < len(structured_content):
        element = structured_content[i]
        
        # Check if we're entering SUMMARY section
        if element["type"] == "p" and "SUMMARY" in element["text"]:
            in_summary = True
            in_technical_skills = False
            in_professional_experience = False
            i += 1
            continue
        
        # Check if we're entering TECHNICAL SKILLS section
        elif element["type"] == "p" and "TECHNICAL SKILLS" in element["text"]:
            in_summary = False
            in_technical_skills = True
            in_professional_experience = False
            i += 1
            continue
        
        # Check if we're entering PROFESSIONAL EXPERIENCE section
        elif element["type"] == "p" and "PROFESSIONAL EXPERIENCE" in element["text"]:
            in_summary = False
            in_technical_skills = False
            in_professional_experience = True
            i += 1
            continue
        
        # Process SUMMARY section
        elif in_summary:
            if element["type"] == "ul":
                resume["professional_summary"].extend(element["items"])
            i += 1
            continue
        
        # Process TECHNICAL SKILLS section
        elif in_technical_skills:
            if element["type"] == "p":
                # Skip the "TECHNICAL SKILLS:" header itself
                if "TECHNICAL SKILLS" not in element["text"]:
                    resume["technical_skills"].append(element["text"])
            i += 1
            continue
        
        # Process PROFESSIONAL EXPERIENCE section
        elif in_professional_experience:
            # Check for "Confidential" P tags to identify new experiences
            if element["type"] == "p" and "Confidential" in element["text"]:
                # Save previous experience if it exists
                if current_experience is not None:
                    experience_data = {
                        "job_role": current_job_role,
                        "responsibilities": current_responsibilities.copy()
                    }
                    if current_environment:
                        experience_data["environment"] = current_environment
                    resume["experiences"].append(experience_data)
                
                # Reset for new experience
                current_job_role = None
                current_responsibilities = []
                current_environment = None
                
                # The next P tag after "Confidential" should be the job role
                if i + 1 < len(structured_content):
                    next_element = structured_content[i + 1]
                    if next_element["type"] == "p":
                        current_job_role = next_element["text"]
                        i += 2  # Skip both confidential and job role
                    else:
                        i += 1
                else:
                    i += 1
                continue
            
            # Check for UL tags (responsibilities)
            elif element["type"] == "ul" and current_job_role:
                current_responsibilities.extend(element["items"])
                i += 1
                continue
            
            # Check for "Environment" P tags
            elif element["type"] == "p" and "Environment:" in element["text"]:
                current_environment = element["text"].replace("Environment:", "").strip()
                i += 1
                continue
            
            # Regular P tag in experience section (might be environment or other info)
            elif element["type"] == "p":
                # If we don't have a job role yet but we're in an experience, this might be it
                if not current_job_role and "Confidential" not in element["text"]:
                    current_job_role = element["text"]
                i += 1
                continue
            
            else:
                i += 1
                continue
        
        else:
            i += 1
            continue
    
    # Don't forget to add the last experience
    if current_job_role:
        experience_data = {
            "job_role": current_job_role,
            "responsibilities": current_responsibilities.copy()
        }
        if current_environment:
            experience_data["environment"] = current_environment
        resume["experiences"].append(experience_data)
    
    return resume

# Alternative version that returns a Resume object (if you want to use the Pydantic model)
def parse_resume_to_object(json_data):
    parsed_data = parse_resume(json_data)
    return Resume(**parsed_data)

# Usage example:
if __name__ == "__main__":
    # Assuming your JSON data is in a variable called 'sample_json'
    resume_data = parse_resume(result)
    
    # Print the results to verify
    print("PROFESSIONAL SUMMARY:")
    for item in resume_data["professional_summary"]:
        print(f"- {item}")
    
    print("\nTECHNICAL SKILLS:")
    for skill in resume_data["technical_skills"]:
        print(f"- {skill}")
    
    print("\nEXPERIENCES:")
    for exp in resume_data["experiences"]:
        print(f"\nJob Role: {exp['job_role']}")
        if 'environment' in exp:
            print(f"Environment: {exp['environment']}")
        print("Responsibilities:")
        for resp in exp['responsibilities']:
            print(f"- {resp}")

PROFESSIONAL SUMMARY:
- Over 18 years of experience in Application design, development and data architecture on a wide variety of platforms, operating systems and databases mainly in Banking and Finance domain.
- Extensive experience managing and supporting multiple projects involving design and development of OLTP & Data warehouse databases in Oracle 12c/11.2/10g , Teradata, Netezza, NoSQL databases - Cassandra , SQL Server across multiple industries such as Banking, Insurance, Financial, Taxation.
- More than 8 years of experience as D ata architect involved in design and implementation of database solutions, data integration techniques, D ata modeling , Master Data management & data governance, data quality/data profiling techniques
- Extensive experience in Oracle Sql & PLSQL code development, writing and reviewing procedures, functions, packages , managing indexes and partitions , reviewing execution plans for query tuning and optimization, OEM database monitoring, OBIEE repositor

### Parser

In [9]:
def parse_resume(json_data):
    # Initialize the resume structure
    resume = {
        "professional_summary": [],
        "technical_skills": [],
        "experiences": []
    }
    
    structured_content = json_data.get("structured_content", [])
    
    # Flags to track current section
    in_summary = False
    in_technical_skills = False
    in_professional_experience = False
    
    # Variables for experience parsing
    current_experience = None
    current_job_role = None
    current_responsibilities = []
    current_environment = None
    experience_started = False
    
    # Iterate through each element in structured_content
    i = 0
    while i < len(structured_content):
        element = structured_content[i]
        
        # Check if we're entering SUMMARY section
        if element["type"] == "p" and "SUMMARY" in element["text"]:
            in_summary = True
            in_technical_skills = False
            in_professional_experience = False
            i += 1
            continue
        
        # Check if we're entering TECHNICAL SKILLS section
        elif element["type"] == "p" and "TECHNICAL SKILLS" in element["text"]:
            in_summary = False
            in_technical_skills = True
            in_professional_experience = False
            i += 1
            continue
        
        # Check if we're entering PROFESSIONAL EXPERIENCE section
        elif element["type"] == "p" and "PROFESSIONAL EXPERIENCE" in element["text"]:
            in_summary = False
            in_technical_skills = False
            in_professional_experience = True
            i += 1
            continue
        
        # Process SUMMARY section
        elif in_summary:
            if element["type"] == "ul":
                resume["professional_summary"].extend(element["items"])
            i += 1
            continue
        
        # Process TECHNICAL SKILLS section
        elif in_technical_skills:
            if element["type"] == "p":
                # Skip the "TECHNICAL SKILLS:" header itself
                if "TECHNICAL SKILLS" not in element["text"]:
                    resume["technical_skills"].append(element["text"])
            i += 1
            continue
        
        # Process PROFESSIONAL EXPERIENCE section
        elif in_professional_experience:
            # Check for "Confidential" P tags to identify new experiences
            if element["type"] == "p" and "Confidential" in element["text"]:
                # Save previous experience if it exists and has data
                if experience_started and current_job_role:
                    experience_data = {
                        "job_role": current_job_role,
                        "responsibilities": current_responsibilities.copy()
                    }
                    if current_environment:
                        experience_data["environment"] = current_environment
                    resume["experiences"].append(experience_data)
                
                # Reset for new experience
                current_job_role = None
                current_responsibilities = []
                current_environment = None
                experience_started = True
                
                # The next P tag after "Confidential" should be the job role
                if i + 1 < len(structured_content):
                    next_element = structured_content[i + 1]
                    if next_element["type"] == "p":
                        current_job_role = next_element["text"]
                        i += 2  # Skip both confidential and job role
                    else:
                        i += 1
                else:
                    i += 1
                continue
            
            # Check for UL tags (responsibilities)
            elif element["type"] == "ul" and current_job_role:
                current_responsibilities.extend(element["items"])
                i += 1
                continue
            
            # Check for "Environment" P tags
            elif element["type"] == "p" and "Environment:" in element["text"]:
                current_environment = element["text"].replace("Environment:", "").strip()
                i += 1
                continue
            
            # Check for "Environment" without colon (some entries might have different formatting)
            elif element["type"] == "p" and "Environment" in element["text"] and current_job_role:
                current_environment = element["text"].replace("Environment", "").strip()
                if current_environment.startswith(":"):
                    current_environment = current_environment[1:].strip()
                i += 1
                continue
            
            # Regular P tag in experience section (might be environment or other info)
            elif element["type"] == "p":
                # If we don't have a job role yet but we're in an experience, this might be it
                if not current_job_role and "Confidential" not in element["text"] and "PROFESSIONAL EXPERIENCE" not in element["text"]:
                    current_job_role = element["text"]
                i += 1
                continue
            
            else:
                i += 1
                continue
        
        else:
            i += 1
            continue
    
    # Don't forget to add the last experience if it exists
    if experience_started and current_job_role:
        experience_data = {
            "job_role": current_job_role,
            "responsibilities": current_responsibilities.copy()
        }
        if current_environment:
            experience_data["environment"] = current_environment
        resume["experiences"].append(experience_data)
    
    return resume

# Alternative version that returns a Resume object
def parse_resume_to_object(json_data):
    parsed_data = parse_resume(json_data)
    return Resume(**parsed_data)

# Usage example:
if __name__ == "__main__":
    # Assuming your JSON data is in a variable called 'sample_json'
    resume_data = parse_resume(result)
    with open("structured_resume.json", "w") as f:
        json.dump(resume_data, f, indent=4)
    
    # Print the results to verify
    print("PROFESSIONAL SUMMARY:")
    for item in resume_data["professional_summary"]:
        print(f"- {item}")
    
    print("\nTECHNICAL SKILLS:")
    for skill in resume_data["technical_skills"]:
        print(f"- {skill}")
    
    print("\nEXPERIENCES:")
    for idx, exp in enumerate(resume_data["experiences"]):
        #print(f"\n--- Experience {idx + 1} ---")
        print("\n")
        print(f"Job Role: {exp['job_role']}")
        if 'environment' in exp:
            print(f"Environment: {exp['environment']}")
        print("Responsibilities:")
        for resp in exp['responsibilities']:
            print(f"- {resp}")

PROFESSIONAL SUMMARY:
- Over 18 years of experience in Application design, development and data architecture on a wide variety of platforms, operating systems and databases mainly in Banking and Finance domain.
- Extensive experience managing and supporting multiple projects involving design and development of OLTP & Data warehouse databases in Oracle 12c/11.2/10g , Teradata, Netezza, NoSQL databases - Cassandra , SQL Server across multiple industries such as Banking, Insurance, Financial, Taxation.
- More than 8 years of experience as D ata architect involved in design and implementation of database solutions, data integration techniques, D ata modeling , Master Data management & data governance, data quality/data profiling techniques
- Extensive experience in Oracle Sql & PLSQL code development, writing and reviewing procedures, functions, packages , managing indexes and partitions , reviewing execution plans for query tuning and optimization, OEM database monitoring, OBIEE repositor

In [58]:
resume_data = parse_resume(result)
with open("structured_resume.json", "w") as f:
    json.dump(resume_data, f, indent=4)

# Print the results to verify
print("PROFESSIONAL SUMMARY:")
for item in resume_data["professional_summary"]:
    print(f"- {item}")

print("\nTECHNICAL SKILLS:")
for skill in resume_data["technical_skills"]:
    print(f"- {skill}")

print("\nEXPERIENCES:")
for idx, exp in enumerate(resume_data["experiences"]):
    print(f"\n--- Experience {idx + 1} ---")
    print(f"Job Role: {exp['job_role']}")
    if 'environment' in exp:
        print(f"Environment: {exp['environment']}")
    print("Responsibilities:")
    for resp in exp['responsibilities']:
        print(f"- {resp}")

PROFESSIONAL SUMMARY:
- 8+ years of profound working experience in SQL, PL/SQL programming, System Analysis, Design, Development, and Support of Oracle Database … in Production, Staging, Development, and Testing Environments.
- Expertise in writing SQL queries, Views, Materialized views, PL/SQL procedures, functions, packages, triggers, cursors, collections, Ref cursor, cursor variables, System reference cursor , Dynamic SQL.
- Experience in developing External Tables, Joins, Indexes, and Sequences.
- Experience in query optimization, performance, and tuning (PL/SQL) using SQL Trace , Explain Plan, Indexing, Hints, Bulk Binds, Bulk Collect, Creation of global temporary tables and table partitioning.
- Experience in using several built - in database packages such as DBMS JOB, DBMS OUTPUT , UTL FILE , and DBMS SQL to name a few.
- Experience in all the phases of SDLC, including requirements gathering, system design, Development, Unit testing, Integration testing, implementation, and prod