In [2]:
import requests
import re
from bs4 import BeautifulSoup, Tag
from concurrent.futures import ThreadPoolExecutor, as_completed
import random
import time

from pprint import pprint

import html2text
from lxml import html

In [16]:
def scrape_resume_markdown(url: str, session=None) -> str:
    """Fetch a resume page and convert main content to Markdown-like text."""
    if session is None:
        session = requests.Session()

    response = session.get(url)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "lxml")

    # find main content blocks
    target_divs = soup.find_all("div", class_=["media-body", "single-post-body"])
    if not target_divs:
        return ""

    markdown_parts = []

    for div in target_divs:
        div_html = str(div)
        tree = html.fromstring(div_html)

        for element in tree.iter():
            tag = element.tag.lower()

            # headings
            if tag in ["h1", "h2", "h3", "u", "strong", "b"]:
                text = element.text_content().strip()
                if text:
                    markdown_parts.append(f"\n# {text}\n")

            # paragraphs
            elif tag == "p":
                text = element.text_content().strip()
                if text:
                    markdown_parts.append(f"{text}\n")

            # unordered lists
            elif tag == "ul":
                for li in element:
                    li_text = li.text_content().strip()
                    if li_text:
                        markdown_parts.append(f"- {li_text}\n")

            # ordered lists
            elif tag == "ol":
                for i, li in enumerate(element, 1):
                    li_text = li.text_content().strip()
                    if li_text:
                        markdown_parts.append(f"{i}. {li_text}\n")

    markdown_text = "".join(markdown_parts)
    return markdown_text.strip()

In [17]:
markdown_resume = scrape_resume_markdown(url=test_urls[4])
print(markdown_resume)

# .net Developer/analyst/architect Resume

# 4.00
PROFESSIONAL SUMMARY

# PROFESSIONAL SUMMARY
- I have 10 years of professional expertise in  the field of software engineering, where I graduated and built my  experience as a senior software engineer, system 
   analyst, system architect and knowledge  in IT project management.
- Theoretically and radically my knowledge is  based on the axis of Object Oriented  
  Concepts and Object Oriented Software  Engineering (OOSE).
- I have been practically using .Net and all the  other related technologies since the inception of .Net right from the time,  when it was in Beta version.
- I worked with VB (4, 5 and 6) and (C, C++ and  VC++6) before I started my  professional work, then moved to  .NET(1.0,1.1,2.0,3.0,3.5 and 4.0).
- I have good experience in .Net and all the  aspects of this tool, which I have used  to create Windows, Web applications and  Services in both VB.Net and C#.
- As to the Database, I utilized both Oracle and  SQL (2000, 

In [1]:
test_urls = [
    "https://www.hireitpeople.com/resume-database/80-peoplesoft-resumes/627367-peoplesoft-hcm-hrms-functional-consultant-resume-3",
    "https://www.hireitpeople.com/resume-database/80-peoplesoft-resumes/613216-peoplesoft-functional-consultant-business-analyst-resume-atlanta-ga-4",
    "https://www.hireitpeople.com/resume-database/69-help-desk-support-resumes/614271-help-desk-support-trading-floor-support-resume-new-york-ny",
    "https://www.hireitpeople.com/resume-database/69-help-desk-support-resumes/606776-computer-technician-field-associate-resume-new-york-ny-11",
    "https://www.hireitpeople.com/resume-database/63-net-developers-architects-resumes/442-net-developeranalystarchitect-resume-",
]

In [30]:
def scrape_resume(url: str, session=None) -> dict:
    """Scrape details from an individual resume page and return plain text."""
    if session is None:
        session = requests.Session()

    response = session.get(url)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "lxml")
    data = {}

    target_divs = soup.find_all("div", class_=["single-post-body"])
    print(target_divs)
    text_blocks = []
    for div in target_divs:
        text = div.get_text(separator=" ", strip=True)
        if text:
            text_blocks.append(text)

    clean_text = "\n\n".join(text_blocks)

    return data


In [31]:
data = scrape_resume(url=test_urls[0])
# print(data)

[<div class="single-post-body">
<p>
</p><p><u><strong>SUMMARY</strong></u></p>
<ul>
<li>8 years of consultancy experience including in PeopleSoft HCM,Global Payroll,Business Analysis, Project Planning &amp; implementation of complex information technology projects in different sectors.</li>
<li>Experience in Requirement Gathering, JAD sessions, Writing Specifications, Testing, UAT, Implementation, Change Management, Defect Management, Issue Resolution and Support.</li>
<li>Excellent understanding of the software methodologies such as Agile, waterfall and RUP.</li>
<li>Coordinate and supervise the progress of all phases of the project, those of Strategy, Planning, Structure, Construct, Transition and Deploy.</li>
<li>Extensive knowledge about the concepts of PeopleSoft Time and Labor.</li>
<li>Excellent communication, team building and project management skills.</li>
<li>Implementation of PeopleSoft HCM and Global Payroll version 9.0.</li>
<li>Supporting NA Payroll, Time &amp; Labor, Be

In [None]:
def make_list(ul):
    if isinstance(ul, str):
        soup = BeautifulSoup(ul, "lxml")
        ul_tag = soup.find("ul")
        if not ul_tag:
            return []
    elif isinstance(ul, Tag):
        ul_tag = ul
    else:
        return []

    points = []
    for li in ul_tag.find_all("li", recursive=False):
        text = li.get_text(" ", strip=True)
        if text:
            points.append(text)
    return points


In [238]:
def scrape_resume(url: str, session=None) -> dict:
    """Scrape details from an individual resume page, keeping DOM tags without duplication."""
    if session is None:
        session = requests.Session()

    response = session.get(url)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")
    sections = {}

    target_divs = soup.find_all("div", class_=["media-body", "single-post-body"])
    print(target_divs)
    def get_real_keys(strong_tags):
        possible_keys = {"TECHNICAL SKILLS", "SKILLS", "PROFESSIONAL EXPERIENCE", "EXPERIENCE", "SUMMARY", "OBJECTIVE"}
        ret_list = []
        for tag in strong_tags:
            for possible_key in possible_keys:
                if possible_key in tag:
                    ret_list.append(tag)
        return ret_list

    for div in target_divs:
        real_keys = div.find_all("u")
        # strong_tags = div.find_all("strong")
        # real_keys = get_real_keys(strong_tags)
        print(real_keys)
        for i, u_tag in enumerate(real_keys):
            key = u_tag.get_text(strip=True).replace(":", "").upper()

            content_tags = []
            for sibling in u_tag.parent.next_siblings:
                if isinstance(sibling, Tag):
                    if sibling.find("u"):
                        break
                    content_tags.append(sibling)

            sections[key] = content_tags

    return sections


res_dict = scrape_resume(test_urls[0])
# for key in res_dict.keys():
#     print(key, res_dict[key], end="\n -------------- \n")

[<div class="media-body">
<div>
<h3>Peoplesoft Hcm / Hrms Functional Consultant Resume</h3>
<div class="resume-rated mt-0">
<div class="rating-group">
<input checked="" class="rating__input rating__input--none" disabled="" id="rated-none" readonly="" type="radio" value="0"/>
<label aria-label="1 star" class="rating__label" for="rated-1"><i class="rating__icon rating__icon--star fa fa-star"></i></label>
<input class="rating__input" disabled="" id="rated-1" readonly="" type="radio" value="1"/>
<label aria-label="2 stars" class="rating__label" for="rated-2"><i class="rating__icon rating__icon--star fa fa-star"></i></label>
<input class="rating__input" disabled="" id="rated-2" readonly="" type="radio" value="2"/>
<label aria-label="3 stars" class="rating__label" for="rated-3"><i class="rating__icon rating__icon--star fa fa-star"></i></label>
<input class="rating__input" disabled="" id="rated-3" readonly="" type="radio" value="3"/>
<label aria-label="4 stars" class="rating__label" for="rate

In [235]:
def parse_professional_experience(res_dict):
    if "PROFESSIONAL EXPERIENCE" not in res_dict:
        return []

    items = res_dict["PROFESSIONAL EXPERIENCE"]
    prev_col, curr_col = False, None
    exps_list, exp_dict = [], {}
    i = 0

    while i < len(items):
        item = items[i]

        if item.find("strong"):
            strip_strong = item.get_text(strip=True)
            curr_col = ":" in strip_strong
            if prev_col and not curr_col and exp_dict:
                exps_list.append(exp_dict)
                exp_dict = {}

            exp_len = len(exp_dict)
            if exp_len == 0:
                exp_dict["company_name"] = strip_strong
            elif exp_len == 1:
                exp_dict["job_role"] = strip_strong
            else:
                text_lower = strip_strong.lower()

                if "responsi" in text_lower:
                    if i + 1 < len(items):
                        exp_dict["responsibilities"] = make_list(items[i+1])

                elif "environment" in text_lower[:20]:
                    exp_dict["environment"] = " ".join(strip_strong.split(":")[1:]).strip()

            prev_col = curr_col

        i += 1

    if exp_dict:
        exps_list.append(exp_dict)

    return exps_list


In [236]:
res_dict = scrape_resume(test_urls[0])
# print(res_dict)
print(len(res_dict))
for exp in parse_px(res_dict):
    pprint(exp)

[]
[<u><strong>SUMMARY</strong></u>, <u><strong>TECHNICAL SKILLS</strong></u>, <u><strong>PROFESSIONAL EXPERIENCE</strong></u>]
3
{'company_name': 'Confidential, New jersey',
 'environment': 'Application Release 9.1 Tools 8.48,8.49,Microsoft Office '
                'Tools,MS Project, Windows Server 2003, Microsoft SQL Server.',
 'job_role': 'PeopleSoft HCM / HRMS Functional Consultant',
 'responsibilities': ['Making of the initial project plan with activities by '
                      'phases, key milestones and deliverables according to '
                      'the scope of the RFP.',
                      'Participating in meetings with members of the board '
                      'notifying progress and proposing solutions for some '
                      'problems within the project.',
                      'Co-ordination with multiple teams and integration '
                      'architects, in performing analysis for issues regarding '
                      'speed and stabilit