# Web Scraping Resume Skills with BeautifulSoup

This notebook demonstrates how to extract skill information from an HTML resume page fetched from a URL, using `requests` to download the page and BeautifulSoup to parse it.

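## Features:
- Fetches the resume HTML page from a user-supplied URL
- Parses the page and extracts the Skills section
- Lists the extracted skill entries (one per category line in the sample resume)
- Displays formatted output and saves the skills to a CSV file
- Optionally extracts every headed section of the resume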

In [25]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [26]:
# Step 1: Fetch the resume webpage
# requests waits indefinitely for a response unless a timeout is supplied,
# which would leave this cell hanging on an unresponsive host.
REQUEST_TIMEOUT_SECONDS = 30

# Strip stray whitespace/newlines that tend to come along with a pasted URL
resume_url = input("Enter link to resume webpage (S3 URL): ").strip()
# e.g., "https://your-bucket.s3.amazonaws.com/resume.html"

try:
    # Send GET request to fetch the webpage
    response = requests.get(resume_url, timeout=REQUEST_TIMEOUT_SECONDS)
    response.raise_for_status()  # Raise an error for bad status codes

    print(f"Successfully fetched webpage. Status Code: {response.status_code}")
    print(f"Content Length: {len(response.content)} bytes")

except requests.exceptions.RequestException as e:
    # Covers connection errors, timeouts, invalid URLs and HTTP error statuses.
    print(f"Error fetching webpage: {e}")
    # Later cells check `if response:` and skip their work when the fetch failed.
    response = None


Successfully fetched webpage. Status Code: 200
Content Length: 8083 bytes


In [27]:
# Step 2: Parse the HTML content
if response:
    # Build the parse tree with Python's built-in HTML parser
    soup = BeautifulSoup(response.content, 'html.parser')

    # Show only the first 1000 characters of the pretty-printed document
    print("HTML Structure Preview:", soup.prettify()[:1000], sep="\n")


HTML Structure Preview:
<!DOCTYPE html>
<html lang="en">
 <head>
  <meta charset="utf-8"/>
  <meta content="width=device-width, initial-scale=1.0" name="viewport"/>
  <title>
   Resume - Krish Aggarwal
  </title>
  <style>
   * {
      margin: 0;
      padding: 0;
      box-sizing: border-box;
    }
    body {
      font-family: 'Times New Roman', Times, serif;
      max-width: 850px;
      margin: 0 auto;
      padding: 40px 50px;
      line-height: 1.5;
      color: #000;
      background: #fff;
    }
    h1 {
      text-align: center;
      font-size: 28px;
      font-weight: bold;
      margin-bottom: 8px;
    }
    .contact-info {
      text-align: center;
      font-size: 11px;
      margin-bottom: 20px;
      display: flex;
      justify-content: center;
      align-items: center;
      gap: 8px;
      flex-wrap: wrap;
    }
    .contact-info a {
      color: #0066cc;
      text-decoration: none;
    }
    .contact-info a:hover {
      text-decoration: underline;
   


In [28]:
# Step 3: Extract Skills Section
# This will try multiple common patterns for skills sections

def extract_skills(soup, separator=''):
    """
    Extract skills from resume webpage using multiple strategies.

    The strategies are tried in order and the first one that yields any
    text wins:

    1. An element whose ``id`` or ``class`` contains "skill". The text of its
       ``li``/``span``/``p`` descendants is collected; if it has none, every
       non-empty line of the element's own text is used instead.
    2. A heading whose text contains "skill". List items and ``p``/``div``
       text are taken from the siblings that follow it, up to the next
       heading.
    3. The first ``ul``/``ol`` whose nearest preceding heading contains
       "skill".

    Args:
        soup: Parsed document (BeautifulSoup object or tag) to search.
        separator: String inserted between the text fragments that make up
            one entry. The default ``''`` preserves the original output, in
            which text from adjacent inline elements is joined with nothing
            in between (e.g. "Frontend:React.js"); pass ``' '`` to keep such
            fragments apart.

    Returns:
        list[str]: Extracted skill entries in document order, or an empty
        list when no strategy finds anything.
    """
    heading_tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
    item_tags = ['li', 'span', 'p']

    def mentions_skill(value):
        # The matcher receives None for tags that lack the attribute.
        return bool(value) and 'skill' in str(value).lower()

    skills = []

    # Strategy 1: Look for sections with 'skill' in id or class.
    # The class lookup already considers every tag, so separate lookups
    # restricted to <section> or <div> could never find anything extra.
    skills_section = soup.find(id=mentions_skill) or soup.find(class_=mentions_skill)

    if skills_section is not None:
        print("Found skills section using Strategy 1 (id/class matching)")

        candidates = skills_section.find_all(item_tags)
        # Compare by identity: two distinct tags with equal markup must not
        # be mistaken for one another.
        candidate_ids = {id(element) for element in candidates}
        for item in candidates:
            # find_all() returns matches at every depth, so markup such as
            # <li><span>Java</span></li> would report "Java" twice. Keep only
            # the outermost match; its text already includes the nested one.
            if any(id(ancestor) in candidate_ids for ancestor in item.parents):
                continue
            skill_text = item.get_text(separator, strip=True)
            if skill_text:
                skills.append(skill_text)

        # If no list items, get all text
        if not skills:
            skills_text = skills_section.get_text(separator='\n', strip=True)
            skills = [line.strip() for line in skills_text.split('\n') if line.strip()]

    # Strategy 2: Look for headings containing 'skill'
    if not skills:
        print("Trying Strategy 2 (heading-based search)")
        for heading in soup.find_all(heading_tags):
            if 'skill' not in heading.get_text().lower():
                continue

            # Walk the following siblings until the next heading starts a
            # new section.
            next_element = heading.find_next_sibling()
            while next_element:
                if next_element.name in heading_tags:
                    break
                if next_element.name in ['ul', 'ol']:
                    skills.extend(
                        entry.get_text(separator, strip=True)
                        for entry in next_element.find_all('li')
                    )
                elif next_element.name in ['p', 'div']:
                    text = next_element.get_text(separator, strip=True)
                    if text:
                        skills.append(text)
                next_element = next_element.find_next_sibling()

            # Stop at the first skills heading that produced content.
            if skills:
                break

    # Strategy 3: Search for all lists and filter
    if not skills:
        print("Trying Strategy 3 (comprehensive list search)")
        for lst in soup.find_all(['ul', 'ol']):
            # Use the nearest heading before the list as its context.
            previous_heading = lst.find_previous(heading_tags)
            context = previous_heading.get_text().lower() if previous_heading is not None else ""

            if 'skill' in context:
                skills.extend(
                    entry.get_text(separator, strip=True)
                    for entry in lst.find_all('li')
                )
                break

    return skills


In [29]:
# Step 4: Execute extraction and display results
if response:
    extracted_skills = extract_skills(soup)
    divider = "=" * 50

    # Banner
    print(f"\n{divider}")
    print("EXTRACTED SKILLS")
    print(divider)

    if not extracted_skills:
        # Nothing matched: point the reader at manual inspection of the markup
        print("No skills found. The HTML structure might be different.")
        print("\nTip: Inspect the HTML structure manually to identify the skills section.")
        print("You can view the full HTML by uncommenting the line below:")
        print("# print(soup.prettify())")
    else:
        # Numbered listing, starting at 1
        for position, entry in enumerate(extracted_skills, start=1):
            print(f"{position}. {entry}")

        print(f"\nTotal skills found: {len(extracted_skills)}")

    print(divider)


Found skills section using Strategy 1 (id/class matching)

EXTRACTED SKILLS
1. Programming Languages:Java, JavaScript, C, C++, Python
2. Backend & APIs:Spring Boot, RESTful Web Services, Node.js, Express.js, JWT Authentication
3. Frontend:React.js, HTML, CSS, JavaScript
4. Databases:MySQL, MongoDB
5. Cloud & DevOps:AWS (EC2, basic deployment concepts), Git, GitHub
6. Testing & Quality:API Testing (Postman), Debugging, Refactoring, Performance Optimization (Basics)
7. Development Practices:Agile/Scrum, Microservices Architecture, Secure Coding Fundamentals

Total skills found: 7


In [30]:
# Step 5: Save skills to a CSV file
if response and extracted_skills:
    # One row per extracted entry, numbered from 1
    skills_df = pd.DataFrame(
        {
            'Skill_Number': range(1, len(extracted_skills) + 1),
            'Skill': extracted_skills,
        }
    )

    # Write the table without the DataFrame index column
    output_file = 'extracted_skills.csv'
    skills_df.to_csv(output_file, index=False)
    print(f"\nSkills saved to: {output_file}")

    # Echo the table that was just written
    print("\nSkills DataFrame:", skills_df, sep="\n")



Skills saved to: extracted_skills.csv

Skills DataFrame:
   Skill_Number                                              Skill
0             1  Programming Languages:Java, JavaScript, C, C++...
1             2  Backend & APIs:Spring Boot, RESTful Web Servic...
2             3           Frontend:React.js, HTML, CSS, JavaScript
3             4                           Databases:MySQL, MongoDB
4             5  Cloud & DevOps:AWS (EC2, basic deployment conc...
5             6  Testing & Quality:API Testing (Postman), Debug...
6             7  Development Practices:Agile/Scrum, Microservic...


In [None]:
# Step 6: Advanced extraction - Get all sections
def extract_all_sections(soup):
    """
    Collect the text that follows each heading in the resume.

    For every ``h1``-``h6`` heading, the sibling elements after it are read
    until the next heading sibling (or the end of the siblings). Headings
    with no non-empty text after them are left out, and a heading title that
    occurs again replaces the earlier entry.

    Args:
        soup: Parsed document to scan for headings.

    Returns:
        dict: Maps each heading's stripped text to the list of stripped,
        non-empty text blocks found under it.
    """
    heading_tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']

    def texts_after(heading):
        # Yield the stripped text of each sibling up to the next heading.
        node = heading.find_next_sibling()
        while node and node.name not in heading_tags:
            yield node.get_text(strip=True)
            node = node.find_next_sibling()

    sections = {}
    for heading in soup.find_all(heading_tags):
        body = [text for text in texts_after(heading) if text]
        if body:
            sections[heading.get_text(strip=True)] = body

    return sections


In [None]:
# Optional: Extract all sections for complete resume data
if response:
    all_sections = extract_all_sections(soup)
    divider = "=" * 50

    # Banner
    print(f"\n{divider}")
    print("ALL RESUME SECTIONS")
    print(divider)

    # One titled group per section, with its text blocks as bullet lines
    for heading_text, blocks in all_sections.items():
        print(f"\n### {heading_text} ###")
        for block in blocks:
            print(f"  - {block}")

    print(divider)

