```css

 ___ ___  _ _  ___  ___
<_-</   \| | ||   \<_-<
/__/\___/`___||  _//__/
              |_|      

```
~ material for beautifulsoup ‧₊˚ ⋅ 𓐐𓎩 ‧₊˚ ⋅

₊˚ ⋅ table of contents ₊˚ ⋅ 
* [basics](#the-basics)
* [finding elements](#finding-elements)
* [using css selectors](#using-css-selectors)
* [extracting data from elements](#extracting-data-from-elements)
* [scraping example](#scraping-example)
* [exercise one](#exercise-one)
* [exercise two](#exercise-two)


In [None]:
!pip install requests beautifulsoup4 lxml pandas openpyxl

## the basics

In [None]:
import requests
import csv
import pandas as pd
from bs4 import BeautifulSoup

# --- Step 1: Get the HTML content ---
URL = "https://example.com/"

# It's good practice to include headers to mimic a browser
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.9',
    # 'br' is deliberately not advertised: requests only decodes Brotli when an
    # optional Brotli library is installed, and this notebook does not install one.
    # Asking for it anyway can hand undecoded bytes to the parser.
    'Accept-Encoding': 'gzip, deflate',
    'Connection': 'keep-alive',
    'Referer': 'https://www.google.com/',
    'DNT': '1',  # Do Not Track Request Header
    'Upgrade-Insecure-Requests': '1',
    'Cache-Control': 'max-age=0',
}

# Stays None when the request fails, so the parsing step can be skipped
# instead of crashing with a NameError on an unbound variable.
html_content = None
try:
    response = requests.get(URL, headers=headers, timeout=10) # timeout in seconds
    response.raise_for_status() # Raises an HTTPError for bad responses (4XX or 5XX)
except requests.exceptions.RequestException as e:
    print(f"Error fetching URL {URL}: {e}")
else:
    html_content = response.content # Use .content for binary response (bytes), .text for text

# --- Step 2: Parse the HTML ---
# Common parsers: 'lxml' (fast, needs pip install lxml), 'html.parser' (Python built-in, slower)
soup = None
if html_content is not None:
    soup = BeautifulSoup(html_content, 'lxml')
    # soup = BeautifulSoup(html_content, 'html.parser') # Alternative

    # --- Step 3: (Optional) Prettify to inspect structure (for debugging) ---
    print(soup.prettify())

<!DOCTYPE html>
<html>
 <head>
  <title>
   Example Domain
  </title>
  <meta charset="utf-8"/>
  <meta content="text/html; charset=utf-8" http-equiv="Content-type"/>
  <meta content="width=device-width, initial-scale=1" name="viewport"/>
  <style type="text/css">
   body {
        background-color: #f0f0f2;
        margin: 0;
        padding: 0;
        font-family: -apple-system, system-ui, BlinkMacSystemFont, "Segoe UI", "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;
        
    }
    div {
        width: 600px;
        margin: 5em auto;
        padding: 2em;
        background-color: #fdfdff;
        border-radius: 0.5em;
        box-shadow: 2px 3px 7px 2px rgba(0,0,0,0.02);
    }
    a:link, a:visited {
        color: #38488f;
        text-decoration: none;
    }
    @media (max-width: 700px) {
        div {
            margin: 0 auto;
            width: auto;
        }
    }
  </style>
 </head>
 <body>
  <div>
   <h1>
    Example Domain
   </h1>
   <p>
    This dom

## finding elements

In [7]:
# Run the three lookups first, then report what was found.
# find() gives the first matching tag, find_all() a list-like ResultSet of
# every match, and soup.title is shorthand for the page's <title> tag.
first_p_tag = soup.find('p')
all_a_tags = soup.find_all('a')
page_title = soup.title

# Show the first paragraph's text when the page has a <p>
if first_p_tag is not None:
    print(f"First <p>: {first_p_tag.text}")

# To list every link with its text and href attribute:
# for a_tag in all_a_tags:
#     print(f"Link text: {a_tag.text}, Href: {a_tag.get('href')}")

# .name is the tag's name ('title'); .string is the text inside it
if page_title is not None:
    print(f"Page Title: {page_title.name} - {page_title.string}")

First <p>: This domain is for use in illustrative examples in documents. You may use this
    domain in literature without prior coordination or asking for permission.
Page Title: title - Example Domain


## using css selectors
`select()` and `select_one()`

In [14]:
# Reference sheet of CSS selector patterns usable with BeautifulSoup.
# select() collects every match; select_one() gives only the first.
# The class names, IDs and URLs below are placeholders -- swap in the ones
# used by the page you are scraping.

# --- Select by Tag ---
all_paragraphs_css = soup.select('p')

# --- Select by Class ---
elements_with_class_css = soup.select('.someClassName') # Note the leading dot for class

# --- Select by ID ---
element_with_id_css = soup.select_one('#uniqueElementId') # Note the leading hash for ID

# --- Select by Attribute ---
# Tag that has the attribute at all (any value)
links_with_href = soup.select('a[href]')

# Tag whose attribute equals a specific value
specific_link = soup.select_one('a[href="http://example.com"]')

# Tag whose attribute starts with a value (^=)
links_starting_with_https = soup.select('a[href^="https://"]')

# Tag whose attribute ends with a value ($=)
image_files = soup.select('img[src$=".jpg"]')

# Tag whose attribute contains a value anywhere (*=)
links_containing_search = soup.select('a[href*="search"]')

# --- Descendant Combinator (space) ---
# Selects <a> tags that are descendants (at any depth) of a <div> with class 'container'
links_in_container = soup.select('div.container a')

# --- Child Combinator (>) ---
# Selects <p> tags that are direct children of a <div> with ID 'content'
paragraphs_in_content_div = soup.select('div#content > p')

# --- Adjacent Sibling Combinator (+) ---
# Selects the <p> immediately following an <h2>
p_after_h2 = soup.select_one('h2 + p')

# --- General Sibling Combinator (~) ---
# Selects all <p> siblings that follow an <h2>
all_p_siblings_after_h2 = soup.select('h2 ~ p')

# --- Multiple Selectors (comma separated) ---
# Selects all <h2> tags AND all <p> tags with class 'important'
headings_and_important_text = soup.select('h2, p.important')

# --- Example usage with select_one ---
# Check the result before using it, since nothing may have matched.
first_article_title = soup.select_one('article.post h1.title')
# if first_article_title:
#     print(f"First article title: {first_article_title.text.strip()}")

## extracting data from elements
Once you have a specific HTML tag (e.g., from `soup.find(...)` or `soup.select_one(...)`), here's how to get information out of it.

In [None]:
# 'element' is a Tag object returned by find() or select_one(), or None when
# nothing matched. The first link on the page is used here so the cell runs
# on its own; narrow it down as needed, e.g.:
#   element = soup.find('a', class_='my-link')
element = soup.find('a')

if element is None:
    print("No matching element found - nothing to extract.")
else:
    # --- Get Text Content ---
    # .text: Returns all text within the tag, including children. Surrounding
    #        whitespace is kept as-is, so call .strip() on it yourself.
    # .string: If a tag has only one child and it's a NavigableString, .string gives that string.
    #          Otherwise, it's None. Safer to use .text or .get_text().
    # .get_text(): More robust. Can specify a separator for text from different tags.
    #              .get_text(strip=True) removes whitespace from start/end of each string segment
    #              before joining.
    print(f"Text: {element.text.strip()}") # .strip() is common to remove leading/trailing whitespace
    print(f"Text (get_text, strip=True): {element.get_text(strip=True)}")
    print(f"Text (get_text with separator): {element.get_text(separator=' | ', strip=True)}")

    # --- Get Attribute Values ---
    # Use dictionary-like access or .get() method
    if element.name == 'a': # Check if it's an 'a' tag for 'href'
        # element['href'] raises KeyError if 'href' doesn't exist, so only
        # index directly once the attribute is known to be present.
        link_url = element['href'] if element.has_attr('href') else None
        link_url_safe = element.get('href') # Returns None if 'href' doesn't exist
        data_id = element.get('data-id')

        print(f"Href (direct): {link_url}")
        print(f"Href (safe .get()): {link_url_safe}")
        print(f"Data-ID: {data_id}")

    # --- Get Tag Name ---
    print(f"Tag Name: {element.name}")

    # --- Get All Attributes as a Dictionary ---
    print(f"All attributes: {element.attrs}")

## scraping example

In [None]:
# Self-contained demo: parse an inline HTML document, then pull out the page
# title, the "sister" links, and a title/preview pair for each article.
# (Swap the inline document for a fetched page if needed.)
sample_html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<div class="article">
  <h2>Article 1</h2>
  <p>Content for article 1. <a href="/article1-details">Read more...</a></p>
</div>
<div class="article">
  <h2>Article 2</h2>
  <p>Content for article 2. <a href="/article2-details">Read more...</a></p>
</div>

<p class="story">...</p>
</body>
</html>
"""
soup_sample = BeautifulSoup(sample_html_doc, 'lxml')

# With actual url
# soup_sample = BeautifulSoup(response.content, 'lxml')


def _text_or_na(tag):
    """Return the tag's text with whitespace stripped, or "N/A" when the tag is missing."""
    if tag is None:
        return "N/A"
    # NOTE: strip=True joins the text fragments with no separator, so the link
    # text runs straight on from the sentence ("...article 1.Read more...").
    # Pass separator=' ' as well if a space between fragments is wanted.
    return tag.get_text(strip=True)


# --- Page title ---
title = soup_sample.title.string
print(f"Page Title: {title}")

# --- One {'name', 'link'} record per <a class="sister"> (CSS selector) ---
sister_tags = soup_sample.select('a.sister')
sisters_data = [
    {'name': anchor.get_text(strip=True), 'link': anchor.get('href')}
    for anchor in sister_tags
]
print(f"Sisters: {sisters_data}")

# --- One record per <div class="article">: its <h2> text and first <p> text ---
# select_one() is called on each div, so it only searches inside that article.
article_divs = soup_sample.select('div.article')
articles_data = [
    {
        'title': _text_or_na(div.select_one('h2')),
        'content_preview': _text_or_na(div.select_one('p')),
    }
    for div in article_divs
]
print(f"Articles: {articles_data}")

Page Title: The Dormouse's story
Sisters: [{'name': 'Elsie', 'link': 'http://example.com/elsie'}, {'name': 'Lacie', 'link': 'http://example.com/lacie'}, {'name': 'Tillie', 'link': 'http://example.com/tillie'}]
Articles: [{'title': 'Article 1', 'content_preview': 'Content for article 1.Read more...'}, {'title': 'Article 2', 'content_preview': 'Content for article 2.Read more...'}]


### if pagination is needed

In [None]:
# Walk pages 1-3 of a paginated listing, fetching and parsing one page at a time.
for page in range(1, 4):
    url = f"https://example.com/news?page={page}"
    try:
        # timeout keeps a stalled server from hanging the loop;
        # raise_for_status stops an error page from being parsed as real content.
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL {url}: {e}")
        break  # a failed page usually means we ran past the last one (or the site is down)

    # Pass the raw bytes (.content), as in the basics section, so BeautifulSoup
    # can work out the encoding from the document itself.
    soup = BeautifulSoup(response.content, 'lxml')
    # then same as above...

## exercise one

In [None]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

# Scrape the NYT combined print & e-book fiction best-seller list into a
# DataFrame with one row per book (Title, Author).

# webpage url
url = "https://www.nytimes.com/books/best-sellers/combined-print-and-e-book-fiction/"

# send req to the url
# timeout: don't hang forever; raise_for_status: fail loudly on 4XX/5XX
# instead of silently scraping an error page into an empty table.
response = requests.get(url, timeout=10)
response.raise_for_status()

# converting response to BS object
# Name the parser explicitly so the result doesn't depend on which parsers
# happen to be installed.
content = BeautifulSoup(response.content, 'lxml')

# find every book entry: <li> elements with class 'css-sggj6j'
# NOTE: these 'css-...' class names are auto-generated by the site and can
# change at any time; re-inspect the page if nothing is found.
books = content.find_all('li', {"class": "css-sggj6j"})
if not books:
    print("No books found - the page layout or class names may have changed.")

# loop through each book and extract details
book_list = []
for book in books:

    # book title (find() returns None when the tag is missing)
    title_tag = book.find('h3', {"class": "css-2jegzb"})
    title = title_tag.get_text(strip=True) if title_tag is not None else "N/A"

    # book author (text as shown on the page, including the leading "by")
    author_tag = book.find('p', {"class": "css-1aaqvca"})
    author = author_tag.get_text(strip=True) if author_tag is not None else "N/A"

    data = {
        'Title': title,
        'Author': author,
    }

    book_list.append(data)

book_list_df = pd.DataFrame(book_list)
print(book_list_df)

# Save to CSV
# book_list_df.to_csv('best_selling_books.csv', index=False)

                          Title                 Author
0        A CURSE CARVED IN BONE  by Danielle L. Jensen
1                    THE TENANT     by Freida McFadden
2       THE EMPEROR OF GLADNESS         by Ocean Vuong
3      GREAT BIG BEAUTIFUL LIFE         by Emily Henry
4              CAN'T GET ENOUGH        by Kennedy Ryan
5             ONE GOLDEN SUMMER      by Carley Fortune
6                    THE DEVILS     by Joe Abercrombie
7                   FEVER BEACH        by Carl Hiaasen
8   REMARKABLY BRIGHT CREATURES     by Shelby Van Pelt
9            SHIELD OF SPARROWS        by Devney Perry
10                   MY FRIENDS     by Fredrik Backman
11                   ONYX STORM      by Rebecca Yarros
12          MARBLE HALL MURDERS    by Anthony Horowitz
13                  FOURTH WING      by Rebecca Yarros
14          I HOPE YOU REMEMBER         by Josie Balka


## exercise two

In [None]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

# Scrape a Courts product listing into a DataFrame with one row per product
# (Name, Original Price, Current Price).
url = "https://www.courts.com.sg/furniture/furniture/study-desks"

# timeout: don't hang forever; raise_for_status: fail loudly on 4XX/5XX
# instead of silently scraping an error page into an empty table.
response = requests.get(url, timeout=10)
response.raise_for_status()

# Converting response to BS object
# Name the parser explicitly so the result doesn't depend on which parsers
# happen to be installed.
content = BeautifulSoup(response.content, 'lxml')

products = content.find_all('li', {"class": "item product product-item"})
if not products:
    print("No products found - the page layout or class names may have changed.")


def _price_text(container):
    """Return the text of the first <span class="price"> inside container, or "" if there is none."""
    if container is None:
        return ""
    price_tag = container.find('span', {"class": "price"})
    return price_tag.get_text(strip=True) if price_tag is not None else ""


product_list = []
for product in products:

    # Product name (find() returns None when the tag is missing)
    name_tag = product.find('h3', {"class": "product name product-item-name"})
    product_name = name_tag.get_text(strip=True) if name_tag is not None else "N/A"

    # Current product price: use the special (discounted) price when there is
    # one, otherwise fall back to the first price shown for the product.
    current_price_content = product.find('span', {"class": "special-price"})
    if current_price_content is not None:
        current_price = _price_text(current_price_content)
    else:
        current_price = _price_text(product)

    # Original product price: only present for discounted items, else ""
    og_price_content = product.find('span', {"class": "old-price"})
    og_price = _price_text(og_price_content)

    data = {
        'Name': product_name,
        'Original Price': og_price,
        'Current Price': current_price,
    }

    product_list.append(data)

# index=False belongs to the to_csv()/to_excel() writers below, not to the
# DataFrame constructor, where it raises an error.
product_list_df = pd.DataFrame(product_list)
print(product_list_df)

# NOTE: Other saving options
# Save to Excel
# product_list_df.to_excel('study_desks.xlsx', index=False, engine='openpyxl')
# Save to CSV
# product_list_df.to_csv('study_desks.csv', index=False)
# Save to JSON
# product_list_df.to_json('study_desks.json', orient='records', lines=True)
# Save to txt
# product_list_df.to_csv('study_desks.txt', index=False, sep='\t') # Tab-separated

                                                 Name Original Price  \
0                INDEX AGENT OFFICE CHAIR (HIGH BACK)       S$359.00   
1   COOLERMASTER CMI-GCR2C-GY CALIBER R2C GAMING C...       S$499.00   
2   COOLERMASTER CMI-GCX1C-GY CALIBER X1C GAMING C...       S$599.00   
3           ORTHO BACK BACK SUPPORT - ASSORTED COLOUR                  
4                             JOURNALIST OFFICE CHAIR       S$179.00   
5                       INDEX TEMPO MESH OFFICE CHAIR       S$189.00   
6                            MORGEN MESH CHAIR (BLUE)       S$199.00   
7                           MORGEN MESH CHAIR (BLACK)       S$199.00   
8                    HEALING ORTHO BACK FOLDING CHAIR                  
9        CURVO HOME OFFICE CHAIR BLACK MID BACK CHAIR                  
10        CURVO HOME OFFICE CHAIR GREY MID BACK CHAIR                  
11             INDEX EXECUTIVE HIGH BACK OFFICE CHAIR       S$289.00   
12                 MOLLER HI BACK DIRECTOR MESH CHAIR           