In [2]:
import requests
from lxml import html

# Fetch a web page
url = "https://example.com"
response = requests.get(url)

# Check if the request was successful
print(f"Status code: {response.status_code}")
print(f"Content length: {len(response.content)} bytes")

# The HTML content is in response.text
print("\nFirst 500 characters of HTML:")
print(response.text[:500])

Status code: 200
Content length: 513 bytes

First 500 characters of HTML:
<!doctype html><html lang="en"><head><title>Example Domain</title><meta name="viewport" content="width=device-width, initial-scale=1"><style>body{background:#eee;width:60vw;margin:15vh auto;font-family:system-ui,sans-serif}h1{font-size:1.5em}div{opacity:0.8}a:link,a:visited{color:#348}</style><body><div><h1>Example Domain</h1><p>This domain is for use in documentation examples without needing permission. Avoid use in operations.<p><a href="https://iana.org/domains/example">Learn more</a></div></


In [None]:
# Parse the HTML content
tree = html.fromstring(response.text)

# Extract the title
title = tree.xpath('//title/text()')
print(f"Page title: {title[0] if title else 'Not found'}")

# Extract all paragraph text
paragraphs = tree.xpath('//p/text()')
print(f"\nParagraphs found: {len(paragraphs)}")
for i, para in enumerate(paragraphs, 1):
    print(f"{i}. {para}")

# Extract the heading
heading = tree.xpath('//h1/text()')
print(f"\nHeading: {heading[0] if heading else 'Not found'}")

Page title: Example Domain

Paragraphs found: 1
1. This domain is for use in documentation examples without needing permission. Avoid use in operations.

Heading: Example Domain


In [6]:
import requests
from lxml import html

# 1.1. fetch the HTML from example.com using requests.get().
url = "https://example.com"
response = requests.get(url)

# 1.2. Check the status code to verify the request was successful.
if response.status_code == 200:
    print(f"Status code: {response.status_code}")
    
    # 1.3. Parse the HTML using lxml.html.fromstring()
    tree = html.fromstring(response.text)

    # 1.4. Extract and display the page title
    page_title = tree.xpath('//title/text()')
    title_text = page_title[0] if page_title else 'not found'
    print(f"Page Title: {title_text}")

    # 1.5. Extract and display all paragraph text and the <h1> heading text.
    heading_h1 = tree.xpath('//h1/text()')
    h1_text = heading_h1[0] if heading_h1 else 'Not found'
    print(f"H1 Heading: {h1_text}")

    paragraphs = tree.xpath('//p/text()')
    for i, p_text in enumerate(paragraphs, 1):
        print(f"Paragraph {i}: {p_text.strip()}")

else:
    print(f"Error status code: {response.status_code}")

Status code: 200
Page Title: Example Domain
H1 Heading: Example Domain
Paragraph 1: This domain is for use in documentation examples without needing permission. Avoid use in operations.


In [7]:
import requests
from lxml import html

url = "https://quotes.toscrape.com"
response = requests.get(url)
tree = html.fromstring(response.text)

# Find all quote containers (divs with class="quote")
# You can find this by inspecting the page!
quote_containers = tree.xpath('//div[@class="quote"]')

print(f"Found {len(quote_containers)} quote containers\n")

# Extract data from each container
for i, container in enumerate(quote_containers[:3], 1):  # First 3 quotes
    # Extract quote text from within this container
    quote_text = container.xpath('.//span[@class="text"]/text()')[0]

    # Extract author from within this container
    author = container.xpath('.//small[@class="author"]/text()')[0]

    # Extract tags from within this container
    tags = container.xpath('.//a[@class="tag"]/text()')

    print(f"Quote {i}:")
    print(f"  Text: {quote_text}")
    print(f"  Author: {author}")
    print(f"  Tags: {', '.join(tags)}")
    print()

Found 10 quote containers

Quote 1:
  Text: “The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”
  Author: Albert Einstein
  Tags: change, deep-thoughts, thinking, world

Quote 2:
  Text: “It is our choices, Harry, that show what we truly are, far more than our abilities.”
  Author: J.K. Rowling
  Tags: abilities, choices

Quote 3:
  Text: “There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”
  Author: Albert Einstein
  Tags: inspirational, life, live, miracle, miracles



In [8]:
import requests
from lxml import html

# 2.1 & 2.2: Identify the HTML structure of quote containers using inspect element.
url = "https://quotes.toscrape.com"
response = requests.get(url)

tree = html.fromstring(response.text)

# 2.3: Write code to extract quote text, author, and tags for each quote.
quote_containers = tree.xpath('//div[@class="quote"]')

print(f"Found {len(quote_containers)}  quote containers\n")

for i, container in enumerate(quote_containers, 1):
    text = container.xpath('.//span[@class="text"]/text()')[0]
    author = container.xpath('.//small[@class="author"]/text()')[0]
    tags = container.xpath('.//a[@class="tag"]/text()')

    print(f"Quote {i}:")
    print(f"  Text: {text}")
    print(f"  Author: {author}")
    print(f"  Tags: {', '.join(tags)}")
    print("-" * 30)

# 2.4: Extract the "Next" link’s href attribute (if it exists).
next_link = tree.xpath('//li[@class="next"]/a/@href')

if next_link:
    print(f"\nNext page found: {next_link[0]}")
else:
    print("\nNo 'Next' link found on this page.")

Found 10  quote containers

Quote 1:
  Text: “The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”
  Author: Albert Einstein
  Tags: change, deep-thoughts, thinking, world
------------------------------
Quote 2:
  Text: “It is our choices, Harry, that show what we truly are, far more than our abilities.”
  Author: J.K. Rowling
  Tags: abilities, choices
------------------------------
Quote 3:
  Text: “There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”
  Author: Albert Einstein
  Tags: inspirational, life, live, miracle, miracles
------------------------------
Quote 4:
  Text: “The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.”
  Author: Jane Austen
  Tags: aliteracy, books, classic, humor
------------------------------
Quote 5:
  Text: “Imperfection is beauty, madness is genius and it's better 

2.5 I used inspect -> Elements to see the html files.

In [9]:
import requests
from lxml import html

url = "https://the-examples-book.com"
response = requests.get(url)

# Always check the status code first
if response.status_code == 200:
    print("Successfully fetched the page!")
    tree = html.fromstring(response.text)

    # Use inspect element to find what you want to extract
    # For example, let's find all links on the page
    links = tree.xpath('//a/@href')
    print(f"\nFound {len(links)} links on the page")

    # Show first 10 links
    for i, link in enumerate(links[:10], 1):
        print(f"{i}. {link}")
else:
    print(f"Error: Status code {response.status_code}")

Successfully fetched the page!

Found 68 links on the page
1. https://the-examples-book.com
2. crp/
3. meeting-etiquette/
4. personal/
5. projects/
6. seminar-ta/home
7. internal/main/introduction
8. workshops/
9. ./
10. tools/


In [10]:
import requests
from lxml import html

# 3.1 & 3.2:  Identify at least 3 different types of elements you want to extract.

url = "https://the-examples-book.com"
response = requests.get(url)

# 3.3: Write code to fetch and parse the page.
if response.status_code == 200:
    print("Successfully fetched the page!")
    tree = html.fromstring(response.text)
    # 3.4 & 3.5: Extract the information you identified and Display the results in a readable format.
    headings = tree.xpath('//h1/text() | //h2/text()')
    print(f"Found {len(headings)} heading")
    for i, head in enumerate(headings[:5], 1):
        print(f"Heading {i}: {head.strip()}")

    links = tree.xpath('//a/@href')
    print(f"Found {len(links)} link")
    for i, link in enumerate(links[:10], 1):
        print(f"Link {i}: {link}")

    paragraphs = tree.xpath('//p/text()')
    print(f"Found {len(paragraphs)}paragraphs")
    for i, para in enumerate(paragraphs[:3], 1): 
        if para.strip():
            print(f"{i}:{para.strip()[:100]}") 

else:
    print(f"Error: Status code {response.status_code}")

Successfully fetched the page!
Found 6 heading
Heading 1: The Examples Book
Heading 2: Overview
Heading 3: Core Topics
Heading 4: Course Links
Heading 5: Grant Awarded Programs
Found 68 link
Link 1: https://the-examples-book.com
Link 2: crp/
Link 3: meeting-etiquette/
Link 4: personal/
Link 5: projects/
Link 6: seminar-ta/home
Link 7: internal/main/introduction
Link 8: workshops/
Link 9: ./
Link 10: tools/
Found 28paragraphs
1:Welcome to The All New Examples Book! This book contains a collection of information and examples th
2:seminar at
3:. The Examples Book is open to anyone. Even if you aren’t a student at Purdue we hope you find helpf


In [12]:
import requests
from lxml import html
import time
from urllib.parse import urljoin

base_url = "https://quotes.toscrape.com"
current_url = base_url
all_quotes = []
page_num = 1

while True:
    print(f"Scraping page {page_num}...")
    response = requests.get(current_url)
    tree = html.fromstring(response.text)

    # Extract quotes from current page
    quote_containers = tree.xpath('//div[@class="quote"]')
    for container in quote_containers:
        text = container.xpath('.//span[@class="text"]/text()')[0]
        author = container.xpath('.//small[@class="author"]/text()')[0]
        tags = container.xpath('.//a[@class="tag"]/text()')
        all_quotes.append({"text": text, "author": author, "tags": tags})

    # Check for "Next" button using inspect element to find the selector
    next_button = tree.xpath('//li[@class="next"]/a/@href')

    if next_button:
        # Construct full URL (handling relative URLs)
        next_url = next_button[0]
        # Use urljoin to properly combine URLs
        current_url = urljoin(base_url, next_url)
        page_num += 1
        time.sleep(1)  # Be polite - wait between requests
    else:
        print("No more pages!")
        break

print(f"\nTotal quotes scraped: {len(all_quotes)}")
print(f"\nFirst 3 quotes:")
for i, quote in enumerate(all_quotes[:3], 1):
    print(f"{i}. {quote['text']} - {quote['author']} - Tags: {', '.join(quote['tags'])}")

Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Scraping page 6...
Scraping page 7...
Scraping page 8...
Scraping page 9...
Scraping page 10...
No more pages!

Total quotes scraped: 100

First 3 quotes:
1. “The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.” - Albert Einstein - Tags: change, deep-thoughts, thinking, world
2. “It is our choices, Harry, that show what we truly are, far more than our abilities.” - J.K. Rowling - Tags: abilities, choices
3. “There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.” - Albert Einstein - Tags: inspirational, life, live, miracle, miracles


In [13]:
import requests
from lxml import html
import time
from urllib.parse import urljoin

base_url = "https://quotes.toscrape.com"
current_url = base_url
all_quotes = []
page_num = 1

while True:
    print(f"Scraping page {page_num}...")
    response = requests.get(current_url)
    tree = html.fromstring(response.text)

    # 4.3. Extract quote text, author, and tags from each page.
    quote_containers = tree.xpath('//div[@class="quote"]')
    for container in quote_containers:
        text = container.xpath('.//span[@class="text"]/text()')[0]
        author = container.xpath('.//small[@class="author"]/text()')[0]
        tags = container.xpath('.//a[@class="tag"]/text()')
        
        # 4.4. Store all data in a structured format (list of dictionaries).
        all_quotes.append({"text": text, "author": author, "tags": tags})

    # 4.1. Write code to scrape all pages from quotes.toscrape.com
    next_button = tree.xpath('//li[@class="next"]/a/@href')
    # 4.2. Handle pagination by following "Next" links (use inspect element to find the selector).
    if next_button:
        next_url = next_button[0]
        current_url = urljoin(base_url, next_url)
        page_num += 1
        time.sleep(1) 
    else:
        print("No more pages!")
        break

# 4.5. Print the total number of quotes scraped and display a few examples.
print(f"\nTotal quotes scraped: {len(all_quotes)}")
print(f"\nFirst 3 quotes:")
for i, quote in enumerate(all_quotes[:3], 1):
    print(f"{i}. {quote['text']} - {quote['author']} - Tags: {', '.join(quote['tags'])}")

Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Scraping page 6...
Scraping page 7...
Scraping page 8...
Scraping page 9...
Scraping page 10...
No more pages!

Total quotes scraped: 100

First 3 quotes:
1. “The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.” - Albert Einstein - Tags: change, deep-thoughts, thinking, world
2. “It is our choices, Harry, that show what we truly are, far more than our abilities.” - J.K. Rowling - Tags: abilities, choices
3. “There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.” - Albert Einstein - Tags: inspirational, life, live, miracle, miracles


In [14]:
import requests
from lxml import html
from urllib.parse import urljoin

url = "https://quotes.toscrape.com"
response = requests.get(url)
tree = html.fromstring(response.text)

# Extract author profile links
author_link_elements = tree.xpath('//a[contains(@href, "/author/")]')
author_dict = {}

for elem in author_link_elements:
    author_name = elem.text
    author_href = elem.get('href')
    # Convert relative URL to absolute
    full_url = urljoin(url, author_href)
    author_dict[author_name] = full_url

print("Author name to profile URL mapping:")
for name, profile_url in list(author_dict.items())[:5]:
    print(f"{name}: {profile_url}")

# Extract tag links
tag_elements = tree.xpath('//a[@class="tag"]')
print(f"\nFound {len(tag_elements)} tag links")
print("\nFirst 10 tags:")
for i, tag_elem in enumerate(tag_elements[:10], 1):
    tag_text = tag_elem.text
    tag_href = tag_elem.get('href')
    tag_url = urljoin(url, tag_href)
    print(f"{i}. {tag_text} -> {tag_url}")

Author name to profile URL mapping:
(about): https://quotes.toscrape.com/author/Steve-Martin

Found 40 tag links

First 10 tags:
1. change -> https://quotes.toscrape.com/tag/change/page/1/
2. deep-thoughts -> https://quotes.toscrape.com/tag/deep-thoughts/page/1/
3. thinking -> https://quotes.toscrape.com/tag/thinking/page/1/
4. world -> https://quotes.toscrape.com/tag/world/page/1/
5. abilities -> https://quotes.toscrape.com/tag/abilities/page/1/
6. choices -> https://quotes.toscrape.com/tag/choices/page/1/
7. inspirational -> https://quotes.toscrape.com/tag/inspirational/page/1/
8. life -> https://quotes.toscrape.com/tag/life/page/1/
9. live -> https://quotes.toscrape.com/tag/live/page/1/
10. miracle -> https://quotes.toscrape.com/tag/miracle/page/1/


In [16]:
import requests
from lxml import html
from urllib.parse import urljoin

url = "https://quotes.toscrape.com"
response = requests.get(url)
tree = html.fromstring(response.text)

# 5.1. Extract all author profile links from the page.
author_link_elements = tree.xpath('//a[contains(@href, "/author/")]')
# 5.3. Create a dictionary mapping author names to their profile URLs (use absolute URLs).
author_dict = {}

for elem in author_link_elements:
    author_name = elem.text.strip()
    author_href = elem.get('href')
    full_url = urljoin(url, author_href)
    author_dict[author_name] = full_url

# 5.2. Extract tag names and their corresponding links.
tag_elements = tree.xpath('//a[@class="tag"]')
tag_data = []
for tag_elem in tag_elements:
    name = tag_elem.text
    link = urljoin(url, tag_elem.get('href'))
    tag_data.append((name, link))

# 5.4. Create a list of all unique tags.
unique_tags = sorted(list(set([tag[0] for tag in tag_data])))

# 5.5. Display the results in a clear format.
print("--- Author Profile Mapping (First 5) ---")
for name, profile_url in list(author_dict.items())[:5]:
    print(f"{name}: {profile_url}")

print(f"\n--- Unique Tags Found ({len(unique_tags)}) ---")
print(", ".join(unique_tags))

print("\n--- Tag Links (First 10) ---")
for i, (name, link) in enumerate(tag_data[:10], 1):
    print(f"{i}. {name} -> {link}")

--- Author Profile Mapping (First 5) ---
(about): https://quotes.toscrape.com/author/Steve-Martin

--- Unique Tags Found (30) ---
abilities, adulthood, aliteracy, be-yourself, books, change, choices, classic, deep-thoughts, edison, failure, friends, friendship, humor, inspirational, life, live, love, miracle, miracles, misattributed-eleanor-roosevelt, obvious, paraphrased, reading, simile, success, thinking, truth, value, world

--- Tag Links (First 10) ---
1. change -> https://quotes.toscrape.com/tag/change/page/1/
2. deep-thoughts -> https://quotes.toscrape.com/tag/deep-thoughts/page/1/
3. thinking -> https://quotes.toscrape.com/tag/thinking/page/1/
4. world -> https://quotes.toscrape.com/tag/world/page/1/
5. abilities -> https://quotes.toscrape.com/tag/abilities/page/1/
6. choices -> https://quotes.toscrape.com/tag/choices/page/1/
7. inspirational -> https://quotes.toscrape.com/tag/inspirational/page/1/
8. life -> https://quotes.toscrape.com/tag/life/page/1/
9. live -> https://quote