In [1]:
from lxml import html

# A simple HTML string
html_string = """
<html>
  <head>
    <title>My First Web Page</title>
  </head>
  <body>
    <h1>Welcome to Web Scraping</h1>
    <p>This is a paragraph.</p>
    <p>This is another paragraph.</p>
  </body>
</html>
"""

# Parse the HTML string into a tree structure
tree = html.fromstring(html_string)

# Now we can extract elements from the tree
print("HTML parsed successfully!")
print(f"Type of tree: {type(tree)}")

HTML parsed successfully!
Type of tree: <class 'lxml.html.HtmlElement'>


In [2]:
# Extract the title (it's inside a <title> tag)
title = tree.xpath('//title/text()')
print(f"Page title: {title[0] if title else 'Not found'}")

# Extract all paragraph text
paragraphs = tree.xpath('//p/text()')
print(f"\nFound {len(paragraphs)} paragraphs:")
for i, para in enumerate(paragraphs, 1):
    print(f"{i}. {para}")

# Extract the heading
heading = tree.xpath('//h1/text()')
print(f"\nHeading: {heading[0] if heading else 'Not found'}")

Page title: My First Web Page

Found 2 paragraphs:
1. This is a paragraph.
2. This is another paragraph.

Heading: Welcome to Web Scraping


In [9]:
# 1.1
my_html_string = """
<html>
  <head>
    <title>Porject 1 practice</title>
  </head>
  <body>
    <h1>This is a title</h1>
    <p>This is first paragraph.</p>
    <p>This is second paragraph.</p>
  </body>
</html>
"""
# 1.2 parse html
tree_q1 = html.fromstring(my_html_string)

#1.3 Extract page title
title_q1 = tree_q1.xpath('//title/text()')
print(f'Title page : {title_q1[0] if title_q1 else "Not found"}')

#1.4 Extract all paragraph
paragraph_q1 = tree_q1.xpath('//p/text()')
print(f'paragraph text: ')
for i, para in enumerate(paragraph_q1,1):
    print(i, para)

#1.5 Extract heading
heading_q1 = tree_q1.xpath('//h1/text()')
print(f'Heading : {heading_q1[0] if heading_q1 else "Not found"}')

Title page : Porject 1 practice
paragraph text: 
1 This is first paragraph.
2 This is second paragraph.
Heading : This is a title


In [10]:
from lxml import html

# HTML with more structure
html_string = """
<html>
  <body>
    <div class="quote">
      <span class="text">The only way to do great work is to love what you do.</span>
      <small class="author">Steve Jobs</small>
    </div>
    <div class="quote">
      <span class="text">Innovation distinguishes between a leader and a follower.</span>
      <small class="author">Steve Jobs</small>
    </div>
    <div class="quote">
      <span class="text">Stay hungry, stay foolish.</span>
      <small class="author">Steve Jobs</small>
    </div>
  </body>
</html>
"""

tree = html.fromstring(html_string)

# Different XPath patterns:
# //tag - Find all 'tag' elements anywhere in the document
# //tag[@attribute='value'] - Find elements with specific attribute value
# //tag/text() - Get text content of elements

# Find all quotes (they're in <span> tags with class="text")
quotes = tree.xpath('//span[@class="text"]/text()')
print(f"Found {len(quotes)} quotes:")
for i, quote in enumerate(quotes, 1):
    print(f"{i}. {quote}")

# Find all authors (they're in <small> tags with class="author")
authors = tree.xpath('//small[@class="author"]/text()')
print(f"\nFound {len(authors)} authors:")
for i, author in enumerate(authors, 1):
    print(f"{i}. {author}")

Found 3 quotes:
1. The only way to do great work is to love what you do.
2. Innovation distinguishes between a leader and a follower.
3. Stay hungry, stay foolish.

Found 3 authors:
1. Steve Jobs
2. Steve Jobs
3. Steve Jobs


In [11]:
# Using CSS selectors (alternative to XPath)
# CSS selector syntax:
# tag - Select all 'tag' elements
# .classname - Select elements with class="classname"
# #id - Select element with id="id"
# tag.classname - Select 'tag' elements with class="classname"

quotes_css = tree.cssselect('span.text')
print(f"Found {len(quotes_css)} quotes using CSS selector:")
for i, quote_elem in enumerate(quotes_css, 1):
    print(f"{i}. {quote_elem.text_content()}")

Found 3 quotes using CSS selector:
1. The only way to do great work is to love what you do.
2. Innovation distinguishes between a leader and a follower.
3. Stay hungry, stay foolish.


In [31]:
#2.1
html_string_q2 = """
<html>
  <body>
    <div class="quote">
      <span class="text">Those who seek death shall live, those who seek life shall die.</span>
      <small class="author">Yi Sun-sin</small>
    </div>
    <div class="quote">
      <span class="text">Be yourself; everyone else is already taken.</span>
      <small class="author">Oscar Wilde</small>
    </div>
    <div class="quote">
      <span class="text">Two things are infinite: the universe and human stupidity; and I'm not sure about the universe.</span>
      <small class="author">Albert Einstein</small>
    </div>
  </body>
</html>
"""

tree_q2 = html.fromstring(html_string_q2)

#2.2 Extract all qote texts
quotes_q2 = tree_q2.xpath('//span[@class="text"]/text()')
print(f"Quote texts : ")
for i, quote in enumerate(quotes_q2, 1):
    print(i, quote)

#2.3 Extract all author names
names_q2 = tree_q2.xpath('//small[@class="author"]/text()')
print("\nAuthor names: ")
for i, name in enumerate(names_q2, 1):
    print(i, name)

#2.4 Extract quotes using css selectors
quotes_css = tree_q2.cssselect('span.text')
print(f"\nQuotes using css selector:")
for i, quote in enumerate(quotes_css, 1):
    print(f"{i} {quote.text_content()}")

#2.5 Display pair of quote and author
print('\n')
for q, a in zip(quotes_q2, names_q2):
    print(f'{q} - {a}')

Quote texts : 
1 Those who seek death shall live, those who seek life shall die.
2 Be yourself; everyone else is already taken.
3 Two things are infinite: the universe and human stupidity; and I'm not sure about the universe.

Author names: 
1 Yi Sun-sin
2 Oscar Wilde
3 Albert Einstein

Quotes using css selector:
1 Those who seek death shall live, those who seek life shall die.
2 Be yourself; everyone else is already taken.
3 Two things are infinite: the universe and human stupidity; and I'm not sure about the universe.


Those who seek death shall live, those who seek life shall die. - Yi Sun-sin
Be yourself; everyone else is already taken. - Oscar Wilde
Two things are infinite: the universe and human stupidity; and I'm not sure about the universe. - Albert Einstein


In [39]:
from lxml import html

# HTML with nested structure
html_string = """
<html>
  <body>
    <div class="quote">
      <span class="text">The only way to do great work is to love what you do.</span>
      <small class="author">Steve Jobs</small>
      <div class="tags">
        <a class="tag">inspiration</a>
        <a class="tag">work</a>
      </div>
    </div>
    <div class="quote">
      <span class="text">Innovation distinguishes between a leader and a follower.</span>
      <small class="author">Steve Jobs</small>
      <div class="tags">
        <a class="tag">innovation</a>
        <a class="tag">leadership</a>
      </div>
    </div>
  </body>
</html>
"""

tree = html.fromstring(html_string)

# Find all quote containers (divs with class="quote")
quote_containers = tree.xpath('//div[@class="quote"]')

print(f"Found {len(quote_containers)} quote containers\n")

# Extract data from each container
for i, container in enumerate(quote_containers, 1):
    # Extract quote text from within this container
    # Notice the .// - the dot means "starting from the current element"
    quote_text = container.xpath('.//span[@class="text"]/text()')[0]

    # Extract author from within this container
    author = container.xpath('.//small[@class="author"]/text()')[0]

    # Extract tags from within this container
    tags = container.xpath('.//a[@class="tag"]/text()')

    print(f"Quote {i}:")
    print(f"  Text: {quote_text}")
    print(f"  Author: {author}")
    print(f"  Tags: {', '.join(tags)}")
    print()

Found 2 quote containers

Quote 1:
  Text: The only way to do great work is to love what you do.
  Author: Steve Jobs
  Tags: inspiration, work

Quote 2:
  Text: Innovation distinguishes between a leader and a follower.
  Author: Steve Jobs
  Tags: innovation, leadership



In [41]:
#3.1
html_string_q3 = """
<html>
  <body>
    <div class="quote">
      <span class="text">Those who seek death shall live, those who seek life shall die.</span>
      <small class="author">Yi Sun-sin</small>
      <div class="tags">
        <a class="tag">life</a>
        <a class="tag">death</a>
        <a class="tag">war</a>
      </div>
    </div>
    <div class="quote">
      <span class="text">Be yourself; everyone else is already taken.</span>
      <small class="author">Oscar Wilde</small>
      <div class="tags">
        <a class="tag">identity</a>
        <a class="tag">life</a>
        <a class="tag">honesty</a>
      </div>
    </div>
    <div class="quote">
      <span class="text">Two things are infinite: the universe and human stupidity; and I'm not sure about the universe.</span>
      <small class="author">Albert Einstein</small>
      <div class="tags">
        <a class="tag">science</a>
        <a class="tag">humor</a>
        <a class="tag">philosophy</a>
      </div>
    </div>
  </body>
</html>
"""

tree_q3 = html.fromstring(html_string_q3)

#3.2 Find all quote containers
quote_q3 = tree_q3.xpath('//div[@class="quote"]')
print(f"Quote texts : ")
for i, quote in enumerate(quote_q3, 1):
    print(i, quote)


#3.3 extract the quote text, author, tags for each container
for i, container in enumerate(quote_q3, 1):
    # Extract quote text 
    quote_text = container.xpath('.//span[@class="text"]/text()')[0]
    # Extract author
    author = container.xpath('.//small[@class="author"]/text()')[0]
    # Extract tags
    tags = container.xpath('.//a[@class="tag"]/text()')

    #3.4 diplay results
    print(f"Quote {i}:")
    print(f"  Text: {quote_text}")
    print(f"  Author: {author}")
    print(f"  Tags: {', '.join(tags)}")
    print()

#3.5
# .// finds only inside the container where // finds from whole html file

Quote texts : 
1 <Element div at 0x203e9564460>
2 <Element div at 0x203e95c9720>
3 <Element div at 0x203e95c9a90>
Quote 1:
  Text: Those who seek death shall live, those who seek life shall die.
  Author: Yi Sun-sin
  Tags: life, death, war

Quote 2:
  Text: Be yourself; everyone else is already taken.
  Author: Oscar Wilde
  Tags: identity, life, honesty

Quote 3:
  Text: Two things are infinite: the universe and human stupidity; and I'm not sure about the universe.
  Author: Albert Einstein
  Tags: science, humor, philosophy



In [42]:
from lxml import html

# HTML with links
html_string = """
<html>
  <body>
    <a href="/author/steve-jobs">Steve Jobs</a>
    <a href="/author/albert-einstein">Albert Einstein</a>
    <a href="/author/maya-angelou">Maya Angelou</a>
    <a href="/tag/inspiration">Inspiration</a>
    <a href="/tag/work">Work</a>
  </body>
</html>
"""

tree = html.fromstring(html_string)

# Extract href attributes from links
# Method 1: Using XPath with @attribute
author_links = tree.xpath('//a[contains(@href, "/author/")]/@href')
print("Author links (Method 1 - direct attribute extraction):")
for link in author_links:
    print(f"  {link}")

# Method 2: Get the element first, then access attributes
author_link_elements = tree.xpath('//a[contains(@href, "/author/")]')
print("\nAuthor links (Method 2 - get element then attribute):")
for elem in author_link_elements:
    print(f"  Text: {elem.text}")
    print(f"  Href: {elem.get('href')}")
    print()

Author links (Method 1 - direct attribute extraction):
  /author/steve-jobs
  /author/albert-einstein
  /author/maya-angelou

Author links (Method 2 - get element then attribute):
  Text: Steve Jobs
  Href: /author/steve-jobs

  Text: Albert Einstein
  Href: /author/albert-einstein

  Text: Maya Angelou
  Href: /author/maya-angelou



In [43]:
# Example: Scraping a simple HTML table
html_table = """
<table>
  <tr>
    <th>Name</th>
    <th>Age</th>
    <th>City</th>
  </tr>
  <tr>
    <td>Alice</td>
    <td>25</td>
    <td>New York</td>
  </tr>
  <tr>
    <td>Bob</td>
    <td>30</td>
    <td>London</td>
  </tr>
  <tr>
    <td>Charlie</td>
    <td>35</td>
    <td>Tokyo</td>
  </tr>
</table>
"""

tree = html.fromstring(html_table)

# Extract table headers
headers = tree.xpath('//th/text()')
print(f"Headers: {headers}")

# Extract all rows
rows = tree.xpath('//tr')
data = []
for row in rows[1:]:  # Skip header row (index 0)
    cells = row.xpath('.//td/text()')
    if cells:  # Only add non-empty rows
        data.append(cells)

print("\nTable data:")
for row in data:
    print(row)

Headers: ['Name', 'Age', 'City']

Table data:
['Alice', '25', 'New York']
['Bob', '30', 'London']
['Charlie', '35', 'Tokyo']


In [48]:
# 4.1 
html_string_q4 = html_string_links = """
<html>
  <body>
    <div id="nav">
        <a href="https://google.com">Google</a>
        <a href="https://bing.com">Bing</a>
        <a href="https://duckduckgo.com">DuckDuckGo</a>
    </div>
  </body>
</html>
"""

#4.2
html_table_q4 = """
<html>
  <body>
    <table id="users">
      <tr>
        <th>ID</th>
        <th>Username</th>
        <th>Role</th>
      </tr>
      <tr>
        <td>101</td>
        <td>dev_jane</td>
        <td>Admin</td>
      </tr>
      <tr>
        <td>102</td>
        <td>qa_bob</td>
        <td>Tester</td>
      </tr>
    </table>
  </body>
</html>
"""
tree_q4 = html.fromstring(html_string_q4)

#4.3. create dictionary mapping link text to href
link_elements = tree_q4.xpath('//div[@id="nav"]/a')
link_dict = {}

#4.5. show both methods of extraction
print("Link Dictionary:")
for elem in link_elements:
    url = elem.get('href')
    name = elem.text
    link_dict[name] = url
print(link_dict)

# 4.4. Display table data
table_q4 = html.fromstring(html_table_q4)
rows = table_q4.xpath('//table[@id="users"]//tr')
print("\nTable Data:")
headers = table_q4.xpath('.//th/text()')
print(f"Headers: {headers}")
for row in rows[1:]:
    cells = row.xpath('.//td/text()')
    print(f"Row: {cells}")


Link Dictionary:
{'Google': 'https://google.com', 'Bing': 'https://bing.com', 'DuckDuckGo': 'https://duckduckgo.com'}

Table Data:
Headers: ['ID', 'Username', 'Role']
Row: ['101', 'dev_jane', 'Admin']
Row: ['102', 'qa_bob', 'Tester']


In [49]:
from lxml import html

# More complex HTML structure
html_string = """
<html>
  <body>
    <div class="book">
      <h3><a title="The Great Gatsby">The Great Gatsby</a></h3>
      <p class="price">$12.99</p>
      <p class="availability">In stock</p>
      <p class="rating star-rating Four">Rating: 4 stars</p>
    </div>
    <div class="book">
      <h3><a title="1984">1984</a></h3>
      <p class="price">$10.99</p>
      <p class="availability">In stock</p>
      <p class="rating star-rating Five">Rating: 5 stars</p>
    </div>
    <div class="book">
      <h3><a title="To Kill a Mockingbird">To Kill a Mockingbird</a></h3>
      <p class="price">$11.99</p>
      <p class="availability">Out of stock</p>
      <p class="rating star-rating Three">Rating: 3 stars</p>
    </div>
  </body>
</html>
"""

tree = html.fromstring(html_string)

# Find all book containers
books = tree.xpath('//div[@class="book"]')

print(f"Found {len(books)} books\n")

# Extract information from each book
for i, book in enumerate(books, 1):
    # Book title (stored in title attribute of <a> tag)
    title_elem = book.xpath('.//h3/a')[0]
    title = title_elem.get('title')

    # Book price
    price = book.xpath('.//p[@class="price"]/text()')[0]

    # Availability
    availability = book.xpath('.//p[@class="availability"]/text()')[0]

    # Star rating (stored in class name like "star-rating Four")
    rating_elem = book.xpath('.//p[contains(@class, "star-rating")]')[0]
    rating_class = rating_elem.get('class')
    # Extract the rating word (Four, Five, Three, etc.)
    rating = rating_class.split()[-1] if rating_class else "Unknown"

    print(f"Book {i}:")
    print(f"  Title: {title}")
    print(f"  Price: {price}")
    print(f"  Availability: {availability}")
    print(f"  Rating: {rating}")
    print()

Found 3 books

Book 1:
  Title: The Great Gatsby
  Price: $12.99
  Availability: In stock
  Rating: Four

Book 2:
  Title: 1984
  Price: $10.99
  Availability: In stock
  Rating: Five

Book 3:
  Title: To Kill a Mockingbird
  Price: $11.99
  Availability: Out of stock
  Rating: Three



In [52]:
#5.1
html_string_q5 = """
<html>
  <body>
    <div class="product">
      <h3><a title="Gaming Laptop X1">Gaming Laptop X1</a></h3>
      <span class="price">$1200</span>
      <div class="meta">
        <span class="stock">In Stock</span>
        <span class="category">Electronics</span>
      </div>
    </div>
    <div class="product">
      <h3><a title="Mechanical Keyboard">Mechanical Keyboard</a></h3>
      <span class="price">$85</span>
      <div class="meta">
        <span class="stock">Low Stock</span>
        <span class="category">Accessories</span>
      </div>
    </div>
    <div class="product">
      <h3><a title="Wireless Mouse">Wireless Mouse</a></h3>
      <span class="price">$45</span>
      <div class="meta">
        <span class="stock">Out of Stock</span>
        <span class="category">Accessories</span>
      </div>
    </div>
  </body>
</html>
"""

tree_q5 = html.fromstring(html_string_q5)
products = []

#5.2 Extract multiple pieces of information from each item
product_cards = tree_q5.xpath('//div[@class="product"]')

for card in product_cards:
    item = {}
    item['title'] = card.xpath('.//h3/a/@title')[0]
    item['price'] = card.xpath('.//span[@class="price"]/text()')[0]
    item['stock'] = card.xpath('.//span[@class="stock"]/text()')[0]
    item['category'] = card.xpath('.//span[@class="category"]/text()')[0]
    
    #5.3. Store data in the list
    products.append(item)

#5.4 Print first few items
print('First two items : ')
for p in products[:2]:
    print(p)

#5.4. Display items
print(f"Total products extracted: {len(products)}\n")



First two items : 
{'title': 'Gaming Laptop X1', 'price': '$1200', 'stock': 'In Stock', 'category': 'Electronics'}
{'title': 'Mechanical Keyboard', 'price': '$85', 'stock': 'Low Stock', 'category': 'Accessories'}
Total products extracted: 3

