In [1]:
import requests
from bs4 import BeautifulSoup
import csv
import json

In [2]:
def save_to_csv(filename, data, headers):
    """Write a sequence of row dictionaries to a CSV file.

    Args:
        filename: Path of the file to create (an existing file is overwritten).
        data: Iterable of dicts; each dict becomes one row.
        headers: Column names, written as the first row and used as the
            field order for every data row.
    """
    # newline='' leaves line-ending handling to the csv module.
    with open(filename, 'w', encoding='utf-8', newline='') as csv_file:
        dict_writer = csv.DictWriter(csv_file, fieldnames=headers)
        dict_writer.writeheader()
        for row in data:
            dict_writer.writerow(row)

def save_to_json(filename, data):
    """Serialize `data` to `filename` as 4-space-indented JSON.

    Args:
        filename: Path of the file to create (an existing file is overwritten).
        data: Any JSON-serializable object.
    """
    with open(filename, 'w', encoding='utf-8') as json_file:
        # ensure_ascii=False writes non-ASCII characters as-is instead of \uXXXX escapes.
        json.dump(data, json_file, ensure_ascii=False, indent=4)

In [3]:
# Download the page once; every extraction cell below works on `soup`.
url = 'https://baraasalout.github.io/test.html'
# Without a timeout, requests waits indefinitely on an unresponsive server.
response = requests.get(url, timeout=30)
# Stop here on an HTTP error (4xx/5xx) instead of parsing an error page and
# letting the cells below save empty or misleading files.
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

# 1. Extract Text Data

In [5]:
fileName = 'Extract_Text_Data.csv'
header = ['Type', 'Content']

# Tag name -> label written to the 'Type' column.
tag_mapping = {
    'h1': 'Heading',
    'h2': 'Heading',
    'p': 'Paragraph',
    'li': 'List'
}

# Collect one row per matching element; elements whose stripped text is
# empty are skipped.
data = []
for element in soup.find_all(list(tag_mapping)):
    text = element.get_text(strip=True)
    if not text:
        continue
    data.append({'Type': tag_mapping.get(element.name), 'Content': text})

# Save to CSV
save_to_csv(fileName, data, header)
print(f"Data saved to {fileName}.")

Data saved to Extract_Text_Data.csv.


# 2. Extract Table Data

In [7]:
table = soup.find("table")
fileName = 'Extract_Table_Data.csv'
header = ['Product Name', 'Price', 'Stock Status']
data = []

# The first <tr> is treated as the header row and skipped.
body_rows = table.find_all("tr")[1:]
for row in body_rows:
    cells = [td.get_text(strip=True) for td in row.find_all("td")]
    # Rows with fewer than three <td> cells cannot fill every column.
    if len(cells) < 3:
        continue
    # zip stops at the three header names, so any extra cells are ignored.
    data.append(dict(zip(header, cells)))

# Save to CSV
save_to_csv(fileName, data, header)
print(f"Data saved to {fileName}.")

Data saved to Extract_Table_Data.csv.


# 3. Extract Product Information

In [9]:
fileName = 'Product_Information.json'
books = soup.find("div", class_="book-products")
# Only the direct child <div>s of the container are treated as book entries.
book_divs = books.find_all("div", recursive=False)
books_data = []

for book in book_divs:
    title = book.find("strong").get_text(strip=True)
    # Price and availability are read from the first and second <p> whose
    # inline style contains "color: green". Run the query once and index the
    # result instead of repeating the same search for each field.
    green_paragraphs = book.find_all("p", style=lambda s: s and "color: green" in s)
    price = green_paragraphs[0].get_text(strip=True)
    availability = green_paragraphs[1].get_text(strip=True)
    button = book.find("button").get_text(strip=True)

    books_data.append({
        'title': title,
        'price': price,
        'availability': availability,
        'button': button
    })

# Save to JSON
save_to_json(fileName, books_data)
print(f"Data saved to {fileName}.")

Data saved to Product_Information.json.


# 4. Extract Form Details

In [11]:
fileName = 'form_fields.json'
form = soup.find("form")
fields_data = []

# Extract <input> fields
for field in form.find_all("input"):
    fields_data.append({
        "name": field.get("name"),
        "type": field.get("type"),
        # Use the placeholder when present and non-empty, otherwise the value attribute.
        "default_value": field.get("placeholder") or field.get("value")
    })

# Extract the option labels of every <select> in the form.
# `options` is initialised up front so it is always bound: previously a form
# without any <select> raised NameError in the loop below on a fresh kernel
# (or silently reused a stale value), and with several <select> elements only
# the last one's options survived.
options = []
for select in form.find_all("select"):
    options.extend(opt.get_text(strip=True) for opt in select.find_all("option"))

# Add options to the checkbox fields
# NOTE(review): the <select> options are attached to checkbox inputs rather
# than recorded as their own field; kept to preserve the existing output
# format - confirm this is the intended schema.
for field in fields_data:
    if field['type'] == 'checkbox':
        field['Options'] = list(options)

# Save to JSON
save_to_json(fileName, fields_data)
print(f"Data saved to {fileName}.")

Data saved to form_fields.json.


# 5. Extract Links and Multimedia

In [13]:
fileName = 'Links_and_Multimedia.json'
# Only <iframe> elements that have a src attribute are selected.
videos_link = soup.find_all("iframe", src=True)

# Build one record per iframe, so every matching video on the page is included.
videos = []
for frame in videos_link:
    videos.append({"video_src": frame.get('src')})

# Save to JSON
save_to_json(fileName, videos)
print(f"Data saved to {fileName}.")

Data saved to Links_and_Multimedia.json.


# 6. Featured Products Challenge

In [15]:
fileName = 'Featured_Products.json'
products = soup.find_all(class_='product-card')
products_list = []

for card in products:
    # Each field is the stripped text of the first descendant with that class.
    name_text = card.find(class_='name').get_text(strip=True)
    price_text = card.find(class_='price').get_text(strip=True)
    colors_text = card.find(class_='colors').get_text(strip=True)

    products_list.append({
        'id': card.get('data-id'),
        'name': name_text,
        'price': price_text,
        # Drop the label so only the color list itself is stored.
        'colors': colors_text.replace('Available colors: ', "")
    })

# Save to JSON
save_to_json(fileName, products_list)
print(f"Data saved to {fileName}.")

Data saved to Featured_Products.json.
