Level 1 (Basic) – Task 1: Data Collection and Web Scraping

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

print("Starting our web scraping robot...")

# Seconds to wait for the server before giving up. Without a timeout,
# requests.get() can block indefinitely on a stalled connection.
REQUEST_TIMEOUT_SECONDS = 10

# Maps the rating word used as a CSS class (e.g. 'Three' in
# 'star-rating Three') to the human-readable text stored in the output.
RATING_LABELS = {
    'One': '1 out of 5',
    'Two': '2 out of 5',
    'Three': '3 out of 5',
    'Four': '4 out of 5',
    'Five': '5 out of 5'
}


def fetch_html(page_url):
    """Download ``page_url`` and return the raw response body as bytes.

    Bytes (``response.content``) are returned instead of ``response.text`` on
    purpose: when the Content-Type header declares no charset, requests
    decodes ``.text`` as ISO-8859-1, which turns a UTF-8 pound sign into
    'Â£'. Handing the bytes to BeautifulSoup lets it detect the encoding
    declared by the document itself.

    Raises:
        requests.exceptions.RequestException: on connection errors, timeouts,
            or a 4xx/5xx status code (via ``raise_for_status``).
    """
    response = requests.get(page_url, timeout=REQUEST_TIMEOUT_SECONDS)
    response.raise_for_status()
    return response.content


def parse_book(book):
    """Return a ``{'Title', 'Price', 'Rating'}`` dict for one product article.

    Every field falls back to 'N/A' when its tag (or attribute) is missing, so
    a single malformed listing cannot abort the whole scrape.
    """
    # Title lives in the 'title' attribute of the <a> inside the <h3>.
    heading = book.find('h3')
    title_tag = heading.find('a') if heading else None
    title = title_tag.get('title', 'N/A') if title_tag else 'N/A'

    # Price is the text of <p class="price_color">.
    price_tag = book.find('p', class_='price_color')
    price = price_tag.get_text(strip=True) if price_tag else 'N/A'

    # Rating is encoded as an extra class next to 'star-rating'. Look the word
    # up among all other classes rather than assuming it is always second.
    rating = 'N/A'
    rating_tag = book.find('p', class_='star-rating')
    if rating_tag:
        rating_words = [
            css_class for css_class in rating_tag.get('class', [])
            if css_class != 'star-rating'
        ]
        if rating_words:
            rating = next(
                (RATING_LABELS[word] for word in rating_words if word in RATING_LABELS),
                'Unknown Rating'
            )

    return {
        'Title': title,
        'Price': price,
        'Rating': rating
    }


def parse_books(page_soup):
    """Return one record per <article class="product_pod"> found in ``page_soup``."""
    return [parse_book(article) for article in page_soup.find_all('article', class_='product_pod')]


# --- Part 1: Fetching a Single Page ---
print("\n--- Step 1: Fetching a Single Page ---")
url = 'http://books.toscrape.com/'
print(f"Attempting to fetch URL: {url}")

try:
    # Send a request to the website to get its content
    html_bytes = fetch_html(url)
    print("Successfully fetched the page content.")

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(html_bytes, 'html.parser')
    print("Page content parsed successfully with BeautifulSoup.")

    # --- Part 2: Extracting Data from the Page ---
    print("\n--- Step 2: Extracting Data from the Page ---")
    books_data = parse_books(soup)  # One dict per book on the first page
    print(f"Found {len(books_data)} book articles on this page.")

    print("Extracted data from the first page:")
    for book in books_data[:5]: # Print first 5 for a quick check
        print(f"  - Title: {book['Title']}, Price: {book['Price']}, Rating: {book['Rating']}")

    # --- Part 3: Handling Pagination (Going through multiple pages) ---
    print("\n--- Step 3: Handling Pagination ---")
    all_books_data = [] # Data from all pages
    base_url = 'http://books.toscrape.com/catalogue/' # Base URL for pagination
    page_num = 1

    while True:
        current_page_url = f'{base_url}page-{page_num}.html' if page_num > 1 else url
        print(f"Scraping page {page_num}: {current_page_url}")

        try:
            if page_num == 1:
                # Page 1 was already downloaded and parsed in Step 1; reuse it
                # instead of requesting the same URL a second time.
                page_soup = soup
            else:
                page_soup = BeautifulSoup(fetch_html(current_page_url), 'html.parser')
        except requests.exceptions.RequestException as e:
            print(f"Error fetching page {current_page_url}: {e}")
            break # Stop if there's an error; pages scraped so far are kept

        current_page_books = parse_books(page_soup)
        if not current_page_books: # If no books found, assume end of pages
            print(f"No more books found on page {page_num}. Ending pagination.")
            break

        all_books_data.extend(current_page_books)

        # The "next" button tells us whether there is another page
        if page_soup.find('li', class_='next') is None:
            print("No 'Next' button found. All pages scraped.")
            break
        page_num += 1

    print(f"\nTotal books scraped across all pages: {len(all_books_data)}")

    # --- Part 4: Storing Data in a Structured Format (CSV) ---
    print("\n--- Step 4: Storing Data in a Structured Format (CSV) ---")
    if all_books_data:
        # Convert the list of dictionaries into a pandas DataFrame
        df = pd.DataFrame(all_books_data)

        # Save the DataFrame to a CSV file
        csv_filename = 'scraped_books_data.csv'
        df.to_csv(csv_filename, index=False, encoding='utf-8')
        print(f"Data successfully saved to '{csv_filename}'.")
        print("You can open this file with Excel or any spreadsheet program.")
    else:
        # Do not report success (or write an empty file) when nothing was scraped
        print("No book data was scraped, so no CSV file was written.")

except requests.exceptions.RequestException as e:
    print(f"An error occurred during the initial request: {e}")
except Exception as e:
    print(f"An unexpected error occurred: {e}")

print("\nWeb scraping robot finished its mission!")

Starting our web scraping robot...

--- Step 1: Fetching a Single Page ---
Attempting to fetch URL: http://books.toscrape.com/
Successfully fetched the page content.
Page content parsed successfully with BeautifulSoup.

--- Step 2: Extracting Data from the Page ---
Found 20 book articles on this page.
Extracted data from the first page:
  - Title: A Light in the Attic, Price: Â£51.77, Rating: 3 out of 5
  - Title: Tipping the Velvet, Price: Â£53.74, Rating: 1 out of 5
  - Title: Soumission, Price: Â£50.10, Rating: 1 out of 5
  - Title: Sharp Objects, Price: Â£47.82, Rating: 4 out of 5
  - Title: Sapiens: A Brief History of Humankind, Price: Â£54.23, Rating: 5 out of 5

--- Step 3: Handling Pagination ---
Scraping page 1: http://books.toscrape.com/
Scraping page 2: http://books.toscrape.com/catalogue/page-2.html
Scraping page 3: http://books.toscrape.com/catalogue/page-3.html
Scraping page 4: http://books.toscrape.com/catalogue/page-4.html
Scraping page 5: http://books.toscrape.com/cata