In [1]:
# Import required libraries
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# Start page of the site to scrape. HTTPS is used so the first request goes
# over the same scheme as the follow-up pages (the site serves its paginated
# pages over https, as the scraping log further down shows).
url = "https://quotes.toscrape.com"


Reason for choosing: This website is designed for web scraping practice. It contains structured data (quotes, authors, and tags), which makes it ideal for learning how to extract and organize information. Scraping it is safe and ethical because the site is intended for educational purposes.

Data to Collect
Quote Text: The text of each quote.
Author: The person who said the quote.
Tags: Categories or keywords associated with the quote.


In [3]:
# Parallel accumulators for the scraped data: position i in each list
# describes the same quote (its text, its author, its tags).
quotes_list, authors_list, tags_list = [], [], []

In [7]:
# HTTP headers sent with every request; the scraping loop passes this dict
# to requests.get(), so it must be defined before that cell runs.
headers = {
    "User-Agent": "Mozilla/5.0",
}


In [8]:
import time

In [9]:
# Scrape every page of the site, following the "next" link until none is left.
#
# Reads:  url (start page), headers (request headers)
# Fills:  quotes_list, authors_list, tags_list (one entry per quote, in page order)
#
# The cell is safe to re-run: it walks a local copy of `url` and empties the
# result lists first, so a second run neither skips the crawl nor duplicates rows.

MAX_TIMEOUT_RETRIES = 3     # consecutive timeouts tolerated for a single page
RETRY_DELAY_SECONDS = 5     # pause before retrying a timed-out page
REQUEST_DELAY_SECONDS = 1   # politeness pause between successful page fetches
REQUEST_TIMEOUT_SECONDS = 10

# Empty the lists in place (rather than rebinding) so any other reference to
# them keeps pointing at the same objects.
quotes_list.clear()
authors_list.clear()
tags_list.clear()

page_url = url              # local cursor; `url` keeps the start page
consecutive_timeouts = 0
scrape_ok = True

while page_url:
    print("Scraping:", page_url)

    try:
        response = requests.get(
            page_url, headers=headers, timeout=REQUEST_TIMEOUT_SECONDS
        )
        response.raise_for_status()  # turn 4xx/5xx responses into exceptions
    except requests.exceptions.Timeout:
        consecutive_timeouts += 1
        # Without this cap a dead server would keep the kernel busy forever.
        if consecutive_timeouts >= MAX_TIMEOUT_RETRIES:
            print(
                f"❌ Error: giving up on {page_url} "
                f"after {MAX_TIMEOUT_RETRIES} timeouts"
            )
            scrape_ok = False
            break
        print(f"⚠️ Timeout occurred. Retrying in {RETRY_DELAY_SECONDS} seconds...")
        time.sleep(RETRY_DELAY_SECONDS)
        continue  # retry the same page
    except requests.exceptions.RequestException as e:
        print("❌ Error:", e)
        scrape_ok = False
        break

    consecutive_timeouts = 0  # the page loaded, so the retry budget resets

    soup = BeautifulSoup(response.text, 'html.parser')

    for quote in soup.find_all('div', class_='quote'):
        text = quote.find('span', class_='text').text.strip()
        author = quote.find('small', class_='author').text.strip()
        tags = [tag.text for tag in quote.find_all('a', class_='tag')]

        quotes_list.append(text)
        authors_list.append(author)
        tags_list.append(", ".join(tags))

    # Resolve the "next" link against the URL that was actually served, so the
    # scheme and host always match the current page instead of a hard-coded base.
    next_button = soup.find('li', class_='next')
    next_link = next_button.find('a') if next_button else None
    page_url = urljoin(response.url, next_link['href']) if next_link else None

    # Be nice to the server, but don't wait after the final page.
    if page_url:
        time.sleep(REQUEST_DELAY_SECONDS)

if scrape_ok:
    print("✅ Finished scraping all pages!")
else:
    print(f"⚠️ Scraping stopped early; {len(quotes_list)} quotes collected so far.")


Scraping: http://quotes.toscrape.com
Scraping: https://quotes.toscrape.com/page/2/
Scraping: https://quotes.toscrape.com/page/3/
Scraping: https://quotes.toscrape.com/page/4/
Scraping: https://quotes.toscrape.com/page/5/
Scraping: https://quotes.toscrape.com/page/6/
Scraping: https://quotes.toscrape.com/page/7/
Scraping: https://quotes.toscrape.com/page/8/
Scraping: https://quotes.toscrape.com/page/9/
Scraping: https://quotes.toscrape.com/page/10/
✅ Finished scraping all pages!


In [10]:
# Create DataFrame
data = pd.DataFrame({
    'Quote': quotes_list,
    'Author': authors_list,
    'Tags': tags_list
})

In [11]:
# Write the table to disk as CSV (without the row index) and confirm the path.
output_file = 'quotes_Babirye.csv'
data.to_csv(output_file, index=False)

print(f"Scraping complete! Data saved as {output_file}")

Scraping complete! Data saved as quotes_Babirye.csv


Website Description: “Quotes to Scrape” contains famous quotes from various authors, organized by tags for easy browsing.

Extracted Information: The dataset includes the text of the quote, the author, and tags associated with each quote.

Format: Saved in CSV as quotes_Babirye.csv with three columns: Quote, Author, Tags.

Challenges / Ethical Issues:

The main challenge was navigating the pagination to collect all quotes across multiple pages.
While looping over the pages, the kernel kept running longer than expected.

Ethical consideration: scraping this website is ethical because it was built specifically for web scraping practice.