# Class-based version

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
import time

class QuoteScraper:
    """
    Scrapes quotes, authors, and tags from http://quotes.toscrape.com/
    and saves the results in both CSV and JSON formats.

    Attributes:
        base_url: URL prefix; "<page_number>/" is appended for each request.
        timeout: Seconds to wait for each HTTP response.
        delay: Seconds to sleep between successive page requests.
        quotes: List of dicts with the keys 'quote', 'author' and 'tags',
            filled by parse_quotes() and scrape_all_pages().
    """
    def __init__(self, base_url='http://quotes.toscrape.com/page/', timeout=10, delay=1):
        self.base_url = base_url
        self.timeout = timeout
        self.delay = delay
        self.quotes = []

    def fetch_page(self, page_number):
        """
        Fetches the content of a given page number.

        Returns a BeautifulSoup object, or None when the request fails
        (the error is printed, not raised).
        """
        try:
            response = requests.get(f"{self.base_url}{page_number}/", timeout=self.timeout)
            response.raise_for_status()
            return BeautifulSoup(response.content, 'html.parser')
        except requests.RequestException as e:
            print(f"[Error] Could not fetch page {page_number}: {e}")
            return None

    def _parse_quote_divs(self, quote_divs):
        """
        Appends one record per quote block to self.quotes.

        Blocks that lack the text or author element are skipped with a
        warning instead of aborting the whole scrape with AttributeError.
        """
        for div in quote_divs:
            text_node = div.find('span', class_='text')
            author_node = div.find('small', class_='author')
            # find() returns None when nothing matches, so guard before get_text().
            if text_node is None or author_node is None:
                print("[Warning] Skipping a quote block with no text or author.")
                continue
            self.quotes.append({
                'quote': text_node.get_text(strip=True),
                'author': author_node.get_text(strip=True),
                'tags': [tag.get_text(strip=True) for tag in div.find_all('a', class_='tag')],
            })

    def parse_quotes(self, soup):
        """
        Parses all quotes on a given page soup and appends to the quotes list.
        """
        self._parse_quote_divs(soup.find_all('div', class_='quote'))

    def scrape_all_pages(self, max_pages=None):
        """
        Loops through the paginated pages until a fetch fails or a page
        contains no quotes.

        Args:
            max_pages: Optional upper bound on the number of pages to fetch;
                None (the default) means no limit.
        """
        print("[Info] Starting scrape...")
        page = 1
        while max_pages is None or page <= max_pages:
            print(f"[Info] Fetching page {page}...")
            soup = self.fetch_page(page)
            if soup is None:
                break
            # Look the quote blocks up once and reuse them for parsing.
            quote_divs = soup.find_all('div', class_='quote')
            if not quote_divs:
                print("[Info] No more quotes found. Stopping.")
                break
            self._parse_quote_divs(quote_divs)
            page += 1
            time.sleep(self.delay)  # Be polite: add a short delay

    def save_to_csv(self, filename='quotes.csv'):
        """
        Saves the scraped quotes to a CSV file (no index column).

        The 'tags' lists are written by pandas as their string form.
        """
        df = pd.DataFrame(self.quotes)
        df.to_csv(filename, index=False)
        print(f"[Info] Quotes saved to {filename}")

    def save_to_json(self, filename='quotes.json'):
        """
        Saves the scraped quotes to a UTF-8 JSON file, keeping non-ASCII
        characters unescaped.
        """
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(self.quotes, f, ensure_ascii=False, indent=4)
        print(f"[Info] Quotes saved to {filename}")

if __name__ == "__main__":
    # Entry point: scrape every page, then export the collected quotes
    # using the default filenames (quotes.csv and quotes.json).
    scraper = QuoteScraper()
    scraper.scrape_all_pages()
    scraper.save_to_csv()
    scraper.save_to_json()


[Info] Starting scrape...
[Info] Fetching page 1...
[Info] Fetching page 2...
[Info] Fetching page 3...
[Info] Fetching page 4...
[Info] Fetching page 5...
[Info] Fetching page 6...
[Info] Fetching page 7...
[Info] Fetching page 8...
[Info] Fetching page 9...
[Info] Fetching page 10...
[Info] Fetching page 11...
[Info] No more quotes found. Stopping.
[Info] Quotes saved to quotes.csv
[Info] Quotes saved to quotes.json


In [2]:
# Reload the CSV written by save_to_csv() to inspect the result.
df = pd.read_csv('quotes.csv')

In [3]:
# Preview the first rows of the reloaded data.
df.head()

Unnamed: 0,quote,author,tags
0,“The world as we have created it is a process ...,Albert Einstein,"['change', 'deep-thoughts', 'thinking', 'world']"
1,"“It is our choices, Harry, that show what we t...",J.K. Rowling,"['abilities', 'choices']"
2,“There are only two ways to live your life. On...,Albert Einstein,"['inspirational', 'life', 'live', 'miracle', '..."
3,"“The person, be it gentleman or lady, who has ...",Jane Austen,"['aliteracy', 'books', 'classic', 'humor']"
4,"“Imperfection is beauty, madness is genius and...",Marilyn Monroe,"['be-yourself', 'inspirational']"


# Function-based version

In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
import time

# URL prefix for the paginated listing; fetch_page() appends "<page_number>/".
BASE_URL = 'http://quotes.toscrape.com/page/'

def fetch_page(page_number):
    """
    Download one listing page and parse it.

    Returns a BeautifulSoup object, or None when the request fails
    (the error is printed rather than raised).
    """
    try:
        resp = requests.get(f"{BASE_URL}{page_number}/", timeout=10)
        resp.raise_for_status()
        html = resp.content
    except requests.RequestException as err:
        print(f"[Error] Could not fetch page {page_number}: {err}")
        return None
    return BeautifulSoup(html, 'html.parser')

def parse_quotes(soup):
    """
    Extract every quote on a parsed page.

    Args:
        soup: Parsed page exposing find_all(); each quote block is a
            <div class="quote">.

    Returns:
        A list of dicts with the keys 'quote', 'author' and 'tags'
        (a list of tag strings). Blocks that lack the text or author
        element are skipped with a warning instead of raising
        AttributeError.
    """
    quotes = []
    for div in soup.find_all('div', class_='quote'):
        text_node = div.find('span', class_='text')
        author_node = div.find('small', class_='author')
        # find() returns None when nothing matches, so guard before get_text().
        if text_node is None or author_node is None:
            print("[Warning] Skipping a quote block with no text or author.")
            continue
        quotes.append({
            'quote': text_node.get_text(strip=True),
            'author': author_node.get_text(strip=True),
            'tags': [tag.get_text(strip=True) for tag in div.find_all('a', class_='tag')],
        })
    return quotes

def scrape_all_pages(delay=1, max_pages=None):
    """
    Fetch consecutive pages, starting at page 1, and collect their quotes.

    Stops when a page cannot be fetched, when a page yields no quotes,
    or when max_pages pages have been processed.

    Args:
        delay: Seconds to sleep between page requests.
        max_pages: Optional upper bound on the number of pages to fetch;
            None (the default) means no limit.

    Returns:
        A list with the quote dicts of every page, in page order.
    """
    all_quotes = []
    page = 1
    while max_pages is None or page <= max_pages:
        print(f"[Info] Fetching page {page}...")
        soup = fetch_page(page)
        if soup is None:
            break
        # Parse once; an empty result means the page held no quotes.
        quotes = parse_quotes(soup)
        if not quotes:
            break
        all_quotes.extend(quotes)
        page += 1
        time.sleep(delay)
    return all_quotes

def save_to_csv(quotes, filename='quotes.csv'):
    """
    Write the quote records to a CSV file through pandas.

    The DataFrame index is not written; list values such as 'tags'
    are stored as their string form.
    """
    pd.DataFrame(data=quotes).to_csv(path_or_buf=filename, index=False)
    print(f"[Info] Quotes saved to {filename}")

def save_to_json(quotes, filename='quotes.json'):
    """
    Write the quote records to a UTF-8 JSON file.

    Output is indented by four spaces and non-ASCII characters are
    kept as-is rather than escaped.
    """
    dump_options = {'ensure_ascii': False, 'indent': 4}
    with open(filename, mode='w', encoding='utf-8') as out_file:
        json.dump(quotes, out_file, **dump_options)
    print(f"[Info] Quotes saved to {filename}")

if __name__ == "__main__":
    # Entry point: scrape every page, then export the collected quotes
    # using the default filenames (quotes.csv and quotes.json).
    quotes = scrape_all_pages()
    save_to_csv(quotes)
    save_to_json(quotes)


[Info] Fetching page 1...
[Info] Fetching page 2...
[Info] Fetching page 3...
[Info] Fetching page 4...
[Info] Fetching page 5...
[Info] Fetching page 6...
[Info] Fetching page 7...
[Info] Fetching page 8...
[Info] Fetching page 9...
[Info] Fetching page 10...
[Info] Fetching page 11...
[Info] Quotes saved to quotes.csv
[Info] Quotes saved to quotes.json
