In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
import time

In [None]:

def scrape_books():
    base_url = "http://books.toscrape.com/catalogue/page-{}.html"
    books_data = []
    page = 1
    max_pages = 30 # Limiting to 3 pages for demo purposes

    while page <= max_pages:
        try:
            # Send request to the page
            response = requests.get(base_url.format(page))
            response.raise_for_status()
            
            # Parse HTML content
            soup = BeautifulSoup(response.text, 'html.parser')
            
            # Find all book containers
            books = soup.find_all('article', class_='product_pod')
            
            if not books:
                break
                
            for book in books:
                # Extract book details
                title = book.find('h3').find('a')['title']
                price = book.find('p', class_='price_color').text.strip()
                rating = book.find('p', class_='star-rating')['class'][1]
                availability = book.find('p', class_='instock availability').text.strip()
                book_url = book.find('h3').find('a')['href']
                book_image = book.find('img')['src']
                
                books_data.append({
                    'title': title,
                    'price': price,
                    'rating': rating,
                    'availability': availability,
                    'book_url': f"http://books.toscrape.com/catalogue/{book_url}",
                    'book_image': f"http://books.toscrape.com/{book_image.lstrip('../')}"
                })
            
            print(f"Scraped page {page}")
            page += 1
            time.sleep(1)  # Be polite to the server
            
        except requests.RequestException as e:
            print(f"Error fetching page {page}: {e}")
            break
    
    return books_data

def save_to_csv(data, filename='books.csv'):
    df = pd.DataFrame(data)
    df.to_csv(filename, index=False)
    print(f"Data saved to {filename}")

def save_to_json(data, filename='books.json'):
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=4)
    print(f"Data saved to {filename}")

def main():
    print("Starting web scraping...")
    books_data = scrape_books()
    
    if books_data:
        save_to_csv(books_data)
        save_to_json(books_data)
        print(f"Scraped {len(books_data)} books successfully")
    else:
        print("No data scraped")

if __name__ == "__main__":
    main()

Starting web scraping...
Scraped page 1
Scraped page 2
Scraped page 3
Scraped page 4
Scraped page 5
Scraped page 6
Scraped page 7
Scraped page 8
Scraped page 9
Scraped page 10
Scraped page 11
Scraped page 12
Scraped page 13
Scraped page 14
Scraped page 15
Scraped page 16
Scraped page 17
Scraped page 18
Scraped page 19
Scraped page 20
Scraped page 21
Scraped page 22
Scraped page 23
Scraped page 24
Scraped page 25
Scraped page 26
Scraped page 27
Scraped page 28
Scraped page 29
Scraped page 30
Data saved to books.csv
Data saved to books.json
Scraped 600 books successfully


In [12]:
df = pd.read_csv('books.csv')
df

Unnamed: 0,title,price,rating,availability
0,A Light in the Attic,Â£51.77,Three,In stock
1,Tipping the Velvet,Â£53.74,One,In stock
2,Soumission,Â£50.10,One,In stock
3,Sharp Objects,Â£47.82,Four,In stock
4,Sapiens: A Brief History of Humankind,Â£54.23,Five,In stock
...,...,...,...,...
595,The Grand Design,Â£13.76,Three,In stock
596,The Firm,Â£45.56,Three,In stock
597,The Fault in Our Stars,Â£47.22,One,In stock
598,The False Prince (The Ascendance Trilogy #1),Â£56.00,Five,In stock


In [7]:
pd.read_csv('amazon_products.csv')

Unnamed: 0,Title,Price
