In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def retrieve_hp():
    # Define the base URL for the website
    base_url = "https://books.toscrape.com/catalogue/page-{}.html"
    
    # Define headers to mimic a browser request
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36'}
    
    # Create an empty list to store book data
    all_books = []

    # We'll loop through the pages, starting from page 1
    page_number = 1
    
    while True:
        # Format the URL with the current page number
        page_url = base_url.format(page_number)
        
        # Send a GET request to fetch the HTML content of the current page
        response = requests.get(page_url, headers=headers)
        
        # If the page doesn't exist or returns an error, stop scraping
        if response.status_code != 200:
            break
        
        # Parse the page content with BeautifulSoup
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Find all books on the page
        books = soup.find_all('article', class_='product_pod')
        
        # Extract details for each book (title, price, rating)
        for book in books:
            title = book.find('h3').find('a')['title']
            price = book.find('p', class_='price_color').text
            rating = book.find('p', class_='star-rating')['class'][1]  # Extract rating from the class name
            
            # Append each book's data to the all_books list
            all_books.append([title, price, rating])

        # Check if there is a "next" page link (to continue scraping)
        next_page = soup.find('li', class_='next')
        if next_page:
            page_number += 1  # Move to the next page
        else:
            break  # No more pages, stop the loop
    
    # Create a DataFrame from the extracted data
    df = pd.DataFrame(all_books, columns=['Title', 'Price', 'Rating'])
    
    # Clean the Price column by removing any non-numeric characters and converting it to a float
    df['Price'] = df['Price'].apply(lambda x: float(''.join(filter(str.isdigit, x[1:]))))  # Remove £ and convert to float
    
    # Optionally, sort by price
    df.sort_values(by='Price', inplace=True)
    
    return df

# Call the function and display the result
book_data = retrieve_hp()
(book_data)


Unnamed: 0,Title,Price,Rating
638,An Abundance of Katherines,1000.0,Five
501,The Origin of Species,1001.0,Four
716,The Tipping Point: How Little Things Can Make ...,1002.0,Two
84,Patience,1016.0,Three
302,Greek Mythic History,1023.0,Five
...,...,...,...
366,The Diary of a Young Girl,5990.0,Three
560,The Barefoot Contessa Cookbook,5992.0,Five
860,Civilization and Its Discontents,5995.0,Two
617,Last One Home (New Beginnings #1),5998.0,Three
