In [1]:
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
def scaler_extractor(url, category="Travel", timeout=30):
    """Scrape the books listed on one category page into a DataFrame.

    The listing page at ``url`` supplies each book's link, title and star
    rating. The UPC, both prices and the availability text are only present on
    the individual product pages, so every book costs one extra HTTP request.

    Only the single page at ``url`` is fetched; pagination links are not
    followed.

    Parameters
    ----------
    url : str
        URL of the category listing page.
    category : str, optional
        Value written to the ``category`` column for every row. Defaults to
        ``"Travel"``.
    timeout : float, optional
        Per-request timeout in seconds passed to ``requests``.

    Returns
    -------
    pandas.DataFrame
        One row per book with the columns ``product_page_url``, ``title``,
        ``universal_product_code``, ``price_including_tax``,
        ``price_excluding_tax``, ``number_available``, ``category`` and
        ``review_rating``. All values are the raw strings found in the HTML
        (prices and availability are not parsed into numbers). The frame is
        empty, but still has these columns, when no books are found.

    Raises
    ------
    requests.HTTPError
        If the listing page or any product page returns an error status.
    ValueError
        If a product page has no "Product Information" table.
    """
    cols = ['product_page_url', 'title', 'universal_product_code', 'price_including_tax',
            'price_excluding_tax', 'number_available', 'category', 'review_rating']

    # Rows are collected in a plain list and turned into a DataFrame once at
    # the end; concatenating inside the loop copies the whole frame each time.
    records = []

    # A single session reuses the underlying connection for all requests.
    with requests.Session() as session:
        res = session.get(url, timeout=timeout)
        res.raise_for_status()
        # Name the parser explicitly so the result does not depend on which
        # optional parsers happen to be installed.
        soup = BeautifulSoup(res.content, "html.parser")

        booklib = soup.find_all('li', class_="col-xs-6 col-sm-4 col-md-3 col-lg-3")
        for book in booklib:
            link = book.h3.a

            # 1. Product page URL: the href is relative to the listing page,
            #    so resolve it against `url` instead of slicing path segments.
            product_page_url = urljoin(url, link["href"])

            # 2. Book title
            title = link["title"]

            # 3. Review rating: the second CSS class on the item's first <p>.
            rating = book.p["class"][1]

            # 4. UPC, prices and availability come from the table with class
            #    "table table-striped" on the book's own page.
            book_req = session.get(product_page_url, timeout=timeout)
            book_req.raise_for_status()
            booksoup = BeautifulSoup(book_req.content, "html.parser")
            bookinfo = booksoup.find('table', class_="table table-striped")
            if bookinfo is None:
                raise ValueError(
                    "No product information table found at " + product_page_url
                )
            rows = bookinfo.find_all("tr")

            # NOTE(review): the row positions below are assumed to be stable
            # across product pages -- confirm against the site markup.
            UPC = rows[0].find("td").text
            price_exc_tax = rows[2].find("td").text
            price_inc_tax = rows[3].find("td").text
            stock = rows[5].find("td").text

            # The order here must match `cols`: price *including* tax comes
            # before price *excluding* tax.
            records.append([product_page_url, title, UPC, price_inc_tax,
                            price_exc_tax, stock, category, rating])

    return pd.DataFrame(records, columns=cols)

In [3]:
# Scrape the Travel category listing page and save the resulting table as a
# CSV file in the current working directory (without the DataFrame index).
url = (
    "https://books.toscrape.com/catalogue/category/books/travel_2/index.html"
)
df = scaler_extractor(url)

csv_path = "WebScrapping.csv"
df.to_csv(csv_path, index=False)