## WEBSCRAPING LIBRARY SITE PROJECT

Project goal: scrape all the data about the books on the website and build a dataframe with all the available information.
The data we can scrape consists of:
- Title
- Rating
- Price
- Stock availability
- Genre

In [1]:
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

In [2]:
url = "https://books.toscrape.com/catalogue/page-1.html"

# Fetch the first catalogue page and fail loudly on an HTTP error instead of
# silently parsing an error page.
response = requests.get(url)
response.raise_for_status()

# Hand BeautifulSoup the raw bytes so that it detects the document encoding
# itself. `.text` relies on requests' header-based guess, which mis-decodes
# this page and turns "£" into "Â£".
request = response.content
soup = bs(request, "html.parser")


In [3]:
# Collect the title of every book on the current page

def create_titles_list():
    """Return the title of each book found in the module-level ``soup``.

    Every ``<article>`` element is treated as one book; its title is read
    from the ``title`` attribute of the link inside the ``<h3>`` heading.

    Returns
    -------
    list
        One entry per ``<article>``, in document order.
    """
    return [article.h3.a.get("title") for article in soup.find_all("article")]

titles = create_titles_list()
titles

['A Light in the Attic',
 'Tipping the Velvet',
 'Soumission',
 'Sharp Objects',
 'Sapiens: A Brief History of Humankind',
 'The Requiem Red',
 'The Dirty Little Secrets of Getting Your Dream Job',
 'The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull',
 'The Boys in the Boat: Nine Americans and Their Epic Quest for Gold at the 1936 Berlin Olympics',
 'The Black Maria',
 'Starving Hearts (Triangular Trade Trilogy, #1)',
 "Shakespeare's Sonnets",
 'Set Me Free',
 "Scott Pilgrim's Precious Little Life (Scott Pilgrim #1)",
 'Rip it Up and Start Again',
 'Our Band Could Be Your Life: Scenes from the American Indie Underground, 1981-1991',
 'Olio',
 'Mesaerion: The Best Science Fiction Stories 1800-1849',
 'Libertarianism for Beginners',
 "It's Only the Himalayas"]

In [4]:
# Collect the price of every book on the current page

def create_prices_list():
    """Return the displayed price of each book found in the module-level ``soup``.

    Prices are the text of every element with class ``price_color``. A stray
    "Â" in front of the pound sign is removed from each of them.

    Returns
    -------
    list of str
        One price string per matching element, in document order.
    """
    price_tags = soup.find_all(class_="price_color")
    return [tag.text.replace("Â£", "£") for tag in price_tags]

prices = create_prices_list()
prices

['£51.77',
 '£53.74',
 '£50.10',
 '£47.82',
 '£54.23',
 '£22.65',
 '£33.34',
 '£17.93',
 '£22.60',
 '£52.15',
 '£13.99',
 '£20.66',
 '£17.46',
 '£52.29',
 '£35.02',
 '£57.25',
 '£23.88',
 '£37.59',
 '£51.33',
 '£45.17']

In [5]:
# Collect the star rating of every book on the current page

def create_ratings_list():
    """Return the star rating word of each book found in the module-level ``soup``.

    The rating is encoded as a second CSS class on the rating element, e.g.
    ``class="star-rating Three"``. It is read straight from the parsed
    ``class`` attribute rather than from the serialized HTML, so the result
    does not depend on how many lines each tag spans, and a page without any
    rating element yields an empty list instead of raising.

    Returns
    -------
    list of str
        One rating word (such as "One" or "Five") per element with class
        ``star-rating``, in document order.
    """
    # BeautifulSoup exposes ``class`` as a list of class names; the rating
    # word is the last one, after "star-rating".
    return [tag["class"][-1] for tag in soup.find_all(class_="star-rating")]

ratings = create_ratings_list()
ratings

['Three',
 'One',
 'One',
 'Four',
 'Five',
 'One',
 'Four',
 'Three',
 'Four',
 'One',
 'Two',
 'Four',
 'Five',
 'Five',
 'Five',
 'Three',
 'One',
 'One',
 'Two',
 'Two']

In [6]:
# Stock availability is reported on each book's own webpage.
# Example product page used to try the scraper out:

url_stock = "https://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html"

# Scrape the number of available copies from a single product page

def scrap_stocks(url_stock):
    """Return the number of available copies shown on one product page.

    Parameters
    ----------
    url_stock : str
        URL of the book's product page.

    Returns
    -------
    list of int
        The numbers that follow an opening parenthesis in the first table
        cell whose text contains "stock" (e.g. ``[22]`` for
        "In stock (22 available)"). Empty when no such cell exists.
    """
    request_stock = requests.get(url_stock).text
    soup_stock = bs(request_stock, "html.parser")
    # BeautifulSoup calls the filter with None for cells that do not hold a
    # single string, so guard before testing membership.
    availab = soup_stock.find(
        "td", string=lambda text: text is not None and "stock" in text
    )
    if availab is None:
        return []
    # Work on the cell's text rather than on the serialized tag, then keep
    # the tokens that open with a parenthesis: "(22" -> 22.
    return [
        int(token.replace("(", ""))
        for token in availab.get_text().split(" ")
        if "(" in token
    ]

scrap_stocks(url_stock)


[22]

In [7]:
# We need to replicate the process for each book
# Let's first gather a list of all the urls contained in the first page

def get_page_urls(soup):
    """Return the absolute product-page URL of every book linked on a catalogue page.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed catalogue page.

    Returns
    -------
    list of str
        ``https://books.toscrape.com/catalogue/`` joined with the ``href`` of
        each ``<a>`` tag that has both an ``href`` and a ``title`` attribute,
        in document order.
    """
    base_url = "https://books.toscrape.com/catalogue/"
    links = soup.find_all("a", href=True, title=True)
    # Read the attribute value directly instead of slicing the serialized
    # tag: the result no longer depends on the order in which the attributes
    # are written, and the list returned by find_all is left untouched.
    return [base_url + link["href"] for link in links]

# Preview the product-page URLs found in the module-level `soup`
get_page_urls(soup)




['https://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html',
 'https://books.toscrape.com/catalogue/tipping-the-velvet_999/index.html',
 'https://books.toscrape.com/catalogue/soumission_998/index.html',
 'https://books.toscrape.com/catalogue/sharp-objects_997/index.html',
 'https://books.toscrape.com/catalogue/sapiens-a-brief-history-of-humankind_996/index.html',
 'https://books.toscrape.com/catalogue/the-requiem-red_995/index.html',
 'https://books.toscrape.com/catalogue/the-dirty-little-secrets-of-getting-your-dream-job_994/index.html',
 'https://books.toscrape.com/catalogue/the-coming-woman-a-novel-based-on-the-life-of-the-infamous-feminist-victoria-woodhull_993/index.html',
 'https://books.toscrape.com/catalogue/the-boys-in-the-boat-nine-americans-and-their-epic-quest-for-gold-at-the-1936-berlin-olympics_992/index.html',
 'https://books.toscrape.com/catalogue/the-black-maria_991/index.html',
 'https://books.toscrape.com/catalogue/starving-hearts-triangular-trade-tr

In [8]:
# Chain the two helpers: list the product URLs of the page, then read the stock of each one

def get_page_stocks():
    """Return the available-copies counts for the books in the module-level ``soup``.

    The product URLs come from ``get_page_urls(soup)``; each one is passed to
    ``scrap_stocks``, which fetches that product page, so one HTTP request is
    issued per URL.

    Returns
    -------
    list
        The concatenation of every ``scrap_stocks`` result, in URL order.
    """
    return [
        copies
        for book_url in get_page_urls(soup)
        for copies in scrap_stocks(book_url)
    ]

stocks = get_page_stocks()

In [9]:
# Titles, prices, ratings and stock counts are now available as four lists.
# Combine them into a single dataframe, one column per list.

data = {
    "Title": titles,
    "Price": prices,
    "Rating": ratings,
    "In stock": stocks,
}
df = pd.DataFrame(data=data)
df

Unnamed: 0,Title,Price,Rating,In stock
0,A Light in the Attic,£51.77,Three,22
1,Tipping the Velvet,£53.74,One,20
2,Soumission,£50.10,One,20
3,Sharp Objects,£47.82,Four,20
4,Sapiens: A Brief History of Humankind,£54.23,Five,20
5,The Requiem Red,£22.65,One,19
6,The Dirty Little Secrets of Getting Your Dream...,£33.34,Four,19
7,The Coming Woman: A Novel Based on the Life of...,£17.93,Three,19
8,The Boys in the Boat: Nine Americans and Their...,£22.60,Four,19
9,The Black Maria,£52.15,One,19


In [10]:
# We have successfully summarized all the data in the first page.
# However, there are 49 more.
# We can aggregate the data from all pages by using a for loop.

# WARNING: the following loop fetches every product page and will take a few minutes to complete

alltitles = []
allprices = []
allratings = []
allstocks = []

for i in range(1, 51):
    url = "https://books.toscrape.com/catalogue/page-" + str(i) + ".html"
    response = requests.get(url)
    # Stop on an HTTP error rather than parsing an error page, which would
    # silently contribute nothing to the lists.
    response.raise_for_status()
    # Pass the raw bytes so BeautifulSoup detects the document encoding
    # itself; `.text` mis-decodes this site ("£" comes out as "Â£") and
    # would garble any other non-ASCII character the same way.
    request = response.content
    # The helper functions read the module-level `soup`, so it has to be
    # rebound to the current page before they are called.
    soup = bs(request, "html.parser")

    alltitles.extend(create_titles_list())
    allprices.extend(create_prices_list())
    allratings.extend(create_ratings_list())
    allstocks.extend(get_page_stocks())
    print("Page " + str(i) + " completed")

    



Page 1completed
Page 2completed
Page 3completed
Page 4completed
Page 5completed
Page 6completed
Page 7completed
Page 8completed
Page 9completed
Page 10completed
Page 11completed
Page 12completed
Page 13completed
Page 14completed
Page 15completed
Page 16completed
Page 17completed
Page 18completed
Page 19completed
Page 20completed
Page 21completed
Page 22completed
Page 23completed
Page 24completed
Page 25completed
Page 26completed
Page 27completed
Page 28completed
Page 29completed
Page 30completed
Page 31completed
Page 32completed
Page 33completed
Page 34completed
Page 35completed
Page 36completed
Page 37completed
Page 38completed
Page 39completed
Page 40completed
Page 41completed
Page 42completed
Page 43completed
Page 44completed
Page 45completed
Page 46completed
Page 47completed
Page 48completed
Page 49completed
Page 50completed


In [27]:
# Assemble the final dataframe from the lists accumulated over all pages

Books2Scrap = {
    "Title": alltitles,
    "Price": allprices,
    "Rating": allratings,
    "Stocks": allstocks,
}

df = pd.DataFrame(data=Books2Scrap)
df


Unnamed: 0,Title,Price,Rating,Stocks
0,A Light in the Attic,£51.77,Three,22
1,Tipping the Velvet,£53.74,One,20
2,Soumission,£50.10,One,20
3,Sharp Objects,£47.82,Four,20
4,Sapiens: A Brief History of Humankind,£54.23,Five,20
...,...,...,...,...
995,Alice in Wonderland (Alice's Adventures in Won...,£55.53,One,1
996,"Ajin: Demi-Human, Volume 1 (Ajin: Demi-Human #1)",£57.06,Four,1
997,A Spy's Devotion (The Regency Spies of London #1),£16.97,Five,1
998,1st to Die (Women's Murder Club #1),£53.98,One,1


In [24]:
# The default index starts at 0; number the books from 1 instead

# Derive the index from the scraped data rather than hard-coding 1000, so the
# cell keeps working when the number of scraped books changes.
I = list(range(1, len(alltitles) + 1))

Books2Scrap = {"Index": I, "Title": alltitles, "Price": allprices, "Rating": allratings, "Stocks": allstocks}

# set_index returns a new frame, so no in-place mutation is needed.
df = pd.DataFrame(Books2Scrap).set_index("Index")

df


Unnamed: 0_level_0,Title,Price,Rating,Stocks
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,A Light in the Attic,£51.77,Three,22
2,Tipping the Velvet,£53.74,One,20
3,Soumission,£50.10,One,20
4,Sharp Objects,£47.82,Four,20
5,Sapiens: A Brief History of Humankind,£54.23,Five,20
...,...,...,...,...
996,Alice in Wonderland (Alice's Adventures in Won...,£55.53,One,1
997,"Ajin: Demi-Human, Volume 1 (Ajin: Demi-Human #1)",£57.06,Four,1
998,A Spy's Devotion (The Regency Spies of London #1),£16.97,Five,1
999,1st to Die (Women's Murder Club #1),£53.98,One,1


In [26]:
# Only one thing left to do: export the dataframe in csv format

# Build the destination from the current user's home directory instead of a
# hard-coded "/Users/Luca/..." path, so the notebook also runs on other
# accounts. The Desktop folder must exist.
OUTPUT_CSV = Path.home() / "Desktop" / "df.csv"

df.to_csv(OUTPUT_CSV)