# bookdepository.com bestsellers webscaping
### https://data36.com/beautiful-soup-tutorial-web-scraping/

In [59]:
import pandas as pd
from bs4 import BeautifulSoup as bs
import requests

# Basics

In [60]:
#The URL we want to scape:
url = "https://www.bookdepository.com/bestsellers" 

response = requests.get(url)

In [61]:
#If the response value (status code) = 200 then the request was successful
response

<Response [200]>

In [62]:
#Get the HTML contet (that's what we need)
html = response.content

In [63]:
#Creating a BeautifulSoup object
#lxml is the parser
#soup is more readable than html 
soup = bs(html, "lxml")

# bs methods:

In [8]:
#Get the title html
soup.title

<title>
	Bestselling books online with free delivery at Book Depository</title>

In [13]:
#Get only the h1 text
soup.h1.get_text()

'Bestsellers – our most popular items, updated daily.'

In [14]:
#Get only an element's attribute:
soup.a["href"]

'/help/topic/HelpId/53/How-we-use-cookies#helpContent'

In [15]:
#Return only 1 element
soup.find("h1")

<h1>Bestsellers – our most popular items, updated daily.</h1>

In [17]:
#Returns all element - in a list!  -- soup("h2") does the same!
soup.find_all('h2')

[<h2>We use cookies to improve this site</h2>,
 <h2>Are you happy to accept cookies?</h2>,
 <h2>Cookie Preferences</h2>,
 <h2>Essential</h2>,
 <h2>Performance and Analytics</h2>,
 <h2>Advertising</h2>,
 <h2>Top Authors</h2>,
 <h2>Bestselling Series</h2>,
 <h2>Books By Language</h2>,
 <h2>Filter your search</h2>]

# Extract Data - Getting the book titles

In [25]:
all_h3 = soup.find_all("h3", class_="title") #_ after the class is bcs the "class"
for h3 in all_h3:
    print(h3.get_text(strip = True))       #Strip -- removes whitespaces

Letters to You
It Ends With Us: The most heartbreaking novel you'll ever read
Seven Husbands of Evelyn Hugo
Heaven Official's Blessing: Tian Guan Ci Fu (Novel) Vol. 2
Chainsaw Man, Vol. 2
Ugly Love
The Spanish Love Deception
Grandmaster of Demonic Cultivation
The Midnight Library
Atomic Habits
Heaven Official's Blessing
Fast 800 Keto
Reminders of Him
Verity
November 9
Chainsaw Man, Vol. 4
Chainsaw Man, Vol. 3
The Song of Achilles
Chainsaw Man, Vol. 9
Normal People
The Love Hypothesis
The Scum Villain's Self-Saving System
Before the Coffee Gets Cold
The Heart Principle
The Real Anthony Fauci
Where the Crawdads Sing
Chainsaw Man, Vol. 6
Chainsaw Man, Vol. 5
The Body Keeps the Score
The Hating Game


# Get the book formats

In [27]:
#find all paragraph elements with the class of format that are inside of a div element with the class of item-info
#.select() is similar than find_all(), but van handle css selectors
formats = soup.select("div.item-info p.format")

In [29]:
formats  #Its a list again!

[<p class="format">Hardback</p>,
 <p class="format">Paperback</p>,
 <p class="format">Paperback</p>,
 <p class="format">Paperback</p>,
 <p class="format">Paperback</p>,
 <p class="format">Paperback</p>,
 <p class="format">Paperback</p>,
 <p class="format">Paperback</p>,
 <p class="format">Paperback</p>,
 <p class="format">Paperback</p>,
 <p class="format">Paperback</p>,
 <p class="format">Paperback</p>,
 <p class="format">Paperback</p>,
 <p class="format">Paperback</p>,
 <p class="format">Paperback</p>,
 <p class="format">Paperback</p>,
 <p class="format">Paperback</p>,
 <p class="format">Paperback</p>,
 <p class="format">Paperback</p>,
 <p class="format">Paperback</p>,
 <p class="format">Paperback</p>,
 <p class="format">Paperback</p>,
 <p class="format">Paperback</p>,
 <p class="format">Paperback</p>,
 <p class="format">Hardback</p>,
 <p class="format">Paperback</p>,
 <p class="format">Paperback</p>,
 <p class="format">Paperback</p>,
 <p class="format">Paperback</p>,
 <p class="forma

In [30]:
#Creating a pandas series
formats_series = pd.Series(formats)

In [31]:
formats_series

0      [Hardback]
1     [Paperback]
2     [Paperback]
3     [Paperback]
4     [Paperback]
5     [Paperback]
6     [Paperback]
7     [Paperback]
8     [Paperback]
9     [Paperback]
10    [Paperback]
11    [Paperback]
12    [Paperback]
13    [Paperback]
14    [Paperback]
15    [Paperback]
16    [Paperback]
17    [Paperback]
18    [Paperback]
19    [Paperback]
20    [Paperback]
21    [Paperback]
22    [Paperback]
23    [Paperback]
24     [Hardback]
25    [Paperback]
26    [Paperback]
27    [Paperback]
28    [Paperback]
29    [Paperback]
dtype: object

In [32]:
formats_series.value_counts()

[Paperback]    28
[Hardback]      2
dtype: int64

# Getting the publication dates

In [38]:
dates = soup.find_all("p", class_="published")
dates = [date.get_text()[-4:] for date in dates] #some magic what i dont exactly understand right now
dates_series = pd.Series(dates)
dates_series.value_counts()

2021    13
2022     6
2019     3
2016     2
2015     2
2017     2
2020     1
2018     1
dtype: int64

# Getting the prices

In [None]:
final_prices = []
prices = soup.find_all("p", class_="price")

for price in prices:
    original_price = price.find("span", class_="rrp")
    if original_price:
        current_price = str(original_price.previousSibling).strip()
        current_price = float(current_price.split("€")[0].replace(",", "."))
    else:
        current_price = float(price.get_text(strip=True).split("€")[0].replace(",", "."))
        final_prices.append(current_price)