# Book depository web scraping
Website : https://www.bookdepository.com
In this project we are collecting all books from the website using Python BeautifulSoup library. 
The website is having different category books.
The result data is saved into a CSV file, which could be used for data analysis purpose.

In [2]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re


In [3]:
# Function to get clean books data and store into list
# store all lists into dictionary and return it
def getOnePageCleanBooksData(books, url):
    titles = []
    authors = []
    bookPublishDates = []
    bookFormats = []
    salePrices = []
    oldPrices = []
    for book in books:
        
        # to get book title
        title = book.find('h3', class_ = 'title')
        titles.append(title.text.strip('\n '))

        # to get author name
        author = book.find('p', class_ = 'author')
        authors.append(author.text.strip(' \n '))

        # to get book published date
        bookPublishedDate = book.find('p', class_= "published")
        bookPublishDates.append(bookPublishedDate.text.strip())

        # to get book format
        bookFormat = book.find('p', class_= "format")
        bookFormats.append(bookFormat.text.strip())

        # to get book sale price
        price = book.find('span', class_="sale-price")
        if price:
            price_text = book.find('span', class_="sale-price").get_text()
            # remove extra char from value e.g. $
            salePrice = (re.findall(r'\d+[.,]\d+',price_text))
            salePriceFloat = float(salePrice[0].replace(',','.'))
            price = salePriceFloat
        salePrices.append(price)
        
        
        # to get original/old book price
        rrpPrice = book.find('span', class_ = "rrp omnibus")
        if rrpPrice:
            rrpPrice_text = book.find('span', class_="rrp omnibus").get_text()
            # remove extra char from value e.g. $
            oldPrice = (re.findall(r'\d+[.,]\d+',rrpPrice_text))
            oldPriceFloat = float(oldPrice[0].replace(',','.'))
            rrpPrice = oldPriceFloat
       
        if not rrpPrice:
            rrpPrice = price
        
        oldPrices.append(rrpPrice)
        
        category = url.split("/")[-1]
        
    booksDict = {
    'Title': titles,
    'Author': authors,
    'Published date': bookPublishDates,
    'Format': bookFormats,
    'Sale Price': salePrices,
    'Old Price': oldPrices,
     'Category':   category
        
    }   
    return booksDict

        

In [4]:
# get soup result from one url
def getSoup(url):
    results = requests.get(url)
    if(results.status_code == 200):
        soup = BeautifulSoup(results.text, 'lxml')
        books = soup.find_all('div', class_='book-item')
    return books    

In [5]:
# function to get all book category links from Main/Home webpage

def getCategroywiseBooksLinks():
    URL = 'https://www.bookdepository.com/'

    # Fetch all the HTML source from the url
    response = requests.get(URL)

    # Parse HTML and extract links
    soup = BeautifulSoup(response.text, 'html.parser')
    #links = soup.select('a')
    links = soup.find_all('a')
    # To find all category wise links
    searchLinks = []
    for link in links:
        if link.get('href') != None:
            if 'https://' not in link.get('href'):
                searchLinks.append('https://www.bookdepository.com' + link.get('href')) # Convert relative URL to absolute URL

    link_to_check = 'https://www.bookdepository.com/category'
    categori_Links = []
    for searchLink in searchLinks:

        if link_to_check in searchLink:
            categori_Links.append(searchLink)
            
    return categori_Links        

In [6]:
categorywiseBooksUrl = getCategroywiseBooksLinks() #collect all book category url

combineDF = []
dataframe_collections = {} 
#url = 'https://www.bookdepository.com/category/2/Art-Photography'
for url in categorywiseBooksUrl:
    books = getSoup(url)
    booksDict = getOnePageCleanBooksData(books, url)
    booksData = pd.DataFrame(booksDict)  # convert one category books data dictionary into dataframe
    combineDF.append(booksData)
    
    category = url.split("/")[-1] # to get category from url
   # dataframe_collections[category]= booksData
    print(category)
    

Art-Photography
Audio-Books
Biography
Business-Finance-Law
Childrens-Books
Computing
Crafts-Hobbies
Crime-Thriller
Dictionaries-Languages
Entertainment
Fiction
Food-Drink
Graphic-Novels-Anime-Manga
Health
History-Archaeology
Home-Garden
Humour
Medical
Mind-Body-Spirit
Natural-History
Personal-Development
Poetry-Drama
Reference
Religion
Romance
Science-Geography
Science-Fiction-Fantasy-Horror
Society-Social-Sciences
Sport
Stationery
Teaching-Resources-Education
Technology-Engineering
Teen-Young-Adult
Transport
Travel-Holiday-Guides
Books-for-Ages-0-2
Books-for-Ages-3-5
Books-for-Ages-6-8
Books-for-Ages-9-11
Teen-Young-Adult
Autobiography-General
Fantasy
Business-Management
Military-History
Childrens-Books
Fiction
Crime
Graphic-Novels-Anime-Manga
Fiction
Stationery
Childrens-Books
Fiction
Graphic-Novels-Anime-and-Manga
Food-and-Drink
Crafts-and-Hobbies
Art-and-Photography
Biography
Crime-and-Thriller


In [7]:
# to get all categories from url
categoryLists = [] 
for link in categorywiseBooksUrl:
    categoryLists.append(link.split("/")[-1])
    

In [8]:
result=pd.concat(combineDF)  # combine list of dataframes into dataframe

In [9]:
result.shape

(9238, 7)

In [10]:
result.Category.value_counts()

Fiction                           1012
Childrens-Books                    675
Graphic-Novels-Anime-Manga         590
Stationery                         454
Biography                          316
Graphic-Novels-Anime-and-Manga     295
Mind-Body-Spirit                   261
Food-and-Drink                     238
Food-Drink                         238
Poetry-Drama                       227
Art-Photography                    218
Art-and-Photography                218
Home-Garden                        210
Crime-Thriller                     207
Dictionaries-Languages             207
Crime-and-Thriller                 207
Teaching-Resources-Education       204
History-Archaeology                194
Personal-Development               186
Audio-Books                        167
Science-Geography                  161
Religion                           159
Transport                          157
Sport                              154
Humour                             145
Health                   

In [11]:
# write dataFrame to CSV file -( creat file before then execute )
result.to_csv('AllBookData.csv',index=False)