# Amazon Bestsellers Web Scraping Project
### The aim of this project is to scrape the URLs of the different book categories available on the Amazon.in/books page and to collect the top 50 books in each category.

#### Steps to achieve this
- Scrape the category names and category URLs from the books page
- Save the result as a CSV file
- Write a function to scrape all the relevant book info from each category URL
- Write a function to save the info from each category as a separate CSV file, with all files stored in one folder.

In [1]:
#We will first import all the relevant libraries.
import os
import pandas as pd
from bs4 import BeautifulSoup as bs
import requests
import csv
import time

In [2]:
# Starting point of the project: the Amazon.in books bestsellers page
bestseller_url = (
    'https://www.amazon.in/gp/bestsellers/books/'
    'ref=zg_bs_unv_books_1_1318104031_1'
)

In [3]:
#We will now define a function to scrape the titles and URLs of all the categories present on the page

def scrape_category_info(URL, timeout=30):
    """Scrape the category names and links listed on a bestsellers page.

    Args:
        URL: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A pandas DataFrame with one row per category and the columns
        'name' (text of the category link) and 'url' (the link target
        prefixed with 'https://amazon.in'). The columns are present even
        when no category was found.

    Raises:
        Exception: If the page does not answer with HTTP status 200.
        requests.exceptions.RequestException: If the request itself fails,
            for example when the timeout expires.
    """
    # Without a timeout a stalled connection would block the scraper forever.
    response = requests.get(URL, timeout=timeout)
    if response.status_code != 200:
        raise Exception('Failed to Scrape {}'.format(URL))

    # We use BeautifulSoup to read the html
    soup = bs(response.text, 'html.parser')
    category_tags = soup.find_all('div', {'class': '_p13n-zg-nav-tree-all_style_zg-browse-item__1rdKf _p13n-zg-nav-tree-all_style_zg-browse-height-large__1z5B8'})

    records = []
    # The first match is skipped on purpose (presumably it is not a real
    # sub-category entry -- TODO confirm against the live page).
    for cat in category_tags[1:]:
        link = cat.find('a')
        # Entries without a usable link cannot be followed, so leave them out
        # instead of crashing on them.
        if link is None or not link.get('href'):
            continue
        records.append({
            'name': link.text,
            'url': 'https://amazon.in' + link['href'],
        })

    # Explicit columns keep the frame's layout stable even when it is empty.
    return pd.DataFrame(records, columns=['name', 'url'])

In [7]:
#Write a function that scrapes all the book details of one category page at once and returns them as a pandas DataFrame
def book_info(category_url, timeout=30):
    """Scrape the book listing of one category page into a DataFrame.

    Each column is collected independently, by position, from the tags
    that carry it, and shorter columns are padded with missing values.
    Rows therefore line up only when every listing exposes every field.

    Args:
        category_url: Address of the category page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A pandas DataFrame with the columns 'Name', 'Author',
        'No_Of_Ratings', 'Stars', 'Type' and 'Price'.

    Raises:
        Exception: If the page does not answer with HTTP status 200.
        requests.exceptions.RequestException: If the request itself fails,
            for example when the timeout expires.
    """
    # Without a timeout a stalled connection would block the scraper forever.
    page = requests.get(category_url, timeout=timeout)
    if page.status_code != 200:
        raise Exception("Failed to download {}. Skipping...".format(category_url))
    topic_doc = bs(page.text, 'html.parser')

    # Searching for tags which contain the information we need
    book_name_tags = topic_doc.find_all('div', {'class': '_cDEzb_p13n-sc-css-line-clamp-1_1Fn1y'})
    book_rating_tags = topic_doc.find_all('div', {'class': 'a-icon-row'})
    book_type_tags = topic_doc.find_all('div', {'class': 'a-row a-size-small'})
    book_price_tags = topic_doc.find_all('div', {'class': 'a-row'})

    # Titles and authors share one CSS class and alternate:
    # even positions hold the title, odd positions the author.
    book_name = [tag.text.strip() for tag in book_name_tags[0::2]]
    book_author = [tag.text.strip() for tag in book_name_tags[1::2]]

    # In each rating row the first <span> is the star score and the second
    # the number of ratings. A missing span becomes None so the remaining
    # entries keep their position.
    book_star = []
    book_ratings = []
    for rating_row in book_rating_tags:
        spans = rating_row.find_all('span')
        book_star.append(spans[0].text.strip() if len(spans) > 0 else None)
        book_ratings.append(spans[1].text.strip() if len(spans) > 1 else None)

    # Only every second 'a-row a-size-small' row (odd positions) is read for
    # the book type. Each row is inspected itself, not always the row at
    # index 1.
    book_type = []
    for type_row in book_type_tags[1::2]:
        type_span = type_row.find('span', {'class': 'a-size-small a-color-secondary a-text-normal'})
        book_type.append(type_span.text if type_span is not None else None)

    # Every fourth 'a-row' (positions 4, 8, ...) is checked for a price;
    # rows without one are skipped.
    book_price = []
    for price_row in book_price_tags[4::4]:
        price_span = price_row.find('span', {'class': 'p13n-sc-price'})
        if price_span is not None:
            # Drop the leading currency symbol
            book_price.append(price_span.text.strip()[1:])

    # Creating dictionary
    book_dict = {
        'Name': book_name,
        'Author': book_author,
        'No_Of_Ratings': book_ratings,
        'Stars': book_star,
        'Type': book_type,
        'Price': book_price,
    }
    # Wrapping each list in a Series lets columns of different length share
    # one frame; dtype=object keeps all (text) columns uniformly typed.
    book_df = pd.DataFrame(
        {key: pd.Series(value, dtype=object) for key, value in book_dict.items()}
    )

    return book_df

In [5]:
def scrape_books(URL):
    """Scrape every book category and store the results as CSV files.

    Creates the folder 'AmazonBestSellers' if needed, writes the category
    list to 'categories.csv' inside it, and then writes one
    '<category name>.csv' per category. Files that already exist are
    reported and left untouched, so an interrupted run can be resumed
    without downloading finished categories again.

    Args:
        URL: Address of the bestsellers page that lists the categories.

    Returns:
        None. Progress is reported with print().

    Raises:
        Exception: Propagated from scrape_category_info or book_info when
            a page cannot be downloaded.
    """
    folder = 'AmazonBestSellers'
    os.makedirs(folder, exist_ok=True)
    path = folder + '/'

    category_df = scrape_category_info(URL)
    categories_file = path + 'categories.csv'
    if os.path.exists(categories_file):
        print("The file {} already exists. Skipping...".format(categories_file))
    else:
        category_df.to_csv(categories_file, index=None)

    for _, row in category_df.iterrows():
        # A path separator inside a category name would be read as a
        # sub-directory, so replace it before building the file name.
        file_name = str(row['name']).replace('/', '-').replace('\\', '-')
        book_file = path + '{}.csv'.format(file_name)
        if os.path.exists(book_file):
            print("The file {} already exists. Skipping...".format(book_file))
            # Actually skip: do not download or overwrite this category.
            continue
        print('Scraping top books for "{}"'.format(row['name']))
        book_df = book_info(row['url'])
        book_df.to_csv(book_file, index=None)

    print('Scrapping is now complete.')

In [6]:
# Display the start URL to double-check its value before running the scraper
bestseller_url 

'https://www.amazon.in/gp/bestsellers/books/ref=zg_bs_unv_books_1_1318104031_1'

In [None]:
# Start the full run: scrapes every category reachable from the start URL
# and stores the results as CSV files in the AmazonBestSellers folder
scrape_books(bestseller_url)