In [36]:
import requests
import pandas as pd
import numpy as np
import os
import time
from IPython.display import display
from bs4 import BeautifulSoup
import json
from datetime import datetime

In [37]:

# Fetch data from all pages and prepare it (cleaning, formatting, replacing values, null checks, duplicate checks, calculated fields, parameters, etc.)


# ---- Scraper configuration and shared state --------------------------------

# URL of the first search page (10 items per page).
url = "https://api.bookscouter.com/v4/search/new?term=all+books&itemsPerPage=10&page=1"

# HTTP headers sent with every API request; they mirror what a desktop
# Chrome 115 session on bookscouter.com sends.
headers = {
    # Request routing / content negotiation
    "authority": "api.bookscouter.com",
    "accept": "application/ld+json",
    "accept-language": "en-US,en;q=0.9",
    "cookie": "_gcl_au=1.1.174576093.1691816838",
    "origin": "https://bookscouter.com",
    "referer": "https://bookscouter.com/",
    # Browser client hints
    "sec-ch-ua": '"Not/A)Brand";v="99", "Google Chrome";v="115", "Chromium";v="115"',
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": '"Windows"',
    # Fetch metadata
    "sec-fetch-dest": "empty",
    "sec-fetch-mode": "cors",
    "sec-fetch-site": "same-site",
    "user-agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/115.0.0.0 Safari/537.36"
    ),
}

# Rating-count cut-offs used to tier publishers (highest tier first).
famous_threshold, high_threshold, medium_threshold = 1000, 500, 200

# Reference year for the "Books Age" calculated field.
current_year = datetime.now().year

# Accumulators filled while paging through the API.
data_list = []              # one dict per retained book, across all pages
publisher_popularity = {}   # publisher name -> running sum of rating counts

# Calculate sentiment label based on conditions
def determine_label(rating_value, rating_category):
    """Derive a reader-sentiment label from a rating and its category.

    Args:
        rating_value: Numeric aggregate rating of the book.
        rating_category: Category label for the same rating
            (e.g. 'Excellent', 'Good').

    Returns:
        'Happy' when the rating is at least 4.5 and categorised 'Excellent',
        'Neutral' when the rating is in [4, 4.5) and categorised 'Good',
        'Sad' in every other case.
    """
    if rating_value >= 4.5 and rating_category == 'Excellent':
        return 'Happy'
    # Half-open upper bound: a closed `<= 4.4` would leave ratings between
    # 4.4 and 4.5 (e.g. 4.45) labelled 'Sad' despite being 'Good'.
    elif 4 <= rating_value < 4.5 and rating_category == 'Good':
        return 'Neutral'
    else:
        return 'Sad'

# ---- Categorisation helpers -------------------------------------------------
# Defined once here instead of being re-created for every item in the loop.

def categorize_publisher_popularity(popularity):
    """Return the popularity tier for a publisher's accumulated rating count.

    The value is compared against the module-level ``famous_threshold``,
    ``high_threshold`` and ``medium_threshold``, highest tier first.
    """
    if popularity >= famous_threshold:
        return "Famous"
    elif popularity >= high_threshold:
        return "High"
    elif popularity >= medium_threshold:
        return "Medium"
    else:
        return "Low"


def categorize_book_age(age):
    """Bucket a book's age in years into a coarse age label."""
    if age <= 1:
        return "New Release"
    elif age <= 5:
        return "Recent"
    elif age <= 10:
        return "Moderate Age"
    else:
        return "Old"


def categorize_book_rating(rating_value):
    """Bucket a numeric rating into a rating category label."""
    if rating_value >= 4.5:
        return "Excellent"
    elif rating_value >= 4:
        return "Good"
    elif rating_value >= 3.5:
        return "Average"
    else:
        return "Below Average"


# ---- Fetch every result page and build one cleaned record per book ----------
page = 1
while True:
    url = f"https://api.bookscouter.com/v4/search/new?term=all+books&itemsPerPage=10&page={page}"

    # A timeout keeps a stalled connection from hanging the cell forever.
    try:
        response = requests.get(url, headers=headers, timeout=30)
    except requests.RequestException as exc:
        print(f"Request failed: {exc}")
        break  # Exit the loop in case of a connection error

    if response.status_code != 200:
        print(f"Request failed with status code: {response.status_code}")
        break  # Exit the loop in case of an error

    json_data = response.json()
    items = json_data.get('hydra:member', [])

    if not items:
        break  # Break the loop if there are no more items

    for item in items:

        #----------------------------------------------------------------------------------------------------------------

        # Ratings are read first because the publisher popularity total
        # below accumulates this item's rating count. `or` also covers an
        # explicit JSON null, which `.get(key, default)` would pass through.
        aggregate_rating = item.get('aggregateRating') or {}
        rating_count = aggregate_rating.get('ratingCount') or 0  # Get rating count or use 0 if not available
        rating_value = aggregate_rating.get('ratingValue') or 0

        #----------------------------------------------------------------------------------------------------------------

        authors = ', '.join(item.get('author') or [])  # Join author names using comma separator
        if not authors:  # Check for null or empty authors
            authors = "Unknown"

        #----------------------------------------------------------------------------------------------------------------

        category_entries = item.get('categories') or []
        categories = ', '.join([category['name'] for category in category_entries])
        if not categories:  # Check for null or empty categories
            categories = "N/A"

        # Category Count: how many categories the book belongs to (calculated field).
        category_count = len(category_entries)

        #----------------------------------------------------------------------------------------------------------------

        description_html = item.get('description') or ''
        description_text = BeautifulSoup(description_html, 'html.parser').get_text()  # Remove HTML tags
        if not description_text:  # Check for null or empty description
            description_text = "No Info"

        #----------------------------------------------------------------------------------------------------------------

        publisher = item.get('publisher')
        if publisher is None or publisher == '':
            publisher = "Unknown"
        elif isinstance(publisher, dict):
            # A dict without a usable name cannot serve as a dict key below.
            publisher = publisher.get('name') or "Unknown"

        # Running total of rating counts per publisher. The tier stored on
        # each row reflects the total at the time that row is processed.
        publisher_popularity[publisher] = publisher_popularity.get(publisher, 0) + rating_count
        publisher_popularity_category = categorize_publisher_popularity(publisher_popularity[publisher])

        #----------------------------------------------------------------------------------------------------------------

        binding = item.get('binding')
        if binding is None or binding == '':
            binding = "Normal"
        elif isinstance(binding, dict):
            # Keep the column hashable so drop_duplicates() can process it.
            binding = binding.get('name') or "Normal"

        #----------------------------------------------------------------------------------------------------------------

        num_pages = item.get('numberOfPages')
        if num_pages is None or num_pages == '':
            continue  # Skip the item

        #----------------------------------------------------------------------------------------------------------------

        published_date = item.get('publishedDate')
        if not published_date:
            continue  # Skip the item

        # Calculated field 'Publication Year'.
        try:
            pub_year = datetime.strptime(published_date, "%Y-%m-%d").year
        except (ValueError, TypeError):
            pub_year = current_year  # If date format is incorrect, use current year as default

        #----------------------------------------------------------------------------------------------------------------

        booksage = current_year - pub_year  # Calculated field 'Books Age'

        #----------------------------------------------------------------------------------------------------------------

        # Calculated field 'Price Category'.
        amazon_price = item.get('amazonLowestPrice')
        if not amazon_price:
            amazon_price = 0

        price_category = "Low Price"
        if amazon_price > 50:
            price_category = "High Price"
        elif amazon_price > 20:
            price_category = "Moderate Price"

        #----------------------------------------------------------------------------------------------------------------

        # Calculated field 'Price to Rating Value': ratio of the Amazon price
        # to the aggregate rating, a rough price-value indicator.
        price_to_rating_ratio = amazon_price / (rating_value + 1)  # Adding 1 to avoid division by zero

        #----------------------------------------------------------------------------------------------------------------

        age_category = categorize_book_age(booksage)
        rating_category = categorize_book_rating(rating_value)
        sentiment = determine_label(rating_value, rating_category)

        #----------------------------------------------------------------------------------------------------------------

        data = {
            "ID": item.get('isbn13'),
            "Title": item.get('title'),
            "Amazon Price": item.get('amazonLowestPrice'),
            "Price Category": price_category,
            "Price to Rating Value": price_to_rating_ratio,
            "Authors": authors,
            "Published Date": published_date,
            "Publication Year": pub_year,
            "Publisher": publisher,
            "Publisher Popularity": publisher_popularity_category,
            "Binding": binding,
            "Books Age": booksage,
            "Age Category": age_category,
            "Number of Pages": num_pages,
            "Amazon Sales Rank": item.get('amazonSalesRank'),
            "Amazon Count": item.get('amazonCount'),
            "Description": description_text,
            "Categories": categories,
            "Category Count": category_count,
            # Raw value (may be None) so rows without a rating are still
            # removed by the later dropna().
            "Aggregate Rating": aggregate_rating.get('ratingValue'),
            "Rating Count": rating_count,
            "Rating Category": rating_category,
            "Readers Sentiment": sentiment
        }
        data_list.append(data)

    page += 1  # Move to the next page



# Build the table from the collected records, then discard every row that
# has a missing value and every exact duplicate row.
df = (
    pd.DataFrame(data_list)
    .dropna()
    .drop_duplicates()
)

# Render the cleaned table
display(df)



  description_text = BeautifulSoup(description_html, 'html.parser').get_text()  # Remove HTML tags


Unnamed: 0,ID,Title,Amazon Price,Price Category,Price to Rating Value,Authors,Published Date,Publication Year,Publisher,Publisher Popularity,...,Number of Pages,Amazon Sales Rank,Amazon Count,Description,Categories,Category Count,Aggregate Rating,Rating Count,Rating Category,Readers Sentiment
0,9780448402147,All Aboard Airplanes (All Aboard Books),5.42,Low Price,0.967857,"Evans, Frank",1994-05-04,1994,Grosset&Dunlap,Low,...,32,2192714,21,"Briefly looks at the history of airplanes, and...","Literature & Fiction, Cars, Trains & Things Th...",2,4.6,9,Excellent,Happy
1,9781250859020,Book of All Books,13.46,Low Price,2.639216,"Calasso, Roberto",2022-11-15,2022,Picador Paper,Low,...,464,140624,28,No Info,Bible Study & Reference,1,4.1,6,Good,Neutral
2,9781896616537,All About Arthritis (All About Books),5.95,Low Price,1.144231,"Flynn M.B.A., Laura",2017-03-15,2017,MediScript Communications. Inc.,Low,...,56,0,6,The goal of this book is to help you to better...,"Health, Fitness & Dieting",1,4.2,1,Good,Neutral
3,9781896616544,All About Strokes (All About Books),3.75,Low Price,0.765306,"Flynn M.B.A., Laura",2017-03-22,2017,MediScript Communications. Inc.,Low,...,50,10459302,8,A stroke is usually the result of a clot that ...,Nursing,1,3.9,8,Average,Sad
4,9798636721772,All About Fish: From All About Books For Kids ...,7.99,Low Price,1.536538,"Arning, L.E.",2020-04-28,2020,Independently published,Low,...,29,637807,2,No Info,"Animals, Arts, Music & Photography, Comics & G...",5,4.2,1,Good,Neutral
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2292,9781935694359,Dreaming of Arches National Park (An education...,7.94,Low Price,1.443636,"Grant Collier, Stephanie Lowman, All Dreaming ...",2019-03-06,2019,Collier Publishing LLC,Low,...,36,1539827,10,Dreaming of Arches National Park tells the sto...,"History, United States",2,4.5,10,Excellent,Happy
2293,9781935694427,Dreaming of Rocky Mountain National Park (An e...,11.32,Low Price,1.985965,"Grant Collier, Stephanie Lowman, All Dreaming ...",2020-04-07,2020,Collier Publishing LLC,Low,...,36,1624001,8,Dreaming of Rocky Mountain National Park tells...,"History, Science Fiction & Fantasy, Americas, ...",4,4.7,6,Excellent,Happy
2294,9781935694502,Dreaming of California (An educational childre...,18.94,Low Price,3.507407,"Grant Collier, Stephanie Lowman, All Dreaming ...",2020-09-19,2020,Collier Publishing LLC,Low,...,40,335595,1,"""Dreaming of California"" tells the story of Pa...","History, Science Fiction & Fantasy, United States",3,4.4,5,Good,Neutral
2295,9781935694595,Dreaming of Arizona (An educational children's...,12.91,Low Price,2.634694,"Grant Collier, Stephanie Lowman, All Dreaming ...",2021-10-08,2021,Collier Publishing LLC,Low,...,40,889942,7,About the Author\nGrant Collier has been worki...,"History, United States",2,3.9,2,Average,Sad


In [38]:
# Persist the cleaned table as CSV, leaving out the DataFrame index column.
# Change the filename if needed.
df.to_csv(path_or_buf='book_data.csv', index=False)