Import necessary libraries

In [18]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

Scrape web page and print df

In [29]:
# Idea for web scraping: scrape Audible best-selling books.
# Any Audible listing page can be used here (search results or a best-sellers
# category). Candidates are kept by name so that switching pages is a one-line
# change instead of a chain of reassignments where only the last one counts.
AUDIBLE_PAGES = {
    # Search results sorted by popularity.
    "search_by_popularity": "https://www.audible.com/search?sort=popularity-rank&ref=a_search_l1_catBackAll&pf_rd_p=daf0f1c8-2865-4989-87fb-15115ba5a6d2&pf_rd_r=S4VEHTG7BAZHCPTD9YB7",
    # Search for "brandon sanderson", sorted by publication date (newest first).
    "brandon_sanderson_newest": "https://www.audible.com/search?crid=31E3J6JGY980O&i=na-audible-us&k=brandon+sanderson&keywords=brandon+sanderson&ref-override=a_search_t1_header_search&sort=pubdate-desc-rank&sprefix=brand%2Cna-audible-us%2C170&url=search-alias%3Dna-audible-us&ref=a_search_c1_sort_1&pf_rd_p=073d8370-97e5-4b7b-be04-aa06cf22d7dd&pf_rd_r=VRAHHY22TC5WB0JFHJFR",
    # Best-sellers pages, keyed by their `searchCategory` id.
    "bestsellers_category_18573518011": "https://www.audible.com/adblbestsellers?searchCategory=18573518011&ref=a_adblbests_l1_catRefs_10&pf_rd_p=2ea8d46b-3372-49db-8ad4-77416e49695f&pf_rd_r=Z5CN827PYCK5J0XNQ8SM",
    "bestsellers_category_18580540011": "https://www.audible.com/adblbestsellers?searchCategory=18580540011&ref=a_adblbests_l1_catRefs_20&pf_rd_p=2ea8d46b-3372-49db-8ad4-77416e49695f&pf_rd_r=1DF75X5DWF19XQYMVZ07",
}

# Key of the page to scrape; change this to pick another candidate.
SELECTED_PAGE = "bestsellers_category_18580540011"
URL = AUDIBLE_PAGES[SELECTED_PAGE]

from scraping import audible_scraper

# Scrape the selected page into a DataFrame and preview the first rows.
df = audible_scraper(URL)
df.head()

Unnamed: 0,Book_Name,Description,Author,Rating,Num_of_Ratings,Regular_Price,Audio_Length,Language
0,Sapiens,A Brief History of Humankind,Yuval Noah Harari,4.5 out of 5 stars,"51,616 ratings",$36.50,15 hrs and 18 mins,English
1,Starry Messenger,Cosmic Perspectives on Civilization,Neil deGrasse Tyson,5 out of 5 stars,"1,355 ratings",$15.49,7 hrs and 17 mins,English
2,Braiding Sweetgrass,"Indigenous Wisdom, Scientific Knowledge and th...",Robin Wall Kimmerer,5 out of 5 stars,"8,479 ratings",$34.99,16 hrs and 44 mins,English
3,The Song of the Cell,An Exploration of Medicine and the New Human,Siddhartha Mukherjee,4.5 out of 5 stars,81 ratings,$28.34,16 hrs and 3 mins,English
4,Breath,The New Science of a Lost Art,James Nestor,4.5 out of 5 stars,"7,774 ratings",$14.95,7 hrs and 18 mins,English


In [21]:
def check_book_availability(book_name_to_check, df):
    """Print whether a book is in the catalogue and whether it can be bought.

    A book counts as "available for buying" when at least one catalogue entry
    with that exact title has been rated; the presence of a rating is used as
    a proxy for availability.

    Args:
        book_name_to_check: Exact title to look up in the ``Book_Name`` column.
        df: Catalogue DataFrame with ``Book_Name``, ``Rating`` and
            ``Num_of_Ratings`` columns.

    Returns:
        None. The result is reported via ``print``.
    """
    matches = df.loc[df["Book_Name"] == book_name_to_check]

    # Unknown title: report it and stop straight away.
    if matches.empty:
        print(f"\nThe book with the title '{book_name_to_check}' isn't present in the catalogue")
        return

    print(f"\nThe book with the title '{book_name_to_check}' is present in the catalogue")

    # Compare element-wise. Reducing the raw object array with `.all()` first
    # (and then comparing the reduced value to a string / None) does not test
    # the cell contents and behaves differently across NumPy versions.
    # `isna()` covers both None and NaN for a missing rating.
    unrated = (matches["Num_of_Ratings"] == "Not rated yet") | matches["Rating"].isna()

    if unrated.all():
        print("but unfortunately isn't available for buying yet")
    else:
        print("and is available for buying")

# Spot-check a handful of titles (including a nonsense one) against the scraped catalogue.
for title in ("The Lost Metal", "fjdslkf", "Centers of Gravity", "The Shepherd's Crown"):
    check_book_availability(title, df)


The book with the title 'The Lost Metal' isn't present in the catalogue

The book with the title 'fjdslkf' isn't present in the catalogue

The book with the title 'Centers of Gravity' isn't present in the catalogue

The book with the title 'The Shepherd's Crown' isn't present in the catalogue


Create connection to MySQL

In [22]:
# TODO: move the MySQL-related code and imports into a separate module
from sqlalchemy import create_engine
from env_vars import DB_USER, DB_PASSWORD

def establish_connection():
    """Create a SQLAlchemy engine for the local ``audible_books_db`` MySQL database.

    Connects through the ``pymysql`` driver to ``localhost:3306`` using the
    ``DB_USER`` / ``DB_PASSWORD`` values imported from ``env_vars``.

    Returns:
        The engine returned by ``sqlalchemy.create_engine``.
    """
    # Function-scope import so the cell stays self-contained.
    from urllib.parse import quote

    DB_TO_WORK_WITH = "audible_books_db"
    DB_HOST = "localhost:3306"

    # Credentials are embedded in a URL, so characters such as '@', ':', '/'
    # or '%' must be percent-encoded or the URL is parsed incorrectly.
    # `safe=""` makes sure '/' is encoded as well. Values without special
    # characters pass through unchanged.
    user = quote(DB_USER, safe="")
    password = quote(DB_PASSWORD, safe="")

    return create_engine(
        f"mysql+pymysql://{user}:{password}@{DB_HOST}/{DB_TO_WORK_WITH}",
        # Recycle pooled connections older than one hour.
        pool_recycle=3600)

# Module-level engine shared by the cells below that read from / write to the database.
engine = establish_connection()

Load df to db

In [23]:
def append_to_DB(input_df, con=None):
    """Merge freshly scraped books into the ``books`` table without duplicates.

    Writing ``input_df`` straight to the table would produce duplicate index
    values and (possibly) duplicate rows. Instead, the whole table is read
    into a DataFrame, the new rows are appended, duplicates are dropped, the
    index is renumbered from 0, and the table is replaced with the result.

    Two rows are considered duplicates when they agree on ``Book_Name``,
    ``Author``, ``Regular_Price``, ``Audio_Length`` and ``Language``; the row
    that is already stored is the one that is kept.

    Warning: every call reads and rewrites the full table, so this may be slow
    with a large amount of data.

    Args:
        input_df: DataFrame of scraped books containing at least the columns
            used for duplicate detection.
        con: Database connection or engine handed to pandas. Defaults to the
            notebook-level ``engine`` when omitted.

    Note:
        The ``books`` table is read before it is written, so it presumably has
        to exist already - TODO confirm the first-run behaviour.
    """
    if con is None:
        # Fall back to the shared notebook engine (looked up at call time).
        con = engine

    stored_books = pd.read_sql("select * from books;", con, index_col="id")

    # Stored rows come first so that drop_duplicates (keep="first") prefers them;
    # ignore_index renumbers the result 0..n-1 to serve as the new `id`.
    combined_books = pd.concat([stored_books, input_df]).drop_duplicates(
        subset=["Book_Name", "Author", "Regular_Price", "Audio_Length", "Language"],
        ignore_index=True,
    )

    # Replace the table with the de-duplicated data, writing the index as `id`.
    combined_books.to_sql(name="books", if_exists="replace", con=con, index_label="id")

# Merge the freshly scraped frame into the `books` table.
append_to_DB(df)

Read from your db

In [24]:
# Read the whole `books` table back, using its `id` column as the index, to inspect what is stored.
pd.read_sql("select * from books;", engine, index_col="id")

Unnamed: 0_level_0,Book_Name,Description,Author,Rating,Num_of_Ratings,Regular_Price,Audio_Length,Language
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,The Light We Carry,Overcoming in Uncertain Times,Michelle Obama,5 out of 5 stars,21 ratings,$35.00,9 hrs and 59 mins,English
1,"Friends, Lovers, and the Big Terrible Thing",A Memoir,Matthew Perry,4.5 out of 5 stars,"5,548 ratings",$25.51,8 hrs and 49 mins,English
2,The Lost Metal,A Mistborn Novel,Brandon Sanderson,5 out of 5 stars,80 ratings,$31.18,18 hrs and 46 mins,English
3,Fairy Tale,,Stephen King,5 out of 5 stars,"37,056 ratings",$26.90,24 hrs and 6 mins,English
4,It Starts with Us,A Novel,Colleen Hoover,4.5 out of 5 stars,"15,009 ratings",$23.62,8 hrs and 41 mins,English
...,...,...,...,...,...,...,...,...
68,Elon Musk,"Tesla, SpaceX, and the Quest for a Fantastic F...",Ashlee Vance,4.5 out of 5 stars,"56,968 ratings",$27.90,13 hrs and 23 mins,English
69,Astrophysics for People in a Hurry,,Neil deGrasse Tyson,4.5 out of 5 stars,"36,764 ratings",$17.47,3 hrs and 41 mins,English
70,The Dawn of Everything,A New History of Humanity,David Graeber,4.5 out of 5 stars,"2,241 ratings",$34.61,24 hrs and 2 mins,English
71,Lifespan,Why We Age - and Why We Don't Have To,David A. Sinclair PhD,4.5 out of 5 stars,"6,704 ratings",$18.49,11 hrs and 55 mins,English
