Import necessary libraries

In [41]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

Scrape web page and print df

In [42]:
# Idea for web scraping: scrape Audible best-selling books.
#
# Audible listing pages that have been tried with the scraper. Previously each
# one was assigned to URL in turn, so only the last assignment had any effect;
# keeping them in a mapping makes the choice explicit and the others reusable.
CANDIDATE_URLS = {
    "search_by_popularity": "https://www.audible.com/search?sort=popularity-rank&ref=a_search_l1_catBackAll&pf_rd_p=daf0f1c8-2865-4989-87fb-15115ba5a6d2&pf_rd_r=S4VEHTG7BAZHCPTD9YB7",
    "brandon_sanderson_by_pubdate": "https://www.audible.com/search?crid=31E3J6JGY980O&i=na-audible-us&k=brandon+sanderson&keywords=brandon+sanderson&ref-override=a_search_t1_header_search&sort=pubdate-desc-rank&sprefix=brand%2Cna-audible-us%2C170&url=search-alias%3Dna-audible-us&ref=a_search_c1_sort_1&pf_rd_p=073d8370-97e5-4b7b-be04-aa06cf22d7dd&pf_rd_r=VRAHHY22TC5WB0JFHJFR",
    "bestsellers_category_18573518011": "https://www.audible.com/adblbestsellers?searchCategory=18573518011&ref=a_adblbests_l1_catRefs_10&pf_rd_p=2ea8d46b-3372-49db-8ad4-77416e49695f&pf_rd_r=Z5CN827PYCK5J0XNQ8SM",
    "bestsellers_category_18580540011": "https://www.audible.com/adblbestsellers?searchCategory=18580540011&ref=a_adblbests_l1_catRefs_20&pf_rd_p=2ea8d46b-3372-49db-8ad4-77416e49695f&pf_rd_r=1DF75X5DWF19XQYMVZ07",
}

# Put the link to an Audible Best-Sellers page here, no matter the category:
# either pick one of the candidates above or assign a new link directly.
URL = CANDIDATE_URLS["bestsellers_category_18580540011"]

from scraping import audible_scraper

# Scrape the selected page into a DataFrame and preview its first rows.
df = audible_scraper(URL)
df.head()

Unnamed: 0,Book_Name,Description,Author,Rating,Num_of_Ratings,Regular_Price,Audio_Length,Language
0,The Light We Carry,Overcoming in Uncertain Times,Michelle Obama,5 out of 5 stars,173 ratings,$35.00,9 hrs and 59 mins,English
1,"Friends, Lovers, and the Big Terrible Thing",A Memoir,Matthew Perry,4.5 out of 5 stars,"6,656 ratings",$25.51,8 hrs and 49 mins,English
2,Fairy Tale,,Stephen King,5 out of 5 stars,"38,459 ratings",$26.90,24 hrs and 6 mins,English
3,The Lost Metal,A Mistborn Novel,Brandon Sanderson,5 out of 5 stars,"1,035 ratings",$31.18,18 hrs and 46 mins,English
4,It Starts with Us,A Novel,Colleen Hoover,4.5 out of 5 stars,"15,951 ratings",$23.62,8 hrs and 41 mins,English


In [48]:
from uncategorized import check_book_availability

# Look up several titles in the scraped frame; the helper reports each result itself.
titles_to_check = (
    "The Lost Metal",
    "fjdslkf",
    "Centers of Gravity",
    "The Shepherd's Crown",
)
for title in titles_to_check:
    check_book_availability(title, df)


The book with the title 'The Lost Metal' is present in the catalogue
and is available for buying

The book with the title 'fjdslkf' isn't present in the catalogue

The book with the title 'Centers of Gravity' isn't present in the catalogue

The book with the title 'The Shepherd's Crown' isn't present in the catalogue


Create connection to MySQL

In [44]:
# TODO: move the MySQL-related code and imports into a separate module
from sqlalchemy import create_engine
from env_vars import DB_USER, DB_PASSWORD

def establish_connection(db_name="audible_books_db", db_host="localhost:3306"):
    """Create a SQLAlchemy engine for the MySQL database via the PyMySQL driver.

    The user name and password come from ``DB_USER`` and ``DB_PASSWORD``
    (imported from ``env_vars``). They are expected to be the raw, unescaped
    values: this function percent-encodes them before building the URL.

    Args:
        db_name: Name of the database (schema) to connect to.
        db_host: Host, optionally followed by ``:port``.

    Returns:
        The engine returned by ``create_engine``.
    """
    from urllib.parse import quote

    # Percent-encode the credentials so characters such as "@", ":" or "/"
    # cannot be mistaken for URL delimiters. safe="" also encodes "/", and
    # quote() (unlike quote_plus) never turns a space into "+".
    user = quote(str(DB_USER), safe="")
    password = quote(str(DB_PASSWORD), safe="")
    return create_engine(
        f"mysql+pymysql://{user}:{password}@{db_host}/{db_name}",
        # Ask the pool to replace connections older than 3600 seconds.
        pool_recycle=3600)

engine = establish_connection()

Load df to db

In [45]:
from sqlalchemy.exc import ProgrammingError

# If you load to the DB straight away, you will get duplicate indexes and (maybe) duplicate values.
# To solve this problem, read everything from your DB table into a df,
# append the freshly scraped info to it, and reset the index.
# Warning: may be slow with a large amount of data
def append_to_DB(input_df, con=None):
    """Merge ``input_df`` into the ``books`` table, dropping duplicate books.

    The whole table is read, concatenated with ``input_df``, de-duplicated,
    re-indexed from 0 and written back with ``if_exists='replace'``. Two rows
    count as the same book when Book_Name, Author, Regular_Price, Audio_Length
    and Language all match; the first occurrence (the row already stored) is
    kept, so Rating / Num_of_Ratings of an existing book are not refreshed.

    If reading the table raises ``ProgrammingError`` it is treated as "table
    does not exist yet" and the table is created from ``input_df`` alone.

    Args:
        input_df: Freshly scraped books to add.
        con: Connection or engine to use. Defaults to the notebook-level
            ``engine`` so existing ``append_to_DB(df)`` calls keep working.

    Returns:
        None.
    """
    if con is None:
        con = engine

    # Only the read is guarded: an error raised while writing must not be
    # mistaken for a missing table, because that fallback would overwrite
    # the table with the scraped rows only.
    try:
        all_info_from_db_df = pd.read_sql("select * from books;", con, index_col="id")
    except ProgrammingError:
        print("The table doesn't exist. Creating one and adding info to it right now...")
        input_df.to_sql(name="books", if_exists='replace', con=con, index_label="id")
        return

    # Combine stored and scraped rows, remove duplicates and reset the index.
    df_combined = pd.concat([all_info_from_db_df, input_df]).drop_duplicates(
        subset=["Book_Name", "Author", "Regular_Price", "Audio_Length", "Language"],
        ignore_index=True,
    )
    # Load all info back to the DB, replacing the previous table contents.
    df_combined.to_sql(name="books", if_exists='replace', con=con, index_label="id")
        
# Merge the frame scraped earlier (df) into the books table.
append_to_DB(df)

Read from your db

In [47]:
# Read the whole books table back, using its "id" column as the index, to inspect what was stored.
pd.read_sql("select * from books;", engine, index_col="id")

Unnamed: 0_level_0,Book_Name,Description,Author,Rating,Num_of_Ratings,Regular_Price,Audio_Length,Language
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,The Light We Carry,Overcoming in Uncertain Times,Michelle Obama,5 out of 5 stars,173 ratings,$35.00,9 hrs and 59 mins,English
1,"Friends, Lovers, and the Big Terrible Thing",A Memoir,Matthew Perry,4.5 out of 5 stars,"6,645 ratings",$25.51,8 hrs and 49 mins,English
2,Fairy Tale,,Stephen King,5 out of 5 stars,"38,459 ratings",$26.90,24 hrs and 6 mins,English
3,The Lost Metal,A Mistborn Novel,Brandon Sanderson,5 out of 5 stars,"1,035 ratings",$31.18,18 hrs and 46 mins,English
4,It Starts with Us,A Novel,Colleen Hoover,4.5 out of 5 stars,"15,951 ratings",$23.62,8 hrs and 41 mins,English
5,I'm Glad My Mom Died,,Jennette McCurdy,5 out of 5 stars,"72,429 ratings",$13.99,6 hrs and 26 mins,English
6,Atomic Habits,An Easy & Proven Way to Build Good Habits & Br...,James Clear,5 out of 5 stars,"113,443 ratings",$16.72,5 hrs and 35 mins,English
7,Spare,,Prince Harry The Duke of Sussex,,Not rated yet,$50.40,Not Yet Known,English
8,Desert Star,,Michael Connelly,5 out of 5 stars,"2,388 ratings",$30.79,9 hrs and 37 mins,English
9,Where the Crawdads Sing,,Delia Owens,5 out of 5 stars,"284,017 ratings",$17.92,12 hrs and 12 mins,English
