In [1]:
import requests
from bs4 import BeautifulSoup as bs
import time
import re

### The workflow:
I decided to scrape the books tagged 'business' from goodreads.com: <br><br>

1. Get the hrefs of the Goodreads lists tagged 'business'.<br>
2. Get the hrefs of all books on page 1 of the first Goodreads list.<br>
3. Build a list of dictionaries (one per book) so that it can later be converted to a DataFrame (df).<br>

In [None]:
# №1: 

def get_list_hrefs(tag:str, page, timeout=30):
    """Return the hrefs of the Goodreads lists shown on one tag-listing page.

    Args:
        tag: Goodreads list tag, inserted verbatim into the URL path.
        page: Page number of the tag listing to fetch.
        timeout: Seconds to wait for the server before ``requests`` raises
            a timeout error.

    Returns:
        The ``href`` value of every ``<a class="listTitle">`` element on the
        page, in document order. Empty when the page has no such anchors.
    """
    url = f'https://www.goodreads.com/list/tag/{tag}?page={page}&utf8=✓'
    # Without a timeout a stalled connection would block this call forever.
    response = requests.get(url, timeout=timeout)
    soup = bs(response.content, 'html.parser')
    # Skip anchors that carry no href instead of failing with KeyError.
    return [link['href']
            for link in soup.find_all('a', {'class':'listTitle'})
            if link.has_attr('href')]

In [None]:
# Collect the hrefs of every Goodreads list on page 1 of the 'business' tag.
list_hrefs = get_list_hrefs(tag='business', page=1)

In [None]:
# №2:

def get_href_from_soups(url:str, timeout=30) -> list:
    """Return the hrefs of all books on one Goodreads list page.

    Args:
        url: Site-relative path of the list, with or without a leading
            slash.
        timeout: Seconds to wait for the server before ``requests`` raises
            a timeout error.

    Returns:
        The ``href`` value of every ``<a class="bookTitle">`` element on the
        page, in document order. Empty when the page has no such anchors.
    """
    # Drop leading slashes so the joined URL never contains '//' after the
    # host, whichever form of the path the caller passes.
    full_url = f"https://www.goodreads.com/{url.lstrip('/')}"
    # Without a timeout a stalled connection would block this call forever.
    response = requests.get(full_url, timeout=timeout)
    soup = bs(response.content, 'html.parser')
    # Skip anchors that carry no href instead of failing with KeyError.
    return [link['href']
            for link in soup.find_all('a', {'class':'bookTitle'})
            if link.has_attr('href')]

In [None]:
# Collect the book hrefs found on the first 'business' Goodreads list.
href_book_1 = get_href_from_soups(url=list_hrefs[0])

In [None]:
# №3:

def from_href_to_infos(href_list:list, timeout=30) -> list:
    """Scrape each Goodreads book page in *href_list* into a dict.

    Args:
        href_list: Site-relative book paths; each one is appended directly
            to ``https://www.goodreads.com``.
        timeout: Seconds to wait for each page before ``requests`` raises
            a timeout error.

    Returns:
        One dict per book, in input order, with the keys ``title``,
        ``author``, ``author_link``, ``avg_rating``, ``rating_count``,
        ``genres``, ``description``, ``book_url``, ``num_pages``, ``year``,
        ``cover`` and ``quotes_url``. A scalar field whose element is
        missing from the page is ``None``; list fields are then empty.

    Raises:
        requests.HTTPError: If a book page answers with an HTTP error
            status, so error pages are not parsed as books.
    """

    def text_of(node):
        """Return the text of *node*, or None when the element is absent."""
        return node.get_text() if node is not None else None

    def get_title(soup):
        """Return the text of the page's <title>, or None."""
        return text_of(soup.title)

    def get_author_name(soup):
        """Return every itemprop="name" span with whitespace collapsed."""
        return [re.sub(r'\s+', ' ', item.get_text())
                for item in soup.find_all('span', {'itemprop':'name'})]

    def get_author_link(soup):
        """Return the href of every authorName anchor."""
        return [tag['href']
                for tag in soup.find_all('a', {'class':'authorName'})
                if tag.has_attr('href')]

    def get_rating(soup):
        """Return the average rating as a float, or None if unavailable."""
        raw = text_of(soup.find('span', {'itemprop':'ratingValue'}))
        try:
            return float(raw)
        except (TypeError, ValueError):
            # TypeError: element missing; ValueError: text is not a number.
            return None

    def get_rating_count(soup):
        """Return the number of ratings as an int, or None if unavailable."""
        node = soup.find('meta', {'itemprop':'ratingCount'})
        if node is None:
            return None
        # A <meta> element normally carries its value in `content`; fall
        # back to the element text for markup that puts the count there.
        raw = node.get('content') or node.get_text()
        match = re.search(r'\d[\d,]*', raw)
        return int(match.group().replace(',', '')) if match else None

    def get_genre(soup):
        """Return the genre link texts, de-duplicated in page order."""
        links = soup.find_all('a', {'class':'actionLinkLite bookPageGenreLink'})
        # dict.fromkeys removes duplicates and keeps first-seen order.
        return list(dict.fromkeys(item.get_text() for item in links))

    def get_book_description(soup):
        """Return the description text without newlines, or None."""
        raw = text_of(soup.find('div', { 'id' : 'description'}))
        return raw.replace('\n', '') if raw is not None else None

    def get_pages_num(soup):
        """Return the page count as an int, or None if unavailable."""
        raw = text_of(soup.find('span', { 'itemprop' : 'numberOfPages'}))
        try:
            return int(raw.split()[0])
        except (AttributeError, IndexError, ValueError):
            # Element missing, empty text, or first token is not a number.
            return None

    def get_year_publ(soup):
        """Return the publication year from the first 'Published' row.

        Returns 0 when that row holds no four-digit year and None when no
        row mentions 'Published' at all.
        """
        for item in soup.find_all('div', { 'class' : 'row'}):
            text = item.get_text()
            if 'Published' not in text:
                continue
            # Take the first four-digit number after 'Published' rather
            # than a fixed token position, so dates with or without a
            # day or month all parse.
            match = re.search(r'\b\d{4}\b', text[text.index('Published'):])
            return int(match.group()) if match else 0
        return None

    def get_image_url(soup):
        """Return the src of every img with id="coverImage"."""
        return [tag['src']
                for tag in soup.find_all('img', { 'id' : 'coverImage'})
                if tag.has_attr('src')]

    def get_url_quotes(soup):
        """Return the link hrefs inside the last h2.brownBackground header.

        Returns an empty list when the page has no such header.
        """
        headers = soup.find_all('h2', { 'class' : 'brownBackground'})
        if not headers:
            return []
        return [tag['href'] for tag in headers[-1].find_all('a')
                if tag.has_attr('href')]

    all_books = []
    for href in href_list:
        url = f'https://www.goodreads.com{href}'
        # Without a timeout a stalled connection would block the scrape.
        response = requests.get(url, timeout=timeout)
        # Surface HTTP errors instead of recording an all-empty entry.
        response.raise_for_status()
        soup = bs(response.content, 'html.parser')

        all_books.append({
            'title' : get_title(soup),
            'author' : get_author_name(soup),
            'author_link' : get_author_link(soup),
            'avg_rating' : get_rating(soup),
            'rating_count' : get_rating_count(soup),
            'genres' : get_genre(soup),
            'description' : get_book_description(soup),
            'book_url' : url,
            'num_pages' : get_pages_num(soup),
            'year' : get_year_publ(soup),
            'cover' : get_image_url(soup),
            'quotes_url' : get_url_quotes(soup),
        })

    return all_books