## Importing Libraries

In [1]:
import time
import requests
from bs4 import BeautifulSoup
import csv
import re

## Main Functions

In [3]:
classic_url = 'https://www.goodreads.com/list/tag/classics?page=1' # Base genre (tag) URL to scrape; this example uses page 1 of the "classics" tag.


def select_lists_urls(classic_url, timeout=30):
    """Collect the absolute URLs of the book lists linked from one genre page.

    Args:
        classic_url: URL of the genre (tag) page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook indefinitely.

    Returns:
        A list of ``https://www.goodreads.com``-prefixed list URLs, one per
        element with class ``listTitle`` that carries an ``href``. Returns an
        empty list when the request fails or nothing matches.
    """
    try:
        response = requests.get(
            classic_url,
            headers={'User-Agent': 'Mozilla/5.0'},
            timeout=timeout,
        )
        response.raise_for_status()  # Turn 4xx/5xx responses into exceptions
        soup = BeautifulSoup(response.content, 'html.parser')
        classic_list_urls = soup.find_all(class_="listTitle")
        if not classic_list_urls:
            print("No elements found with the given selector")
        # Skip matches without an href instead of raising KeyError, which the
        # except clause below would not catch.
        return [
            f"https://www.goodreads.com{link['href']}"
            for link in classic_list_urls
            if link.get('href')
        ]
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return []


In [5]:
# Collect the list URLs found on the genre page defined above.
classic_genre_url = select_lists_urls(classic_url)
classic_genre_url[0:5] # Preview the first 5 list URLs.

['https://www.goodreads.com/list/show/264.Books_That_Everyone_Should_Read_At_Least_Once',
 'https://www.goodreads.com/list/show/2491.Must_Read_Books_Different_Genres',
 'https://www.goodreads.com/list/show/1190.I_m_glad_someone_made_me_read_this_book',
 'https://www.goodreads.com/list/show/952.1001_Books_You_Must_Read_Before_You_Die',
 'https://www.goodreads.com/list/show/16.Best_Books_of_the_19th_Century']

In [6]:
def select_book_links_from_list_urls(list_url, start_page=1, end_page=101, timeout=30):
    """Scrape book links from the paginated pages of one list.

    Args:
        list_url: URL of the list, without a ``page`` query parameter.
        start_page: First page number to request (inclusive).
        end_page: Page number to stop before (exclusive), so the defaults
            cover pages 1 through 100.
        timeout: Seconds to wait for each response before giving up.

    Returns:
        A list of ``https://www.goodreads.com``-prefixed book URLs in page
        order. Pages whose request fails are reported and skipped.
    """
    all_book_links = []

    for page in range(start_page, end_page):
        page_url = f"{list_url}?page={page}"
        reached_end = False
        try:
            response = requests.get(
                page_url,
                headers={'User-Agent': 'Mozilla/5.0'},
                timeout=timeout,
            )
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            classic_books_urls = soup.find_all(class_="bookTitle")

            if not classic_books_urls:
                print(f"No elements found with the given selector on page {page}")
                # An empty page is treated as the end of the list, so the
                # remaining page numbers are not requested.
                reached_end = True
            else:
                # Skip matches without an href: a KeyError here is not a
                # RequestException and would discard every link collected so far.
                page_links = [
                    f"https://www.goodreads.com{link['href']}"
                    for link in classic_books_urls
                    if link.get('href')
                ]
                all_book_links.extend(page_links)
                print(f"Scraped {len(page_links)} links from page {page}")

        except requests.exceptions.RequestException as e:
            print(f"An error occurred while processing page {page}: {e}")

        if reached_end:
            break

        # Add a small delay to be respectful to the server; it also applies
        # after a failed request so errors do not trigger rapid retries.
        time.sleep(1)

    return all_book_links


# Adjust start_page / end_page to choose which pages of the list are scraped (end_page is exclusive).

In [7]:
import asyncio
import aiohttp
from bs4 import BeautifulSoup
import nest_asyncio

# Apply nest_asyncio to allow nested use of asyncio: run_scraper() below calls
# loop.run_until_complete(), which is presumably issued while the notebook's own
# event loop is already running (confirm for your environment).
nest_asyncio.apply()

from bs4 import BeautifulSoup
import aiohttp

def _extract_genres(soup):
    """Return the label text of every genre button found in the parsed page."""
    genres = []
    for outer_span in soup.find_all('span', class_='BookPageMetadataSection__genreButton'):
        a_tag = outer_span.find('a', class_='Button Button--tag Button--medium')
        genre_span = a_tag.find('span', class_='Button__labelItem') if a_tag else None
        if genre_span:
            genres.append(genre_span.text.strip())
    return genres


def _extract_page_count(soup):
    """Return the page count as an int, or "Page count not found" when unavailable."""
    pages_tag = soup.find('p', {"data-testid": "pagesFormat"})
    # Both a missing element and text without digits fall back to the
    # sentinel instead of raising and discarding the whole book.
    digits = re.search(r'\d+', pages_tag.text) if pages_tag is not None else None
    return int(digits.group()) if digits else "Page count not found"


async def get_details_from_books(session, book_url):
    """Fetch one book page and extract its details.

    Args:
        session: An open aiohttp client session used to issue the request.
        book_url: Absolute URL of the book page.

    Returns:
        A dict with the keys ``title``, ``author``, ``total_ratings``,
        ``total_reviews``, ``rating``, ``genres`` (list of str),
        ``book_description``, ``book_url``, ``book_image_url`` and
        ``num_pages`` (int, or the string "Page count not found").
        Returns ``None`` when the request fails or any field other than the
        genres and the page count cannot be extracted; the error is printed.
    """
    try:
        # Delay before the request. Tasks started together all wait the same
        # second, so this postpones requests rather than spacing them out.
        await asyncio.sleep(1)
        async with session.get(book_url, headers={'User-Agent': 'Mozilla/5.0'}) as response:
            response.raise_for_status()
            content = await response.text()

        soup = BeautifulSoup(content, 'html.parser')

        # Extracting title
        title = soup.find('h1', class_='Text Text__title1').text.strip()

        # Extracting author
        author = soup.find('span', class_='ContributorLink__name').text.strip()

        # Extracting total ratings
        total_ratings = soup.find("span", {"data-testid": "ratingsCount"}).text.strip()
        total_ratings = int(re.sub(r'\D', '', total_ratings))  # Remove all non-digit characters

        # Extracting total reviews
        total_reviews = soup.find('span', class_='u-dot-before').text.strip()
        total_reviews = int(re.sub(r'\D', '', total_reviews))  # Remove all non-digit characters

        # Extracting rating
        rating = float(soup.find('div', class_='RatingStatistics__rating').text.strip())

        # Extracting genres: every genre button present in the fetched HTML is
        # collected; no explicit cap is applied here.
        main_genres = _extract_genres(soup)

        # Extracting book description
        book_description = soup.find('span', class_='Formatted').text.strip()

        # Extracting book image URL
        book_image_url = soup.find('img', class_='ResponsiveImage')['src']

        # Extracting number of pages (tolerates a missing or non-numeric value)
        num_pages = _extract_page_count(soup)

        return {
            'title': title,
            'author': author,
            'total_ratings': total_ratings,
            'total_reviews': total_reviews,
            'rating': rating,
            'genres': main_genres,
            'book_description': book_description,
            'book_url': book_url,
            'book_image_url': book_image_url,
            'num_pages': num_pages,
        }

    except Exception as e:
        # Deliberate best-effort: one bad page must not abort the whole batch.
        print(f"Error fetching details for {book_url}: {e}")
        return None


async def scrape_book_details(book_urls, max_concurrency=10):
    """Fetch details for many book pages over one shared HTTP session.

    Args:
        book_urls: Iterable of book page URLs.
        max_concurrency: Maximum number of books fetched at the same time.
            Without a cap every URL's task starts immediately, so a long URL
            list turns into a burst of simultaneous requests.

    Returns:
        A list of detail dicts in the same order as ``book_urls``, with
        entries that failed (``None``) filtered out.

    Raises:
        ValueError: If ``max_concurrency`` is less than 1.
    """
    if max_concurrency < 1:
        raise ValueError("max_concurrency must be at least 1")

    limiter = asyncio.Semaphore(max_concurrency)

    async def fetch_with_limit(session, url):
        # Hold a semaphore slot for the whole fetch so that at most
        # max_concurrency books are in flight at once.
        async with limiter:
            return await get_details_from_books(session, url)

    async with aiohttp.ClientSession() as session:
        tasks = [fetch_with_limit(session, url) for url in book_urls]
        results = await asyncio.gather(*tasks)  # gather keeps the input order
    return [book for book in results if book]

# Function to run the async code
def run_scraper(book_urls):
    """Synchronous entry point: run the async scraper to completion.

    Drives ``scrape_book_details(book_urls)`` on the current event loop and
    returns whatever that coroutine returns.
    """
    loop = asyncio.get_event_loop()
    scraping_job = scrape_book_details(book_urls)
    return loop.run_until_complete(scraping_job)

## Example running scripts

In [8]:
# Scrape book links from one list using the function's default page range.
books_links = select_book_links_from_list_urls('https://www.goodreads.com/list/show/264.Books_That_Everyone_Should_Read_At_Least_Once')
len(books_links)

Scraped 100 links from page 1
Scraped 100 links from page 2
Scraped 100 links from page 3
Scraped 100 links from page 4
Scraped 100 links from page 5
Scraped 100 links from page 6
Scraped 100 links from page 7
Scraped 100 links from page 8
Scraped 100 links from page 9
Scraped 100 links from page 10
Scraped 100 links from page 11
Scraped 100 links from page 12
Scraped 100 links from page 13
Scraped 100 links from page 14
Scraped 100 links from page 15
Scraped 100 links from page 16
Scraped 100 links from page 17
Scraped 100 links from page 18
Scraped 100 links from page 19
Scraped 100 links from page 20
Scraped 100 links from page 21
Scraped 100 links from page 22
Scraped 100 links from page 23
Scraped 100 links from page 24
Scraped 100 links from page 25
Scraped 100 links from page 26
Scraped 100 links from page 27
Scraped 100 links from page 28
Scraped 100 links from page 29
Scraped 100 links from page 30
Scraped 100 links from page 31
Scraped 100 links from page 32
Scraped 100 links

10000

In [10]:
# Fetch details for only the first 50 links (presumably to keep the example run small).
example_book_links = books_links[0:50]
book_details_list = run_scraper(example_book_links)

In [11]:
# Inspect the scraped records: one dict per book whose details were extracted.
book_details_list

[{'title': 'To Kill a Mockingbird',
  'author': 'Harper Lee',
  'total_ratings': 6282070,
  'total_reviews': 120290,
  'rating': 4.26,
  'genres': ['Fiction',
   'Historical Fiction',
   'School',
   'Literature',
   'Young Adult',
   'Historical',
   'Read For School'],
  'book_description': 'The unforgettable novel of a childhood in a sleepy Southern town and the crisis of conscience that rocked it. "To Kill A Mockingbird" became both an instant bestseller and a critical success when it was first published in 1960. It went on to win the Pulitzer Prize in 1961 and was later made into an Academy Award-winning film, also a classic.Compassionate, dramatic, and deeply moving, "To Kill A Mockingbird" takes readers to the roots of human behavior - to innocence and experience, kindness and cruelty, love and hatred, humor and pathos. Now with over 18 million copies in print and translated into forty languages, this regional story by a young Alabama woman claims universal appeal. Harper Lee al

In [12]:
import pandas as pd

# Build one row per scraped book from the list of detail dicts.
df = pd.DataFrame(book_details_list)

# Each 'genres' value is a list of strings; flatten it to a comma-separated
# string for the CSV. The column is absent when book_details_list is empty
# (every scrape failed), so guard against the KeyError.
if 'genres' in df.columns:
    df['genres'] = df['genres'].apply(', '.join)

# Save to CSV file
df.to_csv('books5.csv', index=False)

In [13]:
# Preview the first rows of the frame that was written to the CSV.
df.head()

Unnamed: 0,title,author,total_ratings,total_reviews,rating,genres,book_description,book_url,book_image_url,num_pages
0,To Kill a Mockingbird,Harper Lee,6282070,120290,4.26,"Fiction, Historical Fiction, School, Literatur...",The unforgettable novel of a childhood in a sl...,https://www.goodreads.com/book/show/2657.To_Ki...,https://images-na.ssl-images-amazon.com/images...,323
1,Harry Potter and the Sorcerer’s Stone,J.K. Rowling,10305025,167160,4.47,"Fantasy, Fiction, Young Adult, Magic, Children...",An alternative cover for this ASIN can be foun...,https://www.goodreads.com/book/show/42844155-h...,https://images-na.ssl-images-amazon.com/images...,333
2,Pride and Prejudice,Jane Austen,4347277,120211,4.29,"Fiction, Historical Fiction, Historical, Liter...","Since its immediate success in 1813, Pride and...",https://www.goodreads.com/book/show/1885.Pride...,https://images-na.ssl-images-amazon.com/images...,279
3,The Diary of a Young Girl,Anne Frank,3817131,42864,4.19,"Nonfiction, Biography, Memoir, Classics, Histo...",Discovered in the attic in which she spent the...,https://www.goodreads.com/book/show/48855.The_...,https://images-na.ssl-images-amazon.com/images...,283
4,Animal Farm,George Orwell,3993622,101453,3.99,"Classics, Fiction, Dystopia, Fantasy, School, ...",Librarian's note: There is an Alternate Cover ...,https://www.goodreads.com/book/show/170448.Ani...,https://images-na.ssl-images-amazon.com/images...,141
