In [1]:
import requests
import pandas as pd

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
from tqdm import tqdm
import os

class ImprovedBookScraper:
    """Scrape book metadata from a Goodreads list page and its book pages.

    Every successfully scraped book is stored as a dict in ``self.books_data``.
    The list keeps growing across calls to :meth:`scrape_goodreads_list` and is
    written out by :meth:`save_to_csv`.
    """

    def __init__(self, timeout=30):
        """Prepare request headers and the (initially empty) result list.

        Args:
            timeout: Seconds to wait for each HTTP response. Without a timeout
                a stalled connection would block the scraper indefinitely.
        """
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        self.timeout = timeout
        self.books_data = []

    @staticmethod
    def _normalize_whitespace(text):
        """Collapse every run of whitespace in ``text`` to one space and trim it."""
        return ' '.join(text.split())

    @staticmethod
    def _element_text(element, collapse=False):
        """Return the stripped text of a parsed element, or None if it is missing.

        With ``collapse=True`` internal whitespace runs are reduced to single
        spaces (used for single-line fields such as names).
        """
        if not element:
            return None
        text = element.text.strip()
        if collapse:
            text = ImprovedBookScraper._normalize_whitespace(text)
        return text

    @staticmethod
    def _extract_cover_url(cover_img):
        """Return the cover image URL from an ``<img>`` element, or None.

        ``src`` is preferred; ``data-src`` is used when ``src`` is missing,
        empty, or ends with ``nophoto``. Known thumbnail size markers are
        swapped for their larger variants.
        """
        if cover_img is None:
            return None
        cover_url = cover_img.get('src')
        if not cover_url or cover_url.endswith('nophoto'):
            cover_url = cover_img.get('data-src')
        if not cover_url:
            return None
        return cover_url.replace('._SX98_', '._SX500_').replace('._SY160_', '._SY750_')

    def _fetch_soup(self, url):
        """Download ``url`` and return the parsed document.

        Raises the ``requests`` exception for timeouts, connection problems and
        non-2xx responses so callers do not parse an error page as if it were
        real content.
        """
        response = requests.get(url, headers=self.headers, timeout=self.timeout)
        response.raise_for_status()
        return BeautifulSoup(response.content, 'html.parser')

    def scrape_goodreads_list(self, url, num_books=10):
        """Scrape book information from a Goodreads list page.

        Args:
            url: URL of the list page.
            num_books: Maximum number of list entries to follow.

        Returns:
            ``[]`` when the page contains no book rows; otherwise
            ``self.books_data``, which also holds results of earlier calls.
            Errors are printed, never raised.
        """
        print(f"Starting to scrape {num_books} books...")

        try:
            soup = self._fetch_soup(url)

            # Find all book entries on the page
            book_entries = soup.find_all('tr', {'itemtype': 'http://schema.org/Book'})

            if not book_entries:
                print("No book entries found. The page structure might have changed.")
                return []

            for book in tqdm(book_entries[:num_books]):
                try:
                    # Title and author as shown on the list page; used for the
                    # progress message and as a fallback for the detail page.
                    title_element = book.find('span', {'itemprop': 'name'})
                    author_element = book.find('a', {'class': 'authorName'})

                    # Entries without a link cannot be followed; skip them.
                    title_link = book.find('a', {'class': 'bookTitle'})
                    if not title_link or 'href' not in title_link.attrs:
                        continue
                    book_url = 'https://www.goodreads.com' + title_link['href']

                    # Print current book being scraped (for debugging)
                    print(f"\nScraping book: {title_element.text.strip() if title_element else 'Unknown Title'}")

                    # Get detailed book information
                    book_info = self.scrape_book_page(book_url)

                    if book_info:
                        # The detail page sometimes yields no title/author
                        # (different layout); keep the list-page values then.
                        if not book_info.get('title') and title_element:
                            book_info['title'] = self._element_text(title_element, collapse=True)
                        if not book_info.get('author') and author_element:
                            book_info['author'] = self._element_text(author_element, collapse=True)
                        self.books_data.append(book_info)

                    # Random delay between requests
                    time.sleep(random.uniform(2, 4))

                except Exception as e:
                    # One broken entry must not abort the whole run.
                    print(f"Error scraping book: {str(e)}")
                    continue

        except Exception as e:
            print(f"Error scraping list page: {str(e)}")

        return self.books_data

    def scrape_book_page(self, url):
        """Scrape detailed information from a book's page.

        Args:
            url: Absolute URL of the book page.

        Returns:
            A dict with the keys ``title``, ``author``, ``rating``, ``summary``,
            ``genres``, ``cover_url`` and ``url`` (missing values are ``None``),
            or ``None`` if the page could not be fetched or parsed.
        """
        try:
            soup = self._fetch_soup(url)

            # Print the URL being scraped (for debugging)
            print(f"Scraping URL: {url}")

            cover_img = (
                soup.find('img', {'class': 'ResponsiveImage'}) or  # New Goodreads
                soup.find('img', {'id': 'coverImage'}) or         # Old Goodreads
                soup.find('img', {'class': 'BookCover__image'})   # Another possible class
            )
            cover_url = self._extract_cover_url(cover_img)

            # The container is only probed as a diagnostic; the selectors
            # below search the whole document either way.
            if not soup.find('div', {'class': 'BookPage__mainContent'}):
                print("Book container not found. Trying alternative selectors...")

            # Extract book information with multiple fallback selectors
            title = (
                soup.find('h1', {'class': 'Text Text__title1'}) or
                soup.find('h1', {'id': 'bookTitle'})
            )

            author = (
                soup.find('span', {'class': 'ContributorLink__name'}) or
                soup.find('span', {'itemprop': 'name'})
            )

            rating = (
                soup.find('div', {'class': 'RatingStatistics__rating'}) or
                soup.find('span', {'itemprop': 'ratingValue'})
            )

            summary = (
                soup.find('div', {'class': 'BookPageMetadataSection__description'}) or
                soup.find('div', {'id': 'description'})
            )

            # Extract genres (with multiple possible selectors)
            genre_elements = (
                soup.find_all('span', {'class': 'BookPageMetadataSection__genreButton'}) or
                soup.find_all('a', {'class': 'actionLinkLite bookPageGenreLink'})
            )

            # NOTE: publication-year extraction is not implemented here.

            # Build book information dictionary. Title and author are
            # single-line names, so internal whitespace runs are collapsed;
            # the summary keeps its internal line breaks.
            book_info = {
                'title': self._element_text(title, collapse=True),
                'author': self._element_text(author, collapse=True),
                'rating': self._element_text(rating),
                'summary': self._element_text(summary),
                'genres': [g.text.strip() for g in genre_elements] if genre_elements else None,
                'cover_url': cover_url,
                'url': url
            }

            # Print extracted info (for debugging)
            print(f"Extracted title: {book_info['title']}")
            print(f"Extracted author: {book_info['author']}")

            return book_info

        except Exception as e:
            print(f"Error scraping book page {url}: {str(e)}")
            return None

    def save_to_csv(self, filename='books_dataset.csv'):
        """Append the scraped data to a CSV file.

        All rows currently in ``self.books_data`` are appended, so calling this
        twice on the same scraper writes the rows twice. The header row is
        written only when the file does not exist yet or is empty.

        Args:
            filename: Path of the CSV file to append to.
        """
        if not self.books_data:
            print("No data to save!")
            return
        # An existing but empty file still needs a header row.
        needs_header = not os.path.exists(filename) or os.path.getsize(filename) == 0
        df = pd.DataFrame(self.books_data)
        df.to_csv(filename, mode='a', header=needs_header, index=False, encoding='utf-8')
        print(f"\nDataset saved to {filename}")
        print(f"Number of books saved: {len(df)}")
        print("\nFirst few entries:")
        print(df.head())

# # Example usage
# if __name__ == "__main__":
#     scraper = ImprovedBookScraper()
    
#     # Example Goodreads list URL - replace with your actual URL
#     list_url = "https://www.goodreads.com/list/show/11.Best_Crime_Mystery_Books"
    
#     # Scrape books
#     books = scraper.scrape_goodreads_list(list_url, num_books=10)
    
#     # Save to CSV
#     scraper.save_to_csv()

In [12]:
if __name__ == "__main__":
    # Create a scraper; its books_data list starts out empty.
    scraper = ImprovedBookScraper()
    
    # Goodreads list page to scrape (here: page 11 of the
    # "Best Books of the 20th Century" list) - replace with your own list URL.
    list_url = "https://www.goodreads.com/list/show/6.Best_Books_of_the_20th_Century?page=11"
    
    # Scrape up to 100 entries from that page. Every entry triggers a separate
    # request to the book's own page plus a 2-4 second pause.
    books = scraper.scrape_goodreads_list(list_url, num_books=100)
    
    # Append the collected rows to the default file, books_dataset.csv.
    scraper.save_to_csv()

Starting to scrape 100 books...


  0%|          | 0/100 [00:00<?, ?it/s]


Scraping book: Motherless Brooklyn
Scraping URL: https://www.goodreads.com/book/show/328854.Motherless_Brooklyn
Extracted title: Motherless Brooklyn
Extracted author: Jonathan Lethem


  1%|          | 1/100 [00:11<18:46, 11.38s/it]


Scraping book: Follow the River
Scraping URL: https://www.goodreads.com/book/show/138872.Follow_the_River
Extracted title: Follow the River
Extracted author: James Alexander Thom


  2%|▏         | 2/100 [00:23<19:35, 11.99s/it]


Scraping book: Fantastic Mr. Fox
Scraping URL: https://www.goodreads.com/book/show/6693.Fantastic_Mr_Fox
Extracted title: Fantastic Mr. Fox
Extracted author: Roald Dahl


  3%|▎         | 3/100 [00:35<19:21, 11.98s/it]


Scraping book: A Girl of the Limberlost (Limberlost, #2)
Scraping URL: https://www.goodreads.com/book/show/17567.A_Girl_of_the_Limberlost
Extracted title: A Girl of the Limberlost
Extracted author: Gene Stratton-Porter


  4%|▍         | 4/100 [00:45<17:54, 11.19s/it]


Scraping book: Love You Forever
Scraping URL: https://www.goodreads.com/book/show/310259.Love_You_Forever
Extracted title: Love You Forever
Extracted author: Robert Munsch


  5%|▌         | 5/100 [00:55<16:58, 10.72s/it]


Scraping book: A Ring of Endless Light (Austin Family Chronicles, #4)
Scraping URL: https://www.goodreads.com/book/show/14358.A_Ring_of_Endless_Light
Extracted title: A Ring of Endless Light
Extracted author: Madeleine L'Engle


  6%|▌         | 6/100 [01:05<16:10, 10.32s/it]


Scraping book: QB VII
Scraping URL: https://www.goodreads.com/book/show/426825.QB_VII
Extracted title: QB VII
Extracted author: Leon Uris


  7%|▋         | 7/100 [01:13<15:13,  9.82s/it]


Scraping book: Toda Mafalda
Scraping URL: https://www.goodreads.com/book/show/54741.Toda_Mafalda
Extracted title: Toda Mafalda
Extracted author: Quino


  8%|▊         | 8/100 [01:20<13:20,  8.71s/it]


Scraping book: Ignorance
Scraping URL: https://www.goodreads.com/book/show/78728.Ignorance
Extracted title: Ignorance
Extracted author: Milan Kundera


  9%|▉         | 9/100 [01:30<13:55,  9.18s/it]


Scraping book: The 42nd Parallel (U.S.A. #1)
Scraping URL: https://www.goodreads.com/book/show/7101.The_42nd_Parallel
Extracted title: The 42nd Parallel
Extracted author: John Dos Passos


 10%|█         | 10/100 [01:38<13:20,  8.90s/it]


Scraping book: The Milagro Beanfield War
Scraping URL: https://www.goodreads.com/book/show/39242.The_Milagro_Beanfield_War
Extracted title: The Milagro Beanfield War
Extracted author: John     Nichols


 11%|█         | 11/100 [01:47<13:06,  8.84s/it]


Scraping book: The City and the Stars
Scraping URL: https://www.goodreads.com/book/show/250024.The_City_and_the_Stars
Extracted title: The City and the Stars
Extracted author: Arthur C. Clarke


 12%|█▏        | 12/100 [01:56<12:57,  8.83s/it]


Scraping book: Waiting
Scraping URL: https://www.goodreads.com/book/show/235773.Waiting
Extracted title: Waiting
Extracted author: Ha Jin


 13%|█▎        | 13/100 [02:06<13:30,  9.31s/it]


Scraping book: Fugitive Pieces
Scraping URL: https://www.goodreads.com/book/show/15836.Fugitive_Pieces
Extracted title: Fugitive Pieces
Extracted author: Anne  Michaels


 14%|█▍        | 14/100 [02:20<15:27, 10.78s/it]


Scraping book: The Chrysalids
Scraping URL: https://www.goodreads.com/book/show/826845.The_Chrysalids
Extracted title: The Chrysalids
Extracted author: John Wyndham


 15%|█▌        | 15/100 [02:31<15:15, 10.77s/it]


Scraping book: The Street of Crocodiles
Scraping URL: https://www.goodreads.com/book/show/244261.The_Street_of_Crocodiles
Extracted title: The Street of Crocodiles
Extracted author: Bruno Schulz


 16%|█▌        | 16/100 [02:41<14:38, 10.46s/it]


Scraping book: The Snows of Kilimanjaro and Other Stories
Scraping URL: https://www.goodreads.com/book/show/4645.The_Snows_of_Kilimanjaro_and_Other_Stories
Extracted title: The Snows of Kilimanjaro and Other Stories
Extracted author: Ernest Hemingway


 17%|█▋        | 17/100 [02:52<14:48, 10.71s/it]


Scraping book: The Wretched of the Earth
Scraping URL: https://www.goodreads.com/book/show/66933.The_Wretched_of_the_Earth
Extracted title: The Wretched of the Earth
Extracted author: Frantz Fanon


 18%|█▊        | 18/100 [03:00<13:30,  9.89s/it]


Scraping book: Play It As It Lays
Scraping URL: https://www.goodreads.com/book/show/428.Play_It_As_It_Lays
Extracted title: Play It As It Lays
Extracted author: Joan Didion


 19%|█▉        | 19/100 [03:10<13:10,  9.76s/it]


Scraping book: Bird by Bird
Scraping URL: https://www.goodreads.com/book/show/12543.Bird_by_Bird
Extracted title: Bird by Bird
Extracted author: Anne Lamott


 20%|██        | 20/100 [03:20<13:05,  9.81s/it]


Scraping book: Good Night, Mr. Tom
Scraping URL: https://www.goodreads.com/book/show/161099.Good_Night_Mr_Tom
Extracted title: Good Night, Mr. Tom
Extracted author: Michelle Magorian


 21%|██        | 21/100 [03:27<12:05,  9.19s/it]


Scraping book: I'm Not Stiller
Scraping URL: https://www.goodreads.com/book/show/265102.I_m_Not_Stiller
Extracted title: I'm Not Stiller
Extracted author: Max Frisch


 22%|██▏       | 22/100 [03:37<12:01,  9.25s/it]


Scraping book: The Incredible Journey
Scraping URL: https://www.goodreads.com/book/show/231821.The_Incredible_Journey
Extracted title: The Incredible Journey
Extracted author: Sheila Burnford


 23%|██▎       | 23/100 [03:44<11:18,  8.81s/it]


Scraping book: In Our Time
Scraping URL: https://www.goodreads.com/book/show/4652.In_Our_Time
Extracted title: In Our Time
Extracted author: Ernest Hemingway


 24%|██▍       | 24/100 [03:54<11:17,  8.91s/it]


Scraping book: Enduring Love
Scraping URL: https://www.goodreads.com/book/show/6870.Enduring_Love
Extracted title: Enduring Love
Extracted author: Ian McEwan


 25%|██▌       | 25/100 [04:04<11:38,  9.31s/it]


Scraping book: At Home in Mitford (Mitford Years, #1)
Scraping URL: https://www.goodreads.com/book/show/71776.At_Home_in_Mitford
Extracted title: At Home in Mitford
Extracted author: Jan Karon


 26%|██▌       | 26/100 [04:13<11:19,  9.18s/it]


Scraping book: Pawn of Prophecy (The Belgariad, #1)
Scraping URL: https://www.goodreads.com/book/show/44659.Pawn_of_Prophecy
Extracted title: Pawn of Prophecy
Extracted author: David Eddings


 27%|██▋       | 27/100 [04:25<12:10, 10.01s/it]


Scraping book: The Civil War: A Narrative
Scraping URL: https://www.goodreads.com/book/show/44234.The_Civil_War
Extracted title: The Civil War: A Narrative
Extracted author: Shelby Foote


 28%|██▊       | 28/100 [04:33<11:15,  9.38s/it]


Scraping book: Beneath the Wheel
Scraping URL: https://www.goodreads.com/book/show/25905.Beneath_the_Wheel
Extracted title: Beneath the Wheel
Extracted author: Hermann Hesse


 29%|██▉       | 29/100 [04:41<10:43,  9.06s/it]


Scraping book: The Drifters
Scraping URL: https://www.goodreads.com/book/show/42955.The_Drifters
Extracted title: The Drifters
Extracted author: James A. Michener


 30%|███       | 30/100 [04:50<10:37,  9.11s/it]


Scraping book: Ringworld (Ringworld, #1)
Scraping URL: https://www.goodreads.com/book/show/61179.Ringworld
Extracted title: Ringworld
Extracted author: Larry Niven


 31%|███       | 31/100 [04:58<09:59,  8.69s/it]


Scraping book: The Golden Bowl
Scraping URL: https://www.goodreads.com/book/show/259020.The_Golden_Bowl
Extracted title: The Golden Bowl
Extracted author: Henry James


 32%|███▏      | 32/100 [05:06<09:47,  8.64s/it]


Scraping book: The Dark Is Rising (The Dark is Rising, #2)
Scraping URL: https://www.goodreads.com/book/show/210329.The_Dark_Is_Rising
Extracted title: The Dark Is Rising
Extracted author: Susan Cooper


 33%|███▎      | 33/100 [05:13<09:04,  8.13s/it]


Scraping book: Gateway (Heechee Saga, #1)
Scraping URL: https://www.goodreads.com/book/show/218427.Gateway
Extracted title: Gateway
Extracted author: Frederik Pohl


 34%|███▍      | 34/100 [05:21<08:56,  8.13s/it]


Scraping book: Giovanni’s Room
Scraping URL: https://www.goodreads.com/book/show/406235.Giovanni_s_Room
Extracted title: Giovanni’s Room
Extracted author: James Baldwin


 35%|███▌      | 35/100 [05:31<09:11,  8.49s/it]


Scraping book: Presumed Innocent (Kindle County Legal Thriller, #1)
Scraping URL: https://www.goodreads.com/book/show/425029.Presumed_Innocent
Extracted title: Presumed Innocent
Extracted author: Scott Turow


 36%|███▌      | 36/100 [05:39<08:50,  8.29s/it]


Scraping book: U.S.A.: The 42nd Parallel / 1919 / The Big Money
Scraping URL: https://www.goodreads.com/book/show/261441.U_S_A_
Extracted title: U.S.A.: The 42nd Parallel / 1919 / The Big Money
Extracted author: John Dos Passos


 37%|███▋      | 37/100 [05:49<09:30,  9.05s/it]


Scraping book: Dead Man Walking: The Eyewitness Account Of The Death Penalty That Sparked a National Debate
Scraping URL: https://www.goodreads.com/book/show/133793.Dead_Man_Walking
Extracted title: Dead Man Walking: The Eyewitness Account Of The Death Penalty That Sparked a National Debate
Extracted author: Helen Prejean


 38%|███▊      | 38/100 [06:01<10:18,  9.97s/it]


Scraping book: The Go-Between
Scraping URL: https://www.goodreads.com/book/show/258079.The_Go_Between
Extracted title: The Go-Between
Extracted author: L.P. Hartley


 39%|███▉      | 39/100 [06:13<10:28, 10.30s/it]


Scraping book: Le Grand Meaulnes
Scraping URL: https://www.goodreads.com/book/show/794779.Le_Grand_Meaulnes
Extracted title: Le Grand Meaulnes
Extracted author: Alain-Fournier


 40%|████      | 40/100 [06:23<10:21, 10.36s/it]


Scraping book: Alanna: The First Adventure (Song of the Lioness, #1)
Scraping URL: https://www.goodreads.com/book/show/13831.Alanna
Extracted title: Alanna: The First Adventure
Extracted author: Tamora Pierce


 41%|████      | 41/100 [06:34<10:29, 10.67s/it]


Scraping book: Camera Lucida: Reflections on Photography
Scraping URL: https://www.goodreads.com/book/show/497164.Camera_Lucida
Extracted title: Camera Lucida: Reflections on Photography
Extracted author: Roland Barthes


 42%|████▏     | 42/100 [06:47<10:49, 11.20s/it]


Scraping book: The White Album
Scraping URL: https://www.goodreads.com/book/show/421.The_White_Album
Extracted title: The White Album
Extracted author: Joan Didion


 43%|████▎     | 43/100 [06:59<10:50, 11.41s/it]


Scraping book: A Fan's Notes (A Fan's Notes, #1)
Scraping URL: https://www.goodreads.com/book/show/774032.A_Fan_s_Notes
Extracted title: A Fan's Notes
Extracted author: Frederick Exley


 44%|████▍     | 44/100 [07:09<10:25, 11.17s/it]


Scraping book: The Road Less Traveled: A New Psychology of Love, Traditional Values and Spiritual Growth
Scraping URL: https://www.goodreads.com/book/show/347852.The_Road_Less_Traveled
Extracted title: The Road Less Traveled: A New Psychology of Love, Traditional Values and Spiritual Growth
Extracted author: M. Scott Peck


 45%|████▌     | 45/100 [07:20<10:00, 10.91s/it]


Scraping book: The Fall of Freddie the Leaf: A Story of Life for All Ages
Scraping URL: https://www.goodreads.com/book/show/841110.The_Fall_of_Freddie_the_Leaf
Extracted title: The Fall of Freddie the Leaf: A Story of Life for All Ages
Extracted author: Leo F. Buscaglia


 46%|████▌     | 46/100 [07:39<12:05, 13.43s/it]


Scraping book: River God (Ancient Egypt, #1)
Scraping URL: https://www.goodreads.com/book/show/429138.River_God
Extracted title: River God
Extracted author: Wilbur Smith


 47%|████▋     | 47/100 [07:49<11:00, 12.46s/it]


Scraping book: The White Hotel
Scraping URL: https://www.goodreads.com/book/show/46087.The_White_Hotel
Extracted title: The White Hotel
Extracted author: D.M. Thomas


 48%|████▊     | 48/100 [08:02<10:52, 12.54s/it]


Scraping book: Travels with My Aunt
Scraping URL: https://www.goodreads.com/book/show/48858.Travels_with_My_Aunt
Extracted title: Travels with My Aunt
Extracted author: Graham Greene


 49%|████▉     | 49/100 [08:20<12:01, 14.14s/it]


Scraping book: Embers
Scraping URL: https://www.goodreads.com/book/show/783505.Embers
Extracted title: Embers
Extracted author: Sándor Márai


 50%|█████     | 50/100 [08:29<10:30, 12.62s/it]


Scraping book: Hogfather
Scraping URL: https://www.goodreads.com/book/show/34532.Hogfather
Extracted title: Hogfather
Extracted author: Terry Pratchett


 51%|█████     | 51/100 [08:42<10:25, 12.77s/it]


Scraping book: Being There
Scraping URL: https://www.goodreads.com/book/show/677877.Being_There
Extracted title: Being There
Extracted author: Jerzy Kosiński


 52%|█████▏    | 52/100 [08:55<10:12, 12.76s/it]


Scraping book: Parable of the Sower (Earthseed, #1)
Scraping URL: https://www.goodreads.com/book/show/52397.Parable_of_the_Sower
Extracted title: Parable of the Sower
Extracted author: Octavia E. Butler


 53%|█████▎    | 53/100 [09:05<09:26, 12.05s/it]


Scraping book: Cane
Scraping URL: https://www.goodreads.com/book/show/765172.Cane
Extracted title: Cane
Extracted author: Jean Toomer


 54%|█████▍    | 54/100 [09:16<08:53, 11.59s/it]


Scraping book: The Body in the Library (Miss Marple, #2)
Scraping URL: https://www.goodreads.com/book/show/16319.The_Body_in_the_Library
Extracted title: The Body in the Library
Extracted author: Agatha Christie


 55%|█████▌    | 55/100 [09:28<08:47, 11.72s/it]


Scraping book: Nobody's Fool (Sully #1)
Scraping URL: https://www.goodreads.com/book/show/659388.Nobody_s_Fool
Extracted title: Nobody's Fool
Extracted author: Richard Russo


 56%|█████▌    | 56/100 [09:37<08:06, 11.06s/it]


Scraping book: Sphere
Scraping URL: https://www.goodreads.com/book/show/455373.Sphere
Extracted title: Sphere
Extracted author: Michael Crichton


 57%|█████▋    | 57/100 [09:48<07:48, 10.89s/it]


Scraping book: Rilla of Ingleside (Anne of Green Gables, #8)
Scraping URL: https://www.goodreads.com/book/show/433533.Rilla_of_Ingleside
Extracted title: Rilla of Ingleside
Extracted author: L.M. Montgomery


 58%|█████▊    | 58/100 [09:58<07:26, 10.64s/it]


Scraping book: The Bell
Scraping URL: https://www.goodreads.com/book/show/11230.The_Bell
Extracted title: The Bell
Extracted author: Iris Murdoch


 59%|█████▉    | 59/100 [10:06<06:44,  9.86s/it]


Scraping book: The Day of the Locust
Scraping URL: https://www.goodreads.com/book/show/113441.The_Day_of_the_Locust
Extracted title: The Day of the Locust
Extracted author: Nathanael West


 60%|██████    | 60/100 [10:17<06:45, 10.13s/it]


Scraping book: A House for Mr Biswas
Scraping URL: https://www.goodreads.com/book/show/5849.A_House_for_Mr_Biswas
Extracted title: A House for Mr Biswas
Extracted author: V.S. Naipaul


 61%|██████    | 61/100 [10:27<06:37, 10.19s/it]


Scraping book: The Invention of Morel
Scraping URL: https://www.goodreads.com/book/show/94486.The_Invention_of_Morel
Extracted title: The Invention of Morel
Extracted author: Adolfo Bioy Casares


 62%|██████▏   | 62/100 [10:37<06:27, 10.19s/it]


Scraping book: Cruddy
Scraping URL: https://www.goodreads.com/book/show/29015.Cruddy
Extracted title: Cruddy
Extracted author: Lynda Barry


 63%|██████▎   | 63/100 [10:46<06:07,  9.94s/it]


Scraping book: Hocus Pocus
Scraping URL: https://www.goodreads.com/book/show/9589.Hocus_Pocus
Extracted title: Hocus Pocus
Extracted author: Kurt Vonnegut Jr.


 64%|██████▍   | 64/100 [10:56<05:51,  9.75s/it]


Scraping book: Revelation Space
Scraping URL: https://www.goodreads.com/book/show/89187.Revelation_Space
Extracted title: Revelation Space
Extracted author: Alastair Reynolds


 65%|██████▌   | 65/100 [11:05<05:36,  9.63s/it]


Scraping book: La Prisonnière
Scraping URL: https://www.goodreads.com/book/show/865124.La_Prisonni_re
Extracted title: La Prisonnière
Extracted author: Marcel Proust


 66%|██████▌   | 66/100 [11:19<06:12, 10.95s/it]


Scraping book: Sanctuary
Scraping URL: https://www.goodreads.com/book/show/18789.Sanctuary
Extracted title: Sanctuary
Extracted author: William Faulkner


 67%|██████▋   | 67/100 [11:30<05:56, 10.81s/it]


Scraping book: Gift from the Sea
Scraping URL: https://www.goodreads.com/book/show/77295.Gift_from_the_Sea
Extracted title: Gift from the Sea
Extracted author: Anne Morrow Lindbergh


 68%|██████▊   | 68/100 [11:37<05:16,  9.89s/it]


Scraping book: Cathedral
Scraping URL: https://www.goodreads.com/book/show/11449.Cathedral
Extracted title: Cathedral
Extracted author: Raymond Carver


 69%|██████▉   | 69/100 [11:46<04:55,  9.54s/it]


Scraping book: La ciudad y los perros
Scraping URL: https://www.goodreads.com/book/show/60142.La_ciudad_y_los_perros
Extracted title: La ciudad y los perros
Extracted author: Mario Vargas Llosa


 70%|███████   | 70/100 [11:54<04:34,  9.16s/it]


Scraping book: To Your Scattered Bodies Go (Riverworld, #1)
Scraping URL: https://www.goodreads.com/book/show/189147.To_Your_Scattered_Bodies_Go
Extracted title: To Your Scattered Bodies Go
Extracted author: Philip José Farmer


 71%|███████   | 71/100 [12:06<04:47,  9.92s/it]


Scraping book: Maurice
Scraping URL: https://www.goodreads.com/book/show/3103.Maurice
Extracted title: Maurice
Extracted author: E.M. Forster


 72%|███████▏  | 72/100 [12:19<05:02, 10.79s/it]


Scraping book: Danny the Champion of the World
Scraping URL: https://www.goodreads.com/book/show/6690.Danny_the_Champion_of_the_World
Extracted title: Danny the Champion of the World
Extracted author: Roald Dahl


 73%|███████▎  | 73/100 [12:29<04:44, 10.54s/it]


Scraping book: Weaveworld
Scraping URL: https://www.goodreads.com/book/show/52640.Weaveworld
Extracted title: Weaveworld
Extracted author: Clive Barker


 74%|███████▍  | 74/100 [12:40<04:38, 10.70s/it]


Scraping book: The Mote in God's Eye (Moties, #1)
Scraping URL: https://www.goodreads.com/book/show/100365.The_Mote_in_God_s_Eye
Extracted title: The Mote in God's Eye
Extracted author: Larry Niven


 75%|███████▌  | 75/100 [12:49<04:14, 10.20s/it]


Scraping book: Second Foundation (Foundation, #3)
Scraping URL: https://www.goodreads.com/book/show/29580.Second_Foundation
Extracted title: Second Foundation
Extracted author: Isaac Asimov


 76%|███████▌  | 76/100 [12:59<04:00, 10.04s/it]


Scraping book: The Club Dumas
Scraping URL: https://www.goodreads.com/book/show/7194.The_Club_Dumas
Book container not found. Trying alternative selectors...
Extracted title: None
Extracted author: None


 77%|███████▋  | 77/100 [13:15<04:32, 11.85s/it]


Scraping book: Murder Must Advertise  (Lord Peter Wimsey, #10)
Scraping URL: https://www.goodreads.com/book/show/351559.Murder_Must_Advertise
Extracted title: Murder Must Advertise
Extracted author: Dorothy L. Sayers


 78%|███████▊  | 78/100 [13:24<04:04, 11.11s/it]


Scraping book: Madness and Civilization: A History of Insanity in the Age of Reason
Scraping URL: https://www.goodreads.com/book/show/51933.Madness_and_Civilization
Extracted title: Madness and Civilization: A History of Insanity in the Age of Reason
Extracted author: Michel Foucault


 79%|███████▉  | 79/100 [13:34<03:44, 10.70s/it]


Scraping book: Corduroy
Scraping URL: https://www.goodreads.com/book/show/231850.Corduroy
Extracted title: Corduroy
Extracted author: Don Freeman


 80%|████████  | 80/100 [13:44<03:30, 10.54s/it]


Scraping book: Straight Man
Scraping URL: https://www.goodreads.com/book/show/414298.Straight_Man
Extracted title: Straight Man
Extracted author: Richard Russo


 81%|████████  | 81/100 [13:59<03:48, 12.04s/it]


Scraping book: The Lords of Discipline
Scraping URL: https://www.goodreads.com/book/show/85443.The_Lords_of_Discipline
Extracted title: The Lords of Discipline
Extracted author: Pat Conroy


 82%|████████▏ | 82/100 [14:09<03:24, 11.36s/it]


Scraping book: True Grit
Scraping URL: https://www.goodreads.com/book/show/257845.True_Grit
Extracted title: True Grit
Extracted author: Charles Portis


 83%|████████▎ | 83/100 [14:18<03:02, 10.72s/it]


Scraping book: The Drawing of the Three (The Dark Tower, #2)
Scraping URL: https://www.goodreads.com/book/show/5094.The_Drawing_of_the_Three
Extracted title: The Drawing of the Three
Extracted author: Stephen        King


 84%|████████▍ | 84/100 [14:27<02:42, 10.15s/it]


Scraping book: Decline and Fall
Scraping URL: https://www.goodreads.com/book/show/30929.Decline_and_Fall
Extracted title: Decline and Fall
Extracted author: Evelyn Waugh


 85%|████████▌ | 85/100 [14:35<02:22,  9.51s/it]


Scraping book: Jacob Have I Loved
Scraping URL: https://www.goodreads.com/book/show/337058.Jacob_Have_I_Loved
Extracted title: Jacob Have I Loved
Extracted author: Katherine Paterson


 86%|████████▌ | 86/100 [14:45<02:13,  9.56s/it]


Scraping book: The Second Chronicles of Thomas Covenant (The Second Chronicles of Thomas Covenant #1-3)
Scraping URL: https://www.goodreads.com/book/show/228991.The_Second_Chronicles_of_Thomas_Covenant
Extracted title: The Second Chronicles of Thomas Covenant
Extracted author: Stephen R. Donaldson


 87%|████████▋ | 87/100 [14:54<02:02,  9.44s/it]


Scraping book: Go Tell It on the Mountain
Scraping URL: https://www.goodreads.com/book/show/17143.Go_Tell_It_on_the_Mountain
Extracted title: Go Tell It on the Mountain
Extracted author: James Baldwin


 88%|████████▊ | 88/100 [15:04<01:55,  9.64s/it]


Scraping book: Forever...
Scraping URL: https://www.goodreads.com/book/show/37743.Forever_
Extracted title: Forever...
Extracted author: Judy Blume


 89%|████████▉ | 89/100 [15:13<01:43,  9.41s/it]


Scraping book: The Mother Tongue: English and How It Got That Way
Scraping URL: https://www.goodreads.com/book/show/29.The_Mother_Tongue
Extracted title: The Mother Tongue: English and How It Got That Way
Extracted author: Bill Bryson


 90%|█████████ | 90/100 [15:25<01:40, 10.07s/it]


Scraping book: The Story of Ferdinand
Scraping URL: https://www.goodreads.com/book/show/773951.The_Story_of_Ferdinand
Extracted title: The Story of Ferdinand
Extracted author: Munro Leaf


 91%|█████████ | 91/100 [15:35<01:31, 10.11s/it]


Scraping book: Replay
Scraping URL: https://www.goodreads.com/book/show/341735.Replay
Extracted title: Replay
Extracted author: Ken Grimwood


 92%|█████████▏| 92/100 [15:46<01:23, 10.39s/it]


Scraping book: We the Living
Scraping URL: https://www.goodreads.com/book/show/668.We_the_Living
Extracted title: We the Living
Extracted author: Ayn Rand


 93%|█████████▎| 93/100 [15:57<01:13, 10.55s/it]


Scraping book: The Devil's Arithmetic
Scraping URL: https://www.goodreads.com/book/show/91357.The_Devil_s_Arithmetic
Extracted title: The Devil's Arithmetic
Extracted author: Jane Yolen


 94%|█████████▍| 94/100 [16:07<01:01, 10.31s/it]


Scraping book: The Haj
Scraping URL: https://www.goodreads.com/book/show/42691.The_Haj
Extracted title: The Haj
Extracted author: Leon Uris


 95%|█████████▌| 95/100 [16:20<00:56, 11.35s/it]


Scraping book: Moon Palace
Scraping URL: https://www.goodreads.com/book/show/447.Moon_Palace
Extracted title: Moon Palace
Extracted author: Paul Auster


 96%|█████████▌| 96/100 [16:30<00:43, 10.83s/it]


Scraping book: The Seat of the Soul
Scraping URL: https://www.goodreads.com/book/show/119760.The_Seat_of_the_Soul
Extracted title: The Seat of the Soul
Extracted author: Gary Zukav


 97%|█████████▋| 97/100 [16:40<00:31, 10.55s/it]


Scraping book: The History of Sexuality, Volume 1: An Introduction
Scraping URL: https://www.goodreads.com/book/show/1875.The_History_of_Sexuality_Volume_1
Extracted title: The History of Sexuality, Volume 1: An Introduction
Extracted author: Michel Foucault


 98%|█████████▊| 98/100 [16:55<00:23, 11.80s/it]


Scraping book: A Fire Upon the Deep (Zones of Thought, #1)
Scraping URL: https://www.goodreads.com/book/show/77711.A_Fire_Upon_the_Deep
Extracted title: A Fire Upon the Deep
Extracted author: Vernor Vinge


 99%|█████████▉| 99/100 [17:07<00:11, 11.95s/it]


Scraping book: The Blind Owl
Scraping URL: https://www.goodreads.com/book/show/45967.The_Blind_Owl
Extracted title: The Blind Owl
Extracted author: Sadegh Hedayat


100%|██████████| 100/100 [17:17<00:00, 10.37s/it]


Dataset saved to books_dataset.csv
Number of books saved: 100

First few entries:
                      title                author rating  \
0       Motherless Brooklyn       Jonathan Lethem   3.86   
1          Follow the River  James Alexander Thom   4.23   
2         Fantastic Mr. Fox            Roald Dahl   4.08   
3  A Girl of the Limberlost  Gene Stratton-Porter   4.17   
4          Love You Forever         Robert Munsch   4.38   

                                             summary  \
0  Lionel Essrog is Brooklyn’s very own self-appo...   
1  Mary Ingles was twenty-three, happily married,...   
2  Fantastic Mr. Fox is on the run! The three mea...   
3  Set amid Indiana's vast Limberlost Swamp, this...   
4  An extraordinarily different story by Robert M...   

                                              genres  \
0  [Fiction, Mystery, Crime, New York, Novels, Au...   
1  [Historical Fiction, Fiction, Historical, Adve...   
2  [Childrens, Fiction, Fantasy, Classics, Middle..




In [20]:
import pandas as pd
from tabulate import tabulate
import textwrap

class BookDataDisplay:
    """Console helpers for browsing a books dataset stored as CSV.

    The dataset is loaded into ``self.df``. Every method reads the
    ``title`` and ``author`` columns unconditionally; ``rating``,
    ``genres`` and ``summary`` are only used when they are present.
    """

    def __init__(self, csv_file='books_dataset.csv'):
        """Load the dataset.

        Args:
            csv_file: Path of the CSV file passed to ``pandas.read_csv``.
        """
        self.df = pd.read_csv(csv_file)

    @staticmethod
    def _print_core_fields(book):
        """Print title, author and (when the column exists) rating of a row."""
        print(f"Title: {book['title']}")
        print(f"Author: {book['author']}")
        if 'rating' in book:
            print(f"Rating: {book['rating']}")

    def print_basic_info(self):
        """Print the row count, the column names and the first few books."""
        print("\n=== Dataset Overview ===")
        print(f"Total number of books: {len(self.df)}")
        print(f"Columns available: {', '.join(self.df.columns)}")
        print("\nFirst few books:")
        print("----------------")

        # Number by position rather than by index label so the counter is
        # always 1, 2, 3... even if the frame has been filtered or re-indexed.
        for position, (_, row) in enumerate(self.df.head().iterrows(), start=1):
            print(f"\nBook {position}:")
            self._print_core_fields(row)
            print("-" * 50)

    def print_detailed_book(self, index):
        """Print every known field of the book at positional ``index``.

        Args:
            index: Integer position of the row (as understood by ``iloc``).

        An out-of-range position prints an error message instead of raising.
        """
        # Only the positional lookup can raise IndexError; keep the try small
        # so unrelated errors are not masked.
        try:
            book = self.df.iloc[index]
        except IndexError:
            print(f"Error: Book index {index} not found in dataset")
            return

        print("\n=== Detailed Book Information ===")
        self._print_core_fields(book)
        if 'genres' in book:
            print(f"Genres: {book['genres']}")
        print("\nSummary:")
        # A missing summary is read back from CSV as NaN (a float), hence the
        # isinstance check before wrapping.
        if 'summary' in book and isinstance(book['summary'], str):
            # Wrap summary text for better readability
            print(textwrap.fill(book['summary'], width=70))
        print("-" * 50)

    def print_tabulated(self, columns=None, max_rows=10):
        """Print the first ``max_rows`` rows as a grid table.

        Args:
            columns: Column names to show; defaults to title, author, rating.
                Names that are not in the dataset are silently dropped.
            max_rows: Maximum number of rows to print.
        """
        if columns is None:
            columns = ['title', 'author', 'rating']

        # Select only existing columns
        available_columns = [col for col in columns if col in self.df.columns]

        print("\n=== Books Table ===")
        print(tabulate(
            self.df[available_columns].head(max_rows),
            headers='keys',
            tablefmt='grid',
            showindex=True
        ))

    def search_books(self, keyword, column='title', regex=True):
        """Print the books whose ``column`` contains ``keyword`` (case-insensitive).

        Args:
            keyword: Text to look for. Interpreted as a regular expression
                when ``regex`` is true and the text is a valid pattern;
                otherwise matched literally.
            column: Name of the column to search. Non-text columns are
                compared through their string representation.
            regex: Set to ``False`` to force a literal substring match.

        An unknown column prints an error message and returns.
        """
        if column not in self.df.columns:
            print(f"Error: Column '{column}' not found in dataset")
            return

        import re  # local import: only needed to validate the pattern

        if regex:
            try:
                re.compile(keyword)
            except re.error:
                # Plain keywords such as "C++" are not valid patterns; fall
                # back to a literal match instead of crashing.
                regex = False

        values = self.df[column]
        # astype(str) lets numeric columns be searched too; the notna() mask
        # keeps missing cells from matching through their "nan" text.
        mask = values.notna() & values.astype(str).str.contains(
            keyword, case=False, na=False, regex=regex
        )
        matches = self.df[mask]

        print(f"\n=== Search Results for '{keyword}' in {column} ===")
        print(f"Found {len(matches)} matches:")

        # Count matches from 1; the DataFrame index label is the row's
        # position in the full dataset, not the ordinal of the match.
        for position, (_, book) in enumerate(matches.iterrows(), start=1):
            print(f"\nMatch {position}:")
            self._print_core_fields(book)
            print("-" * 30)

# Example usage: load the saved dataset and print a short report.
if __name__ == "__main__":
    display = BookDataDisplay(csv_file='books_dataset.csv')

    # Dataset overview, then the full record of the first book
    display.print_basic_info()
    display.print_detailed_book(index=0)

    # Other available views (disabled):
    # display.print_tabulated()
    # display.search_books('fantasy', 'genres')


=== Dataset Overview ===
Total number of books: 5
Columns available: title, author, rating, summary, genres, url

First few books:
----------------

Book 1:
Title: The Girl with the Dragon Tattoo
Author: Stieg Larsson
Rating: 4.17
--------------------------------------------------

Book 2:
Title: And Then There Were None
Author: Agatha Christie
Rating: 4.28
--------------------------------------------------

Book 3:
Title: Angels & Demons
Author: Dan    Brown
Rating: 3.95
--------------------------------------------------

Book 4:
Title: Rebecca
Author: Daphne du Maurier
Rating: 4.25
--------------------------------------------------

Book 5:
Title: In Cold Blood
Author: Truman Capote
Rating: 4.09
--------------------------------------------------

=== Detailed Book Information ===
Title: The Girl with the Dragon Tattoo
Author: Stieg Larsson
Rating: 4.17
Genres: ['Fiction', 'Mystery', 'Thriller', 'Crime', 'Mystery Thriller', 'Suspense', 'Contemporary']

Summary:
Harriet Vanger, a scio