In [1]:
import csv
import json
import os
import random
import re
from datetime import datetime
from time import sleep
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
# Selection weights for the ratings 1, 2, 3, 4, 5 (in that order). They are
# passed to random.choices() in OpenLibraryScraper.scrape_book, so the stored
# rating is drawn at random rather than scraped from the page. The weights
# sum to 1.0 and strongly favour ratings 4 and 5.
PROBABILITIES = [0.005, 0.05, 0.1, 0.4, 0.445]

def download_image(source, filename, timeout=30):
    """Download the resource at ``source`` and write its bytes to ``filename``.

    The HTTP request is completed and checked *before* the target file is
    opened, so a failed download no longer leaves an empty or bogus file
    behind. The parent directory of ``filename`` is created when missing.

    Args:
        source: URL of the image to fetch.
        filename: Path of the file to write (opened in binary mode).
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        OSError: If the directory or file cannot be created/written.
    """
    response = requests.get(source, timeout=timeout)
    # Refuse to store an error page (404, 500, ...) under an image filename.
    response.raise_for_status()

    parent_dir = os.path.dirname(filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(filename, 'wb') as f:
        f.write(response.content)
    print('Downloaded image at:', filename)
        

class OpenLibraryScraper:
    """Scrape book metadata and images from Open Library work pages.

    Every successfully scraped book is stored as a dictionary in
    ``self.books_data``; the collection can be exported as a pandas
    DataFrame or written to a CSV file.
    """

    # Seconds to wait for an HTTP response. Without a timeout a stalled
    # connection would block the whole scraping run indefinitely.
    REQUEST_TIMEOUT = 30

    # Patterns applied to the whitespace-normalised text of the
    # ``div.edition-omniline`` element. The publisher capture is lazy and
    # stops right before the next field label (or the end of the text).
    # NOTE(review): a publisher whose name itself contains the standalone
    # word "Language" or "Pages" would be cut short -- confirm acceptable.
    _PUBLISH_DATE_PATTERN = r"Publish Date (\w* \d{1,2}, )?(?P<publish_date>\d{4})"
    _PUBLISHER_PATTERN = r"Publisher (?P<publisher>.*?)(?= (?:Language|Pages|Publish Date)\b|$)"
    _PAGES_PATTERN = r"Pages (?P<pages>\d+)"
    _LANGUAGE_PATTERN = r"Language (?P<language>[^\s]+)"

    def __init__(self):
        """Set up request headers, the result list and the site base URL."""
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        self.books_data = []
        self.base_url = "https://openlibrary.org"

    def get_soup(self, url):
        """Fetch ``url`` and return it parsed as a BeautifulSoup object.

        Returns:
            The parsed document, or ``None`` when the request fails, times
            out, or the server answers with an HTTP error status (the error
            is printed).
        """
        try:
            response = requests.get(url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
            return BeautifulSoup(response.text, 'html.parser')
        except requests.RequestException as e:
            print(f"Error fetching URL: {e}")
            return None

    def clean_text(self, text):
        """Collapse every whitespace run in ``text`` to one space and strip it.

        Returns an empty string for ``None`` or empty input.
        """
        if text:
            return re.sub(r'\s+', ' ', text).strip()
        return ''

    @staticmethod
    def _search_group(pattern, text, group):
        """Return the named ``group`` of the first match of ``pattern``, else ``None``."""
        match = re.search(pattern, text)
        return match.group(group) if match else None

    @staticmethod
    def _absolute_image_url(src):
        """Turn an ``<img src>`` value into a URL, or ``None`` when it is empty.

        Only protocol-relative values ("//host/path") get the "https:" prefix;
        anything else is returned unchanged so an already absolute URL is not
        corrupted.
        """
        if not src:
            return None
        if src.startswith('//'):
            return 'https:' + src
        return src

    @staticmethod
    def _safe_filename(title):
        """Replace characters that are invalid in file names with underscores."""
        cleaned = re.sub(r'[\\/:*?"<>|]+', '_', title).strip()
        return cleaned or 'untitled'

    def _parse_edition_info(self, pub_text):
        """Extract publisher, year, language and page count from edition text.

        Returns:
            A dict holding only the fields that were found, keyed like the
            book record: ``publisher``, ``publish_date``, ``languages`` (a
            one-element list) and ``page_count``.
        """
        text = self.clean_text(pub_text)
        language = self._search_group(self._LANGUAGE_PATTERN, text, 'language')
        found = {
            'publisher': self.clean_text(
                self._search_group(self._PUBLISHER_PATTERN, text, 'publisher')
            ),
            'publish_date': self._search_group(self._PUBLISH_DATE_PATTERN, text, 'publish_date'),
            # Keep the declared list type of the 'languages' field.
            'languages': [language] if language else None,
            'page_count': self._search_group(self._PAGES_PATTERN, text, 'pages'),
        }
        return {key: value for key, value in found.items() if value}

    def scrape_book(self, url):
        """Scrape one book page, download its image and record the result.

        Books whose image is missing or not hosted on openlibrary.org are
        skipped. The ``rating`` field is not scraped: it is drawn at random
        from 1-5 using the module-level ``PROBABILITIES`` weights.

        Returns:
            The book dictionary (also appended to ``self.books_data``), or
            ``None`` when the page could not be fetched, has no usable image,
            or any error occurred while scraping (the error is printed).
        """
        soup = self.get_soup(url)
        if soup is None:
            return None

        try:
            book_data = {
                'url': url,
                'title': '',
                'authors': [],
                'publisher': '',
                'publish_date': '',
                'category': [],
                'rating': '',
                'page_count': '',
                'languages': [],
                'image_url': '',
                # Always present so every record has the same keys.
                'description': ''
            }

            # Title
            title_elem = soup.select_one('h1.work-title')
            if title_elem is not None:
                book_data['title'] = self.clean_text(title_elem.text)

            # Image URL -- a missing <img> or src is treated as "no image"
            # instead of raising inside re.search.
            img_elem = soup.find('img', attrs={'itemprop': 'image'})
            img_src = img_elem.get('src') if img_elem is not None else None
            img_url = self._absolute_image_url(img_src)
            if not img_url or not re.search(r"openlibrary\.org", img_url):
                print('No image found for:', book_data['title'])
                return None
            book_data['image_url'] = img_url
            download_image(img_url, f"images/{self._safe_filename(book_data['title'])}.jpg")

            # Rating (randomly generated, see docstring)
            book_data['rating'] = random.choices([1, 2, 3, 4, 5], PROBABILITIES, k=1)[0]

            # Authors -- de-duplicated in page order, empty link texts dropped.
            author_names = (
                self.clean_text(author.text)
                for author in soup.select('a[href^="/author"]')
            )
            book_data['authors'] = list(dict.fromkeys(name for name in author_names if name))

            # Publisher, publish date, language and page count
            publish_info = soup.select('div.edition-omniline')
            if publish_info:
                book_data.update(self._parse_edition_info(publish_info[0].text))

            # category
            book_data['category'] = [
                self.clean_text(subject.text)
                for subject in soup.select('a[href^="/subjects/"]')
            ]

            # Description
            desc_elem = soup.select_one('div[itemprop="description"]')
            if desc_elem is not None:
                book_data['description'] = self.clean_text(desc_elem.text)

            # Page count -- the dedicated element wins over the regex value.
            pages_elem = soup.select_one('span[itemprop="numberOfPages"]')
            if pages_elem is not None:
                book_data['page_count'] = self.clean_text(pages_elem.text)

            self.books_data.append(book_data)
            return book_data

        except Exception as e:
            # Deliberate best-effort: one broken page must not stop a batch.
            print(f"Error scraping book {url}: {e}")
            return None

    def search_books(self, query, max_results=5):
        """Search Open Library and return the URLs of the first results.

        Args:
            query: Free-text search string; it is URL-encoded before use so
                characters such as ``&`` or ``#`` cannot break the URL.
            max_results: Maximum number of URLs to return.

        Returns:
            A list of absolute work URLs (empty when the search page could
            not be fetched).
        """
        search_url = f"{self.base_url}/search?q={quote_plus(query)}&mode=everything"
        soup = self.get_soup(search_url)
        if soup is None:
            return []

        results = soup.select('h3.booktitle a[href^="/works/"]')
        return [self.base_url + result['href'] for result in results[:max_results]]

    def scrape_books(self, urls, delay=2):
        """Scrape several book pages in sequence.

        Args:
            urls: Iterable of book page URLs.
            delay: Seconds to pause after each page (be nice to the server).

        Returns:
            A DataFrame with every book scraped by this instance so far.
        """
        for index, url in enumerate(urls, start=1):
            print(index)
            print(f"Scraping: {url}")
            self.scrape_book(url)
            sleep(delay)  # Be nice to the server

        return self.export_to_dataframe()

    def export_to_dataframe(self):
        """Return the scraped records as a pandas DataFrame."""
        return pd.DataFrame(self.books_data)

    def save_to_csv(self, filename="books_data.csv"):
        """Write the scraped records to ``filename`` as CSV (no index column)."""
        df = self.export_to_dataframe()
        df.to_csv(filename, index=False)
        print(f"Data saved to {filename}")




In [2]:
import pandas as pd

# Read the CSV of book page URLs and keep its "url" column as a plain list.
df = pd.read_csv("python_book_urls.csv")
book_urls = df["url"].to_list()

# Show the first five entries as a quick sanity check.
print(book_urls[:5])

['https://openlibrary.org/works/OL25421708W', 'https://openlibrary.org/works/OL18593468W?edition=ia%3Afriend0000hill', 'https://openlibrary.org/works/OL3098719W?edition=ia%3Acompletedistance0000tull', 'https://openlibrary.org/works/OL17364743W?edition=ia%3Ahiddenoracle0000rior_l5g3', 'https://openlibrary.org/works/OL16808392W?edition=ia%3Ahouseofhades0000rior_f4t9']


In [None]:

if __name__ == "__main__":
    # Scrape one batch of the loaded URL list (entries 20-39), then write
    # everything the scraper collected to a CSV file.
    scraper = OpenLibraryScraper()
    df = scraper.scrape_books(urls=book_urls[20:40])
    scraper.save_to_csv(filename="open_library_books.csv")

1
Scraping: https://openlibrary.org/works/OL18738896W?edition=ia%3Awhirlingcircleso0000alle
Downloaded image at: images/The Whirling Circles of Ba Gua Zhang.jpg
2
Scraping: https://openlibrary.org/works/OL2679070W?edition=ia%3Aopeningenergygat0000fran
Downloaded image at: images/Opening the energy gates of your body.jpg
3
Scraping: https://openlibrary.org/works/OL17452319W?edition=ia%3Adragontigermedic0000fran
Downloaded image at: images/Dragon And Tiger Medical Qigong Health And Energy In Seven Simple Movements.jpg
4
Scraping: https://openlibrary.org/works/OL3411239W?edition=ia%3Abarefootrunnerli0000ramb
Downloaded image at: images/Barefoot Runner.jpg
5
Scraping: https://openlibrary.org/works/OL16804289W?edition=ia%3Abegynnelse0000danb
Downloaded image at: images/Begynnelse.jpg
6
Scraping: https://openlibrary.org/works/OL34199971W
No image found for: Ask for Andrea
7
Scraping: https://openlibrary.org/works/OL23286131W?edition=ia%3Aofficialrocknrol0000frai
Downloaded image at: images/T