In [44]:
import requests
from bs4 import BeautifulSoup
import re
import json
import csv

# Base URL (without page number)
base_url = 'https://www.thriftbooks.com/browse/?11966col#b.s=mostPopular-desc&b.p={}&b.pp=50&b.col&b.f.t%5B%5D=11966&b.list'

# Create CSV
with open('thrift_books_data.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['Title', 'ISBN', 'Release Date', 'Publisher', 'Format', 'Condition', 'Price'])

    # Loop through pages (up to 12 or more if more pages are added)
    page_number = 1
    while True:
        # Generate URL for the current page
        url = base_url.format(page_number)
        print(f"Scraping page {page_number}: {url}")

        # Request the page
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find all <script> tags
        scripts = soup.find_all('script')

        # Find the script containing 'window.searchStoreV2'
        json_data = None
        for script in scripts:
            if script.string and 'window.searchStoreV2' in script.string:
                match = re.search(r'window\.searchStoreV2\s*=\s*(\{.*?\});', script.string, re.DOTALL)
                if match:
                    json_data = json.loads(match.group(1))
                break

        if not json_data:
            print("No data found on this page.")
            break  # Exit if no data is found (in case of future changes)

        # Get list of works (books)
        works = json_data.get('works', [])
        if not works:
            print("No books found on this page.")
            break  # Exit if no books are found (in case of future changes)

        # Write book data to CSV
        for book in works:
            title = book.get('title', '')
            isbn = book.get('iSBN', '')
            release_date = book.get('releaseDate', '')
            publisher = book.get('publisher', '')
            format_ = book.get('media', '')
            condition = 'Good'  # Data me nahi aata hamesha, default set kar diya
            price = book.get('buyNowPrice', '')

            writer.writerow([title, isbn, release_date, publisher, format_, condition, price])

        # Check if there is a next page (to avoid infinite loop)
        pagination_bar = soup.find('div', class_='Pagination-bar')
        if pagination_bar and 'is-disabled' not in pagination_bar.find_all('button', class_='Pagination-link')[-1].get('class', []):
            page_number += 1  # Move to the next page
        else:
            break  # Exit if no more pages

print('✅ CSV file created successfully with data from all pages!')


Scraping page 1: https://www.thriftbooks.com/browse/?11966col#b.s=mostPopular-desc&b.p=1&b.pp=50&b.col&b.f.t%5B%5D=11966&b.list
Scraping page 2: https://www.thriftbooks.com/browse/?11966col#b.s=mostPopular-desc&b.p=2&b.pp=50&b.col&b.f.t%5B%5D=11966&b.list
Scraping page 3: https://www.thriftbooks.com/browse/?11966col#b.s=mostPopular-desc&b.p=3&b.pp=50&b.col&b.f.t%5B%5D=11966&b.list
Scraping page 4: https://www.thriftbooks.com/browse/?11966col#b.s=mostPopular-desc&b.p=4&b.pp=50&b.col&b.f.t%5B%5D=11966&b.list
Scraping page 5: https://www.thriftbooks.com/browse/?11966col#b.s=mostPopular-desc&b.p=5&b.pp=50&b.col&b.f.t%5B%5D=11966&b.list
Scraping page 6: https://www.thriftbooks.com/browse/?11966col#b.s=mostPopular-desc&b.p=6&b.pp=50&b.col&b.f.t%5B%5D=11966&b.list
Scraping page 7: https://www.thriftbooks.com/browse/?11966col#b.s=mostPopular-desc&b.p=7&b.pp=50&b.col&b.f.t%5B%5D=11966&b.list
Scraping page 8: https://www.thriftbooks.com/browse/?11966col#b.s=mostPopular-desc&b.p=8&b.pp=50&b.col&b

ConnectTimeout: HTTPSConnectionPool(host='www.thriftbooks.com', port=443): Max retries exceeded with url: /browse/?11966col (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000002BFEE7BC2F0>, 'Connection to www.thriftbooks.com timed out. (connect timeout=None)'))