In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import random

# Scrape Books to Scrape (First source)
def scrape_books_to_scrape(max_books=150):
    books = []
    url = 'https://books.toscrape.com/catalogue/page-1.html'
    base_url = 'https://books.toscrape.com/catalogue/'
    while url and len(books) < max_books:
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        articles = soup.find_all('article', class_='product_pod')
        for article in articles:
            if len(books) >= max_books:
                break
            title = article.h3.a['title']
            price = article.find('p', class_='price_color').text.strip()
            rating_class = article.find('p', class_='star-rating')['class'][1]
            availability = article.find('p', class_='instock availability').text.strip()
            # الدخول على صفحة الكتاب عشان نجيب الكاتيجوري
            detail_url = base_url + article.h3.a['href'].replace('../../../', '')
            detail_response = requests.get(detail_url)
            detail_soup = BeautifulSoup(detail_response.content, 'html.parser')
            category = detail_soup.find('ul', class_='breadcrumb').find_all('li')[2].text.strip()
            books.append({
                'title': title,
                'price': price,
                'rating': rating_class,
                'availability': availability,
                'category': category,
                'source': 'Books to Scrape'
            })
        next_button = soup.find('li', class_='next')
        if next_button:
            url = 'https://books.toscrape.com/catalogue/' + next_button.a['href']
        else:
            url = None
    return books

In [2]:
# Scrape OpenLibrary (Second source)
def scrape_openlibrary(max_books=150):
    books = []
    base_url = "https://openlibrary.org/search.json?q=python"
    response = requests.get(base_url)
    if response.status_code == 200:
        data = response.json()
        docs = data.get('docs', [])
        for doc in docs[:max_books]:
            title = doc.get('title', 'Unknown')
            price = f"£{random.randint(10, 100)}.00"  # Fake price
            rating = random.choice(['One', 'Two', 'Three', 'Four', 'Five'])
            availability = 'In stock'
            subjects = doc.get('subject')
            if subjects:
                category = subjects[0]
            else:
                category = 'Programming'
            books.append({
                'title': title,
                'price': price,
                'rating': rating,
                'availability': availability,
                'category': category,
                'source': 'OpenLibrary'
            })
    return books

In [3]:
# Scrape Google Books API (Third source)
def scrape_google_books_api(max_books=150):
    books = []
    base_url = "https://www.googleapis.com/books/v1/volumes?q=python&maxResults=40"  # Search query and max results
    response = requests.get(base_url)
    
    if response.status_code == 200:
        data = response.json()
        items = data.get('items', [])
        for item in items[:max_books]:
            title = item['volumeInfo'].get('title', 'Unknown')
            price = 'N/A'  # Google Books API does not provide price
            rating = item['volumeInfo'].get('averageRating', 'N/A')
            availability = 'In stock'  # Not available in Google Books API, so we assume it's in stock
            categories = item['volumeInfo'].get('categories', ['N/A'])
            category = ', '.join(categories)
            books.append({
                'title': title,
                'price': price,
                'rating': rating,
                'availability': availability,
                'category': category,
                'source': 'Google Books API'
            })
    return books

In [4]:
# Clean price function
def clean_price(price):
    return price.replace('Â', '').replace('Ã‚', '').replace('Â£', '£').strip()

# Get books from all sources
books1 = scrape_books_to_scrape(max_books=150)
books2 = scrape_openlibrary(max_books=150)
books3 = scrape_google_books_api(max_books=150)

# Combine all books
all_books = books1 + books2 + books3

# Clean prices
for book in all_books:
    book['price'] = clean_price(book['price'])

# Save to CSV
df = pd.DataFrame(all_books)
df.to_csv('books_data.csv', index=False, encoding='utf-8-sig')

# Read the saved file
df = pd.read_csv('books_data.csv')
print(df.head(10))


                                               title   price rating  \
0                               A Light in the Attic  £51.77  Three   
1                                 Tipping the Velvet  £53.74    One   
2                                         Soumission  £50.10    One   
3                                      Sharp Objects  £47.82   Four   
4              Sapiens: A Brief History of Humankind  £54.23   Five   
5                                    The Requiem Red  £22.65    One   
6  The Dirty Little Secrets of Getting Your Dream...  £33.34   Four   
7  The Coming Woman: A Novel Based on the Life of...  £17.93  Three   
8  The Boys in the Boat: Nine Americans and Their...  £22.60   Four   
9                                    The Black Maria  £52.15    One   

  availability            category           source  
0     In stock              Poetry  Books to Scrape  
1     In stock  Historical Fiction  Books to Scrape  
2     In stock             Fiction  Books to Scrape  
3 

In [5]:
print(df.columns)

Index(['title', 'price', 'rating', 'availability', 'category', 'source'], dtype='object')


In [6]:
print(df.isnull().sum())

title            0
price           40
rating          33
availability     0
category         1
source           0
dtype: int64


In [7]:
df.dropna(inplace=True)
print(df.isnull().sum())

title           0
price           0
rating          0
availability    0
category        0
source          0
dtype: int64


In [8]:
df['category'] = df['category'].apply(lambda x: x if isinstance(x, str) else 'Unknown')
