# Python DA Assignment 3 – Web Scraping and PySpark
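This notebook contains the full solution: it scrapes book data (title, price, rating, availability, and genre) from the [Books to Scrape](https://books.toscrape.com) website with `requests` and BeautifulSoup, saves the result to `books_data.csv`, and then analyzes it with PySpark.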

In [None]:
!pip install requests beautifulsoup4 pandas pyspark

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

In [None]:
# Catalogue root; relative links found while scraping are appended to this prefix.
base_url = "https://books.toscrape.com/catalogue/"
# First listing page of the catalogue, where the crawl begins.
start_url = f"{base_url}page-1.html"
# Collected book records (one dict per book).
books = list()

def get_rating(tag):
    """Translate a star-rating element's CSS classes into a numeric rating.

    Args:
        tag: An object exposing ``.get('class')`` (for example a BeautifulSoup
            ``Tag``), or ``None`` when no rating element was found.

    Returns:
        The integer 1-5 matching the first of 'One'..'Five' present in the
        element's classes, or ``None`` if ``tag`` is ``None`` or carries none
        of those class names.
    """
    # A failed lookup yields None; treat it as "no rating" instead of
    # crashing with AttributeError.
    if tag is None:
        return None
    classes = tag.get('class') or []
    # enumerate gives the numeric value directly, avoiding a second list scan.
    for value, word in enumerate(('One', 'Two', 'Three', 'Four', 'Five'), start=1):
        if word in classes:
            return value
    return None

In [None]:
# Crawl the catalogue page by page, following the "next" link until it is gone.
# For each book on a listing page, its detail page is fetched to read the genre
# from the breadcrumb trail.

REQUEST_TIMEOUT = 10  # seconds; without a timeout a stalled connection hangs the cell

# Start from a fresh list and a local page cursor so this cell is idempotent:
# re-running it re-scrapes from start_url instead of appending duplicates.
books = []
page_url = start_url

# A Session reuses the underlying connection across the page and detail requests.
with requests.Session() as session:
    while page_url:
        response = session.get(page_url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()  # fail loudly rather than parsing an error page
        response.encoding = 'utf-8'
        soup = BeautifulSoup(response.text, 'html.parser')
        articles = soup.find_all('article', class_='product_pod')

        for article in articles:
            link = article.h3.a
            title = link['title']
            price_str = article.find('p', class_='price_color').text.strip()
            # Keep only digits and the decimal point (drops the currency symbol).
            price = float(re.sub(r'[^\d.]', '', price_str))
            rating = get_rating(article.find('p', class_='star-rating'))
            availability = article.find('p', class_='instock availability').text.strip()

            detail_url = base_url + link['href']
            detail_resp = session.get(detail_url, timeout=REQUEST_TIMEOUT)
            detail_resp.raise_for_status()
            detail_resp.encoding = 'utf-8'
            detail_soup = BeautifulSoup(detail_resp.text, 'html.parser')
            # The third breadcrumb entry is used as the genre.
            genre = detail_soup.select('ul.breadcrumb li')[2].text.strip()

            books.append({
                'Title': title,
                'Price': price,
                'Rating': rating,
                'Availability': availability,
                'Genre': genre
            })

        # Advance to the next listing page, or stop when there is no "next" link.
        next_btn = soup.find('li', class_='next')
        page_url = base_url + next_btn.a['href'] if next_btn else None

print("Books scraped:", len(books))
print("First 3 books:", books[:3])

In [None]:
# Build a pandas DataFrame from the scraped records.
df = pd.DataFrame(data=books)
# Write it to disk without the index column so the file holds only book fields.
df.to_csv(path_or_buf='books_data.csv', index=False)
# Preview the first rows as the cell output.
df.head()

In [None]:
from pyspark.sql import SparkSession

# Obtain a Spark session (created if needed) under the "Books Analysis" app name.
spark = (
    SparkSession.builder
    .appName("Books Analysis")
    .getOrCreate()
)

In [None]:
# Load the scraped CSV into Spark, using the header row for column names and
# letting Spark infer the column types.
# The file was written by pandas, which escapes an embedded double quote by
# doubling it (""). Spark's CSV reader expects a backslash escape by default,
# so escape='"' is set explicitly; otherwise titles that contain quotes are
# parsed incorrectly.
df_spark = spark.read.csv(
    'books_data.csv',
    header=True,
    inferSchema=True,
    quote='"',
    escape='"',
)
df_spark.show(5)

In [None]:
# Print the column names and the types Spark inferred while reading the CSV.
df_spark.printSchema()

In [None]:
# Display Spark's summary statistics for the loaded books DataFrame.
df_spark.describe().show()

In [None]:
# Show the first five books whose price is above 20.
df_spark.where(df_spark['Price'] > 20).show(n=5)

In [None]:
# Show the first five books rated four stars or higher.
df_spark.where(df_spark['Rating'] >= 4).show(n=5)