<a href="https://colab.research.google.com/github/Hemavarna-S/Data_Science/blob/main/Web_Scraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [25]:
pip install beautifulsoup4



In [26]:
from bs4 import BeautifulSoup as bs

In [27]:
import requests
import pandas as pd
import time
import random

In [28]:
BASE_URL = "http://books.toscrape.com/catalogue/page-{}.html"
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Window NT 10.0; Win64; x64)"
                  "AppleWebkit/537.36 (KHTML, like Gecko)"
                  "Chrome/114.0.0.0 Safari/537.36"
}

In [29]:
def convert_rating(rating_text):
  ratings = {
      "One" : 1,
      "Two" : 2,
      "Three" : 3,
      "Four" : 4,
      "Five" : 5
  }
  return ratings.get(rating_text, None)

In [30]:
def fetch_page(page_num):
  url = BASE_URL.format(page_num)
  print(f"Fetching:{url}")
  try:
    res = requests.get(url, headers=HEADERS)
    res.raise_for_status()
    soup = bs(res.text,'html.parser')
    return soup
  except requests.exceptions.RequestException as e:
    print(f"Failed to fetch {url} : {e}")
    return None

In [31]:
def extract_books(soup):
  books = []
  articles = soup.find_all('article',class_='product_pod')
  for article in articles:
    try:
      title = article.h3.a['title']
      price = article.find('p',class_='price_color').text[1:]
      availability = article.find("p",class_='instock availability').text[1:]
      rating = article.find("p",class_='star-rating')['class'][1]
      books.append({
          'Title':title,
          'Price($)':price,
          'Availability':availability,
          'Rating':convert_rating(rating)
      })
    except Exception as e:
      print(f"Failed to extract book details: {e}")

  return books

In [32]:
def scrape_books(max_pages=5):
  all_books=[]
  for i in range(1,max_pages+1):
    soup = fetch_page(i)
    if soup:
      books = extract_books(soup)
      all_books.extend(books)
      time.sleep(random.uniform(1.5,3))
    else:
      print(f"Failed to extract book data:{e}")
      break
  return books

In [36]:
if __name__ == "__main__":
  max_pages = 5
  books_data = scrape_books(max_pages)
  df = pd.DataFrame(books_data)
  print(df.head)
  df.to_csv('books.csv',index=False)

Fetching:http://books.toscrape.com/catalogue/page-1.html
Fetching:http://books.toscrape.com/catalogue/page-2.html
Fetching:http://books.toscrape.com/catalogue/page-3.html
Fetching:http://books.toscrape.com/catalogue/page-4.html
Fetching:http://books.toscrape.com/catalogue/page-5.html
<bound method NDFrame.head of                                                 Title Price($)  \
0   Princess Jellyfish 2-in-1 Omnibus, Vol. 01 (Pr...   £13.61   
1    Princess Between Worlds (Wide-Awake Princess #5)   £13.34   
2                         Pop Gun War, Volume 1: Gift   £18.97   
3   Political Suicide: Missteps, Peccadilloes, Bad...   £36.28   
4                                            Patience   £10.16   
5   Outcast, Vol. 1: A Darkness Surrounds Him (Out...   £15.44   
6   orange: The Complete Collection 1 (orange: The...   £48.41   
7   Online Marketing for Busy Authors: A Step-By-S...   £46.35   
8                                 On a Midnight Clear   £14.07   
9                        