### Web Scraping Using BeautifulSoup

In this notebook we scrape the book listings (title, star rating, and price) from the Books to Scrape website (https://books.toscrape.com).

### Import the Libraries

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# Scrape every catalogue page of Books to Scrape into `books`,
# a list of [title, star-rating word, price] rows.
books = []

# The catalogue has 50 pages, numbered 1..50 in the URL.
# One Session reuses the underlying HTTP connection for all page requests.
with requests.Session() as session:
    for i in range(1, 51):
        url = f"https://books.toscrape.com/catalogue/page-{i}.html"
        # Bound the wait so a stalled connection cannot hang the notebook.
        response = session.get(url, timeout=30)
        # Fail loudly on 4xx/5xx instead of silently parsing an error page.
        response.raise_for_status()
        # Pass the raw bytes so BeautifulSoup detects the page encoding itself.
        soup = BeautifulSoup(response.content, 'html.parser')
        ol = soup.find('ol')
        if ol is None:
            raise ValueError(f"No product list (<ol>) found on {url}")
        for article in ol.find_all('article', class_='product_pod'):
            # The book title is stored in the cover image's alt text.
            title = article.find('img').attrs['alt']
            # The rating tag carries two classes: 'star-rating' and the rating word.
            star = article.find('p', class_='star-rating')['class'][1]
            price_text = article.find('p', class_='price_color').text
            # Keep only digits and the decimal point, so the currency symbol
            # (and any stray encoding artefact in front of it) is discarded.
            price = float(''.join(ch for ch in price_text if ch in '0123456789.'))
            books.append([title, star, price])

In [3]:
# Build the table: one row per scraped book, with named columns
df = pd.DataFrame(
    data=books,
    columns=['Title', 'Star Rating', 'Price'],
)

# Persist the table as books.csv.
# The row index is written as well, as a leading unnamed column.
df.to_csv('books.csv')

In [4]:
# Reload books.csv from disk and display it to check what was actually written
df_books = pd.read_csv('books.csv')
df_books

Unnamed: 0.1,Unnamed: 0,Title,Star Rating,Price
0,0,A Light in the Attic,Three,51.77
1,1,Tipping the Velvet,One,53.74
2,2,Soumission,One,50.10
3,3,Sharp Objects,Four,47.82
4,4,Sapiens: A Brief History of Humankind,Five,54.23
...,...,...,...,...
995,995,Alice in Wonderland (Alice's Adventures in Won...,One,55.53
996,996,"Ajin: Demi-Human, Volume 1 (Ajin: Demi-Human #1)",Four,57.06
997,997,A Spy's Devotion (The Regency Spies of London #1),Five,16.97
998,998,1st to Die (Women's Murder Club #1),One,53.98


In [5]:
# Drop the 'Unnamed: 0' column (the old row index that was saved into books.csv).
# Reassigning instead of inplace=True, together with errors='ignore', keeps this
# cell safe to re-run: a second run is a no-op rather than a KeyError.
df_books = df_books.drop(columns='Unnamed: 0', errors='ignore')

In [6]:
# Display the frame again to confirm the 'Unnamed: 0' column is gone
df_books

Unnamed: 0,Title,Star Rating,Price
0,A Light in the Attic,Three,51.77
1,Tipping the Velvet,One,53.74
2,Soumission,One,50.10
3,Sharp Objects,Four,47.82
4,Sapiens: A Brief History of Humankind,Five,54.23
...,...,...,...
995,Alice in Wonderland (Alice's Adventures in Won...,One,55.53
996,"Ajin: Demi-Human, Volume 1 (Ajin: Demi-Human #1)",Four,57.06
997,A Spy's Devotion (The Regency Spies of London #1),Five,16.97
998,1st to Die (Women's Murder Club #1),One,53.98


In [7]:
# Translate the Star Rating words into digit strings ('One' -> '1', ...);
# the values are still text after this step.
df_books['Star Rating'] = df_books['Star Rating'].replace(
    {'One': '1', 'Two': '2', 'Three': '3', 'Four': '4', 'Five': '5'}
)

In [8]:
# Cast the Star Rating column from object (digit strings) to int32
df_books = df_books.astype({'Star Rating': 'int32'})

In [9]:
# Summarise column dtypes and non-null counts to verify the Star Rating cast
df_books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Title        1000 non-null   object 
 1   Star Rating  1000 non-null   int32  
 2   Price        1000 non-null   float64
dtypes: float64(1), int32(1), object(1)
memory usage: 19.7+ KB


In [10]:
# Display the cleaned frame, now with an integer Star Rating column
df_books

Unnamed: 0,Title,Star Rating,Price
0,A Light in the Attic,3,51.77
1,Tipping the Velvet,1,53.74
2,Soumission,1,50.10
3,Sharp Objects,4,47.82
4,Sapiens: A Brief History of Humankind,5,54.23
...,...,...,...
995,Alice in Wonderland (Alice's Adventures in Won...,1,55.53
996,"Ajin: Demi-Human, Volume 1 (Ajin: Demi-Human #1)",4,57.06
997,A Spy's Devotion (The Regency Spies of London #1),5,16.97
998,1st to Die (Women's Murder Club #1),1,53.98


In [11]:
# Save the cleaned table.
# index=False keeps the row index out of the file, so reading Books_Clean.csv
# back does not produce a spurious 'Unnamed: 0' column.
df_books.to_csv('Books_Clean.csv', index=False)