In [2]:
# Beautiful soup parses HTML
from bs4 import BeautifulSoup
# Requests allows getting of website requests
import requests
# Books to scrape link
books_to_scrape = "https://books.toscrape.com/"
# Parse into HTML
parsed_data = requests.get(books_to_scrape)
# Text of parsed data
data = parsed_data.text
# Organizes data into readable format
data_bs = BeautifulSoup(data, "html.parser")

In [3]:
# Get book titles
book_titles = data_bs.find_all('h3')
# Empty titles list
titles = []

# Loop each title in book_titles
for title in book_titles:
  # Find 'a' tag of each title
    title_link = title.find('a')
    # Execute if a tag exists
    if title_link:
        # Get title link text and strip it
        book_name = title_link.text.strip()
        # Append book name to titles list
        titles.append(book_name)

In [4]:
# Get ratings HTML class
select_class = "star-rating"
# Get book ratings
book_ratings = data_bs.find_all('p', {'class': select_class})
# Empty ratings list
ratings = []
# Dictionary that maps names to numbers
rating_dict = {
    'One': 1,
    'Two': 2,
    'Three': 3,
    'Four': 4,
    'Five': 5
}

# Loop each rating in book_ratings
for rating in book_ratings:
  # Get rating name based on class 2nd element
  rating_name = rating['class'][1]
  # Convert name to number
  rating_number = rating_dict.get(rating_name)
  # Append rating number to ratings list
  ratings.append(rating_number)

In [25]:
# Get price HTML class
select_class = "price_color"
# Get book prices
book_prices = data_bs.find_all('p', {'class': select_class})
# Empty prices list
prices = []

# Loop each price in book_prices
for price in book_prices:
   # Price number is price text
   price_number = float(price.text[2:])
   # Append price number to prices list
   prices.append(price_number)

In [6]:
# Get availability HTML class
select_class = "instock availability"
# Get book availabilities
book_availabilities = data_bs.find_all('p', {'class': select_class})
# Empty availabilities list
availabilities = []

# Loop each availability in book_availabilities
for availability in book_availabilities:
   # Availability status is stripped availability text
   availability_status = availability.text.strip()
   # Append availability status to availabilities list
   availabilities.append(availability_status)

In [26]:
# Import pandas as pd for data science
import pandas as pd
# Dataframe dictionary of labels and values
dataframe = {
    "Titles": titles,
    "Ratings": ratings,
    "Prices (£)": prices,
    "Availabilities": availabilities
}
# Create dataframe from webscraped features
df = pd.DataFrame(dataframe)

What is the size of the dataset?

In [19]:
# Size of dataset
print(df.size)

80


What are the names and data types of each column?

In [27]:
# Names and data types of each column
print(df.dtypes)

Titles             object
Ratings             int64
Prices (£)        float64
Availabilities     object
dtype: object


How many unique values are there for each categorical variable?

In [30]:
# Number of unique values for each categorical variable
print(df.nunique())

Titles            20
Ratings            5
Prices (£)        20
Availabilities     1
dtype: int64


If there is any numerical value in the dataset, what are the minimum and maximum values for it?

In [43]:
# Minimum and maximum values of ratings
ratings_stats = df['Ratings'].agg(['min', 'max'])
# Display ratings min and max
print("Ratings\n" + str(ratings_stats))
# Minimum and maximum values of prices
prices_stats = df['Prices (£)'].agg(['min', 'max'])
# Display prices min and max
print("\nPrices (£)\n" + str(prices_stats))

Ratings
min    1
max    5
Name: Ratings, dtype: int64

Prices (£)
min    13.99
max    57.25
Name: Prices (£), dtype: float64


Drop rows that have missing values

In [37]:
# Drop rows with missing values
df.dropna()

Unnamed: 0,Titles,Ratings,Prices (£),Availabilities
0,A Light in the ...,3,51.77,In stock
1,Tipping the Velvet,1,53.74,In stock
2,Soumission,1,50.1,In stock
3,Sharp Objects,4,47.82,In stock
4,Sapiens: A Brief History ...,5,54.23,In stock
5,The Requiem Red,1,22.65,In stock
6,The Dirty Little Secrets ...,4,33.34,In stock
7,The Coming Woman: A ...,3,17.93,In stock
8,The Boys in the ...,4,22.6,In stock
9,The Black Maria,1,52.15,In stock


What are the most frequent categories in the data? Write your observation in words.

Book titles and prices have no frequent categories because all values are unique. The most frequent book rating is "1", which means one star. The most frequent category are book availabilities because all of them are "in stock."