In [None]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import os

# Render every column when a DataFrame is displayed (no truncation).
pd.options.display.max_columns = None

import warnings

# Suppress all warnings for the remainder of the session.
warnings.simplefilter("ignore")

## Links

In [None]:
# URLs of catalogue listing pages 1 through 25.
links = [
    f"https://books.toscrape.com/catalogue/category/books_1/page-{page}.html"
    for page in range(1, 26)
]
links

In [None]:
# Reachability check: print the HTTP status code returned for each listing page.
for page_url in links:
    print(requests.get(page_url).status_code)

In [None]:
# Download every listing page once and keep its parsed tree, keyed by page URL.
soup_objects = {}

for link in links:
    # A timeout keeps one stalled connection from hanging the whole notebook.
    response = requests.get(link, timeout=10)
    # Fail loudly on 4xx/5xx instead of silently parsing an error page,
    # which would yield zero books for that page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    soup_objects[link] = soup

## Titles, Prices, Ratings, Pictures

In [None]:
# Full book titles, read from the `title` attribute of each listing anchor,
# in page order.
titles = [
    anchor["title"]
    for soup in soup_objects.values()
    for anchor in soup.select("ol.row li h3 a")
]

titles

In [None]:
prices = []

# Iterate over the parsed pages themselves. Looping over `links` while reading
# a leftover `soup` variable would collect the same page's prices once per link.
for soup in soup_objects.values():
    for price_tag in soup.select("ol.row p.price_color"):
        # Strip the leading currency symbol and keep the number as a string.
        # NOTE(review): if the page text was decoded with the wrong charset the
        # pound sign can appear as the two characters "\u00c2\u00a3"; both are removed.
        prices.append(price_tag.get_text().lstrip("\u00c2\u00a3"))

prices

In [None]:
ratings = []

# Iterate over every parsed page (not a leftover `soup`), so each page
# contributes its own ratings exactly once.
for soup in soup_objects.values():
    for rating_tag in soup.find_all("p", class_="star-rating"):
        # The second CSS class on the tag is stored as the rating
        # (presumably a word such as "Three" — confirm against the page markup).
        ratings.append(rating_tag["class"][1])

ratings

In [None]:
from urllib.parse import urljoin

pictures = []

# Pair each page URL with its own parsed tree, so relative image paths are
# resolved against the page they actually came from.
for link, soup in soup_objects.items():
    for img in soup.select("div.image_container img"):
        # Only the absolute URL is stored; the image bytes are not needed here,
        # so no per-image download is performed.
        pictures.append(urljoin(link, img["src"]))

print(f"Collected {len(pictures)} images")

In [None]:
# One row per listed book; the four lists must have equal length,
# otherwise the DataFrame constructor raises.
books = pd.DataFrame(
    dict(
        Title=titles,
        Price=prices,
        Rating=ratings,
        Picture=pictures,
    )
)
books

## Product page links

In [None]:
# Turn each title into a URL slug: lowercase, delete the listed punctuation
# characters, and replace every space with a hyphen.
slug_table = str.maketrans(
    {**dict.fromkeys(":'#(),.&-*?!%\"", None), " ": "-"}
)

titles_link = [title.lower().translate(slug_table) for title in titles]

titles_link



In [None]:
# Build one product-page URL per book as "<slug>_<id>/index.html".
# `enumerate` gives every entry its own position; looking the slug up with
# `list.index` returns the first match, so duplicate titles would share an id
# (and the lookup is quadratic).
# NOTE(review): ids are assumed to count down from 1000 in listing order, and
# the hand-built slug is assumed to match the site's real slug — confirm.
links_pp = [
    "https://books.toscrape.com/catalogue/" + slug + "_" + str(1000 - position) + "/index.html"
    for position, slug in enumerate(titles_link)
]

links_pp

In [None]:
# Fetch and parse every product page exactly once, keyed by its URL.
# From here on `soup_objects` holds product pages, not listing pages.
soup_objects = {}

for link in links_pp:
    try:
        # Timeout so a single stalled request cannot hang the run.
        r = requests.get(link, timeout=10)
        # Treat 4xx/5xx as failures so error pages are never parsed as books.
        r.raise_for_status()
        soup_objects[link] = BeautifulSoup(r.text, "html.parser")
    except requests.RequestException as e:
        # Best effort: report the URL and continue with the remaining pages.
        print(f"Failed to fetch {link}: {e}")

In [None]:
titles_2 = []

# Append exactly one entry per fetched page, so this list stays positionally
# aligned with the other per-page lists built from `soup_objects`.
for soup in soup_objects.values():
    heading = soup.select_one("div.row h1")
    # None marks a page without a heading instead of silently skipping it.
    titles_2.append(heading.get_text(strip=True) if heading else None)

titles_2

In [None]:
genres = []

# Append exactly one entry per fetched page, so this list stays positionally
# aligned with the other per-page lists built from `soup_objects`.
for soup in soup_objects.values():
    crumbs = soup.select("ul.breadcrumb li a")
    # The third breadcrumb link is used as the category; None marks a page
    # whose breadcrumb has fewer than three links instead of skipping it.
    genres.append(crumbs[2].get_text(strip=True) if len(crumbs) >= 3 else None)

genres

In [None]:
# Number of product-page titles collected.
len(titles_2)

In [None]:
# Number of genres collected.
len(genres)

In [None]:
# One description per fetched product page: the text of the <p> that directly
# follows the #product_description element, or a fixed placeholder if absent.
description_nodes = (
    soup.select_one("#product_description + p") for soup in soup_objects.values()
)

descriptions = [
    node.get_text(strip=True) if node else "no description available"
    for node in description_nodes
]

descriptions

In [None]:
# Sanity check: the three per-page lists should match the number of fetched pages.
print("soup_objects length:", len(soup_objects))
# Report titles_2 itself (not the listing-page `titles`), as the label states.
print("titles_2 length:", len(titles_2))
print("genres length:", len(genres))
print("descriptions length:", len(descriptions))

In [None]:
# Number of descriptions collected.
len(descriptions)

In [None]:
# Product-page details, one row per fetched page; the three lists must have
# equal length, otherwise the DataFrame constructor raises.
books_2 = pd.DataFrame(
    dict(
        Title=titles_2,
        Genre=genres,
        Description=descriptions,
    )
)

books_2

In [None]:
#books_2.to_csv("../data/raw/books2.csv", index=False, encoding="utf-8", sep=",")

In [None]:
#books.to_csv("../data/raw/books1.csv", index=False, encoding="utf-8", sep=",")

In [None]:
# Join listing data with product-page data on the book title, keeping only
# titles present in both frames.
# NOTE(review): if a title occurs more than once on either side, the join
# produces every pairing of those rows — confirm titles are unique.
books_scraped = books.merge(books_2, how="inner", on="Title")
books_scraped

In [None]:
#books_scraped.to_csv("../data/raw/books_scraped.csv", index=False, encoding="utf-8", sep=",")

In [None]:
# Add a constant placeholder Author column to every scraped row.
books_scraped = books_scraped.assign(Author="not available")
books_scraped

In [None]:
# Final column order for the scraped data.
selected_col = [
    "Title",
    "Author",
    "Genre",
    "Price",
    "Rating",
    "Description",
    "Picture",
]

# Keep only those columns, in that order.
books_scraped_final = books_scraped.loc[:, selected_col]

books_scraped_final

In [None]:
# Load the five API exports and stack them in one pass, instead of repeating
# the same read/concat cell once per file.
api_csv_paths = [
    "../data/raw/books_random.csv",
    "../data/raw/books_random2.csv",
    "../data/raw/books_random3.csv",
    "../data/raw/books_random4.csv",
    "../data/raw/books_random5.csv",
]

# The individual frames stay bound under their original names.
books_api1, books_api2, books_api3, books_api4, books_api5 = (
    pd.read_csv(path) for path in api_csv_paths
)

# A single concat with a fresh 0..n-1 index gives the same result as chaining
# pairwise concats, each followed by reset_index(drop=True).
books_api = pd.concat(
    [books_api1, books_api2, books_api3, books_api4, books_api5],
    axis=0,
    ignore_index=True,
)
books_api

In [None]:
# Count rows whose Genre is missing.
books_api["Genre"].isna().sum()

In [None]:
# Keep only the rows that have a Genre value.
books_api = books_api[books_api["Genre"].notna()]
books_api

In [None]:
# Count rows whose Authors value is missing.
books_api["Authors"].isna().sum()

In [None]:
# Keep only the rows that have an Authors value.
books_api = books_api[books_api["Authors"].notna()]
books_api

In [None]:
# Count rows whose Price is missing.
books_api["Price"].isna().sum()

In [None]:
# Overwrite the column labels positionally so they match the scraped frame's
# column names. This raises if the frame does not have exactly seven columns.
# NOTE(review): assumes the CSV columns are already in this order — verify
# against the files, since a different order would mislabel the data silently.
books_api.columns = ["Title", "Author", "Genre", "Price", "Rating", "Description", "Picture"]
books_api

In [None]:
# Stack scraped rows on top of API rows and number the result 0..n-1.
books_final = pd.concat(
    [books_scraped_final, books_api],
    axis=0,
    ignore_index=True,
)
books_final

In [None]:
# Count rows of the combined frame whose Price is missing.
books_final["Price"].isna().sum()

In [None]:
# Count rows that are exact duplicates of an earlier row (all columns equal).
books_final.duplicated().sum()

In [None]:
# Keep the first occurrence of each fully identical row and discard the rest.
books_final = books_final.loc[~books_final.duplicated()]
books_final

In [None]:
#books_final.to_csv("../data/clean/books_final.csv", index=False, encoding="utf-8", sep=",")