In [7]:
# Imports & Setup

import requests
import pandas as pd
from bs4 import BeautifulSoup
import os
import time
from requests.exceptions import RequestException


In [8]:
# Create Required Folders
os.makedirs("scraped_data", exist_ok=True)
os.makedirs("cleaned_data", exist_ok=True)


In [9]:
# Fetch Pages & Save HTML (SCRAPING)
headers = {
    "User-Agent": "Mozilla/5.0"
}

page_count = 1
MAX_RETRIES = 3

while True:
    url = f"https://quotes.toscrape.com/page/{page_count}/"
    print(f"Fetching page {page_count}...")

    success = False

    for attempt in range(1, MAX_RETRIES + 1):
        try:
            res = requests.get(url, headers=headers, timeout=10)
            if res.status_code == 200:
                success = True
                break
        except RequestException as e:
            print(f"Attempt {attempt} failed:", e)
            time.sleep(2)

    if not success:
        print(f"Skipping page {page_count} after {MAX_RETRIES} retries")
        page_count += 1
        continue

    soup = BeautifulSoup(res.text, "lxml")
    quotes = soup.select("div.quote")

    if not quotes:
        print("No more valid pages. Stopping.")
        break

    with open(f"scraped_data/quotes{page_count}.html", "w", encoding="utf-8") as f:
        f.write(res.text)

    print(f"Downloaded data from page {page_count}")

    page_count += 1
    time.sleep(1)


Fetching page 1...
Attempt 1 failed: HTTPSConnectionPool(host='quotes.toscrape.com', port=443): Max retries exceeded with url: /page/1/ (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000002547E75DBE0>, 'Connection to quotes.toscrape.com timed out. (connect timeout=10)'))
Downloaded data from page 1
Fetching page 2...
Downloaded data from page 2
Fetching page 3...
Attempt 1 failed: HTTPSConnectionPool(host='quotes.toscrape.com', port=443): Max retries exceeded with url: /page/3/ (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000002547E507610>, 'Connection to quotes.toscrape.com timed out. (connect timeout=10)'))
Downloaded data from page 3
Fetching page 4...
Downloaded data from page 4
Fetching page 5...
Attempt 1 failed: HTTPSConnectionPool(host='quotes.toscrape.com', port=443): Max retries exceeded with url: /page/5/ (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000002547E64F390>, 'Connect

In [12]:
# Read Saved HTML Files
files = sorted(os.listdir("scraped_data"))
files



['quotes1.html',
 'quotes10.html',
 'quotes2.html',
 'quotes3.html',
 'quotes4.html',
 'quotes5.html',
 'quotes6.html',
 'quotes7.html',
 'quotes8.html',
 'quotes9.html']

In [13]:
# Extract “Life” Quotes from All Pages

life_quotes = []

for file in files:
    if file.startswith("quotes") and file.endswith(".html"):
        print(f"Processing {file}")

        with open(f"scraped_data/{file}", "r", encoding="utf-8") as f:
            html_content = f.read()

        soup = BeautifulSoup(html_content, "lxml")
        all_quotes = soup.select("div.quote")

        for q in all_quotes:
            tags = [tag.get_text(strip=True) for tag in q.select(".tags .tag")]

            if "life" in tags:
                text = q.select_one("span.text").get_text(strip=True)
                author = q.select_one("small.author").get_text(strip=True)
                life_quotes.append([text, author])

print("Total life quotes collected:", len(life_quotes))


Processing quotes1.html
Processing quotes10.html
Processing quotes2.html
Processing quotes3.html
Processing quotes4.html
Processing quotes5.html
Processing quotes6.html
Processing quotes7.html
Processing quotes8.html
Processing quotes9.html
Total life quotes collected: 13


In [14]:
# Convert to DataFrame

df = pd.DataFrame(life_quotes, columns=["Quote", "Author"])
df.head()



Unnamed: 0,Quote,Author
0,“There are only two ways to live your life. On...,Albert Einstein
1,“It is better to be hated for what you are tha...,André Gide
2,“I'm the one that's got to die when it's time ...,Jimi Hendrix
3,“But better to get hurt by the truth than comf...,Khaled Hosseini
4,“This life is what you make it. No matter what...,Marilyn Monroe


In [15]:
# Save to CSV

df.to_csv("cleaned_data/life_quotes.csv", index=False, encoding="utf-8")
print("Saved to cleaned_data/life_quotes.csv")


Saved to cleaned_data/life_quotes.csv


In [16]:
# Sanity Check

df.tail()


Unnamed: 0,Quote,Author
8,"“Today you are You, that is truer than true. T...",Dr. Seuss
9,“Life is like riding a bicycle. To keep your b...,Albert Einstein
10,“Life isn't about finding yourself. Life is ab...,George Bernard Shaw
11,“Finish each day and be done with it. You have...,Ralph Waldo Emerson
12,“The fear of death follows from the fear of li...,Mark Twain
