In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [10]:
import pandas as pd

# Quotes To Scrape

### Send a Request to the Website

In [3]:
url = 'https://quotes.toscrape.com/'
response = requests.get(url)
html_content = response.content


In [7]:
html_content

b'<!DOCTYPE html>\n<html lang="en">\n<head>\n\t<meta charset="UTF-8">\n\t<title>Quotes to Scrape</title>\n    <link rel="stylesheet" href="/static/bootstrap.min.css">\n    <link rel="stylesheet" href="/static/main.css">\n</head>\n<body>\n    <div class="container">\n        <div class="row header-box">\n            <div class="col-md-8">\n                <h1>\n                    <a href="/" style="text-decoration: none">Quotes to Scrape</a>\n                </h1>\n            </div>\n            <div class="col-md-4">\n                <p>\n                \n                    <a href="/login">Login</a>\n                \n                </p>\n            </div>\n        </div>\n    \n\n<div class="row">\n    <div class="col-md-8">\n\n    <div class="quote" itemscope itemtype="http://schema.org/CreativeWork">\n        <span class="text" itemprop="text">\xe2\x80\x9cThe world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.\xe2\x80\

### Parse the HTML Content

In [4]:
# Beautifulsoup analyzes and processes the content of a webpage
#This makes it possible to extract or manipulate parts of the web page programmatically.
soup = BeautifulSoup(html_content, 'html.parser')


In [5]:
soup

<!DOCTYPE html>

<html lang="en">
<head>
<meta charset="utf-8"/>
<title>Quotes to Scrape</title>
<link href="/static/bootstrap.min.css" rel="stylesheet"/>
<link href="/static/main.css" rel="stylesheet"/>
</head>
<body>
<div class="container">
<div class="row header-box">
<div class="col-md-8">
<h1>
<a href="/" style="text-decoration: none">Quotes to Scrape</a>
</h1>
</div>
<div class="col-md-4">
<p>
<a href="/login">Login</a>
</p>
</div>
</div>
<div class="row">
<div class="col-md-8">
<div class="quote" itemscope="" itemtype="http://schema.org/CreativeWork">
<span class="text" itemprop="text">“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”</span>
<span>by <small class="author" itemprop="author">Albert Einstein</small>
<a href="/author/Albert-Einstein">(about)</a>
</span>
<div class="tags">
            Tags:
            <meta class="keywords" content="change,deep-thoughts,thinking,world" itemprop="keywords"/>
<a class="

### Extract Authors and quotes on the first Page

In [8]:
# Find all quotes using a CSS selector
quotes = soup.select('span.text')
authors = soup.select('small.author')

## Alternatively using the "find_all()2
#quotes = soup.find_all('span', class_='text')
#authors = soup.find_all('small', class_='author')


### Print the Extracted Data

In [9]:
for quote, author in zip(quotes, authors):
    print(f'Quote: {quote.text}\nAuthor: {author.text}\n')


Quote: “The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”
Author: Albert Einstein

Quote: “It is our choices, Harry, that show what we truly are, far more than our abilities.”
Author: J.K. Rowling

Quote: “There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”
Author: Albert Einstein

Quote: “The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.”
Author: Jane Austen

Quote: “Imperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.”
Author: Marilyn Monroe

Quote: “Try not to become a man of success. Rather become a man of value.”
Author: Albert Einstein

Quote: “It is better to be hated for what you are than to be loved for what you are not.”
Author: André Gide

Quote: “I have not failed. I've just found 10,000 ways that won't work.”
Author: Thomas

In [11]:
# Create a list to store the data
data = []
for quote, author in zip(quotes, authors):
    data.append({
        'Quote': quote.text,
        'Author': author.text
    })

# Create a DataFrame from the list of dictionaries
df = pd.DataFrame(data)

# Print the DataFrame
df

Unnamed: 0,Quote,Author
0,“The world as we have created it is a process ...,Albert Einstein
1,"“It is our choices, Harry, that show what we t...",J.K. Rowling
2,“There are only two ways to live your life. On...,Albert Einstein
3,"“The person, be it gentleman or lady, who has ...",Jane Austen
4,"“Imperfection is beauty, madness is genius and...",Marilyn Monroe
5,“Try not to become a man of success. Rather be...,Albert Einstein
6,“It is better to be hated for what you are tha...,André Gide
7,"“I have not failed. I've just found 10,000 way...",Thomas A. Edison
8,“A woman is like a tea bag; you never know how...,Eleanor Roosevelt
9,"“A day without sunshine is like, you know, nig...",Steve Martin


### Complete Code

In [None]:
## Authors from the first page

In [16]:

def scrape_authors_from_first_page(base_url):
    # Send an HTTP request to the first page
    res = requests.get(base_url)
    
    # Parse the HTML content
    soup = BeautifulSoup(res.text, 'html.parser')
    
    # Find all elements containing author information
    authors = soup.select('small.author')
    
    # Extract the authors' names
    author_names = [author.text for author in authors]

    return author_names

# Save the base URL in a variable
base_url = 'http://quotes.toscrape.com/page/1/'

# Call the function to scrape authors from the first page
authors = scrape_authors_from_first_page(base_url)

# Print the authors' names
for author in authors:
    print(author)


Albert Einstein
J.K. Rowling
Albert Einstein
Jane Austen
Marilyn Monroe
Albert Einstein
André Gide
Thomas A. Edison
Eleanor Roosevelt
Steve Martin


In [None]:
## Quotes from the first page

In [17]:

def scrape_quotes_from_first_page(base_url):
    # Send an HTTP request to the first page
    res = requests.get(base_url)
    
    # Parse the HTML content
    soup = BeautifulSoup(res.text, 'html.parser')
    
    # Find all elements containing quotes
    quotes = soup.select('span.text')
    
    # Extract the quotes' text
    quote_texts = [quote.text for quote in quotes]

    return quote_texts

# Save the base URL in a variable
base_url = 'http://quotes.toscrape.com/page/1/'

# Call the function to scrape quotes from the first page
quotes = scrape_quotes_from_first_page(base_url)

# Print the quotes
for quote in quotes:
    print(quote)


“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”
“It is our choices, Harry, that show what we truly are, far more than our abilities.”
“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”
“The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.”
“Imperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.”
“Try not to become a man of success. Rather become a man of value.”
“It is better to be hated for what you are than to be loved for what you are not.”
“I have not failed. I've just found 10,000 ways that won't work.”
“A woman is like a tea bag; you never know how strong it is until it's in hot water.”
“A day without sunshine is like, you know, night.”


## Getting Top Ten Tags

In [18]:
import requests
from bs4 import BeautifulSoup

def scrape_top_tags(base_url):
    # Send an HTTP request to the homepage
    res = requests.get(base_url)
    
    # Parse the HTML content
    soup = BeautifulSoup(res.text, 'html.parser')
    
    # Find the tags box
    tags_box = soup.find('div', class_='tags-box')
    
    # Extract the tags
    tags = tags_box.find_all('a', class_='tag')
    
    # Extract the tag names
    tag_names = [tag.text.strip() for tag in tags]

    return tag_names

# Save the base URL in a variable
base_url = 'http://quotes.toscrape.com/'

# Call the function to scrape the top tags from the homepage
top_tags = scrape_top_tags(base_url)

# Print the top tags
for tag in top_tags:
    print(tag)


love
inspirational
life
humor
books
reading
friendship
friends
truth
simile


In [None]:
""""Notice how there is more than one page, and subsequent pages look like this: http://quotes.toscrape.com/page/2/.
Use what you know about for loops and string concatenation to loop through all the pages and get all the unique authors on the website.
Remember, there are many ways to achieve this. You will also need to figure out how to check that your loop is on the last page with quotes.
For debugging purposes, I will let you know that there are only 10 pages, so the last page is http://quotes.toscrape.com/page/10/.
Try to create a loop robust enough that it wouldn't matter to know the number of pages beforehand. Perhaps use try/except or While loop for this; it's up to """

### Initialize Variables

In [19]:
unique_authors = set() ## set a Python Data structure does not allow for duplicate
base_url = 'http://quotes.toscrape.com/page/{}/'
page_num = 1


### Start a Loop

In [20]:
while True:
    # Construct the URL for the current page
    scrape_url = base_url.format(page_num)
    
    try:
        # Send an HTTP request to the page
        res = requests.get(scrape_url)
        res.raise_for_status()  # Check if the request was successful
        
        # Parse the HTML content
        soup = BeautifulSoup(res.text, 'html.parser')
        
        # Find all elements containing author information
        authors = soup.select('small.author')
        
        # If no authors are found, it means we've reached the last page
        if not authors:
            break
        
        # Extract and add the authors' names to the set
        for author in authors:
            unique_authors.add(author.text.strip())
        
        # Increment the page number to move to the next page
        page_num += 1
    
    except requests.exceptions.RequestException:
        # Exit the loop if there is an HTTP error
        break


### Print the Unique Authors

In [21]:
for author in unique_authors:
    print(author)


J.R.R. Tolkien
J.K. Rowling
John Lennon
Suzanne Collins
Ernest Hemingway
André Gide
Jorge Luis Borges
Charles Bukowski
E.E. Cummings
Eleanor Roosevelt
Ayn Rand
Stephenie Meyer
William Nicholson
Terry Pratchett
J.D. Salinger
C.S. Lewis
George Bernard Shaw
Haruki Murakami
Pablo Neruda
Douglas Adams
Madeleine L'Engle
J.M. Barrie
Steve Martin
Jimi Hendrix
Martin Luther King Jr.
Friedrich Nietzsche
Harper Lee
George Eliot
Jane Austen
Dr. Seuss
Alfred Tennyson
Khaled Hosseini
Garrison Keillor
Marilyn Monroe
Allen Saunders
Ralph Waldo Emerson
James Baldwin
Helen Keller
Thomas A. Edison
George Carlin
Mark Twain
George R.R. Martin
W.C. Fields
Alexandre Dumas fils
Bob Marley
Elie Wiesel
Mother Teresa
Charles M. Schulz
Jim Henson
Albert Einstein


### Complete Code


In [None]:
import requests
from bs4 import BeautifulSoup

# Initialize a set to store unique authors
unique_authors = set()
# Save the base URL in a variable with a placeholder for page number
base_url = 'http://quotes.toscrape.com/page/{}/'
# Start with the first page
page_num = 1

while True:
    # Construct the URL for the current page
    scrape_url = base_url.format(page_num)
    
    try:
        # Send an HTTP request to the page
        res = requests.get(scrape_url)
        res.raise_for_status()  # Check if the request was successful
        
        # Parse the HTML content
        soup = BeautifulSoup(res.text, 'html.parser')
        
        # Find all elements containing author information
        authors = soup.select('small.author')
        
        # If no authors are found, it means we've reached the last page
        if not authors:
            break
        
        # Extract and add the authors' names to the set
        for author in authors:
            unique_authors.add(author.text.strip())
        
        # Increment the page number to move to the next page
        page_num += 1
    
    except requests.exceptions.RequestException:
        # Exit the loop if there is an HTTP error
        break

# Print the unique authors
for author in unique_authors:
    print(author)


# Books To Scrape

### Send a Request to the Website

In [1]:
import requests
from bs4 import BeautifulSoup

# Save the base URL in a variable
base_url = 'http://books.toscrape.com/catalogue/category/books_1/page-{}.html'


### Extract Data from a Single Page

In [7]:
# Construct the URL for the page to scrape
scrape_url = base_url.format(1)

# Send an HTTP request to the page
res = requests.get(scrape_url)

# Parse the HTML content
soup = BeautifulSoup(res.text,'html.parser')

# Find all elements containing book information
books = soup.select(".product_pod")


### Extract Titles with Two-Star Ratings

In [13]:
# List to save the titles with a two-star rating
two_star_titles = []
# Iterate through books on the page
for book in books:
    # Check if the book has a two-star rating
    if len(book.select('.star-rating.Two')) != 0:
        # Extract the title of the book
        book_title = book.select('a')[1]['title']
        # Add the title to the list
        two_star_titles.append(book_title)


In [14]:
two_star_titles

['Frankenstein', 'Emma']

### Extract Data Over Multiple Pages

In [10]:

# Iterate through all pages of the website (50 pages)
for page_num in range(1, 51):
    # Construct the URL for the current page
    scrape_url = base_url.format(page_num)
    
    # Send an HTTP request to the page
    res = requests.get(scrape_url)
    
    # Parse the HTML content
    soup = BeautifulSoup(res.text, 'html.parser')
    
    # Find all elements containing book information
    books = soup.select(".product_pod")
    
    # Iterate through books on the current page
    for book in books:
        # Check if the book has a two-star rating
        if len(book.select('.star-rating.Two')) != 0:
            # Extract the title of the book
            book_title = book.select('a')[1]['title']
            # Add the title to the list
            two_star_titles.append(book_title)


### Print the result

In [None]:
# Print the titles with a two-star rating
for title in two_star_titles:
    print(title)

### Complete Code in a Function

In [15]:
import requests
from bs4 import BeautifulSoup

def scrape_two_star_titles(base_url, num_pages):
    # List to save the titles with a two-star rating
    two_star_titles = []

    # Iterate through all pages of the website
    for page_num in range(1, num_pages + 1):
        # Construct the URL for the current page
        scrape_url = base_url.format(page_num)
        
        # Send an HTTP request to the page
        res = requests.get(scrape_url)
        
        # Parse the HTML content
        soup = BeautifulSoup(res.text, 'html.parser')
        
        # Find all elements containing book information
        books = soup.select(".product_pod")
        
        # Iterate through books on the current page
        for book in books:
            # Check if the book has a two-star rating
            if len(book.select('.star-rating.Two')) != 0:
                # Extract the title of the book
                book_title = book.select('a')[1]['title']
                # Add the title to the list
                two_star_titles.append(book_title)

    return two_star_titles

# Save the base URL in a variable
base_url = 'http://books.toscrape.com/catalogue/category/books_1/page-{}.html'

# Define the number of pages to scrape
num_pages = 50

# Call the function to scrape two-star titles
two_star_titles = scrape_two_star_titles(base_url, num_pages)

# Print the titles with a two-star rating
for title in two_star_titles:
    print(title)


Starving Hearts (Triangular Trade Trilogy, #1)
Libertarianism for Beginners
It's Only the Himalayas
How Music Works
Maude (1883-1993):She Grew Up with the country
You can't bury them all: Poems
Reasons to Stay Alive
Without Borders (Wanderlove #1)
Soul Reader
Security
Saga, Volume 5 (Saga (Collected Editions) #5)
Reskilling America: Learning to Labor in the Twenty-First Century
Political Suicide: Missteps, Peccadilloes, Bad Calls, Backroom Hijinx, Sordid Pasts, Rotten Breaks, and Just Plain Dumb Mistakes in the Annals of American Politics
Obsidian (Lux #1)
My Paris Kitchen: Recipes and Stories
Masks and Shadows
Lumberjanes, Vol. 2: Friendship to the Max (Lumberjanes #5-8)
Lumberjanes Vol. 3: A Terrible Plan (Lumberjanes #9-12)
Judo: Seven Steps to Black Belt (an Introductory Guide for Beginners)
I Hate Fairyland, Vol. 1: Madly Ever After (I Hate Fairyland (Compilations) #1-5)
Giant Days, Vol. 2 (Giant Days #5-8)
Everydata: The Misinformation Hidden in the Little Data You Consume Every 