# Introduction to Beautiful Soup

In [25]:
#importing libraries
import requests
import pandas as pd
from tqdm import tqdm
from bs4 import BeautifulSoup

In [26]:
#Fetch the pages
URL = 'https://quotes.toscrape.com/'

response = requests.get(URL)
print(response.status_code) #get the status code
print(response.headers) #get the headers

200
{'Date': 'Wed, 26 Jun 2024 06:29:36 GMT', 'Content-Type': 'text/html; charset=utf-8', 'Content-Length': '11054', 'Connection': 'keep-alive', 'Strict-Transport-Security': 'max-age=0; includeSubDomains; preload'}


In [27]:
#storing page content
html = response.text

#storing the html
with open('main.html','w', encoding='utf-8') as fd:
    fd.write(html)

In [28]:
#create soup
soup = BeautifulSoup(html, 'html.parser')
print(soup.prettify())

<!DOCTYPE html>
<html lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   Quotes to Scrape
  </title>
  <link href="/static/bootstrap.min.css" rel="stylesheet"/>
  <link href="/static/main.css" rel="stylesheet"/>
 </head>
 <body>
  <div class="container">
   <div class="row header-box">
    <div class="col-md-8">
     <h1>
      <a href="/" style="text-decoration: none">
       Quotes to Scrape
      </a>
     </h1>
    </div>
    <div class="col-md-4">
     <p>
      <a href="/login">
       Login
      </a>
     </p>
    </div>
   </div>
   <div class="row">
    <div class="col-md-8">
     <div class="quote" itemscope="" itemtype="http://schema.org/CreativeWork">
      <span class="text" itemprop="text">
       “The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”
      </span>
      <span>
       by
       <small class="author" itemprop="author">
        Albert Einstein
       </small>
       <a href="/author/Albert

## Best Practices
Always try to locate the element in the below order
1. id
2. class name
3. tag name, css selector
4. Xpath

In [29]:
#finds the first element
soup.find('span', class_ = 'text')

<span class="text" itemprop="text">“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”</span>

In [30]:
#find all the elements
soup.find_all('span', class_ = 'text')

[<span class="text" itemprop="text">“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”</span>,
 <span class="text" itemprop="text">“It is our choices, Harry, that show what we truly are, far more than our abilities.”</span>,
 <span class="text" itemprop="text">“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”</span>,
 <span class="text" itemprop="text">“The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.”</span>,
 <span class="text" itemprop="text">“Imperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.”</span>,
 <span class="text" itemprop="text">“Try not to become a man of success. Rather become a man of value.”</span>,
 <span class="text" itemprop="text">“It is better to be hated for what you are than to be loved for what you are not.

In [31]:
#get inner text
soup.find('span', class_ = 'text').get_text()
#or
soup.find('span', class_ = 'text').text

'“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”'

In [32]:
quotes = []
for quote in soup.find_all('span', class_ = 'text'):
    quotes.append(quote.text[1:-1]) 

quotes

['The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.',
 'It is our choices, Harry, that show what we truly are, far more than our abilities.',
 'There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.',
 'The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.',
 "Imperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.",
 'Try not to become a man of success. Rather become a man of value.',
 'It is better to be hated for what you are than to be loved for what you are not.',
 "I have not failed. I've just found 10,000 ways that won't work.",
 "A woman is like a tea bag; you never know how strong it is until it's in hot water.",
 'A day without sunshine is like, you know, night.']

In [33]:
#scrapping quotes with author details
data = []
for sp in soup.find_all('div', class_ = 'quote'):

    quote = sp.find('span', class_ = 'text').text[1:-1]

    author_name = sp.find('small', class_ = 'author').text

    author_id = sp.find('a').get('href')

    tags = []
    for tag in sp.find_all('a',class_ = 'tag'):
        tags.append(tag.text)
    tags      = ', '.join(tags)

    data.append([quote, author_name, author_id, tags])



In [34]:
df = pd.DataFrame(data, columns= ['quote', 'author_name', 'author_id', 'tags'])
df.to_csv('Quotes.csv', index=False)

In [36]:
# scrapping multiple pages

data = []
for page in tqdm(range(1,11)):
    link = 'https://quotes.toscrape.com/page/' + str(page)

    soup = BeautifulSoup(requests.get(link).text, 'html.parser')

    for sp in soup.find_all('div', class_ = 'quote'):

        quote = sp.find('span', class_ = 'text').text[1:-1]

        author_name = sp.find('small', class_ = 'author').text

        author_id = sp.find('a').get('href')

        tags = []
        for tag in sp.find_all('a',class_ = 'tag'):
            tags.append(tag.text)
        tags      = ', '.join(tags)

        data.append([quote, author_name, author_id, tags])
    
df = pd.DataFrame(data, columns= ['quote', 'author_name', 'author_id', 'tags'])

for i in range(len(df['author_id'])):
    df['author_id'][i] = ('https://quotes.toscrape.com' + df['author_id'][i])
    
df.to_csv('Quotes_with_author_details.csv', index=False)
    

100%|██████████| 10/10 [00:20<00:00,  2.04s/it]
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df['author_id'][i] = ('https://quotes.toscrape.com' + df['author_id'][i])
