### Web Scraping using Requests

In [1]:
import requests
url = 'https://quotes.toscrape.com/'
resp = requests.get(url=url)
resp.ok

True

In [2]:
resp.headers 

{'Date': 'Thu, 29 Sep 2022 13:25:27 GMT', 'Content-Type': 'text/html; charset=utf-8', 'Content-Length': '11053', 'Connection': 'keep-alive', 'Strict-Transport-Security': 'max-age=0; includeSubDomains; preload'}

In [3]:
resp.text

'<!DOCTYPE html>\n<html lang="en">\n<head>\n\t<meta charset="UTF-8">\n\t<title>Quotes to Scrape</title>\n    <link rel="stylesheet" href="/static/bootstrap.min.css">\n    <link rel="stylesheet" href="/static/main.css">\n</head>\n<body>\n    <div class="container">\n        <div class="row header-box">\n            <div class="col-md-8">\n                <h1>\n                    <a href="/" style="text-decoration: none">Quotes to Scrape</a>\n                </h1>\n            </div>\n            <div class="col-md-4">\n                <p>\n                \n                    <a href="/login">Login</a>\n                \n                </p>\n            </div>\n        </div>\n    \n\n<div class="row">\n    <div class="col-md-8">\n\n    <div class="quote" itemscope itemtype="http://schema.org/CreativeWork">\n        <span class="text" itemprop="text">“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”</span>\n        <sp

### Web Scraping using BeautifulSoup

In [4]:
# lxml is parser library used with BeautifulSoup, parser provides an interface to Python's internal parser and byte-code compiler 
# parser stores data in tree like structure 
from bs4 import BeautifulSoup 
soup = BeautifulSoup(resp.content, 'lxml')
soup 

<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8"/>
<title>Quotes to Scrape</title>
<link href="/static/bootstrap.min.css" rel="stylesheet"/>
<link href="/static/main.css" rel="stylesheet"/>
</head>
<body>
<div class="container">
<div class="row header-box">
<div class="col-md-8">
<h1>
<a href="/" style="text-decoration: none">Quotes to Scrape</a>
</h1>
</div>
<div class="col-md-4">
<p>
<a href="/login">Login</a>
</p>
</div>
</div>
<div class="row">
<div class="col-md-8">
<div class="quote" itemscope="" itemtype="http://schema.org/CreativeWork">
<span class="text" itemprop="text">“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”</span>
<span>by <small class="author" itemprop="author">Albert Einstein</small>
<a href="/author/Albert-Einstein">(about)</a>
</span>
<div class="tags">
            Tags:
            <meta class="keywords" content="change,deep-thoughts,thinking,world" itemprop="keywords"/>
<a class="t

In [5]:
soup.title

<title>Quotes to Scrape</title>

In [6]:
soup.title.text

'Quotes to Scrape'

In [7]:
# find_all() fetches all link
links = soup.find_all('link')
links

[<link href="/static/bootstrap.min.css" rel="stylesheet"/>,
 <link href="/static/main.css" rel="stylesheet"/>]

In [8]:
print(type(links))
print(type(links[0]))

<class 'bs4.element.ResultSet'>
<class 'bs4.element.Tag'>


In [9]:
links[0].attrs

{'rel': ['stylesheet'], 'href': '/static/bootstrap.min.css'}

In [10]:
links[0].attrs['href']

'/static/bootstrap.min.css'

### Extracting Quotes using Web Scraping

In [42]:
# using _ after class as it's class of html, not class of python
quotes = soup.find_all('div', class_='quote')
quotes[0] 

<div class="quote" itemscope="" itemtype="http://schema.org/CreativeWork">
<span class="text" itemprop="text">“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”</span>
<span>by <small class="author" itemprop="author">Albert Einstein</small>
<a href="/author/Albert-Einstein">(about)</a>
</span>
<div class="tags">
            Tags:
            <meta class="keywords" content="change,deep-thoughts,thinking,world" itemprop="keywords"/>
<a class="tag" href="/tag/change/page/1/">change</a>
<a class="tag" href="/tag/deep-thoughts/page/1/">deep-thoughts</a>
<a class="tag" href="/tag/thinking/page/1/">thinking</a>
<a class="tag" href="/tag/world/page/1/">world</a>
</div>
</div>

In [111]:
quotes[1]

<div class="quote" itemscope="" itemtype="http://schema.org/CreativeWork">
<span class="text" itemprop="text">“It is our choices, Harry, that show what we truly are, far more than our abilities.”</span>
<span>by <small class="author" itemprop="author">J.K. Rowling</small>
<a href="/author/J-K-Rowling">(about)</a>
</span>
<div class="tags">
            Tags:
            <meta class="keywords" content="abilities,choices" itemprop="keywords"/>
<a class="tag" href="/tag/abilities/page/1/">abilities</a>
<a class="tag" href="/tag/choices/page/1/">choices</a>
</div>
</div>

In [112]:
type(quotes[1])

bs4.element.Tag

In [43]:
# extract quote
quotes[0].span

<span class="text" itemprop="text">“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”</span>

In [44]:
quotes[0].span.string

'“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”'

In [45]:
# author
quotes[0].small.string

'Albert Einstein'

In [85]:
# tags
quotes[0].find('div', attrs={'class': 'tags'})

<div class="tags">
            Tags:
            <meta class="keywords" content="change,deep-thoughts,thinking,world" itemprop="keywords"/>
<a class="tag" href="/tag/change/page/1/">change</a>
<a class="tag" href="/tag/deep-thoughts/page/1/">deep-thoughts</a>
<a class="tag" href="/tag/thinking/page/1/">thinking</a>
<a class="tag" href="/tag/world/page/1/">world</a>
</div>

In [47]:
# returns only single tag
quotes[0].find('div', attrs={'class': 'tags'}).a

<a class="tag" href="/tag/change/page/1/">change</a>

In [48]:
# returns all tag
quotes[0].find('div', attrs={'class': 'tags'}).find_all('a')

[<a class="tag" href="/tag/change/page/1/">change</a>,
 <a class="tag" href="/tag/deep-thoughts/page/1/">deep-thoughts</a>,
 <a class="tag" href="/tag/thinking/page/1/">thinking</a>,
 <a class="tag" href="/tag/world/page/1/">world</a>]

In [98]:
quotes[0].find_all('a', attrs={'class': 'tag'})

[<a class="tag" href="/tag/change/page/1/">change</a>,
 <a class="tag" href="/tag/deep-thoughts/page/1/">deep-thoughts</a>,
 <a class="tag" href="/tag/thinking/page/1/">thinking</a>,
 <a class="tag" href="/tag/world/page/1/">world</a>]

In [108]:
quotes[0].find_all('a', class_ = 'tag')

[<a class="tag" href="/tag/change/page/1/">change</a>,
 <a class="tag" href="/tag/deep-thoughts/page/1/">deep-thoughts</a>,
 <a class="tag" href="/tag/thinking/page/1/">thinking</a>,
 <a class="tag" href="/tag/world/page/1/">world</a>]

In [113]:
for i in enumerate(quotes[0]):
    print(i)

(0, '\n')
(1, <span class="text" itemprop="text">“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”</span>)
(2, '\n')
(3, <span>by <small class="author" itemprop="author">Albert Einstein</small>
<a href="/author/Albert-Einstein">(about)</a>
</span>)
(4, '\n')
(5, <div class="tags">
            Tags:
            <meta class="keywords" content="change,deep-thoughts,thinking,world" itemprop="keywords"/>
<a class="tag" href="/tag/change/page/1/">change</a>
<a class="tag" href="/tag/deep-thoughts/page/1/">deep-thoughts</a>
<a class="tag" href="/tag/thinking/page/1/">thinking</a>
<a class="tag" href="/tag/world/page/1/">world</a>
</div>)
(6, '\n')


In [109]:
t=[]
for tagitem in quotes[0].find_all('a', class_ = 'tag'):
    t.append(tagitem.attrs['href'])

In [110]:
t

['/tag/change/page/1/',
 '/tag/deep-thoughts/page/1/',
 '/tag/thinking/page/1/',
 '/tag/world/page/1/']

In [107]:
quotes[0].find('div', attrs={'class': 'tags'}).a.attrs['href']

'/tag/change/page/1/'

In [80]:
quotes[0].find('div', attrs={'class': 'tags'}).find('a').attrs['href']

'/tag/change/page/1/'

In [21]:
quotes[0].find('div', attrs={'class': 'tags'}).meta

<meta class="keywords" content="change,deep-thoughts,thinking,world" itemprop="keywords"/>

In [22]:
quotes[0].find('div', attrs={'class': 'tags'}).meta.attrs['content']

'change,deep-thoughts,thinking,world'

In [114]:
# scraping all href from html elemwnt <a></a> from a page
t = []
for tagitem in soup.findAll('a', class_ = 'tag'):
    t.append(tagitem.attrs['href'])
t

['/tag/change/page/1/',
 '/tag/deep-thoughts/page/1/',
 '/tag/thinking/page/1/',
 '/tag/world/page/1/',
 '/tag/abilities/page/1/',
 '/tag/choices/page/1/',
 '/tag/inspirational/page/1/',
 '/tag/life/page/1/',
 '/tag/live/page/1/',
 '/tag/miracle/page/1/',
 '/tag/miracles/page/1/',
 '/tag/aliteracy/page/1/',
 '/tag/books/page/1/',
 '/tag/classic/page/1/',
 '/tag/humor/page/1/',
 '/tag/be-yourself/page/1/',
 '/tag/inspirational/page/1/',
 '/tag/adulthood/page/1/',
 '/tag/success/page/1/',
 '/tag/value/page/1/',
 '/tag/life/page/1/',
 '/tag/love/page/1/',
 '/tag/edison/page/1/',
 '/tag/failure/page/1/',
 '/tag/inspirational/page/1/',
 '/tag/paraphrased/page/1/',
 '/tag/misattributed-eleanor-roosevelt/page/1/',
 '/tag/humor/page/1/',
 '/tag/obvious/page/1/',
 '/tag/simile/page/1/',
 '/tag/love/',
 '/tag/inspirational/',
 '/tag/life/',
 '/tag/humor/',
 '/tag/books/',
 '/tag/reading/',
 '/tag/friendship/',
 '/tag/friends/',
 '/tag/truth/',
 '/tag/simile/']

In [23]:
# for all quotes
Quoteslist = []
for quote in quotes:
    text = quote.span.string
    author = quote.small.string
    tags = quote.find('div', attrs={'class': 'tags'}).meta.attrs['content']
    Quoteslist.append([text, author, tags])

In [24]:
Quoteslist[-1]

['“A day without sunshine is like, you know, night.”',
 'Steve Martin',
 'humor,obvious,simile']

In [25]:
Quoteslist 

[['“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”',
  'Albert Einstein',
  'change,deep-thoughts,thinking,world'],
 ['“It is our choices, Harry, that show what we truly are, far more than our abilities.”',
  'J.K. Rowling',
  'abilities,choices'],
 ['“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”',
  'Albert Einstein',
  'inspirational,life,live,miracle,miracles'],
 ['“The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.”',
  'Jane Austen',
  'aliteracy,books,classic,humor'],
 ["“Imperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.”",
  'Marilyn Monroe',
  'be-yourself,inspirational'],
 ['“Try not to become a man of success. Rather become a man of value.”',
  'Albert Einstein',
  'adulthood,success,value'],
 ['“It is better to be