### Extracting Quotes using Web Crawling

In [1]:
# Web scraping extracts data from specific, known websites and URLs, or from a single page of a website.
# Web crawling finds or discovers URLs and links on the web; it is used to find previously unknown URLs and domains.
# Web crawling is used here to extract data from an unknown number of pages or indexes.
import requests
url = 'https://quotes.toscrape.com/'
# Bound the wait: without a timeout, requests can block indefinitely on a
# server that accepts the connection but never answers.
resp = requests.get(url=url, timeout=10)
# Quick sanity check of the HTTP status before parsing the body.
resp.ok

True

In [2]:
# Inspect the HTTP headers the server sent back with the landing page.
resp.headers

{'Date': 'Thu, 29 Sep 2022 09:23:28 GMT', 'Content-Type': 'text/html; charset=utf-8', 'Content-Length': '11053', 'Connection': 'keep-alive', 'Strict-Transport-Security': 'max-age=0; includeSubDomains; preload'}

In [3]:
from bs4 import BeautifulSoup 
# Parse the raw response bytes with the lxml parser.
soup = BeautifulSoup(resp.content, 'lxml')
# Ending the cell on `soup` displays the parsed document.
soup

<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8"/>
<title>Quotes to Scrape</title>
<link href="/static/bootstrap.min.css" rel="stylesheet"/>
<link href="/static/main.css" rel="stylesheet"/>
</head>
<body>
<div class="container">
<div class="row header-box">
<div class="col-md-8">
<h1>
<a href="/" style="text-decoration: none">Quotes to Scrape</a>
</h1>
</div>
<div class="col-md-4">
<p>
<a href="/login">Login</a>
</p>
</div>
</div>
<div class="row">
<div class="col-md-8">
<div class="quote" itemscope="" itemtype="http://schema.org/CreativeWork">
<span class="text" itemprop="text">“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”</span>
<span>by <small class="author" itemprop="author">Albert Einstein</small>
<a href="/author/Albert-Einstein">(about)</a>
</span>
<div class="tags">
            Tags:
            <meta class="keywords" content="change,deep-thoughts,thinking,world" itemprop="keywords"/>
<a class="t

In [4]:
# Locate the pager's "next" list item; its <a> child links to the following page.
soup.find('li', class_='next')

<li class="next">
<a href="/page/2/">Next <span aria-hidden="true">→</span></a>
</li>

In [5]:
# Read the href of the "next" link; it is root-relative (e.g. '/page/2/').
soup.find('li', class_='next').a.attrs['href']

'/page/2/'

In [6]:
# URL of the response fetched for the landing page.
resp.url

'https://quotes.toscrape.com/'

In [7]:
def extractPage(soup: BeautifulSoup) -> list:
    """Extract every quote from one parsed listing page.

    Args:
        soup: Parsed HTML of a page containing ``<div class="quote">`` blocks.

    Returns:
        A list with one ``[text, author, tags]`` entry per quote, in page
        order. ``text`` comes from the quote's first ``<span>`` and ``author``
        from its first ``<small>``; both are plain ``str`` values, or ``None``
        when the element is absent. ``tags`` is the ``content`` attribute of
        the ``<meta>`` inside the quote's ``<div class="tags">``, or ``''``
        when that block, element or attribute is missing.
    """
    def _plain_text(tag):
        # get_text() yields a plain str. The `.string` attribute yields a
        # NavigableString, which keeps a reference to the whole parse tree and
        # would keep every crawled page alive inside the result list.
        return tag.get_text() if tag is not None else None

    Quoteslist = []
    # find_all is the supported spelling; findAll is a deprecated BS3 alias.
    for quote in soup.find_all('div', class_='quote'):
        tags_box = quote.find('div', attrs={'class': 'tags'})
        keywords = tags_box.meta if tags_box is not None else None
        # Tolerate a quote without a tags block instead of aborting the page.
        tags = keywords.get('content', '') if keywords is not None else ''
        Quoteslist.append([_plain_text(quote.span), _plain_text(quote.small), tags])
    return Quoteslist

In [9]:
# Crawl the whole site: start at the landing page and keep following the
# pager's "next" link until a page no longer has one.
# Note: `soup` is left holding the last crawled page when the loop ends.
base_url = 'https://quotes.toscrape.com'
url = base_url
QuotesDB1 = []
while True:
    # A timeout keeps a stalled connection from hanging the cell forever.
    response = requests.get(url, timeout=10)
    # Fail loudly on HTTP errors: an error page has no quotes and no "next"
    # link, so it would otherwise look like a normal end of the crawl.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'lxml')
    QuotesDB1.extend(extractPage(soup))
    print(f'Crawling done {url}...')

    # The pager item is <li class="next"><a href="...">; no such item (or no
    # link inside it) means there is nothing further to follow.
    next_item = soup.find('li', class_='next')
    if next_item is None or next_item.a is None:
        print("All Pages Crawled!")
        break
    # The href is root-relative (e.g. '/page/2/'), so append it to the site root.
    next_ = next_item.a.attrs['href']
    url = base_url + next_

Crawling done https://quotes.toscrape.com...
Crawling done https://quotes.toscrape.com/page/2/...
Crawling done https://quotes.toscrape.com/page/3/...
Crawling done https://quotes.toscrape.com/page/4/...
Crawling done https://quotes.toscrape.com/page/5/...
Crawling done https://quotes.toscrape.com/page/6/...
Crawling done https://quotes.toscrape.com/page/7/...
Crawling done https://quotes.toscrape.com/page/8/...
Crawling done https://quotes.toscrape.com/page/9/...
Crawling done https://quotes.toscrape.com/page/10/...
All Pages Crawled!


In [13]:
# Sanity-check the crawl: number of collected quotes, then the last record
# (each record is a [text, author, tags] list built by extractPage).
print(len(QuotesDB1))
print(QuotesDB1[-1])

100
['“... a mind needs books as a sword needs a whetstone, if it is to keep its edge.”', 'George R.R. Martin', 'books,mind']


In [14]:
# Find the "top ten tags" entries: each is a <span class="tag-item"> wrapping
# an <a> whose href points at that tag's listing.
# NOTE: `soup` here is whatever page the most recently run cell parsed.
soup.findAll('span', class_='tag-item')

[<span class="tag-item">
 <a class="tag" href="/tag/love/" style="font-size: 28px">love</a>
 </span>,
 <span class="tag-item">
 <a class="tag" href="/tag/inspirational/" style="font-size: 26px">inspirational</a>
 </span>,
 <span class="tag-item">
 <a class="tag" href="/tag/life/" style="font-size: 26px">life</a>
 </span>,
 <span class="tag-item">
 <a class="tag" href="/tag/humor/" style="font-size: 24px">humor</a>
 </span>,
 <span class="tag-item">
 <a class="tag" href="/tag/books/" style="font-size: 22px">books</a>
 </span>,
 <span class="tag-item">
 <a class="tag" href="/tag/reading/" style="font-size: 14px">reading</a>
 </span>,
 <span class="tag-item">
 <a class="tag" href="/tag/friendship/" style="font-size: 10px">friendship</a>
 </span>,
 <span class="tag-item">
 <a class="tag" href="/tag/friends/" style="font-size: 8px">friends</a>
 </span>,
 <span class="tag-item">
 <a class="tag" href="/tag/truth/" style="font-size: 8px">truth</a>
 </span>,
 <span class="tag-item">
 <a class="

In [None]:
# Collect the root-relative link (e.g. '/tag/love/') of every tag-item entry.
# `soup` is whichever page was parsed most recently by an earlier cell.
# find_all is the supported spelling; findAll is a deprecated BS3 alias.
tags_url = [
    tag_item.a.attrs['href']
    for tag_item in soup.find_all('span', class_='tag-item')
]

In [None]:
# Crawl every page of each tag listing whose root-relative link is in `tags_url`.
# All tags feed the same list, so a quote listed under several tags is stored
# once per tag.
base_url = 'https://quotes.toscrape.com'
TopTenDB = []
for tag_url in tags_url:
    url = base_url + tag_url
    while True:
        # Log before fetching so the first page of every tag is reported too.
        print(f'Crawling {url}...')
        # A timeout keeps a stalled connection from hanging the cell forever.
        response = requests.get(url, timeout=10)
        # Fail loudly on HTTP errors: an error page has no "next" link and
        # would otherwise silently end this tag's crawl.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'lxml')
        TopTenDB.extend(extractPage(soup))

        # No pager item (or no link inside it) means this tag is finished.
        next_item = soup.find('li', class_='next')
        if next_item is None or next_item.a is None:
            # Printed once per tag, when that tag's listing has no further page.
            print("All Pages Crawled!")
            break
        # The href is root-relative, so append it to the site root.
        next_ = next_item.a.attrs['href']
        url = base_url + next_

In [None]:
# Peek at the first record collected by the per-tag crawl.
TopTenDB[0]