## Scrapping html data
 - Parse an html file

In [1]:
con = open("./data/html.html")
raw_html = con.read()
con.close()
print(raw_html)

<!DOCTYPE html>

<html>
    <head>
        <title>GETTING STARTED WITH bs4</title>
    </head>
    <div class="para 1">
        <p>
            This is paragraph one
        </p>
        <p>
            This is paragraph two
        </p>
    </div>
    <div class='para 2'>
        <p>
            This is para 1 in div 2
        </p>
    
    </div>
    <div class="para 1">
        <p>
            This is paragraph three of div with class para 1
        </p>
        <p>
            This is paragraph four of div with class para 1
        </p>
    </div>

</html>


In [2]:
### We will use a python library to parse this html
from bs4 import BeautifulSoup

In [3]:
parsed_html = BeautifulSoup(raw_html,'html.parser')

In [4]:
parsed_html

<!DOCTYPE html>

<html>
<head>
<title>GETTING STARTED WITH bs4</title>
</head>
<div class="para 1">
<p>
            This is paragraph one
        </p>
<p>
            This is paragraph two
        </p>
</div>
<div class="para 2">
<p>
            This is para 1 in div 2
        </p>
</div>
<div class="para 1">
<p>
            This is paragraph three of div with class para 1
        </p>
<p>
            This is paragraph four of div with class para 1
        </p>
</div>
</html>

In [5]:
parsed_html.title

<title>GETTING STARTED WITH bs4</title>

In [6]:
parsed_html.div

<div class="para 1">
<p>
            This is paragraph one
        </p>
<p>
            This is paragraph two
        </p>
</div>

In [7]:
parsed_html.title.text

'GETTING STARTED WITH bs4'

In [12]:
paras = []
for para in parsed_html.find_all("p"):
    print(para.text.strip())
    paras.append(para.text.strip())

This is paragraph one
This is paragraph two
This is para 1 in div 2
This is paragraph three of div with class para 1
This is paragraph four of div with class para 1


In [13]:
paras

['This is paragraph one',
 'This is paragraph two',
 'This is para 1 in div 2',
 'This is paragraph three of div with class para 1',
 'This is paragraph four of div with class para 1']

In [21]:
parsed_html.find_all("div",class_="para 2")[0].p.text.strip()

'This is para 1 in div 2'

In [28]:
parsed_html.find_all("div",class_="para 1")[1].find_all("p")[1].text.strip()

'This is paragraph four of div with class para 1'

In [33]:
parsed_html.find_all("div",class_="para 1")[1].text.split("\n\n")[1].strip()

'This is paragraph three of div with class para 1'

In [41]:
parsed_html.find_all("div",class_="para 1")[1].text.split("\n\n")[2].strip()

'This is paragraph four of div with class para 1'

In [49]:
parsed_html.find_all("div",class_="para 1")[0].find_all("p")[1].text.strip()

'This is paragraph two'

In [51]:
parsed_html.find_all("div",class_="para 1")[1].find_all("p")[0].text.strip() 

'This is paragraph three of div with class para 1'

In [59]:
counter = 1
paras = []
for div in parsed_html.find_all("div",class_="para 1"):
    if counter <=1:
        p=div.find_all("p")[1].text.strip()
    else:
        p=div.find_all("p")[0].text.strip()
    paras.append(p)
    counter = counter + 1 
paras

['This is paragraph two', 'This is paragraph three of div with class para 1']

In [61]:
paras = []
for idx,div in enumerate(parsed_html.find_all("div",class_="para 1")):
    if idx == 0:
        p=div.find_all("p")[1].text.strip()
    else:
        p=div.find_all("p")[0].text.strip()
    paras.append(p)

In [62]:
paras

['This is paragraph two', 'This is paragraph three of div with class para 1']

In [67]:
### How do we get data (raw html) from the web
# make a "request" to a server to get data
# get request ---> request to the server to send something to us
# post request ----> request to the server where we send some data to the server (form submit)
import requests
url = "https://www.w3schools.com/TAGS/default.ASP"
resp=requests.get(url)
raw_html = resp.text

In [68]:
url = "https://www.goodreads.com/quotes"
resp = requests.get(url)
raw_html = resp.text

In [70]:
## Lets see what is the logic we can create to extract the quote and the author name
parsed_html = BeautifulSoup(raw_html,'html.parser')

In [77]:
parsed_html.find_all("span",class_="authorOrTitle")[0].text.strip()

'Oscar Wilde'

In [78]:
authors = []
for auth in parsed_html.find_all("span",class_="authorOrTitle"):
    authors.append(auth.text.strip())
print(authors)

['Oscar Wilde', 'Marilyn Monroe', 'Albert Einstein', 'Frank Zappa', 'Marcus Tullius Cicero', 'Bernard M. Baruch', 'William W. Purkey', 'Dr. Seuss', 'Mae West', 'Mahatma Gandhi', 'Robert Frost', 'J.K. Rowling,', 'Albert Camus', 'Mark Twain', 'C.S. Lewis,', 'Maya Angelou', 'Elbert Hubbard', 'Oscar Wilde', 'Oscar Wilde', 'Mahatma Gandhi', 'Martin Luther King Jr.,', 'Friedrich Nietzsche,', 'Stephen Chbosky,', 'Oscar Wilde,', 'Ralph Waldo Emerson', 'Rob Siltanen', 'Narcotics Anonymous', 'Marilyn Monroe', 'H. Jackson Brown Jr.,', 'Andre Gide,']


In [100]:
### Try extracting the quotes also from this webpage
quotes = []
for div in parsed_html.find_all("div",class_="quoteText"):
    quotes.append(div.text.split("\n    ―\n  \n")[0].strip())

In [103]:
import pandas as pd
table = pd.DataFrame({'author':authors,'quote':quotes})

In [104]:
table.head()

Unnamed: 0,author,quote
0,Oscar Wilde,“Be yourself; everyone else is already taken.”
1,Marilyn Monroe,"“I'm selfish, impatient and a little insecure...."
2,Albert Einstein,“Two things are infinite: the universe and hum...
3,Frank Zappa,"“So many books, so little time.”"
4,Marcus Tullius Cicero,“A room without books is like a body without a...


In [115]:
#https://www.goodreads.com/quotes?page=6
#https://www.goodreads.com/quotes?page=1
#https://www.goodreads.com/quotes?page=99
page_num = 2
base_url = "https://www.goodreads.com/quotes?page={}".format(page_num)

In [116]:
base_url

'https://www.goodreads.com/quotes?page=2'

In [111]:
### Create logic to extract data from a single page as a python function
def get_data(parsed_html):
    quotes = []
    authors = []
    for div in parsed_html.find_all("div",class_="quoteText"):
        quotes.append(div.text.split("\n    ―\n  \n")[0].strip())
    for auth in parsed_html.find_all("span",class_="authorOrTitle"):
        authors.append(auth.text.strip())
    table = pd.DataFrame({'author':authors,'quote':quotes})
    return table

In [118]:
from tqdm import tqdm
tables = []
for page_num in tqdm(range(20)):
    base_url = "https://www.goodreads.com/quotes?page={}".format(page_num)
    raw_html = requests.get(base_url).text
    parsed_html = BeautifulSoup(raw_html,'html.parser')
    tables.append(get_data(parsed_html))

100%|███████████████████████████████████████████| 20/20 [00:39<00:00,  1.95s/it]


In [121]:
pd.concat(tables).head(2)

Unnamed: 0,author,quote
0,Oscar Wilde,“Be yourself; everyone else is already taken.”
1,Marilyn Monroe,"“I'm selfish, impatient and a little insecure...."


In [127]:
### Extend this table with the information on number of likes as well. 
#### You need to find out top 10 quotes by popularity
### Extract data for first 40 pages

In [128]:
def get_data(parsed_html):
    quotes = []
    authors = []
    likes = []
    for div in parsed_html.find_all("div",class_="quoteText"):
        quotes.append(div.text.split("\n    ―\n  \n")[0].strip())
    for auth in parsed_html.find_all("span",class_="authorOrTitle"):
        authors.append(auth.text.strip())
    for like in parsed_html.find_all("a",class_="smallText"):
        likes.append(float(like.text.replace("likes","")))
    table = pd.DataFrame({'author':authors,'quote':quotes,'likes':likes})
    return table

In [129]:
tables = []
for page_num in tqdm(range(40)):
    page_num = page_num+1
    base_url = "https://www.goodreads.com/quotes?page={}".format(page_num)
    raw_html = requests.get(base_url).text
    parsed_html = BeautifulSoup(raw_html,'html.parser')
    tables.append(get_data(parsed_html))

100%|███████████████████████████████████████████| 40/40 [07:17<00:00, 10.93s/it]


In [130]:
pd.concat(tables).head(2)

Unnamed: 0,author,quote,likes
0,Oscar Wilde,“Be yourself; everyone else is already taken.”,159175.0
1,Marilyn Monroe,"“I'm selfish, impatient and a little insecure....",156512.0


In [132]:
import numpy as np
np.arange(1,41)

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40])

In [133]:
## For the url: https://www.goodreads.com/list/show/19.Best_for_Book_Clubs
# Extract: Name of the book, Author and Average Rating

url = "https://www.goodreads.com/list/show/19.Best_for_Book_Clubs"
raw_html = requests.get(url).text
parsed_html = BeautifulSoup(raw_html,'html.parser')

In [137]:
parsed_html.find_all("a",class_="bookTitle")[0].span.text

'The Help'

In [140]:
booknames = []
for book in parsed_html.find_all("a",class_="bookTitle"):
    booknames.append(book.span.text.strip())
#booknames

In [143]:
parsed_html.find_all("a",class_="authorName")[0].span.text

'Kathryn Stockett'

In [145]:
authors = []
for auth in parsed_html.find_all("a",class_="authorName"):
    authors.append(auth.span.text.strip())
#authors

In [152]:
parsed_html.find_all("span",class_="minirating")[0].text.strip().split(" ")[0]

'4.46'

In [154]:
ratings = []
for rating in parsed_html.find_all("span",class_="minirating"):
    rat=rating.text.strip().split(" ")[0]
    ratings.append(float(rat))
#ratings

In [174]:
def get_data(parsed_html):
    booknames = []
    authors = []
    ratings = []
    for book in parsed_html.find_all("a",class_="bookTitle"):
        booknames.append(book.span.text.strip())
    for auth in parsed_html.find_all("a",class_="authorName"):
        authors.append(auth.span.text.strip())
    for rating in parsed_html.find_all("span",class_="minirating"):
        rat=rating.text.strip()
        ratings.append(rat)
    table = pd.DataFrame({'authors':authors,'books':booknames,'ratings':ratings})
    return table

In [175]:
tables = []
for i in tqdm(range(1,11)):
    url="https://www.goodreads.com/list/show/19.Best_for_Book_Clubs?page={}".format(i)
    raw_html = requests.get(url).text
    parsed_html = BeautifulSoup(raw_html,'html.parser')
    tables.append(get_data(parsed_html))

100%|███████████████████████████████████████████| 10/10 [01:04<00:00,  6.45s/it]


In [170]:
url = "https://www.goodreads.com/list/show/19.Best_for_Book_Clubs?page=3"
raw_html = requests.get(url).text
parsed_html = BeautifulSoup(raw_html,'html.parser')

In [172]:
parsed_html.find_all("span",class_="minirating")[2].text

'really liked it 4.00 avg rating — 137,682 ratings'