## Getting data from html: Parsing dom

In [1]:
## Read the html.html file
path="./data/html.html"
con=open(path,"r")
data=con.read()
con.close()

In [2]:
print(data)

<!DOCTYPE html>

<html>
    <head>
        <title>GETTING STARTED WITH bs4</title>
    </head>
    <div class="para 1">
        <p>
            This is paragraph one
        </p>
        <p>
            This is paragraph two
        </p>
    </div>
    <div class='para 2'>
        <p>
            This is para 1 in div 2
        </p>
    
    </div>
    <div class="para 1">
        <p>
            This is paragraph three of div with class para 1
        </p>
        <p>
            This is paragraph four of div with class para 1
        </p>
    </div>

</html>


In [3]:
from bs4 import BeautifulSoup
parsed_markup=BeautifulSoup(data,'html.parser')

In [4]:
parsed_markup.title ## access dom elements

<title>GETTING STARTED WITH bs4</title>

In [5]:
parsed_markup.title.text.split(" ")[-1]

'bs4'

In [6]:
parsed_markup.div

<div class="para 1">
<p>
            This is paragraph one
        </p>
<p>
            This is paragraph two
        </p>
</div>

In [7]:
parsed_markup.find_all("div")

[<div class="para 1">
 <p>
             This is paragraph one
         </p>
 <p>
             This is paragraph two
         </p>
 </div>,
 <div class="para 2">
 <p>
             This is para 1 in div 2
         </p>
 </div>,
 <div class="para 1">
 <p>
             This is paragraph three of div with class para 1
         </p>
 <p>
             This is paragraph four of div with class para 1
         </p>
 </div>]

In [8]:
parsed_markup.find_all("div",class_='para 2')

[<div class="para 2">
 <p>
             This is para 1 in div 2
         </p>
 </div>]

In [9]:
divs=parsed_markup.find_all("div",class_='para 1')

In [10]:
divs[0].find_all("p")   

[<p>
             This is paragraph one
         </p>,
 <p>
             This is paragraph two
         </p>]

In [11]:
divs[1]

<div class="para 1">
<p>
            This is paragraph three of div with class para 1
        </p>
<p>
            This is paragraph four of div with class para 1
        </p>
</div>

In [12]:
for i in parsed_markup.find_all("div",class_="para 1"):
    paras=i.find_all("p")
    print(paras[1].text.strip())

This is paragraph two
This is paragraph four of div with class para 1


In [13]:
## Extracting data:
para_data=[]
for i in parsed_markup.find_all("div",class_="para 1"):
    paras=i.find_all("p")
    para_data.append(paras[1].text.strip())

In [14]:
para_data

['This is paragraph two', 'This is paragraph four of div with class para 1']

## Using requests to fetch html

In [15]:
## Simulate a browser
url="https://www.goodreads.com/quotes"
import requests
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
quotes=requests.get(url,headers=headers).text

In [16]:
### Make the html markup parsable
quotes_parsed=BeautifulSoup(quotes,'html.parser')

In [17]:
quotes_parsed.div

<div data-react-class="ReactComponents.StoresInitializer" data-react-props="{}"><noscript data-react-checksum="-1374351170" data-reactid=".16mqg5c4h40"></noscript></div>

In [18]:
authors=[]
for i in quotes_parsed.find_all("span",class_="authorOrTitle"):
    authors.append(i.text.strip())

In [19]:
authors

['Oscar Wilde',
 'Marilyn Monroe',
 'Albert Einstein',
 'Frank Zappa',
 'Marcus Tullius Cicero',
 'Bernard M. Baruch',
 'William W. Purkey',
 'Dr. Seuss',
 'Mae West',
 'Mahatma Gandhi',
 'Robert Frost',
 'J.K. Rowling,',
 'Albert Camus',
 'Mark Twain',
 'C.S. Lewis,',
 'Maya Angelou',
 'Elbert Hubbard',
 'Oscar Wilde',
 'Oscar Wilde',
 'Mahatma Gandhi',
 'Martin Luther King Jr.,',
 'Friedrich Nietzsche,',
 'Stephen Chbosky,',
 'Oscar Wilde,',
 'Ralph Waldo Emerson',
 'Rob Siltanen',
 'Narcotics Anonymous',
 'Marilyn Monroe',
 'H. Jackson Brown Jr.,',
 'Andre Gide,']

In [20]:
quotes=[]
for i in quotes_parsed.find_all("div",class_="quoteText"):
    quotes.append((list(i.children)[0].strip()))

In [21]:
import pandas as pd
table=pd.DataFrame({'quotes':quotes,'authors':authors})

In [22]:
table.head()

Unnamed: 0,quotes,authors
0,“Be yourself; everyone else is already taken.”,Oscar Wilde
1,"“I'm selfish, impatient and a little insecure....",Marilyn Monroe
2,“Two things are infinite: the universe and hum...,Albert Einstein
3,"“So many books, so little time.”",Frank Zappa
4,“A room without books is like a body without a...,Marcus Tullius Cicero


In [23]:
base_url="https://www.goodreads.com/quotes?page={}"

In [24]:
base_url.format(1)

'https://www.goodreads.com/quotes?page=1'

In [25]:
from tqdm import tqdm
authors=[]
quotes=[]
for i in tqdm(range(1,101)):
    url=base_url.format(i)
    markup=requests.get(url,headers=headers).text
    markup_parsed=BeautifulSoup(markup,'html.parser')
    for i in markup_parsed.find_all("span",class_="authorOrTitle"):
        authors.append(i.text.strip())
    for i in markup_parsed.find_all("div",class_="quoteText"):
        quotes.append(list(i.children)[0].strip()) 

100%|█████████████████████████████████████████| 100/100 [05:21<00:00,  3.22s/it]


In [28]:
table=pd.DataFrame({'author':authors,'quotes':quotes})

In [29]:
### Class Excercise
## Extract Author Name and Book Title
url="https://www.goodreads.com/list/show/19.Best_for_Book_Clubs"

In [32]:
name=[]
author=[]
base_url="https://www.goodreads.com/list/show/19.Best_for_Book_Clubs?page={}"
for i in tqdm(range(1,11)):
    url=base_url.format(i)
    resp=requests.get(url,headers=headers).text
    page=BeautifulSoup(resp,'html.parser')
    for i in page.find_all("a",class_="bookTitle"):
        name.append(i.text)
    for i in page.find_all("a",class_="authorName"):
        author.append(i.text)

100%|███████████████████████████████████████████| 10/10 [00:39<00:00,  3.96s/it]


In [33]:
table=pd.DataFrame({'name':name,'author':author})
table.head()

Unnamed: 0,name,author
0,\nThe Help\n,Kathryn Stockett
1,\nThe Kite Runner\n,Khaled Hosseini
2,\nWater for Elephants\n,Sara Gruen
3,\nThe Book Thief\n,Markus Zusak
4,\nTo Kill a Mockingbird\n,Harper Lee
