# Beautiful Soup
Check the beautiful soup documentation page



In [14]:
from bs4 import BeautifulSoup
import requests

In [15]:
with open('dummy.html', 'r') as f:
    doc = BeautifulSoup(f, "html.parser") # parse over the html document

print(doc.prettify())

<html>
 <head>
  <title>
   Your Title Here
  </title>
 </head>
 <body bgcolor="FFFFFF">
  <center>
   <img align="BOTTOM" src="clouds.jpg"/>
  </center>
  <hr/>
  <a href="http://somegreatsite.com">
   Link Name
  </a>
  is a link to another nifty site
  <h1>
   This is a Header
  </h1>
  <h2>
   This is a Medium Header
  </h2>
  Send me mail at
  <a href="mailto:support@yourcompany.com">
   support@yourcompany.com
  </a>
  .
  <p>
   This is a new paragraph!
   <p>
    <b color="red">
     This is a new paragraph!
    </b>
    <br/>
    <b>
     <i>
      This is a new sentence without a paragraph break, in bold italics.
     </i>
    </b>
    <hr/>
   </p>
  </p>
 </body>
</html>


this allows us to read over and format the html

In [16]:
# we want to search for a specific aspect
# like a table, or some information
# how do we find things by the tag name?
tag = doc.title
print(tag) # access specific tag. It will only give the first one however

<title>Your Title Here</title>


In [17]:
# access the string inside of a tag
print(tag.string)

Your Title Here


In [18]:
# modify the string
tag.string = "Hello"
print(doc.prettify())

<html>
 <head>
  <title>
   Hello
  </title>
 </head>
 <body bgcolor="FFFFFF">
  <center>
   <img align="BOTTOM" src="clouds.jpg"/>
  </center>
  <hr/>
  <a href="http://somegreatsite.com">
   Link Name
  </a>
  is a link to another nifty site
  <h1>
   This is a Header
  </h1>
  <h2>
   This is a Medium Header
  </h2>
  Send me mail at
  <a href="mailto:support@yourcompany.com">
   support@yourcompany.com
  </a>
  .
  <p>
   This is a new paragraph!
   <p>
    <b color="red">
     This is a new paragraph!
    </b>
    <br/>
    <b>
     <i>
      This is a new sentence without a paragraph break, in bold italics.
     </i>
    </b>
    <hr/>
   </p>
  </p>
 </body>
</html>


In [19]:
# get list of tags
tag = doc.find('a') # find first instance of a tag that starts with a
tags = doc.find_all('p') # get list of tags

In [20]:
print(tag)
print(tags)

<a href="http://somegreatsite.com">Link Name</a>
[<p> This is a new paragraph!
    
<p> <b color="red">This is a new paragraph!</b>
<br/> <b><i>This is a new sentence without a paragraph break, in bold italics.</i></b>
<hr/>
</p></p>, <p> <b color="red">This is a new paragraph!</b>
<br/> <b><i>This is a new sentence without a paragraph break, in bold italics.</i></b>
<hr/>
</p>]


In [21]:
# accessing nested tags
print(doc.find_all('p')[0].find_all('b')[1])

<b><i>This is a new sentence without a paragraph break, in bold italics.</i></b>


In [35]:
# now how do we read html in a website?
# we will check newegg using the requests library
url = r"https://www.youtube.com/redirect?event=video_description&redir_token=QUFFLUhqbmZaUGliNXlteDd5bjYwU3NJSHRreGg2VzJSZ3xBQ3Jtc0tuVFRHdndVTm9TZ2ZmdnhoMXpMSlpGcnRPTDU0NllDeVFWTXpKZHVGelczUmtOREYydVVDbzhTVDdoZzNXekJFQ2R0RnFSSmlfMC0zaV9RMXFZZ0RzaTR3UHZmVlN6S3hUa0hQcC1zMEdBdXJJem9uZw&q=https%3A%2F%2Fwww.newegg.ca%2Fgigabyte-geforce-rtx-3080-ti-gv-n308tgaming-oc-12gd%2Fp%2FN82E16814932436%3FDescription%3D3080%26cm_re%3D3080-_-14-932-436-_-Product&v=gRLHr664tXA"
result = requests.get(url)
doc = BeautifulSoup(result.text, "html.parser")

In [36]:
# lots of websites dont let you submit bot get requests
# make sure you arent spamming requests, thats accidental dosing
# so how do we get the price of the GPU?
# we will look for the dollar sign and then the price afterwards

prices = doc.find_all(text="a")
print(prices) # should return the price

[]


### Beautiful soup structure
The beautiful soup library structures HTML files like trees. The trunk of the tree is the header, for instance, and everything inside exists as branches

the parent of the title is the head, the descendant of the head is the body, for example 

if you access the parent of an object or string, you can search for the string in that parent

In [37]:
# so we can get the parent by doing:
parent = prices[0].parent
print(parent)
strong = parent.find("strong")
print(strong.string)

IndexError: list index out of range