###### The main objective of this is to get familiar with the basics of the BeautifulSoup Python Library

In [2]:
!pip install html5lib

Collecting html5lib
  Downloading html5lib-1.1-py2.py3-none-any.whl (112 kB)
     ------------------------------------ 112.2/112.2 kB 465.4 kB/s eta 0:00:00
Installing collected packages: html5lib
Successfully installed html5lib-1.1


In [2]:
!pip install bs4



In [3]:
#importing the re required modules
from bs4 import BeautifulSoup #helps us in webscrapping
import requests #helps to download a webpage

In [5]:
#we can store it as a string int the variable html
html="<!DOCTYPE html><html><head><title>Page Title</title></head><body><h3><b id='boldest'>Lebron James</b></h3><p> Salary: $ 92,000,000 </p><h3> Stephen Curry</h3><p> Salary: $85,000, 000 </p><h3> Kevin Durant </h3><p> Salary: $73,200, 000</p></body></html>"


In [6]:
#passing it onto the BeautifulSoup constructor parses the document
#the BeautifulSoup object represents the document as a nested data structure
soup=BeautifulSoup(html,'html5lib')

In [7]:
#using the method prettify() to display the HTML in nested structure
print(soup.prettify())

<!DOCTYPE html>
<html>
 <head>
  <title>
   Page Title
  </title>
 </head>
 <body>
  <h3>
   <b id="boldest">
    Lebron James
   </b>
  </h3>
  <p>
   Salary: $ 92,000,000
  </p>
  <h3>
   Stephen Curry
  </h3>
  <p>
   Salary: $85,000, 000
  </p>
  <h3>
   Kevin Durant
  </h3>
  <p>
   Salary: $73,200, 000
  </p>
 </body>
</html>


In [8]:
#we can use Tag to get the title of the page and name of the top paid player
tag_obj=soup.title
print(tag_obj)

<title>Page Title</title>


In [9]:
#we can see the tag type using type
print(type(tag_obj))

<class 'bs4.element.Tag'>


In [10]:
#If there is more than one Tag with the same name
#the first element with that Tag name is called. 
#here this corresponds to the most paid player.
tag_obj=soup.h3
tag_obj

<h3><b id="boldest">Lebron James</b></h3>

In [16]:
#since Tag is a tree of objects.We can access child of the tag or navigate down the branch as follow
tag_child=tag_obj.b
tag_child

<b id="boldest">Lebron James</b>

In [17]:
#accessing the parent tag
tag_parent=tag_child.parent
tag_parent

<h3><b id="boldest">Lebron James</b></h3>

In [18]:
tag_obj

<h3><b id="boldest">Lebron James</b></h3>

In [19]:
#tag_obj parent is the body element
tag_obj.parent

<body><h3><b id="boldest">Lebron James</b></h3><p> Salary: $ 92,000,000 </p><h3> Stephen Curry</h3><p> Salary: $85,000, 000 </p><h3> Kevin Durant </h3><p> Salary: $73,200, 000</p></body>

In [21]:
#the sibling of tag_obj is the paragraph element
sib_1=tag_obj.next_sibling
sib_1

<p> Salary: $ 92,000,000 </p>

In [22]:
sib_2=sib_1.next_sibling
sib_2

<h3> Stephen Curry</h3>

In [23]:
#accessing HTml attributes
tag_child['id']

'boldest'

In [24]:
#or we can access that dictioonnary directlt using atrrs
tag_child.attrs

{'id': 'boldest'}

In [25]:
#we can also obtain the content of the attribute of tag using get() method
tag_child.get('id')

'boldest'

#### Navigable String

In [27]:
#A string corresponds to a bit of text or content within a tag. 
#Beautiful Soup uses the NavigableString class to contain this text. 
tag_string=tag_child.string
tag_string
#here we obtain the name of the first player

'Lebron James'

In [29]:
#We can convert it to string object in Python.
unicode_string=str(tag_string)
unicode_string


'Lebron James'

### Filter

Filter allows to find complex pattern.
The simples filter is string.
In this section we will pass a string to a different filter method and Beautiful Soup will perform a match against that exact string.


In [30]:
#we store the html as a string in the variable table
table="<table><tr><td id='flight'>Flight No</td><td>Launch site</td> <td>Payload mass</td></tr><tr> <td>1</td><td><a href='https://en.wikipedia.org/wiki/Florida'>Florida<a></td><td>300 kg</td></tr><tr><td>2</td><td><a href='https://en.wikipedia.org/wiki/Texas'>Texas</a></td><td>94 kg</td></tr><tr><td>3</td><td><a href='https://en.wikipedia.org/wiki/Florida'>Florida<a> </td><td>80 kg</td></tr></table>"

In [31]:
table_bs=BeautifulSoup(table,'html5lib')

In [33]:
#the find_all() method looks through a tag's descendants and retrieves all the 
#descendants that match the filter
#When we set the name parameter to a tag name, the method will extract all the tags with that name and its children.
table_rows=table_bs.find_all('tr')
table_rows

[<tr><td id="flight">Flight No</td><td>Launch site</td> <td>Payload mass</td></tr>,
 <tr> <td>1</td><td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a><a></a></td><td>300 kg</td></tr>,
 <tr><td>2</td><td><a href="https://en.wikipedia.org/wiki/Texas">Texas</a></td><td>94 kg</td></tr>,
 <tr><td>3</td><td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a><a> </a></td><td>80 kg</td></tr>]

In [35]:
table_rows[0]#shows the first row of the Python iterable

<tr><td id="flight">Flight No</td><td>Launch site</td> <td>Payload mass</td></tr>

In [36]:
table_rows[0].td #obtaining the child

<td id="flight">Flight No</td>

In [37]:
#if we iterate through the list ,each elemnt corresponds to a row in the table
for i,row in enumerate(table_rows):
    print("row",i,"is",row)

row 0 is <tr><td id="flight">Flight No</td><td>Launch site</td> <td>Payload mass</td></tr>
row 1 is <tr> <td>1</td><td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a><a></a></td><td>300 kg</td></tr>
row 2 is <tr><td>2</td><td><a href="https://en.wikipedia.org/wiki/Texas">Texas</a></td><td>94 kg</td></tr>
row 3 is <tr><td>3</td><td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a><a> </a></td><td>80 kg</td></tr>


In [38]:
#since row is a cell object,we can use find_all() method to the cells 
#to extract table cells used the attribute td,i.e all the children with attribute td
for i,row in enumerate(table_rows):
    print("row",i)
    cells=row.find_all('td')
    for j,cell in enumerate(cells):
        print("column",j,"cell",cell)

row 0
column 0 cell <td id="flight">Flight No</td>
column 1 cell <td>Launch site</td>
column 2 cell <td>Payload mass</td>
row 1
column 0 cell <td>1</td>
column 1 cell <td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a><a></a></td>
column 2 cell <td>300 kg</td>
row 2
column 0 cell <td>2</td>
column 1 cell <td><a href="https://en.wikipedia.org/wiki/Texas">Texas</a></td>
column 2 cell <td>94 kg</td>
row 3
column 0 cell <td>3</td>
column 1 cell <td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a><a> </a></td>
column 2 cell <td>80 kg</td>


In [39]:
#using a list  we can match against any item in that list
list=table_bs.find_all(name=["tr","td"])
list

[<tr><td id="flight">Flight No</td><td>Launch site</td> <td>Payload mass</td></tr>,
 <td id="flight">Flight No</td>,
 <td>Launch site</td>,
 <td>Payload mass</td>,
 <tr> <td>1</td><td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a><a></a></td><td>300 kg</td></tr>,
 <td>1</td>,
 <td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a><a></a></td>,
 <td>300 kg</td>,
 <tr><td>2</td><td><a href="https://en.wikipedia.org/wiki/Texas">Texas</a></td><td>94 kg</td></tr>,
 <td>2</td>,
 <td><a href="https://en.wikipedia.org/wiki/Texas">Texas</a></td>,
 <td>94 kg</td>,
 <tr><td>3</td><td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a><a> </a></td><td>80 kg</td></tr>,
 <td>3</td>,
 <td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a><a> </a></td>,
 <td>80 kg</td>]

In [40]:
#finds all the elements that has link to the Florida wiki page
list=table_bs.find_all(href="https://en.wikipedia.org/wiki/Florida")
list

[<a href="https://en.wikipedia.org/wiki/Florida">Florida</a>,
 <a href="https://en.wikipedia.org/wiki/Florida">Florida</a>]

In [43]:
#if we set the href value to True code finds all the tags with href values
table_bs.find_all(href=True)
#set the href value to False to find all the elements withouut href values

[<a href="https://en.wikipedia.org/wiki/Florida">Florida</a>,
 <a href="https://en.wikipedia.org/wiki/Texas">Texas</a>,
 <a href="https://en.wikipedia.org/wiki/Florida">Florida</a>]

In [47]:
#using string we can search for strings instead of tags
table_bs.find_all(string="Florida")

['Florida', 'Florida']

In [48]:
#find_all() mmethod scans the entire docuent for results.
#find() mmethod finds the first elemment in the document
two_tables="<h3>Rocket Launch </h3><p><table class='rocket'><tr><td>Flight No</td><td>Launch site</td> <td>Payload mass</td></tr><tr><td>1</td><td>Florida</td><td>300 kg</td></tr><tr><td>2</td><td>Texas</td><td>94 kg</td></tr><tr><td>3</td><td>Florida </td><td>80 kg</td></tr></table></p><p><h3>Pizza Party  </h3><table class='pizza'><tr><td>Pizza Place</td><td>Orders</td> <td>Slices </td></tr><tr><td>Domino's Pizza</td><td>10</td><td>100</td></tr><tr><td>Little Caesars</td><td>12</td><td >144 </td></tr><tr><td>Papa John's </td><td>15 </td><td>165</td></tr>"

In [49]:
two_tables_bs= BeautifulSoup(two_tables, 'html.parser')#creates beautifulsoup obj

In [50]:
#finding the first table
two_tables_bs.find('table')

<table class="rocket"><tr><td>Flight No</td><td>Launch site</td> <td>Payload mass</td></tr><tr><td>1</td><td>Florida</td><td>300 kg</td></tr><tr><td>2</td><td>Texas</td><td>94 kg</td></tr><tr><td>3</td><td>Florida </td><td>80 kg</td></tr></table>

##### Downloading and Scraping The Cotents of a WebPage


In [51]:
url="http://www.ibm.com"

In [54]:
#we can use get to download the contents opf the webpage
data=requests.get(url).text

In [55]:
#creating a BeautifulSoup object using a  BeautifulSoup constructor
soup=BeautifulSoup(data,"html5lib")

In [57]:
#scrap all links
for link in soup.find_all('a',href=True):#in html anchor/link is represented ny a
   print(link.get('href'))

https://www.ibm.com/resources/the-data-differentiator/scale-ai?lnk=hpenls1
https://www.ibm.com/watsonx?lnk=hpenls2
https://www.ibm.com/flashsystem/resources/offers?lnk=hpenca1
//mediacenter.ibm.com/id/1_t4tolges
//mediacenter.ibm.com/id/1_t4tolges
//mediacenter.ibm.com/id/1_t4tolges
https://www.ibm.com/in-en/data-fabric?lnk=hpencm1
//mediacenter.ibm.com/id/1_ohfv4i6v
//mediacenter.ibm.com/id/1_ohfv4i6v
//mediacenter.ibm.com/id/1_ohfv4i6v
https://www.ibm.com/in-en/sustainability?lnk=hpencm2
//mediacenter.ibm.com/id/1_4f1czavh
//mediacenter.ibm.com/id/1_4f1czavh
//mediacenter.ibm.com/id/1_4f1czavh
https://www.ibm.com/consulting/?lnk=hpencm3
#tab_3171780
#tab_3171784
#tab_3171788
#tab_3171792
#tab_3171796
#tab_3171800
https://www.ibm.com/consulting/?lnk=hpenco1
https://www.ibm.com/consulting/strategy/?lnk=hpenco2
https://www.ibm.com/consulting/ibmix?lnk=flathl
https://www.ibm.com/consulting/technology/?lnk=hpenco4
https://www.ibm.com/services/operations-consulting?lnk=flathl
/strategic-pa

In [60]:
#scrap all immage tags
for link in soup.find_all('img'):
    print (link)
    print(link.get('src'))
    

<img alt="Financial worker at desk with six computer screens" class="" loading="lazy" src="//1.cms.s81c.com/sites/default/files/2023-04-05/CS_The_Modernize_Banking_Question_V1_30_mobile-frame-04-30_drupal.jpg"/>
//1.cms.s81c.com/sites/default/files/2023-04-05/CS_The_Modernize_Banking_Question_V1_30_mobile-frame-04-30_drupal.jpg
<img alt="view of the sky between skyscrapers" class="" loading="lazy" src="//1.cms.s81c.com/sites/default/files/2023-04-05/CS_The_Sustainability_Building_Question_30_mobile-07-00_drupal.jpg"/>
//1.cms.s81c.com/sites/default/files/2023-04-05/CS_The_Sustainability_Building_Question_30_mobile-07-00_drupal.jpg
<img alt="Close-up of golf ball on a green course" class="" loading="lazy" src="//1.cms.s81c.com/sites/default/files/2023-04-05/CS_the_transform_masters_question_30_mobile_frame00_drupal_0.jpg"/>
//1.cms.s81c.com/sites/default/files/2023-04-05/CS_the_transform_masters_question_30_mobile_frame00_drupal_0.jpg
<img alt="business meeting" class="" loading="lazy" 

In [61]:
#scrape data from htmml tables
#The below url contains an html table with data about colors and color codes.
url = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DA0321EN-SkillsNetwork/labs/datasets/HTMLColorCodes.html"

In [62]:
#get the contents of webpage and store it in text format
data=requests.get(url).text

In [63]:
soup=BeautifulSoup(data,'html5lib')

In [64]:
#finding a html table in the webpage
table=soup.find('table')

In [66]:
#gets the rows of the table
for row in table.find_all('tr'):#tr is the html tag for table row
    #get all the columms in each row
    cols=row.find_all('td')#td is the html tag for table column
    color_name=cols[2].string#stores the value of column 3 as color_name
    color_code=cols[3].string#stores the value of column 4 as color_code
    print("{}--->{}".format(color_name,color_code))

Color Name--->None
lightsalmon--->#FFA07A
salmon--->#FA8072
darksalmon--->#E9967A
lightcoral--->#F08080
coral--->#FF7F50
tomato--->#FF6347
orangered--->#FF4500
gold--->#FFD700
orange--->#FFA500
darkorange--->#FF8C00
lightyellow--->#FFFFE0
lemonchiffon--->#FFFACD
papayawhip--->#FFEFD5
moccasin--->#FFE4B5
peachpuff--->#FFDAB9
palegoldenrod--->#EEE8AA
khaki--->#F0E68C
darkkhaki--->#BDB76B
yellow--->#FFFF00
lawngreen--->#7CFC00
chartreuse--->#7FFF00
limegreen--->#32CD32
lime--->#00FF00
forestgreen--->#228B22
green--->#008000
powderblue--->#B0E0E6
lightblue--->#ADD8E6
lightskyblue--->#87CEFA
skyblue--->#87CEEB
deepskyblue--->#00BFFF
lightsteelblue--->#B0C4DE
dodgerblue--->#1E90FF
