# BeautifulSoup

In [5]:
from bs4 import BeautifulSoup
import requests
import json
import pandas as pd

URL = "http://www.example.com"
page = requests.get(URL)
soup = BeautifulSoup(page.content, "html.parser")

# Nagivating an HTML File

In [7]:
%%html

<!DOCTYPE html>
<html>
<head>
    <title>NBA Players Salaries</title>
</head>
<body>
    <div class="player" id="player1">
        <h3><b id="boldest">LeBron James</b></h3>
        <p> Salary: $92,000,000 </p>
    </div>
    <div class="player" id="player2">
        <h3>Stephen Curry</h3>
        <p> Salary: $85,000,000 </p>
    </div>
    <div class="player" id="player3">
        <h3>Kevin Durant</h3>
        <p> Salary: $73,200,000 </p>
    </div>
    <div class="player" id="player4">
        <h3>Giannis Antetokounmpo</h3>
        <p> Salary: $45,000,000 </p>
    </div>
</body>
</html>


## Parse html with BeautifulSoup

In [8]:
# Load the HTML
html_content = '''
<!DOCTYPE html>
<html>
<head>
    <title>NBA Players Salaries</title>
</head>
<body>
    <div class="player" id="player1">
        <h3><b id="boldest">LeBron James</b></h3>
        <p> Salary: $92,000,000 </p>
    </div>
    <div class="player" id="player2">
        <h3>Stephen Curry</h3>
        <p> Salary: $85,000,000 </p>
    </div>
    <div class="player" id="player3">
        <h3>Kevin Durant</h3>
        <p> Salary: $73,200,000 </p>
    </div>
    <div class="player" id="player4">
        <h3>Giannis Antetokounmpo</h3>
        <p> Salary: $45,000,000 </p>
    </div>
</body>
</html>
'''

soup = BeautifulSoup(html_content, 'html.parser')

## More Sctructured View

In [10]:
print(soup.prettify())

<!DOCTYPE html>
<html>
 <head>
  <title>
   NBA Players Salaries
  </title>
 </head>
 <body>
  <div class="player" id="player1">
   <h3>
    <b id="boldest">
     LeBron James
    </b>
   </h3>
   <p>
    Salary: $92,000,000
   </p>
  </div>
  <div class="player" id="player2">
   <h3>
    Stephen Curry
   </h3>
   <p>
    Salary: $85,000,000
   </p>
  </div>
  <div class="player" id="player3">
   <h3>
    Kevin Durant
   </h3>
   <p>
    Salary: $73,200,000
   </p>
  </div>
  <div class="player" id="player4">
   <h3>
    Giannis Antetokounmpo
   </h3>
   <p>
    Salary: $45,000,000
   </p>
  </div>
 </body>
</html>



## Tags

In [16]:
tag_title = soup.title
print("tag object:", tag_object)

tag object: <title>NBA Players Salaries</title>


In [17]:
tag_player = soup.h3
tag_player

<h3><b id="boldest">LeBron James</b></h3>

## Accesing child

In [18]:
tag_player_child = tag_player.b
tag_player_child

<b id="boldest">LeBron James</b>

## Accesing Parent

In [20]:
tag_player_parent = tag_player_child.parent
tag_player_parent

<h3><b id="boldest">LeBron James</b></h3>

In [21]:
tag_player_parent2 =tag_player_parent.parent
tag_player_parent2

<div class="player" id="player1">
<h3><b id="boldest">LeBron James</b></h3>
<p> Salary: $92,000,000 </p>
</div>

In [23]:
tag_player_parent3 =tag_player_parent2.parent
tag_player_parent3

<body>
<div class="player" id="player1">
<h3><b id="boldest">LeBron James</b></h3>
<p> Salary: $92,000,000 </p>
</div>
<div class="player" id="player2">
<h3>Stephen Curry</h3>
<p> Salary: $85,000,000 </p>
</div>
<div class="player" id="player3">
<h3>Kevin Durant</h3>
<p> Salary: $73,200,000 </p>
</div>
<div class="player" id="player4">
<h3>Giannis Antetokounmpo</h3>
<p> Salary: $45,000,000 </p>
</div>
</body>

## Accesing Sibling

In [33]:
tag_body = soup.h3
tag_body

<h3><b id="boldest">LeBron James</b></h3>

In [34]:
tag_body_sibling = tag_body.next_sibling
tag_body_sibling

'\n'

In [35]:
tag_body_sibling = tag_body_sibling.next_sibling
tag_body_sibling

<p> Salary: $92,000,000 </p>

## Using Attributes

In [46]:
tag_player = soup.h3
tag_player

<h3><b id="boldest">LeBron James</b></h3>

In [51]:
tag_player_child = tag_player.b
tag_player_child

<b id="boldest">LeBron James</b>

In [52]:
tag_player_child.attrs

{'id': 'boldest'}

## String

In [53]:
tag_player = soup.h3

In [54]:
tag_player.string

'LeBron James'

## Finding Specific Player

In [57]:
finding_tag = soup.find(id = "player1")
finding_tag

<div class="player" id="player1">
<h3><b id="boldest">LeBron James</b></h3>
<p> Salary: $92,000,000 </p>
</div>

In [62]:
finding_tag_sibling = finding_tag.next_sibling.next_sibling # skipping \n
finding_tag_sibling

<div class="player" id="player2">
<h3>Stephen Curry</h3>
<p> Salary: $85,000,000 </p>
</div>

In [65]:
player = soup.find(class_='player')
while player:
    name = player.h3.text
    salary = player.p.text
    print(f'{name}: {salary}')
    player = player.next_sibling.next_sibling  # Move to the next sibling


LeBron James:  Salary: $92,000,000 
Stephen Curry:  Salary: $85,000,000 
Kevin Durant:  Salary: $73,200,000 
Giannis Antetokounmpo:  Salary: $45,000,000 


In [67]:
all_h3_tags = soup.find_all('h3')
print('All <h3> tags:')
for h3 in all_h3_tags:
    print(h3)

All <h3> tags:
<h3><b id="boldest">LeBron James</b></h3>
<h3>Stephen Curry</h3>
<h3>Kevin Durant</h3>
<h3>Giannis Antetokounmpo</h3>


# Naviagting html tables

In [69]:
%%html
<table>
  <tr>
    <td id='flight' >Flight No</td>
    <td>Launch site</td> 
    <td>Payload mass</td>
   </tr>
  <tr> 
    <td>1</td>
    <td><a href='https://en.wikipedia.org/wiki/Florida'>Florida</a></td>
    <td>300 kg</td>
  </tr>
  <tr>
    <td>2</td>
    <td><a href='https://en.wikipedia.org/wiki/Texas'>Texas</a></td>
    <td>94 kg</td>
  </tr>
  <tr>
    <td>3</td>
    <td><a href='https://en.wikipedia.org/wiki/Florida'>Florida<a> </td>
    <td>80 kg</td>
  </tr>
</table>

0,1,2
Flight No,Launch site,Payload mass
1,Florida,300 kg
2,Texas,94 kg
3,Florida,80 kg


In [70]:
table = "<table><tr><td id='flight'>Flight No</td><td>Launch site</td> \
<td>Payload mass</td></tr><tr> <td>1</td> \
<td><a href='https://en.wikipedia.org/wiki/Florida'>Florida<a></td> \
<td>300 kg</td></tr><tr><td>2</td> \
<td><a href='https://en.wikipedia.org/wiki/Texas'>Texas</a></td> \
<td>94 kg</td></tr><tr><td>3</td> \
<td><a href='https://en.wikipedia.org/wiki/Florida'>Florida<a> </td> \
<td>80 kg</td></tr></table>"

In [71]:
table_bs = BeautifulSoup(table, 'html5lib')

## Find all

In [74]:
table_rows = table_bs.find_all("tr")
table_rows

[<tr><td id="flight">Flight No</td><td>Launch site</td> <td>Payload mass</td></tr>,
 <tr> <td>1</td> <td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a><a></a></td> <td>300 kg</td></tr>,
 <tr><td>2</td> <td><a href="https://en.wikipedia.org/wiki/Texas">Texas</a></td> <td>94 kg</td></tr>,
 <tr><td>3</td> <td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a><a> </a></td> <td>80 kg</td></tr>]

In [81]:
first_row = table_rows[0]
first_row

<tr><td id="flight">Flight No</td><td>Launch site</td> <td>Payload mass</td></tr>

In [80]:
first_row.next_sibling

<tr> <td>1</td> <td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a><a></a></td> <td>300 kg</td></tr>

In [82]:
first_row_child = first_row.td
first_row_child

<td id="flight">Flight No</td>

In [83]:
for i, row in enumerate(table_rows):
    print("row", i, "is", row)

row 0 is <tr><td id="flight">Flight No</td><td>Launch site</td> <td>Payload mass</td></tr>
row 1 is <tr> <td>1</td> <td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a><a></a></td> <td>300 kg</td></tr>
row 2 is <tr><td>2</td> <td><a href="https://en.wikipedia.org/wiki/Texas">Texas</a></td> <td>94 kg</td></tr>
row 3 is <tr><td>3</td> <td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a><a> </a></td> <td>80 kg</td></tr>


In [84]:
for i, row in enumerate(table_rows):
    print("row", i)
    cells = row.find_all('td')
    for j, cell in enumerate(cells):
        print('colunm', j, "cell", cell)

row 0
colunm 0 cell <td id="flight">Flight No</td>
colunm 1 cell <td>Launch site</td>
colunm 2 cell <td>Payload mass</td>
row 1
colunm 0 cell <td>1</td>
colunm 1 cell <td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a><a></a></td>
colunm 2 cell <td>300 kg</td>
row 2
colunm 0 cell <td>2</td>
colunm 1 cell <td><a href="https://en.wikipedia.org/wiki/Texas">Texas</a></td>
colunm 2 cell <td>94 kg</td>
row 3
colunm 0 cell <td>3</td>
colunm 1 cell <td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a><a> </a></td>
colunm 2 cell <td>80 kg</td>


In [85]:
list_input = table_bs.find_all(name=["tr", "td"])
list_input

[<tr><td id="flight">Flight No</td><td>Launch site</td> <td>Payload mass</td></tr>,
 <td id="flight">Flight No</td>,
 <td>Launch site</td>,
 <td>Payload mass</td>,
 <tr> <td>1</td> <td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a><a></a></td> <td>300 kg</td></tr>,
 <td>1</td>,
 <td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a><a></a></td>,
 <td>300 kg</td>,
 <tr><td>2</td> <td><a href="https://en.wikipedia.org/wiki/Texas">Texas</a></td> <td>94 kg</td></tr>,
 <td>2</td>,
 <td><a href="https://en.wikipedia.org/wiki/Texas">Texas</a></td>,
 <td>94 kg</td>,
 <tr><td>3</td> <td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a><a> </a></td> <td>80 kg</td></tr>,
 <td>3</td>,
 <td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a><a> </a></td>,
 <td>80 kg</td>]

In [87]:
table_bs.find_all(id="flight")

[<td id="flight">Flight No</td>]

In [88]:
list_input = table_bs.find_all(href="https://en.wikipedia.org/wiki/Florida")
list_input

[<a href="https://en.wikipedia.org/wiki/Florida">Florida</a>,
 <a href="https://en.wikipedia.org/wiki/Florida">Florida</a>]

In [89]:
table_bs.find_all('a', href=True)

[<a href="https://en.wikipedia.org/wiki/Florida">Florida</a>,
 <a href="https://en.wikipedia.org/wiki/Texas">Texas</a>,
 <a href="https://en.wikipedia.org/wiki/Florida">Florida</a>]

In [90]:
table_bs.find_all(string="Florida")

['Florida', 'Florida']

## Find 

In [91]:
%%html
<h3>Rocket Launch </h3>

<p>
<table class='rocket'>
  <tr>
    <td>Flight No</td>
    <td>Launch site</td> 
    <td>Payload mass</td>
  </tr>
  <tr>
    <td>1</td>
    <td>Florida</td>
    <td>300 kg</td>
  </tr>
  <tr>
    <td>2</td>
    <td>Texas</td>
    <td>94 kg</td>
  </tr>
  <tr>
    <td>3</td>
    <td>Florida </td>
    <td>80 kg</td>
  </tr>
</table>
</p>
<p>

<h3>Pizza Party</h3>
  
    
<table class='pizza'>
  <tr>
    <td>Pizza Place</td>
    <td>Orders</td> 
    <td>Slices </td>
   </tr>
  <tr>
    <td>Domino's Pizza</td>
    <td>10</td>
    <td>100</td>
  </tr>
  <tr>
    <td>Little Caesars</td>
    <td>12</td>
    <td >144 </td>
  </tr>
  <tr>
    <td>Papa John's </td>
    <td>15 </td>
    <td>165</td>
  </tr>


0,1,2
Flight No,Launch site,Payload mass
1,Florida,300 kg
2,Texas,94 kg
3,Florida,80 kg

0,1,2
Pizza Place,Orders,Slices
Domino's Pizza,10,100
Little Caesars,12,144
Papa John's,15,165


In [92]:
tables = """
<h3>Rocket Launch </h3>

<p>
<table class='rocket'>
  <tr>
    <td>Flight No</td>
    <td>Launch site</td> 
    <td>Payload mass</td>
  </tr>
  <tr>
    <td>1</td>
    <td>Florida</td>
    <td>300 kg</td>
  </tr>
  <tr>
    <td>2</td>
    <td>Texas</td>
    <td>94 kg</td>
  </tr>
  <tr>
    <td>3</td>
    <td>Florida </td>
    <td>80 kg</td>
  </tr>
</table>
</p>
<p>

<h3>Pizza Party</h3>
  
    
<table class='pizza'>
  <tr>
    <td>Pizza Place</td>
    <td>Orders</td> 
    <td>Slices </td>
   </tr>
  <tr>
    <td>Domino's Pizza</td>
    <td>10</td>
    <td>100</td>
  </tr>
  <tr>
    <td>Little Caesars</td>
    <td>12</td>
    <td >144 </td>
  </tr>
  <tr>
    <td>Papa John's </td>
    <td>15 </td>
    <td>165</td>
  </tr>
  """

In [96]:
two_tables_bs = BeautifulSoup(tables, "html.parser")

In [97]:
two_tables_bs.find("table")

<table class="rocket">
<tr>
<td>Flight No</td>
<td>Launch site</td>
<td>Payload mass</td>
</tr>
<tr>
<td>1</td>
<td>Florida</td>
<td>300 kg</td>
</tr>
<tr>
<td>2</td>
<td>Texas</td>
<td>94 kg</td>
</tr>
<tr>
<td>3</td>
<td>Florida </td>
<td>80 kg</td>
</tr>
</table>

In [99]:
two_tables_bs.find("table", class_='pizza')

<table class="pizza">
<tr>
<td>Pizza Place</td>
<td>Orders</td>
<td>Slices </td>
</tr>
<tr>
<td>Domino's Pizza</td>
<td>10</td>
<td>100</td>
</tr>
<tr>
<td>Little Caesars</td>
<td>12</td>
<td>144 </td>
</tr>
<tr>
<td>Papa John's </td>
<td>15 </td>
<td>165</td>
</tr>
</table>

# Scraping Content of a Web Page

In [12]:
url = "http://www.ibm.com"

In [13]:
data = requests.get(url).text

In [14]:
datasoup = BeautifulSoup(data, "html5lib")  # create a soup object using the variable 'data'

## Scraping all links

In [15]:
for link in soup.find_all('a', href=True):  # in html anchor/link is represented by the tag <a>
    print(link.get('href'))

https://www.iana.org/domains/example


## All image tags

In [17]:
for link in soup.find_all('img'):  # in html image is represented by the tag <img>
    print(link)
    print(link.get('src'))

## Scape data from HTML tables

In [18]:
# The below url contains an html table with data about colors and color codes.
url = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DA0321EN-SkillsNetwork/labs/datasets/HTMLColorCodes.html"

In [19]:
# get the contents of the webpage in text format and store in a variable called data
data = requests.get(url).text

In [20]:
soup = BeautifulSoup(data, "html5lib")

In [24]:
table = soup.find("table")

In [25]:
# Get all rows from the table
for row in table.find_all('tr'):  # in html table row represented by tag <tr>
    # Get all columns in each row.
    cols = row.find_all('td')  # in html a column is represented by tag <td>
    color_name = cols[2].string  # store the value in column 3 as color_name
    color_code = cols[3].text  # store the value in column 4 as color_code
    print("{}--->{}".format(color_name, color_code))

Color Name--->Hex Code#RRGGBB
lightsalmon--->#FFA07A
salmon--->#FA8072
darksalmon--->#E9967A
lightcoral--->#F08080
coral--->#FF7F50
tomato--->#FF6347
orangered--->#FF4500
gold--->#FFD700
orange--->#FFA500
darkorange--->#FF8C00
lightyellow--->#FFFFE0
lemonchiffon--->#FFFACD
papayawhip--->#FFEFD5
moccasin--->#FFE4B5
peachpuff--->#FFDAB9
palegoldenrod--->#EEE8AA
khaki--->#F0E68C
darkkhaki--->#BDB76B
yellow--->#FFFF00
lawngreen--->#7CFC00
chartreuse--->#7FFF00
limegreen--->#32CD32
lime--->#00FF00
forestgreen--->#228B22
green--->#008000
powderblue--->#B0E0E6
lightblue--->#ADD8E6
lightskyblue--->#87CEFA
skyblue--->#87CEEB
deepskyblue--->#00BFFF
lightsteelblue--->#B0C4DE
dodgerblue--->#1E90FF


# Scraping tables using pandas

In [26]:
# The below url contains an html table with data about colors and color codes.
url = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DA0321EN-SkillsNetwork/labs/datasets/HTMLColorCodes.html"

In [32]:
tables = pd.read_html(url)
table = tables[0]
table

Unnamed: 0,0,1,2,3,4
0,Number,Color,Color Name,Hex Code #RRGGBB,"Decimal Code (R,G,B)"
1,1,,lightsalmon,#FFA07A,"rgb(255,160,122)"
2,2,,salmon,#FA8072,"rgb(250,128,114)"
3,3,,darksalmon,#E9967A,"rgb(233,150,122)"
4,4,,lightcoral,#F08080,"rgb(240,128,128)"
5,5,,coral,#FF7F50,"rgb(255,127,80)"
6,6,,tomato,#FF6347,"rgb(255,99,71)"
7,7,,orangered,#FF4500,"rgb(255,69,0)"
8,8,,gold,#FFD700,"rgb(255,215,0)"
9,9,,orange,#FFA500,"rgb(255,165,0)"


## API use

In [2]:
# Write your code here
url = "https://official-joke-api.appspot.com/jokes/ten"
r = requests.get(url)

In [3]:
results = json.loads(r.text)
df = pd.DataFrame(results)
df.drop(columns = ["type", "id"], inplace = True)
df

Unnamed: 0,setup,punchline
0,"3 SQL statements walk into a NoSQL bar. Soon, ...",They couldn't find a table.
1,What do I look like?,A JOKE MACHINE!?
2,Why dot net developers don't wear glasses?,Because they see sharp.
3,What’s E.T. short for?,He’s only got little legs.
4,Why couldn't the kid see the pirate movie?,Because it was rated arrr!
5,How good are you at Power Point?,I Excel at it.
6,What do you call a singing Laptop?,A Dell
7,Knock knock. \n Who's there? \n A broken penci...,Never mind. It's pointless.
8,What did the router say to the doctor?,It hurts when IP.
9,Did you hear that David lost his ID in prague?,Now we just have to call him Dav.


# Scrapy

In [3]:
import scrapy
class QuotesSpider(scrapy.Spider):
    name = "quotes"
    start_urls = ['http://quotes.toscrape.com/tag/humor/',]
    def parse(self, response):
        for quote in response.css('div.quote'):
            yield {'quote': quote.css('span.text::text').get()}

# Selenium

In [None]:
from selenium import webdriver
driver = webdriver.Firefox()
driver.get("http://www.example.com")

# Applications of Web Scraping
Web scraping is used in various fields and has many applications:

1. Price Comparison: Services such as ParseHub use web scraping to collect data from online shopping websites and use it to compare the prices of products.

2. Email address gathering: Many companies that use email as a medium for marketing, use web scraping to collect email ID and then send bulk emails.

3. Social Media Scraping: Web scraping is used to collect data from Social Media websites such as Twitter to find out what's trending.

# Web Scraping Tables Using Pandas

In [14]:
URL = 'https://en.wikipedia.org/wiki/List_of_largest_banks'

In [15]:
tables = pd.read_html(URL)

In [17]:
df = tables[0]
df

Unnamed: 0,Rank,Bank name,Total assets (2023) (US$ billion)
0,1,Industrial and Commercial Bank of China,6303.44
1,2,Agricultural Bank of China,5623.12
2,3,China Construction Bank,5400.28
3,4,Bank of China,4578.28
4,5,JPMorgan Chase,3875.39
...,...,...,...
95,96,Handelsbanken,351.79
96,97,Industrial Bank of Korea,345.81
97,98,DNB,339.21
98,99,Qatar National Bank,338.14


## problems with this approach 
e.g. hyperlinks in tables

In [19]:
URL = 'https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)'
tables = pd.read_html(URL)
df = tables[2] # the required table will have index 2
print(df)

    Country/Territory UN region IMF[1][13]            World Bank[14]  \
    Country/Territory UN region   Forecast       Year       Estimate   
0               World         —  109529216       2024      100562011   
1       United States  Americas   28781083       2024       25462700   
2               China      Asia   18532633  [n 1]2024       17963171   
3             Germany    Europe    4591100       2024        4072192   
4               Japan      Asia    4110452       2024        4231141   
..                ...       ...        ...        ...            ...   
208  Marshall Islands   Oceania        305       2024            280   
209          Anguilla  Americas          —          —              —   
210             Nauru   Oceania        161       2024            151   
211        Montserrat  Americas          —          —              —   
212            Tuvalu   Oceania         66       2024             60   

               United Nations[15]             
          Year  