# Lab 09: BeautifulSoup

In [2]:
# import libraries

import requests
from bs4 import BeautifulSoup


---

### From lecture notes: `.find()` & `.find_all()`

Using a tag name as an attribute gives us only the first tag by that name.

If we need to get all tags with a certain name, we need to use `find_all()`.

The `find_all()` (`find()`) method can take a variety of filters to find lists of desired tags (a single tag):

In [3]:
# Sample HTML document used by the .find() / .find_all() demos below.
# It is written as one fragment per element and joined without any separator.
html_sample_code = ''.join((
    '<!DOCTYPE html><html lang="en"><head><title>Sample HTML Page</title></head>',
    '<body><h1>This is a heading.</h1>',
    '<p>This is a typical paragraph.</p>',
    '<p class="class-one">This is a paragraph of class "class-one".</p>',
    '<ol><li class="class-one"><a href="sample.html">The 1st item</a></li>',
    '<li class="class-one">The 2nd item</li></ol>',
    '<p id="unique-one">This is a paragraph with an ID of "unique-one".</p>',
    '<div class="col m-3 border class-one">This is a division.',
    '<a href="sample.html">link 2</a></div></body></html>',
))

# Parse the markup with the 'html.parser' backend.
sample_soup = BeautifulSoup(html_sample_code, 'html.parser')

In [4]:
# .find() matches the tag name exactly and returns only the first tag encountered
sample_soup.find(name='p')

<p>This is a typical paragraph.</p>

In [5]:
# .find_all() matches the tag name exactly and returns every matching tag as a list
sample_soup.find_all(name='p')

[<p>This is a typical paragraph.</p>,
 <p class="class-one">This is a paragraph of class "class-one".</p>,
 <p id="unique-one">This is a paragraph with an ID of "unique-one".</p>]

In [6]:
# a list of names matches any tag whose name equals one of the items in that list
sample_soup.find_all(name=["p", "a"])

[<p>This is a typical paragraph.</p>,
 <p class="class-one">This is a paragraph of class "class-one".</p>,
 <a href="sample.html">The 1st item</a>,
 <p id="unique-one">This is a paragraph with an ID of "unique-one".</p>,
 <a href="sample.html">link 2</a>]

In [7]:
# the attrs dictionary restricts the match to <p> tags carrying the given attribute value
sample_soup.find_all(name='p', attrs={'class': 'class-one'})

[<p class="class-one">This is a paragraph of class "class-one".</p>]

---

### Lab Tasks 1-2
Using Requests and BeautifulSoup, fetch and scrape the following etnet page:

http://www.etnet.com.hk/www/eng/stocks/indexes_detail.php?subtype=HSI

##### 1: Hang Seng Index

Use methods `.find` with tag name and class name.

*Hint: Use "inspect" to find the class name needed.*

In [8]:
# Fetch the HSI detail page and parse it; `soup` is reused by the menu-bar tasks below.
index_url = "http://www.etnet.com.hk/www/eng/stocks/indexes_detail.php?subtype=HSI"

response = requests.get(index_url, timeout=3)
response.raise_for_status()  # fail on a 4xx/5xx reply instead of parsing an error page
soup = BeautifulSoup(response.content, 'html.parser')

# complete the code below
# The index value is looked up by tag name and class name.
hs_index = soup.find('div', {'class': 'StkIndexesNorminal'})
if hs_index is None:
    # Without this check a layout change would only show up as
    # "'NoneType' object has no attribute 'text'".
    raise ValueError("Hang Seng Index element (div.StkIndexesNorminal) not found on the page")
hs_index.text

'22,849.81'

##### 2: Sub Menu Bar

2.1: At the top of the website, there is a menu bar (HTML: a `div` element with `id:'SubMenuBar'`) showing text items such as Home, RT Quote, Indices, etc. Use the methods `.find`, `.find_all` and `.get_text` to scrape the text items in the menu bar.

In [9]:
# complete the code below
# Find the menu container by its id, then every menu block inside it.
menu_bar = soup.find('div', id='SubMenuBar')
items = menu_bar.find_all('div', class_='SubMenuBlock')

# Blocks whose text contains '|' are left out; the rest contribute their stripped text.
bar_belt_items = [
    menu_block.get_text(strip=True)
    for menu_block in items
    if '|' not in menu_block.text
]
print('\n'.join(bar_belt_items))

Home
RT Quote
Indices
Industry
Top 20
Record High
Short Sell
Hot Sector
AH
IPO
Company Info


2.2: Now, use `.get` to scrape the hyperlink (the `href` attribute) of each text item in the menu bar.


In [10]:
# complete the code below
# For every menu block without a '|' in its text, read the href of its first <a> tag.
bar_belt_items_links = [
    menu_block.a.get('href')
    for menu_block in items
    if '|' not in menu_block.text
]
print(*bar_belt_items_links, sep='\n')

/www/eng/stocks/realtime/index.php
/www/eng/stocks/realtime/quote.php?code=1
/www/eng/stocks/indexes_main.php
/www/eng/stocks/industry.php
/www/eng/stocks/realtime/top20.php
/www/eng/stocks/breakrecord.php
/www/eng/stocks/ci_act_sell.php
/www/eng/stocks/sector_hot.php
/www/eng/stocks/ah.php
/www/eng/stocks/ci_ipo.php
/www/eng/stocks/ci_database.php


---
### Pandas Dataframe: library for data analysis

To create a dataframe:

```Python
var = pd.DataFrame({'header1': list1, 'header2': list2})
```

Each header–list pair defines one column: the dictionary key (a string) is the column header, and the list holds that column's values.

We can use a dataframe to display the data scraped!

---

In [11]:
import pandas as pd

In [12]:
# Column-wise gradebook data: position i of every list describes the same student.
sid = [20789215, 20791348, 20795589, 20834892, 20861624, 20954221]
last_name = ["Chan", "Lam", "Chau", "Lau", "Au-Yeung", "Chan"]
first_name = ["Thomas", "Vivian", "Angus", "Charlotte", "Jason", "Annie"]
asm1 = [100, 100, 80, 86, 69, 100]
asm2 = [85, 79, 84, 93, 88, 85]
final_exam = [85, 79, 90, 65, 77, 80]

# Turn the six columns into one tuple per student.
gradebook = [*zip(sid, last_name, first_name, asm1, asm2, final_exam)]
gradebook

[(20789215, 'Chan', 'Thomas', 100, 85, 85),
 (20791348, 'Lam', 'Vivian', 100, 79, 79),
 (20795589, 'Chau', 'Angus', 80, 84, 90),
 (20834892, 'Lau', 'Charlotte', 86, 93, 65),
 (20861624, 'Au-Yeung', 'Jason', 69, 88, 77),
 (20954221, 'Chan', 'Annie', 100, 85, 80)]

In [13]:
# DataFrame version of the table

# Each dictionary key becomes a column header and its list becomes that column's values.
pd_gradebook = pd.DataFrame(
    data={
        'Student ID': sid,
        'Last Name': last_name,
        'First Name': first_name,
        'Assignment 1': asm1,
        'Assignment 2': asm2,
        'Final Exam': final_exam,
    }
)
pd_gradebook

Unnamed: 0,Student ID,Last Name,First Name,Assignment 1,Assignment 2,Final Exam
0,20789215,Chan,Thomas,100,85,85
1,20791348,Lam,Vivian,100,79,79
2,20795589,Chau,Angus,80,84,90
3,20834892,Lau,Charlotte,86,93,65
4,20861624,Au-Yeung,Jason,69,88,77
5,20954221,Chan,Annie,100,85,80


### Lab Task 3: Stocks and Prices

3.1: From the HSI stocks table, extract the trending HSI stocks data including stock `names`, `codes` & `prices`. 

Use methods `.find` & `.find_all` to extract the required data, then organize the extracted data into a pandas *DataFrame*.


In [14]:
# complete the code below
index_url = "http://www.etnet.com.hk/www/eng/stocks/indexes_detail.php?subtype=HSI"

response = requests.get(index_url, timeout=3)
response.raise_for_status()  # fail on a 4xx/5xx reply instead of parsing an error page
soup = BeautifulSoup(response.content, 'html.parser')

# prepare empty lists to store data
name_ls = []
code_ls = []
price_ls = []

# locate the table rows; the first <tr> is skipped (presumably the header row)
stocks = soup.find('table', {'class': 'figureTable'}).find_all('tr')[1:]

for stock in stocks:
    cells = stock.find_all('td')
    # A row without the four leading cells (code, name, arrow, price) cannot be a
    # stock row; skip it so it does not abort the scrape with an unpacking error.
    if len(cells) < 4:
        continue
    code, name, _arrow, price = cells[:4]

    # append the data to the pre-defined lists
    code_ls.append(code.text)
    name_ls.append(name.text)
    price_ls.append(price.text)

# create a pandas dataframe: one column per list, one row per stock
stock_tb = pd.DataFrame({'Stock': name_ls, 'Stock No.': code_ls, 'Price': price_ls})
stock_tb


Unnamed: 0,Stock,Stock No.,Price
0,CKH HOLDINGS,00001,43.850
1,CLP HOLDINGS,00002,63.950
2,HK & CHINA GAS,00003,6.900
3,HSBC HOLDINGS,00005,86.100
4,POWER ASSETS,00006,48.050
...,...,...,...
78,BIDU-SW,09888,87.800
79,NEW ORIENTAL-S,09901,36.600
80,TRIP.COM-S,09961,488.000
81,BABA-W,09988,123.500


3.2: Also scrape the `change`, `%change` & `turnover` information.

You can revise from above code or write the code again from scratch for practice. Organize the extracted data into a pandas *DataFrame*.

In [15]:
# complete the code below
index_url = "http://www.etnet.com.hk/www/eng/stocks/indexes_detail.php?subtype=HSI"

response = requests.get(index_url, timeout=3)
response.raise_for_status()  # fail on a 4xx/5xx reply instead of parsing an error page
soup = BeautifulSoup(response.content, 'html.parser')

# prepare empty lists to store data
name_ls = []
code_ls = []
price_ls = []
change_ls = []
per_change_ls = []
turnover_ls = []

# locate the table rows; the first <tr> is skipped (presumably the header row)
stocks = soup.find('table', {'class': 'figureTable'}).find_all('tr')[1:]
print(len(stocks))

for stock in stocks:
    cells = stock.find_all('td')
    # A row without the seven leading cells cannot be a stock row; skip it so it
    # does not abort the scrape with an unpacking error.
    if len(cells) < 7:
        continue
    code, name, _arrow, price, change, per_change, turnover = cells[:7]

    # append the data to the pre-defined lists
    code_ls.append(code.text)
    name_ls.append(name.text)
    price_ls.append(price.text)
    change_ls.append(change.text)
    per_change_ls.append(per_change.text)
    turnover_ls.append(turnover.text)

# create a pandas dataframe: one column per list, one row per stock
stock_tb = pd.DataFrame({
    'Stock': name_ls,
    'Stock No.': code_ls,
    'Price': price_ls,
    'Change': change_ls,
    '%Change': per_change_ls,
    'Turnover': turnover_ls,
})
stock_tb

83


Unnamed: 0,Stock,Stock No.,Price,Change,%Change,Turnover
0,CKH HOLDINGS,00001,43.850,-0.450,-1.016%,581.809M
1,CLP HOLDINGS,00002,63.950,+0.300,+0.471%,260.189M
2,HK & CHINA GAS,00003,6.900,+0.100,+1.471%,322.676M
3,HSBC HOLDINGS,00005,86.100,-1.900,-2.159%,2.433B
4,POWER ASSETS,00006,48.050,+0.450,+0.945%,216.599M
...,...,...,...,...,...,...
78,BIDU-SW,09888,87.800,-2.200,-2.444%,913.691M
79,NEW ORIENTAL-S,09901,36.600,-1.050,-2.789%,152.276M
80,TRIP.COM-S,09961,488.000,-7.200,-1.454%,896.041M
81,BABA-W,09988,123.500,-6.500,-5.000%,20.483B


### Take-home practice: Section Menu
Use the methods `.find`, `.find_all` and `.get` to scrape the text and hyperlink of each item in the `div` element with `id: 'SectionMenu'`.

Try to format the output as shown below:

```
1: Local Indices
Link: /www/eng/stocks/indexes_main.php

2: China Indices
Link: /www/eng/stocks/indexes_china.php

3: Global Indices
Link: /www/eng/stocks/indexes_global.php

...
```

*(Hint 1: you may refer to the submenu task)*

*(Hint 2: you may want to use enumerate)*

In [16]:
# write your code below

# Find the section-menu container by its id, then every link inside it.
ele_container = soup.find('div', id='SectionMenu')
ele_list = ele_container.find_all(name='a')

# Number the links from 1 and print each one's text followed by its href.
for number, anchor in enumerate(ele_list, 1):
    href = anchor.get('href')
    print(f"{number}: {anchor.text}\nLink: {href}\n")

1: Local Indices
Link: /www/eng/stocks/indexes_main.php

2: China Indices
Link: /www/eng/stocks/indexes_china.php

3: Global Indices
Link: /www/eng/stocks/indexes_global.php

4: MSCI Indices
Link: /www/eng/stocks/indexes_msci.php

5: Blue Chips
Link: /www/eng/stocks/indexes_detail.php?subtype=hsi

6: HSCEI
Link: /www/eng/stocks/indexes_detail.php?subtype=cei

7: Red Chips
Link: /www/eng/stocks/indexes_detail.php?subtype=cci

8: HS TECH
Link: /www/eng/stocks/indexes_detail.php?subtype=teh

9: HFI
Link: /www/eng/stocks/indexes_detail.php?subtype=hfi

10: BIO
Link: /www/eng/stocks/indexes_detail.php?subtype=bio

11: HS SC AM
Link: /www/eng/stocks/indexes_detail.php?subtype=s44

12: HCL
Link: /www/eng/stocks/indexes_detail.php?subtype=hcl

13: HCM
Link: /www/eng/stocks/indexes_detail.php?subtype=hcm

14: HS China 100
Link: /www/eng/stocks/indexes_detail.php?subtype=mlh

15: HS China 25
Link: /www/eng/stocks/indexes_detail.php?subtype=m25

16: HS HK 35
Link: /www/eng/stocks/indexes_detail.p

### Self-practice: CSS selectors

This time, use `.select` or `.select_one` with css selectors, to complete the exercises above. 

1. Scrape the section menu (take-home practice)

In [17]:
# complete the code below

# Section Menu
# 'div#SectionMenu' selects the <div> whose id is "SectionMenu"; 'a' selects the links inside it.
ele_container = soup.select_one('div#SectionMenu')
ele_list = ele_container.select('a')

# Number the links from 1 and print each one's text followed by its href.
for number, anchor in enumerate(ele_list, 1):
    href = anchor.get('href')
    print(f"{number}: {anchor.text}\nLink: {href}\n")

1: Local Indices
Link: /www/eng/stocks/indexes_main.php

2: China Indices
Link: /www/eng/stocks/indexes_china.php

3: Global Indices
Link: /www/eng/stocks/indexes_global.php

4: MSCI Indices
Link: /www/eng/stocks/indexes_msci.php

5: Blue Chips
Link: /www/eng/stocks/indexes_detail.php?subtype=hsi

6: HSCEI
Link: /www/eng/stocks/indexes_detail.php?subtype=cei

7: Red Chips
Link: /www/eng/stocks/indexes_detail.php?subtype=cci

8: HS TECH
Link: /www/eng/stocks/indexes_detail.php?subtype=teh

9: HFI
Link: /www/eng/stocks/indexes_detail.php?subtype=hfi

10: BIO
Link: /www/eng/stocks/indexes_detail.php?subtype=bio

11: HS SC AM
Link: /www/eng/stocks/indexes_detail.php?subtype=s44

12: HCL
Link: /www/eng/stocks/indexes_detail.php?subtype=hcl

13: HCM
Link: /www/eng/stocks/indexes_detail.php?subtype=hcm

14: HS China 100
Link: /www/eng/stocks/indexes_detail.php?subtype=mlh

15: HS China 25
Link: /www/eng/stocks/indexes_detail.php?subtype=m25

16: HS HK 35
Link: /www/eng/stocks/indexes_detail.p

2. Scrape the Table Data (Task 3)

In [18]:
# Table data

# complete the code below
index_url = "http://www.etnet.com.hk/www/eng/stocks/indexes_detail.php?subtype=HSI"

response = requests.get(index_url, timeout=3)
response.raise_for_status()  # fail on a 4xx/5xx reply instead of parsing an error page
soup = BeautifulSoup(response.content, 'html.parser')

# prepare empty lists to store data
name_ls = []
code_ls = []
price_ls = []
change_ls = []
per_change_ls = []
turnover_ls = []

# locate the table rows with a CSS selector; the first <tr> is skipped
# (presumably the header row)
stocks = soup.select('table.figureTable tr')[1:]

for stock in stocks:
    cells = stock.find_all('td')
    # A row without the seven leading cells cannot be a stock row; skip it so it
    # does not abort the scrape with an unpacking error.
    if len(cells) < 7:
        continue
    code, name, _arrow, price, change, per_change, turnover = cells[:7]

    # append the data to the pre-defined lists
    code_ls.append(code.text)
    name_ls.append(name.text)
    price_ls.append(price.text)
    change_ls.append(change.text)
    per_change_ls.append(per_change.text)
    turnover_ls.append(turnover.text)

# create a pandas dataframe: one column per list, one row per stock
stock_tb = pd.DataFrame({
    'Stock': name_ls,
    'Stock No.': code_ls,
    'Price': price_ls,
    'Change': change_ls,
    '%Change': per_change_ls,
    'Turnover': turnover_ls,
})
stock_tb

Unnamed: 0,Stock,Stock No.,Price,Change,%Change,Turnover
0,CKH HOLDINGS,00001,43.850,-0.450,-1.016%,581.809M
1,CLP HOLDINGS,00002,63.950,+0.300,+0.471%,260.189M
2,HK & CHINA GAS,00003,6.900,+0.100,+1.471%,322.676M
3,HSBC HOLDINGS,00005,86.100,-1.900,-2.159%,2.433B
4,POWER ASSETS,00006,48.050,+0.450,+0.945%,216.599M
...,...,...,...,...,...,...
78,BIDU-SW,09888,87.800,-2.200,-2.444%,913.691M
79,NEW ORIENTAL-S,09901,36.600,-1.050,-2.789%,152.276M
80,TRIP.COM-S,09961,488.000,-7.200,-1.454%,896.041M
81,BABA-W,09988,123.500,-6.500,-5.000%,20.483B


### Self practice: challenge

Add a `News` column to the above dataframe and show the information for the top 10 stocks. The news for each stock is posted on a separate page: extract the links from the HSI page, follow each link to its news page, and fill in the `News` column with the text found there.

In [19]:
from urllib.parse import urljoin

from bs4 import BeautifulSoup
import pandas as pd
import requests

# Base URL of the stocks section; the index page and each news link are resolved against it.
index_url = 'http://www.etnet.com.hk/www/eng/stocks/'
hsi_url = 'indexes_detail.php?subtype=HSI'
top_n = 10  # how many rows after the first one to include in the table

response = requests.get(urljoin(index_url, hsi_url), timeout=3)
response.raise_for_status()  # fail on a 4xx/5xx reply instead of parsing an error page
soup = BeautifulSoup(response.content, 'html.parser')

name = []
code = []
price = []
turnover = []
news_url = []
news = []

stocks = soup.select('table.figureTable tr')

# Row 0 is skipped (presumably the header row); the following `top_n` rows are processed.
for data in stocks[1:top_n + 1]:
    cells = data.select('td')  # select the cells once per row and index into them

    name.append(data.select_one('td.TextEng').text)
    code.append(data.a.text)  # text of the first link in the row
    price.append(cells[3].text)
    turnover.append(cells[6].text)

    # The link in cell 8 points to this stock's news page.
    link = cells[8].a.get('href')
    news_url.append(link)

    # Follow the link; urljoin resolves both relative and root-relative hrefs.
    # A separate name keeps the index-page `response` from being overwritten.
    news_response = requests.get(urljoin(index_url, link), timeout=5)
    news_response.raise_for_status()
    soup_2 = BeautifulSoup(news_response.content, 'html.parser')

    # One stock without a '#NewsContent' element should not abort the whole table.
    news_node = soup_2.select_one('#NewsContent')
    news.append(news_node.get_text(strip=True) if news_node is not None else None)

# One column per list; the lists are filled in lockstep, so they share the same length.
stock_tb = pd.DataFrame({
    'Stock': name,
    'Stock No.': code,
    'Price': price,
    'Turnover': turnover,
    'News': news,
})
stock_tb



Unnamed: 0,Stock,Stock No.,Price,Tureover,News
0,CKH HOLDINGS,1,43.85,581.809M,"[ET Net News Agency, 31 March 2025] 5 listed ..."
1,CLP HOLDINGS,2,63.95,260.189M,"[ET Net News Agency, 25 February 2025] HSBC Gl..."
2,HK & CHINA GAS,3,6.9,322.676M,"[ET Net News Agency, 27 March 2025] 2 listed ..."
3,HSBC HOLDINGS,5,86.1,2.433B,"[ET Net News Agency, 27 March 2025] 4 compani..."
4,POWER ASSETS,6,48.05,216.599M,"[ET Net News Agency, 20 March 2025] POWER ASS..."
5,HANG SENG BANK,11,105.4,207.756M,"[ET Net News Agency, 24 March 2025] 10 compan..."
6,HENDERSON LAND,12,22.45,70.942M,"[ET Net News Agency, 21 March 2025] HENDERSON..."
7,SHK PPT,16,73.65,171.398M,"[ET Net News Agency, 21 March 2025] 4 compani..."
8,GALAXY ENT,27,30.15,273.065M,"[ET Net News Agency, 31 March 2025] GALAXY EN..."
9,MTR CORPORATION,66,25.7,144.269M,"[ET Net News Agency, 31 March 2025] MTR CORPO..."
