In [1]:
import re
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
from urllib.parse import parse_qs, urlparse
from datetime import datetime

# I. Crawl Information

Helper function to extract information about Recipients of Hong Kong Special Administrative Region Honours and Awards

In [2]:
def extract_data(soup, language = 'Eng'):
    '''
    @soup: BeautifulSoup object contains html of the website
    @language: a string indicates whether the language is Chi or Eng
               (kept for call-site compatibility; the parsing itself does not use it)

    Extract the members listed on the current page.

    Every <tr valign="top"> row is inspected: the text of its first cell is
    taken as the name and the text of its second cell as a date written as
    'day.month.year'. Rows with fewer than two cells, or whose second cell
    cannot be read as such a date (e.g. header rows), are skipped.

    return data: [{'Member': <str>, 'Date': <datetime.date>}, ...]
    '''
    data = []
    for member in soup.find_all('tr',{'valign':'top'}):
        cells = member.find_all('td')
        # a row needs both a name cell and a date cell
        if len(cells) < 2:
            continue
        name = cells[0].text
        # skip rows without proper name and date: a wrong number of
        # '.'-separated parts, non-numeric parts and impossible calendar
        # dates all surface as ValueError
        try:
            day, month, year = (int(part) for part in cells[1].text.split('.'))
            award_date = datetime(year, month, day).date()
        except ValueError:
            continue
        data.append({'Member':name, 'Date':award_date})

    return data

## English version 

Extract html on the first page

In [3]:
# Download and parse the first page of the English listing.
eng_url = "https://www.info.gov.hk/cml/eng/miscell/index3.htm"
# requests waits indefinitely unless a timeout is given
eng_html = requests.get(eng_url, timeout=30)
# stop on an HTTP error instead of parsing an error page as if it were the listing
eng_html.raise_for_status()
soup = BeautifulSoup(eng_html.text, 'html.parser')

Get award Eng name

In [4]:
# The award name is the stripped text of the page's <h1 class="header"> element.
award_heading = soup.find('h1',{'class':'header'})
en_award = award_heading.text.strip()

Get urls of next 3 pages

In [5]:
# hrefs of every link inside the first centre-aligned table cell
page_urls = soup.find('td',{'align':'center'}).find_all('a')
urls = [page_url.get('href') for page_url in page_urls]

# split the first-page address into "scheme://host" and its path segments,
# so the last segment can later be swapped for another page's href
eng_parts = urlparse(eng_url)
main_url = eng_parts.scheme + '://' + eng_parts.netloc
paths = eng_parts.path.split('/')

Extract member data on all pages

In [6]:
# The page currently held in `soup` is extracted first; the following pages
# are reached through the first 3 hrefs collected in `urls`.
en_data = extract_data(soup, language = 'Eng')

for i in range(3):
    # replace the last path segment to get the url of the next page
    paths[-1] = urls[i]
    eng_url = main_url + '/'.join(paths)
    # bounded wait, and fail loudly on an HTTP error rather than silently
    # collecting zero rows from an error page
    eng_html = requests.get(eng_url, timeout=30)
    eng_html.raise_for_status()
    soup = BeautifulSoup(eng_html.text, 'html.parser')

    # extract data of this page and append to en_data
    en_data.extend(extract_data(soup, language = 'Eng'))

Save data to Dataframe

In [7]:
# Columns are selected and renamed by name rather than relabelled by position,
# so the result does not depend on the key order of the scraped dictionaries
# and an empty scrape still yields a frame with the expected columns.
en_df = (
    pd.DataFrame(en_data, columns=['Member', 'Date'])
    .rename(columns={'Member': 'Eng Name', 'Date': 'Award Date'})
    .assign(Award=en_award)
)
print(en_df.shape)

(315, 3)


## Chinese version 

To avoid Chinese decoding problems, set the response encoding to "utf-8". Extract html on the first page.

In [8]:
# Download and parse the first page of the Chinese listing.
cn_url = "https://www.info.gov.hk/cml/miscell/index3.htm"
# requests waits indefinitely unless a timeout is given
cn_html = requests.get(cn_url, timeout=30)
# stop on an HTTP error instead of parsing an error page as if it were the listing
cn_html.raise_for_status()
# decode the body as utf-8 before reading .text
cn_html.encoding = 'utf-8'
soup = BeautifulSoup(cn_html.text, 'html.parser')

Get urls of next 3 pages

In [9]:
# hrefs of every link inside the first centre-aligned table cell
page_urls = soup.find('td',{'align':'center'}).find_all('a')
urls = [page_url.get('href') for page_url in page_urls]

# split the first-page address into "scheme://host" and its path segments,
# so the last segment can later be swapped for another page's href
cn_parts = urlparse(cn_url)
main_url = cn_parts.scheme + '://' + cn_parts.netloc
paths = cn_parts.path.split('/')

Extract data on all pages

In [10]:
# The page currently held in `soup` is extracted first; the following pages
# are reached through the first 3 hrefs collected in `urls`.
cn_data = extract_data(soup, language = 'Chi')

for i in range(3):
    # replace the last path segment to get the url of the next page
    paths[-1] = urls[i]
    cn_url = main_url + '/'.join(paths)
    # bounded wait, and fail loudly on an HTTP error rather than silently
    # collecting zero rows from an error page
    cn_html = requests.get(cn_url, timeout=30)
    cn_html.raise_for_status()
    # decode the body as utf-8 before reading .text
    cn_html.encoding = 'utf-8'
    soup = BeautifulSoup(cn_html.text, 'html.parser')

    # extract data of this page and append to cn_data
    cn_data.extend(extract_data(soup, language = 'Chi'))

Save data to Dataframe

In [11]:
# Columns are selected and renamed by name rather than relabelled by position,
# so the result does not depend on the key order of the scraped dictionaries
# and an empty scrape still yields a frame with the expected columns.
cn_df = (
    pd.DataFrame(cn_data, columns=['Member', 'Date'])
    .rename(columns={'Member': 'Chi Name', 'Date': 'Award Date'})
)
cn_df.shape

(315, 2)

## Data cleaning

Merge the Eng and Chi dataframes and rename columns

In [12]:
# Rows are paired by index position, i.e. the n-th English row with the n-th
# Chinese row. Both frames carry 'Award Date', so the merge suffixes them with
# _x (English) and _y (Chinese); only the English one is kept.
df = (
    en_df
    .merge(cn_df, how='outer', left_index=True, right_index=True)
    .drop(columns=['Award Date_y'])
    .rename(columns={'Award Date_x': 'Award Date'})
)

Add organization and category

In [13]:
# every row gets the same category and organization labels
df = df.assign(
    Category='Awards & Recognitions',
    Organization='Justices of the Peace',
)

Delete (the Honourable) and award abbreviation in Eng names

In [14]:
# keep only the text in front of the first '(' — this drops "(the Honourable)"
# together with anything that follows it
df['Eng Name'] = df['Eng Name'].apply(lambda full_name: full_name.partition('(')[0])

In [15]:
def del_abbreviation(name):
    '''
    @name: English name string, possibly followed by award abbreviations

    Cut the name at the first ", X." (comma, space, one capital letter, dot),
    which is where a trailing abbreviation list such as ", G.B.S., J.P." starts.

    return: the part before that marker (or the whole name when there is no
    marker), with surrounding whitespace removed
    '''
    # raw string: '\,' in a normal string literal is an invalid escape sequence
    abbreviate = re.search(r', [A-Z]\.', name)
    if abbreviate:
        # the match already knows where it starts; no second scan is needed
        return name[:abbreviate.start()].strip()
    return name.strip()

In [16]:
# strip the trailing award abbreviations from every Eng Name
df['Eng Name'] = df['Eng Name'].map(del_abbreviation)

Delete abbreviation in Chi Name

In [17]:
# Keep the text before the first full-width comma (what follows is the
# abbreviation list). The result is always stripped, so names without a comma
# do not keep surrounding whitespace that would later be mistaken for the
# first character of the name.
df['Chi Name'] = df['Chi Name'].apply(lambda x: x.split('，')[0].strip())

Add gender based on prefix in Chi Name

In [18]:
# 'F' when the Chinese name contains the title '女士'; empty string otherwise
df['Gender'] = ['F' if '女士' in chi_name else "" for chi_name in df['Chi Name']]

Split prefix in Chi Name only, since there is no prefix in Eng name

In [19]:
# Titles recognised in Chinese names.
_CN_PREFIXES = ('先生', '小姐', '女士', '工程師', '機長', '博士', '首席', '議員',
                '首席法官', '爵士', '法師', '教授', '醫生', '大主教', '副庭長',
                '勳爵', '法官', '牧師')

def split_prefix_cn(name):
    '''
    @name: name string taken from the Chinese listing

    Find the title (prefix) contained in a Chinese name.

    return: the title found earliest in the name, or "" when the name
    contains Latin letters/digits or no known title
    '''
    # if the name is in English, don't split prefix
    if re.search(r'[a-zA-Z0-9]+', name):
        return ""
    # Regex alternation takes the first alternative that matches at a position,
    # so longer titles must be tried first; otherwise '首席' always wins over
    # '首席法官' and the longer title can never be returned.
    ordered = sorted(_CN_PREFIXES, key=len, reverse=True)
    prefix = re.search('(?:' + '|'.join(ordered) + ')', name)
    return prefix.group() if prefix else ""

In [20]:
# title found in each Chinese name ("" when there is none)
df['Prefix CN'] = df['Chi Name'].map(split_prefix_cn)

Split first and last Eng name, also split if there is preferred name

In [21]:
def split_last_name_eng(name):
    '''
    @name: English name string

    Return the first run of capital letters that is not directly followed by
    a lowercase letter, e.g. 'CHAN' in 'CHAN Tai-man'. Only that single run is
    returned, so a surname written as two capitalised words yields its first
    word only.

    return: the matched text, or "" when no such run exists
    '''
    surname = re.search('[A-Z]+(?![a-z])',name)
    return surname.group() if surname else ""

In [22]:
# capitalised surname of each English name
df['Last Name Eng'] = df['Eng Name'].map(split_last_name_eng)

In [23]:
# The given name is what remains after removing the surname. Only the first
# occurrence is removed (count=1): without the limit, str.replace would also
# delete the same letters wherever they reappear later in the name
# (e.g. 'MA' inside a following 'MAN').
df['First Name Eng'] = df.apply(
    lambda x: x['Eng Name'].replace(x['Last Name Eng'], "", 1).strip(), axis=1)

In [24]:
# The preferred name is the part after the first comma of the given name.
# It is stripped so the stored value does not start with the space that
# follows the comma. Only one column is read, so a Series.apply is enough.
df['Preferred Name Eng'] = df['First Name Eng'].apply(
    lambda first: first.split(',')[1].strip() if ',' in first else "")

In [25]:
# keep only the part before the first comma (drops the preferred name)
given_names = df['First Name Eng'].apply(lambda given: given.split(',')[0])
# drop a '-' left at the very start of the given name
df['First Name Eng'] = given_names.apply(
    lambda given: given[1:] if given.startswith('-') else given)

Split first and last Chi name

In [26]:
def split_last_name_chi(name):
    '''
    @name: name string taken from the Chinese listing

    Return the surname of a name from the Chinese listing.

    - If the name contains Latin letters or digits, the first such run is
      handed to split_last_name_eng and its result is returned.
    - Otherwise the first non-whitespace character is returned, i.e. a
      single-character surname is assumed.

    return: the surname, or "" for an empty/blank name
    '''
    en_name = re.search(r'[a-zA-Z0-9]+', name)
    if en_name:
        # NOTE(review): only the first Latin token is examined, not the whole
        # name — confirm this is intended for names not starting with the surname
        return split_last_name_eng(en_name.group())
    # strip first so leading whitespace is never returned as the surname,
    # and guard against indexing into an empty string
    stripped = name.strip()
    return stripped[0] if stripped else ""

In [27]:
# surname of each name in the Chinese listing
df['Last Name Chi'] = df['Chi Name'].map(split_last_name_chi)

In [28]:
# The given name is the Chinese name without its title and surname. The
# surname is removed once only (count=1): without the limit, a given name that
# repeats the surname character would lose that character as well.
df['First Name Chi'] = df.apply(lambda x: x['Chi Name'].replace(x['Prefix CN'], "")
                               .replace(x['Last Name Chi'], "", 1).strip(), axis=1)

Order the columns

In [29]:
# final column layout: Chinese name parts, English name parts, then metadata
column_order = [
    'Last Name Chi', 'First Name Chi', 'Chi Name', 'Prefix CN',
    'Last Name Eng', 'First Name Eng', 'Preferred Name Eng', 'Eng Name',
    'Gender', 'Category', 'Organization', 'Award Date', 'Award',
]
df = df[column_order]

Save to Excel and CSV files

In [30]:
# to_excel no longer accepts `encoding` (removed in pandas 2.0, where passing
# it raises TypeError), so it is not passed here.
df.to_excel('太平绅士名单.xlsx')
# utf_8_sig writes a BOM at the start of the CSV
df.to_csv('太平绅士名单.csv',encoding='utf_8_sig')