In [1]:
import re
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests

# I. Crawl Information

Helper function to extract information about Recipients of Hong Kong Special Administrative Region Honours and Awards

In [48]:
def _clean_award_name(name):
    '''Strip the trailing abbreviation, e.g. "(GBM)", from an award heading.

    A "(Gold)" / "(Silver)" qualifier is part of the award name and is kept;
    only the parenthesis that follows it is removed.
    '''
    first = name.find('(')
    if first != -1:
        if '(Gold)' in name or '(Silver)' in name:
            second = name.find('(', first + 1)
            # no second parenthesis means there is no abbreviation to cut off
            if second != -1:
                name = name[:second]
        else:
            name = name[:first]
    # delete white space or newline at two sides of the string
    return name.strip()


def extract_data(soup, content, language = 'Eng'):
    '''
    Extract one record per (member, award, year) from an honours-list page.

    @soup: BeautifulSoup object containing the html of the website; every
           element matching the CSS selector '.italic' is treated as an
           award heading
    @content: list of page elements, in page order, that contains the award
              headings, the year markers and the member entries
    @language: 'Eng' or 'Chi'; selects the comma (',' or '，') at which the
               member text is cut. Any other value leaves the text unsplit.

    return (data, awards)
        data:   [{'Member': ..., 'Award': ..., 'Year': ...}, ...]
                a member who received several awards appears several times
        awards: list of cleaned award names, in page order
    '''
    # select the headings once instead of re-running the selector per award
    award_tags = soup.select('.italic')
    awards = [_clean_award_name(tag.text) for tag in award_tags]
    n_awards = len(award_tags)

    data = []
    for pos, award in enumerate(awards):
        # content between this heading and the next one belongs to this award
        start_idx = content.index(award_tags[pos])
        if pos + 1 < n_awards:
            end_idx = content.index(award_tags[pos + 1])
        else:
            end_idx = len(content)
        award_content = content[start_idx:end_idx]

        # an item whose markup mentions 'strong' is taken to be a year marker
        year_idx = [j for j, value in enumerate(award_content) if 'strong' in str(value)]
        for k, year_pos in enumerate(year_idx):
            year = int(award_content[year_pos].text)
            # members of this year run until the next year marker (or the end)
            if k + 1 < len(year_idx):
                stop = year_idx[k + 1]
            else:
                stop = len(award_content)
            for member in award_content[year_pos + 1:stop]:
                name = member.text
                # keep only the part before the first comma
                if language == 'Eng':
                    name = name.split(',')[0]
                elif language == 'Chi':
                    name = name.split('，')[0]
                data.append({'Member': name, 'Award': award, 'Year': year})

    return data, awards

## English version 

In [50]:
eng_url = "https://www.info.gov.hk/cml/eng/miscell/index2.htm"
# bound the wait and stop on an HTTP error instead of parsing an error page
eng_html = requests.get(eng_url, timeout=30)
eng_html.raise_for_status()

In [51]:
# parse the English page and collect every h2 / strong / li tag
soup = BeautifulSoup(markup=eng_html.text, features='html.parser')
# the first four matches are dropped (presumably page chrome before the first award - TODO confirm)
content = soup.find_all(name=['h2', 'strong', 'li'])[4:]

In [52]:
# English records plus the cleaned English award names
en_data, en_awards = extract_data(soup=soup, content=content, language='Eng')

## Chinese version 

To avoid garbled Chinese characters, decode the response as UTF-8.

In [53]:
cn_url = 'https://www.info.gov.hk/cml/miscell/index2.htm'
# bound the wait and stop on an HTTP error instead of parsing an error page
cn_html = requests.get(cn_url, timeout=30)
cn_html.raise_for_status()
# force UTF-8 so that .text does not depend on the charset requests guesses
cn_html.encoding = 'utf-8'

In [54]:
# parse the Chinese page and collect every h2 / strong / li tag
soup = BeautifulSoup(markup=cn_html.text, features='html.parser')
# the first four matches are dropped (presumably page chrome before the first award - TODO confirm)
content = soup.find_all(name=['h2', 'strong', 'li'])[4:]

In [55]:
# Chinese records plus the cleaned Chinese award names
cn_data, cn_awards = extract_data(soup=soup, content=content, language='Chi')

## Match Eng and Chi Award Names 

In [56]:
# English award name -> Chinese award name, paired by position in the two lists
# (zip stops at the shorter list)
awards = dict(zip(en_awards, cn_awards))

# II. Store Data

Save data in pandas dataframe

In [57]:
# Select the record keys by name and rename only the name column, so the
# labels do not depend on dict key order and an empty result still yields a
# well-formed (empty) frame.
record_keys = ['Member', 'Award', 'Year']

en_df = pd.DataFrame(en_data, columns=record_keys).rename(columns={'Member': 'Eng Name'})

cn_df = pd.DataFrame(cn_data, columns=record_keys).rename(columns={'Member': 'Chi Name'})

A member who received more than one award appears in more than one row, so names may repeat.

In [58]:
# (rows, columns) of the English frame followed by the Chinese frame
print(*(frame.shape for frame in (en_df, cn_df)))

(6840, 3) (6840, 3)


In [59]:
# missing-value count per column, English frame first
for frame in (en_df, cn_df):
    print(frame.isnull().sum())

Eng Name    0
Award       0
Year        0
dtype: int64
Chi Name    0
Award       0
Year        0
dtype: int64


## Data cleaning

In [60]:
# delete the Honourable in Eng Name
# keep only the text after the first 'Honourable'; other names are left as they are
en_df['Eng Name'] = en_df['Eng Name'].map(
    lambda name: name.partition('Honourable')[2] if 'Honourable' in name else name
)

In [61]:
# delete the \t at the end of CN name
# (rstrip removes any trailing whitespace, not only tabs)
cn_df['Chi Name'] = cn_df['Chi Name'].map(str.rstrip)

In [423]:
# merge two dataframes
# Rows are paired purely by index position, so both frames must list the same
# recipients in the same order. Fail loudly rather than pair the wrong names.
if len(en_df) != len(cn_df):
    raise ValueError('English and Chinese lists differ in length: %d vs %d'
                     % (len(en_df), len(cn_df)))
if not (en_df['Year'].values == cn_df['Year'].values).all():
    raise ValueError('English and Chinese lists are not aligned: award years differ')
df = en_df.merge(cn_df, left_index = True, right_index = True, how='outer')
# keep the English award / year columns and restore their plain names
df = df.drop(['Award_y','Year_y'], axis = 1)
df = df.rename(columns={'Award_x': 'Award', 'Year_x': 'Year'})

In [424]:
# add category and organization (the same constant value for every row)
df['Category'], df['Organization'] = (
    'Awards & Recognitions',
    'Hong Kong Special Administrative Region Honours and Awards',
)

In [425]:
# remove whitespace in names
df['Eng Name'] = df['Eng Name'].str.strip()
# collapse every run of whitespace inside the name into a single space
df['Eng Name'] = df['Eng Name'].apply(lambda x: ' '.join(x.split()))
df['Chi Name'] = df['Chi Name'].str.strip()
# Series.replace only matches whole cell values, so it never removed an
# embedded ideographic space (U+3000); .str.replace works on substrings.
# The result is assigned back instead of relying on inplace=True.
# NOTE(review): after the split/join above 'Eng Name' can no longer contain
# U+3000 - 'Chi Name' may have been the intended column; confirm.
df['Eng Name'] = df['Eng Name'].str.replace('\u3000', '', regex=False)

Split EN prefix

In [426]:
def split_prefix_eng(name):
    """Return the first English honorific found in `name`, or "" if none.

    Recognised honorifics are Mrs, Mr, Ms, Miss, Dr and Sir, with an optional
    trailing dot that is included in the result (e.g. "Dr.").
    """
    # \b keeps the honorific from matching inside a longer word such as
    # "Drew" or "Sirius"; the raw string avoids the invalid '\.' escape
    prefix = re.search(r'\b(?:Mrs|Mr|Ms|Miss|Dr|Sir)\b\.?', name)
    return prefix.group() if prefix else ""

In [427]:
# English honorific of every recipient ("" when none is found)
df['Prefix'] = df['Eng Name'].map(split_prefix_eng)

Split first and last Eng name

In [428]:
def split_last_name_eng(name):
    """Return the first run of capital letters A-Z in `name` that is not
    directly followed by a lowercase letter, or "" when there is none.

    In "Mr CHAN Tai-man" this skips the "M" of "Mr" and yields "CHAN".
    """
    match = re.search('[A-Z]+(?![a-z])', name)
    if match is None:
        return ""
    return match.group()

In [430]:
# surname as extracted by split_last_name_eng ("" when nothing matches)
df['Last Name Eng'] = df['Eng Name'].map(split_last_name_eng)

In [431]:
# Given name = full name minus the prefix and the surname.
# count=1 removes only the first occurrence of each; an unlimited replace
# would also eat the same letters wherever else they appear in the name.
df['First Name Eng'] = df.apply(lambda x: x['Eng Name'].replace(x['Prefix'], "", 1)
                               .replace(x['Last Name Eng'], "", 1).strip(), axis=1)

Split prefix cn 

In [432]:
def split_prefix_cn(name):
    """Return the title found in an entry of the Chinese list, or "" if none.

    If `name` contains any ASCII letters or digits, the first such run is
    passed to split_prefix_eng and its result is returned; otherwise the first
    known Chinese title in `name` is returned.
    """
    # if the name is in English, use helper function
    en_name = re.search(r'[a-zA-Z0-9]+', name)
    if en_name:
        return split_prefix_eng(en_name.group())
    # if the name is in Chinese, return Chinese prefix
    # '首席法官' must be listed before '首席': alternation takes the first
    # alternative that matches, so the shorter title would always win
    prefix = re.search('(?:先生|小姐|女士|工程師|機長|博士|首席法官|首席|議員|爵士|法師|教授|醫生|大主教|副庭長|勳爵|法官|牧師)',name)
    if prefix:
        return prefix.group()
    return ""

In [433]:
# title taken from the Chinese-list name ("" when none is found)
df['Prefix CN'] = df['Chi Name'].map(split_prefix_cn)

Split first and last Chi name

In [434]:
def split_last_name_chi(name):
    """Return the surname for an entry of the Chinese list.

    If `name` contains any ASCII letters or digits, the first such run is
    passed to split_last_name_eng. Otherwise the first character is returned
    (so a two-character surname is cut to its first character), and "" is
    returned for an empty string.
    """
    # if the name is in English, use helper function
    en_name = re.search(r'[a-zA-Z0-9]+', name)
    if en_name:
        return split_last_name_eng(en_name.group())
    # if the name is in Chinese, return first character;
    # slicing (not indexing) so an empty name yields "" instead of IndexError
    return name[:1]

In [435]:
# surname as extracted by split_last_name_chi
df['Last Name Chi'] = df['Chi Name'].map(split_last_name_chi)

In [436]:
# Given name = full name minus the title and the surname.
# count=1 removes only the first occurrence of each; an unlimited replace
# would also delete a given-name character that equals the surname.
df['First Name Chi'] = df.apply(lambda x: x['Chi Name'].replace(x['Prefix CN'], "", 1)
                               .replace(x['Last Name Chi'], "", 1).strip(), axis=1)

Add gender based on EN and CN prefix

In [437]:
def add_gender(x):
    """Map a Chinese or English honorific to 'M' / 'F'.

    Returns "" for any prefix that does not imply a gender (e.g. '博士', 'Dr')
    and for an empty prefix. A trailing dot on an English honorific ("Mr.")
    is ignored.
    """
    male = ('先生', 'Mr', 'Sir')
    female = ('小姐', '女士', 'Mrs', 'Ms', 'Miss')
    # English prefixes may carry a trailing dot
    key = x.rstrip('.') if isinstance(x, str) else x
    if key in male:
        return 'M'
    if key in female:
        return 'F'
    return ""

In [438]:
# Use the Chinese prefix first; where it gives no gender, fall back to the
# English prefix of the same row.
gender_cn = df['Prefix CN'].apply(add_gender)
gender_en = df['Prefix'].apply(add_gender)
df['Gender'] = gender_cn.where(gender_cn != "", gender_en)

Order the columns

In [439]:
# final column layout: Chinese name parts, English name parts, then metadata
df = df.loc[:, [
    'Last Name Chi', 'First Name Chi', 'Chi Name', 'Prefix CN',
    'Last Name Eng', 'First Name Eng', 'Eng Name', 'Prefix',
    'Gender', 'Category', 'Organization', 'Year', 'Award',
]]

Save to Excel and CSV files

In [441]:
# to_excel no longer accepts `encoding` (removed in pandas 2.0, where passing
# it raises TypeError), so it is dropped here.
df.to_excel('香港特别行政区获授勋及嘉奖人士名单.xlsx')
# utf_8_sig writes a BOM at the start of the CSV file
df.to_csv('香港特别行政区获授勋及嘉奖人士名单.csv',encoding='utf_8_sig')