In [1]:
import re
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
from urllib.parse import parse_qs, urlparse
from datetime import datetime
import time
import random

# I. Crawl Information

Fetch the HTML of the main page of The Law Society of Hong Kong. The website uses Big5 encoding rather than UTF-8.

In [2]:
url = "http://www.hklawsoc.org.hk/pub_e/memberlawlist/honour.asp?"
session = requests.Session()
# requests applies no timeout by default; set one so the cell cannot hang
# forever on an unresponsive server.
html = session.get(url, timeout=30)
# Fail loudly on an HTTP error instead of silently parsing an error page.
html.raise_for_status()
# The site serves Big5-encoded pages, so override the encoding requests
# would otherwise use when decoding `html.text`.
html.encoding = 'Big5'
soup = BeautifulSoup(html.text, 'html.parser')

Extract the English name, Chinese name and admission year of all members

In [3]:
# Table cells holding the English names, Chinese names and admission years.
eng_name = soup.find_all('td', {'class': 'atext'})
chi_name = soup.find_all('td', {'class': 'ctext'})
year_cell_attrs = {'class': 'pubtext', 'valign': 'middle', 'align': 'center'}
# The first two matching cells are skipped (presumably header cells -- confirm
# against the page markup).
year = soup.find_all('td', year_cell_attrs)[2:]

# One record per member; zip stops at the shortest of the three cell lists.
data = []
for eng_cell, chi_cell, year_cell in zip(eng_name, chi_name, year):
    data.append({
        'Eng Name': eng_cell.text.strip(),
        'Chi Name': chi_cell.text.strip(),
        'Year of Admission': year_cell.text.strip(),
    })

Save data to Dataframe

In [20]:
# Turn the scraped records into a table and show its shape plus the first 8 rows.
df = pd.DataFrame(data)
print(df.shape, df.head(8), sep='\n')

(34, 3)
                                            Eng Name Chi Name  \
0         Sir Yuet-Keung KAN , GBE, Hon. LLD, BA, JP      簡悅強   
1  P.A.L. VINE, OBE, VRD, LLB (Lond), Hon. LLD (H...      范培德   
2        Dr. The Hon. P. C. WOO, LLB, PhD (Lond), JP      胡百全   
3                                        W.I. CHEUNG      張永賢   
4                                         Kenneth LO      羅德璋   
5                                  Francis H.B. WONG      黃學斌   
6                                       WONG Wai Pat      黃維弼   
7                              Ella S. K. CHEONG, JP      張淑姬   

  Year of Admission  
0              2001  
1              2001  
2              2001  
3              2002  
4              2002  
5              2002  
6              2003  
7              2004  


Extract each member's profile URL and detailed info, where a link is available

In [5]:
# Collect the profile-page URL of every member whose name cell contains a link.
url_prefix = 'http://www.hklawsoc.org.hk/pub_e/memberlawlist/'
member_url = []
for name_cell in eng_name:
    anchor = name_cell.find('a')
    if anchor:
        # hrefs are appended directly to the list's base URL
        member_url.append(url_prefix + anchor.get('href'))

In [6]:
# Row labels of a member profile page whose values are kept when scraping.
items_keep = [
    'Name (English)',
    'Name (Chinese)',
    'Admission in Hong Kong',
    'Remark',
    'Admission in Other Jurisdiction(s)',
    'Post',
    'Firm/Company (English)',
    'Firm/Company (Chinese)',
]

In [7]:
# Pool of browser User-Agent strings; the crawl loop picks one at random
# (random.choice) for the 'user-agent' header of each profile request.
USER_AGENT_LIST = [
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
    "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
    "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
    "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
    "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1"
    ]

In [8]:
# Crawl every member profile page and keep the fields listed in `items_keep`.
# Page-level objects get their own names (`member_page_url`, `response`,
# `member_soup`) so the main-page `url`/`html`/`soup` are not overwritten and
# the earlier cells can still be re-run after this one.
member_info = []
start = time.time()
for i, member_page_url in enumerate(member_url):
    # Send a randomly chosen User-Agent with each request.
    headers = {'user-agent': random.choice(USER_AGENT_LIST)}
    # requests has no default timeout; without one a stalled server would
    # block the whole crawl.
    response = session.get(member_page_url, headers=headers, timeout=30)
    # Stop on HTTP errors rather than parsing an error page.
    response.raise_for_status()
    response.encoding = 'Big5'
    member_soup = BeautifulSoup(response.text, 'html.parser')

    # The profile rows live in the first <td> nested inside the first
    # 'pubtext' cell; either lookup returns None when the layout differs.
    outer_cell = member_soup.find('td', {'class': 'pubtext'})
    inner_cell = outer_cell.find('td') if outer_cell is not None else None
    if inner_cell is None:
        print(i, 'profile table not found, skipped:', member_page_url)
        continue

    member = {}
    for row in inner_cell.find_all('tr'):
        cells = row.find_all('td')
        # A usable row needs a label cell and a value cell.
        if len(cells) < 2:
            continue
        key = cells[0].text.strip()
        if key not in items_keep:
            continue
        value = cells[1].text.strip()
        # special process for admission in other jurisdictions
        if key == 'Admission in Other Jurisdiction(s)':
            # Split the cell text into non-empty lines, drop the first two
            # (presumably the column headers -- confirm against the page),
            # then pair the remaining lines as "<jurisdiction>-<date>".
            value = [x.strip() for x in value.split('\n') if x][2:]
            value = [country + '-' + year for country, year in zip(value[0::2], value[1::2])]
        member[key] = value
    member_info.append(member)
    print(i, member)
end = time.time()
print('It takes {} seconds to crawl {} member info'.format(end-start, len(member_url)))

0 {'Name (English)': 'CHEONG SHUK KI, ELLA', 'Name (Chinese)': '張淑姬', 'Admission in Hong Kong': '03/1963', 'Remark': 'Holding Current Practising CertificateNotary Public (member of the Hong Kong Society of Notaries)', 'Admission in Other Jurisdiction(s)': ['ENGLAND AND WALES-1967', 'VICTORIA (AUSTRALIA)-11/1975'], 'Post': 'Partner', 'Firm/Company (English)': 'ELLALAN', 'Firm/Company (Chinese)': '張淑姬趙之威律師行'}
1 {'Name (English)': 'WONG CHUNG HIN', 'Name (Chinese)': '黃頌顯', 'Admission in Hong Kong': '02/1959', 'Remark': 'Holding Current Practising Certificate', 'Admission in Other Jurisdiction(s)': ['ENGLAND AND WALES-11/1958'], 'Post': 'Consultant', 'Firm/Company (English)': 'WOO & CO., P.C.', 'Firm/Company (Chinese)': '胡百全律師事務所'}
2 {'Name (English)': 'CHEUNG YAN SHUN, EDMUND', 'Name (Chinese)': '張恩純', 'Admission in Hong Kong': '02/1957', 'Remark': 'Holding Current Practising Certificate', 'Admission in Other Jurisdiction(s)': ['ENGLAND AND WALES-07/1966'], 'Post': 'Consultant', 'Firm/Com

Save to dataframe

In [21]:
# One row per crawled profile; show the shape and the first rows.
member_info_df = pd.DataFrame(member_info)
print(member_info_df.shape, member_info_df.head(), sep='\n')

(23, 8)
            Name (English) Name (Chinese) Admission in Hong Kong  \
0     CHEONG SHUK KI, ELLA            張淑姬                03/1963   
1           WONG CHUNG HIN            黃頌顯                02/1959   
2  CHEUNG YAN SHUN, EDMUND            張恩純                02/1957   
3      LEUNG OI SIE, ELSIE            梁愛詩                01/1968   
4  CHAN CHEUK, CHRISTOPHER            陳 爵                11/1970   

                                              Remark  \
0  Holding Current Practising CertificateNotary P...   
1             Holding Current Practising Certificate   
2             Holding Current Practising Certificate   
3  Holding Current Practising CertificateNotary P...   
4     Solicitor Not in Private Practice in Hong Kong   

                  Admission in Other Jurisdiction(s)        Post  \
0  [ENGLAND AND WALES-1967, VICTORIA (AUSTRALIA)-...     Partner   
1                        [ENGLAND AND WALES-11/1958]  Consultant   
2                        [ENGLAND AND WALE

## Data cleaning

Merge two dataframes, change column names, add category, organization and title

In [22]:
# Left join on the Chinese name: every row of the main list is kept, and the
# profile columns are filled only where a crawled profile has the same name.
df = df.merge(
    member_info_df,
    how='left',
    left_on='Chi Name',
    right_on='Name (Chinese)',
)

In [23]:
# Tag every row with the same category and issuing organization.
df = df.assign(
    Category='Professional Qualifications',
    Organization='The Law Society of Hong Kong',
)

In [24]:
# Persist the merged, uncleaned table. 'utf_8_sig' writes UTF-8 with a leading
# BOM (presumably so spreadsheet tools detect the encoding -- confirm).
df.to_csv('香港律师会_raw.csv',encoding='utf_8_sig')

In [27]:
# Reload the raw snapshot. It was written with 'utf_8_sig', so read it with the
# same codec, which strips the leading BOM while decoding.
# Note: after this round-trip, empty strings come back as NaN and list-valued
# cells come back as their string representation.
df = pd.read_csv('香港律师会_raw.csv', index_col=0, encoding='utf_8_sig')

Change date format

In [29]:
# Parse the 'MM/YYYY' admission strings into pandas timestamps.
admission_raw = df['Admission in Hong Kong']
df['Admission in Hong Kong'] = pd.to_datetime(admission_raw, format='%m/%Y')

Split the Chinese name into last name and first name. If the name is in English, skip it.

In [30]:
def _split_chi_name(name):
    """Split a Chinese full name into a (last name, first name) tuple.

    Returns ("", "") for missing values (NaN/None), for empty strings and for
    names made only of ASCII characters (i.e. written in English). Otherwise
    the first character is the last name and the rest is the first name, so
    two-character compound surnames are not recognised.
    """
    # After the CSV round-trip an empty name is NaN; str(NaN) would be 'nan'
    # and would wrongly split into 'n' / 'an'.
    if pd.isna(name):
        return "", ""
    text = str(name).strip()
    if not text or all(ord(ch) < 128 for ch in text):
        return "", ""
    # Strip again so a name written as '<last> <first>' leaves no leading space.
    return text[0], text[1:].strip()


chi_name_parts = [_split_chi_name(name) for name in df['Chi Name']]
df['Last Name Chi'] = [parts[0] for parts in chi_name_parts]
df['First Name Chi'] = [parts[1] for parts in chi_name_parts]

Split First and Last Name Eng, delete (Ms) in first name if necessary

In [38]:
# Keep only the text before the first comma (drops post-nominals such as
# ', GBE, JP') and trim the whitespace left behind, e.g. 'KAN , GBE' -> 'KAN'.
df['Eng Name'] = df['Eng Name'].str.split(',', n=1).str[0].str.strip()

In [47]:
def split_last_name_eng(name):
    """Return the surname found in an English name, or "" if there is none.

    The surname is taken to be the first run of two or more consecutive
    capital letters, optionally continued by hyphen-joined capital runs
    (e.g. 'AU-YEUNG'). Initials such as 'H.B.' are single capitals and are
    therefore not matched.

    Parameters
    ----------
    name : str
        English name, e.g. 'Francis H.B. WONG'. Non-string input (such as
        NaN from a missing cell) yields "".

    Returns
    -------
    str
        The matched surname, or "" when nothing matches.
    """
    if not isinstance(name, str):
        return ""
    # No upper bound on the run length, so long surnames are not truncated.
    last_name = re.search(r'[A-Z]{2,}(?:-[A-Z]{2,})*', name)
    return last_name.group() if last_name else ""

In [48]:
# Derive the surname of each member from the trimmed English name.
df['Last Name Eng'] = df['Eng Name'].map(split_last_name_eng)

In [51]:
# The first name is the English name with the surname removed.
# Only the first occurrence is removed (count=1), so a given name that happens
# to contain the same letters is left intact; split()/join collapses the
# double space left when the surname sat in the middle of the name.
df['First Name Eng'] = [
    " ".join(full_name.replace(last_name, "", 1).split())
    for full_name, last_name in zip(df['Eng Name'], df['Last Name Eng'])
]

Order the columns

In [53]:
# Display the current column labels before choosing the final column order.
df.columns

Index(['Eng Name', 'Chi Name', 'Year of Admission', 'Name (English)',
       'Name (Chinese)', 'Admission in Hong Kong', 'Remark',
       'Admission in Other Jurisdiction(s)', 'Post', 'Firm/Company (English)',
       'Firm/Company (Chinese)', 'Category', 'Organization', 'Last Name Chi',
       'First Name Chi', 'Last Name Eng', 'First Name Eng'],
      dtype='object')

In [55]:
# Final column layout: Chinese name parts, English name parts, admission
# details, firm details, then the constant tags. Columns not listed here
# ('Name (English)', 'Name (Chinese)') are dropped.
column_order = [
    'Last Name Chi', 'First Name Chi', 'Chi Name',
    'Last Name Eng', 'First Name Eng', 'Eng Name',
    'Year of Admission', 'Admission in Hong Kong', 'Remark',
    'Admission in Other Jurisdiction(s)', 'Post',
    'Firm/Company (English)', 'Firm/Company (Chinese)',
    'Category', 'Organization',
]
df = df[column_order]

Save to Excel and CSV files

In [56]:
# Export the cleaned table.
# `DataFrame.to_excel` no longer accepts `encoding` (deprecated in pandas 1.5,
# removed in 2.0, where passing it raises TypeError), so it is omitted here.
df.to_excel('香港律师会.xlsx')
# 'utf_8_sig' writes UTF-8 with a leading BOM for the CSV copy.
df.to_csv('香港律师会.csv', encoding='utf_8_sig')