In [1]:
import re
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
from urllib.parse import parse_qs, urlparse
from datetime import datetime
import time
import random

# I. Crawl Information

Helper function to extract a member's information from their Hong Kong Bar Association profile page (here, members on the arbitrators list)

In [2]:
def extract_data(soup, language = 'Eng'):
    '''
    Extract one member's details from a parsed member page.

    @soup: BeautifulSoup object containing the html of the member page
    @language: a string indicating whether the language is Chi or Eng
               (not used by the extraction; kept so existing calls still work)

    Returns a data dictionary with one entry per field found on the page, e.g.
    {'Member': name, 'Name of Chambers': ..., 'Address': ..., 'Tel.No': ...,
     'Fax.No': ..., 'Website': ..., 'Call': ...}
    'Member' is always present ("" when the page has no name element); the other
    keys are whatever labels the page contains.
    '''
    data = {}

    # name format: last name Eng, middle name Eng, initials Eng, title, name Chi, title Chi
    name_tag = soup.find('div', {'class': 'barrister-name'})
    if name_tag is None:
        # Page without a name element (e.g. an error page): keep the record shape
        # instead of failing with AttributeError.
        data['Member'] = ''
    else:
        tokens = name_tag.text.strip().split(' ')
        # The last space-separated token is dropped, as before
        # (presumably a trailing marker on the page -- TODO confirm).
        data['Member'] = ' '.join(tokens[:-1]).strip()

    # other info: split label and value on the first ':' only, so values that
    # contain ':' themselves (such as URLs) stay intact
    for row in soup.find_all('div', {'class': 'barrister-row'}):
        key, separator, value = row.text.partition(':')
        if not separator:
            # No ':' in this row, so there is no label/value pair to record.
            continue
        data[key.strip()] = value.strip()

    return data

## English version 

Fetch the main page HTML. The website stores the member names in a JavaScript DataTable.

In [3]:
# Listing page of the arbitrators section of the Bar List.
eng_url = "https://www.hkba.org/Bar-List/arbitrators"
# One session is shared by this request and the member-page requests further down.
session = requests.Session()
# requests applies no timeout by default, so set one to avoid hanging on a stalled connection.
eng_html = session.get(eng_url, timeout=30)
# Stop here on an HTTP error instead of parsing an error page as if it were the listing.
eng_html.raise_for_status()
soup = BeautifulSoup(eng_html.text, 'html.parser')

Get the script text in which the JavaScript defines the `dataSet` variable.

In [4]:
# NOTE: the script position (25) and the end marker below are tied to the page
# as it was when crawled; re-check both if the site changes.
content = soup.find_all('script')[25].text
end_str = 'david-kk","",""],]'
idx = content.find(end_str)
if idx == -1:
    # str.find returns -1 when the marker is absent; slicing with that value
    # would silently keep the wrong text, so fail with a clear message instead.
    raise ValueError('End marker of the member dataset not found; the page layout may have changed.')
# keep only the text up to and including the end marker
content = content[:idx+len(end_str)]
# split on '[' and discard the first two pieces, leaving one piece per list
content = content.split('[')[2:]

Get the URL of each member's page

In [5]:
def extract_member_html(html):
    '''
    @html: a string that may contain a link to a member page

    Return the absolute URL of the member page (site prefix + 'barrister/...'),
    or "" when the string contains no such path.
    '''
    html_prefix = 'https://www.hkba.org/'
    # Raw string: the pattern contains a backslash escape, which is an invalid
    # escape sequence in a normal string literal. The accepted characters are
    # unchanged: digits, ASCII letters, À-ÿ, '.' and '-'.
    url = re.search(r'barrister/[0-9a-zA-ZÀ-ÿ.\-]+', html)
    if url:
        return html_prefix + url.group()
    return ""

In [6]:
content_df = pd.DataFrame(content, columns = ['html'])
content_df['html'] = content_df['html'].apply(extract_member_html)
# extract_member_html returns "" for a piece without a member link; an empty
# URL cannot be requested, so only real URLs go into the crawl list.
member_url = [url for url in content_df['html'].tolist() if url]

Extract member data from all pages. To mimic human visits, set the request headers as below.

In [7]:
# Pool of browser User-Agent strings; the crawl loop picks one at random for
# each member-page request.
# NOTE(review): a few entries appear twice (e.g. the "MSIE 6.0 ... QQDownload 732"
# string), which slightly biases the random choice towards them.
USER_AGENT_LIST = [
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
    "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
    "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
    "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
    "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
    "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
    "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
    "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
    "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
    ]

My network is a bit slow so I separate the crawling into 4 parts.

In [8]:
data = []
failed_urls = []  # pages that could not be fetched, kept so they can be retried
start = time.time()
for i, url in enumerate(member_url):
    # use a different, randomly chosen user agent for every request
    headers = {'user-agent': random.choice(USER_AGENT_LIST)}
    try:
        # requests applies no timeout by default; without one a single stalled
        # connection would block the whole crawl
        response = session.get(url, headers=headers, timeout=30)
        response.raise_for_status()
    except requests.RequestException as err:
        # Record the failure and move on, so one bad page does not throw away
        # the members collected so far.
        print(i, 'FAILED', url, err)
        failed_urls.append(url)
        continue
    # separate names so the listing-page `soup` is not overwritten by this loop
    member_soup = BeautifulSoup(response.text, 'html.parser')
    data.append(extract_data(member_soup, 'Eng'))
    print(i, data[-1])
end = time.time()
print('Take {} second to crawl {} items'.format(end-start,len(member_url)))

0 {'Member': 'Acton-Bond, Jonathan Edward 龐仲寧', 'Name of Chambers': 'Trinity Chambers', 'Address': '18/F, Bangkok Bank Bldg, 28 Des Voeux Rd Central, H.K.', 'Tel.No': '25212666', 'Fax.No': '28454844', 'Mobile': '96570742', 'E-mail': 'jeactonbond@gmail.com', 'Quals': 'BA (Sussex U.) , FCI Arb', 'Call': 'HK( 1988 ), UK( 1971  )'}
1 {'Member': 'Aiken, Nigel, S.C. 區啟賢 資深大律師', 'Name of Chambers': 'Temple Chambers', 'Address': '16/F, One Pacific Place, 88 Queensway, Admiralty, H.K.', 'Tel.No': '25232003', 'Fax.No': '28100302', 'Website': 'http://www.templechambers.com', 'Call': 'HK( 1981 ), Inner HK( 1994 ), UK( 1974  )'}
2 {'Member': 'Alder, Edward A.G.', 'Name of Chambers': "Prince's Chambers", 'Address': '3002, Tower Two, Lippo Centre, 89 Queensway, Admiralty, H.K.', 'Tel.No': '25257388', 'Fax.No': '25304241', 'Mobile': '60189770', 'E-mail': 'edwardalder@princeschambers.com.hk', 'Quals': 'Solicitor (HK) [1994-2006] , Solicitor (NSW) [1992 --]', 'Call': 'HK( 2006  )'}
3 {'Member': 'Au, Hel

23 {'Member': 'Chung, Kenneth K.H. 鍾建康 *', 'Name of Chambers': "Sir Oswald Cheung's Chambers", 'Address': '10/F, New Henry House, 10 Ice House St, Central, H.K.', 'Tel.No': '25242156', 'Fax.No': '28105656', 'E-mail': 'kchung@siroswald.com', 'Quals': 'LLM , BSc (Civil Engineering) , MBA , Chartered Engineer (UK) , Registered Structural Engineer (HK) , Registered Professional Engineer (Structural) (HK) , MHKIE , MIStructE , MICE , FHKI Arb , MCI Arb , HKIAC Accredited General Mediator', 'Call': 'HK( 2002  )'}
24 {'Member': 'Chung, la Fontaine L.F. (Ms) 鍾麗芳  (女士) *', 'Address': 'Rm 52, New Henry House, 10 Ice House St, Central, H.K.', 'Tel.No': '25861550', 'Fax.No': '28109196', 'E-mail': 'chunglf@graduate.hku.hk', 'Quals': 'LLB , PCLL (HKU) , LLM (HKU) , BA (Accountancy) , FCCA , CPA , ACIS , ACS , FHKI Arb , HKIAC Accredited General Mediator , HKMAAL Accredited General Mediator', 'Call': 'HK( 2004  )'}
25 {'Member': 'Chung, Peter K.T. 鍾錦棠', 'Name of Chambers': 'Sky Chambers', 'Address': 

Save the data to a DataFrame

In [12]:
# Turn the crawled records into a table and tag every row with the same
# category / organization / title values.
df = pd.DataFrame(data).assign(
    Category='Professional Qualifications',
    Organization='Hong Kong Bar Association',
    Title='Arbitrator',
)
print(df.head())
print(df.shape)

                            Member    Name of Chambers  \
0  Acton-Bond, Jonathan Edward 龐仲寧    Trinity Chambers   
1     Aiken, Nigel, S.C. 區啟賢 資深大律師     Temple Chambers   
2               Alder, Edward A.G.   Prince's Chambers   
3  Au, Helen H.L. (Ms) 區曉嵐  (女士) *                 NaN   
4             Barlow, Barrie, S.C.  Des Voeux Chambers   

                                             Address    Tel.No    Fax.No  \
0  18/F, Bangkok Bank Bldg, 28 Des Voeux Rd Centr...  25212666  28454844   
1  16/F, One Pacific Place, 88 Queensway, Admiral...  25232003  28100302   
2  3002, Tower Two, Lippo Centre, 89 Queensway, A...  25257388  25304241   
3  Rm 28, New Henry House, 10 Ice House St, Centr...  35763130  35763030   
4  38/F, Gloucester Tower, The Landmark,  Central...  25263071  28105287   

     Mobile                              E-mail  \
0  96570742               jeactonbond@gmail.com   
1       NaN                                 NaN   
2  60189770  edwardalder@princeschambers.

## Data cleaning

Add gender

In [13]:
# Female members carry an honorific in the member string, e.g. '(Ms)' or '(女士)'.
# Match the honorific as a whole word rather than the bare substring 'Ms', which
# could also occur inside a name. Members without a marker are labelled 'M', as before.
female_marker = re.compile(r'\b(?:Ms|Mrs|Miss)\b|女士')
df['Gender'] = df['Member'].apply(lambda x: 'F' if female_marker.search(x) else 'M')

Separate Eng and Chi name

In [14]:
def split_chi_name(name):
    '''
    @name: the full member string, e.g. 'Aiken, Nigel, S.C. 區啟賢 資深大律師'

    Return the first run of consecutive Chinese characters in the string
    (the Chinese name), or "" when the string contains none.
    '''
    # CJK Unified Ideographs (U+4E00-U+9FFF) plus Extension A (U+3400-U+4DBF).
    # The earlier upper bound U+9FA5 cut a name short at any ideograph beyond it.
    match = re.search(r'[\u3400-\u4dbf\u4e00-\u9fff]+', name)
    return match.group() if match else ""

In [15]:
# Chinese name of each member ("" when the member string has none)
df['Chi Name'] = df['Member'].map(split_chi_name)

In [26]:
def split_eng_name(name):
    '''
    @name: the full member string (English part, optionally followed by a Chinese name)

    Return the text that precedes the Chinese name, with surrounding whitespace
    removed. A string without a Chinese name is returned unchanged.
    '''
    chinese_part = split_chi_name(name)
    if not chinese_part:
        return name
    # everything before the first occurrence of the Chinese name
    english_part, _, _ = name.partition(chinese_part)
    return english_part.strip()

In [27]:
df['Eng Name'] = df['Member'].apply(split_eng_name)
# Drop everything from the first '(' onwards (e.g. a '(Ms)' marker) and strip
# the space that was left in front of it, so the name has no trailing blank.
df['Eng Name'] = df['Eng Name'].apply(lambda x: x.split('(')[0].strip())

Split First and Last Name Eng

In [29]:
# 'Eng Name' is comma separated with the last name first. Strip each piece so
# the first name does not keep the space that follows the comma. A name without
# a comma is used unchanged for both columns, as before.
df['First Name Eng'] = df['Eng Name'].apply(lambda x: x.split(',')[1].strip() if ',' in x else x)
df['Last Name Eng'] = df['Eng Name'].apply(lambda x: x.split(',')[0].strip() if ',' in x else x)

Split First and Last Name Chi

In [30]:
# The first character of the Chinese name is taken as the last name and the
# remaining characters as the first name; an empty name gives "" for both.
df['First Name Chi'] = df['Chi Name'].str[1:]
df['Last Name Chi'] = df['Chi Name'].str[:1]

Order the columns

In [32]:
# Show the column names currently in the frame
df.columns

Index(['Member', 'Name of Chambers', 'Address', 'Tel.No', 'Fax.No', 'Mobile',
       'E-mail', 'Quals', 'Call', 'Website', 'Pager', 'Category',
       'Organization', 'Title', 'Gender', 'Chi Name', 'Eng Name',
       'First Name Eng', 'Last Name Eng', 'First Name Chi', 'Last Name Chi'],
      dtype='object')

In [33]:
# The contact columns are created from whatever labels the crawled pages
# contained, so one of them can be absent on another run. reindex applies the
# same order as a plain column selection but fills an absent column with NaN
# instead of raising KeyError.
column_order = ['Last Name Chi','First Name Chi','Chi Name',
                'Last Name Eng','First Name Eng','Eng Name','Member',
                'Gender','Title','Category','Organization',
                'Name of Chambers','Quals','Call',
                'Address','Mobile','Tel.No','Fax.No','E-mail','Website','Pager']
df = df.reindex(columns=column_order)

Save to Excel and CSV files

In [34]:
# to_excel no longer accepts an `encoding` argument (removed in pandas 2.0, where
# passing it raises TypeError); .xlsx files do not need one.
df.to_excel('香港大律师公会_Arbitrator.xlsx')
# utf_8_sig prepends a byte-order mark to the CSV, which helps spreadsheet
# software recognise the file as UTF-8.
df.to_csv('香港大律师公会_Arbitrator.csv',encoding='utf_8_sig')