In [8]:
import re
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
from urllib.parse import parse_qs, urlparse
from datetime import datetime
import time
import random

# I. Crawl Information

Helper function to extract a member's details from a Hong Kong Bar Association senior counsel profile page

In [2]:
def extract_data(soup, language = 'Eng'):
    '''
    Extract one member's details from a parsed member page.

    @soup: BeautifulSoup object containing the html of the member's page
    @language: a string indicating whether the language is Chi or Eng
               (not used by the function body; kept so existing calls still work)

    Returns a dictionary holding the member's name under 'Member' plus one
    entry per 'barrister-row' div, keyed by the label before the first ':'.
    Which labels are present depends on the page, for example:
    {'Member': ..., 'Name of Chambers': ..., 'Address': ..., 'Tel.No': ...,
     'Fax.No': ..., 'Website': ..., 'Call': ...}
    Rows that do not follow the "label:value" layout are skipped.
    '''
    data = {}

    # name format: last name Eng, middle name Eng, initials Eng, title, name Chi, title Chi
    # The text is split on single spaces and the last piece is dropped
    # (presumably a trailing marker on the page -- TODO confirm).
    name_parts = soup.find('div', {'class': 'barrister-name'}).text.strip().split(' ')
    data['Member'] = ' '.join(name_parts[:-1]).strip()

    # Other info: split label and value on the FIRST ':' only, so values that
    # contain ':' themselves (e.g. website URLs) stay intact.
    for item in soup.find_all('div', {'class': 'barrister-row'}):
        key, separator, value = item.text.partition(':')
        key = key.strip()
        if not separator or not key:
            # Without a ':' the old slicing produced a truncated key and the
            # whole text as value; such rows carry no usable label, so skip them.
            continue
        data[key] = value.strip()

    return data

## English version 

Fetch the main page HTML; the website stores the member names in a JavaScript DataTable

In [3]:
# Fetch the senior counsel listing page through a shared session; the same
# session object is reused for the per-member requests later on.
eng_url = "https://www.hkba.org/Bar-List/senior-counsel"
session = requests.Session()
# requests waits indefinitely unless a timeout is given, so set one to keep
# the cell from hanging on a stalled connection.
eng_html = session.get(eng_url, timeout=30)
# Fail loudly on 4xx/5xx instead of parsing an error page as if it were the list.
eng_html.raise_for_status()
soup = BeautifulSoup(eng_html.text, 'html.parser')

Get the script text where the JavaScript defines the `dataSet` variable

In [4]:
# Text of the <script> tag that holds the dataSet definition.
# NOTE: the script position (27) and the end marker below are hard-coded for
# the page as it was when this was written; update them if the page changes.
content = soup.find_all('script')[27].text
end_str = 'ty-sc","",""],]'
idx = content.find(end_str)
if idx == -1:
    # str.find returns -1 when the marker is absent; slicing with that value
    # would silently keep only the first few characters of the script.
    raise ValueError('dataSet end marker not found in script 27; the page layout may have changed')
# get only dataset html: keep everything up to and including the end marker
content = content[:idx+len(end_str)]
# get only lists of html: one piece per '[', dropping the first two pieces
# (presumably the text before the array and the gap after its outer bracket)
content = content.split('[')[2:]

Build the URL of each member's page

In [5]:
# extract only the html part from data, remove ""
member_url = [x.split(',')[-4][1:-1] for x in content]
# concat the prefix and member specific html
html_prefix = "https://www.hkba.org/"
member_url = [html_prefix + x for x in member_url]

Extract member data from every member page. To mimic human visits, each request is sent with a browser User-Agent header chosen from the list below.

In [6]:
# Pool of browser User-Agent strings; the crawl loop picks one at random for
# each request. All entries identify as Internet Explorer (MSIE) builds.
USER_AGENT_LIST = [
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
    "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
    "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
    "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)"
]

In [10]:
# Visit every member page and collect one record (dict) per member.
data = []
start = time.time()
for i, url in enumerate(member_url):
    # Send a randomly chosen User-Agent header with each request.
    headers = {'user-agent': random.choice(USER_AGENT_LIST)}
    # Timeout so a stalled connection cannot hang the crawl.
    html = session.get(url, headers=headers, timeout=30)
    # Stop on HTTP errors rather than failing later inside extract_data on an
    # error page that lacks the expected divs.
    html.raise_for_status()
    # Separate name so the listing-page `soup` is not overwritten, which keeps
    # the earlier dataSet cell re-runnable after this loop.
    member_soup = BeautifulSoup(html.text, 'html.parser')
    data.append(extract_data(member_soup, 'Eng'))
    print(i, data[-1])
end = time.time()
# Report the actual number of records instead of a hard-coded count.
print('Took {:.1f} seconds to crawl {} items'.format(end - start, len(data)))

0 {'Member': 'Lee, Martin C.M., S.C. 李柱銘 資深大律師', 'Address': '704A, Tower 1, Admiralty Centre, 18 Harcourt Rd, Admiralty, H.K.', 'Tel.No': '25290864', 'Fax.No': '28612829', 'Call': 'HK( 1966 ), Inner HK( 1979 ), UK( 1965  )'}
1 {'Member': 'Chang, Denis K.L., S.C. 張健利 資深大律師', 'Name of Chambers': "Denis Chang's Chambers", 'Address': '9th Floor, One Lippo Centre, 89 Queensway, Admiralty, H.K.', 'Tel.No': '28107222', 'Fax.No': '28450439', 'E-mail': 'dchang@dcc.law', 'Website': 'https://dcc.law/barrister/denis-chang/', 'Call': 'HK( 1970 ), Inner HK( 1981 ), UK( 1968  )'}
2 {'Member': 'Griffiths, John, S.C.', 'Name of Chambers': 'Des Voeux Chambers', 'Address': '38/F, Gloucester Tower, The Landmark,  Central, H.K.', 'Tel.No': '25263071', 'Fax.No': '28105287', 'Call': 'HK( 1979 ), Inner HK( 1982 ), UK( 1956  )'}
3 {'Member': 'Cheng, Huan, S.C. 清洪 資深大律師', 'Name of Chambers': "Cheng Huan SC's Chambers", 'Address': '15/F, Dina House, 11 Duddell St, Central, H.K.', 'Tel.No': '25262293', 'Fax.No': 

29 {'Member': 'Dykes, Philip John, S.C. 戴啟思 資深大律師', 'Name of Chambers': 'Bernacchi Chambers', 'Address': '1402, Tower 1, Admiralty Centre, 18 Harcourt Rd, Admiralty, H.K.', 'Tel.No': '25220066', 'Fax.No': '28450851', 'E-mail': 'pjd@bernacchichambers.com', 'Call': 'HK( 1985 ), Inner HK( 1997 ), UK( 1977  )'}
30 {'Member': 'Leong, Alan K.K., S.C. 梁家傑 資深大律師 *', 'Name of Chambers': "Alan Leong, S.C.'s Chambers", 'Address': "Rms 1323A-1328, Prince's Bldg, 10 Chater Rd, Central, H.K.", 'Tel.No': '25266182', 'Fax.No': '28681730', 'Quals': 'FCI Arb', 'Call': 'HK( 1983 ), Inner HK( 1998  )'}
31 {'Member': 'Reading, John Richard, S.C. 李定國 資深大律師', 'Name of Chambers': 'Pacific Chambers', 'Address': 'Rm 901, Dina House, 11 Duddell St, Central, H.K.', 'Tel.No': '25215544', 'Fax.No': '25245912', 'Mobile': '90428840', 'E-mail': 'johnreadingsc@gmail.com', 'Quals': 'Dip Law (BAB) , Dip in CRIM , BA (Deakin U. of Victoria) , LLM (HKU) , FCI Arb , CEDR Accredited Mediator', 'Call': 'HK( 2002 ), Inner HK( 

55 {'Member': 'Tse, Joseph W.Y., S.C. 謝華淵,若瑟 資深大律師', 'Name of Chambers': 'Des Voeux Chambers', 'Address': '38/F, Gloucester Tower, The Landmark,  Central, H.K.', 'Tel.No': '28101008 /25263071', 'Fax.No': '28105287', 'Call': 'HK( 1984 ), Inner HK( 2006  )'}
56 {'Member': 'Tam, Winnie, S.C. (Ms) 譚允芝 資深大律師 (女士)', 'Name of Chambers': 'Des Voeux Chambers', 'Address': '38/F, Gloucester Tower, The Landmark,  Central, H.K.', 'Tel.No': '25242225', 'Fax.No': '25373393', 'E-mail': 'winnietam@dvc.com.hk', 'Call': 'HK( 1984 ), Inner HK( 2006 ), UK( 1988  )'}
57 {'Member': 'Mok, Johnny S.L., S.C. 莫樹聯 資深大律師', 'Name of Chambers': 'Des Voeux Chambers', 'Address': '38/F, Gloucester Tower, The Landmark,  Central, H.K.', 'Tel.No': '25242225', 'Fax.No': '25373393', 'E-mail': 'mail@johnnymok.com', 'Call': 'HK( 1986 ), Inner HK( 2006  )(Voluntary Removal – 1992)\n(Restoration of Admission – 1993)'}
58 {'Member': 'Harris, Paul, S.C. 夏博義 資深大律師', 'Name of Chambers': "Denis Chang's Chambers", 'Address': '9th Flo

81 {'Member': 'Wong, Anson M.K., S.C. 黃文傑 資深大律師 *', 'Name of Chambers': 'Des Voeux Chambers', 'Address': '38/F, Gloucester Tower, The Landmark,  Central, H.K.', 'Tel.No': '25263071', 'Fax.No': '28105287', 'Mobile': '60922188', 'E-mail': 'ansonwong@dvc.com.hk', 'Quals': 'LLB (HKU) , LLM (Cantab) , Bar Scholarship (1999)', 'Call': 'HK( 1999 ), Inner HK( 2014  )'}
82 {'Member': 'Kat, Nigel, S.C. 祁志 資深大律師', 'Name of Chambers': 'Parkside Chambers', 'Address': 'Suite 3101, Two Pacific Place, 88 Queensway, Admiralty, H.K.', 'Tel.No': '28401130', 'Fax.No': '28100612', 'E-mail': 'chambers@nkat.net', 'Website': 'http://www.parksidechambers.com.hk/members/nigel-kat/', 'Quals': 'LLB , MBA (Export Management & International Business) , FHKI Arb , FCI Arb , Chartered Arbitrator , CEDR Accredited Mediator', 'Call': 'HK( 1984 ), Inner HK( 2015 ), UK( 1977  )'}
83 {'Member': 'Hui, Martin S.T., S.C. 許紹鼎 資深大律師', 'Name of Chambers': 'Gary Plowman S.C. Chambers', 'Address': 'Rm 1401, Tower One, Lippo Centr

Save data to a DataFrame

In [69]:
# Build the table from the crawled records and tag every row with the same
# category and organization labels.
df = pd.DataFrame(data).assign(**{
    'Category': 'Professional Qualifications',
    'Organization': 'Hong Kong Bar Association',
})
# Preview the first rows, then the (rows, columns) shape.
print(df.head(), df.shape, sep='\n')

                              Member  \
0   Lee, Martin C.M., S.C. 李柱銘 資深大律師   
1  Chang, Denis K.L., S.C. 張健利 資深大律師   
2              Griffiths, John, S.C.   
3         Cheng, Huan, S.C. 清洪 資深大律師   
4  Chan, Edward K.S., S.C. 陳景生 資深大律師   

                                             Address    Tel.No    Fax.No  \
0  704A, Tower 1, Admiralty Centre, 18 Harcourt R...  25290864  28612829   
1  9th Floor, One Lippo Centre, 89 Queensway, Adm...  28107222  28450439   
2  38/F, Gloucester Tower, The Landmark,  Central...  25263071  28105287   
3     15/F, Dina House, 11 Duddell St, Central, H.K.  25262293  28453749   
4  10/F, New Henry House, 10 Ice House St, Centra...  25242156  28105656   

                                        Call              Name of Chambers  \
0  HK( 1966 ), Inner HK( 1979 ), UK( 1965  )                           NaN   
1  HK( 1970 ), Inner HK( 1981 ), UK( 1968  )        Denis Chang's Chambers   
2  HK( 1979 ), Inner HK( 1982 ), UK( 1956  )            Des Voeux Ch

## Data cleaning

Add gender

In [70]:
# A member is labelled 'F' only when "Ms" appears as a standalone token, as in
# the "(Ms)" marker used in the member string; everyone else is labelled 'M'.
# The word boundaries stop names that merely contain the letters "Ms" from
# being flagged, which the plain substring test did.
df['Gender'] = df['Member'].apply(lambda x: 'F' if re.search(r'\bMs\b', x) else 'M')

Separate Eng and Chi name

In [71]:
def _strip_sc_title(member):
    '''
    Return the part of the member string that precedes the first 'S.C.'.

    Surrounding whitespace is trimmed and the final character is then dropped
    (the comma in front of the title). Strings without 'S.C.' are returned
    unchanged.
    '''
    if 'S.C.' not in member:
        return member
    before_title = member.split('S.C.')[0].strip()
    return before_title[:-1]

df['Eng Name'] = df['Member'].apply(_strip_sc_title)

In [72]:
def split_chi_name(name):
    '''
    Return the Chinese name embedded in a member string, or "" if there is none.

    @name: member string, e.g. 'Lee, Martin C.M., S.C. 李柱銘 資深大律師'

    The Chinese name is taken to be the first unbroken run of CJK ideographs.
    The character class covers the whole CJK Unified Ideographs block
    (U+4E00-U+9FFF) and Extension A (U+3400-U+4DBF), so a name containing a
    less common character is not cut at that character.
    '''
    match = re.search(r'[\u3400-\u4dbf\u4e00-\u9fff]+', name)
    if match is None:
        return ""
    chi_name = match.group(0)
    # If the first run is the Chinese title itself, the string has no Chinese
    # name; do not report the title as a name.
    if chi_name == '資深大律師':
        return ""
    return chi_name

In [73]:
# Pull the Chinese name out of each member string ("" when there is none).
df['Chi Name'] = df['Member'].map(split_chi_name)

Split First and Last Name Eng

In [74]:
# 'Eng Name' has the form "<last name>, <given names>". Splitting on ','
# leaves the space after the comma attached to the given names, so each part
# is stripped. A name without a comma is used as-is for both columns.
df['First Name Eng'] = df['Eng Name'].apply(lambda x: x.split(',')[1].strip() if ',' in x else x.strip())
df['Last Name Eng'] = df['Eng Name'].apply(lambda x: x.split(',')[0].strip() if ',' in x else x.strip())

Split First and Last Name Chi

In [75]:
# The first character of the Chinese name is used as the last name and the
# remaining characters as the first name; an empty name yields "" for both.
df['First Name Chi'] = df['Chi Name'].apply(lambda full: "" if not full else full[1:])
df['Last Name Chi'] = df['Chi Name'].apply(lambda full: "" if not full else full[0])

Add title

In [76]:
# Every row gets the same title, since this table only holds senior counsel.
df['Title'] = 'Senior Counsel'

Order the columns

In [77]:
# Show the current column names before choosing the final column order.
df.columns

Index(['Member', 'Address', 'Tel.No', 'Fax.No', 'Call', 'Name of Chambers',
       'E-mail', 'Website', 'Quals', 'Mobile', 'Category', 'Organization',
       'Gender', 'Eng Name', 'Chi Name', 'First Name Eng', 'Last Name Eng',
       'First Name Chi', 'Last Name Chi', 'Title'],
      dtype='object')

In [78]:
# Final column order: Chinese name fields, English name fields, descriptive
# labels, professional details, then contact details.
# reindex is used instead of df[[...]] because several fields only exist when
# at least one crawled page provides them; a field that is missing everywhere
# becomes an empty (NaN) column instead of raising KeyError.
df = df.reindex(columns=[
    'Last Name Chi','First Name Chi','Chi Name',
    'Last Name Eng','First Name Eng','Eng Name','Member',
    'Gender','Title','Category','Organization',
    'Name of Chambers','Quals','Call',
    'Address','Mobile','Tel.No','Fax.No','E-mail','Website',
])

Save to Excel and CSV files

In [79]:
# to_excel no longer takes an `encoding` argument: it was deprecated in
# pandas 1.5 and is rejected by pandas >= 2.0. It was only ever needed for the
# legacy xlwt writer, so dropping it does not change the xlsx file.
df.to_excel('香港大律师公会_SeniorCounsel.xlsx')
# utf_8_sig writes a byte-order mark at the start of the CSV, which helps
# spreadsheet software recognise the file as UTF-8.
df.to_csv('香港大律师公会_SeniorCounsel.csv',encoding='utf_8_sig')