In [1]:
import re
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
from urllib.parse import parse_qs, urlparse
from datetime import datetime
import time
import random

# I. Crawl Information

## English version 

Fetch the HTML of the main page. The website stores the names in a JavaScript DataTable.

In [2]:
# Download the English pupil list page and parse it into `soup`.
eng_url = "https://www.hkba.org/Bar-List/pupils"
session = requests.Session()
# A timeout keeps this cell from hanging indefinitely if the server never answers.
eng_html = session.get(eng_url, timeout=30)
# Stop here on a 4xx/5xx reply instead of parsing an error page into zero rows.
eng_html.raise_for_status()
soup = BeautifulSoup(eng_html.text, 'html.parser')

In [24]:
def _address_after_marker(cell_text):
    """Return the stripped text that follows the first '/o' in an address cell.

    The marker is presumably the tail of a "c/o" prefix -- TODO confirm against
    the live page. When the marker is absent the whole cell text is returned
    (stripped), so a single unusually formatted row no longer aborts the cell
    with an IndexError.
    """
    pieces = cell_text.split('/o')
    chosen = pieces[1] if len(pieces) > 1 else cell_text
    return chosen.strip()


# Collect the three table columns from the parsed English page.
name = soup.find_all('td', {'class': 'views-field views-field-field-name'})
commence_date = soup.find_all('td', {'class': 'views-field views-field-field-pupillage-commencement'})
address = soup.find_all('td', {'class': 'views-field views-field-field-chamber-address'})

# One record per table row; zip pairs the i-th cell of each column.
en_data = [{'Member': n.text.strip(),
            'Commence Date': d.text.strip(),
            'Address': _address_after_marker(a.text)}
           for n, d, a in zip(name, commence_date, address)]

Save the data to a DataFrame

In [67]:
# Turn the list of scraped records into a table and report (rows, columns).
en_df = pd.DataFrame(data=en_data)
print((len(en_df.index), len(en_df.columns)))

(89, 3)


## Chinese version 

In [26]:
# Download the Traditional Chinese pupil list page and parse it into `soup`
# (this rebinds `soup`, replacing the English page parsed earlier).
cn_url = "https://www.hkba.org/zh-hant/Bar-List/pupils"
session = requests.Session()
# A timeout keeps this cell from hanging indefinitely if the server never answers.
cn_html = session.get(cn_url, timeout=30)
# Stop here on a 4xx/5xx reply instead of parsing an error page into zero rows.
cn_html.raise_for_status()
soup = BeautifulSoup(cn_html.text, 'html.parser')

In [38]:
def _address_from_third_line(cell_text):
    """Return the stripped third line of an address cell.

    The address is taken from index 2 after splitting the cell text on
    newlines. When the cell has fewer than three lines the whole cell text is
    returned (stripped) instead of raising an IndexError.
    """
    lines = cell_text.split('\n')
    chosen = lines[2] if len(lines) > 2 else cell_text
    return chosen.strip()


# Collect the three table columns from the parsed Chinese page.
name = soup.find_all('td', {'class': 'views-field views-field-field-name'})
commence_date = soup.find_all('td', {'class': 'views-field views-field-field-pupillage-commencement'})
address = soup.find_all('td', {'class': 'views-field views-field-field-chamber-address'})

# One record per table row; zip pairs the i-th cell of each column.
cn_data = [{'Member': n.text.strip(),
            'Commence Date': d.text.strip(),
            'Address': _address_from_third_line(a.text)}
           for n, d, a in zip(name, commence_date, address)]

Save the data to a DataFrame

In [39]:
# Turn the list of scraped records into a table, then print a preview of the
# first rows followed by the (rows, columns) shape.
cn_df = pd.DataFrame(data=cn_data)
print(cn_df.head(), cn_df.shape, sep='\n')

      Member Commence Date                           Address
0        包德偉    01.12.2019     (臨時辦事處) 香港中環 都爹利街11號 帝納大廈901室
1        陳振達    01.12.2019   (臨時辦事處) 香港中環 夏慤道12號 美國銀行中心3403室
2  陳曉姸  (女士)    01.09.2019     (臨時辦事處) 香港金鐘 金鐘道89號 力寶中心第1期9樓
3        陳珽浺    01.12.2019  (臨時辦事處) 香港金鐘 金鐘道89號 力寶中心第1期1401室
4        陳文軒    01.09.2019    (臨時辦事處) 香港金鐘 金鐘道88號 太古廣場第1期16樓
(89, 3)


Merge the two DataFrames

In [77]:
# Join the English and Chinese tables row-by-row on their index. Keep the
# English date and address plus both name columns, then add constant columns.
# NOTE(review): joining on the index assumes both pages list the pupils in the
# same order -- confirm.
df = (
    pd.merge(en_df, cn_df, how='outer', left_index=True, right_index=True)
    .drop(columns=['Commence Date_y', 'Address_y'])
    .rename(columns={
        'Member_x': 'Eng Name',
        'Commence Date_x': 'Commence Date',
        'Address_x': 'Address',
        'Member_y': 'Chi Name',
    })
    .assign(
        Category='Professional Qualifications',
        Organization='Hong Kong Bar Association',
        Title='Pupil',
    )
)

## Data cleaning

Change date format

In [78]:
# Parse the day.month.year strings (e.g. "01.12.2019") into datetime values.
df['Commence Date'] = df['Commence Date'].pipe(pd.to_datetime, format='%d.%m.%Y')

Add gender

In [79]:
# Label a row 'F' when the English name contains "Ms", otherwise 'M'.
# NOTE(review): this is a plain substring test and every name without "Ms"
# is labelled 'M' -- confirm that matches how the source marks titles.
df['Gender'] = df['Eng Name'].map(
    lambda full_name: 'M' if 'Ms' not in full_name else 'F'
)

Split First and Last Name Eng, delete (Ms) in first name if necessary

In [80]:
# Split the English name on its comma: the part before it is the last name,
# the part after it is the first name. A name without a comma is used whole
# for both columns.
# Any text from the first "(" onward (e.g. a "(Ms)" suffix) is cut from the
# first name. Both parts are stripped, so the space after the comma and the
# space before "(" do not end up in the output columns.
df['First Name Eng'] = (
    df['Eng Name']
    .apply(lambda x: x.split(',')[1] if ',' in x else x)
    .apply(lambda x: x[:x.find('(')] if '(' in x else x)
    .str.strip()
)
df['Last Name Eng'] = (
    df['Eng Name']
    .apply(lambda x: x.split(',')[0] if ',' in x else x)
    .str.strip()
)

If the name taken from the Chinese page is written in English letters, replace it with an empty string

In [74]:
def search_eng_character(name):
    """Blank out a name that contains any ASCII letter.

    Parameters
    ----------
    name : str or any
        The candidate name. Non-string values (e.g. None or a NaN produced by
        an outer merge) are treated as "no usable name".

    Returns
    -------
    str
        "" when `name` is not a string or contains at least one character in
        a-z / A-Z; otherwise `name` unchanged.
    """
    # re.search would raise TypeError on a float NaN / None, so guard first.
    if not isinstance(name, str):
        return ""
    return "" if re.search('[a-zA-Z]', name) else name

In [82]:
# Run every 'Chi Name' value through search_eng_character, which returns ""
# for names containing English letters and leaves the others unchanged.
df['Chi Name'] = df['Chi Name'].map(search_eng_character)

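Split the Chinese name into last name (the first character) and first name (the rest), removing "(女士)" from the first name where present. Names that were blanked because they are in English stay empty.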

In [85]:
# First name = everything after the first character, with any text from the
# first "(" onward (e.g. "(女士)") removed. Empty names stay empty.
# The result is stripped because the source separates the name from "(女士)"
# with spaces, which would otherwise remain at the end of the first name.
# NOTE(review): taking a single character as the last name will split
# two-character surnames incorrectly -- confirm whether that matters here.
df['First Name Chi'] = (
    df['Chi Name']
    .apply(lambda x: x[1:] if x else "")
    .apply(lambda x: x[:x.find('(')] if '(' in x else x)
    .str.strip()
)
# Last name = the first character of the Chinese name ("" for empty names).
df['Last Name Chi'] = df['Chi Name'].apply(lambda x: x[0] if x else "")

Order the columns

In [87]:
# Display the current column labels of df (handy for checking the exact names
# before selecting and reordering columns).
df.columns

Index(['Eng Name', 'Commence Date', 'Address', 'Chi Name', 'Category',
       'Organization', 'Title', 'Gender', 'First Name Eng', 'Last Name Eng',
       'First Name Chi', 'Last Name Chi'],
      dtype='object')

In [88]:
# Keep only the listed columns, in this order: Chinese name parts, English
# name parts, then the descriptive fields. Columns that are not listed
# (such as 'Address') are left out of the result.
df = df.loc[:, [
    'Last Name Chi', 'First Name Chi', 'Chi Name',
    'Last Name Eng', 'First Name Eng', 'Eng Name',
    'Gender', 'Title', 'Commence Date',
    'Category', 'Organization',
]]

Save to Excel and CSV files

In [90]:
# Write the final table to an Excel workbook. No `encoding` argument is passed:
# to_excel deprecated that keyword in pandas 1.5 and removed it in 2.0, where
# passing it raises a TypeError.
df.to_excel('香港大律师公会_Pupil.xlsx')
# Write the same table as CSV. The 'utf_8_sig' codec prefixes the file with a
# UTF-8 byte-order mark (presumably so spreadsheet applications detect the
# encoding of the Chinese text -- confirm).
df.to_csv('香港大律师公会_Pupil.csv', encoding='utf_8_sig')