In [1]:
import requests
import json
import time
from bs4 import BeautifulSoup, Comment
import IPython.display as display
import pickle
import pandas as pd
import re

## Roadmap

In [182]:
def url_to_root(url):
    headers = {'Accept-Encoding': 'identity',
               'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36'
                }
    response = requests.get(url, headers=headers)
    print(response.status_code)
    
    root = BeautifulSoup(response.text, 'html5lib')
    return root

original_url = 'https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp'
root = url_to_root(original_url)

200


## Hieachy 1: parent court categories

In [261]:
def parse_courts(root):
    """Parse the info of parent courts.
    Args:
        root : BeautifulSoup Object
    Return:
        courts_dict (dict): court -> url
    """
    # get all raw infomation about courts (looks urgly)
    info_list = root.find('script', string=re.compile("var myMenu")).text.split("'")
    # find the parent courts' name inside 'span' tag
    courts_list = [BeautifulSoup(i).find('span').text.strip(" ") for i in info_list if i[1:5]=='span']
    # find corresponding urls of parents courts(order matters here)
    courts_urls = [j for j in info_list if j[:5]=='https']
    # store to dict
    courts_dict = {i:j for i, j in zip(courts_list, courts_urls)}
    return courts_dict

courts = parse_courts(root)
courts

{'Court of Final Appeal': 'https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?L1=FA#H1',
 'Court of Appeal of the High Court': 'https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?L1=CA#H2',
 'High Court': 'https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?L1=HC#H3',
 'Competition Tribunal': 'https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?L1=CT#H4',
 'District Court': 'https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?L1=DC#H5',
 'Family Court': 'https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?L1=FC#H6',
 'Lands Tribunal': 'https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?L1=LD#H7',
 'Miscellaneous': 'https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?L1=OT#H8'}

In [262]:
df_courts = pd.DataFrame([(k,v) for k,v in courts.items()], columns=['parent_court', 'parent_court_url'])
df_courts

Unnamed: 0,parent_court,parent_court_url
0,Court of Final Appeal,https://legalref.judiciary.hk/lrs/common/ju/ju...
1,Court of Appeal of the High Court,https://legalref.judiciary.hk/lrs/common/ju/ju...
2,High Court,https://legalref.judiciary.hk/lrs/common/ju/ju...
3,Competition Tribunal,https://legalref.judiciary.hk/lrs/common/ju/ju...
4,District Court,https://legalref.judiciary.hk/lrs/common/ju/ju...
5,Family Court,https://legalref.judiciary.hk/lrs/common/ju/ju...
6,Lands Tribunal,https://legalref.judiciary.hk/lrs/common/ju/ju...
7,Miscellaneous,https://legalref.judiciary.hk/lrs/common/ju/ju...


## Hieachy 2: sub courts

In [273]:
url =  'https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?L1=FA#H1'
root = url_to_root(url)
        
# get sub courts names
# get raw infomation
info_list = root.find('script', string=re.compile("var myMenu")).text.split(",")


200


In [276]:
parent_courts_set = set(df_courts['parent_court'])
parent_courts_set

{'Competition Tribunal',
 'Court of Appeal of the High Court',
 'Court of Final Appeal',
 'District Court',
 'Family Court',
 'High Court',
 'Lands Tribunal',
 'Miscellaneous'}

In [277]:
parent_courts_set = set(df_courts['parent_court'])
now = set([BeautifulSoup(i.strip("' ")).text.strip(" ") for i in info_list if len(re.findall('<a name=', i))!=0])
sub_courts = list(now - parent_courts_set)


{'Final Appeal (Civil)',
 'Final Appeal (Criminal)',
 'Miscellaneous Proceedings',
 'Miscellaneous Proceedings (Civil)',
 'Miscellaneous Proceedings (Criminal)'}

In [278]:
def parse_sub_courts(courts):
    """Parse sub-courts information.
    Args:
        courts (dict) : parent_courts -> url
    Return:
        sub_dict (dict) : parent_courts -> dictionary(sub_courts -> url) 
    
    """
    sub_dict = {} # initialize
    parent_courts_set = set(df_courts['parent_court'])
    
    for c, url in courts.items():
        time.sleep(1) # request web slowly for benevolence
        root = url_to_root(url)
        
        # get sub courts names
        # get raw infomation
        info_list = root.find('script', string=re.compile("var myMenu")).text.split(",")
        # create bs4 object list for strings has tag "a" and attr name
        now = set([BeautifulSoup(i.strip("' ")).text.strip(" ") for i in info_list if len(re.findall('<a name=', i))!=0])
        sub_courts = list(now - parent_courts_set)

#         # alternative way for sepcial case
#         if c == 'Miscellaneous':
#             # get raw infomation
#             info_list = root.find('script', string=re.compile("var myMenu")).text.split(",") # notice split by comma
#             # get text of bs4 object which has tag "a" and attr name and cut first eight
#             sub_courts = [BeautifulSoup(i[2:-1]).text for i in info_list if len(re.findall('<a name=', i))!=0][8:]
        print('subcourts:',len(sub_courts))
        
        # get sub courts urls
        # re-get raw infomation
        info_list = root.find('script', string=re.compile("var myMenu")).text.split("'") # notice split by up-comma
        # observe sub-courts' urls are longer than parents', so find sub-courts' urls' length
        http_min_len = min(set([len(i) for i in info_list if i[:5]=='https']))
        # get sub-courts' urls
        sub_courts_urls = [i for i in info_list if i[:5]=='https' if len(i)>http_min_len]
        print('urls', len(sub_courts_urls))
        # store in dict
        subs = {sc.strip(" "):url for sc, url in zip(sub_courts, sub_courts_urls)}
        # assign to parent courts
        print('dict length: ',len(subs))
        sub_dict[c] = subs
    return sub_dict

sub_dict = parse_sub_courts(courts)
sub_dict

200
subcourts: 5
urls 5
dict length:  5
200
subcourts: 6
urls 6
dict length:  6
200
subcourts: 31
urls 31
dict length:  31
200
subcourts: 2
urls 2
dict length:  2
200
subcourts: 11
urls 11
dict length:  11
200
subcourts: 3
urls 3
dict length:  3
200
subcourts: 19
urls 19
dict length:  19
200
subcourts: 11
urls 11
dict length:  11


{'Court of Final Appeal': {'Miscellaneous Proceedings': 'https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?L1=FA&L2=CV&AR=1#A1',
  'Final Appeal (Civil)': 'https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?L1=FA&L2=CC&AR=2#A2',
  'Miscellaneous Proceedings (Civil)': 'https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?L1=FA&L2=MP&AR=3#A3',
  'Miscellaneous Proceedings (Criminal)': 'https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?L1=FA&L2=MV&AR=4#A4',
  'Final Appeal (Criminal)': 'https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?L1=FA&L2=MC&AR=5#A5'},
 'Court of Appeal of the High Court': {'Criminal Appeal': 'https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?L1=CA&L2=AR&AR=1#A1',
  'Civil Appeal': 'https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?L1=CA&L2=AG&AR=2#A2',
  'Miscellaneous Proceedings': 'https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?L1=CA&L2=CV&AR=3#A3',
  'Application for Review': 'https://legalref.judiciary.hk/l

In [279]:
def df_for_sub(sub_dict):
    return pd.DataFrame([(k,v) for k,v in sub_dict.items()], columns=['sub_court', 'sub_court_url'])

dfs = []
for p_court, s_dict in sub_dict.items():
    sub_df = df_for_sub(s_dict)
    sub_df['parent_court'] = p_court
    dfs.append(sub_df)
#     print(sub_df)
df_all = pd.concat(dfs, axis=0)
df_all.index = range(len(df_all))
df_all = pd.merge(df_all, df_courts, how='left', on='parent_court')
df_all.head()

Unnamed: 0,sub_court,sub_court_url,parent_court,parent_court_url
0,Miscellaneous Proceedings,https://legalref.judiciary.hk/lrs/common/ju/ju...,Court of Final Appeal,https://legalref.judiciary.hk/lrs/common/ju/ju...
1,Final Appeal (Civil),https://legalref.judiciary.hk/lrs/common/ju/ju...,Court of Final Appeal,https://legalref.judiciary.hk/lrs/common/ju/ju...
2,Miscellaneous Proceedings (Civil),https://legalref.judiciary.hk/lrs/common/ju/ju...,Court of Final Appeal,https://legalref.judiciary.hk/lrs/common/ju/ju...
3,Miscellaneous Proceedings (Criminal),https://legalref.judiciary.hk/lrs/common/ju/ju...,Court of Final Appeal,https://legalref.judiciary.hk/lrs/common/ju/ju...
4,Final Appeal (Criminal),https://legalref.judiciary.hk/lrs/common/ju/ju...,Court of Final Appeal,https://legalref.judiciary.hk/lrs/common/ju/ju...


## Hiearchy 3: years

In [313]:
sub_court = 'Final Appeal (Civil)'

In [328]:
list(df_all[df_all['sub_court']==sub_court]['sub_court_url'])[0]

'https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?L1=FA&L2=CC&AR=2#A2'

In [394]:
def subCourt_get_years(sub_court, df_all, all_courts):
    url = list(df_all[df_all['sub_court']==sub_court]['sub_court_url'])[0]
#     print(url)
    root = url_to_root(url)
    info_list = root.find('script', string=re.compile("var myMenu")).text.split(",")
    
    now_set = set([BeautifulSoup(i.strip("' ")).text.strip(" ") for i in info_list if len(re.findall('<a name=', i))!=0])
    years = now_set - all_courts
    return list(years)

courts_set = set(df_all['parent_court']) | set(df_all['sub_court'])
years = subCourt_get_years(sub_court, df_all, courts_set)
years

200


['2017', '2015', '2018', '2016', 'Pre2013', '2013', '2014']

In [395]:
def subCourt_get_urls(sub_court, df_all, urls_set):
    url = list(df_all[df_all['sub_court']==sub_court]['sub_court_url'])[0]
    root = url_to_root(url)
    info_list = root.find('script', string=re.compile("var myMenu")).text.split(",")
    
    potential_urls = set([i.strip("' ") for i in info_list if i.strip("' ")[:5]=='https'])
    urls = potential_urls - urls_set
    return list(urls)

urls_set = set(df_all['sub_court_url']) | set(df_all['parent_court_url'])
urls = subCourt_get_urls(sub_court, df_all, urls_set)  
urls

200


['https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?EX=&L1=FA&L2=CC&L3=2017&AR=2_2#A2_2',
 'https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?EX=&L1=FA&L2=CC&L3=2014&AR=2_5#A2_5',
 'https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?EX=&L1=FA&L2=CC&L3=2016&AR=2_3#A2_3',
 'https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?EX=&L1=FA&L2=CC&L3=2018&AR=2_1#A2_1',
 'https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?EX=&L1=FA&L2=CC&L3=2013&AR=2_6#A2_6',
 'https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?EX=T&L1=FA&L2=CC&AR=2#A2',
 'https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?EX=&L1=FA&L2=CC&L3=2015&AR=2_4#A2_4']

In [396]:
def match_nomals(years, urls, match_dict):
    url_pairs = [(re.findall('L3=....', i)[0][-4:], i) for i in urls]
    for y in years:
        for y_url, url in url_pairs:
            if y == y_url:
                match_dict[y] = url
    return match_dict


In [407]:
def match_year_urls(years, urls):
    match_dict = {} # initialize
    # check if has special('Pre****')
    special_ = [years[idx] for idx, y in enumerate(years) if len(re.findall('Pre', y))!=0]
#     print(special_)
    if len(special_) != 0:
        has_special = True
        # kick out special ones
        special_year = special_[0]
        normal_years = [y for y in years if len(re.findall('Pre', y))==0]
        normal_urls = [u for idx, u in enumerate(urls) if len(re.findall('L3=....', u))!=0]
        special_url = [u for idx, u in enumerate(urls) if len(re.findall('L3=....', u))==0][0]
    # normal case
    else: 
        has_special = False
        normal_years = years
        normal_urls = urls

    # match normals
    match_dict = match_nomals(normal_years, normal_urls, match_dict)
    
    # add special case
    if has_special:
        match_dict[special_year] = special_url
    return match_dict     
    
match_year_urls(years, urls)

['Pre2013']
['2017', '2015', '2018', '2016', '2013', '2014']


{'2017': 'https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?EX=&L1=FA&L2=CC&L3=2017&AR=2_2#A2_2',
 '2015': 'https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?EX=&L1=FA&L2=CC&L3=2015&AR=2_4#A2_4',
 '2018': 'https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?EX=&L1=FA&L2=CC&L3=2018&AR=2_1#A2_1',
 '2016': 'https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?EX=&L1=FA&L2=CC&L3=2016&AR=2_3#A2_3',
 '2013': 'https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?EX=&L1=FA&L2=CC&L3=2013&AR=2_6#A2_6',
 '2014': 'https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?EX=&L1=FA&L2=CC&L3=2014&AR=2_5#A2_5',
 'Pre2013': 'https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?EX=T&L1=FA&L2=CC&AR=2#A2'}

In [None]:
def get_year_urls(sub_court, df_all, urls_set, ):
    
    urls = subCourt_get_years(sub_court, df_all, urls_set)
    
    df = pd.DataFrame()

    
urls_set = set(df_all['sub_court_url']) | set(df_all['parent_court_url'])

In [284]:
all_courts = set(df_all['parent_court']) | set(df_all['sub_court'])

In [288]:
url = df_all.loc[0]['sub_court_url']
root = url_to_root(url)
info_list = root.find('script', string=re.compile("var myMenu")).text.split(",")


200


In [None]:
urls_set = set(df_all['sub_court_url']) | set(df_all['parent_court_url'])
urls_set

In [309]:
set(potential_urls) - urls_set

{'https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?EX=&L1=FA&L2=CV&L3=2013&AR=1_6#A1_6',
 'https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?EX=&L1=FA&L2=CV&L3=2014&AR=1_5#A1_5',
 'https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?EX=&L1=FA&L2=CV&L3=2015&AR=1_4#A1_4',
 'https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?EX=&L1=FA&L2=CV&L3=2016&AR=1_3#A1_3',
 'https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?EX=&L1=FA&L2=CV&L3=2017&AR=1_2#A1_2',
 'https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?EX=&L1=FA&L2=CV&L3=2018&AR=1_1#A1_1',
 'https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?EX=T&L1=FA&L2=CV&AR=1#A1'}

In [306]:
potential_urls = [i.strip("' ") for i in info_list if i.strip("' ")[:5]=='https']
[len(i) for i in potential_urls]

[65,
 76,
 92,
 92,
 92,
 92,
 92,
 92,
 81,
 76,
 76,
 76,
 76,
 65,
 65,
 65,
 65,
 65,
 65,
 65]

In [289]:
c = set([BeautifulSoup(i.strip("' ")).text.strip(" ") for i in info_list if len(re.findall('<a name=', i))!=0])

In [290]:
c - all_courts

{'2013', '2014', '2015', '2016', '2017', '2018', 'Pre2013'}

In [263]:
B = BeautifulSoup(root.find('script', string=re.compile("var myMenu")).text.split('],[')[6])

In [295]:
B.find('td')

<td> <table bgcolor="#F7F5F0" border="\'1\'" bordercolor="\'#CAB484\'" cellpadding="\'2\'" cellspacing="\'0\'" width="\'100%\'"> <tr> <td valign="top" width="5%"><img alt="\'Judgment" chinese="" english="" in="" src="\'https://legalref.judiciary.hk/lrs/images/ThemeXP/CTpage.gif\'" traditional="" translation="" with=""/></td> <td valign="top" width="40%"> <a class="ThemeXPRowAnchor" href="javascript:matchpop(\'https://legalref.judiciary.hk/lrs/common/ju/ju_frame.jsp?DIS=93798\',\'matchpage\');">CAAR2/2014</a><br/><font color="#006633">(07/07/2014)</font><br/><img height="1" src="\'https://legalref.judiciary.hk/lrs/images/spacer.gif\'" width="142"/></td> <td class="\'chinesefont\'" valign="top" width="55%">律政司司長 訴 林裕偉 <img src="\'/lrs/images/ThemeXP/star.gif\'"/>  Reported in :<font color="#8C029B">[2015] 1 HKLRD 393</font><br/><img height="2" src="\'https://legalref.judiciary.hk/lrs/images/spacer.gif\'" width="348"/></td> </tr></table></td>

In [306]:
B.find('td').find('img')['src']

"\\'https://legalref.judiciary.hk/lrs/images/ThemeXP/CTpage.gif\\'"

In [291]:
B.find('td').find('a').text

'CAAR2/2014'

In [284]:
B.find('td').find('font')

<font color="#006633">(07/07/2014)</font>

In [290]:
B.find('td').find('font').find_next('td').text

'律政司司長 訴 林裕偉  \xa0Reported in :[2015] 1 HKLRD 393'

In [285]:
B.find('td')

<td> <table bgcolor="#F7F5F0" border="\'1\'" bordercolor="\'#CAB484\'" cellpadding="\'2\'" cellspacing="\'0\'" width="\'100%\'"> <tr> <td valign="top" width="5%"><img alt="\'Judgment" chinese="" english="" in="" src="\'https://legalref.judiciary.hk/lrs/images/ThemeXP/CTpage.gif\'" traditional="" translation="" with=""/></td> <td valign="top" width="40%"> <a class="ThemeXPRowAnchor" href="javascript:matchpop(\'https://legalref.judiciary.hk/lrs/common/ju/ju_frame.jsp?DIS=93798\',\'matchpage\');">CAAR2/2014</a><br/><font color="#006633">(07/07/2014)</font><br/><img height="1" src="\'https://legalref.judiciary.hk/lrs/images/spacer.gif\'" width="142"/></td> <td class="\'chinesefont\'" valign="top" width="55%">律政司司長 訴 林裕偉 <img src="\'/lrs/images/ThemeXP/star.gif\'"/>  Reported in :<font color="#8C029B">[2015] 1 HKLRD 393</font><br/><img height="2" src="\'https://legalref.judiciary.hk/lrs/images/spacer.gif\'" width="348"/></td> </tr></table></td>