In [1]:
import requests
import json
import time
from bs4 import BeautifulSoup, Comment
import IPython.display as display
import pickle
import pandas as pd
import re

## Roadmap

In [2]:
def url_to_root(url):
    headers = {'Accept-Encoding': 'identity',
               'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36'
                }
    response = requests.get(url, headers=headers)
    print(response.status_code)
    
    root = BeautifulSoup(response.text, 'html5lib')
    return root

original_url = 'https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp'
root = url_to_root(original_url)

200


## Hieachy 1: parent court categories

In [3]:
def parse_courts(root):
    """Parse the info of parent courts.
    Args:
        root : BeautifulSoup Object
    Return:
        courts_dict (dict): court -> url
    """
    # get all raw infomation about courts (looks urgly)
    info_list = root.find('script', string=re.compile("var myMenu")).text.split("'")
    # find the parent courts' name inside 'span' tag
    courts_list = [BeautifulSoup(i).find('span').text.strip(" ") for i in info_list if i[1:5]=='span']
    # find corresponding urls of parents courts(order matters here)
    courts_urls = [j for j in info_list if j[:5]=='https']
    # store to dict
    courts_dict = {i:j for i, j in zip(courts_list, courts_urls)}
    return courts_dict

courts = parse_courts(root)
courts

{'Court of Final Appeal': 'https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?L1=FA#H1',
 'Court of Appeal of the High Court': 'https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?L1=CA#H2',
 'High Court': 'https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?L1=HC#H3',
 'Competition Tribunal': 'https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?L1=CT#H4',
 'District Court': 'https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?L1=DC#H5',
 'Family Court': 'https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?L1=FC#H6',
 'Lands Tribunal': 'https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?L1=LD#H7',
 'Miscellaneous': 'https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?L1=OT#H8'}

In [4]:
df_courts = pd.DataFrame([(k,v) for k,v in courts.items()], columns=['parent_court', 'parent_court_url'])
df_courts

Unnamed: 0,parent_court,parent_court_url
0,Court of Final Appeal,https://legalref.judiciary.hk/lrs/common/ju/ju...
1,Court of Appeal of the High Court,https://legalref.judiciary.hk/lrs/common/ju/ju...
2,High Court,https://legalref.judiciary.hk/lrs/common/ju/ju...
3,Competition Tribunal,https://legalref.judiciary.hk/lrs/common/ju/ju...
4,District Court,https://legalref.judiciary.hk/lrs/common/ju/ju...
5,Family Court,https://legalref.judiciary.hk/lrs/common/ju/ju...
6,Lands Tribunal,https://legalref.judiciary.hk/lrs/common/ju/ju...
7,Miscellaneous,https://legalref.judiciary.hk/lrs/common/ju/ju...


## Hieachy 2: sub courts

In [5]:
url = 'https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?L1=DC#H5'
root = url_to_root(url)
        
# get raw infomation
info_list = root.find('script', string=re.compile("var myMenu")).text.split(",")

200


In [6]:
parent_courts_set = set(df_courts['parent_court'])
parent_courts_set

{'Competition Tribunal',
 'Court of Appeal of the High Court',
 'Court of Final Appeal',
 'District Court',
 'Family Court',
 'High Court',
 'Lands Tribunal',
 'Miscellaneous'}

In [7]:
def clear_name(a):
    ans = ' '.join([i.strip("\\s'")for i in a.split(" ")])
    return ans

parent_courts_set = set(df_courts['parent_court'])
now = set([BeautifulSoup(i.strip("' ")).text.strip(" ") for i in info_list if len(re.findall('<a name=', i))!=0])
sub_courts = sorted(list(now - parent_courts_set))
sub_courts = [clear_name(s) for s in sub_courts]
sub_courts

['Civil Action',
 'Criminal Case',
 'Distraint Case',
 'District Court Tax Claim',
 'Employee Compensation Case',
 'Equal Opportunitie Action',
 'Intended Action',
 'Miscellaneou Proceeding',
 'Occupational Deafne (Compensation) Appeal',
 'Personal Injurie Action',
 'Stamp Duty Appeal']

In [8]:
def parse_sub_courts(courts, parent_courts_set):
    """Parse sub-courts information.
    Args:
        courts (dict) : parent_courts -> url
    Return:
        sub_dict (dict) : parent_courts -> dictionary(sub_courts -> url) 
    
    """
    sub_dict = {} # initialize

    for c, url in courts.items():
        time.sleep(1) # request web slowly for benevolence
        root = url_to_root(url)
        
        # get sub courts names
        # get raw infomation
        info_list = root.find('script', string=re.compile("var myMenu")).text.split(",")
        # create bs4 object list for strings has tag "a" and attr name
        now = set([BeautifulSoup(i.strip("' ")).text.strip(" ") for i in info_list if len(re.findall('<a name=', i))!=0])
        sub_courts = sorted(list(now - parent_courts_set)) # notice order in alphbets
        sub_courts_cleared = [clear_name(s) for s in sub_courts] # clear names
        print('subcourts:',len(sub_courts))
        
        # get sub courts urls
        # re-get raw infomation
        info_list = root.find('script', string=re.compile("var myMenu")).text.split("'") # notice split by up-comma
        # observe sub-courts' urls are longer than parents', so find sub-courts' urls' length
        http_min_len = min(set([len(i) for i in info_list if i[:5]=='https']))
        # get sub-courts' urls
        sub_courts_urls = [i for i in info_list if i[:5]=='https' if len(i)>http_min_len]
        print('urls:', len(sub_courts_urls))
        # store in dict
        subs = {sc.strip(" "):(sc_, url) for sc, sc_, url in zip(sub_courts, sub_courts_cleared, sub_courts_urls)}
        # assign to parent courts
        print('dict length: ',len(subs))
        sub_dict[c] = subs
    return sub_dict

parent_courts_set = set(df_courts['parent_court'])
sub_dict = parse_sub_courts(courts, parent_courts_set)
sub_dict

200
subcourts: 5
urls: 5
dict length:  5
200
subcourts: 6
urls: 6
dict length:  6
200
subcourts: 31
urls: 31
dict length:  31
200
subcourts: 2
urls: 2
dict length:  2
200
subcourts: 11
urls: 11
dict length:  11
200
subcourts: 3
urls: 3
dict length:  3
200
subcourts: 19
urls: 19
dict length:  19
200
subcourts: 11
urls: 11
dict length:  11


{'Court of Final Appeal': {'Final Appeal (Civil)': ('Final Appeal (Civil)',
   'https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?L1=FA&L2=CV&AR=1#A1'),
  'Final Appeal (Criminal)': ('Final Appeal (Criminal)',
   'https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?L1=FA&L2=CC&AR=2#A2'),
  'Miscellaneous Proceedings': ('Miscellaneou Proceeding',
   'https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?L1=FA&L2=MP&AR=3#A3'),
  'Miscellaneous Proceedings (Civil)': ('Miscellaneou Proceeding (Civil)',
   'https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?L1=FA&L2=MV&AR=4#A4'),
  'Miscellaneous Proceedings (Criminal)': ('Miscellaneou Proceeding (Criminal)',
   'https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?L1=FA&L2=MC&AR=5#A5')},
 'Court of Appeal of the High Court': {'Application for Review': ('Application for Review',
   'https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?L1=CA&L2=AR&AR=1#A1'),
  "Attorney General\\'s Reference": ('Attorney General 

In [9]:
def df_for_sub(sub_dict):
    return pd.DataFrame([(k,v[0],v[1]) for k,v in sub_dict.items()], columns=['sub_court', 'sub_court_cleared', 'sub_court_url'])

dfs = []
for p_court, s_dict in sub_dict.items():
    sub_df = df_for_sub(s_dict)
    sub_df['parent_court'] = p_court
    dfs.append(sub_df)
    
df_all = pd.concat(dfs, axis=0)
df_all.index = range(len(df_all))
df_all = pd.merge(df_all, df_courts, how='left', on='parent_court')
df_all.head()

Unnamed: 0,sub_court,sub_court_cleared,sub_court_url,parent_court,parent_court_url
0,Final Appeal (Civil),Final Appeal (Civil),https://legalref.judiciary.hk/lrs/common/ju/ju...,Court of Final Appeal,https://legalref.judiciary.hk/lrs/common/ju/ju...
1,Final Appeal (Criminal),Final Appeal (Criminal),https://legalref.judiciary.hk/lrs/common/ju/ju...,Court of Final Appeal,https://legalref.judiciary.hk/lrs/common/ju/ju...
2,Miscellaneous Proceedings,Miscellaneou Proceeding,https://legalref.judiciary.hk/lrs/common/ju/ju...,Court of Final Appeal,https://legalref.judiciary.hk/lrs/common/ju/ju...
3,Miscellaneous Proceedings (Civil),Miscellaneou Proceeding (Civil),https://legalref.judiciary.hk/lrs/common/ju/ju...,Court of Final Appeal,https://legalref.judiciary.hk/lrs/common/ju/ju...
4,Miscellaneous Proceedings (Criminal),Miscellaneou Proceeding (Criminal),https://legalref.judiciary.hk/lrs/common/ju/ju...,Court of Final Appeal,https://legalref.judiciary.hk/lrs/common/ju/ju...


In [10]:
df_all.sub_court_cleared

0                                  Final Appeal (Civil)
1                               Final Appeal (Criminal)
2                               Miscellaneou Proceeding
3                       Miscellaneou Proceeding (Civil)
4                    Miscellaneou Proceeding (Criminal)
5                                Application for Review
6                            Attorney General Reference
7                                          Civil Appeal
8                                       Criminal Appeal
9                               Miscellaneou Proceeding
10                       Reservation of Question of Law
11                                     Admiralty Action
12                                 Adoption Application
13                                Application for Grant
14    Application to et aside a Statutory Demand (un...
15        Application under the Mental Health Ordinance
16                                Bankruptcy Proceeding
17                                              

## Hiearchy 3: years

In [11]:
sub_ = "West Kowloon Magistrate Court Charge Case"

In [12]:
list(df_all[df_all['sub_court_cleared']==sub_]['sub_court_url'])

['https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?L1=WK&L2=CC&AR=11#A11']

In [13]:
def subCourt_get_years(sc_cleared, df_all, all_courts):
    url = list(df_all[df_all['sub_court_cleared']==sc_cleared]['sub_court_url'])[0]
#     print(url)
    time.sleep(1)
    root = url_to_root(url)
    info_list = root.find('script', string=re.compile("var myMenu")).text.split(",")
    
    now_set = set([BeautifulSoup(i.strip("' ")).text.strip(" ") for i in info_list if len(re.findall('<a name=', i))!=0])
    years = now_set - all_courts
    return list(years)

courts_set = set(df_all['parent_court']) | set(df_all['sub_court'])
years = subCourt_get_years(sub_, df_all, courts_set)
years

200


['2017']

In [14]:
def subCourt_get_urls(sub_court, df_all, urls_set):
    url = list(df_all[df_all['sub_court_cleared']==sub_court]['sub_court_url'])[0]
    time.sleep(1)
    root = url_to_root(url)
    info_list = root.find('script', string=re.compile("var myMenu")).text.split(",")
    
    potential_urls = set([i.strip("' ") for i in info_list if i.strip("' ")[:5]=='https'])
    urls = potential_urls - urls_set
    return list(urls)

urls_set = set(df_all['sub_court_url']) | set(df_all['parent_court_url'])
urls = subCourt_get_urls(sub_, df_all, urls_set)  
urls

200


['https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?EX=T&L1=WK&L2=CC&L3=2017&AR=11_1#A11_1']

In [15]:
def match_nomals(years, urls, match_dict):
    url_pairs = [(re.findall('L3=....', i)[0][-4:], i) for i in urls]
    for y in years:
        for y_url, url in url_pairs:
            if y == y_url:
                match_dict[y] = url
    return match_dict


In [16]:
def match_year_urls(years, urls):
    match_dict = {} # initialize
    # check if has special('Pre****')
    special_ = [years[idx] for idx, y in enumerate(years) if len(re.findall('Pre', y))!=0]
#     print(special_)
    if len(special_) != 0:
        has_special = True
        # kick out special ones
        special_year = special_[0]
        normal_years = [y for y in years if len(re.findall('Pre', y))==0]
        normal_urls = [u for idx, u in enumerate(urls) if len(re.findall('L3=....', u))!=0]
        special_url = [u for idx, u in enumerate(urls) if len(re.findall('L3=....', u))==0][0]
    # normal case
    else: 
        has_special = False
        normal_years = years
        normal_urls = urls

    # match normals
    match_dict = match_nomals(normal_years, normal_urls, match_dict)
    
    # add special case
    if has_special:
        match_dict[special_year] = special_url
    return match_dict     
    
match_year_urls(years, urls)

{'2017': 'https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?EX=T&L1=WK&L2=CC&L3=2017&AR=11_1#A11_1'}

In [17]:
pd.set_option('max_colwidth',1000)

In [18]:
def get_year_urls(sub_court, df_all, courts_set, urls_set):
    # get years
    years = subCourt_get_years(sub_court, df_all, courts_set)
    # get urls
    urls = subCourt_get_urls(sub_court, df_all, urls_set)  
    # match them
    match_dict = match_year_urls(years, urls)
    # convert to DataFrame
    data = [(k,v) for k, v in match_dict.items()]
    match = pd.DataFrame(data, columns=['year', 'year_url'])
    match['sub_court_cleared'] = sub_court
    return match

courts_set = set(df_all['parent_court']) | set(df_all['sub_court'])
urls_set = set(df_all['sub_court_url']) | set(df_all['parent_court_url'])
df_test = get_year_urls(sub_, df_all, courts_set, urls_set)
df_test

200
200


Unnamed: 0,year,year_url,sub_court_cleared
0,2017,https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?EX=T&L1=WK&L2=CC&L3=2017&AR=11_1#A11_1,West Kowloon Magistrate Court Charge Case


In [19]:
from tqdm import tqdm

In [20]:
dfs = []
for sc in tqdm(df_all['sub_court_cleared']):
    print('for sub_court: ',sc)
    match = get_year_urls(sc, df_all, courts_set, urls_set)
    dfs.append(match)
    
df_years = pd.concat(dfs, axis=0)
print(df_years.shape)
df_years

  0%|                                                   | 0/88 [00:00<?, ?it/s]

for sub_court:  Final Appeal (Civil)
200
200


  1%|▍                                          | 1/88 [00:03<04:48,  3.31s/it]

for sub_court:  Final Appeal (Criminal)
200
200


  2%|▉                                          | 2/88 [00:23<12:04,  8.43s/it]

for sub_court:  Miscellaneou Proceeding
200
200


  3%|█▍                                         | 3/88 [00:27<10:07,  7.15s/it]

for sub_court:  Miscellaneou Proceeding (Civil)
200
200


  5%|█▉                                         | 4/88 [00:37<11:06,  7.94s/it]

for sub_court:  Miscellaneou Proceeding (Criminal)
200
200


  6%|██▍                                        | 5/88 [00:52<13:58, 10.10s/it]

for sub_court:  Application for Review
200
200


  7%|██▉                                        | 6/88 [00:56<11:16,  8.26s/it]

for sub_court:  Attorney General Reference
200
200


  8%|███▍                                       | 7/88 [01:05<11:25,  8.46s/it]

for sub_court:  Civil Appeal
200
200


  9%|███▉                                       | 8/88 [01:09<09:26,  7.08s/it]

for sub_court:  Criminal Appeal
200
200


 10%|████▍                                      | 9/88 [01:15<08:58,  6.81s/it]

for sub_court:  Miscellaneou Proceeding
200
200


 11%|████▊                                     | 10/88 [01:28<11:20,  8.73s/it]

for sub_court:  Reservation of Question of Law
200
200


 12%|█████▎                                    | 11/88 [01:38<11:29,  8.96s/it]

for sub_court:  Admiralty Action
200
200


 14%|█████▋                                    | 12/88 [01:47<11:21,  8.96s/it]

for sub_court:  Adoption Application
200
200


 15%|██████▏                                   | 13/88 [01:51<09:25,  7.54s/it]

for sub_court:  Application for Grant
200
200


 16%|██████▋                                   | 14/88 [01:55<07:53,  6.40s/it]

for sub_court:  Application to et aside a Statutory Demand (under Bankruptcy Ordinance)
200
200


 17%|███████▏                                  | 15/88 [01:59<06:54,  5.68s/it]

for sub_court:  Application under the Mental Health Ordinance
200
200


 18%|███████▋                                  | 16/88 [02:10<08:48,  7.34s/it]

for sub_court:  Bankruptcy Proceeding
200
200


 19%|████████                                  | 17/88 [02:28<12:19, 10.41s/it]

for sub_court:  Caveat
200
200


 20%|████████▌                                 | 18/88 [02:31<09:47,  8.40s/it]

for sub_court:  Citation Application
200
200


 22%|█████████                                 | 19/88 [02:53<14:21, 12.48s/it]

for sub_court:  Civil Action
200
200


 23%|█████████▌                                | 20/88 [02:57<11:14,  9.92s/it]

for sub_court:  Commercial Action
200
200


 24%|██████████                                | 21/88 [03:01<09:08,  8.18s/it]

for sub_court:  Companie Winding-up Proceeding
200
200


 25%|██████████▌                               | 22/88 [03:05<07:31,  6.85s/it]

for sub_court:  Confidential Miscellaneou Proceeding
200
200


 26%|██████████▉                               | 23/88 [03:09<06:28,  5.97s/it]

for sub_court:  Constitutional and Administrative Law Proceeding
200
200


 27%|███████████▍                              | 24/88 [03:15<06:22,  5.97s/it]

for sub_court:  Construction and Arbitration Proceeding
200
200


 28%|███████████▉                              | 25/88 [03:19<05:30,  5.25s/it]

for sub_court:  Criminal Case
200
200


 30%|████████████▍                             | 26/88 [03:22<04:53,  4.74s/it]

for sub_court:  Estate Duty Appeal
200
200


 31%|████████████▉                             | 27/88 [03:26<04:29,  4.42s/it]

for sub_court:  Ex-parte Application
200
200


 32%|█████████████▎                            | 28/88 [03:43<08:12,  8.20s/it]

for sub_court:  High Court Bankruptcy Interim Order
200
200


 33%|█████████████▊                            | 29/88 [03:47<06:47,  6.90s/it]

for sub_court:  Inland Revenue Appeal
200
200


 34%|██████████████▎                           | 30/88 [03:51<05:56,  6.14s/it]

for sub_court:  Intended Action
200
200


 35%|██████████████▊                           | 31/88 [03:55<05:08,  5.42s/it]

for sub_court:  Labour Tribunal Appeal
200
200


 36%|███████████████▎                          | 32/88 [03:58<04:30,  4.83s/it]

for sub_court:  Legal Aid Appeal
200
200


 38%|███████████████▊                          | 33/88 [04:02<04:07,  4.50s/it]

for sub_court:  Magistracy Appeal
200
200


 39%|████████████████▏                         | 34/88 [04:25<08:57,  9.95s/it]

for sub_court:  Matrimonial Cause
200
200


 40%|████████████████▋                         | 35/88 [04:29<07:10,  8.12s/it]

for sub_court:  Minor Employment Claim Appeal
200
200


 41%|█████████████████▏                        | 36/88 [04:33<05:57,  6.88s/it]

for sub_court:  Miscellaneou Proceeding
200
200


 42%|█████████████████▋                        | 37/88 [04:36<05:04,  5.97s/it]

for sub_court:  Miscellaneou Proceeding (Criminal)
200
200


 43%|██████████████████▏                       | 38/88 [04:40<04:22,  5.24s/it]

for sub_court:  Obscene Article Tribunal Appeal
200
200


 44%|██████████████████▌                       | 39/88 [04:44<03:54,  4.79s/it]

for sub_court:  Personal Injurie Action
200
200


 45%|███████████████████                       | 40/88 [04:52<04:43,  5.91s/it]

for sub_court:  Probate Action
200
200


 47%|███████████████████▌                      | 41/88 [04:56<04:08,  5.28s/it]

for sub_court:  Small Claim Tribunal Appeal
200
200


 48%|████████████████████                      | 42/88 [05:09<05:50,  7.62s/it]

for sub_court:  Competition Tribunal Action
200
200


 49%|████████████████████▌                     | 43/88 [05:13<04:47,  6.39s/it]

for sub_court:  Competition Tribunal Enforcement Action
200
200


 50%|█████████████████████                     | 44/88 [05:16<04:06,  5.60s/it]

for sub_court:  Civil Action
200
200


 51%|█████████████████████▍                    | 45/88 [05:30<05:41,  7.95s/it]

for sub_court:  Criminal Case
200
200


 52%|█████████████████████▉                    | 46/88 [05:34<04:45,  6.80s/it]

for sub_court:  Distraint Case
200
200


 53%|██████████████████████▍                   | 47/88 [05:38<04:06,  6.01s/it]

for sub_court:  District Court Tax Claim
200
200


 55%|██████████████████████▉                   | 48/88 [05:43<03:43,  5.59s/it]

for sub_court:  Employee Compensation Case
200
200


 56%|███████████████████████▍                  | 49/88 [05:47<03:19,  5.11s/it]

for sub_court:  Equal Opportunitie Action
200
200


 57%|███████████████████████▊                  | 50/88 [05:51<03:08,  4.95s/it]

for sub_court:  Intended Action
200
200


 58%|████████████████████████▎                 | 51/88 [05:55<02:54,  4.71s/it]

for sub_court:  Miscellaneou Proceeding
200
200


 59%|████████████████████████▊                 | 52/88 [06:00<02:44,  4.56s/it]

for sub_court:  Occupational Deafne (Compensation) Appeal
200
200


 60%|█████████████████████████▎                | 53/88 [06:04<02:35,  4.45s/it]

for sub_court:  Personal Injurie Action
200
200


 61%|█████████████████████████▊                | 54/88 [06:08<02:25,  4.28s/it]

for sub_court:  Stamp Duty Appeal
200
200


 62%|██████████████████████████▎               | 55/88 [06:12<02:18,  4.20s/it]

for sub_court:  Joint application
200
200


 64%|██████████████████████████▋               | 56/88 [06:16<02:12,  4.15s/it]

for sub_court:  Matrimonial Cause
200
200


 65%|███████████████████████████▏              | 57/88 [06:20<02:08,  4.15s/it]

for sub_court:  Miscellaneou Proceeding
200
200


 66%|███████████████████████████▋              | 58/88 [06:24<02:04,  4.16s/it]

for sub_court:  Building Management Application
200
200


 67%|████████████████████████████▏             | 59/88 [06:28<02:01,  4.20s/it]

for sub_court:  Building Ordinance Application
200
200


 68%|████████████████████████████▋             | 60/88 [06:33<02:00,  4.32s/it]

for sub_court:  Demolished Building Appeal
200
200


 69%|█████████████████████████████             | 61/88 [06:47<03:12,  7.14s/it]

for sub_court:  Demolished Building Application
200
200


 70%|█████████████████████████████▌            | 62/88 [06:51<02:47,  6.43s/it]

for sub_court:  Government Rent Appeal
200
200


 72%|██████████████████████████████            | 63/88 [06:55<02:20,  5.64s/it]

for sub_court:  Housing Ordinance Appeal
200
200


 73%|██████████████████████████████▌           | 64/88 [07:00<02:09,  5.41s/it]

for sub_court:  Land Compulsory Sale Application
200
200


 74%|███████████████████████████████           | 65/88 [07:04<01:57,  5.10s/it]

for sub_court:  Land Resumption Application
200
200


 75%|███████████████████████████████▌          | 66/88 [07:09<01:50,  5.04s/it]

for sub_court:  Landlord  Appeal
200
200


 76%|███████████████████████████████▉          | 67/88 [07:25<02:53,  8.28s/it]

for sub_court:  MTR Ordinance Application
200
200


 77%|████████████████████████████████▍         | 68/88 [07:31<02:29,  7.46s/it]

for sub_court:  Miscellaneou Proceeding Application
200
200


 78%|████████████████████████████████▉         | 69/88 [07:35<02:02,  6.47s/it]

for sub_court:  Miscellaneou Reference Application
200
200


 80%|█████████████████████████████████▍        | 70/88 [07:40<01:48,  6.01s/it]

for sub_court:  New Tenancy Application
200
200


 81%|█████████████████████████████████▉        | 71/88 [07:49<01:57,  6.89s/it]

for sub_court:  Part I Possession Application
200
200


 82%|██████████████████████████████████▎       | 72/88 [07:53<01:38,  6.15s/it]

for sub_court:  Part II Possession Application
200
200


 83%|██████████████████████████████████▊       | 73/88 [07:57<01:21,  5.45s/it]

for sub_court:  Part IV Possession Application
200
200


 84%|███████████████████████████████████▎      | 74/88 [08:01<01:12,  5.15s/it]

for sub_court:  Part V Possession Application
200
200


 85%|███████████████████████████████████▊      | 75/88 [08:05<01:02,  4.78s/it]

for sub_court:  Railway Ordinance Application
200
200


 86%|████████████████████████████████████▎     | 76/88 [08:14<01:09,  5.83s/it]

for sub_court:  Rating Appeal
200
200


 88%|████████████████████████████████████▊     | 77/88 [08:18<00:59,  5.38s/it]

for sub_court:  Coroner Court Death Inquest
200
200


 89%|█████████████████████████████████████▏    | 78/88 [08:22<00:49,  4.99s/it]

for sub_court:  Eastern Magistrate Court Charge Case
200
200


 90%|█████████████████████████████████████▋    | 79/88 [08:26<00:41,  4.65s/it]

for sub_court:  Eastern Magistrate Court Summon Case
200
200


 91%|██████████████████████████████████████▏   | 80/88 [08:30<00:35,  4.47s/it]

for sub_court:  Kowloon City Magistrate Court Charge Case
200
200


 92%|██████████████████████████████████████▋   | 81/88 [08:44<00:52,  7.48s/it]

for sub_court:  Kowloon City Magistrates’ Court Summon Case
200
200


 93%|███████████████████████████████████████▏  | 82/88 [09:00<00:59,  9.85s/it]

for sub_court:  Labour Tribunal Claim
200
200


 94%|███████████████████████████████████████▌  | 83/88 [09:04<00:40,  8.09s/it]

for sub_court:  Obscene Article Tribunal Case
200
200


 95%|████████████████████████████████████████  | 84/88 [09:10<00:29,  7.47s/it]

for sub_court:  Shatin Magistrate Court Charge Case
200
200


 97%|████████████████████████████████████████▌ | 85/88 [09:21<00:25,  8.43s/it]

for sub_court:  Small Claim Tribunal Claim
200
200


 98%|█████████████████████████████████████████ | 86/88 [09:25<00:14,  7.09s/it]

for sub_court:  Tuen Mun Magistrate Court Charge Case
200
200


 99%|█████████████████████████████████████████▌| 87/88 [09:29<00:06,  6.16s/it]

for sub_court:  West Kowloon Magistrate Court Charge Case
200
200


100%|██████████████████████████████████████████| 88/88 [09:33<00:00,  5.55s/it]


(490, 3)


Unnamed: 0,year,year_url,sub_court_cleared
0,2015,https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?EX=&L1=FA&L2=CV&L3=2015&AR=1_4#A1_4,Final Appeal (Civil)
1,2013,https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?EX=&L1=FA&L2=CV&L3=2013&AR=1_6#A1_6,Final Appeal (Civil)
2,2016,https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?EX=&L1=FA&L2=CV&L3=2016&AR=1_3#A1_3,Final Appeal (Civil)
3,2018,https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?EX=&L1=FA&L2=CV&L3=2018&AR=1_1#A1_1,Final Appeal (Civil)
4,2017,https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?EX=&L1=FA&L2=CV&L3=2017&AR=1_2#A1_2,Final Appeal (Civil)
5,2014,https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?EX=&L1=FA&L2=CV&L3=2014&AR=1_5#A1_5,Final Appeal (Civil)
6,Pre2013,https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?EX=T&L1=FA&L2=CV&AR=1#A1,Final Appeal (Civil)
0,2015,https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?EX=&L1=FA&L2=CC&L3=2015&AR=2_4#A2_4,Final Appeal (Criminal)
1,2013,https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?EX=&L1=FA&L2=CC&L3=2013&AR=2_6#A2_6,Final Appeal (Criminal)
2,2016,https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?EX=&L1=FA&L2=CC&L3=2016&AR=2_3#A2_3,Final Appeal (Criminal)


In [21]:
df_years.index = range(len(df_years))
df_years.head()

Unnamed: 0,year,year_url,sub_court_cleared
0,2015,https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?EX=&L1=FA&L2=CV&L3=2015&AR=1_4#A1_4,Final Appeal (Civil)
1,2013,https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?EX=&L1=FA&L2=CV&L3=2013&AR=1_6#A1_6,Final Appeal (Civil)
2,2016,https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?EX=&L1=FA&L2=CV&L3=2016&AR=1_3#A1_3,Final Appeal (Civil)
3,2018,https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?EX=&L1=FA&L2=CV&L3=2018&AR=1_1#A1_1,Final Appeal (Civil)
4,2017,https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?EX=&L1=FA&L2=CV&L3=2017&AR=1_2#A1_2,Final Appeal (Civil)


In [22]:
df_years.loc[len(df_years)-1]

year                                                                                                            2017
year_url             https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?EX=T&L1=WK&L2=CC&L3=2017&AR=11_1#A11_1
sub_court_cleared                                                          West Kowloon Magistrate Court Charge Case
Name: 489, dtype: object

In [23]:
# store data
with open('data.pickle', 'wb') as ww:
    pickle.dump(df_years, ww)


In [24]:
f = open('data.pickle', 'rb')
df_year = pickle.load(f)

## Hiearchy 4: details

In [122]:
# get df with "Pre****"
idx = [i[:3]=="Pre" for i in list(df_years['year'])]
df_special = df_years[idx]

# get df without "Pre****"
idx = [i[:3]!="Pre" for i in list(df_years['year'])]
df_regular = df_years[idx]
df_regular.shape

(442, 3)

In [None]:
def get_detail(df):
    dfs = [] # init
    for (idx, r) in tqdm(df.iterrows(), total=len(df)):
        data_onepage = [] # initialize
        url = r['year_url'] 
        time.sleep(1)
        root = url_to_root(url)

        info = root.find('script', string=re.compile("var myMenu")).text.split(",[")
        B_s = [BeautifulSoup(i) for i in info]
        for B in B_s:
            try:
                outcome = B.find('td', valign="top", width="5%").find('img')['src']
                name = B.find('a', class_="ThemeXPRowAnchor").text
                
                url = B.find('a', class_="ThemeXPRowAnchor")['href']
                url = url.split(',')[0]
                detail_url = url[re.search("https", url).span()[0]:-2]
                date = B.find('font',color="#006633").text
                persons = B.find('td', valign="top", width="55%").text
                data_onepage.append((outcome, name, detail_url, date, persons))
            except AttributeError:
                continue
        
        print('this page has entries: ', len(data_onepage))
        # build DataFrame
        df = pd.DataFrame(data_onepage, columns=['outcome', 'name', 'detail_url', 'date', 'persons'])
        dfs.append(df)
    df_detail = pd.concat(dfs, axis=0)
    return df_detail        
        
df_detail = get_detail(df_regular)      

  0%|                                                  | 0/442 [00:00<?, ?it/s]

200
this page has entries:  31


  0%|                                          | 1/442 [00:02<16:26,  2.24s/it]

200
this page has entries:  33


  0%|▏                                         | 2/442 [00:04<16:19,  2.23s/it]

200
this page has entries:  15


  1%|▎                                         | 3/442 [00:06<15:27,  2.11s/it]

200
this page has entries:  16


  1%|▍                                         | 4/442 [00:08<14:47,  2.03s/it]

200
this page has entries:  26


  1%|▍                                         | 5/442 [00:10<16:06,  2.21s/it]

200
this page has entries:  17


  1%|▌                                         | 6/442 [00:12<15:29,  2.13s/it]

200
this page has entries:  9


  2%|▋                                         | 7/442 [00:14<15:01,  2.07s/it]

200
this page has entries:  13


  2%|▊                                         | 8/442 [00:16<14:27,  2.00s/it]

200
this page has entries:  21


  2%|▊                                         | 9/442 [00:18<14:22,  1.99s/it]

200
this page has entries:  24


  2%|▉                                      | 10/442 [00:46<1:09:42,  9.68s/it]

200
this page has entries:  14


  2%|█                                        | 11/442 [00:47<52:44,  7.34s/it]

200
this page has entries:  14


  3%|█                                        | 12/442 [00:49<40:31,  5.66s/it]

200
this page has entries:  1


  3%|█▏                                       | 13/442 [00:51<31:45,  4.44s/it]

200
this page has entries:  1


  3%|█▎                                       | 14/442 [00:54<28:53,  4.05s/it]

200
this page has entries:  3


  3%|█▍                                       | 15/442 [00:57<26:35,  3.74s/it]

200
this page has entries:  1


  4%|█▍                                       | 16/442 [00:59<22:08,  3.12s/it]

200
this page has entries:  1


  4%|█▌                                       | 17/442 [01:00<19:21,  2.73s/it]

200
this page has entries:  1


  4%|█▋                                       | 18/442 [01:02<16:55,  2.40s/it]

200
this page has entries:  32


  4%|█▊                                       | 19/442 [01:05<17:30,  2.48s/it]

200
this page has entries:  2


  5%|█▊                                       | 20/442 [01:07<16:49,  2.39s/it]

200
this page has entries:  26


  5%|█▉                                       | 21/442 [01:09<15:54,  2.27s/it]

200
this page has entries:  15


  5%|██                                       | 22/442 [01:11<15:08,  2.16s/it]

200
this page has entries:  14


  5%|██▏                                      | 23/442 [01:13<14:43,  2.11s/it]

200
this page has entries:  36


  5%|██▏                                      | 24/442 [01:15<14:34,  2.09s/it]

200
this page has entries:  23


  6%|██▎                                      | 25/442 [01:17<14:20,  2.06s/it]

200
this page has entries:  3


  6%|██▍                                      | 26/442 [01:19<13:31,  1.95s/it]

200
this page has entries:  22


  6%|██▌                                      | 27/442 [01:20<13:16,  1.92s/it]

200
this page has entries:  50


  6%|██▌                                      | 28/442 [01:23<14:08,  2.05s/it]

200
this page has entries:  25


  7%|██▋                                      | 29/442 [01:34<33:49,  4.91s/it]

200
this page has entries:  24


  7%|██▊                                      | 30/442 [01:36<27:28,  4.00s/it]

200
this page has entries:  1


  7%|██▉                                      | 31/442 [01:38<22:32,  3.29s/it]

200
this page has entries:  4


  7%|██▉                                      | 32/442 [01:41<22:26,  3.28s/it]

200
this page has entries:  8


  7%|███                                      | 33/442 [01:43<19:08,  2.81s/it]

200
this page has entries:  4


  8%|███▏                                     | 34/442 [01:44<16:47,  2.47s/it]

200
this page has entries:  2


  8%|███▏                                     | 35/442 [01:47<16:09,  2.38s/it]

200
this page has entries:  6


  8%|███▎                                     | 36/442 [01:49<15:11,  2.25s/it]

In [187]:
B.find('td', valign="top", width="5%").find('img')['src']

"\\'https://legalref.judiciary.hk/lrs/images/ThemeXP/Cpage.gif\\'"

In [188]:
B.find('a', class_="ThemeXPRowAnchor").text

'WKCC3654/2017'

In [189]:
B.find('a', class_="ThemeXPRowAnchor")['href']

"javascript:matchpop(\\'https://legalref.judiciary.hk/lrs/common/ju/ju_frame.jsp?DIS=121879\\',\\'matchpage\\');"

In [192]:
url = B.find('a', class_="ThemeXPRowAnchor")['href']
url

"javascript:matchpop(\\'https://legalref.judiciary.hk/lrs/common/ju/ju_frame.jsp?DIS=121879\\',\\'matchpage\\');"

In [191]:
url = B.find('a', class_="ThemeXPRowAnchor")['href']
url = url.split(',')[0]
url = url[re.search("https", url).span()[0]:-2]

javascript:matchpop(\'https://legalref.judiciary.hk/lrs/common/ju/ju_frame.jsp?DIS=121879\',\'matchpage\');
javascript:matchpop(\'https://legalref.judiciary.hk/lrs/common/ju/ju_frame.jsp?DIS=121879\'


'https://legalref.judiciary.hk/lrs/common/ju/ju_frame.jsp?DIS=121879'

In [193]:
B.find('font',color="#006633").text

'(15/05/2019)'

In [194]:
B.find('td', valign="top", width="55%").text

'香港特別行政區 訴 吳文遠及另六人 \xa0'

In [59]:
[BeautifulSoup(i) for i in a][1]

<html><body><p>null, '</p><table border="\'0\'" bordercolor="\'#CCCCCC\'" cellpadding="\'0\'" cellspacing="\'0\'" width="\'99%\'"><tr><td> <table bgcolor="#F7F5F0" border="\'1\'" bordercolor="\'#CAB484\'" cellpadding="\'2\'" cellspacing="\'0\'" width="\'100%\'"> <tr> <td valign="top" width="5%"><img alt="\'Judgment" english="" in="" src="\'https://legalref.judiciary.hk/lrs/images/ThemeXP/Epage.gif\'"/></td> <td valign="top" width="40%"> <a class="ThemeXPRowAnchor" href="javascript:matchpop(\'https://legalref.judiciary.hk/lrs/common/ju/ju_frame.jsp?DIS=114838\',\'matchpage\');">FACV1/2018</a><br/>[2018] HKCFA 17<br/><font color="#006633">(30/04/2018)</font><br/><img height="1" src="\'https://legalref.judiciary.hk/lrs/images/spacer.gif\'" width="142"/></td> <td valign="top" width="55%">ABN AMRO BANK N.V. AND OTHERS v. QT  Reported in :<font color="#8C029B">(2018) 21 HKCFAR 150</font><br/><img height="2" src="\'https://legalref.judiciary.hk/lrs/images/spacer.gif\'" width="348"/></td> </tr

In [43]:
B = [BeautifulSoup(i, 'html5lib') for i in a if len(re.findall('matchpage', i))!=0][4]
B.text

"\\'matchpage\\');>FACV3/2018[2018] HKCFA 41(13/09/2018)\tCHANG WA SHAN  v.  ESTHER CHAN PUI KWAN（陳佩君）also known as CHAN PUI CHUN（陳佩珍） \xa0\t'"

In [45]:
B = [i for i in a if len(re.findall('matchpage', i))!=0][0]
B

"\\'matchpage\\');>FACV1/2018</a><BR>[2018] HKCFA 28<BR><font color=#006633>(04/07/2018)</font><BR><img src=\\'https://legalref.judiciary.hk/lrs/images/spacer.gif\\' height=1 width=142></td>\t<td  valign=top width=55% >DIRECTOR OF IMMIGRATION v. QT &nbsp;Reported in :<font color=#8C029B>(2018) 21 HKCFAR 324</font><BR><img src=\\'https://legalref.judiciary.hk/lrs/images/spacer.gif\\' height=2 width=348></td>\t</tr></table></td></tr><tr><TD><img src=\\'https://legalref.judiciary.hk/lrs/images/spacer.gif\\' height=2 ></td></tr><tr><TD></TD></tr></table>'"

In [41]:
B.text

"\\'matchpage\\');>FACV1/2018[2018] HKCFA 28(04/07/2018)\tDIRECTOR OF IMMIGRATION v. QT \xa0Reported in :(2018) 21 HKCFAR 324\t'"

In [32]:
B.find('td')

<td valign="top" width="55%">DIRECTOR OF IMMIGRATION v. QT  Reported in :<font color="#8C029B">(2018) 21 HKCFAR 324</font><br/><img height="2" src="\'https://legalref.judiciary.hk/lrs/images/spacer.gif\'" width="348"/></td>

In [34]:
B.find('td').find('img')['src']

"\\'https://legalref.judiciary.hk/lrs/images/spacer.gif\\'"

In [35]:
B.find('td').find('a').text

AttributeError: 'NoneType' object has no attribute 'text'

In [284]:
B.find('td').find('font')

<font color="#006633">(07/07/2014)</font>

In [290]:
B.find('td').find('font').find_next('td').text

'律政司司長 訴 林裕偉  \xa0Reported in :[2015] 1 HKLRD 393'

In [285]:
B.find('td')

<td> <table bgcolor="#F7F5F0" border="\'1\'" bordercolor="\'#CAB484\'" cellpadding="\'2\'" cellspacing="\'0\'" width="\'100%\'"> <tr> <td valign="top" width="5%"><img alt="\'Judgment" chinese="" english="" in="" src="\'https://legalref.judiciary.hk/lrs/images/ThemeXP/CTpage.gif\'" traditional="" translation="" with=""/></td> <td valign="top" width="40%"> <a class="ThemeXPRowAnchor" href="javascript:matchpop(\'https://legalref.judiciary.hk/lrs/common/ju/ju_frame.jsp?DIS=93798\',\'matchpage\');">CAAR2/2014</a><br/><font color="#006633">(07/07/2014)</font><br/><img height="1" src="\'https://legalref.judiciary.hk/lrs/images/spacer.gif\'" width="142"/></td> <td class="\'chinesefont\'" valign="top" width="55%">律政司司長 訴 林裕偉 <img src="\'/lrs/images/ThemeXP/star.gif\'"/>  Reported in :<font color="#8C029B">[2015] 1 HKLRD 393</font><br/><img height="2" src="\'https://legalref.judiciary.hk/lrs/images/spacer.gif\'" width="348"/></td> </tr></table></td>