In [1]:
import requests
import json
import time
from bs4 import BeautifulSoup, Comment
import IPython.display as display
import pickle
import pandas as pd
pd.set_option('max_colwidth',1000)
import re
from tqdm import tqdm

## Start with original url

In [32]:
def url_to_root(url):
    headers = {'Accept-Encoding': 'identity',
               'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36'
                }
    response = requests.get(url, headers=headers, timeout=5)
#     print(response.status_code)
    
    root = BeautifulSoup(response.text, 'html5lib')
    return root

original_url = 'https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp'
root = url_to_root(original_url)

## Hieachy 1: parent court categories

In [33]:
def parse_courts(root):
    """Parse the info of parent courts.
    Args:
        root : BeautifulSoup Object
    Return:
        courts_dict (dict): court -> url
    """
    # get all raw infomation about courts (looks urgly)
    info_list = root.find('script', string=re.compile("var myMenu")).text.split("'")
    # find the parent courts' name inside 'span' tag
    courts_list = [BeautifulSoup(i).find('span').text.strip(" ") for i in info_list if i[1:5]=='span']
    # find corresponding urls of parents courts(order matters here)
    courts_urls = [j for j in info_list if j[:5]=='https']
    # store to dict
    courts_dict = {i:j for i, j in zip(courts_list, courts_urls)}
    return courts_dict

courts = parse_courts(root)
courts

{'Court of Final Appeal': 'https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?L1=FA#H1',
 'Court of Appeal of the High Court': 'https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?L1=CA#H2',
 'High Court': 'https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?L1=HC#H3',
 'Competition Tribunal': 'https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?L1=CT#H4',
 'District Court': 'https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?L1=DC#H5',
 'Family Court': 'https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?L1=FC#H6',
 'Lands Tribunal': 'https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?L1=LD#H7',
 'Miscellaneous': 'https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?L1=OT#H8'}

In [34]:
df_courts = pd.DataFrame([(k,v) for k,v in courts.items()], columns=['parent_court', 'parent_court_url'])
df_courts

Unnamed: 0,parent_court,parent_court_url
0,Court of Final Appeal,https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?L1=FA#H1
1,Court of Appeal of the High Court,https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?L1=CA#H2
2,High Court,https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?L1=HC#H3
3,Competition Tribunal,https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?L1=CT#H4
4,District Court,https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?L1=DC#H5
5,Family Court,https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?L1=FC#H6
6,Lands Tribunal,https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?L1=LD#H7
7,Miscellaneous,https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?L1=OT#H8


## Hieachy 2: sub courts

In [35]:
# test case
url = 'https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?L1=DC#H5'
root = url_to_root(url)
        
# get raw infomation
info_list = root.find('script', string=re.compile("var myMenu")).text.split(",")

In [36]:
parent_courts_set = set(df_courts['parent_court'])
parent_courts_set

{'Competition Tribunal',
 'Court of Appeal of the High Court',
 'Court of Final Appeal',
 'District Court',
 'Family Court',
 'High Court',
 'Lands Tribunal',
 'Miscellaneous'}

In [37]:
def clear_name(a):
    ans = ' '.join([i.strip("\\s'")for i in a.split(" ")])
    return ans

parent_courts_set = set(df_courts['parent_court'])
now = set([BeautifulSoup(i.strip("' ")).text.strip(" ") for i in info_list if len(re.findall('<a name=', i))!=0])
sub_courts = sorted(list(now - parent_courts_set))
sub_courts = [clear_name(s) for s in sub_courts]
sub_courts

['Civil Action',
 'Criminal Case',
 'Distraint Case',
 'District Court Tax Claim',
 'Employee Compensation Case',
 'Equal Opportunitie Action',
 'Intended Action',
 'Miscellaneou Proceeding',
 'Occupational Deafne (Compensation) Appeal',
 'Personal Injurie Action',
 'Stamp Duty Appeal']

In [38]:
def parse_sub_courts(courts, parent_courts_set):
    """Parse sub-courts information.
    Args:
        courts (dict) : parent_courts -> url
        parent_courts_set 
    Return:
        sub_dict (dict) : parent_courts -> dictionary(sub_courts -> url) 
    
    """
    sub_dict = {} # initialize

    for c, url in courts.items():
        time.sleep(1) # request web slowly for benevolence
        root = url_to_root(url)
        
        # get sub courts names
        # get raw infomation
        info_list = root.find('script', string=re.compile("var myMenu")).text.split(",")
        # create bs4 object list for strings has tag "a" and attr name
        now = set([BeautifulSoup(i.strip("' ")).text.strip(" ") for i in info_list if len(re.findall('<a name=', i))!=0])
        sub_courts = sorted(list(now - parent_courts_set)) # notice order in alphbets
        sub_courts_cleared = [clear_name(s) for s in sub_courts] # clear names
        print('subcourts:',len(sub_courts))
        
        # get sub courts urls
        # re-get raw infomation
        info_list = root.find('script', string=re.compile("var myMenu")).text.split("'") # notice split by up-comma
        # observe sub-courts' urls are longer than parents', so find sub-courts' urls' length
        http_min_len = min(set([len(i) for i in info_list if i[:5]=='https']))
        # get sub-courts' urls
        sub_courts_urls = [i for i in info_list if i[:5]=='https' if len(i)>http_min_len]
        print('urls:', len(sub_courts_urls))
        # store in dict
        subs = {sc.strip(" "):(sc_, url) for sc, sc_, url in zip(sub_courts, sub_courts_cleared, sub_courts_urls)}
        # assign to parent courts
        print('dict length: ',len(subs))
        sub_dict[c] = subs
    return sub_dict

parent_courts_set = set(df_courts['parent_court'])
sub_dict = parse_sub_courts(courts, parent_courts_set)
sub_dict

subcourts: 5
urls: 5
dict length:  5
subcourts: 6
urls: 6
dict length:  6
subcourts: 31
urls: 31
dict length:  31
subcourts: 2
urls: 2
dict length:  2
subcourts: 11
urls: 11
dict length:  11
subcourts: 3
urls: 3
dict length:  3
subcourts: 19
urls: 19
dict length:  19
subcourts: 11
urls: 11
dict length:  11


{'Court of Final Appeal': {'Final Appeal (Civil)': ('Final Appeal (Civil)',
   'https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?L1=FA&L2=CV&AR=1#A1'),
  'Final Appeal (Criminal)': ('Final Appeal (Criminal)',
   'https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?L1=FA&L2=CC&AR=2#A2'),
  'Miscellaneous Proceedings': ('Miscellaneou Proceeding',
   'https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?L1=FA&L2=MP&AR=3#A3'),
  'Miscellaneous Proceedings (Civil)': ('Miscellaneou Proceeding (Civil)',
   'https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?L1=FA&L2=MV&AR=4#A4'),
  'Miscellaneous Proceedings (Criminal)': ('Miscellaneou Proceeding (Criminal)',
   'https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?L1=FA&L2=MC&AR=5#A5')},
 'Court of Appeal of the High Court': {'Application for Review': ('Application for Review',
   'https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?L1=CA&L2=AR&AR=1#A1'),
  "Attorney General\\'s Reference": ('Attorney General 

In [39]:
def df_for_sub(sub_dict):
    return pd.DataFrame([(k,v[0],v[1]) for k,v in sub_dict.items()], columns=['sub_court', 'sub_court_cleared', 'sub_court_url'])

dfs = []
for p_court, s_dict in sub_dict.items():
    sub_df = df_for_sub(s_dict)
    sub_df['parent_court'] = p_court
    dfs.append(sub_df)
    
df_all = pd.concat(dfs, axis=0)
df_all.index = range(len(df_all))
df_all = pd.merge(df_all, df_courts, how='left', on='parent_court')
df_all.head()

Unnamed: 0,sub_court,sub_court_cleared,sub_court_url,parent_court,parent_court_url
0,Final Appeal (Civil),Final Appeal (Civil),https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?L1=FA&L2=CV&AR=1#A1,Court of Final Appeal,https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?L1=FA#H1
1,Final Appeal (Criminal),Final Appeal (Criminal),https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?L1=FA&L2=CC&AR=2#A2,Court of Final Appeal,https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?L1=FA#H1
2,Miscellaneous Proceedings,Miscellaneou Proceeding,https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?L1=FA&L2=MP&AR=3#A3,Court of Final Appeal,https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?L1=FA#H1
3,Miscellaneous Proceedings (Civil),Miscellaneou Proceeding (Civil),https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?L1=FA&L2=MV&AR=4#A4,Court of Final Appeal,https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?L1=FA#H1
4,Miscellaneous Proceedings (Criminal),Miscellaneou Proceeding (Criminal),https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?L1=FA&L2=MC&AR=5#A5,Court of Final Appeal,https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?L1=FA#H1


In [40]:
df_all.sub_court_cleared

0                                                        Final Appeal (Civil)
1                                                     Final Appeal (Criminal)
2                                                     Miscellaneou Proceeding
3                                             Miscellaneou Proceeding (Civil)
4                                          Miscellaneou Proceeding (Criminal)
5                                                      Application for Review
6                                                  Attorney General Reference
7                                                                Civil Appeal
8                                                             Criminal Appeal
9                                                     Miscellaneou Proceeding
10                                             Reservation of Question of Law
11                                                           Admiralty Action
12                                                       Adoptio

In [41]:
with open('df_all.pickle', 'wb') as ww:
    pickle.dump(df_all, ww)

In [6]:
f = open('df_all.pickle', 'rb')
df_all = pickle.load(f)

## Hiearchy 3: years

In [11]:
# for test case
sub_ = "West Kowloon Magistrate Court Charge Case"

In [12]:
list(df_all[df_all['sub_court_cleared']==sub_]['sub_court_url'])

['https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?L1=WK&L2=CC&AR=11#A11']

In [206]:
def check_special(sc_cleared, df_all, courts_set, urls_set):
    """Check if sub-court has Pre****"""
    years = subCourt_get_years(sc_cleared, df_all, courts_set)
    urls = subCourt_get_urls(sc_cleared, df_all, urls_set) 
    print(years)
    special_ = [years[idx] for idx, y in enumerate(years) if len(re.findall('Pre', y))!=0]
    if len(special_) != 0:
        has_special = True
        special_year = special_[0]
        special_url = [u for idx, u in enumerate(urls) if len(re.findall('L3=....', u))==0][0]
    else: 
        has_special = False
        special_year = None
        special_url = None
    return has_special, special_year, special_url

        
courts_set = set(df_all['parent_court']) | set(df_all['sub_court'])
urls_set = set(df_all['sub_court_url']) | set(df_all['parent_court_url'])

special_courts = {}
for sc_ in tqdm(df_all['sub_court_cleared']):
    has_special, special_year, special_url = check_special(sc_, df_all, courts_set, urls_set)
    if has_special:
        special_courts[sc_] = special_url
special_courts


  0%|                                                   | 0/88 [00:00<?, ?it/s]

200
200
['Pre2013', '2015', '2013', '2016', '2018', '2017', '2014']


  1%|▍                                          | 1/88 [00:04<07:01,  4.85s/it]

200
200
['Pre2013', '2015', '2013', '2016', '2018', '2017', '2014']


  2%|▉                                          | 2/88 [00:10<07:08,  4.98s/it]

200
200
['2009', '2004', '2013', '2010', '2003', '2014']


  3%|█▍                                         | 3/88 [00:14<06:47,  4.79s/it]

200
200
['2015', '2019', 'Pre2014', '2016', '2017', '2018', '2014']


  5%|█▉                                         | 4/88 [00:19<06:45,  4.82s/it]

200
200
['2015', '2019', 'Pre2014', '2016', '2017', '2018', '2014']


  6%|██▍                                        | 5/88 [00:25<07:00,  5.07s/it]

200
200
['Pre2013', '2015', '2013', '2016', '2017', '2018', '2014']


  7%|██▉                                        | 6/88 [00:29<06:48,  4.99s/it]

200
200
['1986', '1994']


  8%|███▍                                       | 7/88 [00:33<06:14,  4.63s/it]

200
200
['2015', '2019', 'Pre2014', '2016', '2017', '2018', '2014']


  9%|███▉                                       | 8/88 [00:50<11:06,  8.33s/it]

200
200
['2015', '2019', 'Pre2014', '2016', '2017', '2018', '2014']


 10%|████▍                                      | 9/88 [00:58<10:40,  8.11s/it]

200
200
['2009', '2004', '2013', '2010', '2003', '2014']


 11%|████▊                                     | 10/88 [01:11<12:40,  9.75s/it]

200
200
['2018', '1997', '1994']


 12%|█████▎                                    | 11/88 [01:15<10:10,  7.93s/it]

200
200
['Pre2013', '2015', '2013', '2016', '2018', '2017', '2014']


 14%|█████▋                                    | 12/88 [01:19<08:27,  6.68s/it]

200
200
['1970', '1973', '1985', '1978', '1992', '1971', '1988']


 15%|██████▏                                   | 13/88 [01:23<07:21,  5.89s/it]

200
200
['2011', '2015', '2012', '2009', '2010', 'Pre2009', '2014']


 16%|██████▋                                   | 14/88 [01:27<06:35,  5.35s/it]

200
200
['2011', '2012', '2013', '2016', '2010', 'Pre2010', '2014']


 17%|███████▏                                  | 15/88 [01:43<10:36,  8.72s/it]

200
200
['2008', '2011', '2015', '2012', '2016', '2010', '2014']


 18%|███████▋                                  | 16/88 [01:47<08:39,  7.22s/it]

200
200
['Pre2013', '2015', '2013', '2016', '2018', '2017', '2014']


 19%|████████                                  | 17/88 [01:51<07:21,  6.22s/it]

200
200
['Pre2012', '2015', '2012', '2013', '2016', '2017', '2014']


 20%|████████▌                                 | 18/88 [01:55<06:23,  5.47s/it]

200
200
['2012', '2008']


 22%|█████████                                 | 19/88 [01:58<05:40,  4.94s/it]

200
200
['2015', '2019', 'Pre2014', '2016', '2017', '2018', '2014']


 23%|█████████▌                                | 20/88 [02:02<05:11,  4.58s/it]

200
200
['Pre2012', '2015', '2012', '2013', '2016', '2017', '2014']


 24%|██████████                                | 21/88 [02:06<04:58,  4.46s/it]

200
200
['2015', '2019', 'Pre2014', '2016', '2017', '2018', '2014']


 25%|██████████▌                               | 22/88 [02:14<05:57,  5.42s/it]

200
200
['2018', '2009', '2003']


 26%|██████████▉                               | 23/88 [02:36<11:11, 10.33s/it]

200
200
['2015', '2019', 'Pre2014', '2016', '2017', '2018', '2014']


 27%|███████████▍                              | 24/88 [02:40<09:01,  8.47s/it]

200
200
['Pre2013', '2015', '2013', '2016', '2018', '2017', '2014']


 28%|███████████▉                              | 25/88 [02:48<08:36,  8.20s/it]

200
200
['Pre2013', '2015', '2013', '2016', '2018', '2017', '2014']


 30%|████████████▍                             | 26/88 [02:53<07:40,  7.43s/it]

200
200
['2011', '1974', '2013', '2007', '2005']


 31%|████████████▉                             | 27/88 [02:57<06:28,  6.36s/it]

200
200
['2008', '2009', '2004', '2013', '2010', '2005']


 32%|█████████████▎                            | 28/88 [03:01<05:46,  5.77s/it]

200
200
['2002', '2008', '2012', '2009', '2004', '2001', '2003', '2006']


 33%|█████████████▊                            | 29/88 [03:11<06:40,  6.79s/it]

200
200
['2015', '2009', '2016', '2018', '2017', '2010', 'Pre2009']


 34%|██████████████▎                           | 30/88 [03:14<05:42,  5.90s/it]

200
200
['1986', '2000', '2012', '2015', '2009', '2017']


 35%|██████████████▊                           | 31/88 [03:18<04:55,  5.19s/it]

200
200
['2015', '2019', 'Pre2014', '2016', '2017', '2018', '2014']


 36%|███████████████▎                          | 32/88 [03:22<04:25,  4.74s/it]

200
200
['2017', '1999', '2013']


 38%|███████████████▊                          | 33/88 [03:30<05:14,  5.72s/it]

200
200
['2015', '2019', 'Pre2014', '2016', '2017', '2018', '2014']


 39%|████████████████▏                         | 34/88 [03:36<05:15,  5.84s/it]

200
200
['2011', '2012', '2009', '2016', '2018', '2010', 'Pre2009']


 40%|████████████████▋                         | 35/88 [03:40<04:37,  5.24s/it]

200
200
['2015', '2019', 'Pre2014', '2016', '2017', '2018', '2014']


 41%|█████████████████▏                        | 36/88 [03:46<04:53,  5.64s/it]

200
200
['2009', '2004', '2013', '2010', '2003', '2014']


 42%|█████████████████▋                        | 37/88 [03:56<05:50,  6.87s/it]

200
200
['2015', '2019', 'Pre2014', '2016', '2017', '2018', '2014']


 43%|██████████████████▏                       | 38/88 [04:00<04:58,  5.98s/it]

200
200
['1997', '1996', '1994', '1993', '1995', '1991']


 44%|██████████████████▌                       | 39/88 [04:04<04:26,  5.43s/it]

200
200
['Pre2013', '2015', '2013', '2016', '2018', '2017', '2014']


 45%|███████████████████                       | 40/88 [04:08<04:05,  5.11s/it]

200
200
['Pre2013', '2015', '2013', '2016', '2018', '2017', '2014']


 47%|███████████████████▌                      | 41/88 [04:13<03:49,  4.89s/it]

200
200
['2015', '2019', 'Pre2014', '2016', '2017', '2018', '2014']


 48%|████████████████████                      | 42/88 [04:22<04:49,  6.29s/it]

200
200
['2018']


 49%|████████████████████▌                     | 43/88 [04:30<05:02,  6.71s/it]

200
200
['2017']


 50%|█████████████████████                     | 44/88 [04:39<05:27,  7.45s/it]

200
200
['2015', '2019', 'Pre2014', '2016', '2017', '2018', '2014']


 51%|█████████████████████▍                    | 45/88 [04:43<04:34,  6.37s/it]

200
200
['Pre2013', '2015', '2013', '2016', '2018', '2017', '2014']


 52%|█████████████████████▉                    | 46/88 [04:48<04:15,  6.08s/it]

200
200
['2015', '2019', 'Pre2014', '2016', '2017', '2018', '2014']


 53%|██████████████████████▍                   | 47/88 [04:56<04:28,  6.54s/it]

200
200
['2011', '2015', '2012', 'Pre2011', '2013', '2016', '2017']


 55%|██████████████████████▉                   | 48/88 [05:06<04:59,  7.48s/it]

200
200
['Pre2013', '2015', '2013', '2016', '2018', '2017', '2014']


 56%|███████████████████████▍                  | 49/88 [05:55<12:56, 19.91s/it]

200
200
['Pre2013', '2015', '2013', '2016', '2018', '2017', '2014']


 57%|███████████████████████▊                  | 50/88 [06:13<12:22, 19.55s/it]

200
200
['1986', '2000', '2012', '2015', '2009', '2017']


 58%|████████████████████████▎                 | 51/88 [06:19<09:31, 15.44s/it]

200
200
['2009', '2004', '2013', '2010', '2003', '2014']


 59%|████████████████████████▊                 | 52/88 [06:23<07:16, 12.11s/it]

200
200
['2005']


 60%|█████████████████████████▎                | 53/88 [06:27<05:39,  9.69s/it]

200
200
['Pre2013', '2015', '2013', '2016', '2018', '2017', '2014']


 61%|█████████████████████████▊                | 54/88 [06:31<04:28,  7.91s/it]

200
200
['Pre2012', '2015', '2012', '2016', '2018', '2017', '2014']


 62%|██████████████████████████▎               | 55/88 [06:43<04:58,  9.04s/it]

200
200
['Pre2012', '2015', '2012', '2013', '2016', '2017', '2014']


 64%|██████████████████████████▋               | 56/88 [06:47<03:57,  7.43s/it]

200
200
['2011', '2012', '2009', '2016', '2018', '2010', 'Pre2009']


 65%|███████████████████████████▏              | 57/88 [06:55<03:55,  7.58s/it]

200
200
['2009', '2004', '2013', '2010', '2003', '2014']


 66%|███████████████████████████▋              | 58/88 [07:10<04:58,  9.95s/it]

200
200
['Pre2013', '2015', '2013', '2016', '2018', '2017', '2014']


 67%|████████████████████████████▏             | 59/88 [07:17<04:21,  9.01s/it]

200
200
['1981', '1982']


 68%|████████████████████████████▋             | 60/88 [07:21<03:29,  7.47s/it]

200
200
['1985']


 69%|█████████████████████████████             | 61/88 [07:31<03:42,  8.24s/it]

200
200
['1989', '1992']


 70%|█████████████████████████████▌            | 62/88 [07:35<02:59,  6.91s/it]

200
200
['2012', '2013', '2007', '2006', '2005', '2014', 'Pre2005']


 72%|██████████████████████████████            | 63/88 [07:39<02:30,  6.02s/it]

200
200
['1983', '2002', '2000', '2012', '2004', '2001', '2010', '2003']


 73%|██████████████████████████████▌           | 64/88 [07:51<03:10,  7.95s/it]

200
200
['Pre2013', '2015', '2013', '2016', '2018', '2017', '2014']


 74%|███████████████████████████████           | 65/88 [07:55<02:33,  6.66s/it]

200
200
['2011', '2015', '2012', 'Pre2011', '2013', '2016', '2014']


 75%|███████████████████████████████▌          | 66/88 [07:58<02:07,  5.78s/it]

200
200
['1997', 'Pre1987', '1987', '1995', '1988', '1989', '1991']


 76%|███████████████████████████████▉          | 67/88 [08:07<02:19,  6.64s/it]

200
200
['1984', '1983', '1981', '2004', '2001', '1988', '1982']


 77%|████████████████████████████████▍         | 68/88 [08:27<03:31, 10.58s/it]

200
200
['2009', '2016', '2018', '2017', '2010', '2003', 'Pre2003']


 78%|████████████████████████████████▉         | 69/88 [08:31<02:42,  8.56s/it]

200
200
['2002', '2012', 'Pre2002', '2004', '2003', '2006', '2005']


 80%|█████████████████████████████████▍        | 70/88 [08:36<02:18,  7.70s/it]

200
200
['2002', '2000', '1999', 'Pre1999', '2004', '2001', '2003']


 81%|█████████████████████████████████▉        | 71/88 [08:41<01:53,  6.66s/it]

200
200
['1986', '1983', '1981', '1985', '1995', '1982']


 82%|██████████████████████████████████▎       | 72/88 [08:52<02:08,  8.05s/it]

200
200
['1997', '1998', '2000', '1994', '1995']


 83%|██████████████████████████████████▊       | 73/88 [08:56<01:42,  6.83s/it]

200
200
['2015', '2019', 'Pre2014', '2016', '2017', '2018', '2014']


 84%|███████████████████████████████████▎      | 74/88 [08:59<01:22,  5.89s/it]

200
200
['2015', '2019', 'Pre2014', '2016', '2017', '2018', '2014']


 85%|███████████████████████████████████▊      | 75/88 [09:05<01:13,  5.68s/it]

200
200
['2015', '2004', '2001', '2003', '2005']


 86%|████████████████████████████████████▎     | 76/88 [09:08<01:00,  5.06s/it]

200
200
['2015', '2009', '2007', '2006', '2005', '2014', 'Pre2005']


 88%|████████████████████████████████████▊     | 77/88 [09:17<01:06,  6.06s/it]

200
200
['2013']


 89%|█████████████████████████████████████▏    | 78/88 [09:21<00:54,  5.44s/it]

200
200
['2018', '2015', '2011', '2016']


 90%|█████████████████████████████████████▋    | 79/88 [09:25<00:44,  4.97s/it]

200
200
['2013', '2016']


 91%|██████████████████████████████████████▏   | 80/88 [09:29<00:39,  4.92s/it]

200
200
['2017', '2018', '2002', '2016']


 92%|██████████████████████████████████████▋   | 81/88 [09:33<00:32,  4.58s/it]

200
200
['2013']


 93%|███████████████████████████████████████▏  | 82/88 [09:37<00:25,  4.25s/it]

200
200
['2012', '2008']


 94%|███████████████████████████████████████▌  | 83/88 [09:41<00:22,  4.43s/it]

200
200
['2006']


 95%|████████████████████████████████████████  | 84/88 [09:45<00:16,  4.22s/it]

200
200
['2016']


 97%|████████████████████████████████████████▌ | 85/88 [09:49<00:12,  4.07s/it]

200
200
['2012', '2015', '2001']


 98%|█████████████████████████████████████████ | 86/88 [09:53<00:07,  3.96s/it]

200
200
['2005']


 99%|█████████████████████████████████████████▌| 87/88 [09:57<00:04,  4.18s/it]

200
200
['2017']


100%|██████████████████████████████████████████| 88/88 [10:05<00:00,  5.31s/it]


{'Final Appeal (Civil)': 'https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?EX=T&L1=FA&L2=CV&AR=1#A1',
 'Final Appeal (Criminal)': 'https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?EX=T&L1=FA&L2=CC&AR=2#A2',
 'Miscellaneou Proceeding (Civil)': 'https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?EX=T&L1=FA&L2=MV&AR=4#A4',
 'Miscellaneou Proceeding (Criminal)': 'https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?EX=T&L1=FA&L2=MC&AR=5#A5',
 'Application for Review': 'https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?EX=T&L1=CA&L2=AR&AR=1#A1',
 'Civil Appeal': 'https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?EX=T&L1=CA&L2=CV&AR=3#A3',
 'Criminal Appeal': 'https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?EX=T&L1=CA&L2=CC&AR=4#A4',
 'Admiralty Action': 'https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?EX=T&L1=HC&L2=AJ&AR=1#A1',
 'Application for Grant': 'https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?EX=T&L1=HC&L2=AG&AR=3#A

In [209]:
def subCourt_get_years(sc_cleared, url, df_all, all_courts):
    time.sleep(1)
    root = url_to_root(url)
    info_list = root.find('script', string=re.compile("var myMenu")).text.split(",")
    
    now_set = set([BeautifulSoup(i.strip("' ")).text.strip(" ") for i in info_list if len(re.findall('<a name=', i))!=0])
    years = now_set - all_courts
    return list(years)

courts_set = set(df_all['parent_court']) | set(df_all['sub_court'])
url = list(df_all[df_all['sub_court_cleared']==sub_]['sub_court_url'])[0]
years = subCourt_get_years(sub_, url, df_all, courts_set)
years

200


['2017']

In [210]:
def subCourt_get_urls(sub_court, url, df_all, urls_set):
    
    time.sleep(1)
    root = url_to_root(url)
    info_list = root.find('script', string=re.compile("var myMenu")).text.split(",")
    
    potential_urls = set([i.strip("' ") for i in info_list if i.strip("' ")[:5]=='https'])
    urls = potential_urls - urls_set
    return list(urls)

urls_set = set(df_all['sub_court_url']) | set(df_all['parent_court_url'])
url = list(df_all[df_all['sub_court_cleared']==sub_]['sub_court_url'])[0]
urls = subCourt_get_urls(sub_, url, df_all, urls_set)  
urls

200


['https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?EX=T&L1=WK&L2=CC&L3=2017&AR=11_1#A11_1']

In [211]:
def match_year_urls(years, urls):
    match_dict = {}
    url_pairs = [(re.findall('L3=....', i)[0][-4:], i) for i in urls]
    for y in years:
        for y_url, url in url_pairs:
            if y == y_url:
                match_dict[y] = url
    return match_dict

match_year_urls(years, urls)

{'2017': 'https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?EX=T&L1=WK&L2=CC&L3=2017&AR=11_1#A11_1'}

In [213]:
def get_year_urls(sub_court, url, df_all, courts_set, urls_set):
    # get years
    years = subCourt_get_years(sub_court, url, df_all, courts_set)
    # get urls
    urls = subCourt_get_urls(sub_court, url, df_all, urls_set)  
    # match them
    match_dict = match_year_urls(years, urls)
    # convert to DataFrame
    data = [(k,v) for k, v in match_dict.items()]
    match = pd.DataFrame(data, columns=['year', 'year_url'])
    match['sub_court_cleared'] = sub_court
    return match

# test case 1
courts_set = set(df_all['parent_court']) | set(df_all['sub_court'])
urls_set = set(df_all['sub_court_url']) | set(df_all['parent_court_url'])
url = list(df_all[df_all['sub_court_cleared']==sub_]['sub_court_url'])[0]
df_test = get_year_urls(sub_, url, df_all, courts_set, urls_set)
df_test

200
200


Unnamed: 0,year,year_url,sub_court_cleared
0,2017,https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?EX=T&L1=WK&L2=CC&L3=2017&AR=11_1#A11_1,West Kowloon Magistrate Court Charge Case


In [217]:
# test case2
sub_ = 'Magistracy Appeal'
url = special_courts[sub_]
print(url)
df_test = get_year_urls(sub_, url, df_all, courts_set, urls_set)
df_test

https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?EX=T&L1=HC&L2=MA&AR=23#A23
200
200


Unnamed: 0,year,year_url,sub_court_cleared
0,1986,https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?EX=T&L1=HC&L2=MA&L3=1986&AR=23_34#A23_34,Magistracy Appeal
1,1984,https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?EX=T&L1=HC&L2=MA&L3=1984&AR=23_36#A23_36,Magistracy Appeal
2,1983,https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?EX=T&L1=HC&L2=MA&L3=1983&AR=23_37#A23_37,Magistracy Appeal
3,2011,https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?EX=T&L1=HC&L2=MA&L3=2011&AR=23_9#A23_9,Magistracy Appeal
4,2000,https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?EX=T&L1=HC&L2=MA&L3=2000&AR=23_20#A23_20,Magistracy Appeal
5,2012,https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?EX=T&L1=HC&L2=MA&L3=2012&AR=23_8#A23_8,Magistracy Appeal
6,1998,https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?EX=T&L1=HC&L2=MA&L3=1998&AR=23_22#A23_22,Magistracy Appeal
7,1992,https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?EX=T&L1=HC&L2=MA&L3=1992&AR=23_28#A23_28,Magistracy Appeal
8,2001,https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?EX=T&L1=HC&L2=MA&L3=2001&AR=23_19#A23_19,Magistracy Appeal
9,2007,https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?EX=T&L1=HC&L2=MA&L3=2007&AR=23_13#A23_13,Magistracy Appeal


In [218]:
dfs = []
for sc in tqdm(df_all['sub_court_cleared']):
    print('for sub_court: ',sc)
    # check if special sc
    if sc in special_courts:
        url = special_courts[sc]
    else:
        url = list(df_all[df_all['sub_court_cleared']==sc]['sub_court_url'])[0]
    match = get_year_urls(sc, url, df_all, courts_set, urls_set)
    dfs.append(match)
    
df_years = pd.concat(dfs, axis=0)
df_years.index = range(len(df_years))
print(df_years.shape)
df_years


  0%|                                                   | 0/88 [00:00<?, ?it/s]

for sub_court:  Final Appeal (Civil)
200
200



  1%|▍                                          | 1/88 [00:07<10:35,  7.30s/it]

for sub_court:  Final Appeal (Criminal)
200
200



  2%|▉                                          | 2/88 [00:20<13:04,  9.13s/it]

for sub_court:  Miscellaneou Proceeding
200
200



  3%|█▍                                         | 3/88 [00:34<14:46, 10.42s/it]

for sub_court:  Miscellaneou Proceeding (Civil)
200
200



  5%|█▉                                         | 4/88 [00:39<12:23,  8.85s/it]

for sub_court:  Miscellaneou Proceeding (Criminal)
200
200



  6%|██▍                                        | 5/88 [00:44<10:37,  7.68s/it]

for sub_court:  Application for Review
200
200



  7%|██▉                                        | 6/88 [00:48<08:58,  6.57s/it]

for sub_court:  Attorney General Reference
200
200



  8%|███▍                                       | 7/88 [00:51<07:36,  5.64s/it]

for sub_court:  Civil Appeal
200
200



  9%|███▉                                       | 8/88 [01:14<14:30, 10.88s/it]

for sub_court:  Criminal Appeal
200
200



 10%|████▍                                      | 9/88 [01:18<11:38,  8.84s/it]

for sub_court:  Miscellaneou Proceeding
200
200



 11%|████▊                                     | 10/88 [01:22<09:33,  7.35s/it]

for sub_court:  Reservation of Question of Law
200
200



 12%|█████▎                                    | 11/88 [01:27<08:33,  6.66s/it]

for sub_court:  Admiralty Action
200
200



 14%|█████▋                                    | 12/88 [01:31<07:20,  5.79s/it]

for sub_court:  Adoption Application
200
200



 15%|██████▏                                   | 13/88 [01:35<06:24,  5.12s/it]

for sub_court:  Application for Grant
200
200



 16%|██████▋                                   | 14/88 [01:42<06:58,  5.65s/it]

for sub_court:  Application to et aside a Statutory Demand (under Bankruptcy Ordinance)
200
200



 17%|███████▏                                  | 15/88 [01:46<06:29,  5.34s/it]

for sub_court:  Application under the Mental Health Ordinance
200
200



 18%|███████▋                                  | 16/88 [01:50<05:53,  4.90s/it]

for sub_court:  Bankruptcy Proceeding
200
200



 19%|████████                                  | 17/88 [01:55<05:40,  4.80s/it]

for sub_court:  Caveat
200
200



 20%|████████▌                                 | 18/88 [02:10<09:14,  7.92s/it]

for sub_court:  Citation Application
200
200



 22%|█████████                                 | 19/88 [02:13<07:38,  6.64s/it]

for sub_court:  Civil Action
200
200



 23%|█████████▌                                | 20/88 [02:18<06:39,  5.88s/it]

for sub_court:  Commercial Action
200
200



 24%|██████████                                | 21/88 [02:25<06:58,  6.25s/it]

for sub_court:  Companie Winding-up Proceeding
200
200



 25%|██████████▌                               | 22/88 [02:31<06:49,  6.20s/it]

for sub_court:  Confidential Miscellaneou Proceeding
200
200



 26%|██████████▉                               | 23/88 [02:35<05:55,  5.47s/it]

for sub_court:  Constitutional and Administrative Law Proceeding
200
200



 27%|███████████▍                              | 24/88 [02:41<06:17,  5.89s/it]

for sub_court:  Construction and Arbitration Proceeding
200
200



 28%|███████████▉                              | 25/88 [02:45<05:35,  5.32s/it]

for sub_court:  Criminal Case
200
200



 30%|████████████▍                             | 26/88 [03:01<08:37,  8.35s/it]

for sub_court:  Estate Duty Appeal
200
200



 31%|████████████▉                             | 27/88 [03:07<07:58,  7.84s/it]

for sub_court:  Ex-parte Application
200
200



 32%|█████████████▎                            | 28/88 [03:16<08:11,  8.18s/it]

for sub_court:  High Court Bankruptcy Interim Order
200
200



 33%|█████████████▊                            | 29/88 [03:21<06:54,  7.02s/it]

for sub_court:  Inland Revenue Appeal
200
200



 34%|██████████████▎                           | 30/88 [03:41<10:29, 10.85s/it]

for sub_court:  Intended Action
200
200



 35%|██████████████▊                           | 31/88 [03:44<08:15,  8.69s/it]

for sub_court:  Labour Tribunal Appeal
200
200



 36%|███████████████▎                          | 32/88 [03:48<06:45,  7.25s/it]

for sub_court:  Legal Aid Appeal
200
200



 38%|███████████████▊                          | 33/88 [03:57<07:00,  7.65s/it]

for sub_court:  Magistracy Appeal
200
200



 39%|████████████████▏                         | 34/88 [04:01<06:04,  6.75s/it]

for sub_court:  Matrimonial Cause
200
200



 40%|████████████████▋                         | 35/88 [04:05<05:13,  5.92s/it]

for sub_court:  Minor Employment Claim Appeal
200
200



 41%|█████████████████▏                        | 36/88 [04:21<07:42,  8.90s/it]

for sub_court:  Miscellaneou Proceeding
200
200



 42%|█████████████████▋                        | 37/88 [04:29<07:23,  8.69s/it]

for sub_court:  Miscellaneou Proceeding (Criminal)
200
200



 43%|██████████████████▏                       | 38/88 [04:33<06:04,  7.30s/it]

for sub_court:  Obscene Article Tribunal Appeal
200
200



 44%|██████████████████▌                       | 39/88 [04:37<05:02,  6.18s/it]

for sub_court:  Personal Injurie Action
200
200



 45%|███████████████████                       | 40/88 [04:43<04:47,  5.99s/it]

for sub_court:  Probate Action
200
200



 47%|███████████████████▌                      | 41/88 [04:46<04:12,  5.38s/it]

for sub_court:  Small Claim Tribunal Appeal
200
200



 48%|████████████████████                      | 42/88 [04:55<04:46,  6.22s/it]

for sub_court:  Competition Tribunal Action
200
200



 49%|████████████████████▌                     | 43/88 [04:59<04:12,  5.60s/it]

for sub_court:  Competition Tribunal Enforcement Action
200
200



 50%|█████████████████████                     | 44/88 [05:03<03:53,  5.31s/it]

for sub_court:  Civil Action
200
200



 51%|█████████████████████▍                    | 45/88 [05:13<04:43,  6.60s/it]

for sub_court:  Criminal Case
200
200



 52%|█████████████████████▉                    | 46/88 [05:17<04:04,  5.81s/it]

for sub_court:  Distraint Case
200
200



 53%|██████████████████████▍                   | 47/88 [05:26<04:35,  6.73s/it]

for sub_court:  District Court Tax Claim
200
200



 55%|██████████████████████▉                   | 48/88 [05:33<04:31,  6.80s/it]

for sub_court:  Employee Compensation Case
200
200



 56%|███████████████████████▍                  | 49/88 [05:37<03:50,  5.91s/it]

for sub_court:  Equal Opportunitie Action
200
200



 57%|███████████████████████▊                  | 50/88 [05:47<04:34,  7.21s/it]

for sub_court:  Intended Action
200
200



 58%|████████████████████████▎                 | 51/88 [05:51<03:53,  6.31s/it]

for sub_court:  Miscellaneou Proceeding
200
200



 59%|████████████████████████▊                 | 52/88 [05:55<03:19,  5.54s/it]

for sub_court:  Occupational Deafne (Compensation) Appeal
200
200



 60%|█████████████████████████▎                | 53/88 [05:59<02:54,  4.98s/it]

for sub_court:  Personal Injurie Action
200
200



 61%|█████████████████████████▊                | 54/88 [06:10<03:56,  6.95s/it]

for sub_court:  Stamp Duty Appeal
200
200



 62%|██████████████████████████▎               | 55/88 [06:20<04:18,  7.85s/it]

for sub_court:  Joint application
200
200



 64%|██████████████████████████▋               | 56/88 [06:28<04:12,  7.89s/it]

for sub_court:  Matrimonial Cause
200
200



 65%|███████████████████████████▏              | 57/88 [06:42<04:58,  9.63s/it]

for sub_court:  Miscellaneou Proceeding
200
200



 66%|███████████████████████████▋              | 58/88 [06:48<04:20,  8.69s/it]

for sub_court:  Building Management Application
200
200



 67%|████████████████████████████▏             | 59/88 [06:55<03:54,  8.08s/it]

for sub_court:  Building Ordinance Application
200
200



 68%|████████████████████████████▋             | 60/88 [07:06<04:11,  8.97s/it]

for sub_court:  Demolished Building Appeal
200
200



 69%|█████████████████████████████             | 61/88 [07:22<05:00, 11.11s/it]

for sub_court:  Demolished Building Application
200
200



 70%|█████████████████████████████▌            | 62/88 [07:27<04:04,  9.40s/it]

for sub_court:  Government Rent Appeal
200
200



 72%|██████████████████████████████            | 63/88 [07:37<03:56,  9.46s/it]

for sub_court:  Housing Ordinance Appeal
200
200



 73%|██████████████████████████████▌           | 64/88 [07:46<03:40,  9.19s/it]

for sub_court:  Land Compulsory Sale Application
200
200



 74%|███████████████████████████████           | 65/88 [07:50<02:55,  7.64s/it]

for sub_court:  Land Resumption Application
200
200



 75%|███████████████████████████████▌          | 66/88 [07:56<02:38,  7.23s/it]

for sub_court:  Landlord  Appeal
200
200



 76%|███████████████████████████████▉          | 67/88 [08:00<02:11,  6.25s/it]

for sub_court:  MTR Ordinance Application
200
200



 77%|████████████████████████████████▍         | 68/88 [08:04<01:50,  5.52s/it]

for sub_court:  Miscellaneou Proceeding Application
200
200



 78%|████████████████████████████████▉         | 69/88 [08:08<01:36,  5.10s/it]

for sub_court:  Miscellaneou Reference Application
200
200



 80%|█████████████████████████████████▍        | 70/88 [08:13<01:29,  4.99s/it]

for sub_court:  New Tenancy Application
200
200



 81%|█████████████████████████████████▉        | 71/88 [08:17<01:22,  4.88s/it]

for sub_court:  Part I Possession Application
200
200



 82%|██████████████████████████████████▎       | 72/88 [08:22<01:16,  4.80s/it]

for sub_court:  Part II Possession Application
200
200



 83%|██████████████████████████████████▊       | 73/88 [08:26<01:08,  4.55s/it]

for sub_court:  Part IV Possession Application
200
200



 84%|███████████████████████████████████▎      | 74/88 [08:35<01:24,  6.04s/it]

for sub_court:  Part V Possession Application
200
200



 85%|███████████████████████████████████▊      | 75/88 [08:39<01:11,  5.50s/it]

for sub_court:  Railway Ordinance Application
200
200



 86%|████████████████████████████████████▎     | 76/88 [08:49<01:21,  6.81s/it]

for sub_court:  Rating Appeal
200
200



 88%|████████████████████████████████████▊     | 77/88 [08:53<01:05,  5.99s/it]

for sub_court:  Coroner Court Death Inquest
200
200



 89%|█████████████████████████████████████▏    | 78/88 [08:59<00:57,  5.74s/it]

for sub_court:  Eastern Magistrate Court Charge Case
200
200



 90%|█████████████████████████████████████▋    | 79/88 [09:03<00:49,  5.48s/it]

for sub_court:  Eastern Magistrate Court Summon Case
200
200



 91%|██████████████████████████████████████▏   | 80/88 [09:08<00:41,  5.20s/it]

for sub_court:  Kowloon City Magistrate Court Charge Case
200
200



 92%|██████████████████████████████████████▋   | 81/88 [09:12<00:34,  4.97s/it]

for sub_court:  Kowloon City Magistrates’ Court Summon Case
200
200



 93%|███████████████████████████████████████▏  | 82/88 [09:30<00:53,  8.90s/it]

for sub_court:  Labour Tribunal Claim
200
200



 94%|███████████████████████████████████████▌  | 83/88 [09:38<00:41,  8.37s/it]

for sub_court:  Obscene Article Tribunal Case
200
200



 95%|████████████████████████████████████████  | 84/88 [09:42<00:28,  7.19s/it]

for sub_court:  Shatin Magistrate Court Charge Case
200
200



 97%|████████████████████████████████████████▌ | 85/88 [09:47<00:19,  6.54s/it]

for sub_court:  Small Claim Tribunal Claim
200
200



 98%|█████████████████████████████████████████ | 86/88 [09:51<00:11,  5.76s/it]

for sub_court:  Tuen Mun Magistrate Court Charge Case
200
200



 99%|█████████████████████████████████████████▌| 87/88 [09:55<00:05,  5.20s/it]

for sub_court:  West Kowloon Magistrate Court Charge Case
200
200



100%|██████████████████████████████████████████| 88/88 [09:59<00:00,  4.78s/it]


(1641, 3)


Unnamed: 0,year,year_url,sub_court_cleared
0,2011,https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?EX=T&L1=FA&L2=CV&L3=2011&AR=1_8#A1_8,Final Appeal (Civil)
1,1998,https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?EX=T&L1=FA&L2=CV&L3=1998&AR=1_21#A1_21,Final Appeal (Civil)
2,2012,https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?EX=T&L1=FA&L2=CV&L3=2012&AR=1_7#A1_7,Final Appeal (Civil)
3,2000,https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?EX=T&L1=FA&L2=CV&L3=2000&AR=1_19#A1_19,Final Appeal (Civil)
4,2001,https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?EX=T&L1=FA&L2=CV&L3=2001&AR=1_18#A1_18,Final Appeal (Civil)
5,2007,https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?EX=T&L1=FA&L2=CV&L3=2007&AR=1_12#A1_12,Final Appeal (Civil)
6,2006,https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?EX=T&L1=FA&L2=CV&L3=2006&AR=1_13#A1_13,Final Appeal (Civil)
7,2005,https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?EX=T&L1=FA&L2=CV&L3=2005&AR=1_14#A1_14,Final Appeal (Civil)
8,2014,https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?EX=T&L1=FA&L2=CV&L3=2014&AR=1_5#A1_5,Final Appeal (Civil)
9,2008,https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?EX=T&L1=FA&L2=CV&L3=2008&AR=1_11#A1_11,Final Appeal (Civil)


In [222]:
df_years.head()

Unnamed: 0,year,year_url,sub_court_cleared
0,2011,https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?EX=T&L1=FA&L2=CV&L3=2011&AR=1_8#A1_8,Final Appeal (Civil)
1,1998,https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?EX=T&L1=FA&L2=CV&L3=1998&AR=1_21#A1_21,Final Appeal (Civil)
2,2012,https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?EX=T&L1=FA&L2=CV&L3=2012&AR=1_7#A1_7,Final Appeal (Civil)
3,2000,https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?EX=T&L1=FA&L2=CV&L3=2000&AR=1_19#A1_19,Final Appeal (Civil)
4,2001,https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?EX=T&L1=FA&L2=CV&L3=2001&AR=1_18#A1_18,Final Appeal (Civil)


In [220]:
# store data
with open('data.pickle', 'wb') as ww:
    pickle.dump(df_years, ww)


In [5]:
f = open('data.pickle', 'rb')
df_years = pickle.load(f)

## Hiearchy 4: details

In [13]:
def get_onePage(url, r):
    data_onepage = [] # initialize
    root = url_to_root(url)
    info = root.find('script', string=re.compile("var myMenu")).text.split(",[")
    B_s = [BeautifulSoup(i) for i in info]
    for B in B_s:
        try:
            sub_court_cleared = r['sub_court_cleared']
            year = r['year']
            outcome = B.find('td', valign="top", width="5%").find('img')['src']
            name = B.find('a', class_="ThemeXPRowAnchor").text

            url = B.find('a', class_="ThemeXPRowAnchor")['href']
            url = url.split(',')[0]
            detail_url = url[re.search("https", url).span()[0]:-2]
            date = B.find('font',color="#006633").text
            persons = B.find('td', valign="top", width="55%").text
            data_onepage.append((sub_court_cleared, year, outcome, name, detail_url, date, persons))
        except AttributeError:
            continue
    return data_onepage


In [15]:
def store_onePage(idx, data_onepage):
    with open(str(idx)+'.pickle', 'wb') as ww:
        pickle.dump(data_onepage, ww)
    return 


In [25]:
def get_detail(df, x, y):
    for (idx, r) in tqdm(df.iterrows(), total=len(df)):      
        if idx >= x:
            url = r['year_url'] 
            print('now in row: ', idx)
            data_onepage = get_onePage(url, r)
            _ = store_onePage(idx, data_onepage)
            if idx >=y:
                break
    return      
        
df_detail = get_detail(df_years, 101, 500) # I do that in parallel fashion








  0%|                                                 | 0/1641 [00:00<?, ?it/s]

now in row:  101









  6%|██▎                                   | 102/1641 [00:00<00:13, 112.18it/s]

now in row:  102
now in row:  103
now in row:  104
now in row:  105









  6%|██▌                                    | 106/1641 [00:03<05:54,  4.33it/s]

now in row:  106
now in row:  107
now in row:  108









  7%|██▌                                    | 109/1641 [00:06<10:12,  2.50it/s]

now in row:  109
now in row:  110









  7%|██▋                                    | 111/1641 [00:07<12:56,  1.97it/s]

now in row:  111
now in row:  112









  7%|██▋                                    | 113/1641 [00:09<14:32,  1.75it/s]

now in row:  113









  7%|██▋                                    | 114/1641 [00:10<16:26,  1.55it/s]

now in row:  114









  7%|██▋                                    | 115/1641 [00:10<16:05,  1.58it/s]

now in row:  115









  7%|██▊                                    | 116/1641 [00:11<17:09,  1.48it/s]

now in row:  116









  7%|██▊                                    | 117/1641 [00:12<17:19,  1.47it/s]

now in row:  117









  7%|██▊                                    | 118/1641 [00:12<18:00,  1.41it/s]

now in row:  118









  7%|██▊                                    | 119/1641 [00:13<18:10,  1.40it/s]

now in row:  119









  7%|██▊                                    | 120/1641 [00:14<18:52,  1.34it/s]

now in row:  120









  7%|██▉                                    | 121/1641 [00:15<17:57,  1.41it/s]

now in row:  121









  7%|██▉                                    | 122/1641 [00:15<17:59,  1.41it/s]

now in row:  122









  7%|██▉                                    | 123/1641 [00:16<17:36,  1.44it/s]

now in row:  123









  8%|██▉                                    | 124/1641 [00:17<18:39,  1.36it/s]

now in row:  124









  8%|██▉                                    | 125/1641 [00:18<19:32,  1.29it/s]

now in row:  125









  8%|██▉                                    | 126/1641 [00:18<19:44,  1.28it/s]

now in row:  126









  8%|███                                    | 127/1641 [00:19<20:20,  1.24it/s]

now in row:  127









  8%|███                                    | 128/1641 [00:20<18:38,  1.35it/s]

now in row:  128









  8%|███                                    | 129/1641 [00:21<20:01,  1.26it/s]

now in row:  129









  8%|███                                    | 130/1641 [00:25<48:02,  1.91s/it]

now in row:  130









  8%|███                                    | 131/1641 [00:26<39:06,  1.55s/it]

now in row:  131









  8%|███▏                                   | 132/1641 [00:27<31:49,  1.27s/it]

now in row:  132









  8%|███▏                                   | 133/1641 [00:28<28:55,  1.15s/it]

now in row:  133









  8%|███▏                                   | 134/1641 [00:28<26:38,  1.06s/it]

now in row:  134









  8%|███▏                                   | 135/1641 [00:29<23:59,  1.05it/s]

now in row:  135









  8%|███▏                                   | 136/1641 [00:30<22:06,  1.13it/s]

now in row:  136









  8%|███▎                                   | 137/1641 [00:31<21:12,  1.18it/s]

now in row:  137









  8%|███▎                                   | 138/1641 [00:31<20:17,  1.23it/s]

now in row:  138









  8%|███▎                                   | 139/1641 [00:32<18:41,  1.34it/s]

now in row:  139









  9%|███▎                                   | 140/1641 [00:33<19:16,  1.30it/s]

now in row:  140









  9%|███▎                                   | 141/1641 [00:33<18:55,  1.32it/s]

now in row:  141









  9%|███▎                                   | 142/1641 [00:34<19:36,  1.27it/s]

now in row:  142









  9%|███▍                                   | 143/1641 [00:35<19:35,  1.27it/s]

now in row:  143









  9%|███▍                                   | 144/1641 [00:36<17:49,  1.40it/s]

now in row:  144









  9%|███▍                                   | 145/1641 [00:36<17:09,  1.45it/s]

now in row:  145









  9%|███▍                                   | 146/1641 [00:38<21:59,  1.13it/s]

now in row:  146









  9%|███▍                                   | 147/1641 [00:39<26:35,  1.07s/it]

now in row:  147









  9%|███▌                                   | 148/1641 [00:40<28:37,  1.15s/it]

now in row:  148









  9%|███▌                                   | 149/1641 [00:41<25:24,  1.02s/it]

now in row:  149









  9%|███▌                                   | 150/1641 [00:43<29:15,  1.18s/it]

now in row:  150









  9%|███▌                                   | 151/1641 [00:46<48:13,  1.94s/it]

now in row:  151









  9%|███▌                                   | 152/1641 [00:48<47:08,  1.90s/it]

now in row:  152









  9%|███▋                                   | 153/1641 [00:50<44:35,  1.80s/it]

now in row:  153









  9%|███▋                                   | 154/1641 [00:51<37:44,  1.52s/it]

now in row:  154









  9%|███▋                                   | 155/1641 [00:52<37:27,  1.51s/it]

now in row:  155









 10%|███▋                                   | 156/1641 [00:55<45:21,  1.83s/it]

now in row:  156









 10%|███▋                                   | 157/1641 [00:56<43:39,  1.76s/it]

now in row:  157









 10%|███▊                                   | 158/1641 [00:59<47:00,  1.90s/it]

now in row:  158









 10%|███▌                                 | 159/1641 [01:09<1:50:24,  4.47s/it]

now in row:  159









 10%|███▌                                 | 160/1641 [01:12<1:36:58,  3.93s/it]

now in row:  160









 10%|███▋                                 | 161/1641 [01:13<1:20:55,  3.28s/it]

now in row:  161









 10%|███▋                                 | 162/1641 [01:15<1:09:50,  2.83s/it]

now in row:  162









 10%|███▋                                 | 163/1641 [01:17<1:03:04,  2.56s/it]

now in row:  163









 10%|███▉                                   | 164/1641 [01:19<58:00,  2.36s/it]

now in row:  164









 10%|███▉                                   | 165/1641 [01:21<55:47,  2.27s/it]

now in row:  165









 10%|███▉                                   | 166/1641 [01:23<52:21,  2.13s/it]

now in row:  166









 10%|███▉                                   | 167/1641 [01:24<44:30,  1.81s/it]

now in row:  167









 10%|███▉                                   | 168/1641 [01:25<40:40,  1.66s/it]

now in row:  168









 10%|████                                   | 169/1641 [01:27<40:51,  1.67s/it]

now in row:  169









 10%|████                                   | 170/1641 [01:30<50:19,  2.05s/it]

now in row:  170









 10%|████                                   | 171/1641 [01:32<52:34,  2.15s/it]

now in row:  171









 10%|████                                   | 172/1641 [01:34<49:10,  2.01s/it]

now in row:  172









 11%|████                                   | 173/1641 [01:36<47:44,  1.95s/it]

now in row:  173









 11%|████▏                                  | 174/1641 [01:38<46:20,  1.90s/it]

now in row:  174









 11%|████▏                                  | 175/1641 [01:39<44:41,  1.83s/it]

now in row:  175









 11%|████▏                                  | 176/1641 [01:41<44:44,  1.83s/it]

now in row:  176









 11%|████▏                                  | 177/1641 [01:43<43:41,  1.79s/it]

now in row:  177









 11%|████▏                                  | 178/1641 [01:44<42:38,  1.75s/it]

now in row:  178









 11%|████▎                                  | 179/1641 [01:46<38:26,  1.58s/it]

now in row:  179









 11%|████                                 | 180/1641 [01:53<1:18:23,  3.22s/it]

now in row:  180









 11%|████                                 | 181/1641 [01:56<1:19:26,  3.26s/it]

now in row:  181









 11%|████                                 | 182/1641 [01:58<1:09:28,  2.86s/it]

now in row:  182









 11%|████▎                                  | 183/1641 [01:59<57:02,  2.35s/it]

now in row:  183









 11%|████▎                                  | 184/1641 [02:01<53:47,  2.21s/it]

now in row:  184









 11%|████▍                                  | 185/1641 [02:03<51:27,  2.12s/it]

now in row:  185









 11%|████▍                                  | 186/1641 [02:05<47:55,  1.98s/it]

now in row:  186









 11%|████▍                                  | 187/1641 [02:06<43:38,  1.80s/it]

now in row:  187









 11%|████▍                                  | 188/1641 [02:07<39:49,  1.64s/it]

now in row:  188









 12%|████▍                                  | 189/1641 [02:08<34:46,  1.44s/it]

now in row:  189









 12%|████▌                                  | 190/1641 [02:11<44:03,  1.82s/it]

now in row:  190









 12%|████▌                                  | 191/1641 [02:13<43:28,  1.80s/it]

now in row:  191









 12%|████▌                                  | 192/1641 [02:14<38:49,  1.61s/it]

now in row:  192









 12%|████▌                                  | 193/1641 [02:15<33:31,  1.39s/it]

now in row:  193









 12%|████▌                                  | 194/1641 [02:16<35:17,  1.46s/it]

now in row:  194









 12%|████▋                                  | 195/1641 [02:20<47:56,  1.99s/it]

now in row:  195









 12%|████▋                                  | 196/1641 [02:21<44:08,  1.83s/it]

now in row:  196









 12%|████▋                                  | 197/1641 [02:22<39:36,  1.65s/it]

now in row:  197









 12%|████▋                                  | 198/1641 [02:24<40:09,  1.67s/it]

now in row:  198









 12%|████▋                                  | 199/1641 [02:25<38:56,  1.62s/it]

now in row:  199









 12%|████▊                                  | 200/1641 [02:27<38:32,  1.60s/it]

now in row:  200









 12%|████▊                                  | 201/1641 [02:29<39:37,  1.65s/it]

now in row:  201









 12%|████▊                                  | 202/1641 [02:32<51:04,  2.13s/it]

now in row:  202









 12%|████▊                                  | 203/1641 [02:34<46:48,  1.95s/it]

now in row:  203









 12%|████▊                                  | 204/1641 [02:35<45:19,  1.89s/it]

now in row:  204









 12%|████▊                                  | 205/1641 [02:36<37:03,  1.55s/it]

now in row:  205









 13%|████▉                                  | 206/1641 [02:39<47:10,  1.97s/it]

now in row:  206









 13%|████▉                                  | 207/1641 [02:41<44:49,  1.88s/it]

now in row:  207









 13%|████▉                                  | 208/1641 [02:42<44:40,  1.87s/it]

now in row:  208









 13%|████▉                                  | 209/1641 [02:44<44:17,  1.86s/it]

now in row:  209









 13%|████▋                                | 210/1641 [02:52<1:26:31,  3.63s/it]

now in row:  210









 13%|████▊                                | 211/1641 [02:53<1:07:12,  2.82s/it]

now in row:  211









 13%|████▊                                | 212/1641 [02:55<1:00:18,  2.53s/it]

now in row:  212









 13%|█████                                  | 213/1641 [02:57<54:47,  2.30s/it]

now in row:  213









 13%|█████                                  | 214/1641 [02:58<51:25,  2.16s/it]

now in row:  214









 13%|████▊                                | 215/1641 [03:11<2:06:11,  5.31s/it]

now in row:  215









 13%|████▊                                | 216/1641 [03:13<1:41:17,  4.27s/it]

now in row:  216









 13%|████▉                                | 217/1641 [03:15<1:24:16,  3.55s/it]

now in row:  217









 13%|████▉                                | 218/1641 [03:17<1:14:04,  3.12s/it]

now in row:  218









 13%|████▉                                | 219/1641 [03:19<1:06:04,  2.79s/it]

now in row:  219









 13%|█████▏                                 | 220/1641 [03:20<51:23,  2.17s/it]

now in row:  220









 13%|█████▎                                 | 221/1641 [03:22<50:36,  2.14s/it]

now in row:  221









 14%|█████▎                                 | 222/1641 [03:23<46:47,  1.98s/it]

now in row:  222









 14%|█████▎                                 | 223/1641 [03:26<54:38,  2.31s/it]

now in row:  223









 14%|█████▎                                 | 224/1641 [03:28<51:39,  2.19s/it]

now in row:  224









 14%|█████▎                                 | 225/1641 [03:30<50:32,  2.14s/it]

now in row:  225









 14%|█████▎                                 | 226/1641 [03:34<58:03,  2.46s/it]

now in row:  226









 14%|█████▍                                 | 227/1641 [03:35<53:17,  2.26s/it]

now in row:  227









 14%|█████▏                               | 228/1641 [03:42<1:24:57,  3.61s/it]

now in row:  228









 14%|█████▏                               | 229/1641 [03:44<1:14:02,  3.15s/it]

now in row:  229









 14%|█████▏                               | 230/1641 [03:47<1:13:08,  3.11s/it]

now in row:  230









 14%|█████▏                               | 231/1641 [03:49<1:06:20,  2.82s/it]

now in row:  231









 14%|█████▏                               | 232/1641 [04:02<2:13:36,  5.69s/it]

now in row:  232









 14%|█████▎                               | 233/1641 [04:04<1:46:24,  4.53s/it]

now in row:  233









 14%|█████▎                               | 234/1641 [04:06<1:29:20,  3.81s/it]

now in row:  234









 14%|█████▎                               | 235/1641 [04:08<1:19:13,  3.38s/it]

now in row:  235









 14%|█████▎                               | 236/1641 [04:18<2:04:52,  5.33s/it]

now in row:  236









 14%|█████▎                               | 237/1641 [04:20<1:39:53,  4.27s/it]

now in row:  237









 15%|█████▎                               | 238/1641 [04:21<1:15:42,  3.24s/it]

now in row:  238









 15%|█████▍                               | 239/1641 [04:23<1:07:41,  2.90s/it]

now in row:  239









 15%|█████▋                                 | 240/1641 [04:24<59:27,  2.55s/it]

now in row:  240









 15%|█████▋                                 | 241/1641 [04:26<52:51,  2.27s/it]

now in row:  241









 15%|█████▊                                 | 242/1641 [04:28<49:08,  2.11s/it]

now in row:  242









 15%|█████▊                                 | 243/1641 [04:30<48:33,  2.08s/it]

now in row:  243









 15%|█████▊                                 | 244/1641 [04:32<45:51,  1.97s/it]

now in row:  244









 15%|█████▊                                 | 245/1641 [04:34<47:28,  2.04s/it]

now in row:  245









 15%|█████▊                                 | 246/1641 [04:35<38:51,  1.67s/it]

now in row:  246









 15%|█████▊                                 | 247/1641 [04:36<40:05,  1.73s/it]

now in row:  247









 15%|█████▉                                 | 248/1641 [04:38<38:38,  1.66s/it]

now in row:  248









 15%|█████▉                                 | 249/1641 [04:41<50:04,  2.16s/it]

now in row:  249









 15%|█████▉                                 | 250/1641 [04:43<48:48,  2.11s/it]

now in row:  250









 15%|█████▉                                 | 251/1641 [04:45<47:38,  2.06s/it]

now in row:  251









 15%|█████▉                                 | 252/1641 [04:46<38:25,  1.66s/it]

now in row:  252









 15%|██████                                 | 253/1641 [04:48<40:02,  1.73s/it]

now in row:  253









 15%|██████                                 | 254/1641 [04:49<38:32,  1.67s/it]

now in row:  254









 16%|██████                                 | 255/1641 [04:51<40:10,  1.74s/it]

now in row:  255









 16%|██████                                 | 256/1641 [04:53<42:23,  1.84s/it]

now in row:  256









 16%|██████                                 | 257/1641 [04:56<49:31,  2.15s/it]

now in row:  257









 16%|██████▏                                | 258/1641 [04:58<46:41,  2.03s/it]

now in row:  258









 16%|██████▏                                | 259/1641 [05:00<45:06,  1.96s/it]

now in row:  259









 16%|██████▏                                | 260/1641 [05:01<43:08,  1.87s/it]

now in row:  260









 16%|██████▏                                | 261/1641 [05:04<46:55,  2.04s/it]

now in row:  261









 16%|██████▏                                | 262/1641 [05:05<38:32,  1.68s/it]

now in row:  262









 16%|██████▎                                | 263/1641 [05:05<32:47,  1.43s/it]

now in row:  263









 16%|██████▎                                | 264/1641 [05:06<28:20,  1.23s/it]

now in row:  264









 16%|██████▎                                | 265/1641 [05:07<27:32,  1.20s/it]

now in row:  265









 16%|██████▎                                | 266/1641 [05:08<24:49,  1.08s/it]

now in row:  266









 16%|██████▎                                | 267/1641 [05:09<23:07,  1.01s/it]

now in row:  267









 16%|██████▎                                | 268/1641 [05:10<21:02,  1.09it/s]

now in row:  268









 16%|██████▍                                | 269/1641 [05:10<19:38,  1.16it/s]

now in row:  269









 16%|██████▍                                | 270/1641 [05:11<20:35,  1.11it/s]

now in row:  270









 17%|██████▍                                | 271/1641 [05:12<21:33,  1.06it/s]

now in row:  271









 17%|██████▍                                | 272/1641 [05:14<23:05,  1.01s/it]

now in row:  272









 17%|██████▍                                | 273/1641 [05:15<22:20,  1.02it/s]

now in row:  273









 17%|██████▌                                | 274/1641 [05:16<23:44,  1.04s/it]

now in row:  274









 17%|██████▌                                | 275/1641 [05:17<23:44,  1.04s/it]

now in row:  275









 17%|██████▌                                | 276/1641 [05:18<23:43,  1.04s/it]

now in row:  276









 17%|██████▌                                | 277/1641 [05:19<23:04,  1.01s/it]

now in row:  277









 17%|██████▌                                | 278/1641 [05:20<23:28,  1.03s/it]

now in row:  278









 17%|██████▋                                | 279/1641 [05:21<25:40,  1.13s/it]

now in row:  279









 17%|██████▋                                | 280/1641 [05:22<25:18,  1.12s/it]

now in row:  280









 17%|██████▋                                | 281/1641 [05:24<27:21,  1.21s/it]

now in row:  281









 17%|██████▋                                | 282/1641 [05:25<26:46,  1.18s/it]

now in row:  282









 17%|██████▋                                | 283/1641 [05:26<24:20,  1.08s/it]

now in row:  283









 17%|██████▋                                | 284/1641 [05:27<23:08,  1.02s/it]

now in row:  284









 17%|██████▊                                | 285/1641 [05:28<22:25,  1.01it/s]

now in row:  285









 17%|██████▊                                | 286/1641 [05:28<22:23,  1.01it/s]

now in row:  286









 17%|██████▊                                | 287/1641 [05:30<23:09,  1.03s/it]

now in row:  287









 18%|██████▍                              | 288/1641 [05:36<1:01:01,  2.71s/it]

now in row:  288









 18%|██████▊                                | 289/1641 [05:37<50:15,  2.23s/it]

now in row:  289









 18%|██████▉                                | 290/1641 [05:38<42:26,  1.88s/it]

now in row:  290









 18%|██████▉                                | 291/1641 [05:39<36:05,  1.60s/it]

now in row:  291









 18%|██████▉                                | 292/1641 [05:40<31:35,  1.40s/it]

now in row:  292









 18%|██████▉                                | 293/1641 [05:41<29:50,  1.33s/it]

now in row:  293









 18%|██████▉                                | 294/1641 [05:43<28:27,  1.27s/it]

now in row:  294









 18%|███████                                | 295/1641 [05:44<27:12,  1.21s/it]

now in row:  295









 18%|███████                                | 296/1641 [05:45<26:15,  1.17s/it]

now in row:  296









 18%|███████                                | 297/1641 [05:46<24:06,  1.08s/it]

now in row:  297









 18%|███████                                | 298/1641 [05:47<24:24,  1.09s/it]

now in row:  298









 18%|███████                                | 299/1641 [05:48<28:09,  1.26s/it]

now in row:  299









 18%|██████▊                              | 300/1641 [05:57<1:17:13,  3.46s/it]

now in row:  300









 18%|██████▊                              | 301/1641 [05:58<1:01:26,  2.75s/it]

now in row:  301









 18%|███████▏                               | 302/1641 [05:59<49:46,  2.23s/it]

now in row:  302









 18%|███████▏                               | 303/1641 [06:00<41:05,  1.84s/it]

now in row:  303









 19%|███████▏                               | 304/1641 [06:01<37:31,  1.68s/it]

now in row:  304









 19%|███████▏                               | 305/1641 [06:02<31:52,  1.43s/it]

now in row:  305









 19%|███████▎                               | 306/1641 [06:03<29:35,  1.33s/it]

now in row:  306









 19%|███████▎                               | 307/1641 [06:04<27:21,  1.23s/it]

now in row:  307









 19%|███████▎                               | 308/1641 [06:05<25:04,  1.13s/it]

now in row:  308









 19%|███████▎                               | 309/1641 [06:06<24:38,  1.11s/it]

now in row:  309









 19%|███████▎                               | 310/1641 [06:07<25:26,  1.15s/it]

now in row:  310









 19%|███████▍                               | 311/1641 [06:08<22:59,  1.04s/it]

now in row:  311









 19%|███████▍                               | 312/1641 [06:09<23:20,  1.05s/it]

now in row:  312









 19%|███████▍                               | 313/1641 [06:11<28:52,  1.30s/it]

now in row:  313









 19%|███████▍                               | 314/1641 [06:13<29:24,  1.33s/it]

now in row:  314









 19%|███████▍                               | 315/1641 [06:13<25:51,  1.17s/it]

now in row:  315









 19%|███████▌                               | 316/1641 [06:14<23:58,  1.09s/it]

now in row:  316









 19%|███████▌                               | 317/1641 [06:15<22:34,  1.02s/it]

now in row:  317









 19%|███████▌                               | 318/1641 [06:16<22:36,  1.03s/it]

now in row:  318









 19%|███████▌                               | 319/1641 [06:17<21:04,  1.05it/s]

now in row:  319









 20%|███████▌                               | 320/1641 [06:18<19:41,  1.12it/s]

now in row:  320









 20%|███████▋                               | 321/1641 [06:19<18:56,  1.16it/s]

now in row:  321









 20%|███████▋                               | 322/1641 [06:20<19:50,  1.11it/s]

now in row:  322









 20%|███████▋                               | 323/1641 [06:20<19:44,  1.11it/s]

now in row:  323









 20%|███████▋                               | 324/1641 [06:22<21:05,  1.04it/s]

now in row:  324









 20%|███████▋                               | 325/1641 [06:22<20:25,  1.07it/s]

now in row:  325









 20%|███████▋                               | 326/1641 [06:23<19:37,  1.12it/s]

now in row:  326









 20%|███████▊                               | 327/1641 [06:24<19:06,  1.15it/s]

now in row:  327









 20%|███████▊                               | 328/1641 [06:25<19:25,  1.13it/s]

now in row:  328









 20%|███████▊                               | 329/1641 [06:26<19:48,  1.10it/s]

now in row:  329









 20%|███████▊                               | 330/1641 [06:27<19:22,  1.13it/s]

now in row:  330









 20%|███████▊                               | 331/1641 [06:28<18:52,  1.16it/s]

now in row:  331









 20%|███████▉                               | 332/1641 [06:28<18:18,  1.19it/s]

now in row:  332









 20%|███████▉                               | 333/1641 [06:29<19:19,  1.13it/s]

now in row:  333









 20%|███████▉                               | 334/1641 [06:30<18:58,  1.15it/s]

now in row:  334









 20%|███████▉                               | 335/1641 [06:31<19:17,  1.13it/s]

now in row:  335









 20%|███████▉                               | 336/1641 [06:32<18:53,  1.15it/s]

now in row:  336









 21%|████████                               | 337/1641 [06:33<19:04,  1.14it/s]

now in row:  337









 21%|████████                               | 338/1641 [06:34<19:54,  1.09it/s]

now in row:  338









 21%|████████                               | 339/1641 [06:35<19:17,  1.12it/s]

now in row:  339









 21%|████████                               | 340/1641 [06:35<18:52,  1.15it/s]

now in row:  340









 21%|████████                               | 341/1641 [06:36<17:40,  1.23it/s]

now in row:  341









 21%|████████▏                              | 342/1641 [06:38<27:30,  1.27s/it]

now in row:  342









 21%|███████▋                             | 343/1641 [06:46<1:07:12,  3.11s/it]

now in row:  343









 21%|████████▏                              | 344/1641 [06:47<55:27,  2.57s/it]

now in row:  344









 21%|████████▏                              | 345/1641 [06:48<44:50,  2.08s/it]

now in row:  345









 21%|████████▏                              | 346/1641 [06:49<36:34,  1.69s/it]

now in row:  346









 21%|████████▏                              | 347/1641 [06:50<33:19,  1.54s/it]

now in row:  347









 21%|████████▎                              | 348/1641 [06:51<29:25,  1.37s/it]

now in row:  348









 21%|████████▎                              | 349/1641 [06:52<27:17,  1.27s/it]

now in row:  349









 21%|████████▎                              | 350/1641 [06:53<24:45,  1.15s/it]

now in row:  350









 21%|████████▎                              | 351/1641 [06:54<24:51,  1.16s/it]

now in row:  351









 21%|████████▎                              | 352/1641 [06:55<23:34,  1.10s/it]

now in row:  352









 22%|████████▍                              | 353/1641 [06:56<21:21,  1.01it/s]

now in row:  353









 22%|████████▍                              | 354/1641 [06:57<22:07,  1.03s/it]

now in row:  354









 22%|████████▍                              | 355/1641 [06:58<21:29,  1.00s/it]

now in row:  355









 22%|████████▍                              | 356/1641 [06:59<20:02,  1.07it/s]

now in row:  356









 22%|████████▍                              | 357/1641 [07:00<19:47,  1.08it/s]

now in row:  357









 22%|████████▌                              | 358/1641 [07:00<19:19,  1.11it/s]

now in row:  358









 22%|████████▌                              | 359/1641 [07:02<20:41,  1.03it/s]

now in row:  359









 22%|████████▌                              | 360/1641 [07:02<20:05,  1.06it/s]

now in row:  360









 22%|████████▌                              | 361/1641 [07:04<21:32,  1.01s/it]

now in row:  361









 22%|████████▌                              | 362/1641 [07:04<20:17,  1.05it/s]

now in row:  362









 22%|████████▋                              | 363/1641 [07:05<18:52,  1.13it/s]

now in row:  363









 22%|████████▋                              | 364/1641 [07:06<19:20,  1.10it/s]

now in row:  364









 22%|████████▋                              | 365/1641 [07:07<18:56,  1.12it/s]

now in row:  365









 22%|████████▋                              | 366/1641 [07:08<19:10,  1.11it/s]

now in row:  366









 22%|████████▋                              | 367/1641 [07:10<26:45,  1.26s/it]

now in row:  367









 22%|████████▋                              | 368/1641 [07:11<24:26,  1.15s/it]

now in row:  368









 22%|████████▊                              | 369/1641 [07:12<23:41,  1.12s/it]

now in row:  369









 23%|████████▊                              | 370/1641 [07:13<23:01,  1.09s/it]

now in row:  370









 23%|████████▊                              | 371/1641 [07:15<26:29,  1.25s/it]

now in row:  371









 23%|████████▊                              | 372/1641 [07:16<27:12,  1.29s/it]

now in row:  372









 23%|████████▊                              | 373/1641 [07:19<36:19,  1.72s/it]

now in row:  373









 23%|████████▉                              | 374/1641 [07:20<35:27,  1.68s/it]

now in row:  374









 23%|████████▉                              | 375/1641 [07:21<31:53,  1.51s/it]

now in row:  375









 23%|████████▉                              | 376/1641 [07:23<33:23,  1.58s/it]

now in row:  376









 23%|████████▉                              | 377/1641 [07:25<37:58,  1.80s/it]

now in row:  377









 23%|████████▉                              | 378/1641 [07:28<44:20,  2.11s/it]

now in row:  378









 23%|█████████                              | 379/1641 [07:30<43:41,  2.08s/it]

now in row:  379









 23%|█████████                              | 380/1641 [07:32<42:29,  2.02s/it]

now in row:  380









 23%|█████████                              | 381/1641 [07:34<40:13,  1.92s/it]

now in row:  381









 23%|█████████                              | 382/1641 [07:35<37:00,  1.76s/it]

now in row:  382









 23%|█████████                              | 383/1641 [07:36<31:02,  1.48s/it]

now in row:  383









 23%|█████████▏                             | 384/1641 [07:38<32:08,  1.53s/it]

now in row:  384









 23%|█████████▏                             | 385/1641 [07:40<37:29,  1.79s/it]

now in row:  385









 24%|█████████▏                             | 386/1641 [07:42<35:15,  1.69s/it]

now in row:  386









 24%|█████████▏                             | 387/1641 [07:43<30:58,  1.48s/it]

now in row:  387









 24%|█████████▏                             | 388/1641 [07:44<32:02,  1.53s/it]

now in row:  388









 24%|█████████▏                             | 389/1641 [07:46<31:36,  1.51s/it]

now in row:  389









 24%|█████████▎                             | 390/1641 [07:48<35:01,  1.68s/it]

now in row:  390









 24%|█████████▎                             | 391/1641 [07:49<33:11,  1.59s/it]

now in row:  391









 24%|█████████▎                             | 392/1641 [07:50<27:55,  1.34s/it]

now in row:  392









 24%|█████████▎                             | 393/1641 [07:51<25:21,  1.22s/it]

now in row:  393









 24%|█████████▎                             | 394/1641 [07:53<29:37,  1.43s/it]

now in row:  394









 24%|█████████▍                             | 395/1641 [07:54<26:19,  1.27s/it]

now in row:  395









 24%|█████████▍                             | 396/1641 [07:55<25:12,  1.22s/it]

now in row:  396









 24%|█████████▍                             | 397/1641 [07:56<27:44,  1.34s/it]

now in row:  397









 24%|█████████▍                             | 398/1641 [07:58<30:32,  1.47s/it]

now in row:  398









 24%|█████████▍                             | 399/1641 [08:00<30:11,  1.46s/it]

now in row:  399









 24%|█████████▌                             | 400/1641 [08:01<31:12,  1.51s/it]

now in row:  400









 24%|█████████▌                             | 401/1641 [08:02<28:47,  1.39s/it]

now in row:  401









 24%|█████████▌                             | 402/1641 [08:03<24:58,  1.21s/it]

now in row:  402









 25%|█████████▌                             | 403/1641 [08:04<22:24,  1.09s/it]

now in row:  403









 25%|█████████▌                             | 404/1641 [08:05<23:11,  1.13s/it]

now in row:  404









 25%|█████████▋                             | 405/1641 [08:06<21:49,  1.06s/it]

now in row:  405









 25%|█████████▋                             | 406/1641 [08:07<23:46,  1.15s/it]

now in row:  406









 25%|█████████▋                             | 407/1641 [08:09<24:19,  1.18s/it]

now in row:  407









 25%|█████████▋                             | 408/1641 [08:10<22:57,  1.12s/it]

now in row:  408









 25%|█████████▋                             | 409/1641 [08:11<26:49,  1.31s/it]

now in row:  409









 25%|█████████▋                             | 410/1641 [08:13<29:32,  1.44s/it]

now in row:  410









 25%|█████████▊                             | 411/1641 [08:15<29:58,  1.46s/it]

now in row:  411









 25%|█████████▊                             | 412/1641 [08:16<28:35,  1.40s/it]

now in row:  412









 25%|█████████▊                             | 413/1641 [08:17<24:53,  1.22s/it]

now in row:  413









 25%|█████████▊                             | 414/1641 [08:17<21:48,  1.07s/it]

now in row:  414









 25%|█████████▊                             | 415/1641 [08:18<20:07,  1.02it/s]

now in row:  415









 25%|█████████▉                             | 416/1641 [08:19<20:46,  1.02s/it]

now in row:  416









 25%|█████████▉                             | 417/1641 [08:20<19:41,  1.04it/s]

now in row:  417









 25%|█████████▉                             | 418/1641 [08:22<24:00,  1.18s/it]

now in row:  418









 26%|█████████▉                             | 419/1641 [08:23<23:29,  1.15s/it]

now in row:  419









 26%|█████████▉                             | 420/1641 [08:24<21:19,  1.05s/it]

now in row:  420









 26%|██████████                             | 421/1641 [08:24<19:45,  1.03it/s]

now in row:  421









 26%|██████████                             | 422/1641 [08:25<18:24,  1.10it/s]

now in row:  422









 26%|██████████                             | 423/1641 [08:26<18:44,  1.08it/s]

now in row:  423









 26%|██████████                             | 424/1641 [08:27<17:39,  1.15it/s]

now in row:  424









 26%|██████████                             | 425/1641 [08:28<16:53,  1.20it/s]

now in row:  425









 26%|██████████                             | 426/1641 [08:29<16:53,  1.20it/s]

now in row:  426









 26%|██████████▏                            | 427/1641 [08:29<17:23,  1.16it/s]

now in row:  427









 26%|██████████▏                            | 428/1641 [08:30<17:29,  1.16it/s]

now in row:  428









 26%|██████████▏                            | 429/1641 [08:31<17:03,  1.18it/s]

now in row:  429









 26%|██████████▏                            | 430/1641 [08:32<16:34,  1.22it/s]

now in row:  430









 26%|██████████▏                            | 431/1641 [08:34<24:32,  1.22s/it]

now in row:  431









 26%|██████████▎                            | 432/1641 [08:36<31:29,  1.56s/it]

now in row:  432









 26%|██████████▎                            | 433/1641 [08:38<32:52,  1.63s/it]

now in row:  433









 26%|██████████▎                            | 434/1641 [08:39<29:30,  1.47s/it]

now in row:  434









 27%|██████████▎                            | 435/1641 [08:41<32:45,  1.63s/it]

now in row:  435









 27%|██████████▎                            | 436/1641 [08:43<34:18,  1.71s/it]

now in row:  436









 27%|██████████▍                            | 437/1641 [08:45<34:36,  1.72s/it]

now in row:  437









 27%|██████████▍                            | 438/1641 [08:47<35:00,  1.75s/it]

now in row:  438









 27%|██████████▍                            | 439/1641 [08:48<33:10,  1.66s/it]

now in row:  439









 27%|██████████▍                            | 440/1641 [08:50<35:03,  1.75s/it]

now in row:  440









 27%|█████████▉                           | 441/1641 [09:00<1:24:40,  4.23s/it]

now in row:  441









 27%|█████████▉                           | 442/1641 [09:02<1:09:18,  3.47s/it]

now in row:  442









 27%|█████████▉                           | 443/1641 [09:04<1:01:32,  3.08s/it]

now in row:  443









 27%|██████████▌                            | 444/1641 [09:06<54:44,  2.74s/it]

now in row:  444









 27%|██████████▌                            | 445/1641 [09:07<44:38,  2.24s/it]

now in row:  445









 27%|██████████                           | 446/1641 [09:15<1:17:44,  3.90s/it]

now in row:  446









 27%|██████████                           | 447/1641 [09:17<1:06:11,  3.33s/it]

now in row:  447









 27%|██████████▋                            | 448/1641 [09:19<57:08,  2.87s/it]

now in row:  448









 27%|██████████▋                            | 449/1641 [09:20<50:11,  2.53s/it]

now in row:  449









 27%|██████████▋                            | 450/1641 [09:22<42:56,  2.16s/it]

now in row:  450









 27%|██████████▋                            | 451/1641 [09:24<40:59,  2.07s/it]

now in row:  451









 28%|██████████▋                            | 452/1641 [09:25<39:45,  2.01s/it]

now in row:  452









 28%|██████████▊                            | 453/1641 [09:27<38:35,  1.95s/it]

now in row:  453









 28%|██████████▊                            | 454/1641 [09:29<39:16,  1.99s/it]

now in row:  454









 28%|██████████▊                            | 455/1641 [09:31<37:37,  1.90s/it]

now in row:  455









 28%|██████████▊                            | 456/1641 [09:33<38:09,  1.93s/it]

now in row:  456









 28%|██████████▊                            | 457/1641 [09:35<37:03,  1.88s/it]

now in row:  457









 28%|██████████▉                            | 458/1641 [09:36<34:27,  1.75s/it]

now in row:  458









 28%|██████████▉                            | 459/1641 [09:39<41:03,  2.08s/it]

now in row:  459









 28%|██████████▉                            | 460/1641 [09:41<39:04,  1.99s/it]

now in row:  460









 28%|██████████▉                            | 461/1641 [09:43<39:08,  1.99s/it]

now in row:  461









 28%|██████████▉                            | 462/1641 [09:45<41:14,  2.10s/it]

now in row:  462









 28%|███████████                            | 463/1641 [09:47<40:56,  2.09s/it]

now in row:  463









 28%|███████████                            | 464/1641 [09:49<38:57,  1.99s/it]

now in row:  464









 28%|███████████                            | 465/1641 [09:51<38:39,  1.97s/it]

now in row:  465









 28%|███████████                            | 466/1641 [09:53<37:53,  1.94s/it]

now in row:  466









 28%|███████████                            | 467/1641 [09:54<36:00,  1.84s/it]

now in row:  467









 29%|███████████                            | 468/1641 [09:55<31:02,  1.59s/it]

now in row:  468









 29%|███████████▏                           | 469/1641 [09:57<31:36,  1.62s/it]

now in row:  469









 29%|███████████▏                           | 470/1641 [09:59<33:17,  1.71s/it]

now in row:  470









 29%|███████████▏                           | 471/1641 [10:04<53:55,  2.77s/it]

now in row:  471









 29%|███████████▏                           | 472/1641 [10:06<49:08,  2.52s/it]

now in row:  472









 29%|███████████▏                           | 473/1641 [10:08<45:35,  2.34s/it]

now in row:  473









 29%|███████████▎                           | 474/1641 [10:10<44:47,  2.30s/it]

now in row:  474









 29%|██████████▋                          | 475/1641 [10:16<1:04:19,  3.31s/it]

now in row:  475









 29%|███████████▎                           | 476/1641 [10:18<57:20,  2.95s/it]

now in row:  476









 29%|███████████▎                           | 477/1641 [10:20<51:17,  2.64s/it]

now in row:  477









 29%|███████████▎                           | 478/1641 [10:21<44:24,  2.29s/it]

now in row:  478









 29%|███████████▍                           | 479/1641 [10:23<42:12,  2.18s/it]

now in row:  479









 29%|███████████▍                           | 480/1641 [10:25<40:01,  2.07s/it]

now in row:  480









 29%|███████████▍                           | 481/1641 [10:26<34:24,  1.78s/it]

now in row:  481









 29%|███████████▍                           | 482/1641 [10:28<33:18,  1.72s/it]

now in row:  482









 29%|███████████▍                           | 483/1641 [10:29<28:52,  1.50s/it]

now in row:  483









 29%|███████████▌                           | 484/1641 [10:31<32:04,  1.66s/it]

now in row:  484









 30%|███████████▌                           | 485/1641 [10:33<35:07,  1.82s/it]

now in row:  485









 30%|███████████▌                           | 486/1641 [10:35<35:58,  1.87s/it]

now in row:  486









 30%|███████████▌                           | 487/1641 [10:36<30:24,  1.58s/it]

now in row:  487









 30%|███████████▌                           | 488/1641 [10:38<35:34,  1.85s/it]

now in row:  488









 30%|███████████▌                           | 489/1641 [10:41<37:27,  1.95s/it]

now in row:  489









 30%|███████████▋                           | 490/1641 [10:43<37:28,  1.95s/it]

now in row:  490









 30%|███████████                          | 491/1641 [10:50<1:09:01,  3.60s/it]

now in row:  491









 30%|███████████▋                           | 492/1641 [10:52<57:47,  3.02s/it]

now in row:  492









 30%|███████████▋                           | 493/1641 [10:53<45:34,  2.38s/it]

now in row:  493









 30%|███████████▋                           | 494/1641 [10:54<38:52,  2.03s/it]

now in row:  494









 30%|███████████▊                           | 495/1641 [10:57<42:43,  2.24s/it]

now in row:  495









 30%|███████████▊                           | 496/1641 [10:58<38:25,  2.01s/it]

now in row:  496









 30%|███████████▊                           | 497/1641 [11:00<37:33,  1.97s/it]

now in row:  497









 30%|███████████▊                           | 498/1641 [11:01<32:47,  1.72s/it]

now in row:  498









 30%|███████████▊                           | 499/1641 [11:02<29:17,  1.54s/it]

now in row:  499









 30%|███████████▉                           | 500/1641 [11:03<27:53,  1.47s/it]

now in row:  500


In [26]:
def collect_data(length):
    data_ = []
    for i in range(length):
        f = open(str(i)+'.pickle', 'rb')
        one = pickle.load(f)
        data_.extend(one)
    return data_

length = len(df_years)
data_ = collect_data(length)
data_

[('Final Appeal (Civil)',
  '2011',
  "\\'https://legalref.judiciary.hk/lrs/images/ThemeXP/Epage.gif\\'",
  'FACV1/2011',
  'https://legalref.judiciary.hk/lrs/common/ju/ju_frame.jsp?DIS=79800',
  '(06/01/2012)',
  'THE CATHOLIC DIOCESE OF HONG KONG ALSO KNOWN AS THE BISHOP OF THE ROMAN CATHOLIC CHURCH IN HONG KONG INCORPORATION  v. SECRETARY FOR JUSTICE \xa0'),
 ('Final Appeal (Civil)',
  '2011',
  "\\'https://legalref.judiciary.hk/lrs/images/ThemeXP/Epage.gif\\'",
  'FACV1/2011',
  'https://legalref.judiciary.hk/lrs/common/ju/ju_frame.jsp?DIS=78563',
  '(13/10/2011)',
  'THE CATHOLIC DIOCESE OF HONG KONG ALSO KNOWN AS THE BISHOP OF THE ROMAN CATHOLIC CHURCH IN HONG KONG INCORPORATION v. SECRETARY FOR JUSTICE \xa0Reported in :(2011) 14 HKCFAR 754'),
 ('Final Appeal (Civil)',
  '2011',
  "\\'https://legalref.judiciary.hk/lrs/images/ThemeXP/ETpage.gif\\'",
  'FACV2/2011',
  'https://legalref.judiciary.hk/lrs/common/ju/ju_frame.jsp?DIS=79162',
  '(23/11/2011)',
  'WEALTH DUKE LTD AND OTHE

In [27]:
def get_dataframe(data_):
    df = pd.DataFrame(data_, columns=['sub_court_cleared', 'year', 'outcome', \
                                      'name', 'detail_url', 'date', 'persons'])
    return df
df_details = get_dataframe(data_)
print(df_details.shape)
df_details.head(3)

(28967, 7)


Unnamed: 0,sub_court_cleared,year,outcome,name,detail_url,date,persons
0,Final Appeal (Civil),2011,\'https://legalref.judiciary.hk/lrs/images/ThemeXP/Epage.gif\',FACV1/2011,https://legalref.judiciary.hk/lrs/common/ju/ju_frame.jsp?DIS=79800,(06/01/2012),THE CATHOLIC DIOCESE OF HONG KONG ALSO KNOWN AS THE BISHOP OF THE ROMAN CATHOLIC CHURCH IN HONG KONG INCORPORATION v. SECRETARY FOR JUSTICE
1,Final Appeal (Civil),2011,\'https://legalref.judiciary.hk/lrs/images/ThemeXP/Epage.gif\',FACV1/2011,https://legalref.judiciary.hk/lrs/common/ju/ju_frame.jsp?DIS=78563,(13/10/2011),THE CATHOLIC DIOCESE OF HONG KONG ALSO KNOWN AS THE BISHOP OF THE ROMAN CATHOLIC CHURCH IN HONG KONG INCORPORATION v. SECRETARY FOR JUSTICE Reported in :(2011) 14 HKCFAR 754
2,Final Appeal (Civil),2011,\'https://legalref.judiciary.hk/lrs/images/ThemeXP/ETpage.gif\',FACV2/2011,https://legalref.judiciary.hk/lrs/common/ju/ju_frame.jsp?DIS=79162,(23/11/2011),WEALTH DUKE LTD AND OTHERS v. BANK OF CHINA (HONG KONG) LTD Reported in :(2011) 14 HKCFAR 863


In [43]:
with open('details.pickle', 'wb') as ww:
    pickle.dump(df_details, ww)

In [7]:
f = open('details.pickle', 'rb')
df_details = pickle.load(f)

## combine all

In [8]:
df_years.dtypes

year                 object
year_url             object
sub_court_cleared    object
dtype: object

In [9]:
df_all.dtypes

sub_court            object
sub_court_cleared    object
sub_court_url        object
parent_court         object
parent_court_url     object
dtype: object

In [10]:
df_details.dtypes

sub_court_cleared    object
year                 object
outcome              object
name                 object
detail_url           object
date                 object
persons              object
dtype: object

In [11]:
df_details.shape

(28967, 7)

In [12]:
df2 = pd.merge(df_details, df_years, how='left', on=['sub_court_cleared', 'year'])
df2.shape

(38481, 8)

In [17]:
df3 = df2.drop_duplicates()

In [18]:
df3.shape

(24258, 8)

In [13]:
def dupli(df2):
    print(df2.shape)
    df3 = df2.drop_duplicates()
    print(df3.shape)
    return 
dupli(df_all)

(88, 5)
(88, 5)


In [14]:
dupli(df_years)

(1641, 3)
(1412, 3)


In [15]:
dupli(df_details)

(28967, 7)
(24258, 7)


In [19]:
df_fin = pd.merge(df3, df_all, how='left', on=['sub_court_cleared'])
df_fin.shape

(28967, 12)

In [20]:
print(df_fin.shape)
df5 = df_fin.drop_duplicates()
df5.shape

(28967, 12)


(28967, 12)

In [24]:
df5.head(1)

Unnamed: 0,sub_court_cleared,year,outcome,name,detail_url,date,persons,year_url,sub_court,sub_court_url,parent_court,parent_court_url
0,Final Appeal (Civil),2011,\'https://legalref.judiciary.hk/lrs/images/ThemeXP/Epage.gif\',FACV1/2011,https://legalref.judiciary.hk/lrs/common/ju/ju_frame.jsp?DIS=79800,(06/01/2012),THE CATHOLIC DIOCESE OF HONG KONG ALSO KNOWN AS THE BISHOP OF THE ROMAN CATHOLIC CHURCH IN HONG KONG INCORPORATION v. SECRETARY FOR JUSTICE,https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?EX=T&L1=FA&L2=CV&L3=2011&AR=1_8#A1_8,Final Appeal (Civil),https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?L1=FA&L2=CV&AR=1#A1,Court of Final Appeal,https://legalref.judiciary.hk/lrs/common/ju/judgment.jsp?L1=FA#H1


In [22]:
with open('fin.pickle', 'wb') as ww:
    pickle.dump(df5, ww)

In [None]:
f = open('fin.pickle', 'rb')
df = pickle.load(f)
df.shape

In [23]:
df5.to_csv('final.csv', index=False, encoding='utf-8')


#### Comments： 主要思路是re+bs4，对每一层提取url，并注意对应上主体。

#### Comments: 1）时间不太够，我NER没有学，但愿意学习。2）不知道测试要求到什么结果， 我也可以进一步清理（转换类型， 提取判决书）我想应该很快。

#### 多谢评阅。