### PROOF OF CONCEPT - Dragon Tiger Billboard of Compulsory Testing Notice (CTN)
---
Author: Jack Chan

Last Update: 2022/08/27

### Download CTN from Centre for Health Protection (CHP)
---

In [1]:
import os
import requests
from datetime import datetime
from dateutil.relativedelta import relativedelta

In [2]:
# download window: from the fixed first date up to and including yesterday
date_start = datetime.strptime("20220114", "%Y%m%d")
date_end = datetime.today() - relativedelta(days=1)

# inclusive number of calendar days spanned by the window
outstanding_days = (date_end - date_start).days + 1

# human-readable bounds used only for the summary message
prt_date_start = date_start.strftime('%d %B %Y')
prt_date_end = date_end.strftime('%d %B %Y')
summary = (
    f"CTN will be downloaded from {prt_date_start} to "
    f"{prt_date_end}, for {outstanding_days} days in total."
)
print(summary)

CTN will be downloaded from 14 January 2022 to 26 August 2022, for 225 days in total.


In [3]:
# instantiate input date and output file name
input_date = "20220114"
file_name = f"ctn_{input_date}.pdf"

# request CTN from CHP website; the timeout keeps the cell from hanging
# indefinitely when the server does not answer
uri = f"https://www.chp.gov.hk/files/pdf/{file_name}"
response = requests.get(uri, timeout=30)

# cache CTN if the request has succeeded, and report the outcome of THIS
# request rather than inferring it from whatever already sits in the
# working directory (a stale file from an earlier run would otherwise be
# reported as freshly downloaded)
if response.status_code == 200:
    with open(file_name, "wb") as file:
        file.write(response.content)
    print(f"CTN '{file_name}' has been downloaded.")
elif os.path.exists(file_name):
    print(
        f"CTN '{file_name}' was not downloaded "
        f"(HTTP {response.status_code}); "
        "a previously cached copy is still on disk."
    )
else:
    print(
        f"CTN '{file_name}' could not be downloaded "
        f"(HTTP {response.status_code})."
    )

CTN 'ctn_20220114.pdf' has been downloaded.


### Ingest and Wrangle CTN as Data Model
---

In [4]:
import re
import tabula
import pandas as pd

In [5]:
# instantiate input file
file_name = "ctn_20220114.pdf"

# load PDF with tailor-made template
pages = tabula.read_pdf_with_template(
    file_name,
    template_path="ctn_template.json",
    lattice=True,
    pandas_options={'header': None}
)

# from every page keep only the columns in which at least one cell
# mentions `Specified place`
kept_columns = []
for page in pages:
    page_str = page.astype(str)
    dc = [page_str[col].str.contains("Specified place").any()
          for col in page_str.columns]
    kept_columns.append(page_str.loc[:, dc])

# stack all pages in a single concat instead of growing the frame inside
# the loop; the final astype(str) turns the NaN padding of columns missing
# on some pages into the string "nan", as the later cells expect.
# `pd.concat` rejects an empty list, so fall back to an empty frame.
if kept_columns:
    df = pd.concat(kept_columns, ignore_index=True).astype(str)
else:
    df = pd.DataFrame()

df.head()

Unnamed: 0,1,2,0
0,,,
1,指明地點\rSpecified place,指明地點\rSpecified place,
2,(視情況而定任何一個,[年年年年-月,
3,,期間及時段),
4,,[年年年年-月月-日日],


In [6]:
# keep only the rows in which at least one cell contains a non-ASCII
# (a.k.a. Chinese) character -- these are the relevant records
df_filled = df.fillna("na")
df_filter = pd.DataFrame({
    col: df_filled[col].str.contains(r"[^\x00-\x7F]", regex=True)
    for col in df.columns
})
df = df[df_filter.any(axis=1)]

df.head()

Unnamed: 0,1,2,0
1,指明地點\rSpecified place,指明地點\rSpecified place,
2,(視情況而定任何一個,[年年年年-月,
3,,期間及時段),
4,,[年年年年-月月-日日],
12,佐敦廟街266號地下及\r閣樓九龍咖喱屋\rKowloon Curry House &\rB...,2022-01-03 12:00-14:00,


In [7]:
# reference sub-districts for valid address: whitespace-free, upper-cased
# and de-duplicated; blank spreadsheet cells are dropped because a NaN
# cannot be used as the left operand of a substring test
subdistricts = pd.read_excel("areas_and_districts.xlsx")
subdistricts = subdistricts["subdistricts"] \
    .dropna() \
    .str.replace(r"\s", "", regex=True) \
    .str.upper() \
    .unique()

# remove cells without sub-districts
match = re.compile(r"\r|\s")


def _has_subdistrict(cell):
    """Return True when `cell` mentions any reference sub-district.

    The cell text is upper-cased and stripped of whitespace once, then
    tested against every entry of `subdistricts` by substring search.
    """
    text = match.sub("", cell.upper())
    return any(subdistrict in text for subdistrict in subdistricts)


# boolean frame with the same shape as `df`; indexing with it blanks out
# (sets to NaN) every cell that does not mention a sub-district
df_filter = pd.DataFrame({
    col: df[col].apply(_has_subdistrict) for col in df.columns
})
df = df[df_filter]

df[df.notna().any(axis=1)].head()

Unnamed: 0,1,2,0
13,"銅鑼灣軒尼詩道500號\r希慎廣場11樓美食廣場\rFood Court, 11/F, Hy...",,
25,"銅鑼灣恩平道28 號利\r園二期10樓如新生活體\r驗館\rNUSKINPlaza,10/F...",,
26,"青衣青敬路33 號青衣\r城一期 3樓 301號鋪肯\r德基\rKFC, Shop 301,...",,
27,"將軍澳唐德街 1 號將軍\r澳廣場 1 樓 1-004 號鋪\r太興\rTaiHing,Sh...",,


In [8]:
# collapse every row into one string (NaN cells are skipped), then strip
# hard-coded noise: line breaks, whitespace, "nan" and the column header
# NOTE(review): the `nan` alternative matches that substring anywhere, so
# it is also removed from ordinary text that happens to contain it
pattern = r"\r|\s|nan|Specified place"
row_text = df.apply(lambda row: "".join(row.dropna()), axis=1)
df = row_text.str.replace(pattern, "", regex=True)

df[df.str.len().ne(0)].head()

13    銅鑼灣軒尼詩道500號希慎廣場11樓美食廣場FoodCourt,11/F,HysanPlac...
25    銅鑼灣恩平道28號利園二期10樓如新生活體驗館NUSKINPlaza,10/F,LeeGar...
26    青衣青敬路33號青衣城一期3樓301號鋪肯德基KFC,Shop301,Level3,Mari...
27    將軍澳唐德街1號將軍澳廣場1樓1-004號鋪太興TaiHing,Shop1-004,1/F,...
dtype: object

In [9]:
# an entry is kept only if it contains at least one Latin letter, which
# is taken here as the sign of a full (bilingual) address component
df_filter = df.str.contains(pat=r"[A-Za-z]", regex=True)
df = df.loc[df_filter].reset_index(drop=True)

df.head()

0    銅鑼灣軒尼詩道500號希慎廣場11樓美食廣場FoodCourt,11/F,HysanPlac...
1    銅鑼灣恩平道28號利園二期10樓如新生活體驗館NUSKINPlaza,10/F,LeeGar...
2    青衣青敬路33號青衣城一期3樓301號鋪肯德基KFC,Shop301,Level3,Mari...
3    將軍澳唐德街1號將軍澳廣場1樓1-004號鋪太興TaiHing,Shop1-004,1/F,...
dtype: object

In [10]:
# (hard coded) remove unexpected non-ASCII characters that are not part of
# the Chinese address, so that the extraction below -- which keeps
# everything up to the last non-ASCII character, plus an optional closing
# parenthesis -- ends on the Chinese address used for geocode querying.
# A character class replaces the former alternation, which contained an
# empty alternative that matched at every position.
df = df.str.replace(r"[éâ’–]", "", regex=True) \
    .str.extract(r"(.*[^\x00-\x7F]\)?)")
df.columns = ["specified_place"]

df.head()

Unnamed: 0,specified_place
0,銅鑼灣軒尼詩道500號希慎廣場11樓美食廣場
1,銅鑼灣恩平道28號利園二期10樓如新生活體驗館
2,青衣青敬路33號青衣城一期3樓301號鋪肯德基
3,將軍澳唐德街1號將軍澳廣場1樓1-004號鋪太興


### Query Geocode for Specified Place
---

In [11]:
from AddressParser import Address

In [12]:
# instantiate input address
address = "銅鑼灣軒尼詩道500號希慎廣場11樓美食廣場"

# query geographic information
ad = Address(address)
try:
    result = ad.ParseAddress()
except Exception as e:
    # best effort: the parser is known to fail with
    # "'NoneType' object is not iterable" on some addresses; fall back to
    # an empty result so the following cells still run, but report the
    # failure instead of swallowing it silently
    print(
        f"Address '{address}' could not be parsed "
        f"({type(e).__name__}: {e})."
    )
    result = {
        "eng": {},
        "geo": {},
    }

In [13]:
# administrative location taken from the "eng" part of the parser result;
# every lookup yields None when the key is missing
eng_part = result["eng"]
geo_info_dict = {
    "region": eng_part.get("Region"),
    "district": eng_part.get("EngDistrict", {}).get("DcDistrict"),
}

# coordinates taken from the "geo" part of the parser result
geo_part = result["geo"]
geocode_dict = {
    "latitude": geo_part.get("Latitude"),
    "longitude": geo_part.get("Longitude"),
}

# single flat record: location fields first, coordinates after
out_dict = {**geo_info_dict, **geocode_dict}

display(out_dict)

{'region': 'HK',
 'district': 'WAN CHAI DISTRICT',
 'latitude': '22.27977',
 'longitude': '114.1838'}

In [14]:
# turn the flat record into a one-row data frame and tag it with the
# queried address so it can be joined back to the CTN
df_parse = (
    pd.DataFrame.from_dict(out_dict, orient='index')
    .transpose()
    .assign(specified_place=address)
)

df_parse.head()

Unnamed: 0,region,district,latitude,longitude,specified_place
0,HK,WAN CHAI DISTRICT,22.27977,114.1838,銅鑼灣軒尼詩道500號希慎廣場11樓美食廣場
