In [1]:
import re

import requests
import pandas as pd

In [2]:
# List the files held by the data.gov.hk historical archive for provider "cc"
# in the "commerce-and-industry" category within the requested date window.
url = "https://api.data.gov.hk/v1/historical-archive/list-files"
params = {
    "start": "20240101",
    "end": "20240102",
    "category": "commerce-and-industry",
    "provider": "cc",
    "format": "json",
}

# `params=` lets requests build (and encode) the query string; the timeout
# prevents the cell from hanging forever, and raise_for_status() surfaces an
# HTTP error directly instead of a confusing JSON decode failure.
response = requests.get(url, params=params, timeout=30)
response.raise_for_status()
file_list = response.json()

for file in file_list["files"]:
    print(f"{file['dataset-name-en']}: {file['url']}")

Online Price Watch: https://online-price-watch.consumer.org.hk/opw/opendata/pricewatch.json


In [3]:
# List the archived versions (timestamps) of the price-watch JSON file
# available within the requested date window.
url = "https://api.data.gov.hk/v1/historical-archive/list-file-versions"
params = {
    # Passed through `params=` so the nested URL is percent-encoded properly
    # instead of being pasted raw into the query string.
    "url": "https://online-price-watch.consumer.org.hk/opw/opendata/pricewatch.json",
    "start": "20240101",
    "end": "20240102",
}

# Timeout avoids an indefinite hang; raise_for_status() fails loudly on an
# HTTP error rather than letting .json() choke on an error page.
response = requests.get(url, params=params, timeout=30)
response.raise_for_status()
file_version = response.json()

print(f"Total of {file_version['version-count']} versions: {', '.join(file_version['timestamps'])}")

Total of 2 versions: 20240101-0958, 20240102-1012


In [4]:
# Download one archived snapshot of the price-watch file and split it into
# two tables: one row per (item, supermarket) price/offer, and one row per item.

# Timestamp of the snapshot to fetch; also stamped on every price row below.
file_dt = "20240101-0958"

url = "https://api.data.gov.hk/v1/historical-archive/get-file"
params = {
    "url": "https://online-price-watch.consumer.org.hk/opw/opendata/pricewatch.json",
    "time": file_dt,
}

# Timeout avoids an indefinite hang; raise_for_status() fails loudly on an
# HTTP error rather than letting .json() choke on an error page.
response = requests.get(url, params=params, timeout=60)
response.raise_for_status()
file = response.json()

price_list, item_list = [], []
for item in file:
    # Normalise the item code to upper case, both in the item record and in
    # the price rows derived from it.
    code = item["code"] = item["code"].upper()

    # Detach the nested per-supermarket lists so `item` keeps only the
    # item-level attributes.
    prices = item.pop("prices")
    offers = item.pop("offers")

    price_dict = {price["supermarketCode"]: price for price in prices}
    offer_dict = {offer["supermarketCode"]: offer for offer in offers}

    # Union of supermarket codes in first-seen order (prices first, then any
    # offer-only codes). Unlike a set, this keeps the row order identical
    # across kernel restarts.
    smkt_codes = dict.fromkeys([*price_dict, *offer_dict])

    # One merged row per supermarket; a missing price or offer contributes
    # no fields for that supermarket.
    price_list.extend(
        {
            "code": code, "date": file_dt,
            **price_dict.get(smkt, {}), **offer_dict.get(smkt, {}),
        }
        for smkt in smkt_codes
    )
    item_list.append(item)

df_price = pd.DataFrame.from_records(price_list)
# json_normalize flattens nested dicts into dotted column names.
df_item = pd.json_normalize(item_list)

print(f"Total of {len(df_price):,} prices; {len(df_item):,} items")

Total of 4,675 prices; 1,941 items


In [5]:
# Clean both tables: drop the "zh-Hans" columns, convert the price text to a
# number, reduce the snapshot timestamp to its 8-digit date, fill missing
# promotion text, and rename columns to short snake_case names.
# NOTE: this cell rewrites df_price / df_item, so it is meant to run once
# after the download cell (re-run that cell first to repeat the cleaning).

df_price = df_price.drop(columns=df_price.filter(regex="zh-Hans").columns)
df_item = df_item.drop(columns=df_item.filter(regex="zh-Hans").columns)

# expand=False returns a Series (not a one-column DataFrame), which is what a
# single-column assignment expects. Rows with no numeric price become 0.
df_price["price"] = (
    df_price["price"]
    .str.extract(r"([\d\.]+)", expand=False)
    .astype(float)
    .fillna(0)
)
# Raw string: "\d" in a normal string literal is an invalid escape sequence.
df_price["date"] = df_price["date"].str.extract(r"(\d{8})", expand=False)
df_price["en"] = df_price["en"].fillna("No Promotion")
df_price["zh-Hant"] = df_price["zh-Hant"].fillna("No Promotion")

price_metadata = {
    "code": "sku",
    "date": "date",
    "supermarketCode": "smkt",
    "price": "price",
    "en": "promo_en",
    "zh-Hant": "promo_zh",
}
df_price = df_price.rename(columns=price_metadata)

item_metadata = {
    "code": "sku",
    "brand.en": "brand_en",
    "brand.zh-Hant": "brand_zh",
    "name.en": "name_en",
    "name.zh-Hant": "name_zh",
    "cat1Name.en": "dept_en",
    "cat1Name.zh-Hant": "dept_zh",
    "cat2Name.en": "cat_en",
    "cat2Name.zh-Hant": "cat_zh",
    "cat3Name.en": "subcat_en",
    "cat3Name.zh-Hant": "subcat_zh",
}
df_item = df_item.rename(columns=item_metadata)

print(f"Total {df_price.shape[1]} columns for price; {df_item.shape[1]} columns for item")

Total 6 columns for price; 11 columns for item


In [6]:
# Turn every distinct English promotion text into a template in which dollar
# amounts, percentages and remaining numbers are replaced by placeholders,
# then count how many two-placeholder templates match each known shape.

def ops(x):
    """Lower-case a promotion text and split it into its "/"-separated parts."""
    return str(x).lower().split("/")


promos = [ops(text) for text in df_price["promo_en"].unique()]

promo_cnt = {}
# Nested loop instead of sum(promos, []): flattens without the repeated list
# copying that makes sum() quadratic.
for parts in promos:
    for promo in parts:
        promo = re.sub(r"\s", "", promo.strip())
        # amount
        promo = re.sub(r"\$(\d+\.?\d*)", "{AMT}", promo)
        # percentage
        promo = re.sub(r"\d+%", "{PCT}", promo)
        # numeric (whatever digits are left after the two substitutions above)
        promo = re.sub(r"\d+", "{NUM}", promo)

        promo_cnt[promo] = promo_cnt.get(promo, 0) + 1


promo_pat = pd.DataFrame({
    "pat": list(promo_cnt.keys()),
    "cnt": list(promo_cnt.values()),
})
# "{AMT}" etc. are used as regexes here; "{" is matched literally because it
# does not start a valid repetition quantifier.
promo_pat["amt"] = promo_pat["pat"].str.count("{AMT}")
promo_pat["pct"] = promo_pat["pat"].str.count("{PCT}")
promo_pat["num"] = promo_pat["pat"].str.count("{NUM}")

# Keep templates with exactly two placeholders, excluding AMT+AMT and PCT+PCT.
promo_pat = promo_pat.query(
    "amt + pct + num == 2 and amt != 2 and pct != 2"
)

# Boolean masks with raw-string regexes replace the nested query strings:
# "\w" / "\s" inside a normal string literal are invalid escape sequences.
pat_col = promo_pat["pat"]
amt_col, pct_col, num_col = promo_pat["amt"], promo_pat["pct"], promo_pat["num"]

cnt = int((
    (amt_col + pct_col == 2) & (num_col != 2)
    & pat_col.str.contains(r"{AMT}.*{PCT}")
).sum())
print(f"Total of {cnt:,} promotions with AMT and PCT")

cnt = int((
    (amt_col + num_col == 2) & (num_col != 2)
    & pat_col.str.contains(r"{NUM}[\w\s]*{AMT}$")
).sum())
print(f"Total of {cnt:,} promotions with AMT and NUM")

# The two alternatives are grouped explicitly. In the original expression
# `and` bound tighter than `or`, so the "{NUM}nd" alternative was not subject
# to the pct/num guard.
cnt = int((
    (pct_col + num_col == 2) & (num_col != 2)
    & (
        pat_col.str.contains(r"{NUM}(?!nd)[\w\s]*{PCT}")
        | pat_col.str.contains(r"{NUM}nd[\w\s]*{PCT}")
    )
).sum())
print(f"Total of {cnt:,} promotions with PCT and NUM")

cnt = int((
    (num_col == 2)
    & pat_col.str.contains(r"buy{NUM}get{NUM}")
).sum())
print(f"Total of {cnt:,} promotions with 2 NUM")

Total of 0 promotions with AMT and PCT
Total of 3 promotions with AMT and NUM
Total of 4 promotions with PCT and NUM
Total of 1 promotions with 2 NUM
