In [1]:
from datetime import datetime, timezone
from enum import StrEnum
from pathlib import Path
import tempfile

import lxml.html
import pandas as pd
import requests

In [2]:
# Constants
# MediaWiki API call (action=parse, format=json) for Bulbapedia's
# "List of Pokémon Trading Card Game expansions" page.
REQUEST_URL = "https://bulbapedia.bulbagarden.net/w/api.php?action=parse&format=json&page=List_of_Pok%C3%A9mon_Trading_Card_Game_expansions"
# File name for the response body as downloaded (inside the temp folder).
RAW_HTML = "download.txt"
# File name for the body after the SANITISATION_MAP replacements were applied.
PREPROCESSED_HTML = "sanitised.txt"

class ColumnNames(StrEnum):
    """Labels of the two table columns that this notebook keeps."""
    # Label substituted for the "Name of Expansion" table header.
    NAME = "full_name"
    # Label substituted for the "Set abb." table header.
    SET = "set_abb"

# Literal text substitutions applied to the raw response before table parsing.
# Keys are matched verbatim, so the escaped forms below contain real backslashes.
SANITISATION_MAP = {
    # Backslash-escaped quotes around the colspan value -> unquoted attribute.
    'colspan=\\"2\\"': "colspan=2",
    # Literal backslash-n sequences (not real newlines) are removed.
    "\\n": "",
    # Rename the table headers to identifier-friendly column labels.
    "Name of Expansion": ColumnNames.NAME,
    "Set abb.": ColumnNames.SET,
}

# Literal text `\u2014` (the backslash-escaped form of an em dash). Rows whose
# set abbreviation equals this are dropped; presumably these are expansions
# without an abbreviation -- confirm against the source page.
FILTER = "\\u2014"

In [3]:
# Best-effort fetch: failures are reported on stdout instead of being raised.
# TODO: replace with proper error handling in the final implementation.
def fetch_contents(uri):
    """Issue a GET request for ``uri`` and return the response on success.

    The request uses a 5-second timeout, and HTTP error statuses count as
    failures (``raise_for_status``). Any ``requests`` exception is printed and
    swallowed, in which case the function returns ``None``.
    """
    try:
        reply = requests.get(uri, timeout=5)
        reply.raise_for_status()
    except requests.exceptions.RequestException as failure:
        # HTTPError, ConnectionError and Timeout all derive from
        # RequestException, so a single handler covers every case.
        print(failure)
        return None
    return reply

In [4]:
# Fetch the expansion list page. Nothing is written to disk in this cell.
# `response` is None when the request failed, because fetch_contents prints
# the error instead of raising it.
response = fetch_contents(REQUEST_URL)

In [5]:
# helper functions for text pre-processing
def save_raw_html(file_path, http_response=None):
    """Stream an HTTP response body to ``file_path`` in binary mode.

    Args:
        file_path: Destination path; the file is created or overwritten.
        http_response: Object exposing ``iter_content(chunk_size=...)``.
            When omitted, the notebook-level ``response`` variable is used,
            which keeps existing one-argument calls working.

    Raises:
        ValueError: If no response is available, e.g. because the fetch
            failed and left ``response`` set to ``None``.
    """
    if http_response is None:
        # Look the global up defensively so a missing or None `response`
        # yields the clear error below rather than a NameError/AttributeError.
        http_response = globals().get("response")
    if http_response is None:
        raise ValueError("No HTTP response available to save; did the fetch fail?")

    with open(file_path, "wb") as fd:
        for chunk in http_response.iter_content(chunk_size=128):
            fd.write(chunk)

def preprocess_html(file_path):
    """Read the first line of ``file_path`` and apply ``SANITISATION_MAP``.

    Every key of ``SANITISATION_MAP`` found in the line is replaced by its
    value, in the mapping's iteration order.

    Args:
        file_path: Path of the downloaded text file.

    Returns:
        The sanitised line. An empty string is returned (after printing a
        warning) when the file is empty.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    # NOTE(review): only the first line is read -- presumably the API response
    # is a single line; confirm before reusing this on other inputs.
    with open(file_path, "r", encoding="utf-8") as fd:
        line = fd.readline()

    if not line:
        print("UNABLE TO PARSE RAW HTML")

    sanitised = line
    for target, replacement in SANITISATION_MAP.items():
        sanitised = sanitised.replace(target, replacement)
    return sanitised

def write_to_file(file_contents, file_path):
    """Write ``file_contents`` to ``file_path`` as UTF-8 text.

    Args:
        file_contents: Text to write. A falsy value (``None`` or ``""``)
            prints a warning and produces an empty file.
        file_path: Destination path; the file is created or overwritten.
    """
    if not file_contents:
        print("NO CONTENTS WRITTEN TO FILE")
        # Normalise to "" so that None does not raise a TypeError in write()
        # after the file has already been truncated.
        file_contents = ""
    # Explicit encoding keeps the output independent of the platform locale.
    with open(file_path, "w", encoding="utf-8") as fd:
        fd.write(file_contents)

In [12]:
# helper functions for table manipulation
def extract_relevant_tables(file_path):
    """Parse the HTML tables in ``file_path`` and keep those that have a
    ``ColumnNames.SET`` column.

    Returns:
        A list of DataFrames, in the order ``pd.read_html`` produced them.
    """
    relevant = []
    for candidate in pd.read_html(file_path):
        if ColumnNames.SET in candidate.columns:
            relevant.append(candidate)
    return relevant

def extract_relevant_columns(list_of_tables):
    """Reduce each table to its name and set-abbreviation columns.

    Returns:
        A new list with one two-column DataFrame per input table.
    """
    wanted = [ColumnNames.NAME, ColumnNames.SET]
    trimmed = []
    for frame in list_of_tables:
        trimmed.append(frame[wanted])
    return trimmed

def postprocess_table(table):
    """Drop the rows whose set abbreviation equals ``FILTER``.

    The surviving rows keep their original index labels.
    """
    keep_mask = table[ColumnNames.SET].ne(FILTER)
    return table[keep_mask]

def manipulate_as_tables(list_of_tables):
    """Merge the relevant columns of all tables into one filtered table.

    Each table is trimmed with ``extract_relevant_columns``, the results are
    concatenated under a fresh 0..n-1 index, and ``postprocess_table`` is
    applied to the merged frame.
    """
    trimmed_tables = extract_relevant_columns(list_of_tables)
    combined = pd.concat(trimmed_tables, ignore_index=True)
    return postprocess_table(combined)

In [13]:
def generate_timestamp():
    """Return the current UTC time as text, e.g.
    ``Saturday, 11. November 2023 07:07PM``.
    """
    pattern = "%A, %d. %B %Y %I:%M%p"
    utc_now = datetime.now(timezone.utc)
    return utc_now.strftime(pattern)

In [14]:
# consolidate IO actions for temp folder
def perform_initial_processing():
    """Download, sanitise and parse the page inside a throwaway folder.

    A temporary directory prefixed ``temp_`` is created in the current working
    directory. The raw response is saved there, sanitised into a second file,
    and that file is parsed into tables.

    Returns:
        The list of tables produced by ``extract_relevant_tables``.
    """
    # create temp folder - to replace `cwd` in final version with `out`
    working_dir = Path.cwd()
    with tempfile.TemporaryDirectory(prefix="temp_", dir=working_dir) as scratch_name:
        scratch_dir = working_dir / scratch_name
        raw_path = scratch_dir / RAW_HTML
        sanitised_path = scratch_dir / PREPROCESSED_HTML

        # download and pre-process
        save_raw_html(raw_path)
        write_to_file(preprocess_html(raw_path), sanitised_path)

        # Parse before leaving the `with` block: the folder and both files
        # are removed as soon as the context manager exits.
        return extract_relevant_tables(sanitised_path)

In [23]:
def format_as_lines(table):
    """Render the table as the lines of a Dart ``Set<String>`` declaration.

    Args:
        table: DataFrame with ``set_abb`` and ``full_name`` columns; one
            output line is emitted per row, in row order.

    Returns:
        A list of newline-terminated strings: a two-line generated-by header,
        the opening of the set literal, one quoted abbreviation per row (with
        the full name as a trailing comment), and the closing ``};``.

    Raises:
        ValueError: If ``table`` is ``None``.
    """
    if table is None:
        # Fail fast with a clear message instead of an AttributeError from
        # calling ``itertuples`` on None further down.
        raise ValueError("INVALID TABLE")

    string_buffer = [
        f"// Generated using Forecast on {generate_timestamp()} UTC\n",
        "//     See: https://github.com/KOOKIIEStudios/Forecast\n",
        "const Set<String> setAbbreviations = {\n",
    ]
    string_buffer.extend(
        f'  "{row.set_abb}",  // {row.full_name}\n'
        for row in table.itertuples(index=False)
    )
    string_buffer.append("};\n")
    return string_buffer

In [24]:
# Run the pipeline: download and parse the tables, merge and filter them,
# then render the result as output lines.
name_and_set_mapping = manipulate_as_tables(perform_initial_processing())
lines = format_as_lines(name_and_set_mapping)

In [25]:
# Every entry in `lines` already ends with a newline, so join without a separator.
print("".join(lines))

// Generated using Forecast on Saturday, 11. November 2023 07:07PM UTC
//     See: https://github.com/KOOKIIEStudios/Forecast
const Set<String> setAbbreviations = {
  "BS",  // Base Set
  "JU",  // Jungle
  "FO",  // Fossil
  "B2",  // Base Set 2
  "TR",  // Team Rocket
  "G1",  // Gym Heroes
  "G2",  // Gym Challenge
  "N1",  // Neo Genesis
  "N2",  // Neo Discovery
  "N3",  // Neo Revelation
  "N4",  // Neo Destiny
  "LC",  // Legendary Collection
  "EX",  // Expedition Base Set
  "AQ",  // Aquapolis
  "SK",  // Skyridge
  "RS",  // EX Ruby & Sapphire
  "SS",  // EX Sandstorm
  "DR",  // EX Dragon
  "MA",  // EX Team Magma vs Team Aqua
  "HL",  // EX Hidden Legends
  "FL",  // EX FireRed & LeafGreen
  "TRR",  // EX Team Rocket Returns
  "DX",  // EX Deoxys
  "EM",  // EX Emerald
  "UF",  // EX Unseen Forces
  "DS",  // EX Delta Species
  "LM",  // EX Legend Maker
  "HP",  // EX Holon Phantoms
  "CG",  // EX Crystal Guardians
  "DF",  // EX Dragon Frontiers
  "PK",  // EX Power Keeper