In [1]:
# Broad Categories/initial narrowing questions
# V2 Decision Tree w/ Updated categories

from dataclasses import dataclass
from typing import Dict, List, Union, Tuple, Callable, Optional
import json, re
import os

# Decision Tree
# A NodeRef is the target of a yes/no branch: either the id of another node
# (a str key into NODES) or a terminal ("CAT", <category name>) pair as
# produced by CAT().
NodeRef = Union[str, Tuple[str, str]]

@dataclass
class Node:
    """One yes/no question in the decision tree.

    Each answer leads to a NodeRef: either another node id or a terminal
    ("CAT", name) pair that ends the walk.
    """
    # Question text shown to the user.
    q: str
    # Branch followed when the answer is yes.
    yes: NodeRef
    # Branch followed when the answer is no.
    no: NodeRef

def CAT(name: str) -> Tuple[str, str]:
    """Build the terminal reference for the category called `name`.

    Returns the pair ("CAT", name). The leading "CAT" tag distinguishes a
    final category from a plain node id.
    """
    leaf = "CAT", name
    return leaf

# Id of the node where run_decision_tree starts by default.
ROOT_ID = "N0"
# The whole decision tree, keyed by node id. Each Node holds a question plus
# the yes/no branches, which are either another node id or a CAT(...) leaf.
# NOTE(review): leaf names are passed to export_category_subset as the lookup
# key, so they presumably must match the category names in the noun JSON
# exactly -- verify against that file.
NODES: Dict[str, Node] = {
 # Top of the tree: living vs. non-living, then fictional vs. real.
 "N0": Node("Is it a living organism or portrayed as living?", "N1", "N100"),
 "N1": Node("Is it fictional or only in stories/games?", "N2", "N10"),
 "N2": Node("Is it a Pokémon?", CAT("Pokemon"), CAT("Fictional/Character")),
 # Real (non-fictional) living things: humans, then animals, then plants.
 "N10": Node("Is it a human?", "N11", "N20"),
 "N11": Node("Is it a specific individual?", "N12", "N15"),
 "N12": Node("Known for playing in the NBA?", CAT("Athletes (NBA)"), "N13"),
 "N13": Node("Famous or historically significant?", CAT("Famous People / Historical Figures"), CAT("Person")),
 "N15": Node("Is it a job/occupation/role label?", CAT("Profession/role"), CAT("Person")),
 "N20": Node("Is it an animal?", "N21", "N28"),
 "N21": Node("Does it name a specific species or breed?", "N22", CAT("Animal")),
 "N22": Node("Is it a dog breed?", CAT("Dog Breeds"), CAT("Animals (Specific Species)")),
 "N28": Node("Is it a plant?", CAT("Plant"), "N29"),
 # N29 is reached only for living, non-fictional things that are not a
 # human, animal or plant; it re-asks the Pokémon question as a fallback.
 "N29": Node("(Catch-all) Is it a Pokémon?", CAT("Pokemon"), CAT("Other")),
 # Non-living: abstract ideas first, then brands, media and places.
 "N100": Node("Is it an abstract/non-physical idea?", "N101", "N110"),
 # Both the yes-branch of N101 and the no-branch of N102 end in
 # "Abstract Concepts"; only a yes at N102 yields "Concept/Idea".
 "N101": Node("Broad philosophical abstraction (e.g., love, justice)?", CAT("Abstract Concepts"), "N102"),
 "N102": Node("Defined domain concept/theory (e.g., blockchain)?", CAT("Concept/Idea"), CAT("Abstract Concepts")),
 "N110": Node("Is it a company or brand?", CAT("Companies / Brands"), "N120"),
 "N120": Node("Is it a media/entertainment work or category?", CAT("Media & Entertainment"), "N130"),
 "N130": Node("Is it a place or geographic location?", "N140", "N200"),
 # Places (entered only through the yes-branch of N130).
 # NOTE(review): the Weather and Rock/Mineral leaves sit in this chain, so
 # they are reachable only after answering yes to "place" -- confirm intended.
 "N140": Node("Aviation/airport related?", CAT("Aviation & Airport"), "N145"),
 "N145": Node("Weather/atmospheric phenomenon?", CAT("Weather"), "N146"),
 "N146": Node("Natural landform/feature?", CAT("Geological Formations"), "N147"),
 "N147": Node("Rock or mineral?", CAT("Rock/Mineral"), "N148"),
 "N148": Node("Designated landmark/monument?", "N149", "N150"),
 "N149": Node("Globally iconic landmark?", CAT("Landmarks / Monuments (Global)"), CAT("Landmarks / Monuments")),
 "N150": Node("Is it a city?", "N151", "N153"),
 "N151": Node("Is it in the United States?", CAT("US Cities"), CAT("Cities")),
 "N153": Node("Is it a U.S. state or territory?", CAT("US States"), CAT("Place")),
 # Activities / Food / Objects (entered through the no-branch of N130).
 "N200": Node("Is it a sport/activity or an exercise?", "N201", "N210"),
 "N201": Node("Specifically a workout/exercise movement?", CAT("Workout Exercises"), CAT("Sport/Activity")),
 "N210": Node("Is it food or drink?", "N211", "N220"),
 "N211": Node("Is it usually drunk?", CAT("Drinks"), "N212"),
 "N212": Node("Is it a fruit?", CAT("Fruits"), CAT("Food")),
 # Physical objects, narrowed from structures down to generic "Object".
 "N220": Node("Is it a physical object?", "N221", CAT("Other")),
 "N221": Node("Is it a building or structure?", CAT("Structure/building"), "N222"),
 "N222": Node("Is it a vehicle?", "N223", "N226"),
 "N223": Node("Is it a motorcycle model?", CAT("Motorcycle Models"), CAT("Vehicles")),
 "N226": Node("Powered machinery/heavy equipment?", CAT("Machinery & Vehicles"), "N227"),
 "N227": Node("Is it a tool?", CAT("Tools"), "N228"),
 "N228": Node("Technology/device/platform?", CAT("Technology"), "N229"),
 "N229": Node("Common everyday household/personal object?", CAT("Everyday Objects"), CAT("Object")),
}

def ask_yes_no(prompt: str, input_fn: Optional[Callable[[str], str]] = None) -> bool:
    """Ask a yes/no question until a recognisable answer is given.

    Args:
        prompt: Question text; " (y/n): " is appended before it is shown.
        input_fn: Callable that receives the full prompt and returns the raw
            answer. Falls back to the built-in ``input`` when None.

    Returns:
        True for "y"/"yes", False for "n"/"no". Answers are compared after
        stripping surrounding whitespace and lower-casing.
    """
    reader = input if input_fn is None else input_fn
    affirmative = ("y", "yes")
    negative = ("n", "no")
    while True:
        reply = reader(f"{prompt} (y/n): ").strip().lower()
        if reply in affirmative:
            return True
        if reply in negative:
            return False
        # Anything else: nudge the user and ask again.
        print("Please answer y/n.")

def run_decision_tree(root_id: str = ROOT_ID,
                      input_fn: Optional[Callable[[str], str]] = None) -> str:
    """Walk the NODES tree interactively and return the category reached.

    Args:
        root_id: Id of the node to start from.
        input_fn: Callable used to collect each answer; the built-in
            ``input`` is used when None.

    Returns:
        The category name stored in the ("CAT", name) leaf that was reached.

    Raises:
        ValueError: If a branch leads to an id that is not a key of NODES.
    """
    reader = input if input_fn is None else input_fn
    cur = root_id
    while True:
        question_node = NODES.get(cur)
        if question_node is None:
            raise ValueError(f"Unknown node id: {cur}")
        if ask_yes_no(question_node.q, input_fn=reader):
            branch = question_node.yes
        else:
            branch = question_node.no
        # A tuple tagged "CAT" is a leaf; anything else is the next node id.
        reached_leaf = isinstance(branch, tuple) and branch[0] == "CAT"
        if reached_leaf:
            return branch[1]
        cur = branch

# Export End Category to New JSON
def export_category_subset(category: str,
                           json_in: str = "Downloads/noun_categories_dynamic_refined_FINAL_v2.json",
                           json_out: str = "Downloads/selected_category_nouns.json") -> str:
    """Copy one category's noun list from `json_in` into a new file `json_out`.

    The input JSON is searched for `category` in these layouts, in order:
      1. a top-level list of {"category": ..., "nouns": ...} records;
      2. a top-level dict mapping the category name directly to a list;
      3. a top-level dict with an "items" list of such records;
      4. a top-level dict where any list value holds such records.

    Args:
        category: Category name to look up (exact match).
        json_in: Path of the JSON file to read.
        json_out: Path of the JSON file to write. Missing parent
            directories are created.

    Returns:
        `json_out`, after writing {"category": category, "nouns": nouns}.

    Raises:
        KeyError: If the category is not found in any supported layout.
            Nothing is written in that case.
    """
    with open(json_in, "r", encoding="utf-8") as f:
        data = json.load(f)

    nouns = _lookup_category_nouns(data, category)
    if nouns is None:
        raise KeyError(f"Category not found in JSON: {category}")

    # Create the destination folder if needed so a fresh environment does not
    # fail with FileNotFoundError on the write below.
    out_dir = os.path.dirname(json_out)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    out = {"category": category, "nouns": nouns}
    with open(json_out, "w", encoding="utf-8") as f:
        json.dump(out, f, ensure_ascii=False, indent=2)
    return json_out


def _find_in_records(records, category: str):
    """Return the "nouns" value of the first record for `category`, else None."""
    for item in records:
        if isinstance(item, dict) and item.get("category") == category:
            # May itself be None when the record has no "nouns" key.
            return item.get("nouns")
    return None


def _lookup_category_nouns(data, category: str):
    """Search the supported JSON layouts for `category`; return its nouns or None."""
    if isinstance(data, list):
        return _find_in_records(data, category)
    if not isinstance(data, dict):
        return None

    direct = data.get(category)
    if isinstance(direct, list):
        return direct

    # "items" is checked before the other list values so it takes priority.
    items = data.get("items")
    if isinstance(items, list):
        found = _find_in_records(items, category)
        if found is not None:
            return found

    for value in data.values():
        if isinstance(value, list):
            found = _find_in_records(value, category)
            if found is not None:
                return found
    return None

def classify_and_export(json_in: str = "Downloads/noun_categories_dynamic_refined_FINAL_v2.json",
                        json_out: str = "Downloads/selected_category_nouns.json",
                        input_fn: Optional[Callable[[str], str]] = None) -> str:
    """Run the decision tree, then export the chosen category's nouns.

    Args:
        json_in: Path of the category JSON to read from.
        json_out: Path of the JSON file to write the selection to.
        input_fn: Callable used to collect answers; forwarded to
            run_decision_tree unchanged.

    Returns:
        The path returned by export_category_subset.
    """
    category = run_decision_tree(input_fn=input_fn)
    path = export_category_subset(
        category,
        json_in=json_in,
        json_out=json_out,
    )
    summary = f"Category: {category}\nWrote: {path}"
    print(summary)
    return path

In [6]:
# Test: walk the decision tree interactively (answers are read via input())
# and export the chosen category using the default input/output JSON paths.
classify_and_export()

def load_selected_nouns(json_path: str = "Downloads/selected_category_nouns.json"):
    """Read a selected-category JSON file and return `(category, nouns)`.

    The file must hold an object with "category" and "nouns" keys; a missing
    key raises KeyError.
    """
    with open(json_path, "r", encoding="utf-8") as handle:
        payload = json.load(handle)
    category, nouns = payload["category"], payload["nouns"]
    return category, nouns

cat, nouns = load_selected_nouns()
# `cat` is the selected category name and `nouns` is its noun list, both read
# back from the default selected-category JSON file.

Is it a living organism or portrayed as living? (y/n):  y
Is it fictional or only in stories/games? (y/n):  y
Is it a Pokémon? (y/n):  y


Category: Pokemon
Wrote: Downloads/selected_category_nouns.json
