Create the output files for each country for every non-zero analyte value, analyte matrix and category.\
This takes ~3 minutes (depends on the KCDB response time).

In [None]:
from __future__ import annotations

from dataclasses import dataclass
from pathlib import Path

from msl.kcdb import ChemistryBiology


@dataclass
class Result:
    """A single search-result element reduced to the two values used for counting.

    Attributes:
        key: The unique `Analyte + Category` combination of the element.
        country: The country that the element belongs to.
    """

    # The field order is part of the constructor signature: Result(key, country)
    key: str
    country: str


chem_bio = ChemistryBiology(timeout=600)

# The names of the countries, one output file is created for each
countries = [c.value for c in chem_bio.countries()]

# Everything is fetched with a single search request:
#   * When this request was run in January 2025 the result contained 6518 elements.
#     Since 6518 < 10000 (MAX_PAGE_SIZE), one page is, for now, enough.
#   * A single request is much faster than requesting each analyte + category combination.
#   * All data must be fetched first because there is no way to know the Analyte Matrix
#     values beforehand, the KCDB API (v1.0.9) does not include an endpoint to request them.
print("Requesting all chem-bio data, this will take a couple of minutes... ", end="")
reply = chem_bio.search(page_size=chem_bio.MAX_PAGE_SIZE)

# The code below only reads the data of this one reply, so the reply
# must hold everything in exactly one page
if reply.total_pages != 1:
    msg = "Received more than 1 page of elements"
    raise RuntimeError(msg)

print(f"done! Got {reply.number_of_elements} elements")

# Keep one Result per element and, in the same pass, collect the unique set of
# Analyte Value, Analyte Matrix and Category Value combinations
results: list[Result] = []
combinations: set[str] = set()
for data in reply.data:
    # Some Analytes (id=51, id=1138, id=2265 and id=2357) contain ; and \n characters
    # in the value. Trailing ones are stripped, every remaining ; becomes a comma
    # and every remaining \n becomes a space. The ; character must not survive
    # because it is used as the field separator of the key.
    value, matrix = (
        text.rstrip("\n;").replace(";", ",").replace("\n", " ")
        for text in (data.analyte_value, data.analyte_matrix)
    )
    key = f"{value};{matrix};{data.category_value}"
    results.append(Result(key=key, country=data.country_value))
    combinations.add(key)

# An ordered copy so that every output file lists the combinations in the same order
sorted_combinations = list(combinations)
sorted_combinations.sort()

# Output location: <parent of the current working directory>/input/chem-bio-data-Non-zero
# The per-country files go directly in this directory, the summary file in a sub-directory
root_path = Path().resolve().parent.joinpath("input", "chem-bio-data-Non-zero")
summary_path = root_path.joinpath("summary")

# Create both directories (and any missing parents of root_path), existing ones are kept
root_path.mkdir(parents=True, exist_ok=True)
summary_path.mkdir(exist_ok=True)

# Create an output file for each country.
# Each file has one row for every combination (including those with a count of 0).
# `summary` maps each combination to the countries that have at least one CMC for it.
summary: dict[str, list[str]] = {key: [] for key in sorted_combinations}
for country in countries:
    print(f"Saving data for {country}")
    filtered = [result for result in results if country == result.country]

    # Tally the CMCs of this country in a single pass over its results instead of
    # rescanning `filtered` once per combination. Starting every combination at 0
    # keeps the rows in `sorted_combinations` order and keeps the zero-count rows.
    # Every result key is in `sorted_combinations` since both were built from the same data.
    counts: dict[str, int] = dict.fromkeys(sorted_combinations, 0)
    for item in filtered:
        counts[item.key] += 1

    with (root_path / f"{country}-Non-zero.cmc").open("w", encoding="utf-8") as f:
        _ = f.write("Country;AnalyteValue;AnalyteMatrix;Category;Number of CMCs\n")
        for key, n in counts.items():
            if n > 0:
                summary[key].append(country)
            _ = f.write(f"{country};{key};{n}\n")

# Create the summary file: one row per combination with the number of countries
# that have at least one CMC for it, followed by those countries separated by |
summary_file = summary_path / "chem-bio-Non-zero-Summary.cmc"
with summary_file.open("w", encoding="utf-8") as f:
    _ = f.write("Analyte Value;Analyte Matrix;Category;Number of Countries;Countries\n")
    for combination, names in summary.items():
        joined = "|".join(names)
        _ = f.write(f"{combination};{len(names)};{joined}\n")