In [4]:
import collections
import re
import rich

import asdiv_dataset

In [5]:
# Build the dataset object. Both keyword arguments are handed straight to
# asdiv_dataset.ASDiv; their exact effect is defined in that module.
asdiv_ds = asdiv_dataset.ASDiv(
    cache_path="asdiv.xml",
    quiet=True,
)

# Pretty-print the first entry to inspect what a record looks like.
first_entry = asdiv_ds[0]
rich.print(first_entry)

In [7]:
def is_numeric(text):
    """Tell whether ``text`` is a plain decimal number.

    Whitespace and commas are removed before the check, so ``"1,234.5"`` and
    ``" 42 "`` both qualify. After that clean-up the text must consist of an
    optional leading minus sign, one or more digits, and optionally a dot
    followed by one or more digits.

    Args:
        text: The string to test.

    Returns:
        ``True`` if the cleaned-up text matches the pattern, else ``False``.
    """
    without_whitespace = re.sub(r"\s+", "", text)
    normalized = without_whitespace.replace(",", "").strip()

    matched = re.match(r"^\-?\d+(\.\d+)?$", normalized)
    return matched is not None



# Buckets of dataset entries, one per combination of failed/passed checks.
values_filtered_digit = []  # first answer token fails str.isdigit()
values_filtered_numeric = []  # first answer token fails is_numeric()
numeric_not_digit = []  # passes is_numeric() but fails str.isdigit()
digit_not_numeric = []  # passes str.isdigit() but fails is_numeric()
values_filtered_is_integer = []  # full answer fails ASDivInteger._is_integer()
is_digit_is_not_integer = []  # passes str.isdigit() but fails _is_integer()
is_digit_is_multiple_integers = []  # digit token, several ";"-parts, all pass _is_integer()
is_digit_is_not_multiple_integers = []  # digit token, fails _is_integer(), not the case above


for entry in asdiv_ds:
    full_answer = entry["answer"]
    # Only the first space-separated token is tested by the digit/numeric
    # checks; the integer check below looks at the whole answer string.
    answer = full_answer.split(" ", 1)[0]

    digit_ok = answer.isdigit()
    numeric_ok = is_numeric(answer)
    # _is_integer is a private helper of asdiv_dataset.ASDivInteger; its exact
    # criteria are not visible from this notebook.
    integer_ok = asdiv_dataset.ASDivInteger._is_integer(full_answer)

    # Buckets that do not depend on the digit check.
    if not numeric_ok:
        values_filtered_numeric.append(entry)
    if not integer_ok:
        values_filtered_is_integer.append(entry)

    # Entries whose first token is not made of digits only.
    if not digit_ok:
        values_filtered_digit.append(entry)
        if numeric_ok:
            numeric_not_digit.append(entry)
        continue

    # From here on the first token consists of digits only.
    if not numeric_ok:
        digit_not_numeric.append(entry)
    if not integer_ok:
        is_digit_is_not_integer.append(entry)

    # Check whether the answer is a ";"-separated list in which every part
    # passes the integer check.
    parts = [piece.strip() for piece in full_answer.split(";")]
    every_part_is_integer = len(parts) > 1 and all(
        asdiv_dataset.ASDivInteger._is_integer(piece) for piece in parts
    )
    if every_part_is_integer:
        is_digit_is_multiple_integers.append(entry)
    elif not integer_ok:
        is_digit_is_not_multiple_integers.append(entry)

def stats(name, entries_filtered):
    """Print the relative size of a subset and its breakdown by solution type.

    Three things are printed, in this order: the label with the subset size as
    a percentage of ``len(asdiv_ds)``, a dict mapping each "Solution-Type"
    value to its count (highest count first), and the sum of those counts.

    Args:
        name: Label printed in front of the percentage.
        entries_filtered: Sized iterable of entries; each entry must be
            indexable by the key ``"Solution-Type"``.
    """
    per_type = collections.Counter(
        item["Solution-Type"] for item in entries_filtered
    )

    share = len(entries_filtered) / len(asdiv_ds)
    print(f"{name}:  {share: 0.1%}")

    # most_common() sorts by count, descending, keeping first-seen order on ties.
    rich.print(dict(per_type.most_common()))
    rich.print(sum(per_type.values()))


# Label -> bucket, in the order the report sections are printed.
report_sections = {
    "is_not_digit": values_filtered_digit,
    "is_not_numeric": values_filtered_numeric,
    "numeric_not_digit": numeric_not_digit,
    "digit_not_numeric": digit_not_numeric,
    "is_not_integer": values_filtered_is_integer,
    "is_digit_is_not_integer": is_digit_is_not_integer,
    "is_digit_is_multiple_integers": is_digit_is_multiple_integers,
    "is_digit_is_not_multiple_integers": is_digit_is_not_multiple_integers,
}

# Print one stats section per bucket.
for name, values in report_sections.items():
    stats(name, values)


is_not_digit:   9.0%


is_not_numeric:   6.3%


numeric_not_digit:   2.7%


digit_not_numeric:   0.0%


is_not_integer:   12.2%


is_digit_is_not_integer:   3.2%


is_digit_is_multiple_integers:   3.2%


is_digit_is_not_multiple_integers:   0.0%
