In [None]:
def init_as_taxonomy(path_to_json: str) -> dict:
    """Load the AudioSet ontology file into a dict keyed by category id.

    Every element of the JSON array must provide the keys 'id', 'name',
    'child_ids' and 'restrictions'.

    E.g.
        taxonomy = init_as_taxonomy(path_to_json)
        # taxonomy = {
        #     '<mid>': {
        #         'name': ...,          # copied from the JSON entry
        #         'child_mid': [...],   # the entry's 'child_ids'
        #         'restrictions': ...,  # copied from the JSON entry
        #         'parent_mid': [],     # created empty
        #         'ancestor_mid': [],   # created empty
        #     },
        #     ...
        # }
    """
    with open(path_to_json, 'rb') as json_file:
        categories = json.load(json_file)

    taxonomy = {}
    for category in categories:
        entry = {
            'name': category['name'],
            'child_mid': category['child_ids'],
            'restrictions': category['restrictions'],
            # Placeholders; this function leaves them empty.
            'parent_mid': [],
            'ancestor_mid': [],
        }
        taxonomy[category['id']] = entry
    return taxonomy

In [None]:
# Load the ontology from `ontology.json` in the current working directory.
as_taxonomy = init_as_taxonomy('./ontology.json')
print(as_taxonomy['/m/09b5t']['child_mid'])  # sanity check: children of one known category
print(len(as_taxonomy)) # sanity check: number of categories in the taxonomy

In [None]:
def setup_tree_structure(taxonomy: dict) -> list:
    """Fill in the parent links of a taxonomy and group its mids by depth.

    Args:
        taxonomy: mapping ``mid -> category dict``. Each category dict needs
            the keys 'child_mid' (list of child mids) and 'parent_mid'
            (list). The 'parent_mid' lists are filled in place; no other key
            is modified.

    Returns:
        The levels of the taxonomy ordered bottom-up: index 0 is the deepest
        level and the last element holds the root mids (categories without a
        parent). The root level is a list, every other level is a set. A mid
        that can be reached from a root through paths of different lengths
        appears on several levels.

    Raises:
        KeyError: if a child mid is not itself a key of ``taxonomy``.
        ValueError: if the parent/child links contain a cycle that can be
            reached from a root (the level search would never terminate).
    """
    # Record the parents of every mid. Links that are already present are
    # skipped so that calling this function again on the same taxonomy (e.g.
    # re-running a notebook cell) does not duplicate the entries.
    for mid, cat_dict in taxonomy.items():
        for child_mid in cat_dict['child_mid']:
            parents = taxonomy[child_mid]['parent_mid']
            if mid not in parents:
                parents.append(mid)

    # Top-down level 0: the categories without any parent.
    roots = [mid for mid, cat_dict in taxonomy.items()
             if not cat_dict['parent_mid']]
    inverse_tree = [roots]

    # A mid belongs to level k when at least one of its parents is on level
    # k - 1; note some mids may belong to multiple levels (multi-path to root).
    previous = set(roots)  # set copy: O(1) membership tests
    while True:
        current = {
            mid
            for mid, cat_dict in taxonomy.items()
            if any(pmid in previous for pmid in cat_dict['parent_mid'])
        }
        if not current:
            break
        # An acyclic taxonomy with n categories has at most n levels, so one
        # more non-empty level can only come from a cycle.
        if len(inverse_tree) >= len(taxonomy):
            raise ValueError("taxonomy contains a cycle reachable from a root")
        inverse_tree.append(current)
        previous = current

    # Flip the order so that the bottom level is denoted as 0.
    return inverse_tree[::-1]

In [None]:
# Build the level structure (index 0 = bottom level) and sanity-check it.
tree = setup_tree_structure(as_taxonomy)
# Sum of the level sizes; a category that sits on several levels is counted
# once per level.
l = 0
for lvl in tree:
    l += len(lvl)
print(l)
# Inspect one level; index 5 is hard-coded, so the tree needs at least 6 levels.
print(tree[5])

In [None]:
def measure_height(taxonomy: dict, tree: list) -> dict:
    """Annotate each category with the tree levels it appears on.

    For every mid in ``taxonomy`` the indices of all entries of ``tree`` that
    contain the mid are stored, as a list, under the category's 'hierarchy'
    key (an empty list when the mid is on no level). ``taxonomy`` is updated
    in place and the same object is returned.
    """
    for mid, cat_dict in taxonomy.items():
        found_levels = {
            level_idx
            for level_idx, members in enumerate(tree)
            if mid in members
        }
        cat_dict['hierarchy'] = list(found_levels)

    return taxonomy

In [None]:
# Add the 'hierarchy' key (tree levels) to every category. measure_height
# works in place and returns its argument, so `taxonomy` and `as_taxonomy`
# refer to the same dict.
taxonomy = measure_height(as_taxonomy, tree)
print(taxonomy)

In [None]:
# Working paths: directory holding the ground-truth CSV files and the file
# names inside it.
# TODO (original note: "make it ordinary one"): presumably replace the
# hard-coded relative path with a regular, configurable one - confirm intent.
collection_dir = './ground_truth'
dev_csv = 'dev.csv'
eval_csv = 'eval.csv'
voc_csv = 'vocabulary.csv'

In [None]:
def init_fsd_vocabulary(path_to_csv: str) -> dict:
    """Read the vocabulary CSV into a dict keyed by class id.

    The file is parsed without a header row; its three columns are taken as
    'index', 'label' and 'id', in that order. The result maps each id to
    ``{'index': ..., 'label': ...}``. When an id occurs more than once, the
    last row wins.
    """
    with open(path_to_csv, 'r') as csv_file:
        frame = pd.read_csv(csv_file, names=['index', 'label', 'id'])

    row_indices = frame['index'].values.tolist()
    row_labels = frame['label'].values.tolist()
    row_mids = frame['id'].values.tolist()
    return {
        mid: {'index': row_index, 'label': row_label}
        for row_index, row_label, mid in zip(row_indices, row_labels, row_mids)
    }

In [None]:
# Load the vocabulary; the resulting dict is keyed by the CSV's third ('id') column.
fsd_voc = init_fsd_vocabulary(os.path.join(collection_dir, voc_csv))

In [None]:
# Check the vocabulary against the taxonomy: print every vocabulary id that
# is missing from the taxonomy, then the number of ids that were found.
as_mid = list(taxonomy.keys())
count = 0
for mid in fsd_voc:
    if mid in as_mid:
        count += 1
        continue
    print(mid)
print(count)

In [None]:
def filter_taxonomy(ori_taxonomy: dict, vocabulary: dict) -> dict:
    """Keep only the taxonomy entries whose mid is a key of ``vocabulary``.

    Args:
        ori_taxonomy: mapping ``mid -> category dict``; it is not modified.
        vocabulary: mapping whose keys are the mids to keep.

    Returns:
        A new dict, in the iteration order of ``ori_taxonomy``, that shares
        the category dicts with ``ori_taxonomy`` (they are not copied).

    Raises:
        AssertionError: if at least one vocabulary mid is absent from
            ``ori_taxonomy``.
    """
    # Membership is tested on the dict itself (hash lookup) instead of on a
    # list copy of its keys, which cost a linear scan per taxonomy entry.
    output = {
        mid: cat_dict
        for mid, cat_dict in ori_taxonomy.items()
        if mid in vocabulary
    }
    # Every kept mid is a vocabulary key, so the sizes match exactly when the
    # whole vocabulary was found in the taxonomy.
    assert len(output) == len(vocabulary), (
        "vocabulary mids missing from taxonomy: "
        f"{[mid for mid in vocabulary if mid not in ori_taxonomy]}"
    )
    return output


In [None]:
# Restrict the taxonomy to the vocabulary's classes; filter_taxonomy asserts
# that every vocabulary id was found in the taxonomy.
fsd_taxonomy = filter_taxonomy(taxonomy, fsd_voc)

In [None]:
# Collect (and print) the vocabulary classes whose 'hierarchy' does not hold
# exactly one level.
black_list = []
for mid, cat_dict in fsd_taxonomy.items():
    if len(cat_dict['hierarchy']) == 1:
        continue
    black_list.append(mid)
    print(f"{mid}-{fsd_voc[mid]['label']}: {cat_dict['hierarchy']}")

In [None]:
def remove_class(ori_vocabulary: dict, black_list: list) -> dict:
    """Remove the classes listed in ``black_list`` from ``ori_vocabulary``.

    Args:
        ori_vocabulary: mapping keyed by mid. It is modified IN PLACE.
        black_list: iterable of mids to delete. A mid may be listed more than
            once (e.g. when the list is concatenated from several tree levels
            that share a class); it is deleted only once.

    Returns:
        ``ori_vocabulary`` itself (the same object, after the deletions).

    Raises:
        KeyError: if a listed mid is not a key of ``ori_vocabulary``.
    """
    # dict.fromkeys drops repeated mids while keeping their first-seen order;
    # deleting the same key twice would otherwise raise KeyError.
    for mid in dict.fromkeys(black_list):
        del ori_vocabulary[mid]
    return ori_vocabulary

In [None]:
# Number of categories on each level of the full tree (index 0 = bottom level).
for i, lvl in enumerate(tree):
    print(f"{i}-th level: {len(lvl)}")

In [None]:
def filter_tree(ori_tree: list, vocabulary: dict) -> list:
    """Restrict every level of a tree to the mids found in ``vocabulary``.

    Args:
        ori_tree: sequence of levels, each an iterable of mids.
        vocabulary: mapping whose keys are the mids to keep.

    Returns:
        A new list with one entry per level of ``ori_tree``, in the same
        level order. Each entry is a list of the level's mids that are also
        vocabulary keys, without duplicates; the order inside a level is
        unspecified because it comes from a set intersection. Levels without
        a match become empty lists, so level indices are preserved.
    """
    mid_set = set(vocabulary)
    return [list(set(level) & mid_set) for level in ori_tree]

In [None]:
# Keep, on every level of the full tree, only the classes of the vocabulary.
fsd_tree = filter_tree(tree, fsd_voc)

In [None]:
# Number of vocabulary classes on each tree level.
for i, lvl in enumerate(fsd_tree):
    print(f"{i}-th level: {len(lvl)}")

In [None]:
# Drop every class found on levels 1, 4 and 5 of fsd_tree (hard-coded level
# indices - presumably picked from the per-level counts printed above; confirm).
# NOTE: remove_class deletes from the dict it is given and returns that same
# dict, so `fsd_voc` shrinks too and `remain_voc` is the same object.
# NOTE(review): a class that sits on more than one of these levels occurs
# more than once in the list passed to remove_class.
remain_voc = remove_class(fsd_voc, [*fsd_tree[1], *fsd_tree[4], *fsd_tree[5]])

In [None]:
# Re-filter the full tree with the reduced vocabulary and show the level sizes.
remain_fsd_tree = filter_tree(tree, remain_voc)
for i, lvl in enumerate(remain_fsd_tree):
    print(f"{i}-th level: {len(lvl)}")

In [None]:
# Among the remaining classes, collect (and print) those whose 'hierarchy'
# does not hold exactly one level. The 'hierarchy' values are the ones
# computed earlier on the full tree; they are not recomputed here.
remain_fsd_taxonomy = filter_taxonomy(fsd_taxonomy, remain_voc)
black_list = []
for mid, cat_dict in remain_fsd_taxonomy.items():
    if len(cat_dict['hierarchy']) == 1:
        continue
    black_list.append(mid)
    print(f"{mid}-{fsd_voc[mid]['label']}: {cat_dict['hierarchy']}")

In [None]:
# Size of the vocabulary-filtered taxonomy; the removal cells change the
# vocabulary dict, not `fsd_taxonomy`.
print(len(fsd_taxonomy))

In [None]:
# Remove the black-listed classes from the remaining vocabulary (done in
# place by remove_class, which returns the same dict).
remain_voc = remove_class(remain_voc, black_list)

In [None]:
# Rebuild the filtered tree and taxonomy from the final vocabulary and show
# how many classes are left on each level.
remain_fsd_tree = filter_tree(tree, remain_voc)
for i, lvl in enumerate(remain_fsd_tree):
    print(f"{i}-th level: {len(lvl)}")
remain_fsd_taxonomy = filter_taxonomy(fsd_taxonomy, remain_voc)

In [None]:
# Final check: list any remaining class whose 'hierarchy' does not hold
# exactly one level.
black_list = []
for mid, cat_dict in remain_fsd_taxonomy.items():
    if len(cat_dict['hierarchy']) == 1:
        continue
    black_list.append(mid)
    print(f"{mid}-{fsd_voc[mid]['label']}: {cat_dict['hierarchy']}")

In [None]:
# Number of classes left in the vocabulary after all removals.
print(len(remain_voc))