In [21]:
import json

In [2]:
# Parse the ontology file that sits next to the notebook; json.load accepts the
# binary file handle directly.
with open('./ontology.json', 'rb') as ontology_file:
    meta = json.load(ontology_file)

In [3]:
# Index the ontology records by their id. Each entry keeps the record's name,
# child_ids and restrictions (the very same list objects as in `meta`) and starts
# with empty 'parent_ids', 'hierarchy' and 'ancestor_id' lists.
voc = {
    record['id']: {
        'name': record['name'],
        'child_ids': record['child_ids'],
        'restrictions': record['restrictions'],
        'parent_ids': [],
        'hierarchy': [],
        'ancestor_id': [],
    }
    for record in meta
}

print(voc['/m/09b5t']['child_ids']) # check the result randomly 

['/m/07st89h', '/m/07qn5dc']


In [4]:
# Fill in the 'parent_ids' of every category by inverting the 'child_ids' lists.
# The membership test keeps this cell idempotent: running it a second time on the
# same `voc` no longer appends every parent again.
for parent_id, entry in voc.items():
    for child_id in entry['child_ids']:
        parents = voc[child_id]['parent_ids']
        if parent_id not in parents:
            parents.append(parent_id)

print(voc['/m/05zppz']) # check a category randomly

{'name': 'Male speech, man speaking', 'child_ids': [], 'restrictions': [], 'parent_ids': ['/m/09x0r'], 'hierarchy': [], 'ancestor_id': []}


In [5]:
# Labels with more than one parent, in ontology order.
probe = [label_id for label_id, entry in voc.items() if len(entry['parent_ids']) > 1]
count = len(probe)
# Every distinct parent beyond the first adds one more occurrence of the label.
extra_labels = sum(len(set(voc[label_id]['parent_ids'])) - 1 for label_id in probe)

print(len(probe))
print("There are {} labels in the ontology. {} labels have more than one path to root, so there would be {} labels throughout the hierarchy".format(len(voc), count, len(voc) + extra_labels))
print(voc['/t/dd00135'])

38
There are 632 labels in the ontology. 38 labels have more than one path to root, so there would be 677 labels throughout the hierarchy
{'name': 'Children shouting', 'child_ids': [], 'restrictions': [], 'parent_ids': ['/m/07p6fty', '/t/dd00012'], 'hierarchy': [], 'ancestor_id': []}


In [6]:
def make_hierarchy(_voc, n_iter=9, verbose=True):
    """Annotate every category with its level(s) and root ancestor(s).

    Level 1 holds the categories whose 'parent_ids' list is empty. A category
    belongs to level k (k > 1) when at least one of its parents belongs to
    level k - 1, so a category reachable over paths of different length ends
    up on several levels.

    Args:
        _voc: dict mapping id -> entry with 'parent_ids', 'hierarchy' and
            'ancestor_id' lists. The entries are modified in place.
        n_iter: highest level that is explored.
        verbose: when true, print the number of ids found on each level.

    Returns:
        (hierarchy, _voc): `hierarchy` maps level -> list of ids on that level;
        `_voc` is the same object that was passed in.
    """
    def _add_once(bucket, item):
        # Append `item` unless the list already holds it.
        if item not in bucket:
            bucket.append(item)

    depth = 1

    # Roots: no parent at all. Each root is recorded as its own ancestor.
    roots = []
    for node_id, node in _voc.items():
        if node['parent_ids'] == []:
            _add_once(node['hierarchy'], depth)
            _add_once(node['ancestor_id'], node_id)
            roots.append(node_id)

    hierarchy = {depth: roots}

    # Walk down one level per pass until a pass finds nothing or n_iter is hit.
    while depth < n_iter:
        depth += 1
        previous = hierarchy[depth - 1]
        members = []
        for node_id, node in _voc.items():
            for parent_id in node['parent_ids']:
                # Inherit whatever ancestors the parent has collected so far.
                for root_id in _voc[parent_id]['ancestor_id']:
                    _add_once(node['ancestor_id'], root_id)

                if parent_id in previous:
                    _add_once(node['hierarchy'], depth)
                    members.append(node_id)
                    # One matching parent is enough; stopping here keeps the id
                    # from being listed twice on the same level. Parents after
                    # this one are not visited during this pass.
                    break

        if members == []:
            break
        hierarchy[depth] = members

    if verbose:
        for depth, members in hierarchy.items():
            print("N_num in level {} of the ontology is {}".format(depth, len(members)))

    return hierarchy, _voc


# Build the level index for the whole ontology. make_hierarchy annotates the
# entries of `voc` in place and returns that same dict, so rebinding `voc` here
# does not create a copy.
as_hierarchy, voc = make_hierarchy(voc)

N_num in level 1 of the ontology is 7
N_num in level 2 of the ontology is 43
N_num in level 3 of the ontology is 306
N_num in level 4 of the ontology is 240
N_num in level 5 of the ontology is 66
N_num in level 6 of the ontology is 5


In [7]:
def check_multipath(hierarchy):
    """Report the ids that appear on more than one level of a hierarchy.

    Every pair of distinct levels is compared, including the deepest level.
    The previous bound, range(lvl + 1, len(hierarchy)), stopped one short for
    1-based level keys, so overlaps with the last level were never reported.

    Args:
        hierarchy: dict mapping a level key to the list of ids on that level.

    Returns:
        dict mapping '<lower>&<higher>' to the set of ids shared by the two
        levels; pairs that share nothing are left out.

    Raises:
        AssertionError: if one level lists the same id more than once.
    """
    total = 0  # shared ids summed over all level pairs
    intersection = {}
    levels = sorted(hierarchy)
    members = {lvl: set(hierarchy[lvl]) for lvl in levels}

    for position, lvl in enumerate(levels):
        # check there is no duplication in the same level
        assert len(hierarchy[lvl]) == len(members[lvl]), \
            'duplicate ids in level {}'.format(lvl)

        # compare against every deeper level, the last one included
        for other in levels[position + 1:]:
            shared = members[lvl] & members[other]
            total += len(shared)
            if shared:
                intersection['{}&{}'.format(lvl, other)] = shared
                print('The intersection between {} and {} is {} (n={})'.format(lvl, other, shared, len(shared)))

    print(total)
    return intersection
    
    
# Ids of the full ontology that sit on more than one level, keyed by the
# '<level>&<level>' pair in which they were found.
as_intersection = check_multipath(as_hierarchy)

The intersection between 2 and 3 is {'/m/0395lw'} (n=1)
The intersection between 3 and 4 is {'/m/07n_g', '/m/0f8s22', '/g/11b630rrvh', '/m/03w41f', '/m/07rqsjt', '/m/07rjwbb', '/m/07qn4z3', '/t/dd00135', '/m/0l14jd', '/m/07qcpgn', '/m/0239kh', '/m/027m70_', '/m/07r4wb8', '/m/0gy1t2s', '/m/03wwcy', '/m/07qnq_y', '/m/0150b9'} (n=17)
The intersection between 3 and 5 is {'/m/0912c9', '/m/02mfyn', '/m/04229', '/m/05x_td', '/t/dd00092', '/m/0gy1t2s'} (n=6)
The intersection between 4 and 5 is {'/m/012ndj', '/m/07pggtn', '/m/026fgl', '/m/08j51y', '/m/07rrh0c', '/m/07r67yg', '/m/07pjwq1', '/t/dd00108', '/m/0gy1t2s', '/m/04qvtq', '/m/012n7d'} (n=11)
35


In [8]:
import os
import pandas as pd

# define WORKPLACE path
# Directory, relative to the notebook, that holds the collection CSV files.
collection_dir = './collection'

# File names inside collection_dir. The two vocabulary files are read in the
# following cells; dev_csv / eval_csv are not used in the cells shown here.
dev_csv = 'collection_dev.csv'
eval_csv = 'collection_eval.csv'
dev_voc_csv = 'vocabulary_collection_dev.csv'
eval_voc_csv = 'vocabulary_collection_eval.csv'

In [9]:
# Read the dev vocabulary. The column names are supplied here, so the first row
# of the file is parsed as data, and every column is kept as a plain Python list.
vocab_columns = ['index', 'label', 'id']
with open(os.path.join(collection_dir, dev_voc_csv), 'r') as f1:
    meta1 = pd.read_csv(f1, names=vocab_columns)

collection_info = {column: meta1[column].values.tolist() for column in vocab_columns}

print(collection_info['id'])
print(len(collection_info['index']))

['/m/07q2z82', '/m/0mkg', '/m/042v_gx', '/m/025wky1', '/m/05x_td', '/m/0k5j', '/m/014yck', '/m/07pp_mv', '/m/046dlr', '/m/02pprs', '/m/012n7d', '/m/0jbk', '/m/028ght', '/m/0dl83', '/m/0_1c', '/m/0261r1', '/t/dd00002', '/t/dd00001', '/m/05tny_', '/m/0bm02', '/m/018vs', '/m/01c3q', '/m/03dnzn', '/m/04gy_2', '/m/01h3n', '/m/0395lw', '/m/0199g', '/m/0gy1t2s', '/m/015p6', '/m/05_wcq', '/m/020bb7', '/m/07q0h5t', '/m/02pjr4', '/m/019jd', '/m/0dv3j', '/m/07qqyl4', '/m/0l14_3', '/m/01kcd', '/m/0lyf6', '/m/03q5_w', '/m/07qsvvw', '/m/01bjv', '/m/01z47d', '/m/07pjwq1', '/m/030rvx', '/m/0dv5r', '/m/073cg4', '/m/0k4j', '/m/02mfyn', '/t/dd00134', '/m/024dl', '/m/01yrx', '/m/01xq0k1', '/m/07r5c2p', '/m/01xqw', '/m/01sb50', '/m/01j4z9', '/m/02bk07', '/m/07rkbfh', '/m/053hz1', '/m/03cczk', '/m/09b5t', '/t/dd00005', '/m/0ytgt', '/t/dd00013', '/t/dd00135', '/m/0f8s22', '/m/07q7njn', '/m/02021', '/m/07pggtn', '/m/0l14jd', '/m/07pczhz', '/m/07pn_8q', '/m/07rgt08', '/m/03w41f', '/m/0dgbq', '/m/0l15bq', '/m/0

In [10]:
# Extend the dev vocabulary with the ids that only occur in the eval vocabulary.
# New entries are numbered onwards from the number of dev entries.
index = len(collection_info['index'])

with open(os.path.join(collection_dir, eval_voc_csv), 'r') as f2:
    meta2 = pd.read_csv(f2, names=['index', 'label', 'id'])

for position, eval_id in enumerate(meta2['id']):
    if eval_id not in collection_info['id']:
        collection_info['index'].append(index)
        collection_info['label'].append(meta2['label'].iloc[position])
        collection_info['id'].append(meta2['id'].iloc[position])
        index += 1

# print(collection_info['index'])
print(len(collection_info['id']))

378


In [11]:
# Restrict the ontology to the labels of the collection. The values are the
# very same entry dicts that live in `voc`; nothing is copied.
collection_voc = {label_id: voc[label_id] for label_id in collection_info['id']}

# print(collection_voc)
print(len(collection_voc))

378


In [12]:
# collection_hierarchy, _ = make_hierarchy(collection_voc)

In [13]:
def show_hierarchy(voc, n_iter=9):
    """Group ids by the levels already stored in their 'hierarchy' lists.

    Levels are visited from 1 upwards and the scan stops at the first level
    without any member (or after `n_iter` levels). The per-level counts and
    their sum are printed; an id listed on several levels is counted once per
    level.

    Args:
        voc: dict mapping id -> entry that has a 'hierarchy' list of levels.
        n_iter: highest level that is inspected.

    Returns:
        dict mapping level -> list of ids on that level.
    """
    hierarchy = {}
    depth = 0
    while depth < n_iter:
        depth += 1
        members = [label_id for label_id, entry in voc.items() if depth in entry['hierarchy']]
        if members == []:
            break
        hierarchy[depth] = members

    # show some results
    for depth, members in hierarchy.items():
        print("N_num in level {} of the ontology is {}".format(depth, len(members)))

    total = sum(len(members) for members in hierarchy.values())
    print("number of total categories is {}".format(total))
    return hierarchy


# Group the collection's labels by level. This reads the 'hierarchy' lists that
# make_hierarchy(voc) stored on the entries, which collection_voc shares with voc.
collection_hierarchy = show_hierarchy(collection_voc)

N_num in level 1 of the ontology is 4
N_num in level 2 of the ontology is 24
N_num in level 3 of the ontology is 168
N_num in level 4 of the ontology is 157
N_num in level 5 of the ontology is 52
N_num in level 6 of the ontology is 4
number of total categories is 409


In [14]:
# Same overlap check for the collection subset; the returned dict is this cell's
# displayed value.
check_multipath(collection_hierarchy)

The intersection between 2 and 3 is {'/m/0395lw'} (n=1)
The intersection between 3 and 4 is {'/m/0f8s22', '/m/03w41f', '/m/07rqsjt', '/m/07rjwbb', '/t/dd00135', '/m/0l14jd', '/m/07qcpgn', '/m/0239kh', '/m/027m70_', '/m/07r4wb8', '/m/0gy1t2s', '/m/03wwcy', '/m/07qnq_y', '/m/07qn4z3'} (n=14)
The intersection between 3 and 5 is {'/m/0912c9', '/m/02mfyn', '/m/05x_td', '/t/dd00092', '/m/0gy1t2s'} (n=5)
The intersection between 4 and 5 is {'/m/012ndj', '/m/07pggtn', '/m/026fgl', '/m/08j51y', '/m/07rrh0c', '/m/07r67yg', '/m/07pjwq1', '/t/dd00108', '/m/0gy1t2s', '/m/04qvtq', '/m/012n7d'} (n=11)
31


{'2&3': {'/m/0395lw'},
 '3&4': {'/m/0239kh',
  '/m/027m70_',
  '/m/03w41f',
  '/m/03wwcy',
  '/m/07qcpgn',
  '/m/07qn4z3',
  '/m/07qnq_y',
  '/m/07r4wb8',
  '/m/07rjwbb',
  '/m/07rqsjt',
  '/m/0f8s22',
  '/m/0gy1t2s',
  '/m/0l14jd',
  '/t/dd00135'},
 '3&5': {'/m/02mfyn', '/m/05x_td', '/m/0912c9', '/m/0gy1t2s', '/t/dd00092'},
 '4&5': {'/m/012n7d',
  '/m/012ndj',
  '/m/026fgl',
  '/m/04qvtq',
  '/m/07pggtn',
  '/m/07pjwq1',
  '/m/07r67yg',
  '/m/07rrh0c',
  '/m/08j51y',
  '/m/0gy1t2s',
  '/t/dd00108'}}

In [15]:
def id2label(hierarchy, vocabulary=None):
    """Translate the ids of every group into their human-readable names.

    Args:
        hierarchy: dict mapping any key (a level, or an 'a&b' level pair) to an
            iterable of ids.
        vocabulary: dict mapping id -> entry with a 'name' field. When omitted,
            the notebook-level `voc` is used, which is what the function always
            did before this parameter existed.

    Returns:
        dict with the same keys as `hierarchy`; each value is the list of names
        of that group's ids, in the group's iteration order.
    """
    lookup = voc if vocabulary is None else vocabulary
    return {
        group: [lookup[cls]['name'] for cls in ids]
        for group, ids in hierarchy.items()
    }

# NOTE: despite the `_id` suffix this dict holds the names of the collection's
# labels per level, not their ids.
collection_hierarchy_id = id2label(collection_hierarchy)
for level, names_on_level in collection_hierarchy_id.items():
    print('level {} contains {}'.format(level,  names_on_level))

level 1 contains ['Animal', 'Music', 'Human sounds', 'Natural sounds']
level 2 contains ['Alarm', 'Bell', 'Engine', 'Explosion', 'Fire', 'Glass', 'Hands', 'Heart sounds, heartbeat', 'Human voice', 'Liquid', 'Mechanisms', 'Thunderstorm', 'Tools', 'Vehicle', 'Water', 'Wind', 'Wood', 'Digestive', 'Domestic animals, pets', 'Domestic sounds, home sounds', 'Musical instrument', 'Respiratory sounds', 'Whistling', 'Wild animals']
level 3 contains ['Accelerating, revving, vroom', 'Accordion', 'Air conditioning', 'Air horn, truck horn', 'Aircraft', 'Alarm clock', 'Applause', 'Arrow', 'Bathtub (filling or washing)', 'Bell', 'Bicycle bell', 'Bird', 'Blender', 'Boat, Water vehicle', 'Boiling', 'Boom', 'Bowed string instrument', 'Brass instrument', 'Breathing', 'Burping, eructation', 'Burst, pop', 'Buzzer', 'Camera', 'Car alarm', 'Cash register', 'Cat', 'Cattle, bovinae', 'Chatter', 'Cheering', 'Chewing, mastication', 'Children playing', 'Children shouting', 'Chime', 'Chink, clink', 'Choir', 'Chop',

In [16]:
# Keep the overlaps between levels of the collection hierarchy, then show them
# with every id replaced by its name.
collection_structure = check_multipath(collection_hierarchy)
id2label(collection_structure)

The intersection between 2 and 3 is {'/m/0395lw'} (n=1)
The intersection between 3 and 4 is {'/m/0f8s22', '/m/03w41f', '/m/07rqsjt', '/m/07rjwbb', '/t/dd00135', '/m/0l14jd', '/m/07qcpgn', '/m/0239kh', '/m/027m70_', '/m/07r4wb8', '/m/0gy1t2s', '/m/03wwcy', '/m/07qnq_y', '/m/07qn4z3'} (n=14)
The intersection between 3 and 5 is {'/m/0912c9', '/m/02mfyn', '/m/05x_td', '/t/dd00092', '/m/0gy1t2s'} (n=5)
The intersection between 4 and 5 is {'/m/012ndj', '/m/07pggtn', '/m/026fgl', '/m/08j51y', '/m/07rrh0c', '/m/07r67yg', '/m/07pjwq1', '/t/dd00108', '/m/0gy1t2s', '/m/04qvtq', '/m/012n7d'} (n=11)
31


{'2&3': ['Bell'],
 '3&4': ['Chime',
  'Church bell',
  'Whoosh, swoosh, swish',
  'Hiss',
  'Children shouting',
  'Choir',
  'Tap',
  'Cowbell',
  'Jingle bell',
  'Knock',
  'Bicycle bell',
  'Doorbell',
  'Thump, thud',
  'Rattle'],
 '3&5': ['Vehicle horn, car horn, honking',
  'Car alarm',
  'Air horn, truck horn',
  'Wind noise (microphone)',
  'Bicycle bell'],
 '4&5': ['Fire engine, fire truck (siren)',
  'Chirp, tweet',
  'Wind chime',
  "Dental drill, dentist's drill",
  'Thunk',
  'Ding-dong',
  'Buzz',
  'Clunk',
  'Bicycle bell',
  'Police car (siren)',
  'Ambulance (siren)']}

In [17]:
# Names of the labels that sit on several levels of the full ontology.
id2label(as_intersection)

{'2&3': ['Bell'],
 '3&4': ['Tuning fork',
  'Chime',
  'Kettle whistle',
  'Church bell',
  'Whoosh, swoosh, swish',
  'Hiss',
  'Rattle',
  'Children shouting',
  'Choir',
  'Tap',
  'Cowbell',
  'Jingle bell',
  'Knock',
  'Bicycle bell',
  'Doorbell',
  'Thump, thud',
  'Change ringing (campanology)'],
 '3&5': ['Vehicle horn, car horn, honking',
  'Car alarm',
  'Jet engine',
  'Air horn, truck horn',
  'Wind noise (microphone)',
  'Bicycle bell'],
 '4&5': ['Fire engine, fire truck (siren)',
  'Chirp, tweet',
  'Wind chime',
  "Dental drill, dentist's drill",
  'Thunk',
  'Ding-dong',
  'Buzz',
  'Clunk',
  'Bicycle bell',
  'Police car (siren)',
  'Ambulance (siren)']}

In [18]:
# Names of all ontology labels, grouped by level.
as_hierarchy_labels = id2label(as_hierarchy)

for level, names_on_level in as_hierarchy_labels.items():
    print('level {} contains {}'.format(level, names_on_level))

level 1 contains ['Human sounds', 'Animal', 'Music', 'Natural sounds', 'Sounds of things', 'Source-ambiguous sounds', 'Channel, environment and background']
level 2 contains ['Human voice', 'Whistling', 'Respiratory sounds', 'Human locomotion', 'Digestive', 'Hands', 'Heart sounds, heartbeat', 'Otoacoustic emission', 'Human group actions', 'Domestic animals, pets', 'Livestock, farm animals, working animals', 'Wild animals', 'Musical instrument', 'Bell', 'Music genre', 'Musical concepts', 'Music role', 'Music mood', 'Wind', 'Thunderstorm', 'Water', 'Fire', 'Vehicle', 'Engine', 'Domestic sounds, home sounds', 'Alarm', 'Mechanisms', 'Tools', 'Explosion', 'Wood', 'Glass', 'Liquid', 'Miscellaneous sources', 'Specific impact sounds', 'Generic impact sounds', 'Surface contact', 'Deformable shell', 'Onomatopoeia', 'Silence', 'Other sourceless', 'Acoustic environment', 'Noise', 'Sound reproduction']
level 3 contains ['Speech', 'Shout', 'Children shouting', 'Screaming', 'Whispering', 'Laughter', 

In [19]:
# For every level-1 category present in the collection, list the collection ids
# whose 'ancestor_id' contains it.
ancestor_branches = {
    root_id: [
        label_id
        for label_id, entry in collection_voc.items()
        if root_id in entry['ancestor_id']
    ]
    for root_id in collection_hierarchy[1]
}

# print(ancestor_branches)
for root_id, branch in ancestor_branches.items():
    print('{} contains {} classes'.format(collection_voc[root_id]['name'], len(branch)))

Animal contains 60 classes
Music contains 76 classes
Human sounds contains 69 classes
Natural sounds contains 21 classes


In [20]:
# Dump the whole collection vocabulary (every entry with all of its fields) for
# manual inspection.
print(collection_voc)

{'/m/07q2z82': {'name': 'Accelerating, revving, vroom', 'child_ids': [], 'restrictions': [], 'parent_ids': ['/m/02mk9'], 'hierarchy': [3], 'ancestor_id': ['/t/dd00041']}, '/m/0mkg': {'name': 'Accordion', 'child_ids': [], 'restrictions': [], 'parent_ids': ['/m/04szw'], 'hierarchy': [3], 'ancestor_id': ['/m/04rlf']}, '/m/042v_gx': {'name': 'Acoustic guitar', 'child_ids': [], 'restrictions': [], 'parent_ids': ['/m/0342h'], 'hierarchy': [5], 'ancestor_id': ['/m/04rlf']}, '/m/025wky1': {'name': 'Air conditioning', 'child_ids': [], 'restrictions': [], 'parent_ids': ['/t/dd00077'], 'hierarchy': [3], 'ancestor_id': ['/t/dd00041']}, '/m/05x_td': {'name': 'Air horn, truck horn', 'child_ids': [], 'restrictions': [], 'parent_ids': ['/m/07r04', '/m/07pp_mv'], 'hierarchy': [3, 5], 'ancestor_id': ['/t/dd00041']}, '/m/0k5j': {'name': 'Aircraft', 'child_ids': ['/m/014yck', '/m/09ct_', '/m/0cmf2'], 'restrictions': [], 'parent_ids': ['/m/07yv9'], 'hierarchy': [3], 'ancestor_id': ['/t/dd00041']}, '/m/014y