In [1]:
import numpy as np
import pandas as pd

In [55]:
audio_metadata = pd.read_csv('/content/test_metadata.csv')
audio_metadata['labels'] = audio_metadata['labels'].apply(eval)
audio_metadata['root_classes'] = None
audio_metadata

Unnamed: 0,video_id,start_time_seconds,end_time_seconds,labels,root_classes
0,iZcPr3wgJCw,30.0,40.0,[350],
1,iZmn0dUXP9E,40.0,50.0,"[375, 376]",
2,iZ2K_GPtc6g,30.0,40.0,"[501, 503]",
3,iZtF1lUPbEQ,20.0,30.0,"[60, 130, 137]",
4,iZFRfh1Wjt8,420.0,430.0,"[300, 308, 316, 318]",
...,...,...,...,...,...
21777,2Cmn2lplGfc,30.0,40.0,[338],
21778,2ClQcAd-wJg,30.0,40.0,"[0, 398]",
21779,-Sclnq22t7o,30.0,40.0,[58],
21780,-SD9DkKyOrY,30.0,40.0,"[161, 162, 164, 165, 166, 168, 171, 172]",


In [5]:
!git clone https://github.com/audioset/ontology.git

Cloning into 'ontology'...
remote: Enumerating objects: 14, done.[K
remote: Total 14 (delta 0), reused 0 (delta 0), pack-reused 14[K
Receiving objects: 100% (14/14), 87.20 KiB | 628.00 KiB/s, done.
Resolving deltas: 100% (2/2), done.


In [7]:
label_index = pd.read_csv('/content/class_labels_indices.csv')
label_index

Unnamed: 0,index,mid,display_name
0,0,/m/09x0r,Speech
1,1,/m/05zppz,"Male speech, man speaking"
2,2,/m/02zsn,"Female speech, woman speaking"
3,3,/m/0ytgt,"Child speech, kid speaking"
4,4,/m/01h8n0,Conversation
...,...,...,...
522,522,/m/07p_0gm,Throbbing
523,523,/m/01jwx6,Vibration
524,524,/m/07c52,Television
525,525,/m/06bz3,Radio


In [42]:
import json

# Load the ontology JSON data
ontology_path = "/content/ontology/ontology.json"
with open(ontology_path, 'r') as file:
    ontology_data = json.load(file)

# Build a dict to map mid to class
mid_to_class = {}
for item in ontology_data:
  mid_to_class[item['id']] = item["name"]

In [44]:
index_to_class = {}
for i, item in label_index.iterrows():
  index_to_class[item['index']] = item['display_name']

In [56]:
# Build a dictionary to map child class to parent class
# ignore super classes except 'Music' - poor subdivision
super_classes = {'Channel, environment and background',
                 'Source-ambiguous sounds',
                 'Animal',
                 'Natural sounds',
                 'Sounds of things',
                 'Human sounds'}
child_to_parent = {}
for item in ontology_data:
  className = item['name']
  if className in super_classes:
    continue
  for child_mid in item["child_ids"]:
    child_class = mid_to_class[child_mid]
    # child may have multiple parent classes
    if child_class in child_to_parent:
      child_to_parent[child_class].append(className)
    else:
      child_to_parent[child_class] = [className]

child_to_parent

{'Speech': ['Human voice'],
 'Shout': ['Human voice'],
 'Screaming': ['Human voice'],
 'Whispering': ['Human voice'],
 'Laughter': ['Human voice'],
 'Crying, sobbing': ['Human voice'],
 'Wail, moan': ['Human voice'],
 'Sigh': ['Human voice'],
 'Singing': ['Human voice'],
 'Humming': ['Human voice'],
 'Groan': ['Human voice'],
 'Grunt': ['Human voice'],
 'Yawn': ['Human voice'],
 'Male speech, man speaking': ['Speech'],
 'Female speech, woman speaking': ['Speech'],
 'Child speech, kid speaking': ['Speech'],
 'Conversation': ['Speech'],
 'Narration, monologue': ['Speech'],
 'Babbling': ['Speech'],
 'Speech synthesizer': ['Speech'],
 'Bellow': ['Shout'],
 'Whoop': ['Shout'],
 'Yell': ['Shout'],
 'Battle cry': ['Shout'],
 'Children shouting': ['Shout', 'Human group actions'],
 'Baby laughter': ['Laughter'],
 'Giggle': ['Laughter'],
 'Snicker': ['Laughter'],
 'Belly laugh': ['Laughter'],
 'Chuckle, chortle': ['Laughter'],
 'Baby cry, infant cry': ['Crying, sobbing'],
 'Whimper': ['Crying, s

In [57]:
node_to_root = {}
# Function to find root label
def find_root(node):
  if node not in child_to_parent:
    return [node]
  parents = child_to_parent[node]
  res = set()
  for parent in parents:
    res.update(find_root(parent))
  return list(res)

for k in child_to_parent:
  node_to_root[k] = find_root(k)

node_to_root

{'Speech': ['Human voice'],
 'Shout': ['Human voice'],
 'Screaming': ['Human voice'],
 'Whispering': ['Human voice'],
 'Laughter': ['Human voice'],
 'Crying, sobbing': ['Human voice'],
 'Wail, moan': ['Human voice'],
 'Sigh': ['Human voice'],
 'Singing': ['Human voice'],
 'Humming': ['Human voice'],
 'Groan': ['Human voice'],
 'Grunt': ['Human voice'],
 'Yawn': ['Human voice'],
 'Male speech, man speaking': ['Human voice'],
 'Female speech, woman speaking': ['Human voice'],
 'Child speech, kid speaking': ['Human voice'],
 'Conversation': ['Human voice'],
 'Narration, monologue': ['Human voice'],
 'Babbling': ['Human voice'],
 'Speech synthesizer': ['Human voice'],
 'Bellow': ['Human voice'],
 'Whoop': ['Human voice'],
 'Yell': ['Human voice'],
 'Battle cry': ['Human voice'],
 'Children shouting': ['Human group actions', 'Human voice'],
 'Baby laughter': ['Human voice'],
 'Giggle': ['Human voice'],
 'Snicker': ['Human voice'],
 'Belly laugh': ['Human voice'],
 'Chuckle, chortle': ['Huma

In [58]:
for i, row in audio_metadata.iterrows():
  labels = row['labels']
  # de-duplicate root labels
  root_labels = set()
  # convert index labels to string labels
  class_labels = []
  for label in labels:
    className = index_to_class[label]
    class_labels.append(className)
    # get root class list
    root_classes = node_to_root.get(className, [className])
    # add root labels to set
    root_labels.update(root_classes)
  audio_metadata.at[i, 'labels'] = class_labels
  audio_metadata.at[i, 'root_classes'] = list(root_labels)

In [59]:
audio_metadata

Unnamed: 0,video_id,start_time_seconds,end_time_seconds,labels,root_classes
0,iZcPr3wgJCw,30.0,40.0,[Engine knocking],[Engine]
1,iZmn0dUXP9E,40.0,50.0,"[Toothbrush, Electric toothbrush]","[Domestic sounds, home sounds]"
2,iZ2K_GPtc6g,30.0,40.0,"[Sine wave, Chirp tone]",[Other sourceless]
3,iZtF1lUPbEQ,20.0,30.0,"[Fart, Buzz, Music]","[Wild animals, Digestive, Music, Onomatopoeia]"
4,iZFRfh1Wjt8,420.0,430.0,"[Vehicle, Vehicle horn, car horn, honking, Tru...","[Alarm, Vehicle]"
...,...,...,...,...,...
21777,2Cmn2lplGfc,30.0,40.0,"[Propeller, airscrew]",[Vehicle]
21778,2ClQcAd-wJg,30.0,40.0,"[Speech, Buzzer]","[Alarm, Human voice]"
21779,-Sclnq22t7o,30.0,40.0,"[Burping, eructation]",[Digestive]
21780,-SD9DkKyOrY,30.0,40.0,"[Percussion, Drum kit, Drum, Snare drum, Rimsh...",[Music]


In [None]:
from random import uniform, randint
from datetime import datetime, timedelta


# Define the time range
start_date = datetime(2000, 1, 1)
end_date = datetime(2023, 12, 31)
time_delta = end_date - start_date

# Function to generate a random date within the range
def random_date(start, delta):
    return start + timedelta(days=randint(0, delta.days), seconds=randint(0, 86400))

# Define a more constrained geographical range to increase likelihood of land placement
latitude_min, latitude_max = 24, 70  # Adjusted to reduce ocean placements
longitude_min, longitude_max = -125, -65  # Adjusted to reduce ocean placements

# Regenerate synthetic spatial data with the new constraints
audio_metadata['latitude'] = audio_metadata.apply(lambda x: round(uniform(latitude_min, latitude_max), 6), axis=1)
audio_metadata['longitude'] = audio_metadata.apply(lambda x: round(uniform(longitude_min, longitude_max), 6), axis=1)
audio_metadata['datetime'] = audio_metadata.apply(lambda x: random_date(start_date, time_delta), axis=1)

# Convert datetime to string for easier viewing and saving
audio_metadata['datetime'] = audio_metadata['datetime'].astype(str)

audio_metadata

Unnamed: 0,video_id,start_time_seconds,end_time_seconds,labels,latitude,longitude,datetime
0,wqoOX8K8DEU,30.0,40.0,[Alarm],44.436773,-82.618181,2017-11-10 21:32:17
1,wqH6Sj_h948,120.0,130.0,"[Speech, Glass]",49.174869,-105.107694,2021-12-27 21:48:57
2,wq1098my4zA,130.0,140.0,"[Music, Singing, Lullaby]",63.328074,-122.951617,2012-05-16 20:39:16
3,wqR7LHho-WE,10.0,20.0,"[Speech, Wail, moan, Crying, sobbing]",50.485902,-90.706898,2001-12-17 08:45:48
4,wq6Me-UUbSc,360.0,370.0,[Mechanisms],28.943491,-120.071508,2009-11-23 00:09:27
...,...,...,...,...,...,...,...
21777,2w6tV5kDGWo,240.0,250.0,[Music],52.449303,-78.761162,2021-06-02 04:06:01
21778,2wZCoeq9Ppc,80.0,90.0,"[Music, Theme music]",43.845412,-100.740085,2019-04-17 18:28:18
21779,2wajg-UP-Gs,0.0,10.0,[Arrow],52.756911,-109.192839,2007-11-17 09:40:49
21780,lZavPVn7O4Q,180.0,190.0,"[Music, Christian music, Music of Asia, Christ...",29.850410,-86.566702,2009-06-17 00:55:05


In [60]:
from google.colab import files
# save updated metadata
audio_metadata.to_csv('/content/test_metadata_rootclass.csv', index=False)
files.download('/content/test_metadata_rootclass.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>