<a href="https://colab.research.google.com/github/Jerry086/SALSA/blob/jerry/AudioSet_to_CSV.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# download the dataset
region = 'us'
!gsutil -m cp -r gs://{region}_audioset/youtube_corpus/v1/features/features.tar.gz /content/

Copying gs://us_audioset/youtube_corpus/v1/features/features.tar.gz...
/ [0/1 files][    0.0 B/  2.4 GiB]   0% Done                                    ==> NOTE: You are downloading one or more large file(s), which would
run significantly faster if you enabled sliced object downloads. This
feature is enabled by default but requires that compiled crcmod be
installed (see "gsutil help crcmod").

| [1/1 files][  2.4 GiB/  2.4 GiB] 100% Done  73.7 MiB/s ETA 00:00:00           
Operation completed over 1 objects/2.4 GiB.                                      


In [4]:
# extract the dataset
!tar -xzf /content/features.tar.gz -C /content/

In [32]:
# import dependencies
import tensorflow as tf
import pandas as pd
import os
import numpy as np

In [6]:
# Function to parse the SequenceExample
def parse_sequence_example(sequence_example_proto):
    # Define the context and feature list structure for parsing
    context_features = {
        "video_id": tf.io.FixedLenFeature([], dtype=tf.string),
        "start_time_seconds": tf.io.FixedLenFeature([], dtype=tf.float32),
        "end_time_seconds": tf.io.FixedLenFeature([], dtype=tf.float32),
        "labels": tf.io.VarLenFeature(tf.int64)
    }
    sequence_features = {
        "audio_embedding": tf.io.FixedLenSequenceFeature([], dtype=tf.string)
    }

    # Parse the input `SequenceExample` proto using the dictionary above
    context_parsed, sequence_parsed = tf.io.parse_single_sequence_example(
        sequence_example_proto,
        context_features=context_features,
        sequence_features=sequence_features
    )

    return context_parsed, sequence_parsed

In [9]:
# Function to get filename list
def get_files():
  # Base directory where the TFRecord files are extracted
  base_dir = '/content/audioset_v1_embeddings/bal_train'

  # List to store the paths of all TFRecord files
  tfrecord_files = []

  file_count = 0
  # Walk through the directory tree
  for subdir, dirs, files in os.walk(base_dir):
      for file in files:
          if file.endswith('.tfrecord'):
              # Construct the full file path and add it to the list
              full_path = os.path.join(subdir, file)
              tfrecord_files.append(full_path)
              file_count += 1

  print(f"Total number of .tfrecord files available: {file_count}")

  return tfrecord_files
  # for path in tfrecord_files:
  #     print(path)

In [12]:
# obtain file list
tfrecord_files = get_files()

Total number of .tfrecord files available: 4070


In [13]:
# Create a dataset from the tfrecord files
dataset = tf.data.TFRecordDataset(tfrecord_files)

In [15]:
# Map the parsing function over the dataset
parsed_dataset = dataset.map(parse_sequence_example)
parsed_dataset

<_MapDataset element_spec=({'labels': SparseTensorSpec(TensorShape([None]), tf.int64), 'end_time_seconds': TensorSpec(shape=(), dtype=tf.float32, name=None), 'start_time_seconds': TensorSpec(shape=(), dtype=tf.float32, name=None), 'video_id': TensorSpec(shape=(), dtype=tf.string, name=None)}, {'audio_embedding': TensorSpec(shape=(None,), dtype=tf.string, name=None)})>

In [18]:
# Function to decode and flatten audio embeddings
def decode_and_flatten_audio_embeddings(audio_embeddings):
    # Convert the audio embeddings from byte strings to uint8 tensors
    audio_embeddings_uint8 = tf.io.decode_raw(audio_embeddings, tf.uint8)
    # Flatten the audio embeddings
    flattened_embeddings = tf.reshape(audio_embeddings_uint8, [-1])
    return flattened_embeddings.numpy()

In [34]:
# Placeholders for records and embeddings
metadata_records = []
embedding_records = []

for context_parsed, sequence_parsed in parsed_dataset:
    video_id = context_parsed["video_id"].numpy().decode('utf-8')
    start_time = context_parsed["start_time_seconds"].numpy()
    end_time = context_parsed["end_time_seconds"].numpy()
    labels = tf.sparse.to_dense(context_parsed["labels"]).numpy()
    audio_embeddings = decode_and_flatten_audio_embeddings(sequence_parsed["audio_embedding"])

    # Check if the dimension of the embeddings is exactly 1280
    if audio_embeddings.shape[0] == 1280:
        # Metadata record
        metadata_record = {
            "video_id": video_id,
            "start_time_seconds": start_time,
            "end_time_seconds": end_time,
            "labels": labels.tolist()  # Convert to list for easier DataFrame handling
        }
        metadata_records.append(metadata_record)

        # Embeddings record, prepend video_id to link with metadata
        embedding_record = {"video_id": video_id}
        embedding_record.update({f"feature_{i}": audio_embeddings[i] for i in range(1280)})
        embedding_records.append(embedding_record)

# Convert to pandas DataFrames
metadata_df = pd.DataFrame(metadata_records)
embeddings_df = pd.DataFrame(embedding_records)

In [35]:
metadata_df

Unnamed: 0,video_id,start_time_seconds,end_time_seconds,labels
0,wqoOX8K8DEU,30.0,40.0,"[396, 397]"
1,wqH6Sj_h948,120.0,130.0,"[0, 441, 443]"
2,wq1098my4zA,130.0,140.0,"[27, 137, 271]"
3,wqR7LHho-WE,10.0,20.0,"[0, 22, 25]"
4,wq6Me-UUbSc,360.0,370.0,[413]
...,...,...,...,...
21777,2w6tV5kDGWo,240.0,250.0,[178]
21778,2wZCoeq9Ppc,80.0,90.0,"[137, 138, 185, 195, 196, 198, 268]"
21779,2wajg-UP-Gs,0.0,10.0,[459]
21780,lZavPVn7O4Q,180.0,190.0,"[137, 258, 260, 273]"


In [36]:
embeddings_df

Unnamed: 0,video_id,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,feature_1270,feature_1271,feature_1272,feature_1273,feature_1274,feature_1275,feature_1276,feature_1277,feature_1278,feature_1279
0,wqoOX8K8DEU,89,255,19,54,240,199,213,188,255,...,101,152,255,0,194,190,0,91,247,251
1,wqH6Sj_h948,89,74,221,113,99,254,42,97,135,...,98,38,255,0,255,255,124,29,0,255
2,wq1098my4zA,224,124,142,123,113,144,189,206,64,...,72,116,62,197,89,51,212,44,112,177
3,wqR7LHho-WE,147,207,173,147,216,98,144,0,218,...,86,81,91,123,192,51,255,255,101,0
4,wq6Me-UUbSc,70,90,168,90,220,90,130,61,129,...,91,184,83,32,131,167,167,111,180,101
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21777,2w6tV5kDGWo,155,99,141,33,81,109,108,237,176,...,217,196,195,104,81,0,47,194,56,196
21778,2wZCoeq9Ppc,102,255,136,0,0,95,255,255,255,...,80,0,233,146,255,200,0,0,0,96
21779,2wajg-UP-Gs,132,72,60,190,75,6,114,73,153,...,255,156,156,35,255,0,0,255,255,0
21780,lZavPVn7O4Q,224,112,85,133,151,193,112,127,154,...,127,207,66,83,106,204,154,74,145,62


In [38]:
# Save DataFrames to CSV files
metadata_df.to_csv("/content/audio_metadata.csv", index=False)
embeddings_df.to_csv("/content/audio_embeddings.csv", index=False)

In [39]:
from google.colab import files

# Download the metadata CSV
files.download('/content/audio_metadata.csv')

# Download the embeddings CSV
files.download('/content/audio_embeddings.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>