The dataset used here consists of the features extracted from the VoxCeleb1 dataset using the DAE.

This dataset was stored as NumPy arrays, i.e. in .npy files.

For performance reasons, we switch to a better storage format: TFRecords.

So in this notebook we will copy the dataset into TFRecord files.

In [1]:
import numpy as np
import gc
import tensorflow as tf
import os
from tqdm import tqdm

In [None]:
# Mount google drive to colab
# The Drive is mounted at /content/drive; the input and output paths used
# by this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Collect the Drive paths of the .npy files that contain the features.
# The parts are numbered from 1 to 7.
dataset_files = [
  f'/content/drive/MyDrive/30K_vox{i+1}.npy' for i in range(7)
]
print(dataset_files)

In [None]:
# Write function
def record_write(path, x, y):
  """Write paired examples and labels to one GZIP-compressed TFRecord file.

  Args:
    path: Destination path of the TFRecord file.
    x: Array of examples; `x[i]` is flattened before being stored.
    y: Array of labels; `y[i]` is stored next to `x[i]`.
  """
  def as_float_feature(values):
    # Wrap a flat sequence of floats into a tf.train.Feature.
    return tf.train.Feature(float_list=tf.train.FloatList(value=values))

  with tf.io.TFRecordWriter(path, 'GZIP') as writer:
    for idx in range(x.shape[0]):
      example = tf.train.Example(
          features=tf.train.Features(feature={
              "x": as_float_feature(x[idx].flatten()),
              "y": as_float_feature(y[idx]),
          })
      )
      writer.write(example.SerializeToString())

def record_read(record_bytes):
  """Parse one serialized example back into an `(x, y)` pair of tensors.

  Args:
    record_bytes: A serialized `tf.train.Example` with float features
      "x" (58*32*16 values) and "y" (1251 values).

  Returns:
    A tuple `(x, y)` where `x` is reshaped to (58, 32, 16) and `y` is the
    float32 tensor of 1251 values.
  """
  schema = {
      "x": tf.io.FixedLenFeature([58*32*16], dtype=tf.float32),
      "y": tf.io.FixedLenFeature([1251], dtype=tf.float32),
  }
  example = tf.io.parse_single_example(record_bytes, schema)
  # "x" is stored as a flat list, so restore its (58, 32, 16) layout.
  return tf.reshape(example['x'], (58, 32, 16)), example['y']

In [None]:
# Convert every .npy part into a set of GZIP-compressed TFRecord shards.

# It is recommended to have 100MB file size.
# So for our case this happens with about 1024 examples per file.
# Powers of 2 for better performance.
FILE_SIZE = (1<<10)

# Make sure the output folder exists, otherwise creating the shards fails.
OUTPUT_DIR = '/content/drive/MyDrive/dataset'
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Loop through the files. `f` is the zero-based part index (used in the
# shard file names), the log message shows the one-based part number.
for f, npy_path in enumerate(dataset_files):
  ############### Data Pipeline ##################################
  gc.collect()
  print(f'start loadng part: {f+1}, which is: {npy_path}')

  # Load the numpy array.
  # NOTE: allow_pickle=True is required because the file stores an object
  # array. Unpickling can execute arbitrary code, so only load trusted files.
  data = np.load(npy_path, allow_pickle=True)

  # Each npy file contains a numpy array of objects.
  # There are two columns, the first one contains the examples, and the
  # second contains the labels. The columns are copied so that `data`
  # itself can be released right away.
  x_train_copy = data[:, 0].copy()
  y_train_copy = data[:, 1].copy()
  del data
  gc.collect()

  # Reshape the arrays to the original shape and specify the type
  # (since we have object type until now). The conversion is done row by
  # row into a preallocated array to keep the peak memory usage low.
  num_examples = x_train_copy.shape[0]
  x_train = np.zeros((num_examples, 58, 32, 16), dtype=np.float32)
  for i in range(num_examples):
    x_train[i] = np.asarray(
        x_train_copy[i],
        dtype=np.float32
    ).reshape(58, 32, 16)
  print(x_train.shape)

  # The label width is taken from the first label of the part.
  y_train = np.zeros(
      (y_train_copy.shape[0], y_train_copy[0].shape[0]),
      dtype=np.float32
  )
  for i in range(y_train.shape[0]):
    y_train[i] = np.asarray(y_train_copy[i], dtype=np.float32)
  print(y_train.shape)

  del x_train_copy
  del y_train_copy
  gc.collect()
  ############### Data Pipeline ##################################
  ############### Data Storing ###################################

  # Number of shards needed to store every example (ceiling division).
  # It is derived from the data instead of being hard-coded, so that no
  # example is dropped for large parts and no empty shard is written for
  # small ones. The last shard may hold fewer than FILE_SIZE examples.
  num_shards = (num_examples + FILE_SIZE - 1) // FILE_SIZE
  for i in tqdm(range(1, num_shards + 1)):
    st = (i-1)*FILE_SIZE
    en = i*FILE_SIZE
    file_path = f'{OUTPUT_DIR}/vox_part{f}.{i}.tfrecords'
    # create the file
    # NOTE(review): presumably done so the file exists on the Drive mount
    # before the writer opens it -- confirm whether it is still needed.
    open(file_path, 'a').close()
    # write to the file
    record_write(file_path, x_train[st:en], y_train[st:en])

  ############### Data Storing ###################################
  del x_train
  del y_train
  gc.collect()
