In [16]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import torchaudio.transforms as T

from datasets import Dataset, DatasetDict

In [12]:
data_file = "/om2/user/msaddler/spatial_audio_pipeline/assets/commonvoice_9_en/manifest_all_word_alignments.pdpkl"
df = pd.read_pickle(data_file)[["client_id", "split", "sr", "src_fn", "total_file_duration_in_s", "gender", "word"]]

In [13]:
df["sentence"] = df.groupby(["src_fn", "client_id", "split", "sr", "total_file_duration_in_s", "gender"])["word"].transform(lambda x : ' '.join(x))
df = df.drop_duplicates("src_fn").reset_index().drop(columns=["index", "word"])

In [14]:
#drop ,missing values in gender
df = df.dropna(axis=0, subset="gender")
df = df.dropna(axis=0, subset="sentence")
df = df.loc[df.gender != "other"]
print(len(df))
#select samples > 2sec
df = df.loc[df.total_file_duration_in_s >= 2]
print(len(df))
#select speakers with > 300 samples
male_speakers = df.loc[df.gender=="male"].client_id.value_counts().index.tolist()[:100]
female_speakers = df.loc[df.gender=="female"].client_id.value_counts().index.tolist()[:100]
df = df.loc[df.client_id.isin(male_speakers+female_speakers)]
print(len(df))

722465
716395
316086


In [17]:
speakers = df.client_id.value_counts().index.tolist()
temp = []
for speaker in tqdm(speakers):
    speaker_df = df.loc[df.client_id==speaker]
    if len(speaker_df) > 400:
        speaker_df = speaker_df.sample(400)
    temp.append(speaker_df)
df = pd.concat(temp)
df

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [00:03<00:00, 57.18it/s]


Unnamed: 0,client_id,split,sr,src_fn,total_file_duration_in_s,gender,sentence
1480475,372293e65cdab88771e028a4351651ab2eff64438ddafc...,train,48000,/om2/data/public/mozilla-CommonVoice-9.0/cv-co...,9.192,male,both accounts concur that green first heard jo...
1497928,372293e65cdab88771e028a4351651ab2eff64438ddafc...,train,32000,/om2/data/public/mozilla-CommonVoice-9.0/cv-co...,8.064,male,some performers utilize complex combinations o...
1479941,372293e65cdab88771e028a4351651ab2eff64438ddafc...,train,48000,/om2/data/public/mozilla-CommonVoice-9.0/cv-co...,7.200,male,the past few years have witnessed important gr...
1487979,372293e65cdab88771e028a4351651ab2eff64438ddafc...,train,48000,/om2/data/public/mozilla-CommonVoice-9.0/cv-co...,4.656,male,everything was deadly still
1478405,372293e65cdab88771e028a4351651ab2eff64438ddafc...,train,48000,/om2/data/public/mozilla-CommonVoice-9.0/cv-co...,7.200,male,in spite of this blackthorne becomes a trusted...
...,...,...,...,...,...,...,...
984702,4cbced96a01d967939d63d4d35b3068a70aabb0dcc567b...,train,32000,/om2/data/public/mozilla-CommonVoice-9.0/cv-co...,5.580,female,zac bertschy has described gendo as being a gi...
984703,4cbced96a01d967939d63d4d35b3068a70aabb0dcc567b...,train,32000,/om2/data/public/mozilla-CommonVoice-9.0/cv-co...,5.940,female,he then bought out little david records the la...
984704,4cbced96a01d967939d63d4d35b3068a70aabb0dcc567b...,train,32000,/om2/data/public/mozilla-CommonVoice-9.0/cv-co...,9.036,female,her maternal greatgrandmother was english and ...
984705,4cbced96a01d967939d63d4d35b3068a70aabb0dcc567b...,train,32000,/om2/data/public/mozilla-CommonVoice-9.0/cv-co...,5.076,female,it passes through the city of brampton and the...


In [18]:
df = df.rename(columns={"src_fn":"wav_path"})
df["speaker_int"] = df.client_id.map({l: i for i, l in enumerate(df.client_id.unique())})
df.drop(columns=["split"], inplace=True)

In [20]:
temp = []
for speaker in speakers:
    speaker_df = df.loc[df.client_id == speaker]
    msk = np.random.rand(len(speaker_df)) < 0.7
    train = speaker_df[msk]
    test = speaker_df[~msk]
    val = test.sample(frac=0.5)
    test = test.drop(val.index)
    train["split"] = "train"
    test["split"] = "test"
    val["split"] = "val"
    temp.append(train)
    temp.append(val)
    temp.append(test)
df = pd.concat(temp)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train["split"] = "train"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train["split"] = "train"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train["split"] = "train"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See th

In [22]:
df.to_csv("../commonvoice_data_curated.csv")

In [21]:
df

Unnamed: 0,client_id,sr,wav_path,total_file_duration_in_s,gender,sentence,speaker_int,split
1480475,372293e65cdab88771e028a4351651ab2eff64438ddafc...,48000,/om2/data/public/mozilla-CommonVoice-9.0/cv-co...,9.192,male,both accounts concur that green first heard jo...,0,train
1487979,372293e65cdab88771e028a4351651ab2eff64438ddafc...,48000,/om2/data/public/mozilla-CommonVoice-9.0/cv-co...,4.656,male,everything was deadly still,0,train
1478405,372293e65cdab88771e028a4351651ab2eff64438ddafc...,48000,/om2/data/public/mozilla-CommonVoice-9.0/cv-co...,7.200,male,in spite of this blackthorne becomes a trusted...,0,train
1491837,372293e65cdab88771e028a4351651ab2eff64438ddafc...,48000,/om2/data/public/mozilla-CommonVoice-9.0/cv-co...,4.752,male,how foolish to reveal those talons to him,0,train
1491882,372293e65cdab88771e028a4351651ab2eff64438ddafc...,48000,/om2/data/public/mozilla-CommonVoice-9.0/cv-co...,4.584,male,the two knives were left implanted,0,train
...,...,...,...,...,...,...,...,...
984682,4cbced96a01d967939d63d4d35b3068a70aabb0dcc567b...,32000,/om2/data/public/mozilla-CommonVoice-9.0/cv-co...,4.248,female,and then a shadow came between rocco and the sun,199,test
984686,4cbced96a01d967939d63d4d35b3068a70aabb0dcc567b...,32000,/om2/data/public/mozilla-CommonVoice-9.0/cv-co...,4.320,female,contemporary reviews were generally favorable,199,test
984693,4cbced96a01d967939d63d4d35b3068a70aabb0dcc567b...,32000,/om2/data/public/mozilla-CommonVoice-9.0/cv-co...,4.356,female,the station acquired the nickname of hooligan ...,199,test
984697,4cbced96a01d967939d63d4d35b3068a70aabb0dcc567b...,32000,/om2/data/public/mozilla-CommonVoice-9.0/cv-co...,5.688,female,numerous alumni are also involved in postcolle...,199,test


In [19]:
df.sentence.isnull().any()

False

In [1]:
import os
import fire
import shutil
from glob import glob

from utils import *
from data import DataCollator
from tokenizer import Tokenizer

import torch
from torch.utils.data import DataLoader

from datasets import Dataset, DatasetDict

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
config_path = "/om2/user/gelbanna/voice-speech-metamers/config.yaml"

In [3]:
# Load config file
config = load_yaml_config(config_path)

# 1. Read data and split it

#read csv data as dataframe
df = pd.read_csv(config.data.data_path)

#convert dataframes into huggingface dataset
raw_datasets = DatasetDict()
raw_datasets["train"] = Dataset.from_pandas(df.loc[df.split == "train"])
raw_datasets["validation"] = Dataset.from_pandas(df.loc[df.split == "val"])
raw_datasets["test"] = Dataset.from_pandas(df.loc[df.split == "test"])

In [4]:
# 2. Define Tokenizer

#define a tokenizer for the vocabulary
tokenizer = Tokenizer(**config.text)

In [6]:
# 3. Preprocess and tokenize data

#define paths for cached files
cache_file_names = None
if config.data.cache_file_path is not None:
    cache_file_names = {"train": f"{config.data.cache_file_path}/train", 
                        "validation": f"{config.data.cache_file_path}/val",
                        "test": f"{config.data.cache_file_path}/test"}
#load, resample and tokenize audio files
remove_columns = ["wav_path", "split", "client_id", "Unnamed: 0", "sr", "gender", "total_file_duration_in_s", "__index_level_0__"]
vectorized_datasets = raw_datasets.map(prepare_data,
                                    remove_columns=remove_columns,
                                    cache_file_names=cache_file_names,
                                    fn_kwargs={"target_sr": config.data.sampling_rate, "tokenizer": tokenizer})

In [7]:
vectorized_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence', 'speaker_int', '__index_level_0__', 'audio', 'text', 'text_length'],
        num_rows: 54845
    })
    validation: Dataset({
        features: ['sentence', 'speaker_int', '__index_level_0__', 'audio', 'text', 'text_length'],
        num_rows: 11790
    })
    test: Dataset({
        features: ['sentence', 'speaker_int', '__index_level_0__', 'audio', 'text', 'text_length'],
        num_rows: 11801
    })
})

In [9]:
# 4. Define DataCollator and DataLoaders

data_collator = DataCollator()
train_dataloader = DataLoader(
        vectorized_datasets["train"],
        shuffle=True,
        collate_fn=data_collator,
        batch_size=config.dataloader.per_device_train_batch_size,
        num_workers=config.dataloader.num_workers,
    )
eval_dataloader = DataLoader(
    vectorized_datasets["validation"], 
    collate_fn=data_collator, 
    batch_size=config.dataloader.per_device_eval_batch_size,
    num_workers=config.dataloader.num_workers,
)



In [None]:
# 5. Define Encoders (ECAPA and Whisper) and Joint model

#load pre-trained encoder model
speaker_encoder = Speaker_Encoder(finetuning_config.encoder.model_cache)
speech_encoder = Speech_Encoder(finetuning_config.encoder.model_cache)

#define joint encoder
saganet = Joint_Encoder()

In [12]:
x = next(iter(train_dataloader))

In [17]:
x["speech_labels"]

[tensor([ 3, 29, 21, 22, 20, 11, 13,  7, 29,  5, 20,  3,  8, 22, 29, 17,  8, 29,
         22, 10,  7, 29, 21, 17, 23, 22, 10, 29,  3,  8, 20, 11,  5,  3, 16, 29,
         16,  3, 24, 27, 29, 25,  3, 21, 29, 16,  3, 15,  7,  6, 29,  3,  8, 22,
          7, 20, 29, 10, 11, 15,  1]),
 tensor([22, 10,  7, 29,  3, 25,  3, 20,  6, 29, 11, 21, 29,  9, 11, 24,  7, 16,
         29, 22, 17, 29, 22, 10,  7, 29,  4,  7, 21, 22, 29, 16,  7, 25, 14, 27,
         29, 20,  7, 14,  7,  3, 21,  7,  6, 29, 24,  7, 10, 11,  5, 14,  7, 29,
          7,  3,  5, 10, 29, 27,  7,  3, 20,  1]),
 tensor([18, 11,  5, 13, 14,  7,  6, 29,  3, 20, 15,  7, 16, 11,  3, 16, 29,  5,
         23,  5, 23, 15,  4,  7, 20, 29, 11, 21, 29, 21, 17, 14,  6, 29, 11, 16,
         29, 15, 11,  6,  6, 14,  7, 29,  7,  3, 21, 22,  7, 20, 16, 29, 15,  3,
         20, 13,  7, 22, 21, 29,  3, 21, 29, 18, 11,  5, 13, 14,  7,  6, 29, 25,
         11, 14,  6, 29,  5, 23,  5, 23, 15,  4,  7, 20,  1]),
 tensor([16, 17, 22, 29,  3, 29,  9, 

In [18]:
x["sentence"]

['a strike craft of the south african navy was named after him',
 'the award is given to the best newly released vehicle each year',
 'pickled armenian cucumber is sold in middle eastern markets as pickled wild cucumber',
 'not a good idea to shoot the messenger',
 'it consisted of the county of richmond',
 'it is a carnegie library',
 'position was eventually rerecorded for this album and features only zander singing',
 'the child was rushed to the hospital after swallowing a crayon']