In [25]:
from collections import defaultdict
from pathlib import Path
from typing import Any, Dict, List

import pandas as pd

from src.annotate_data.annotate_from_path import data_path_to_annotation_list
from src.config.project_paths import get_voxceleb1_metadata_path, get_data_file_path, get_voxceleb_path, \
    get_project_root_path, get_voxceleb2_metadata_path, get_voxceleb2_path

## VoxCeleb 1

In [17]:
# Load the VoxCeleb1 speaker metadata table (tab-separated file).
voxceleb1_metadata_path = get_voxceleb1_metadata_path()
metadata = pd.read_csv(voxceleb1_metadata_path, sep="\t")

In [18]:
# Preview the first rows of the loaded metadata table.
metadata.head()

Unnamed: 0,user_id,user_name,gender,nationality,set
0,id10001,A.J._Buckley,m,Ireland,dev
1,id10002,A.R._Rahman,m,India,dev
2,id10003,Aamir_Khan,m,India,dev
3,id10004,Aaron_Tveit,m,USA,dev
4,id10005,Aaron_Yoo,m,USA,dev


In [19]:
# Build the annotation records for the "train" split of the VoxCeleb data.
voxceleb_train_path = get_voxceleb_path("train")
annotations_train = data_path_to_annotation_list(voxceleb_train_path, "train")

In [20]:
# Build the annotation records for the "test" split of the VoxCeleb data.
voxceleb_test_path = get_voxceleb_path("test")
annotations_test = data_path_to_annotation_list(voxceleb_test_path, "test")

In [21]:
# Concatenate both splits into one list, train records first, then test records.
all_annotations = []
all_annotations.extend(annotations_train)
all_annotations.extend(annotations_test)

In [22]:
# One DataFrame row per annotation record.
annotations_df = pd.DataFrame(data=all_annotations)

In [23]:
# Preview the first annotation rows as built from the record list.
annotations_df.head()

Unnamed: 0,wav_path,user_id,split
0,/home/mateusz/ai/biometry/VoiceAuthenticatior/...,id10092,train
1,/home/mateusz/ai/biometry/VoiceAuthenticatior/...,id10092,train
2,/home/mateusz/ai/biometry/VoiceAuthenticatior/...,id10092,train
3,/home/mateusz/ai/biometry/VoiceAuthenticatior/...,id10092,train
4,/home/mateusz/ai/biometry/VoiceAuthenticatior/...,id10092,train


In [24]:
# Rewrite absolute wav paths as paths relative to the project root.
#
# The column is overwritten in place, so the conversion has to be idempotent:
# Path.relative_to raises ValueError for a path that does not start with the
# root, which is exactly what an already-converted (relative) path looks like
# when this cell is run a second time. Relative paths are therefore passed
# through unchanged.
project_root = get_project_root_path()  # looked up once, not once per row
annotations_df["wav_path"] = annotations_df["wav_path"].apply(
    lambda wav_path: str(
        Path(wav_path).relative_to(project_root)
        if Path(wav_path).is_absolute()
        else Path(wav_path)
    )
)

In [25]:
# Preview the rows again to check that wav_path is now relative to the project root.
annotations_df.head()

Unnamed: 0,wav_path,user_id,split
0,data/vox1_dev_wav/wav/id10092/LbVIZMrQGmQ/0000...,id10092,train
1,data/vox1_dev_wav/wav/id10092/LbVIZMrQGmQ/0000...,id10092,train
2,data/vox1_dev_wav/wav/id10092/LbVIZMrQGmQ/0000...,id10092,train
3,data/vox1_dev_wav/wav/id10092/9YDXIU5SmUo/0000...,id10092,train
4,data/vox1_dev_wav/wav/id10092/9YDXIU5SmUo/0000...,id10092,train


In [26]:
# Attach the speaker metadata to every annotation row. A left join keeps all
# annotation rows, including those whose user_id has no metadata entry.
annotations_with_metadata = annotations_df.merge(metadata, how="left", on="user_id")

In [27]:
# Preview the merged table: annotation columns followed by the metadata columns.
annotations_with_metadata.head(5)

Unnamed: 0,wav_path,user_id,split,user_name,gender,nationality,set
0,data/vox1_dev_wav/wav/id10092/LbVIZMrQGmQ/0000...,id10092,train,Bingbing_Li,f,China,dev
1,data/vox1_dev_wav/wav/id10092/LbVIZMrQGmQ/0000...,id10092,train,Bingbing_Li,f,China,dev
2,data/vox1_dev_wav/wav/id10092/LbVIZMrQGmQ/0000...,id10092,train,Bingbing_Li,f,China,dev
3,data/vox1_dev_wav/wav/id10092/9YDXIU5SmUo/0000...,id10092,train,Bingbing_Li,f,China,dev
4,data/vox1_dev_wav/wav/id10092/9YDXIU5SmUo/0000...,id10092,train,Bingbing_Li,f,China,dev


In [28]:
# Persist the merged VoxCeleb1 annotations as a JSON array with one object per row.
annotations_json_path = get_data_file_path("annotations_with_metadata.json")
annotations_with_metadata.to_json(annotations_json_path, orient="records")

## VoxCeleb 2

In [54]:
# Load the VoxCeleb2 metadata and normalise it to the columns
# user_id / gender / set.
#
# The raw header names carry trailing whitespace ("VoxCeleb2 ID ", "Gender ",
# ...). Stripping the header names first means the drop/rename below no longer
# depend on the exact amount of padding in the file.
voxceleb2_metadata = (
    pd.read_csv(get_voxceleb2_metadata_path(), sep=',')
    .rename(columns=str.strip)
    .drop(columns=["VGGFace2 ID"])
    .rename(columns={"VoxCeleb2 ID": "user_id", "Gender": "gender", "Set": "set"})
)

# The cell values are whitespace-padded as well; strip every kept column.
for column_name in ("user_id", "gender", "set"):
    voxceleb2_metadata[column_name] = voxceleb2_metadata[column_name].str.strip()

voxceleb2_metadata.head()

Unnamed: 0,user_id,gender,set
0,id00012,m,dev
1,id00015,m,dev
2,id00016,m,dev
3,id00017,m,test
4,id00018,m,dev


In [67]:
# Root directory of the VoxCeleb2 data; its entries are iterated in the cells below.
voxceleb2_path = get_voxceleb2_path()

In [68]:
# Materialise the directory listing into a sorted list.
# Path.iterdir() returns a one-shot generator that yields entries in arbitrary
# order: once a loop has consumed it, re-running that loop silently iterates
# over nothing. A sorted list can be iterated repeatedly and gives the same
# order on every run.
id_paths = sorted(voxceleb2_path.iterdir())

In [69]:
# Accumulator for the VoxCeleb2 annotation records (one dict per file).
# The value type is typing.Any; lowercase `any` is the builtin function, not a type.
annotations: List[Dict[str, Any]] = []

In [70]:
# Walk the VoxCeleb2 tree (three directory levels: id / sample / file) and
# create one annotation record per file.
#
# The list is rebuilt from scratch and the directory is listed here directly,
# so re-running this cell neither duplicates records nor depends on a
# previously consumed iterator. Listings are sorted because Path.iterdir()
# yields entries in arbitrary order.
project_root = get_project_root_path()  # looked up once, not once per file
annotations = []
for id_path in sorted(voxceleb2_path.iterdir()):
    if not id_path.is_dir():
        continue  # stray file next to the id directories; iterdir() on it would raise
    for sample_path in sorted(id_path.iterdir()):
        if not sample_path.is_dir():
            continue  # stray file inside an id directory
        for mp4_path in sorted(sample_path.iterdir()):
            annotations.append(
                {
                    # Stored as str, like the VoxCeleb1 wav_path column.
                    "wav_path": str(mp4_path.relative_to(project_root)),
                    # .name keeps the full directory name; .stem would cut
                    # everything after a dot in it.
                    "user_id": id_path.name
                }
            )

In [71]:
# Number of annotation records collected by the directory walk.
len(annotations)

36237

In [72]:
# Lookup table keyed by user_id; each value is a dict of the form {"gender": ...}.
# Exact duplicate (user_id, gender) rows are removed before indexing.
user_gender_pairs = voxceleb2_metadata[["user_id", "gender"]].drop_duplicates()
voxceleb2_user_to_metadata = user_gender_pairs.set_index("user_id").to_dict(orient="index")

In [74]:
# Add the speaker's gender to every annotation record (records are updated in place).
# A user_id missing from the lookup table raises KeyError.
for annotation in annotations:
    user_metadata = voxceleb2_user_to_metadata[annotation["user_id"]]
    annotation["gender"] = user_metadata["gender"]

In [77]:
# Persist the VoxCeleb2 annotations as a JSON array with one object per record.
# default_handler=str serialises values that are not natively JSON-encodable.
voxceleb2_annotations_df = pd.DataFrame(annotations)
voxceleb2_annotations_json_path = get_data_file_path("annotations_voxceleb2.json")
voxceleb2_annotations_df.to_json(voxceleb2_annotations_json_path, orient="records", default_handler=str)