# CommonVoice Data Preparation

## Pseudocode:

- Require 4 directories: cv_wav, cv_rttm, cv_uem, wav_pool
- Preparation
    1. Prepare audio-preprocessing functions (using Pydub, ffmpeg)
        - volume normalization
        - silence trimming
        - pitch shifting
        - downmixing
    2. Query cv data dictionary into dataframe
    3. Define dict mapping between (age, gender) and client_id called "classes"
        - ex. { (age=50, gender=m) : [ id0, id1], (age=40, gender=m) : [ id2, id3] }
    4. Define dict mapping between client_id and audio paths called "paths"
        - ex. { id0 : [ audio0, audio1], id1 : [ audio2, audio3] }
    ```
    ERD
     ______________             ___________           _______ 
    |              |    1:m    |           |   1:m   |       |
    | (age,gender) |  ------>  | client_id | ------> | audio |
    |______________| <classes> |___________| <paths> |_______|
    ```
    5. Add your own audio files into classes and paths

## Preparation

In [36]:
!mkdir cv_wav cv_rttm cv_uem # keep results of generating files
!mkdir wav_pool # transfer folder from mp3 to wav

In [21]:
import pandas as pd
import numpy as np
import os
from pydub import AudioSegment

In [22]:
from utils.writeFile import *
from utils.concatenator import *
from utils.overlapper import *

In [23]:
# Query CommonVoice data dictionary into DF
path = "cv-corpus-9.0-2022-04-27/th/"
df_cv = pd.read_csv(path+"validated.tsv", delimiter="\t")
for file in ["invalidated.tsv", "other.tsv"]:
    df = pd.read_csv(path+file, delimiter="\t")
    df_cv = pd.concat([df_cv,df])
df_cv.info()
df_cv.head(3)

  df_cv = pd.read_csv(path+"validated.tsv", delimiter="\t")
  df = pd.read_csv(path+file, delimiter="\t")


<class 'pandas.core.frame.DataFrame'>
Int64Index: 311794 entries, 0 to 178340
Data columns (total 10 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   client_id   311794 non-null  object
 1   path        311794 non-null  object
 2   sentence    311794 non-null  object
 3   up_votes    311794 non-null  int64 
 4   down_votes  311794 non-null  int64 
 5   age         175191 non-null  object
 6   gender      175213 non-null  object
 7   accents     405 non-null     object
 8   locale      311794 non-null  object
 9   segment     48 non-null      object
dtypes: int64(2), object(8)
memory usage: 26.2+ MB


Unnamed: 0,client_id,path,sentence,up_votes,down_votes,age,gender,accents,locale,segment
0,00086f5dc46f9038f13bbd829c4118fab3ac28688d5ee8...,common_voice_th_25695281.mp3,ใครเป็นผู้รับ,2,0,,,,th,
1,00245363f02e6f86a15de6793dc282d6f7095a8db6889b...,common_voice_th_31277118.mp3,รู้ได้ไงว่าเขาไม่หนุก,2,0,,,,th,
2,005237750a4d82bf743fe1e6eb0e4a579f0eb6a2455aaf...,common_voice_th_25722081.mp3,การที่จะทำเค้กแต่งงานชั้นเลิศคุณจะต้องใช้น้ำตา...,3,0,,,,th,


In [24]:
# Classify client_id by age and gender into dict
# classes[(age, gender)] = list of ids of the selected age, gender
ages = ["teens", "twenties", "thirties", "fourties", "fifties"]
genders = ["male", "female"]
classes = {}
import itertools
keys = list(itertools.product(ages,genders))
for age, gender in keys:
    condition = (df_cv["age"] == age) & (df_cv["gender"] == gender)
    dfx = df_cv[condition]
    id_list = dfx.iloc[:, :2]\
        .groupby(["client_id"])\
        .count()\
        .sort_values(by="path", ascending=False)
    id_list = id_list[id_list["path"]>=150].index
    classes[(age,gender)] = list(id_list[:5])

In [25]:
# paths[id] = list of audio file of that id
mp3_path = "cv-corpus-9.0-2022-04-27/th/clips/"
paths = {}
for key in keys:
    for id in classes[key]:
        paths[id] = list(mp3_path + df_cv[df_cv["client_id"] == id].path)
len(paths.keys())

39

In [26]:
# Add your own video clips
# classes
own_id = ["chuan", "prayut", "sutin", "visanu",
          "4f0p125", "4f0p-250", "4f1p125",
          "amarat", "amap-187", "5f0p187", "5f0p-250", ]
classes[("fifties", "male")].extend(own_id[:4])
classes[("fourties", "female")].extend(own_id[4:7])
classes[("fifties", "female")].extend(own_id[7:])

# paths
own_path = "own/"
for enum, id in enumerate(own_id):
    paths[id] = []
    for file in os.listdir(own_path):
        if file.startswith(id):
            paths[id].append(own_path+file)

In [27]:
# refill used up paths
def refill_paths(id):
    print(". . .filling", id[:6])
    if id in own_id:
        # from own clips
        path = own_path
        for file in os.listdir(own_path):
            if file.startswith(id):
                paths[id].append(own_path+file)
    else: 
        # from common voice
        path = mp3_path
        paths[id] = list(path + df_cv[df_cv["client_id"] == id].path)

## Generating

In [28]:
metadata = []

In [29]:
# Generate Indexes
# 150 files, 4 id per file = 600 ids
# 10 class(by age and gender) --> 60 ids per class
arr = []
num_class = len(keys)
num_files = 150
amount_per_class = num_files*4//num_class
for i in range(num_class):
    arr = arr + [i]*(amount_per_class)
    # generate list of [0]*60 + [1]*60 +...+ [9]*60
indices = []
for i in range(num_files):
    turn = []
    for j in range(4):
        id = np.random.choice(len(arr))
        turn.append(arr.pop(id))
    indices.append(turn)
indices[:5]

[[6, 4, 7, 3], [5, 8, 7, 9], [3, 1, 0, 6], [6, 8, 0, 6], [4, 6, 7, 3]]

In [None]:
for enum in range(0,90):
    index = indices.pop(0)
    id_list = [classes[keys[i]][np.random.choice(len(classes[keys[i]]))] for i in index]
    while len(id_list)!=len(set(id_list)):
        id_list = [classes[keys[i]][np.random.choice(len(classes[keys[i]]))] for i in index]
    ag = [keys[i] for i in index]
    audioName = f"cv02_ovl_{enum}"
    print(". . .generating", audioName)
    rttm, duration, metadata = overlapper(
        client_id=id_list,
        speaker_id=[id[:6] for id in id_list],
        paths=paths,
        outputName=audioName,
        refill_callback = refill_paths, 
        metadata = metadata, 
        age_gender_keys = [keys[i] for i in index]
    )
    writeFile(content=rttm, outputName=f"cv_rttm/{audioName}.rttm")
    uem = f"{audioName} 1 0 {duration}"
    writeFile(content=uem, outputName=f"cv_uem/{audioName}.uem")

In [None]:
for enum in range(0,60):
    index = indices.pop(0)
    id_list = [classes[keys[i]][np.random.choice(len(classes[keys[i]]))] for i in index]
    # check if id_list have duplicate client_id
    while len(id_list)!=len(set(id_list)):
        id_list = [classes[keys[i]][np.random.choice(len(classes[keys[i]]))] for i in index]
        assert id_list==[] # prevent infinite loop
    audioName = f"cv02_cct_{enum}"
    print(". . .generating", audioName)
    rttm, duration, metadata = concatenator(
        client_id = id_list,
        speaker_id = [id[:6] for id in id_list],
        paths = paths,
        outputName = audioName,
        refill_callback = refill_paths,
        metadata = metadata, 
        age_gender_keys = [keys[i] for i in index]
    )
    writeFile(content=rttm, outputName=f"cv_rttm/{audioName}.rttm")
    uem = f"{audioName} 1 0 {duration}"
    writeFile(content=uem, outputName=f"cv_uem/{audioName}.uem")

In [133]:
# save metadata as csv
import csv
header = ["name", "client_id", "age", "gender", "audio", "duration"]
with open('cv02_metadata.csv', 'w', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(header)
    writer.writerows(metadata)

## Metadata

In [16]:
# using metadata
import pandas as pd
df = pd.read_csv("cv02_metadata.csv")
print(df.groupby(["age","gender"])["duration"].agg(duration_hour='sum')/60/60)
df.sample(3)

                 duration_hour
age      gender               
fifties  female       6.559556
         male         7.099241
fourties female       5.596265
         male         4.689603
teens    female       5.717852
         male         5.115534
thirties female       4.655827
         male         5.244366
twenties female       6.172873
         male         4.703438


Unnamed: 0,name,client_id,age,gender,audio,duration
35980,cv02_ovl_76,0bf48b2997faa1392615d12db5bf514ee0ea9fcceb3553...,fourties,male,common_voice_th_26409240,0.756
33483,cv02_ovl_71,dc44dca3259e3633773c05ba48b9a1556e6cecca7f30ae...,teens,male,common_voice_th_28970393,4.688
28941,cv02_ovl_61,2d7fbf12437f9aa31fe80c56d97b268851db90e2b75be1...,teens,male,common_voice_th_29165038,2.76
