In [1]:
import sys
sys.path.append("..")
from utils.audio_utils import get_total_audio_duration,get_audio_info,get_duration_ffprobe
from pathlib import Path
from tqdm import tqdm
from utils.file_utils import save_json
from utils.transcription_utils import clean_and_verify_transcript_hi
from collections import Counter
from utils.conversions import sec_to_time

In [2]:
save_path=Path("/media/dataset-harddisk/munikumar/utils/audio_utils/data/mucs")

In [3]:
dev_dir=Path("/media/dataset-harddisk/munikumar/hindi_dataset/mucs/mucs_raw/test")
train_dir=Path("/media/dataset-harddisk/munikumar/hindi_dataset/mucs/mucs_raw/train")
test_dir=Path("/media/dataset-harddisk/munikumar/hindi_dataset/mucs/mucs_raw/subtask1_blindtest_wReadme/Hindi")


In [4]:
def save_duration_data(dataset_path):
    audio_path=dataset_path/"audio"
    audio_data=[]
    all_audio=list(audio_path.glob("*"))
    
    print("getting audio duration ...")
    duration=get_total_audio_duration(all_audio)
    save_json(duration,save_path=save_path/f"mucs_{audio_path.parent.name}_duration.json")
    
    print("getting audio info ...")
    for audio in tqdm(all_audio):
        data=get_audio_info(str(audio))
        audio_data.append(data)
    save_json(audio_data,save_path=save_path/f"mucs_{audio_path.parent.name}_info.json")
    

In [5]:
save_duration_data(dev_dir)

getting audio duration ...


100%|██████████| 3843/3843 [01:41<00:00, 37.92it/s]


getting audio info ...


100%|██████████| 3843/3843 [04:51<00:00, 13.20it/s]


In [6]:
save_duration_data(train_dir)


getting audio duration ...


100%|██████████| 99925/99925 [53:28<00:00, 31.15it/s]  


getting audio info ...


100%|██████████| 99925/99925 [1:55:50<00:00, 14.38it/s]  


In [7]:
save_duration_data(test_dir)

getting audio duration ...


100%|██████████| 3897/3897 [00:24<00:00, 162.16it/s]


getting audio info ...


100%|██████████| 3897/3897 [03:34<00:00, 18.14it/s]


In [8]:
def get_transcription_info(dataset_path):
    non_hindi_chars = []
    non_hindi_transcriptions = {}
    total_duration=0
    transcription_path = dataset_path / "transcription.txt"
    with open(transcription_path, "r", encoding="utf-8") as fp:
        data = fp.read().splitlines()
        transcription_data = {
            transcription.split(" ")[0] + ".wav": " ".join(transcription.split(" ")[1:])
            for transcription in data
        }
    for audio_file, transcription in transcription_data.items():
        clean_transcription, non_hindi_tokens = clean_and_verify_transcript_hi(transcription)
        if len(non_hindi_tokens) != 0:
            duration=get_duration_ffprobe(dataset_path/"audio"/audio_file)
            non_hindi_transcriptions[audio_file] = {
                "raw_transcription": transcription,
                "clean_transcription": clean_transcription,
                "duration":duration
            }
            total_duration+=duration
            
            non_hindi_chars.extend(non_hindi_tokens)
    print(non_hindi_chars)
    non_hindi_char_freq=dict(Counter(non_hindi_chars))
    non_hindi_transcriptions["total_audio_duration"]=total_duration
    save_json(non_hindi_transcriptions,save_path=save_path/f"mucs_{dataset_path.name}_non_hindi_transcripts.json",ensure_ascii=False)
    save_json(non_hindi_char_freq,save_path=save_path/f"mucs_{dataset_path.name}_non_hindi_tokens.json",ensure_ascii=False)
    
    

In [9]:
get_transcription_info(train_dir)

['ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ', 'ॠ',

In [10]:
get_transcription_info(dev_dir)

['ऍ', 'ऍ', 'ऍ', 'ऍ', 'ऍ', 'ऍ', 'ऍ', 'ऍ', 'ऍ', 'ऍ']


In [11]:
get_transcription_info(test_dir)

[]


In [12]:
sec_to_time(1106.4800000000005)

(0.0, 18.0, 26.480000000000473)

## Raw audio duration

train_raw audio duration : 95:03:00  
dev_raw audio duration: 5:33:25  
test_raw audio duration: 5:29:26  
total_duration: 106:05:51

## Total number of audio files

train_audio_files: 99925  
dev_audio_files: 3843  
test_audio_files: 3897  
total_audio files: 107665


In [1]:
from datasets import load_dataset
fleurs = load_dataset("google/fleurs", "hi_in", split="train")


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
print(fleurs)

Dataset({
    features: ['id', 'num_samples', 'path', 'audio', 'transcription', 'raw_transcription', 'gender', 'lang_id', 'language', 'lang_group_id'],
    num_rows: 2120
})
