In [1]:
import sys
sys.path.append("..")
from utils.audio_utils import get_total_audio_duration,get_audio_info,get_duration_ffprobe
from pathlib import Path
from tqdm import tqdm
from utils.file_utils import save_json
from utils.transcription_utils import clean_and_verify_transcript_hi
from collections import Counter
from utils.conversions import sec_to_time
import csv

In [2]:
save_path=Path("/media/dataset-harddisk/munikumar/utils/audio_utils/data/fleurs")

In [3]:
train_dir=Path("/media/dataset-harddisk/munikumar/hindi_dataset/fleurs/fleurs_raw/audio/train")
dev_dir=Path("/media/dataset-harddisk/munikumar/hindi_dataset/fleurs/fleurs_raw/audio/dev")
test_dir=Path("/media/dataset-harddisk/munikumar/hindi_dataset/fleurs/fleurs_raw/audio/test")


In [4]:
def save_duration_data(audio_path):
    audio_data=[]
    all_audio=list(audio_path.glob("*"))
    min_duration=1e9
    max_duration=-1e9
    total_duration=0
    audio_file_duration={}
    print(f"getting audio info about {audio_path}")
    for audio in tqdm(all_audio):
        data=get_audio_info(str(audio))
        audio_data.append(data)
        duration=float(data["duration"])
        max_duration=max(duration,max_duration)
        min_duration=min(duration,min_duration)
        total_duration+=duration
        audio_file_duration[audio.name]=duration
    audio_file_duration["max_duration"]=max_duration
    audio_file_duration['min_duration']=min_duration
    audio_file_duration['total_duration']=total_duration
    save_json(audio_data,save_path=save_path/f"fleurs_{audio_path.name}_info.json")
    save_json(audio_file_duration,save_path=save_path/f"fleurs_{audio_path.name}_duration.json")
    

In [5]:
save_duration_data(train_dir)
save_duration_data(dev_dir)
save_duration_data(test_dir)


getting audio info about /media/dataset-harddisk/munikumar/hindi_dataset/fleurs/fleurs_raw/audio/train


100%|██████████| 2120/2120 [03:10<00:00, 11.12it/s]


getting audio info about /media/dataset-harddisk/munikumar/hindi_dataset/fleurs/fleurs_raw/audio/dev


100%|██████████| 239/239 [00:57<00:00,  4.14it/s]


getting audio info about /media/dataset-harddisk/munikumar/hindi_dataset/fleurs/fleurs_raw/audio/test


100%|██████████| 418/418 [00:44<00:00,  9.42it/s]


In [6]:
def get_transcription_info(transcription_path,dataset_path):
    non_hindi_chars = []
    non_hindi_transcriptions = {}
    total_duration=0
    file_count=0
    with open(transcription_path, "r", encoding="utf-8") as fp:
        reader=csv.reader(fp,delimiter='\t')
        
        transcription_data = {
            transcription[1]:transcription[3]
            for transcription in reader
        }
    for audio_file, transcription in tqdm(transcription_data.items()):
        clean_transcription, non_hindi_tokens = clean_and_verify_transcript_hi(transcription)
        if len(non_hindi_tokens) != 0:
            file_count+=1
            duration=get_duration_ffprobe(dataset_path/audio_file)
            non_hindi_transcriptions[audio_file] = {
                "raw_transcription": transcription,
                "clean_transcription": clean_transcription,
                "duration":duration,
                "non_hindi_chars":non_hindi_tokens
            }
            total_duration+=duration
            
            non_hindi_chars.extend(non_hindi_tokens)
    print(non_hindi_chars)
    non_hindi_char_freq=dict(Counter(non_hindi_chars))
    non_hindi_transcriptions["total_audio_duration"]=total_duration
    non_hindi_transcriptions["file_count"]=file_count
    print("file_count",file_count)
    save_json(non_hindi_transcriptions,save_path=save_path/f"fleurs_{transcription_path.stem}_non_hindi_transcripts.json",ensure_ascii=False)
    save_json(non_hindi_char_freq,save_path=save_path/f"fleurs_{transcription_path.stem}_non_hindi_tokens.json",ensure_ascii=False)
    
    

In [7]:
get_transcription_info(train_dir.parent.parent/"train.tsv",train_dir)
get_transcription_info(dev_dir.parent.parent/"dev.tsv",dev_dir)
get_transcription_info(test_dir.parent.parent/"test.tsv",test_dir)

100%|██████████| 2120/2120 [00:38<00:00, 55.20it/s]


['2', '0', '0', '0', '1', '8', '8', '4', '-', '।', 'a', 'f', 'c', 'f', 't', 'a', '3', '0', '-', '2', '0', '0', '9', '6', '4', '5', '1', '0', '4', '0', '6', ',', '5', '0', '0', 'u', 's', 'a', 'f', '.', '-', '-', '-', '।', '-', '-', '-', '-', '8', '0', '%', ',', '।', '5', '5', ',', '0', '0', '0', '2', '3', '।', '-', '4', 'l', 'a', 't', 'a', 'm', 'o', 'n', 'e', 'w', 'o', 'r', 'l', 'd', '1', '0', '0', '1', '0', '0', '3', '0', '6', '4', '5', '1', '0', '4', '0', '1', '0', '0', '1', '7', ',', '0', '0', '0', '.', '1', '6', '0', '/', 'd', 'n', 'a', ',', '।', '.', '-', '2', '0', '0', '-', '3', '5', '2', '4', '3', '6', '1', '2', '0', '-', '1', '6', '0', '-', '2', '0', '1', '0', '+', '3', '0', '°', 'c', '5', '4', '1', '1', ':', '2', '9', '-', '-', '-', '1', '2', '0', '0', '6', '.', '1', '0', '0', '0', '.', '.', '-', '2', '0', '1', '4', '8', '0', '2', '5', '-', 'p', 'b', 's', '2', '4', '2', '0', '2', '0', '0', '9', '9', '0', '1', '9', '9', '3', '-', '4', '6', '-', '1', '1', ':', '2', '9', '-', '-',

100%|██████████| 239/239 [00:02<00:00, 91.08it/s] 


['2', '0', '1', '0', '1', ',', '4', '0', '0', '2', '0', '0', '8', '8', '-', '-', '-', '-', 'n', 'u', 'm', 'b', 'e', 'r', 'o', 'n', 'e', 'm', 'a', 'n', 't', 'a', 'n', 'u', 'm', 'b', 'e', 'r', 'o', 'n', 'e', 'm', 'a', 'n', 't', 'a', '2', '1', '2', '0', '1', '5', '2', '1', '2', '0', '1', '5', '5', '6', '-', '1', '0', '8', '-', '-', 'a', 'p', 's', '-', '4', '1', '1', '1', '2', '7', '0', '4', '8', '9', '2', '4', '0', '-', '-', '-', '-', '-', '1', '8', '-', '-', '-', '-', '-', '-', '-', '/', '।', '/', '।', '/', '।', '-', '1', '9', ',', '5', '0', '0', '1', '4', '-', '-', '-', '-', '/', '-', '।', '।', '”', '”', '.', '1', '9', '9', '5', '”', '”', '.', '1', '9', '9', '5', '।', '।', '1', '9', '7', '8', '1', '9', '7', '0', '-', '1', '9', '7', '9', '1', '9', '7', '9', '।', '-', '-', '-', 'n', 'p', 'w', 's', 'n', 's', 'w', 'n', 'p', 'w', 's', 'n', 's', 'w', 'n', 'p', 'w', 's', 'n', 's', 'w', '-', '-', '-', '2', '0', '0', '0', '1', '0', '0', '0', '-', '-', '।', '।', '।', '1', '7', '6', '7', '1', '7',

100%|██████████| 418/418 [00:05<00:00, 74.08it/s] 

['।', '।', '!', '!', '5', '3', '5', '3', '-', '-', '-', '8', '0', '0', ',', '0', '0', '0', '-', '-', '3', '5', 'm', 'm', '8', '0', '5', '0', '।', '-', '-', '-', '.', '.', '6', ',', '”', '।', '1', '8', '8', '9', '7', '0', '7', '0', '1', '9', '7', '6', '1', '9', '7', '6', '-', '6', '.', '5', 'm', 's', 'm', 's', '-', '3', '3', '0', ',', '0', '0', '0', '6', ',', '0', '0', '0', '-', '3', '3', '0', ',', '0', '0', '0', '6', ',', '0', '0', '0', '-', '-', '-', '-', '2', '0', '0', '9', '-', '-', 'm', '1', '6', '-', '-', '2', '9', '1', '5', '-', '-', '2', '5', '-', '3', '0', '-', '2', '5', '-', '3', '0', 'p', 'h', '।', '।', '-', '/', '0', '6', ':', '3', '0', '0', '7', ':', '3', '0', '-', '-', '-', '-', '-', '-', '-', '-', '-', '1', '1', ':', '3', '5', '1', '1', ':', '3', '5', '-', '-', '.', '5', '0', '0', '-', '-', '.', '5', '0', '0', '-', '।', '”', '”', '।', '8', '0', '0', '8', '0', '0', '7', '0', '/', '.', '.', '.', '।', 't', 'o', 'g', 'i', 'n', 'e', 't', 't', 'o', 'g', 'i', 'n', 'e', 't', 't',




In [8]:
from utils.conversions import sec_to_time
def get_duration_loss(total_duration,loss_duration):
    print("format hours, minutes, seconds")
    print(f"total duration : {sec_to_time(total_duration)}")
    print(f"loss_duration : {sec_to_time(loss_duration)}")
    print(f"Remaining duration : {sec_to_time(total_duration-loss_duration)}")

Final dev duration data

In [9]:
get_duration_loss(2565,852)

format hours, minutes, seconds
total duration : (0, 42, 45)
loss_duration : (0, 14, 12)
Remaining duration : (0, 28, 33)


**Final test duration**

In [10]:
get_duration_loss(4832,1659)


format hours, minutes, seconds
total duration : (1, 20, 32)
loss_duration : (0, 27, 39)
Remaining duration : (0, 52, 53)


**Train duration**

In [11]:
get_duration_loss(23960,9237)

format hours, minutes, seconds
total duration : (6, 39, 20)
loss_duration : (2, 33, 57)
Remaining duration : (4, 5, 23)
