In [1]:
# Imports
import os
import pandas as pd
from tqdm import tqdm
import shutil
import random

In [2]:
# CONFIGURATION
# Language codes to process; each one is used as a sub-directory name under
# DATA_BASE (input) and OUTPUT_BASE (output).
LANGUAGES = ['en', 'hi']
# Root of the raw data: <DATA_BASE>/<lang>/train.tsv plus the audio folder below.
DATA_BASE = '../../data/asr'
# Root for the generated per-language split CSVs and the overview stats CSV.
OUTPUT_BASE = '../../data/asr_processed'
# Name of the sub-folder (inside each language directory) that holds the mp3 clips.
AUDIO_DIR = 'train'
# When True, the Hindi *train* split is up-sampled (with replacement) before saving.
AUGMENT_HINDI = True
# Multiplier applied to the Hindi train split size when AUGMENT_HINDI is on.
# NOTE(review): intended to balance Hindi vs. English, but in the recorded run
# the Hindi train split (2023 rows) x 9 = 18207 rows versus 9004 English train
# rows, i.e. Hindi ends up roughly twice as large. The factor looks like it was
# chosen before English was capped at 10,000 rows -- TODO confirm and re-tune.
HINDI_UPSAMPLE_FACTOR = 9  # To balance Hindi vs. English

# Create the output root up front; exist_ok makes re-running this cell harmless.
os.makedirs(OUTPUT_BASE, exist_ok=True)

In [3]:
# UTILITY FUNCTIONS
def list_audio_files(audio_folder):
    """Collect the names of the mp3 files found directly inside a folder.

    Args:
        audio_folder: Path of the directory to scan (not recursive).

    Returns:
        A set of bare file names (no directory part) whose name ends with
        the lowercase suffix '.mp3'. The match is case-sensitive.

    Raises:
        OSError: propagated from os.listdir when the folder cannot be read.
    """
    mp3_names = set()
    for entry in os.listdir(audio_folder):
        if entry.endswith('.mp3'):
            mp3_names.add(entry)
    return mp3_names

def normalize_text(text, lang):
    """Apply the baseline transcript clean-up.

    The value is converted to a string, surrounding whitespace is removed and
    the result is lower-cased.

    Args:
        text: Transcript value; anything accepted by str().
        lang: Language code. Currently unused; kept so language-specific
            rules can be added without changing callers.

    Returns:
        The normalized string.
    """
    as_string = str(text)
    trimmed = as_string.strip()
    return trimmed.lower()

def filter_tsv(tsv_path, audio_folder, lang):
    """Load a transcript TSV and keep only rows usable for training.

    A row is kept when its 'path' names an mp3 present in `audio_folder` and
    its 'sentence' is a non-blank string. Sentences are then normalized with
    normalize_text and exact (sentence, path) duplicates are dropped.

    Args:
        tsv_path: Path to a tab-separated file with at least the columns
            'path' and 'sentence'.
        audio_folder: Directory that holds the audio clips.
        lang: Language code; used for the progress-bar label and forwarded
            to normalize_text.

    Returns:
        A new DataFrame with a fresh 0..n-1 index containing the surviving
        rows (all original columns, 'sentence' normalized).
    """
    df = pd.read_csv(tsv_path, sep='\t')

    # Only mp3s present in subset
    audio_files = list_audio_files(audio_folder)
    has_audio = df['path'].isin(audio_files)

    # Remove missing/empty text. NaN cells are floats, so the isinstance check
    # rejects them; astype(bool) keeps the mask boolean even for an empty frame.
    has_text = df['sentence'].apply(
        lambda x: isinstance(x, str) and len(x.strip()) > 0
    ).astype(bool)

    # Explicit copy: the column assignment below must write to an independent
    # frame, not to a slice of the frame returned by read_csv (which would
    # raise SettingWithCopyWarning and is ambiguous in pandas).
    df = df[has_audio & has_text].copy()

    # Basic normalizations
    tqdm.pandas(desc=f'Normalizing text ({lang})')
    df['sentence'] = df['sentence'].progress_apply(lambda x: normalize_text(x, lang))

    # Remove duplicates (judged after normalization, so case-only variants of
    # the same clip/sentence pair collapse into one row).
    df = df.drop_duplicates(subset=['sentence', 'path'])
    df = df.reset_index(drop=True)
    return df

def speaker_split(df, val_pct=0.05, test_pct=0.05, seed=42):
    """Assign every row to train/val/test so that no speaker spans two splits.

    The unique 'client_id' values are shuffled and the first `val_pct` of the
    speakers go to 'val', the next `test_pct` go to 'test', and the remainder
    go to 'train'. The percentages apply to the number of *speakers*, not the
    number of clips, so the resulting row proportions can differ a lot when
    speakers contributed unequal numbers of clips.

    Args:
        df: DataFrame with a 'client_id' column.
        val_pct: Fraction of speakers assigned to the validation split.
        test_pct: Fraction of speakers assigned to the test split.
        seed: Seed for the speaker shuffle. The default makes the split
            reproducible across kernel restarts; pass None for a
            non-deterministic shuffle.

    Returns:
        A new DataFrame equal to `df` plus a 'split' column holding
        'train', 'val' or 'test'. The input frame is not modified.

    Note:
        val and test each receive at least one speaker whenever speakers are
        available, so with fewer than three speakers the train split is empty.
    """
    speakers = df['client_id'].unique().tolist()
    # Private RNG instance: reproducible and independent of whatever other
    # code has done to the global `random` state.
    random.Random(seed).shuffle(speakers)

    n = len(speakers)
    n_val = max(int(val_pct * n), 1)
    n_test = max(int(test_pct * n), 1)
    val_ids = set(speakers[:n_val])
    test_ids = set(speakers[n_val:n_val + n_test])

    def get_split(speaker):
        if speaker in test_ids:
            return 'test'
        elif speaker in val_ids:
            return 'val'
        else:
            return 'train'

    # assign() builds a new frame, so the caller's DataFrame (which may itself
    # be a slice of another frame) is never written to.
    return df.assign(split=df['client_id'].apply(get_split))

def upsample_hindi(df, factor):
    """Enlarge the train split by random duplication, leaving other splits alone.

    Rows whose 'split' is 'train' are drawn with replacement (fixed
    random_state=42) until there are `factor` times as many as before; rows
    of every other split are appended unchanged after them.

    Args:
        df: DataFrame with a 'split' column.
        factor: Multiplier for the number of train rows.

    Returns:
        A new DataFrame with a fresh 0..n-1 index: the resampled train rows
        first, followed by the untouched non-train rows.
    """
    in_train = df['split'] == 'train'
    outside_train = df['split'] != 'train'

    # Number of draws is relative to the current train size.
    n_draws = len(df[in_train]) * factor
    resampled_train = (
        df[in_train]
        .sample(n=n_draws, replace=True, random_state=42)
        .reset_index(drop=True)
    )
    return pd.concat([resampled_train, df[outside_train]], ignore_index=True)

In [4]:
# MAIN WORKFLOW
# For each language: filter the raw TSV, split by speaker, record split sizes,
# optionally up-sample the Hindi train split, and write one CSV per split.

# Upper bound on the number of English rows kept after filtering.
EN_MAX_ROWS = 10000

# One dict per language with the split sizes *before* any up-sampling.
processed_stats = []

for lang in LANGUAGES:
    print(f"\nProcessing language: {lang}")
    lang_input_dir = os.path.join(DATA_BASE, lang)
    lang_audio_dir = os.path.join(lang_input_dir, AUDIO_DIR)
    tsv_path = os.path.join(lang_input_dir, 'train.tsv')

    print("Filtering available audio/text pairs...")
    df = filter_tsv(tsv_path, lang_audio_dir, lang)

    # Limit English data to EN_MAX_ROWS rows (after filtering). The min() guard
    # matters because DataFrame.sample raises ValueError when asked for more
    # rows than exist (without replacement), e.g. on a smaller data subset.
    if lang == 'en':
        df = df.sample(n=min(EN_MAX_ROWS, len(df)), random_state=42).reset_index(drop=True)
        print(f"✂️ Downsampled English dataset to {len(df)} examples")

    print("Splitting dataset by speakers...")
    df = speaker_split(df)

    # Save initial stats (taken before up-sampling, so they describe the
    # number of distinct clips per split, not the number of rows written).
    counts = df['split'].value_counts().to_dict()
    counts['lang'] = lang
    counts['total_clips'] = len(df)
    processed_stats.append(counts)

    # Upsample Hindi training set to balance it
    if lang == 'hi' and AUGMENT_HINDI:
        print(f"Upsampling Hindi train set by factor {HINDI_UPSAMPLE_FACTOR}...")
        df = upsample_hindi(df, HINDI_UPSAMPLE_FACTOR)

    # Save metadata for each split (only the CSV metadata is written here;
    # no audio files are copied).
    outdir = os.path.join(OUTPUT_BASE, lang)
    os.makedirs(outdir, exist_ok=True)
    for split in ['train', 'val', 'test']:
        split_df = df[df['split'] == split]
        outfile = os.path.join(outdir, f'{split}.csv')
        split_df.to_csv(outfile, index=False)
        print(f"Saved {split} set: {len(split_df)} rows to {outfile}")


Processing language: en
Filtering available audio/text pairs...


Normalizing text (en): 100%|█████████████████████████| 40000/40000 [00:00<00:00, 1173182.66it/s]


✂️ Downsampled English dataset to 10000 examples
Splitting dataset by speakers...
Saved train set: 9004 rows to ../../data/asr_processed/en/train.csv
Saved val set: 493 rows to ../../data/asr_processed/en/val.csv
Saved test set: 503 rows to ../../data/asr_processed/en/test.csv

Processing language: hi
Filtering available audio/text pairs...


Normalizing text (hi): 100%|████████████████████████████| 4479/4479 [00:00<00:00, 892502.62it/s]


Splitting dataset by speakers...
Upsampling Hindi train set by factor 9...
Saved train set: 18207 rows to ../../data/asr_processed/hi/train.csv
Saved val set: 942 rows to ../../data/asr_processed/hi/val.csv
Saved test set: 1514 rows to ../../data/asr_processed/hi/test.csv


In [5]:
# SUMMARY STATUS
# processed_stats is collected in the main loop before the Hindi up-sampling
# step, so these numbers are distinct-clip counts per split; the train CSV
# written for an up-sampled language contains more rows than shown here.
print("\nDataset statistics (per-split clip counts, before any upsampling):")
final_stats = pd.DataFrame(processed_stats)
display(final_stats)

# Save overview stats
final_stats.to_csv(os.path.join(OUTPUT_BASE, 'asr_data_stats.csv'), index=False)

print("\nData preparation and split complete!")


Final dataset statistics (after processing):


Unnamed: 0,train,test,val,lang,total_clips
0,9004,503,493,en,10000
1,2023,1514,942,hi,4479



Data preparation and split complete!
