### Common Voice 22.0 Dataset Preparation

This notebook downloads, preprocesses, and saves the Common Voice dataset in HuggingFace Dataset format.

**Dataset:** `fsicoli/common_voice_22_0`  
**Languages:** Danish (da), English (en), Dutch (nl)

**Preprocessing:**
- Filter by duration: min 0.5s, max 30s
- Lowercase transcription
- Remove punctuation (apostrophes inside words, e.g. "don't", are kept)
- Save both `raw_transcription` and `transcription` (preprocessed)

**Output:** HuggingFace Dataset with pre-computed audio arrays (no audio files need to be decoded or resampled during training)

#### Step 1: Import Dependencies

In [1]:
import os
import re
import json
import tarfile
from pathlib import Path
from tqdm.auto import tqdm
import csv

from huggingface_hub import hf_hub_download, list_repo_files
from datasets import Dataset, Audio, Features, Value
import soundfile as sf
import numpy as np

print("All imports successful!")

  from .autonotebook import tqdm as notebook_tqdm


All imports successful!


#### Step 2: Configuration

In [2]:
# --- Source -------------------------------------------------------------
# Hugging Face Hub dataset repository the transcripts and audio come from
DATASET_REPO = "fsicoli/common_voice_22_0"
# Language code to prepare: "da" (Danish), "en" (English) or "nl" (Dutch)
LANGUAGE = "da"
# Split names as used in the repository; "dev" is stored as "validation" later
SPLITS = ["train", "dev", "test"]

# --- Output -------------------------------------------------------------
OUTPUT_DIR = Path("../data/cv22_hf")
# Target sample rate that every clip is resampled to
SAMPLE_RATE = 16000

# --- Duration filter (seconds) ------------------------------------------
# Lower bound drops very short/corrupted samples
MIN_DURATION = 0.5
# Upper bound is Whisper's maximum input length
MAX_DURATION = 30.0

# Echo the effective settings so they are recorded in the notebook output
print(
    f"Dataset: {DATASET_REPO}",
    f"Language: {LANGUAGE}",
    f"Output: {OUTPUT_DIR}",
    f"Duration filter: [{MIN_DURATION}s, {MAX_DURATION}s]",
    sep="\n",
)

Dataset: fsicoli/common_voice_22_0
Language: da
Output: ..\data\cv22_hf
Duration filter: [0.5s, 30.0s]


#### Step 3: Define Preprocessing Function

In [3]:
def preprocess_transcription(text: str) -> str:
    """
    Normalise a transcription for training/evaluation.

    Steps, in order:
    - Convert to lowercase
    - Map typographic apostrophes (U+2018, U+2019) to the ASCII apostrophe
    - Replace punctuation, symbols and underscores with spaces
    - Drop apostrophes that are not between two word characters
      (so "don't" is kept, quoting apostrophes are removed)
    - Collapse whitespace runs and strip both ends

    Args:
        text: Raw transcription. ``None`` and "" are accepted.

    Returns:
        The normalised text, or "" when there is nothing to normalise.
    """
    if not text:
        return ""

    text = text.lower()

    # Curly apostrophes would otherwise be treated as punctuation and split
    # contractions ("don’t" -> "don t"), unlike their ASCII spelling.
    text = text.replace("\u2019", "'").replace("\u2018", "'")

    # \w also matches "_", so underscores must be removed explicitly.
    text = re.sub(r"[^\w\s']|_", " ", text)

    # Remove apostrophes that are not surrounded by word characters
    text = re.sub(r"(?<!\w)'|'(?!\w)", " ", text)

    # Collapse multiple spaces and strip
    return re.sub(r"\s+", " ", text).strip()

# Smoke-check the normaliser on a few sentences with punctuation,
# a contraction and symbols/digits
test_examples = [
    "Hej, hvordan har du det?",
    "Det er en god dag!",
    "Don't stop the music.",
    "Prijs: €50,00 (incl. BTW)",
]

# Render every "before -> after" line first, then emit them in one call
example_lines = [
    f"  '{ex}' -> '{preprocess_transcription(ex)}'" for ex in test_examples
]
print("Preprocessing examples:", *example_lines, sep="\n")

Preprocessing examples:
  'Hej, hvordan har du det?' -> 'hej hvordan har du det'
  'Det er en god dag!' -> 'det er en god dag'
  'Don't stop the music.' -> 'don't stop the music'
  'Prijs: €50,00 (incl. BTW)' -> 'prijs 50 00 incl btw'


#### Step 4: Download Functions

In [4]:
def download_transcript(language: str, split: str) -> list:
    """Download and parse the transcript TSV file of one split.

    Args:
        language: Language code used in the repository path (e.g. "da").
        split: Split name used in the repository path (e.g. "train").

    Returns:
        One dict per TSV row, keyed by the header names of the file.
    """
    filename = f"transcript/{language}/{split}.tsv"
    print(f"Downloading: {filename}")

    tsv_path = hf_hub_download(
        repo_id=DATASET_REPO,
        filename=filename,
        repo_type="dataset",
    )

    # QUOTE_NONE: sentences may contain literal double quotes; with the default
    # quoting a '"' at the start of a field makes the reader swallow tabs and
    # newlines until the next quote, silently merging rows.
    # newline="": recommended by the csv module so it handles line endings itself.
    with open(tsv_path, "r", encoding="utf-8", newline="") as f:
        reader = csv.DictReader(f, delimiter="\t", quoting=csv.QUOTE_NONE)
        samples = [dict(row) for row in reader]

    print(f"  Loaded {len(samples)} transcript entries")
    return samples


def download_and_extract_audio(language: str, split: str, output_dir: Path) -> Path:
    """Download and extract every audio tar archive of one split.

    Archives live under ``audio/{lang}/{split}/`` in the repository. The
    original code fetched only ``{lang}_{split}_0.tar``; the ``_0`` suffix
    suggests larger splits may be spread over several archives, so all
    ``.tar`` files under that prefix are fetched. If the repository listing
    is unavailable or empty, the single ``_0`` archive is used as before.

    Args:
        language: Language code used in the repository path (e.g. "da").
        split: Split name used in the repository path (e.g. "train").
        output_dir: Root directory; files go to ``output_dir/audio/{split}``.

    Returns:
        Directory containing the extracted audio files (flattened, no sub-folders).
    """
    shard_prefix = f"audio/{language}/{split}/"
    default_shard = f"{shard_prefix}{language}_{split}_0.tar"

    try:
        shard_files = sorted(
            name
            for name in list_repo_files(DATASET_REPO, repo_type="dataset")
            if name.startswith(shard_prefix) and name.endswith(".tar")
        )
    except Exception as exc:
        # Deliberate best effort: the listing needs network access, while the
        # download below may still be served from the local cache.
        print(f"  Could not list repository files ({exc}); assuming a single archive")
        shard_files = []
    if not shard_files:
        shard_files = [default_shard]

    # Extract
    audio_dir = output_dir / "audio" / split
    audio_dir.mkdir(parents=True, exist_ok=True)

    # Use the "data" extraction filter where this Python supports it
    # (older interpreters do not accept the keyword).
    extract_kwargs = {"filter": "data"} if hasattr(tarfile, "data_filter") else {}

    for filename in shard_files:
        print(f"Downloading: {filename}")
        tar_path = hf_hub_download(
            repo_id=DATASET_REPO,
            filename=filename,
            repo_type="dataset",
        )

        print(f"  Extracting to: {audio_dir}")
        with tarfile.open(tar_path, "r") as tar:
            for member in tqdm(tar.getmembers(), desc="Extracting"):
                # Regular files only: skips directories, links and devices
                if not member.isfile():
                    continue
                flat_name = os.path.basename(member.name)
                if not flat_name:
                    continue
                # Drop any directory component so every clip lands directly in
                # audio_dir (this also keeps archive paths from escaping it)
                member.name = flat_name
                tar.extract(member, path=audio_dir, **extract_kwargs)

    return audio_dir

print("Download functions defined!")

Download functions defined!


#### Step 5: Process and Create HuggingFace Dataset

In [5]:
def process_split(language: str, split: str, output_dir: Path) -> Dataset:
    """
    Download, preprocess, and create HuggingFace Dataset for a split.

    Rows are dropped when the audio file is missing or unreadable, the
    sentence is empty, or the duration is outside
    [MIN_DURATION, MAX_DURATION]. Per-reason counts are printed.

    Args:
        language: Language code used in the repository paths (e.g. "da").
        split: Split name used in the repository paths (e.g. "train").
        output_dir: Root directory the audio archives are extracted into.

    Returns Dataset with columns:
    - audio_array: Mono waveform as float32 samples, resampled to SAMPLE_RATE
    - sampling_rate: SAMPLE_RATE
    - raw_transcription: Original transcription
    - transcription: Preprocessed transcription (see preprocess_transcription)
    - duration: Audio duration in seconds, rounded to 3 decimals
    - speaker_id: First 16 characters of the row's client_id
    """
    # Download transcript and audio
    transcript = download_transcript(language, split)
    audio_dir = download_and_extract_audio(language, split, output_dir)

    processed_samples = []
    stats = {"total": 0, "valid": 0, "too_short": 0, "too_long": 0, "missing": 0, "empty": 0}
    # Keep a few read failures so they are not swallowed without a trace
    max_reported_errors = 5
    error_examples = []

    print(f"\nProcessing {len(transcript)} samples...")

    for sample in tqdm(transcript, desc=f"Processing {split}"):
        stats["total"] += 1

        # `or ""` also covers fields that are None (short TSV rows)
        audio_filename = sample.get("path") or ""
        if not audio_filename:
            stats["missing"] += 1
            continue

        # Extraction flattens archive members to their base name; do the same here
        audio_path = audio_dir / Path(audio_filename).name
        if not audio_path.exists():
            stats["missing"] += 1
            continue

        # Get transcription
        raw_text = (sample.get("sentence") or "").strip()
        if not raw_text:
            stats["empty"] += 1
            continue

        # Load audio and get duration (computed at the file's native rate)
        try:
            audio_array, sr = sf.read(str(audio_path))
            duration = len(audio_array) / sr
        except Exception as e:
            # Unreadable files are counted together with missing ones
            stats["missing"] += 1
            if len(error_examples) < max_reported_errors:
                error_examples.append(f"{audio_filename}: {e}")
            continue

        # Filter by duration
        if duration < MIN_DURATION:
            stats["too_short"] += 1
            continue
        if duration > MAX_DURATION:
            stats["too_long"] += 1
            continue

        # soundfile returns (frames, channels) for multi-channel files; downmix
        # to mono so resampling runs along time and the column stays 1-D
        if audio_array.ndim > 1:
            audio_array = audio_array.mean(axis=1)

        # Resample if needed (MP3 files from Common Voice are usually 48kHz)
        if sr != SAMPLE_RATE:
            # Imported lazily: only required when a clip needs resampling
            import librosa
            audio_array = librosa.resample(audio_array, orig_sr=sr, target_sr=SAMPLE_RATE)

        # Preprocess transcription
        preprocessed_text = preprocess_transcription(raw_text)

        # Store the decoded samples so training needs no audio decoding
        processed_samples.append({
            "audio_array": audio_array.astype(np.float32),
            "sampling_rate": SAMPLE_RATE,
            "raw_transcription": raw_text,
            "transcription": preprocessed_text,
            "duration": round(duration, 3),
            "speaker_id": (sample.get("client_id") or "")[:16],
        })
        stats["valid"] += 1

    # Print stats
    print(f"\nStats for {split}:")
    print(f"  Total: {stats['total']}")
    print(f"  Valid: {stats['valid']}")
    print(f"  Too short (<{MIN_DURATION}s): {stats['too_short']}")
    print(f"  Too long (>{MAX_DURATION}s): {stats['too_long']}")
    print(f"  Missing/Error: {stats['missing']}")
    print(f"  Empty text: {stats['empty']}")
    if error_examples:
        print("  First read errors:")
        for message in error_examples:
            print(f"    {message}")

    # Create HuggingFace Dataset - audio arrays are stored directly, no decoding needed
    dataset = Dataset.from_list(processed_samples)

    total_hours = sum(s["duration"] for s in processed_samples) / 3600
    print(f"  Total duration: {total_hours:.2f} hours")

    return dataset

print("Process function defined!")

Process function defined!


#### Step 6: Download and Process Danish Dataset

This will:
1. Download transcript TSV files
2. Download and extract audio tar files
3. Load audio arrays, filter by duration, preprocess transcriptions
4. Create HuggingFace Dataset objects

**Note:** First run may take 10-30 minutes depending on your internet speed.

In [6]:
# Make sure the output root exists before any split writes into it
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Final split name -> processed Dataset
datasets_dict = {}

for split in SPLITS:
    # Banner so each split's log section is easy to find in the output
    print(f"\n{'='*60}", f"Processing: {split}", '='*60, sep="\n")

    dataset = process_split(LANGUAGE, split, OUTPUT_DIR)

    # The source split "dev" is stored under the name "validation"
    split_name = {"dev": "validation"}.get(split, split)
    datasets_dict[split_name] = dataset

print(f"\n{'='*60}", "All splits processed!", '='*60, sep="\n")
for name, ds in datasets_dict.items():
    print(f"  {name}: {len(ds)} samples")


Processing: train
Downloading: transcript/da/train.tsv
  Loaded 3592 transcript entries
Downloading: audio/da/train/da_train_0.tar
  Extracting to: ..\data\cv22_hf\audio\train


Extracting: 100%|██████████| 3603/3603 [00:19<00:00, 182.42it/s]



Processing 3592 samples...


Processing train: 100%|██████████| 3592/3592 [00:21<00:00, 168.61it/s]



Stats for train:
  Total: 3592
  Valid: 3592
  Too short (<0.5s): 0
  Too long (>30.0s): 0
  Missing/Error: 0
  Empty text: 0
  Total duration: 4.18 hours

Processing: dev
Downloading: transcript/da/dev.tsv
  Loaded 2511 transcript entries
Downloading: audio/da/dev/da_dev_0.tar
  Extracting to: ..\data\cv22_hf\audio\dev


Extracting: 100%|██████████| 2631/2631 [00:16<00:00, 160.56it/s]



Processing 2511 samples...


Processing dev: 100%|██████████| 2511/2511 [00:14<00:00, 173.46it/s]



Stats for dev:
  Total: 2511
  Valid: 2511
  Too short (<0.5s): 0
  Too long (>30.0s): 0
  Missing/Error: 0
  Empty text: 0
  Total duration: 3.21 hours

Processing: test
Downloading: transcript/da/test.tsv
  Loaded 2684 transcript entries
Downloading: audio/da/test/da_test_0.tar
  Extracting to: ..\data\cv22_hf\audio\test


Extracting: 100%|██████████| 2759/2759 [00:16<00:00, 163.43it/s]



Processing 2684 samples...


Processing test: 100%|██████████| 2684/2684 [00:16<00:00, 163.68it/s]



Stats for test:
  Total: 2684
  Valid: 2684
  Too short (<0.5s): 0
  Too long (>30.0s): 0
  Missing/Error: 0
  Empty text: 0
  Total duration: 3.45 hours

All splits processed!
  train: 3592 samples
  validation: 2511 samples
  test: 2684 samples


#### Step 7: Inspect the Dataset

In [9]:
# Look at the dataset structure
train_preview = datasets_dict["train"]
print("Dataset features:")
print(train_preview.features)
print()

# Look at a few examples; capped by the split size so a heavily filtered
# (or tiny) split does not raise IndexError
print("Sample examples from train set:")
for i in range(min(3, len(train_preview))):
    sample = train_preview[i]
    audio = np.array(sample['audio_array'])  # Convert list back to numpy array
    print(f"\nExample {i+1}:")
    print(f"  Duration: {sample['duration']:.2f}s")
    print(f"  Raw:      '{sample['raw_transcription']}'")
    print(f"  Cleaned:  '{sample['transcription']}'")
    print(f"  Audio array shape: {audio.shape}")
    print(f"  Sample rate: {sample['sampling_rate']}")

Dataset features:
{'audio_array': List(Value('float32')), 'sampling_rate': Value('int64'), 'raw_transcription': Value('string'), 'transcription': Value('string'), 'duration': Value('float64'), 'speaker_id': Value('string')}

Sample examples from train set:

Example 1:
  Duration: 2.81s
  Raw:      'Min fortræffelige lille nattergal!'
  Cleaned:  'min fortræffelige lille nattergal'
  Audio array shape: (44928,)
  Sample rate: 16000

Example 2:
  Duration: 2.88s
  Raw:      'Jeg venter grumme meget af den'
  Cleaned:  'jeg venter grumme meget af den'
  Audio array shape: (46080,)
  Sample rate: 16000

Example 3:
  Duration: 4.25s
  Raw:      'Men hendes vilje var fast, som hendes tillid til vorherre'
  Cleaned:  'men hendes vilje var fast som hendes tillid til vorherre'
  Audio array shape: (67968,)
  Sample rate: 16000


#### Step 8: Save as HuggingFace Dataset

This saves the dataset in Arrow format, which:
- Stores audio arrays directly (no need to reload from MP3 files)
- Fast loading during training
- Memory-mapped for efficient access

In [10]:
from datasets import DatasetDict

# Bundle the per-split datasets so they are saved and loaded as one unit
dataset_dict = DatasetDict(datasets_dict)

# Each language gets its own sub-directory under OUTPUT_DIR
save_path = OUTPUT_DIR / LANGUAGE
print(f"Saving dataset to: {save_path}")

dataset_dict.save_to_disk(str(save_path))

# Report the absolute location so the saved dataset is easy to find
print(
    f"\nDataset saved successfully!",
    f"Location: {save_path.absolute()}",
    sep="\n",
)

Saving dataset to: ..\data\cv22_hf\da


Saving the dataset (2/2 shards): 100%|██████████| 3592/3592 [00:01<00:00, 3069.75 examples/s]
Saving the dataset (2/2 shards): 100%|██████████| 2511/2511 [00:00<00:00, 2788.24 examples/s]
Saving the dataset (2/2 shards): 100%|██████████| 2684/2684 [00:00<00:00, 2763.94 examples/s]


Dataset saved successfully!
Location: p:\Programming\Efficient_ASR_LLM\notebooks\..\data\cv22_hf\da





#### Step 9: Load the Saved Dataset

This demonstrates how to load the dataset during training - audio arrays are already pre-computed!

In [11]:
# Load the saved dataset
from datasets import load_from_disk

# Read back from the same per-language directory the dataset was saved to
load_path = OUTPUT_DIR / LANGUAGE
loaded_dataset = load_from_disk(str(load_path))

print("Loaded dataset:", loaded_dataset, "", sep="\n")

# Spot-check the first training row
sample = loaded_dataset["train"][0]
audio = np.array(sample['audio_array'])  # the column comes back as a list
print(
    "Sample from loaded dataset:",
    f"  Duration: {sample['duration']:.2f}s",
    f"  Raw transcription: '{sample['raw_transcription']}'",
    f"  Transcription: '{sample['transcription']}'",
    f"  Audio array shape: {audio.shape}",
    f"  Audio sample rate: {sample['sampling_rate']}",
    sep="\n",
)

Loaded dataset:
DatasetDict({
    train: Dataset({
        features: ['audio_array', 'sampling_rate', 'raw_transcription', 'transcription', 'duration', 'speaker_id'],
        num_rows: 3592
    })
    validation: Dataset({
        features: ['audio_array', 'sampling_rate', 'raw_transcription', 'transcription', 'duration', 'speaker_id'],
        num_rows: 2511
    })
    test: Dataset({
        features: ['audio_array', 'sampling_rate', 'raw_transcription', 'transcription', 'duration', 'speaker_id'],
        num_rows: 2684
    })
})

Sample from loaded dataset:
  Duration: 2.81s
  Raw transcription: 'Min fortræffelige lille nattergal!'
  Transcription: 'min fortræffelige lille nattergal'
  Audio array shape: (44928,)
  Audio sample rate: 16000


#### Step 10: Usage Example for Training

Here's how to use the dataset in your training code:

In [12]:
import torch
from torch.utils.data import DataLoader

# Example: Create a simple collate function
def collate_fn(batch):
    """Collate dataset rows into a right-zero-padded audio batch.

    Args:
        batch: Sequence of rows, each providing "audio_array" (1-D samples),
            "transcription" and "duration".

    Returns:
        Dict with:
        - "audio": float32 tensor of shape (batch, max_len), zero-padded
        - "audio_lengths": int64 tensor of shape (batch,) holding each clip's
          unpadded sample count, so padding can be masked downstream
        - "transcription": list of the rows' transcriptions
        - "duration": list of the rows' durations
    """
    # as_tensor accepts lists, NumPy arrays and tensors alike; torch.tensor()
    # would warn and copy when a row is already a tensor.
    audio_arrays = [
        torch.as_tensor(item["audio_array"], dtype=torch.float32) for item in batch
    ]
    transcriptions = [item["transcription"] for item in batch]
    durations = [item["duration"] for item in batch]
    lengths = [a.shape[0] for a in audio_arrays]

    # Pad audio to same length
    max_len = max(lengths)
    padded_audio = torch.zeros(len(audio_arrays), max_len, dtype=torch.float32)
    for i, a in enumerate(audio_arrays):
        padded_audio[i, :a.shape[0]] = a

    return {
        "audio": padded_audio,
        "audio_lengths": torch.tensor(lengths, dtype=torch.long),
        "transcription": transcriptions,
        "duration": durations,
    }

# Wrap the train split in a DataLoader that pads each batch via collate_fn
train_loader = DataLoader(
    loaded_dataset["train"], batch_size=4, shuffle=True, collate_fn=collate_fn
)

# Pull a single batch to confirm the pipeline works end to end
batch = next(iter(train_loader))
print(
    "Batch contents:",
    f"  Audio shape: {batch['audio'].shape}",
    f"  Transcriptions: {batch['transcription']}",
    f"  Durations: {batch['duration']}",
    sep="\n",
)

Batch contents:
  Audio shape: torch.Size([4, 106560])
  Transcriptions: ['vær opmærksom på at midlerne til kultur hovedsageligt er rettet mod de store bycentre', 'det var for flasken som om den levede det om igen', 'jo jeg tror jeg tror sagde han velsignede hr', 'vær så god hr']
  Durations: [6.66, 4.608, 5.58, 1.908]


#### Summary

The dataset is now saved in HuggingFace format with:

| Column | Description |
|--------|-------------|
| `audio_array` | Pre-computed audio samples (float32, stored as a list; convert with `np.array(...)` or use `with_format("numpy")`) |
| `sampling_rate` | Audio sample rate (16000) |
| `raw_transcription` | Original transcription (with punctuation and casing) |
| `transcription` | Preprocessed transcription (lowercase, no punctuation) |
| `duration` | Audio duration in seconds |
| `speaker_id` | Anonymized speaker ID (first 16 characters of `client_id`) |

**Benefits:**
- ✅ Audio arrays are pre-computed and stored (no MP3 decoding during training)
- ✅ Fast loading with memory-mapped Arrow files
- ✅ Both raw and preprocessed transcriptions available
- ✅ Duration filtering already applied
- ✅ No FFmpeg dependency needed during training

**To load in your training code:**
```python
from datasets import load_from_disk
dataset = load_from_disk("data/cv22_hf/da")
train_data = dataset["train"].with_format("numpy")  # return columns as NumPy arrays

# Access audio directly
audio = train_data[0]["audio_array"]  # float32 NumPy array, ready to use!
```