# Prepare Georgian Speech Data for Whisper Fine-tuning
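This notebook loads the Georgian (`ka`) Common Voice split manifests, checks that the training clips exist on disk and measures their durations, and saves a CSV of audio paths and transcripts for Whisper fine-tuning.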

In [3]:
import pandas as pd
import os
from pathlib import Path
import librosa
import numpy as np

# Corpus layout: the "ka" locale folder holds a clips/ directory of audio
# plus one tab-separated manifest per split.
DATA_DIR = Path("cv-corpus-23.0-2025-09-05/ka")
CLIPS_DIR = DATA_DIR.joinpath("clips")
TRAIN_TSV = DATA_DIR.joinpath("train.tsv")
TEST_TSV = DATA_DIR.joinpath("test.tsv")
DEV_TSV = DATA_DIR.joinpath("dev.tsv")

In [4]:
!pip install transformers



In [None]:
# Read the train / test / dev manifests (tab-separated) into DataFrames
train_df, test_df, dev_df = [
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in (TRAIN_TSV, TEST_TSV, DEV_TSV)
]

# Report the size of each split and the columns available in the manifest
print(f"Training samples: {len(train_df)}")
print(f"Test samples: {len(test_df)}")
print(f"Dev samples: {len(dev_df)}")
print("\nColumns:", train_df.columns.tolist())
print("\nFirst few rows:")
# Last expression of the cell, so the notebook renders it as a table
train_df.head()

In [None]:
# Check audio files exist and get statistics
def check_audio_files(df, clips_dir):
    """Flag which clips listed in ``df`` exist on disk and measure their durations.

    Two columns are written to ``df`` in place, and the same object is returned:

    * ``exists``   -- True when ``clips_dir / <path>`` is present on disk.
    * ``duration`` -- result of ``librosa.get_duration`` for the clip, or 0 when
      the file is missing or could not be read.

    Args:
        df: DataFrame with a ``path`` column of clip file names relative to
            ``clips_dir``. Any index (non-contiguous, string, ...) is accepted.
        clips_dir: Directory containing the clips (``str`` or ``Path``).

    Returns:
        ``df`` itself, with the ``exists`` and ``duration`` columns set.
    """
    clips_dir = Path(clips_dir)
    total = len(df)
    existing = []
    durations = []
    unreadable = 0

    # Track progress by row position rather than index label: labels are not
    # guaranteed to be 0..n-1 integers (e.g. after filtering), which made the
    # progress counter wrong or raised on non-numeric labels.
    for position, rel_path in enumerate(df['path']):
        audio_path = clips_dir / rel_path
        found = audio_path.exists()
        duration = 0
        if found:
            try:
                duration = librosa.get_duration(path=str(audio_path))
            except Exception:
                # Best effort: an unreadable clip keeps duration 0. Catching
                # Exception (not a bare except) lets Ctrl-C / kernel interrupt
                # stop a long scan instead of being swallowed.
                unreadable += 1
        existing.append(found)
        durations.append(duration)

        if position % 1000 == 0:
            print(f"Checked {position}/{total} files...")

    if unreadable:
        print(f"Warning: {unreadable} existing files could not be read (duration set to 0)")

    df['exists'] = existing
    df['duration'] = durations
    return df

print("Checking training files...")
train_df = check_audio_files(train_df, CLIPS_DIR)

# Summarise how many clips were found and how much audio they add up to
found_count = train_df['exists'].sum()
total_seconds = train_df['duration'].sum()
print(f"\nExisting training files: {found_count}/{len(train_df)}")
print(f"Total training duration: {total_seconds/3600:.2f} hours")

In [None]:
# Keep only the rows whose clip was found on disk
train_df_clean = train_df.loc[train_df['exists']].copy()

# The test split is filtered the same way only if it already carries an 'exists' column
if 'exists' in test_df.columns:
    test_df_clean = test_df[test_df['exists']].copy()
else:
    test_df_clean = test_df.copy()

# Reduce to two columns: full clip path (as a string) and its transcript
train_data = (
    train_df_clean[['path', 'sentence']]
    .assign(audio_path=lambda frame: frame['path'].apply(lambda x: str(CLIPS_DIR / x)))
    [['audio_path', 'sentence']]
)

print(f"Clean training samples: {len(train_data)}")
print("\nSample data:")
# Last expression of the cell, so the notebook renders it as a table
train_data.head()

In [None]:
# Write the (audio_path, sentence) table to disk without the index column
output_csv = 'train_processed.csv'
train_data.to_csv(output_csv, index=False)
print(f"Saved processed training data to {output_csv}")

In [None]:
# Summary of the cleaned training set
print("\n=== Dataset Statistics ===")
print(f"Total samples: {len(train_data)}")

# Per-clip durations in seconds, as recorded by check_audio_files
clip_seconds = train_df_clean['duration']
print(f"Total duration: {clip_seconds.sum()/3600:.2f} hours")
print(f"Average duration: {clip_seconds.mean():.2f} seconds")
print(f"Min duration: {clip_seconds.min():.2f} seconds")
print(f"Max duration: {clip_seconds.max():.2f} seconds")

# Transcript length in characters
sentence_lengths = train_data['sentence'].str.len()
print(f"\nAverage text length: {sentence_lengths.mean():.2f} characters")