In [1]:
import os

import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tqdm

In [4]:
# Root folder of the dataset; the cells below build their paths from it.
output_path = '../dataset'

# The index file is read as header-less, tab-separated text. Two column names
# are supplied and the column at position 1 is used as the index.
df = pd.read_csv(
    f"{output_path}/line_index.tsv",
    sep='\t',
    names=["filename", "transcript"],
    index_col=1,
)

# Preview the first rows.
df.head()

Unnamed: 0,filename,transcript
,khm_0308_0011865648,ស្ពាន កំពង់ ចម្លង អ្នកលឿង នៅ ព្រៃវែង ជា ស្ពាន ...
,khm_0308_0032157149,ភ្លើង កំពុង ឆាប ឆេះ ផ្ទះ ប្រជា ពលរដ្ឋ នៅ សង្កា...
,khm_0308_0038959268,អ្នក សុំ ទាន ដេក ប្រកាច់ ម្នាក់ ឯង ក្បែរ ខ្លោង...
,khm_0308_0054635313,ស្ករ ត្នោត ដែល មាន គុណភាព ល្អ ផលិត នៅ ខេត្ត កំ...
,khm_0308_0055735195,ភ្នំបាខែង មាន កម្ពស់ តែ ចិត សិប ម៉ែត្រ សោះ


### Audio Processing

In [None]:
def process_audio_files(input_dir, output_dir, sample_rate=22050, n_fft=1024,
                        hop_length=256, n_mels=80):
    """Convert every clip listed in the notebook-level ``df`` to a dB-scaled Mel spectrogram.

    For each row of ``df`` the file ``<input_dir>/<filename>.wav`` is loaded,
    peak-normalized, trimmed with ``librosa.effects.trim``, converted to a Mel
    spectrogram and saved as ``<output_dir>/<filename>_mel.npy``. Clips that
    cannot be processed are reported and skipped so one bad file does not
    abort the whole run.

    Parameters
    ----------
    input_dir : str
        Directory containing the ``.wav`` files.
    output_dir : str
        Directory that receives the ``*_mel.npy`` files; created if missing.
    sample_rate : int, optional
        Sampling rate passed to ``librosa.load`` (default 22050).
    n_fft : int, optional
        FFT window size for the spectrogram (default 1024).
    hop_length : int, optional
        Hop length between frames (default 256).
    n_mels : int, optional
        Number of Mel bands (default 80).

    Returns
    -------
    None
    """
    # np.save does not create missing parent directories.
    os.makedirs(output_dir, exist_ok=True)

    # The notebook imports the `tqdm` *module*, so the progress-bar class is
    # `tqdm.tqdm`. iterrows() is a generator without a length, hence `total`.
    rows = tqdm.tqdm(df.iterrows(), total=len(df), desc="Processing", leave=False)
    for _, row in rows:
        filename = row['filename']
        try:
            audio_path = os.path.join(input_dir, f"{filename}.wav")
            # Named `mel_path` so it does not shadow the notebook-level `output_path`.
            mel_path = os.path.join(output_dir, f"{filename}_mel.npy")

            # Load the audio at the requested sampling rate
            waveform, sr = librosa.load(audio_path, sr=sample_rate)

            # Peak-normalize; an empty or all-zero clip has no usable peak and
            # would otherwise raise or produce NaNs, so it is skipped.
            peak = np.max(np.abs(waveform)) if waveform.size else 0.0
            if peak == 0:
                print(f"Skipping {filename}: empty or silent audio")
                continue
            waveform = waveform / peak
            waveform, _ = librosa.effects.trim(waveform)  # Trim silence

            # Compute the Mel spectrogram and convert power to dB relative to its maximum
            mel_spectrogram = librosa.feature.melspectrogram(
                y=waveform, sr=sr, n_fft=n_fft, hop_length=hop_length, n_mels=n_mels
            )
            mel_spectrogram_db = librosa.power_to_db(mel_spectrogram, ref=np.max)

            # Save as .npy
            np.save(mel_path, mel_spectrogram_db)

        except (RuntimeError, OSError) as e:
            # OSError covers missing input files and failed writes.
            print(f"Error processing {filename}: {e}")
            continue

    print("Audio processing completed!")

# Example usage
process_audio_files(f"{output_path}/wavs/", f"{output_path}/mels/")

In [None]:
def validate_mel_spectrogram(n_batch=3):
    """Plot the waveform and the saved Mel spectrogram for the first rows of ``df``.

    For each of the first ``n_batch`` rows of the notebook-level ``df`` the
    original ``.wav`` is loaded from ``<output_path>/wavs`` and the matching
    ``*_mel.npy`` from ``<output_path>/mels``, and each is shown in its own figure.

    Parameters
    ----------
    n_batch : int, optional
        Number of rows to visualise (default 3). Nothing is plotted when it
        is zero or negative.

    Returns
    -------
    None
    """
    for count, (_, row) in enumerate(df.iterrows(), start=1):
        if count > n_batch:
            break

        # Load the original waveform. This is the raw clip: the spectrogram was
        # computed after normalization and silence trimming, so the two time
        # axes need not line up exactly.
        audio_path = f"{output_path}/wavs/{row['filename']}.wav"
        waveform, sr = librosa.load(audio_path, sr=22050)

        # Load the Mel spectrogram from the folder process_audio_files is
        # pointed at in this notebook ("mels").
        mel_path = f"{output_path}/mels/{row['filename']}_mel.npy"
        mel_spectrogram = np.load(mel_path)

        # Plot the waveform
        fig, ax = plt.subplots(figsize=(10, 4))
        librosa.display.waveshow(waveform, sr=sr, ax=ax)
        ax.set(title="Waveform", xlabel="Time (s)", ylabel="Amplitude")
        plt.show()

        # Plot the Mel spectrogram; the colorbar is tied to the returned mesh
        # instead of relying on pyplot's "current image".
        fig, ax = plt.subplots(figsize=(10, 4))
        img = librosa.display.specshow(
            mel_spectrogram, sr=sr, hop_length=256,
            x_axis='time', y_axis='mel', cmap='viridis', ax=ax,
        )
        fig.colorbar(img, ax=ax, format="%+2.0f dB")
        ax.set(title="Mel Spectrogram", xlabel="Time (s)", ylabel="Mel Frequency")
        plt.show()

        print("\n")

validate_mel_spectrogram()