In [1]:
# Mount Google Drive at /content/drive so the Drive-hosted paths used by the
# later cells are reachable from this Colab runtime.
from google.colab import drive

drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
!pip install librosa
!pip install tqdm

import librosa
import librosa.display
import numpy as np
import pandas as pd
import os
from tqdm.notebook import tqdm



In [4]:
# --- Input locations -------------------------------------------------------
project_path = "/content/drive/MyDrive/Voice project/unzipped_dataset/cv-corpus-23.0-2025-09-05/en/"
clips_path = os.path.join(project_path, "clips")
balanced_list_path = "/content/drive/MyDrive/voice_project/final_balanced_list.csv"

# NOTE(review): the clips live under "Voice project" (with a space) while the
# CSV and the feature folder live under "voice_project" — confirm that both
# Drive folders are intended.
# Fail fast here: a wrong clips folder would otherwise only show up later as
# one "Error processing ..." line per clip during feature extraction.
if not os.path.isdir(clips_path):
    raise FileNotFoundError(
        f"Clips folder not found: {clips_path} (is Drive mounted and the path spelled correctly?)"
    )

# --- Output location -------------------------------------------------------
# Folder that receives one .npy feature file per clip; created if missing.
feature_path = "/content/drive/MyDrive/voice_project/features_melspec_5sec_all"
os.makedirs(feature_path, exist_ok=True)

# --- Load the balanced list of clips to process ----------------------------
df = pd.read_csv(balanced_list_path)

# The extraction loop reads each clip's file name from the 'path' column.
if 'path' not in df.columns:
    raise KeyError(f"'path' column missing from {balanced_list_path}; found columns: {list(df.columns)}")

print(f"Loaded balanced dataset with {len(df)} files to process.")
print(f"Original clips path: {clips_path}")
print(f"Features will be saved to: {feature_path}")

Loaded balanced dataset with 10236 files to process.
Original clips path: /content/drive/MyDrive/Voice project/unzipped_dataset/cv-corpus-23.0-2025-09-05/en/clips
Features will be saved to: /content/drive/MyDrive/voice_project/features_melspec_5sec_all


In [5]:
# ---- Audio settings used by the feature-extraction cells ----
SAMPLE_RATE = 22050  # samples per second that every clip is loaded at
DURATION = 5         # seconds each clip is padded or cut to
N_MELS = 128         # Mel bands, i.e. the number of rows in each spectrogram

# Number of samples in one standardized clip
FIXED_LENGTH = DURATION * SAMPLE_RATE

def create_mel_spectrogram(file_path):
    """
    Build a fixed-size log-Mel spectrogram for a single audio file.

    The clip is loaded at SAMPLE_RATE, then zero-padded at the end or cut so
    that it is exactly FIXED_LENGTH samples long. A Mel power spectrogram with
    N_MELS bands (n_fft=2048, hop_length=512) is computed from it and converted
    to decibels with np.max as the reference.

    Returns the resulting array, or None if any step raises; in that case the
    error is printed together with the file path.
    """
    try:
        # Load the waveform; the returned sample rate is not needed
        waveform, _ = librosa.load(file_path, sr=SAMPLE_RATE)

        # Bring the waveform to exactly FIXED_LENGTH samples
        missing_samples = FIXED_LENGTH - len(waveform)
        if missing_samples > 0:
            # Too short: append zeros
            waveform = np.pad(waveform, (0, missing_samples), 'constant')
        else:
            # Long enough: keep only the leading FIXED_LENGTH samples
            waveform = waveform[:FIXED_LENGTH]

        # Mel power spectrogram of the standardized waveform
        mel_power = librosa.feature.melspectrogram(
            y=waveform,
            sr=SAMPLE_RATE,
            n_mels=N_MELS,
            n_fft=2048,
            hop_length=512,
        )

        # Power -> decibels
        return librosa.power_to_db(mel_power, ref=np.max)

    except Exception as err:
        print(f"Error processing {file_path}: {err}")
        return None

print("Feature extraction function 'create_mel_spectrogram' is defined and ready.")

Feature extraction function 'create_mel_spectrogram' is defined and ready.


In [None]:


# Convert every clip listed in df['path'] to a log-Mel spectrogram stored as a
# .npy file in feature_path. Clips whose feature file already exists are
# skipped, so this cell can simply be re-run to resume after an interruption.
saved_count = 0
skipped_count = 0
failed_clips = []  # file names for which create_mel_spectrogram returned None

# tqdm renders a progress bar; only the 'path' column is needed, so iterate it
# directly instead of building a full row object per clip with iterrows().
for clip_filename in tqdm(df['path'], total=len(df)):

    # 1. Full path of the source audio clip
    full_clip_path = os.path.join(clips_path, clip_filename)

    # 2. Feature file path: swap only the file extension for ".npy"
    #    (str.replace would also rewrite ".mp3" occurring elsewhere in the name)
    feature_filename = os.path.splitext(clip_filename)[0] + ".npy"
    full_feature_path = os.path.join(feature_path, feature_filename)

    # 3. Resume support: skip clips that already have a finished feature file
    if os.path.exists(full_feature_path):
        skipped_count += 1
        continue

    # 4. Process the clip (None means the helper already printed the error)
    spectrogram = create_mel_spectrogram(full_clip_path)
    if spectrogram is None:
        failed_clips.append(clip_filename)
        continue

    # 5. Save via a temporary file and rename it into place. If the run is
    #    interrupted mid-write, only the ".tmp" file is left behind, so a
    #    truncated .npy can never be mistaken for a finished one in step 3.
    #    Passing an open file object stops np.save from appending ".npy".
    tmp_feature_path = full_feature_path + ".tmp"
    with open(tmp_feature_path, "wb") as tmp_file:
        np.save(tmp_file, spectrogram)
    os.replace(tmp_feature_path, full_feature_path)
    saved_count += 1

print("--- Feature extraction complete! ---")
print(
    f"{saved_count} saved, {skipped_count} already present, "
    f"{len(failed_clips)} failed (of {len(df)} clips). Features are in: {feature_path}"
)
if failed_clips:
    # Show a short sample rather than flooding the output with every name
    print(f"First failed clips: {failed_clips[:10]}")

  0%|          | 0/10236 [00:00<?, ?it/s]

  audio, sr = librosa.load(file_path, sr=SAMPLE_RATE)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
