# Audio Processing Pipeline

*This is the first of three notebooks, focused on preprocessing the data.*

## 1. Converting Audio Files
- **Objective:** Convert audio files into a standardized format for further processing.
- **Techniques:**
  - Load audio files in various formats (e.g., WAV, MP3) and convert them to a uniform format (e.g., WAV at a consistent sampling rate).
  - Ensure all audio files have a standard bitrate and channel configuration (e.g., mono).

## 2. Voice Activity Detection (VAD)
- **Objective:** Detect when speech is occurring and segment the audio accordingly.
- **Model:** Use the **Silero VAD** pre-trained model for detecting voice activity.
- **Steps:**
  - Load the Silero VAD model.
  - Apply the VAD to segment portions of the audio where speech is detected.
  - Discard or mark non-speech regions.

## 3. Processing the Audio Files
- **Objective:** Clean and standardize the audio input by addressing:
  - Background noise
  - Long silences
  - Sudden volume changes
  - Inconsistent audio lengths
- **Techniques:**
  - Noise reduction using filters (e.g., spectral gating, noise profiling).
  - Silence removal or trimming long pauses.
  - Normalizing audio volume to handle inconsistencies.
  - Padding or trimming to ensure consistent audio length.

## 4. Data Augmentation
- **Objective:** Enhance the dataset to improve model robustness and generalization.
- **Techniques:**
  - Apply pitch shifting to alter the frequency of the audio.
  - Use time stretching to modify the speed of the audio without affecting pitch.
  - Create variations of existing audio samples through random transformations.


## Importing libraries

In [2]:
import warnings
warnings.filterwarnings('ignore')
# Data processing
import pandas as pd
import math
import numpy as np
import librosa
import os
from collections import Counter
from pydub import AudioSegment
from IPython.display import Audio
from pprint import pprint
import noisereduce as nr
import subprocess
import soundfile as sf
# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import librosa.display
# Model and performance
import torch

## 1. Converting Audio Files

In [4]:
def convert_m4a_to_wav(input_folder):
    """Convert every ``.m4a`` file directly inside ``input_folder`` to ``.wav``.

    Each output is written next to its source under the same base name.
    ffmpeg is run once per file; a conversion that exits with a non-zero
    status is reported and the loop continues with the next file.

    Args:
        input_folder: Directory to scan (sub-folders are not visited).
    """
    for audio_file in os.listdir(input_folder):
        if not audio_file.endswith(".m4a"):
            continue

        file_path = os.path.join(input_folder, audio_file)
        output_file_path = os.path.splitext(file_path)[0] + ".wav"

        # -nostdin stops ffmpeg from waiting on an interactive prompt (for
        # example the overwrite question), which cannot be answered here.
        result = subprocess.run(
            ['ffmpeg', '-nostdin', '-i', file_path, output_file_path]
        )

        # Surface failures: the source file may be deleted in a later step.
        if result.returncode != 0:
            print(
                f"Conversion failed for {file_path} "
                f"(ffmpeg exit code {result.returncode})"
            )

In [7]:
def delete_m4a_files(folder_path):
    """Remove every ``.m4a`` file found directly inside ``folder_path``.

    Sub-folders are not visited. The path of each removed file is printed.
    """
    m4a_paths = [
        os.path.join(folder_path, name)
        for name in os.listdir(folder_path)
        if name.endswith(".m4a")
    ]
    for file_path in m4a_paths:
        os.remove(file_path)
        print(f"Deleted: {file_path}")

In [9]:
# Dataset layout: <root_folder>/males and <root_folder>/females.
root_folder = "VoxCeleb_gender"
male_folder, female_folder = (
    os.path.join(root_folder, gender_dir) for gender_dir in ("males", "females")
)

In [11]:
# Convert the .m4a recordings of both gender folders to .wav.
for source_folder in (male_folder, female_folder):
    convert_m4a_to_wav(source_folder)

In [13]:
# Remove the .m4a files from both gender folders.
for source_folder in (male_folder, female_folder):
    delete_m4a_files(source_folder)

# Calculating the average length of audio files

In [16]:
def calculate_average_length(input_folder):
    """Return the mean duration, in seconds, of the ``.wav`` files in a folder.

    Only files directly inside ``input_folder`` whose name ends with ``.wav``
    are measured. Returns ``0`` when the folder holds no such file.
    """
    seconds_sum = 0
    wav_count = 0

    for name in os.listdir(input_folder):
        if not name.endswith(".wav"):  # only .wav files are measured
            continue

        clip = AudioSegment.from_file(os.path.join(input_folder, name))
        # len() of the loaded segment is in milliseconds; convert to seconds.
        seconds_sum += len(clip) / 1000.0
        wav_count += 1

    return seconds_sum / wav_count if wav_count > 0 else 0

In [18]:
# Mean clip duration for each gender folder.
average_length_male, average_length_female = (
    calculate_average_length(folder) for folder in (male_folder, female_folder)
)

print(f"Average audio length for male files: {average_length_male:.2f} seconds")
print(f"Average audio length for female files: {average_length_female:.2f} seconds")


Average audio length for male files: 6.89 seconds
Average audio length for female files: 7.16 seconds


## 2. Voice Activity Detection (VAD)

In [21]:
def create_output_folders(base_folder):
    """Create the two VAD result folders under ``base_folder``.

    Returns:
        A ``(male_output_folder, female_output_folder)`` tuple holding the
        paths of ``male_vad_results`` and ``female_vad_results``. Folders that
        already exist are left as they are.
    """
    result_folders = tuple(
        os.path.join(base_folder, folder_name)
        for folder_name in ("male_vad_results", "female_vad_results")
    )

    for folder in result_folders:
        os.makedirs(folder, exist_ok=True)

    return result_folders

def apply_vad_to_audio(input_folder, output_folder, sampling_rate=16000):
    """Keep only the detected speech of every audio file in a folder.

    For each ``.wav``/``.m4a`` file directly inside ``input_folder`` the
    detected speech segments are concatenated and written to
    ``<output_folder>/<name>_combined.wav``. Files for which no segment is
    found are skipped.

    Relies on the module-level ``model``, ``read_audio`` and
    ``get_speech_timestamps`` objects, which must be defined before this
    function is called.

    Args:
        input_folder: Directory containing the source audio files.
        output_folder: Directory for the results; created if it is missing.
        sampling_rate: Rate handed to the reader and the detector, and used
            for the written file.
    """
    # Make sure the destination exists before the first write.
    os.makedirs(output_folder, exist_ok=True)

    for audio_file in os.listdir(input_folder):
        if not audio_file.endswith((".wav", ".m4a")):  # Accept both .wav and .m4a files
            continue

        file_path = os.path.join(input_folder, audio_file)
        wav = read_audio(file_path, sampling_rate=sampling_rate)  # Read audio file

        # Get speech timestamps from the audio
        speech_timestamps = get_speech_timestamps(wav, model, sampling_rate=sampling_rate)

        print(f"Speech timestamps for {audio_file}: {speech_timestamps}")

        combined_segments = []

        for segment in speech_timestamps:
            try:
                start = int(segment['start'])  # Access start from dictionary
                end = int(segment['end'])      # Access end from dictionary
            except (KeyError, TypeError, ValueError) as e:
                # A malformed segment is reported and skipped rather than
                # aborting the whole folder.
                print(f"Error converting timestamps for {audio_file}: {e}")
                continue

            # The timestamps index directly into the loaded waveform.
            combined_segments.append(wav[start:end])

        if not combined_segments:
            print(f"No segments found for {audio_file}, skipping...")
            continue

        # Combine all speech segments into a single audio
        combined_audio = np.concatenate(combined_segments)
        output_audio_file = os.path.join(output_folder, f"{os.path.splitext(audio_file)[0]}_combined.wav")
        sf.write(output_audio_file, combined_audio, sampling_rate)

        print(f"Saved combined VAD segments for {audio_file} to {output_audio_file}")

In [23]:
# Limit torch to a single thread.
torch.set_num_threads(1)

# Load the VAD model together with its helper functions
model, utils = torch.hub.load(
    repo_or_dir='snakers4/silero-vad',
    model='silero_vad',
    force_reload=True,
)

# Only two of the bundled helpers are used in this notebook.
get_speech_timestamps, read_audio = utils[0], utils[2]

base_folder = "VoxCeleb_gender"  # Adjust this as needed

# Source folders holding the male / female recordings.
male_folder = os.path.join(base_folder, "males")
female_folder = os.path.join(base_folder, "females")

male_output_folder, female_output_folder = create_output_folders(base_folder)

# Run VAD on the male folder first, then on the female folder.
for source_folder, vad_folder in (
    (male_folder, male_output_folder),
    (female_folder, female_output_folder),
):
    apply_vad_to_audio(source_folder, vad_folder)

Downloading: "https://github.com/snakers4/silero-vad/zipball/master" to C:\Users\Mostafa/.cache\torch\hub\master.zip


Speech timestamps for 0.wav: [{'start': 0, 'end': 88064}]
Saved combined VAD segments for 0.wav to VoxCeleb_gender\male_vad_results\0_combined.wav
Speech timestamps for 1.wav: [{'start': 7712, 'end': 26080}, {'start': 31264, 'end': 60896}, {'start': 64032, 'end': 87520}, {'start': 90656, 'end': 113632}, {'start': 116768, 'end': 143328}, {'start': 145440, 'end': 155648}]
Saved combined VAD segments for 1.wav to VoxCeleb_gender\male_vad_results\1_combined.wav
Speech timestamps for 10.wav: [{'start': 0, 'end': 393216}]
Saved combined VAD segments for 10.wav to VoxCeleb_gender\male_vad_results\10_combined.wav
Speech timestamps for 100.wav: [{'start': 0, 'end': 79872}]
Saved combined VAD segments for 100.wav to VoxCeleb_gender\male_vad_results\100_combined.wav
Speech timestamps for 1000.wav: [{'start': 544, 'end': 66560}]
Saved combined VAD segments for 1000.wav to VoxCeleb_gender\male_vad_results\1000_combined.wav
Speech timestamps for 1001.wav: [{'start': 6176, 'end': 47584}, {'start': 52

In [24]:
# From here on the gender folders point at the VAD results.
male_folder, female_folder = (
    os.path.join(root_folder, results_dir)
    for results_dir in ("male_vad_results", "female_vad_results")
)

# Mean clip duration after voice activity detection.
average_length_male, average_length_female = (
    calculate_average_length(folder) for folder in (male_folder, female_folder)
)

print(f"Average audio length for male files: {average_length_male:.2f} seconds")
print(f"Average audio length for female files: {average_length_female:.2f} seconds")


Average audio length for male files: 6.53 seconds
Average audio length for female files: 6.82 seconds


## 3. Processing the Audio Files

In [26]:
def process_audio(input_folder, output_folder, target_db=-30, target_duration=6):
    """Denoise, level and length-normalise every ``.wav`` file in a folder.

    Each ``.wav`` file directly inside ``input_folder`` is loaded at its own
    sampling rate, passed through ``noisereduce``, scaled so that the mean of
    ``librosa.amplitude_to_db`` over its samples equals ``target_db``, and
    then cut or zero-padded at the end to exactly ``target_duration`` seconds.
    The result is written to ``output_folder`` under the same file name.

    Args:
        input_folder: Directory containing the source ``.wav`` files.
        output_folder: Directory for the processed files; created if missing.
        target_db: Mean level, in dB, that each clip is scaled to.
        target_duration: Length of every written clip, in seconds.
    """
    os.makedirs(output_folder, exist_ok=True)  # Create output folder if it doesn't exist

    for audio_file in os.listdir(input_folder):
        if not audio_file.endswith(".wav"):
            continue

        file_path = os.path.join(input_folder, audio_file)
        audio, sr = librosa.load(file_path, sr=None)

        # An empty recording has nothing to denoise or normalise.
        if len(audio) == 0:
            print(f"Skipping {audio_file}: no audio samples")
            continue

        reduced_noise_audio = nr.reduce_noise(y=audio, sr=sr)

        # Skip if no audio is left after noise reduction
        if len(reduced_noise_audio) == 0:
            print(f"Skipping {audio_file}: no audio left after noise reduction")
            continue

        # Normalize audio to target dB level: the dB difference is turned
        # into a linear gain (20 dB per factor of ten).
        audio_db = librosa.amplitude_to_db(reduced_noise_audio)
        normalization_factor = target_db - audio_db.mean()
        normalized_audio = reduced_noise_audio * (10 ** (normalization_factor / 20))

        target_samples = int(target_duration * sr)

        # Adjust audio length to target duration
        if len(normalized_audio) > target_samples:
            # Trim audio if it's longer than target duration
            adjusted_audio = normalized_audio[:target_samples]
        else:
            # Pad the end with zeros if it's shorter than target duration
            adjusted_audio = np.pad(
                normalized_audio,
                (0, target_samples - len(normalized_audio)),
                mode='constant',
            )

        output_file_path = os.path.join(output_folder, audio_file)
        sf.write(output_file_path, adjusted_audio, sr)

In [37]:
# Destination folders for the fully processed clips.
output_male_folder = 'VoxCeleb_gender/Final Data/males'
output_female_folder = 'VoxCeleb_gender/Final Data/females'

for final_folder in (output_male_folder, output_female_folder):
    os.makedirs(final_folder, exist_ok=True)

# Process the male folder first, then the female folder.
for source_folder, final_folder in (
    (male_folder, output_male_folder),
    (female_folder, output_female_folder),
):
    process_audio(source_folder, final_folder)

## 4. Data Augmentation

In [38]:
def pitch_shift(audio_file, n_steps):
    """Write a pitch-shifted copy of ``audio_file`` next to the original.

    The copy is saved as ``<stem>_pitch_shift_<n_steps>.wav`` and keeps the
    sampling rate of the source file.

    Args:
        audio_file: Path of the audio file to augment.
        n_steps: Number of steps handed to ``librosa.effects.pitch_shift``.
    """
    # sr=None keeps the file's own sampling rate; librosa's default would
    # resample to 22050 Hz and the copy would no longer match its source.
    y, sr = librosa.load(audio_file, sr=None)
    y_shifted = librosa.effects.pitch_shift(y, sr=sr, n_steps=n_steps)

    # Build the name from the stem so only the extension is replaced and the
    # source file can never be overwritten.
    stem = os.path.splitext(audio_file)[0]
    sf.write(f'{stem}_pitch_shift_{n_steps}.wav', y_shifted, sr)

In [39]:
def time_stretch(audio_file, rate):
    """Write a time-stretched copy of ``audio_file`` next to the original.

    The copy is saved as ``<stem>_time_stretch_<rate>.wav`` and keeps the
    sampling rate of the source file.

    Args:
        audio_file: Path of the audio file to augment.
        rate: Stretch factor handed to ``librosa.effects.time_stretch``.
    """
    # sr=None keeps the file's own sampling rate; librosa's default would
    # resample to 22050 Hz and the copy would no longer match its source.
    y, sr = librosa.load(audio_file, sr=None)
    y_stretched = librosa.effects.time_stretch(y, rate=rate)

    # Build the name from the stem so only the extension is replaced and the
    # source file can never be overwritten.
    stem = os.path.splitext(audio_file)[0]
    sf.write(f'{stem}_time_stretch_{rate}.wav', y_stretched, sr)

In [40]:
def process_directory(root_dir, n_steps=2, rate=1.1):
    """Augment every original ``.wav`` file found under ``root_dir``.

    The tree is walked recursively. For each ``.wav`` file a pitch-shifted
    and a time-stretched copy are produced via ``pitch_shift`` and
    ``time_stretch``. Files whose name already carries one of the
    augmentation markers are skipped, so running the function again does not
    augment its own output.

    Args:
        root_dir: Top folder of the tree to augment.
        n_steps: Pitch shift passed to ``pitch_shift``.
        rate: Stretch factor passed to ``time_stretch``.
    """
    # Name fragments that pitch_shift / time_stretch add to their outputs.
    augmented_markers = ('_pitch_shift_', '_time_stretch_')

    for subdir, _dirs, files in os.walk(root_dir):
        for file in files:
            if not file.endswith('.wav'):
                continue
            if any(marker in file for marker in augmented_markers):
                continue  # already an augmented copy

            file_path = os.path.join(subdir, file)
            pitch_shift(file_path, n_steps=n_steps)
            time_stretch(file_path, rate=rate)
                
# Augment the processed dataset in place: the augmented copies are written
# next to the files they are derived from.
root_directory = 'VoxCeleb_gender/Final Data'
process_directory(root_directory)
