In [1]:
import pandas as pd
import numpy as np
import librosa
import parselmouth
import noisereduce as nr
import glob

from joblib import Parallel, delayed
from tqdm import tqdm
from parselmouth.praat import call
from concurrent.futures import ThreadPoolExecutor

In [2]:
def remove_silence(audio):
    unsilenced = []
    time_intervals = librosa.effects.split(audio, top_db=25, ref=np.max).tolist()
    for start, end in time_intervals:
        unsilenced += audio.tolist()[start:end+1]
    unsilenced = np.array(unsilenced)

    return unsilenced

def normalize(audio): 
    rms = np.sqrt(np.mean(audio**2))
    current_db = 20 * np.log10(rms)
    target_db = -20.0
    gain = target_db - current_db
    audio_normalized = audio * (10**(gain / 20))
    return audio_normalized

In [3]:
x = ['mean', 'var', 'min', 'max', 'p25', 'p75']
mfccColumns = [ f'{att}_{i}' for att in x for i in range(1, 14)]
mfccColumns.insert(0, 'voiceID')
print(mfccColumns)

['voiceID', 'mean_1', 'mean_2', 'mean_3', 'mean_4', 'mean_5', 'mean_6', 'mean_7', 'mean_8', 'mean_9', 'mean_10', 'mean_11', 'mean_12', 'mean_13', 'var_1', 'var_2', 'var_3', 'var_4', 'var_5', 'var_6', 'var_7', 'var_8', 'var_9', 'var_10', 'var_11', 'var_12', 'var_13', 'min_1', 'min_2', 'min_3', 'min_4', 'min_5', 'min_6', 'min_7', 'min_8', 'min_9', 'min_10', 'min_11', 'min_12', 'min_13', 'max_1', 'max_2', 'max_3', 'max_4', 'max_5', 'max_6', 'max_7', 'max_8', 'max_9', 'max_10', 'max_11', 'max_12', 'max_13', 'p25_1', 'p25_2', 'p25_3', 'p25_4', 'p25_5', 'p25_6', 'p25_7', 'p25_8', 'p25_9', 'p25_10', 'p25_11', 'p25_12', 'p25_13', 'p75_1', 'p75_2', 'p75_3', 'p75_4', 'p75_5', 'p75_6', 'p75_7', 'p75_8', 'p75_9', 'p75_10', 'p75_11', 'p75_12', 'p75_13']


In [4]:
def extract_mfcc_statistics(audio, sr, n_mfcc=13):
    # Extract MFCC features
    mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc)
    
    # Compute statistical summaries for each MFCC coefficient
    mfccs_mean = np.mean(mfccs, axis=1)  # Mean
    mfccs_var = np.var(mfccs, axis=1)    # Variance
    mfccs_min = np.min(mfccs, axis=1)    # Minimum
    mfccs_max = np.max(mfccs, axis=1)    # Maximum
    mfccs_percentile_25 = np.percentile(mfccs, 25, axis=1)  # 25th percentile
    mfccs_percentile_75 = np.percentile(mfccs, 75, axis=1)  # 75th percentile
    
    # Combine all statistics into a single feature vector
    mfcc_statistics = np.concatenate([
        mfccs_mean, mfccs_var, mfccs_min, mfccs_max, 
        mfccs_percentile_25, mfccs_percentile_75
    ])
    
    return mfcc_statistics

In [5]:
dfLabel = pd.read_csv('filtered_data_labeled.tsv', sep='\t')
dfLabel.head(3)

Unnamed: 0,client_id,path,sentence,up_votes,down_votes,age,gender,accent,label
0,5001d9a0d3f8f5aae6f386f70713b2d5d046edc7ba0068...,common_voice_en_19687170.mp3,He associated with the Formists.,2,1,fifties,female,us,3
1,5001d9a0d3f8f5aae6f386f70713b2d5d046edc7ba0068...,common_voice_en_19687171.mp3,"The ""ultra accelerator"" injection gives Derric...",2,1,fifties,female,us,3
2,5001d9a0d3f8f5aae6f386f70713b2d5d046edc7ba0068...,common_voice_en_19687172.mp3,"Despite running as a joke, candidates have won...",2,0,fifties,female,us,3


In [8]:
# List all audio files
# audio_files = dfLabel['path'].toList()
audio_files = glob.glob("../audio/*")

BATCH_SIZE = 1500

In [9]:
# Create an empty DataFrame
df = pd.DataFrame(columns=mfccColumns)

# Function to extract features from a single audio file
def extract_features(file_path):
    global mfccColumns
    try:
        # Load and preprocess the audio
        audio, sr = librosa.load(file_path, sr=None, mono=True)
        audio = remove_silence(audio)
        audio = normalize(audio)
        audio = nr.reduce_noise(y=audio, sr=sr)

        mfcc_stats = extract_mfcc_statistics(audio, sr)
        mfcc_stats = mfcc_stats.tolist()
        mfcc_stats.insert(0, file_path)

        f = { key: val for key, val in zip(mfccColumns, mfcc_stats)}

        return f
    except Exception as e:
        print(f"Failed to process {file_path}: {e}")
        return None

# Parallel processing of audio files
def process_audio_files(audio_files, type):
    for i in range(0, len(audio_files), BATCH_SIZE):
        batch_files = audio_files[i:i + BATCH_SIZE]
        res = Parallel(n_jobs=-1)(
            delayed(extract_features)(file_path) for file_path in tqdm(batch_files)
        )

        filtered_res = [item for item in res if item is not None]
        df = pd.DataFrame(filtered_res)
        df.to_csv(f"../CSVs/{type}_{i // BATCH_SIZE}.csv", index=False)
        print(f"Saved batch {i // BATCH_SIZE} with {len(df)} entries.")
        del df  # Free memory


# Process files in parallel
process_audio_files(audio_files, "general")





Failed to process ../audio/common_voice_en_675766.mp3: 


	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
 11%|█         | 168/1500 [00:19<02:31,  8.76it/s]

Failed to process ../audio/common_voice_en_682788.mp3: 


	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Failed to process ../audio/common_voice_en_85538.mp3: 


	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Failed to process ../audio/common_voice_en_675768.mp3: 


100%|██████████| 1500/1500 [03:07<00:00,  8.00it/s]


Saved batch 0 with 1496 entries.


 10%|█         | 152/1500 [00:17<02:46,  8.10it/s]

Failed to process ../audio/common_voice_en_682785.mp3: 


 24%|██▍       | 360/1500 [00:43<02:12,  8.62it/s]

Failed to process ../audio/common_voice_en_85541.mp3: 




Failed to process ../audio/common_voice_en_682778.mp3: 


	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
 26%|██▌       | 392/1500 [00:47<02:17,  8.04it/s]

Failed to process ../audio/common_voice_en_682782.mp3: 


 36%|███▌      | 536/1500 [01:06<02:18,  6.94it/s]

Failed to process ../audio/common_voice_en_85537.mp3: 


 36%|███▋      | 544/1500 [01:07<02:11,  7.30it/s]

Failed to process ../audio/common_voice_en_682789.mp3: 




Failed to process ../audio/common_voice_en_682779.mp3: 


 61%|██████▏   | 920/1500 [01:54<01:03,  9.11it/s]

Failed to process ../audio/common_voice_en_85542.mp3: 




Failed to process ../audio/common_voice_en_682784.mp3: 


100%|██████████| 1500/1500 [03:07<00:00,  8.01it/s]


Saved batch 1 with 1491 entries.




Failed to process ../audio/common_voice_en_682786.mp3: 


100%|██████████| 784/784 [02:01<00:00,  6.44it/s]


Saved batch 2 with 783 entries.
