# Measure Pitch, HNR, Jitter, Shimmer, Formants, and Estimate VTL

### The extracted features
* voiceID
* duration
* meanF0Hz
* stdevF0Hz
* HNR
* localJitter
* localabsoluteJitter
* rapJitter
* ppq5Jitter
* ddpJitter
* localShimmer
* localdbShimmer
* apq3Shimmer
* apq5Shimmer
* apq11Shimmer
* ddaShimmer
* f1_mean
* f2_mean
* f3_mean
* f4_mean
* f1_median
* f2_median
* f3_median
* f4_median
* JitterPCA
* ShimmerPCA
* pF
* fdisp
* avgFormant
* mff
* fitch_vtl
* delta_f
* vtl_delta_f

## Import the external modules

In [13]:
#!/usr/bin/env python3
import glob
import numpy as np
import pandas as pd
import parselmouth 
import statistics
import librosa
import noisereduce as nr
import scipy.stats as stats
from concurrent.futures import ThreadPoolExecutor
import scipy

from pydub import AudioSegment
from parselmouth.praat import call
from scipy.stats.mstats import zscore
from scipy.stats import mode as scipy_mode
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

## This function measures duration, pitch, HNR, jitter, and shimmer

In [14]:
# This is the function to measure source acoustics using default male parameters.

def measurePitch(audio, sr, sound, f0min, f0max, unit):
    sound = parselmouth.Sound(sound) # read the sound
    duration = call(sound, "Get total duration") # duration
    pitch = call(sound, "To Pitch", 0.0, f0min, f0max) #create a praat pitch object
    meanF0 = call(pitch, "Get mean", 0, 0, unit) # get mean pitch
    stdevF0 = call(pitch, "Get standard deviation", 0 ,0, unit) # get standard deviation
    harmonicity = call(sound, "To Harmonicity (cc)", 0.01, f0min, 0.1, 1.0)
    hnr = call(harmonicity, "Get mean", 0, 0)
    pointProcess = call(sound, "To PointProcess (periodic, cc)", f0min, f0max)
    localJitter = call(pointProcess, "Get jitter (local)", 0, 0, 0.0001, 0.02, 1.3)
    localabsoluteJitter = call(pointProcess, "Get jitter (local, absolute)", 0, 0, 0.0001, 0.02, 1.3)
    rapJitter = call(pointProcess, "Get jitter (rap)", 0, 0, 0.0001, 0.02, 1.3)
    ppq5Jitter = call(pointProcess, "Get jitter (ppq5)", 0, 0, 0.0001, 0.02, 1.3)
    ddpJitter = call(pointProcess, "Get jitter (ddp)", 0, 0, 0.0001, 0.02, 1.3)
    localShimmer =  call([sound, pointProcess], "Get shimmer (local)", 0, 0, 0.0001, 0.02, 1.3, 1.6)
    localdbShimmer = call([sound, pointProcess], "Get shimmer (local_dB)", 0, 0, 0.0001, 0.02, 1.3, 1.6)
    apq3Shimmer = call([sound, pointProcess], "Get shimmer (apq3)", 0, 0, 0.0001, 0.02, 1.3, 1.6)
    aqpq5Shimmer = call([sound, pointProcess], "Get shimmer (apq5)", 0, 0, 0.0001, 0.02, 1.3, 1.6)
    apq11Shimmer =  call([sound, pointProcess], "Get shimmer (apq11)", 0, 0, 0.0001, 0.02, 1.3, 1.6)
    ddaShimmer = call([sound, pointProcess], "Get shimmer (dda)", 0, 0, 0.0001, 0.02, 1.3, 1.6)
    
    return duration, meanF0, stdevF0, hnr, localJitter, localabsoluteJitter, rapJitter, ppq5Jitter, ddpJitter, localShimmer, localdbShimmer, apq3Shimmer, aqpq5Shimmer, apq11Shimmer, ddaShimmer

## This function measures formants at each glottal pulse

Puts, D. A., Apicella, C. L., & Cárdenas, R. A. (2012). Masculine voices signal men's threat potential in forager and industrial societies. Proceedings of the Royal Society of London B: Biological Sciences, 279(1728), 601-609.

Adapted from: DOI 10.17605/OSF.IO/K2BHS

In [15]:
# This function measures formants using Formant Position formula
def measureFormants(sound, wave_file, f0min,f0max):
    sound = parselmouth.Sound(sound) # read the sound
    pitch = call(sound, "To Pitch (cc)", 0, f0min, 15, 'no', 0.03, 0.45, 0.01, 0.35, 0.14, f0max)
    pointProcess = call(sound, "To PointProcess (periodic, cc)", f0min, f0max)
    
    formants = call(sound, "To Formant (burg)", 0.0025, 5, 5000, 0.025, 50)
    numPoints = call(pointProcess, "Get number of points")

    f1_list = []
    f2_list = []
    f3_list = []
    f4_list = []
    
    # Measure formants only at glottal pulses
    for point in range(0, numPoints):
        point += 1
        t = call(pointProcess, "Get time from index", point)
        f1 = call(formants, "Get value at time", 1, t, 'Hertz', 'Linear')
        f2 = call(formants, "Get value at time", 2, t, 'Hertz', 'Linear')
        f3 = call(formants, "Get value at time", 3, t, 'Hertz', 'Linear')
        f4 = call(formants, "Get value at time", 4, t, 'Hertz', 'Linear')
        f1_list.append(f1)
        f2_list.append(f2)
        f3_list.append(f3)
        f4_list.append(f4)
    
    f1_list = [f1 for f1 in f1_list if str(f1) != 'nan']
    f2_list = [f2 for f2 in f2_list if str(f2) != 'nan']
    f3_list = [f3 for f3 in f3_list if str(f3) != 'nan']
    f4_list = [f4 for f4 in f4_list if str(f4) != 'nan']
    
    # calculate mean formants across pulses
    f1_mean = statistics.mean(f1_list)
    f2_mean = statistics.mean(f2_list)
    f3_mean = statistics.mean(f3_list)
    f4_mean = statistics.mean(f4_list)
    
    # calculate median formants across pulses, this is what is used in all subsequent calcualtions
    # you can use mean if you want, just edit the code in the boxes below to replace median with mean
    f1_median = statistics.median(f1_list)
    f2_median = statistics.median(f2_list)
    f3_median = statistics.median(f3_list)
    f4_median = statistics.median(f4_list)
    
    return f1_mean, f2_mean, f3_mean, f4_mean, f1_median, f2_median, f3_median, f4_median


## This function runs a 2-factor Principle Components Analysis (PCA) on Jitter and Shimmer

In [16]:
def runPCA(df):
    # z-score the Jitter and Shimmer measurements
    measures = ['localJitter', 'localabsoluteJitter', 'rapJitter', 'ppq5Jitter', 'ddpJitter',
                'localShimmer', 'localdbShimmer', 'apq3Shimmer', 'apq5Shimmer', 'apq11Shimmer', 'ddaShimmer']
    x = df.loc[:, measures].values
    x = StandardScaler().fit_transform(x)
    # PCA
    pca = PCA(n_components=2)
    principalComponents = pca.fit_transform(x)
    principalDf = pd.DataFrame(data = principalComponents, columns = ['JitterPCA', 'ShimmerPCA'])
    principalDf
    return principalDf

## Preprocessing to the audio file

In [17]:
def remove_silence(audio):
    unsilenced = []
    time_intervals = librosa.effects.split(audio, top_db=25, ref=np.max).tolist()
    for start, end in time_intervals:
        unsilenced += audio.tolist()[start:end+1]
    unsilenced = np.array(unsilenced)

    return unsilenced

def normalize(audio): 
    rms = np.sqrt(np.mean(audio**2))
    current_db = 20 * np.log10(rms)
    target_db = -20.0
    gain = target_db - current_db
    audio_normalized = audio * (10**(gain / 20))
    return audio_normalized

def load_mp3(path):
    audio_segment = AudioSegment.from_file(path, format="mp3")
    # Ensure the audio is mono
    audio_segment = audio_segment.set_channels(1)
    # Convert AudioSegment to raw PCM data
    samples = np.array(audio_segment.get_array_of_samples(), dtype=np.float32)
    # Normalize the samples to the range [-1, 1]
    samples /= np.iinfo(audio_segment.array_type).max
    # Pass the samples and sampling rate to librosa
    sr = audio_segment.frame_rate
    return librosa.resample(samples, orig_sr=sr, target_sr=sr), sr

## Enhanced parallel feature extraction

In [18]:
def measureSpecialFeatures(y, sr):
    if len(y) == 0 or np.max(np.abs(y)) < 1e-4:
        raise ValueError("Audio too quiet or empty for feature extraction.")

    try:
        n_fft = 2048
        stft = np.abs(librosa.stft(y, n_fft=n_fft))
        if stft.shape[1] == 0:
            raise ValueError("STFT produced empty output.")

        freqs = librosa.fft_frequencies(sr=sr, n_fft=n_fft)
        stft_sum = np.sum(stft, axis=0, keepdims=True)
        stft_norm = stft / (stft_sum + 1e-6)

        meanfreq = np.sum(freqs[:, None] * stft_norm, axis=0)
        spectral_flatness = librosa.feature.spectral_flatness(y=y)
        mean_spectral_flatness = np.mean(spectral_flatness)

        power_spectrum = stft**2
        psd_norm = power_spectrum / (np.sum(power_spectrum, axis=0, keepdims=True) + 1e-10)
        if np.isnan(psd_norm).any():
            raise ValueError("PSD normalization resulted in NaNs.")

        spectral_entropy = -np.sum(psd_norm * np.log2(psd_norm + 1e-10), axis=0)
        mean_spectral_entropy = np.mean(spectral_entropy)

        iqr_per_frame = scipy.stats.iqr(stft, axis=0)
        avg_iqr = np.mean(iqr_per_frame) / 1000

        std_meanfreq = np.std(meanfreq) / 1000
        q25 = np.percentile(meanfreq, 25) / 1000

        mode_freq = freqs[np.argmax(stft, axis=0)]
        mode_frequency = float(scipy_mode(mode_freq, keepdims=True).mode[0] / 1000)

        return {
            "IQR": float(avg_iqr),
            "sd": float(std_meanfreq),
            "sfm": float(mean_spectral_flatness),
            "Q25": float(q25),
            "sp.ent": float(mean_spectral_entropy),
            "mode": mode_frequency
        }
    except Exception as e:
        print(f"Error in measureSpecialFeatures: {e}")
        raise


In [19]:
# def measureSpecialFeatures(y, sr):
#     stft = np.abs(librosa.stft(y))
#     freqs = librosa.fft_frequencies(sr=sr)
#     meanfreq = np.sum(freqs[:, None] * stft, axis=0) / (np.sum(stft, axis=0) + 1e-6)
#     spectral_flatness = librosa.feature.spectral_flatness(y=y)[0]
#     spectral_entropy = -np.sum((stft**2) * np.log(stft**2 + 1e-10), axis=0) / np.log(stft.shape[0])
#     mode_freq = freqs[np.argmax(stft, axis=0)]

#     return {
#         "stft": stft,
#         "IQR": float(stats.iqr(meanfreq) / 1000),
#         "sd": float(np.std(meanfreq) / 1000),
#         "sfm": float(np.mean(spectral_flatness)),
#         "Q25": float(np.percentile(meanfreq, 25) / 1000),
#         "sp.ent": float(np.mean(spectral_entropy)),
#         "mode": float(scipy_mode(mode_freq, keepdims=True)[0][0] / 1000)
#     }

In [20]:
# Define the columns for the DataFrame ( the general features case)
columns = [
    'voiceID', 'duration', 'meanF0Hz', 'stdevF0Hz', 'HNR', "IQR", "sd", "sfm", "Q25", "sp.ent", "mode",
    'localJitter', 'localabsoluteJitter', 'rapJitter', 'ppq5Jitter', 'ddpJitter',
    'localShimmer', 'localdbShimmer', 'apq3Shimmer', 'apq5Shimmer', 'apq11Shimmer', 'ddaShimmer',
    'f1_mean', 'f2_mean', 'f3_mean', 'f4_mean',
    'f1_median', 'f2_median', 'f3_median', 'f4_median'
]

In [None]:
import pandas as pd
import glob
from concurrent.futures import ThreadPoolExecutor
import os

# Constants
BATCH_SIZE = 5000
# Function to extract features from a single audio file
def extract_features(file_path):
    try:
        # Load and preprocess the audio
        audio, sr = librosa.load('../Data/'+file_path, sr=None, mono=True)
        audio = remove_silence(audio)
        audio = normalize(audio)
        audio = nr.reduce_noise(y=audio, sr=sr)
        sound = parselmouth.Sound(audio, sampling_frequency=sr)
        # Extract features
        duration, meanF0, stdevF0, hnr, localJitter, localabsoluteJitter, rapJitter, ppq5Jitter, ddpJitter, \
        localShimmer, localdbShimmer, apq3Shimmer, aqpq5Shimmer, apq11Shimmer, ddaShimmer = measurePitch(
            audio, sr, sound, 75, 300, "Hertz"
        )

        f1_mean, f2_mean, f3_mean, f4_mean, f1_median, f2_median, f3_median, f4_median = measureFormants(
            sound, file_path, 75, 300
        )

        special = measureSpecialFeatures(audio, sr)

        f = {
            'voiceID': file_path,
            'duration': duration,
            'meanF0Hz': meanF0,
            'stdevF0Hz': stdevF0,
            'HNR': hnr,
            'localJitter': localJitter,
            'localabsoluteJitter': localabsoluteJitter,
            'rapJitter': rapJitter,
            'ppq5Jitter': ppq5Jitter,
            'ddpJitter': ddpJitter,
            'localShimmer': localShimmer,
            'localdbShimmer': localdbShimmer,
            'apq3Shimmer': apq3Shimmer,
            'apq5Shimmer': aqpq5Shimmer,
            'apq11Shimmer': apq11Shimmer,
            'ddaShimmer': ddaShimmer,
            'f1_mean': f1_mean,
            'f2_mean': f2_mean,
            'f3_mean': f3_mean,
            'f4_mean': f4_mean,
            'f1_median': f1_median,
            'f2_median': f2_median,
            'f3_median': f3_median,
            'f4_median': f4_median
        }
        f.update(special)
        return f

    except Exception as e:
        print(f"Failed to process {file_path}: {e}")
        return None

# Function to process and save in batches
def process_and_save_in_batches(df,type):
    audio_files= df['path']
    for i in range(0, len(audio_files), BATCH_SIZE):
        batch_files = audio_files[i:i + BATCH_SIZE]
        with ThreadPoolExecutor(max_workers=10) as executor:
            results = list(executor.map(extract_features, batch_files))

        df = pd.DataFrame([res for res in results if res], columns=columns)
        df.to_csv(f"../CSVs/{type}/{i // BATCH_SIZE}.csv", index=False)
        print(f"Saved batch {i // BATCH_SIZE} with {len(df)} entries.")
        del df  # Free memory


males_df = pd.read_csv("males.csv")
females_df = pd.read_csv("females.csv")
fifties_df = pd.read_csv("fifties.csv")
twenties_df = pd.read_csv("twenties.csv")
process_and_save_in_batches(males_df,"gender")
process_and_save_in_batches(females_df,"gender")
process_and_save_in_batches(fifties_df,"age")
process_and_save_in_batches(twenties_df,"age")


  audio, sr = librosa.load('../Data/'+file_path, sr=None, mono=True)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Failed to process common_voice_en_555764.mp3: 
Failed to process common_voice_en_639067.mp3: 
Failed to process common_voice_en_639124.mp3: 
Failed to process common_voice_en_570623.mp3: 
Failed to process common_voice_en_513707.mp3: 
Failed to process common_voice_en_552706.mp3: 
Failed to process common_voice_en_566754.mp3: 
Failed to process common_voice_en_644850.mp3: 
Failed to process common_voice_en_608593.mp3: 
Failed to process common_voice_en_672296.mp3: 
Failed to process common_voice_en_493346.mp3: 
Failed to process common_voice_en_487152.mp3: 
Failed to process common_voice_en_19713510.mp3: [Errno 2] No such file or directory: '../Data/common_voice_en_19713510.mp3'
Failed to process common_voice_en_668563.mp3: 
Failed to process common_voice_en_549541.mp3: 
Failed to process common_voice_en_591799.mp3: 
Failed to process common_voice_en_502401.mp3: 
Failed to process common_voice_en_635518.mp3: 
Failed to process common_voice_en_21704.mp3: [Errno 2] No such file or direct

OSError: Cannot save file into a non-existent directory: '..\CSVs\gender'

In [26]:
labels_df = pd.read_csv('filtered_data_labeled.tsv', sep='\t')
labels_df.head(10)

Unnamed: 0,client_id,path,sentence,up_votes,down_votes,age,gender,accent,label
0,5001d9a0d3f8f5aae6f386f70713b2d5d046edc7ba0068...,common_voice_en_19687170.mp3,He associated with the Formists.,2,1,fifties,female,us,3
1,5001d9a0d3f8f5aae6f386f70713b2d5d046edc7ba0068...,common_voice_en_19687171.mp3,"The ""ultra accelerator"" injection gives Derric...",2,1,fifties,female,us,3
2,5001d9a0d3f8f5aae6f386f70713b2d5d046edc7ba0068...,common_voice_en_19687172.mp3,"Despite running as a joke, candidates have won...",2,0,fifties,female,us,3
3,5001d9a0d3f8f5aae6f386f70713b2d5d046edc7ba0068...,common_voice_en_19687173.mp3,Stoner also guest-starred in the television dr...,2,0,fifties,female,us,3
4,5001d9a0d3f8f5aae6f386f70713b2d5d046edc7ba0068...,common_voice_en_19687174.mp3,Rainelle was named for the Raine family.,2,0,fifties,female,us,3
5,502a6d4e8f53be650018dd4a9097ab9287250e654b148c...,common_voice_en_529569.mp3,He's insane or drunk or something.,2,0,twenties,male,england,0
6,502a6d4e8f53be650018dd4a9097ab9287250e654b148c...,common_voice_en_529573.mp3,I've been very busy the last few weeks.,2,0,twenties,male,england,0
7,509994614d284ab2b1415df149ac1bdb05fef1adf8d33f...,common_voice_en_18421093.mp3,He is a climate change sceptic. Good grief!,2,1,twenties,male,us,0
8,509994614d284ab2b1415df149ac1bdb05fef1adf8d33f...,common_voice_en_18421094.mp3,The director wasn't happy with the change.,2,0,twenties,male,us,0
9,509994614d284ab2b1415df149ac1bdb05fef1adf8d33f...,common_voice_en_18421095.mp3,The roof has been patched up many times over t...,2,0,twenties,male,us,0


In [9]:
import pandas as pd

# Load data
males_females_df = pd.read_csv('../CSVs/males_females.csv')
all_df = pd.read_csv('filtered_data_labeled.tsv', sep='\t')

# Find difference between the two DataFrames on the 'path' column
labels_df = all_df[~all_df['path'].isin(males_females_df['voiceID'])]

In [10]:
males_df = labels_df[labels_df["gender"]=="male"].sample(n=10000, random_state=1)
females_df = labels_df[labels_df["gender"]=="female"].sample(n=10000, random_state=1)
## save to csv
males_df.to_csv("males2.csv", index=False)
females_df.to_csv("females2.csv", index=False)

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3770 entries, 0 to 3769
Data columns (total 30 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   voiceID              3770 non-null   object 
 1   duration             3770 non-null   float64
 2   meanF0Hz             3770 non-null   float64
 3   stdevF0Hz            3770 non-null   float64
 4   HNR                  3770 non-null   float64
 5   IQR                  3770 non-null   float64
 6   sd                   3770 non-null   float64
 7   sfm                  3770 non-null   float64
 8   Q25                  3770 non-null   float64
 9   sp.ent               3770 non-null   float64
 10  mode                 3770 non-null   float64
 11  localJitter          3770 non-null   float64
 12  localabsoluteJitter  3770 non-null   float64
 13  rapJitter            3770 non-null   float64
 14  ppq5Jitter           3769 non-null   float64
 15  ddpJitter            3770 non-null   f

In [None]:
columns = df.columns
df[columns[1:]] = df[columns[1:]].astype('float64')

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3770 entries, 0 to 3769
Data columns (total 30 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   voiceID              3770 non-null   object 
 1   duration             3770 non-null   float64
 2   meanF0Hz             3770 non-null   float64
 3   stdevF0Hz            3770 non-null   float64
 4   HNR                  3770 non-null   float64
 5   IQR                  3770 non-null   float64
 6   sd                   3770 non-null   float64
 7   sfm                  3770 non-null   float64
 8   Q25                  3770 non-null   float64
 9   sp.ent               3770 non-null   float64
 10  mode                 3770 non-null   float64
 11  localJitter          3770 non-null   float64
 12  localabsoluteJitter  3770 non-null   float64
 13  rapJitter            3770 non-null   float64
 14  ppq5Jitter           3769 non-null   float64
 15  ddpJitter            3770 non-null   f

In [None]:
df = df[~df.isna().any(axis=1)]

In [None]:
pcaData = runPCA(df) # Run jitter and shimmer PCA
df = pd.concat([df, pcaData], axis=1) # Add PCA data
# reload the data so it's all numbers
df.to_csv("common_features.csv", index=False)
# df = pd.read_csv('processed_results_2.csv', header=0)
df.sort_values('voiceID').head(20)

Unnamed: 0,voiceID,duration,meanF0Hz,stdevF0Hz,HNR,IQR,sd,sfm,Q25,sp.ent,...,f1_mean,f2_mean,f3_mean,f4_mean,f1_median,f2_median,f3_median,f4_median,JitterPCA,ShimmerPCA
835,../audio/common_voice_en_102830.mp3,3.008083,213.243396,40.068462,11.100741,2.421097,2.717961,0.000115,0.899612,-1530.643707,...,475.928627,1388.67237,2169.631822,3330.441756,488.812591,1411.670381,1987.807183,3140.852555,-0.665802,-2.136635
2226,../audio/common_voice_en_109458.mp3,1.546687,145.115787,26.153789,12.804456,0.713129,1.316116,6e-05,0.436871,-2044.560213,...,450.444842,1363.676142,2222.358525,3224.555126,495.159733,1499.968247,2341.971761,3242.537002,-1.206614,2.255931
2773,../audio/common_voice_en_11017394.mp3,1.653417,159.343114,24.871168,12.792506,0.672118,1.630188,0.000425,0.606903,-2564.609795,...,410.279735,1631.177873,2477.928332,3507.274608,392.467973,1596.197301,2596.258459,3641.383598,-0.958142,0.302211
3039,../audio/common_voice_en_111906.mp3,3.616083,211.342513,43.74577,15.405684,0.726714,1.233943,0.000135,0.62938,-1009.197949,...,489.191744,1340.149941,2161.41766,3075.593601,495.260087,1164.903305,2035.771533,3062.980953,-3.983772,0.348359
2335,../audio/common_voice_en_111907.mp3,1.685437,223.577437,37.056594,15.66298,0.811675,1.637709,4.7e-05,0.576373,-1768.003979,...,489.310341,986.262987,1926.305058,3357.936769,495.541551,929.871759,1648.230695,3311.91152,7.079832,3.592096
1060,../audio/common_voice_en_111908.mp3,2.336146,216.324266,27.084933,17.191967,0.70723,1.212761,0.000665,0.444415,-2359.623297,...,538.249477,1160.844761,2119.842303,3379.859736,507.094524,1184.931831,1897.566531,3406.758811,-1.812533,-0.71105
3552,../audio/common_voice_en_111909.mp3,2.453583,207.804232,49.815519,12.060884,0.987408,1.30242,0.000242,0.749154,-2484.810513,...,494.764971,1702.43506,2475.511682,3699.697447,463.004273,1868.891081,2694.276217,3842.500482,-2.040233,0.653658
2683,../audio/common_voice_en_111910.mp3,2.080104,209.894623,41.380752,15.616589,1.117807,1.55596,0.000236,0.693309,-1814.131462,...,459.404828,1419.661197,2300.613115,3403.651786,448.896812,1163.283949,2184.572612,3365.534142,0.53671,0.671469
3765,../audio/common_voice_en_111912.mp3,2.474813,210.921215,32.519464,16.307838,0.379963,0.871384,0.000194,0.61445,-1365.63493,...,478.169162,1253.843242,2213.795552,3525.079828,464.765269,1095.951591,2292.066629,3722.026831,,
3384,../audio/common_voice_en_111913.mp3,2.0695,214.677739,30.047058,18.034834,0.594433,1.50733,0.000203,0.48312,-2408.251382,...,505.678092,973.511527,1768.249703,2995.732477,461.391523,977.828285,1499.700489,3065.745447,-1.814047,-0.773311


In [None]:
df.head()

Unnamed: 0,voiceID,duration,meanF0Hz,stdevF0Hz,HNR,IQR,sd,sfm,Q25,sp.ent,...,f1_mean,f2_mean,f3_mean,f4_mean,f1_median,f2_median,f3_median,f4_median,JitterPCA,ShimmerPCA
0,../audio/common_voice_en_18534851.mp3,1.653417,222.014108,51.318958,16.82933,2.590946,2.357206,0.000144,0.561122,-2756.461625,...,420.053422,1010.373773,2475.509873,3696.017683,279.806547,864.572129,2733.839435,3663.251888,-3.327086,0.09537
1,../audio/common_voice_en_19721473.mp3,3.658958,197.900264,36.011095,12.566395,1.456908,1.533911,0.000169,0.601329,-1791.164835,...,509.093425,1493.826542,2126.930547,3061.389053,469.603179,1255.784879,2036.851712,2891.063764,-1.810684,0.15676
2,../audio/common_voice_en_19065733.mp3,4.512229,205.211739,38.502619,11.249075,3.795021,2.01789,0.0001,0.755722,-1614.028249,...,504.372708,1503.906439,2171.456365,3316.511035,458.183243,1761.774514,2335.293541,3191.527166,-0.282858,-0.241512
3,../audio/common_voice_en_19703218.mp3,3.530958,224.59404,33.974662,15.751549,1.626245,1.766607,0.000816,0.786259,-1287.487543,...,507.858791,1748.850877,2476.726235,3658.137524,472.41865,1665.459675,2618.281868,3883.66538,-3.12944,-1.297744
4,../audio/common_voice_en_19647089.mp3,2.592146,214.070751,35.974475,10.610504,3.511184,2.139404,0.000139,1.287711,-1704.588366,...,559.68716,1720.47613,2372.949056,3472.616576,533.899874,1715.766331,2423.147832,3270.084778,-0.087711,1.686078


## Next we calculate the vocal-tract length estimates

### Formant position
 Puts, D. A., Apicella, C. L., & Cárdenas, R. A. (2012). Masculine voices signal men's threat potential in forager and industrial societies. Proceedings of the Royal Society of London B: Biological Sciences, 279(1728), 601-609.

In [None]:
df['pF'] = (zscore(df.f1_median) + zscore(df.f2_median) + zscore(df.f3_median) + zscore(df.f4_median)) / 4

### Formant Dispersion
Fitch, W. T. (1997). Vocal tract length and formant frequency dispersion correlate with body size in rhesus macaques. The Journal of the Acoustical Society of America, 102(2), 1213-1222.

In [None]:
df['fdisp'] = (df['f4_median'] - df['f1_median']) / 3

### Fn (Average Formant)
Pisanski, K., & Rendall, D. (2011). The prioritization of voice fundamental frequency or formants in listeners’ assessments of speaker size, masculinity, and attractiveness. The Journal of the Acoustical Society of America, 129(4), 2201-2212.

In [None]:
df['avgFormant'] = (df['f1_median'] + df['f2_median'] + df['f3_median'] + df['f4_median']) / 4

### MFF 
Smith, D. R., & Patterson, R. D. (2005). The interaction of glottal-pulse rate and vocal-tract length in judgements of speaker size, sex, and age. The Journal of the Acoustical Society of America, 118(5), 3177-3186.

In [None]:
df['mff'] = (df['f1_median'] * df['f2_median'] * df['f3_median'] * df['f4_median']) ** 0.25

### Fitch VTL
Fitch, W. T. (1997). Vocal tract length and formant frequency dispersion correlate with body size in rhesus macaques. The Journal of the Acoustical Society of America, 102(2), 1213-1222.

In [None]:
# reload the data again
df.to_csv("common_features.csv", index=False)
df = pd.read_csv('common_features.csv', header=0)

df['fitch_vtl'] = ((1 * (35000 / (4 * df['f1_median']))) +
                   (3 * (35000 / (4 * df['f2_median']))) + 
                   (5 * (35000 / (4 * df['f3_median']))) + 
                   (7 * (35000 / (4 * df['f4_median'])))) / 4

### $\Delta$F 
Reby,D.,& McComb,K.(2003). Anatomical constraints generate honesty: acoustic cues to age and weight in the roars of red deer stags. Animal Behaviour, 65, 519e-530.

In [None]:
xysum = (0.5 * df['f1_median']) + (1.5 * df['f2_median']) + (2.5 * df['f3_median']) + (3.5 * df['f4_median'])
xsquaredsum = (0.5 ** 2) + (1.5 ** 2) + (2.5 ** 2) + (3.5 ** 2)
df['delta_f'] = xysum / xsquaredsum

### VTL($\Delta$F)
Reby,D.,&McComb,K.(2003).Anatomical constraints generate honesty: acoustic cues to age and weight in the roars of red deer stags. Animal Behaviour, 65, 519e-530.

In [None]:
df['vtl_delta_f'] = 35000 / (2 * df['delta_f'])

## Save the final data

In [None]:
# Write out the final dataframe
df.to_csv("common_features_final.csv", index=False)

## Run this to tell you when it's done

In [None]:
print("finished")
raise SystemExit("Stopping notebook execution here.")

finished


SystemExit: Stopping notebook execution here.

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [None]:
import pandas as pd
import numpy as np
import librosa
import parselmouth
from parselmouth.praat import call
from concurrent.futures import ProcessPoolExecutor

In [None]:
x = ['mean', 'var', 'min', 'max', 'p25', 'p75']
mfccColumns = [ f'{att}_{i}' for att in x for i in range(1, 14)]
mfccColumns.insert(0, 'voiceID')
print(mfccColumns)

['voiceID', 'mean_1', 'mean_2', 'mean_3', 'mean_4', 'mean_5', 'mean_6', 'mean_7', 'mean_8', 'mean_9', 'mean_10', 'mean_11', 'mean_12', 'mean_13', 'var_1', 'var_2', 'var_3', 'var_4', 'var_5', 'var_6', 'var_7', 'var_8', 'var_9', 'var_10', 'var_11', 'var_12', 'var_13', 'min_1', 'min_2', 'min_3', 'min_4', 'min_5', 'min_6', 'min_7', 'min_8', 'min_9', 'min_10', 'min_11', 'min_12', 'min_13', 'max_1', 'max_2', 'max_3', 'max_4', 'max_5', 'max_6', 'max_7', 'max_8', 'max_9', 'max_10', 'max_11', 'max_12', 'max_13', 'p25_1', 'p25_2', 'p25_3', 'p25_4', 'p25_5', 'p25_6', 'p25_7', 'p25_8', 'p25_9', 'p25_10', 'p25_11', 'p25_12', 'p25_13', 'p75_1', 'p75_2', 'p75_3', 'p75_4', 'p75_5', 'p75_6', 'p75_7', 'p75_8', 'p75_9', 'p75_10', 'p75_11', 'p75_12', 'p75_13']


In [None]:
def extract_mfcc_statistics(audio, sr, n_mfcc=13):
    # Extract MFCC features
    mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc)
    
    # Compute statistical summaries for each MFCC coefficient
    mfccs_mean = np.mean(mfccs, axis=1)  # Mean
    mfccs_var = np.var(mfccs, axis=1)    # Variance
    mfccs_min = np.min(mfccs, axis=1)    # Minimum
    mfccs_max = np.max(mfccs, axis=1)    # Maximum
    mfccs_percentile_25 = np.percentile(mfccs, 25, axis=1)  # 25th percentile
    mfccs_percentile_75 = np.percentile(mfccs, 75, axis=1)  # 75th percentile
    
    # Combine all statistics into a single feature vector
    mfcc_statistics = np.concatenate([
        mfccs_mean, mfccs_var, mfccs_min, mfccs_max, 
        mfccs_percentile_25, mfccs_percentile_75
    ])
    
    return mfcc_statistics

In [None]:
# Create an empty DataFrame
df = pd.DataFrame(columns=mfccColumns)

# Function to extract features from a single audio file
def extract_features(file_path):
    global mfccColumns
    try:
        # Load and preprocess the audio
        audio, sr = librosa.load(file_path, sr=None, mono=True)
        audio = remove_silence(audio)
        audio = normalize(audio)
        audio = nr.reduce_noise(y=audio, sr=sr)

        mfcc_stats = extract_mfcc_statistics(audio, sr)
        mfcc_stats = mfcc_stats.tolist()
        mfcc_stats.insert(0, file_path)

        f = { key: val for key, val in zip(mfccColumns, mfcc_stats)}

        return f
    except Exception as e:
        print(f"Failed to process {file_path}: {e}")
        return None

# Parallel processing of audio files
def process_audio_files(audio_files):
    global df
    with ProcessPoolExecutor() as executor:
        results = executor.map(extract_features, audio_files)

    # Append results to the DataFrame
    for result in results:
        if result:
            df = pd.concat([df, pd.DataFrame([result])], ignore_index=True)

# List all audio files
audio_files = glob.glob("../audio/*")

# Process files in parallel
process_audio_files(audio_files)

# Save the final DataFrame
df.to_csv("mfcc_features.csv", index=False)
print("Processing complete!")

  audio, sr = librosa.load(file_path, sr=None, mono=True)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Failed to process ../audio/common_voice_en_675766.mp3: 


  audio, sr = librosa.load(file_path, sr=None, mono=True)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Failed to process ../audio/common_voice_en_682788.mp3: 
Failed to process ../audio/common_voice_en_85538.mp3: 
Failed to process ../audio/common_voice_en_675768.mp3: 


  audio, sr = librosa.load(file_path, sr=None, mono=True)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Failed to process ../audio/common_voice_en_682785.mp3: 


  audio, sr = librosa.load(file_path, sr=None, mono=True)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Failed to process ../audio/common_voice_en_85541.mp3: 
Failed to process ../audio/common_voice_en_682778.mp3: 
Failed to process ../audio/common_voice_en_682782.mp3: 
