In [10]:
import librosa
import numpy as np
import pandas as pd
import parselmouth
from scipy.signal import find_peaks
import os
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.signal import correlate

In [9]:
# Function to load an audio file
def load_audio(file_path):
    audio, sample_rate = librosa.load(file_path)
    return audio, sample_rate

In [8]:
db_directory = 'tess_database/dataverse_files'

##### Prosodic feature set extraction

In [1]:
# Function to extract pitch
def extract_pitch(y, sr):
    pitches, magnitudes = librosa.core.piptrack(y=y, sr=sr)
    pitch = []
    for t in range(pitches.shape[1]):
        index = magnitudes[:, t].argmax()
        pitch.append(pitches[index, t])
    pitch = np.trim_zeros(np.array(pitch))
    return np.mean(pitch), np.std(pitch)  # Returning mean and standard deviation

In [2]:
# Function to extract energy
def extract_energy(y):
    frame_length = 1024
    energy = np.array([
        np.sum(np.abs(y[i:i+frame_length]**2))
        for i in range(0, len(y), frame_length)
    ])
    return np.mean(energy), np.std(energy)  # Returning mean and standard deviation

In [3]:
# Function to extract duration features
def extract_duration_features(y, sr):
    duration = librosa.get_duration(y=y, sr=sr)
    tempogram = librosa.feature.tempogram(y=y, sr=sr)
    speech_rate = np.mean(tempogram)  # Approximation for speech rate
    return duration, speech_rate

In [4]:
# Function to extract silence duration
def extract_silence_duration(y, sr, threshold=0.02):
    y_abs = np.abs(y)
    silence_duration = np.sum(y_abs < threshold) / sr
    return silence_duration

In [5]:
# Function to calculate voiced vs. unvoiced speech duration
def voiced_unvoiced(y, sr, hop_length=512):
    # Using zero-crossing rate to estimate voiced and unvoiced segments
    zero_crossings = librosa.feature.zero_crossing_rate(y, frame_length=hop_length, hop_length=hop_length)
    voiced = np.sum(zero_crossings > 0.1) * hop_length / sr  # Threshold for voiced segments
    unvoiced = np.sum(zero_crossings <= 0.1) * hop_length / sr  # Threshold for unvoiced segments
    return voiced, unvoiced

In [6]:
# Function to extract pitch variation
def extract_pitch_variation(y, sr):
    pitches, _ = librosa.core.piptrack(y=y, sr=sr)
    pitch_variation = np.var(pitches)
    return pitch_variation

In [7]:
# Main function to extract all features
def extract_prosodic_features(file_path):
    y, sr = load_audio(file_path)
    mean_pitch, std_pitch = extract_pitch(y, sr)
    mean_energy, std_energy = extract_energy(y)
    duration, speech_rate = extract_duration_features(y, sr)
    silence_duration = extract_silence_duration(y, sr)
    voiced_duration, unvoiced_duration = voiced_unvoiced(y, sr)
    pitch_variation = extract_pitch_variation(y, sr)
    
    features = {
        'mean_pitch': mean_pitch,
        'std_pitch': std_pitch,
        'mean_energy': mean_energy,
        'std_energy': std_energy,
        'duration': duration,
        'speech_rate': speech_rate,
        'silence_duration': silence_duration,
        'voiced_duration': voiced_duration,
        'unvoiced_duration': unvoiced_duration,
        'pitch_variation': pitch_variation
    }
    
    return features

In [11]:
# Initialize an empty list to store feature data
data = []

# Iterate over files in the EmoDB directory
for filename in os.listdir(db_directory):
    if filename.endswith(".wav"):  # Assuming the files are in WAV format
        file_path = os.path.join(db_directory, filename)
        # Extract features from the audio file
        features = extract_prosodic_features(file_path)  # Your feature extraction function
        # Include the filename or other identifiers as needed
        features['filename'] = filename
        # Append the features to the data list
        data.append(features)

# Convert the list of feature dictionaries to a DataFrame
df = pd.DataFrame(data)

# Display the first few rows of the DataFrame to verify
print(df.head())

# Define the path for the output CSV file
output_csv_path = 'features/prosodic_features_test.csv'

# Write the DataFrame to a CSV file
df.to_csv(output_csv_path, index=False)

print(f'Features extracted and saved to {output_csv_path}')

    mean_pitch    std_pitch  mean_energy  std_energy  duration  speech_rate  \
0   449.610199   381.894135     9.676556   11.913024  2.075601     0.064879   
1  1007.365051  1288.699097     0.327053    0.572303  2.345760     0.097476   
2   792.386292  1063.287964     0.588500    0.856471  1.794921     0.059025   
3   708.017578  1036.089600     0.109407    0.107576  1.861633     0.159514   
4   516.532471   480.635956     4.721364    6.218035  1.650204     0.054590   

   silence_duration  voiced_duration  unvoiced_duration  pitch_variation  \
0          0.940317         0.696599           1.393197     17896.482422   
1          1.993787         0.882358           1.486077     47069.945312   
2          1.316961         0.255420           1.555737     13634.870117   
3          1.706621         0.487619           1.393197     12398.823242   
4          0.870113         0.673379           0.998458     52227.867188   

               filename  
0    YAF_wire_happy.wav  
1   OAF_fat_disg

##### Acoustic feature set extraction

In [12]:
# Function to extract MFCC features
def extract_mfcc(audio, sample_rate, n_mfcc=13):
    mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=n_mfcc)
    return np.mean(mfccs.T, axis=0)

In [13]:
# Function to extract LPCC features
def extract_lpcc(audio, sample_rate, n_lpcc=13):
    lpccs = librosa.lpc(y=audio, order=n_lpcc)
    return lpccs

In [14]:
# Function to extract LFPC features
def extract_lfpc(audio, sample_rate, n_bands=40):
    stft = np.abs(librosa.stft(audio))
    lfpcs = librosa.feature.melspectrogram(S=stft**2, sr=sample_rate, n_mels=n_bands)
    return np.mean(lfpcs.T, axis=0)

In [15]:
# Function to extract GFCC features
def extract_gfcc(audio, sample_rate, n_gfcc=13):
    gfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=n_gfcc, htk=True)
    return np.mean(gfccs.T, axis=0)

In [16]:
# Function to extract formants
def extract_formants(audio, sample_rate):
    pitches, magnitudes = librosa.core.piptrack(y=audio, sr=sample_rate)
    formants = np.max(magnitudes, axis=0)
    return formants

In [17]:
# Main function to extract all features
def extract_acoustic_features(file_path):
    audio, sample_rate = load_audio(file_path)
    features = {}
    features['mfcc'] = extract_mfcc(audio, sample_rate)
    features['lpcc'] = extract_lpcc(audio, sample_rate)
    features['lfpc'] = extract_lfpc(audio, sample_rate)
    features['gfcc'] = extract_gfcc(audio, sample_rate)
    features['formants'] = extract_formants(audio, sample_rate)
    return features

In [18]:
# Initialize an empty list to store feature data
data = []

# Iterate over files in the EmoDB directory
for filename in os.listdir(db_directory):
    if filename.endswith(".wav"):  # Assuming the files are in WAV format
        file_path = os.path.join(db_directory, filename)
        # Extract features from the audio file
        features = extract_acoustic_features(file_path)  # Your feature extraction function
        # Include the filename or other identifiers as needed
        features['filename'] = filename
        # Append the features to the data list
        data.append(features)

# Convert the list of feature dictionaries to a DataFrame
df = pd.DataFrame(data)

# Display the first few rows of the DataFrame to verify
print(df.head())

# Define the path for the output CSV file
output_csv_path = 'features/acoustic_features_test.csv'

# Write the DataFrame to a CSV file
df.to_csv(output_csv_path, index=False)

print(f'Features extracted and saved to {output_csv_path}')

                                                mfcc  \
0  [-300.5379, 40.605106, -28.108757, 17.45625, -...   
1  [-426.9512, 71.14616, 3.221315, -10.768615, 13...   
2  [-441.6265, 92.82772, 19.227163, -14.2388115, ...   
3  [-515.32117, 59.07907, 16.712137, 10.546044, 1...   
4  [-295.1762, 41.771236, -15.355103, 15.900852, ...   

                                                lpcc  \
0  [1.0, -1.7448161, 1.0000771, 0.1465317, -0.435...   
1  [1.0, -0.8589723, -0.024427855, -0.314105, -0....   
2  [1.0, -0.9826942, -0.07148894, -0.59121585, 0....   
3  [1.0, -1.0153912, 0.19047754, -0.6288274, 0.62...   
4  [1.0, -1.2599481, 0.16007785, 0.21459147, 0.06...   

                                                lfpc  \
0  [0.0023599719, 0.49987122, 2.4315553, 2.492736...   
1  [0.0057541984, 0.15236326, 0.28910416, 0.20294...   
2  [0.092098996, 0.15914197, 0.42431548, 0.923114...   
3  [0.0030326215, 0.14114383, 0.04965497, 0.30300...   
4  [0.049695488, 0.19012822, 1.384158, 6.00092