## Imports



In [3]:
from scipy.fft import fft
from mutagen.wave import WAVE
from parselmouth.praat import call
from scipy.signal import find_peaks
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import librosa
import librosa.display
import parselmouth
import noisereduce as nr
import os 
print(os.getcwd())



c:\Users\Samir\Documents\GitHub\Projet-STT-3795\src\data


##Features processing

###Get data frames

In [4]:
def get_length(path, type):
    audio = WAVE(f'./wav_files_clean/{type}/' + path)
    audio_info = audio.info
    return audio_info.length
#C:\Users\Kamen\ML\Projet-STT-3795\src\data\wav_files_clean\train\train_data.csv
train_df = pd.read_csv('./wav_files_clean/train/train_data.csv')
#train_df['Length'] = train_df['paths'].apply(lambda x: get_length(x, 'train'))

test_df = pd.read_csv('./wav_files_clean/test/test_data.csv')
#test_df['Length'] = test_df['paths'].apply(lambda x: get_length(x, 'test'))

validation_df = pd.read_csv('./wav_files_clean/validation/validation_data.csv')
#validation_df['Length'] = validation_df['paths'].apply(lambda x: get_length(x, 'validation'))
train_df.head()

Unnamed: 0.1,Unnamed: 0,sentence,age,gender,language,paths
0,0,عليك أن تفي بوعدك.,twenties,male,Arabic,common_voice_ar_20401372.wav
1,1,يشبه أباه.,twenties,female,Arabic,common_voice_ar_19216539.wav
2,2,لن يُغَيِّرَ ذلك شيئًا.,fourties,male,Arabic,common_voice_ar_19375914.wav
3,3,كيف حال الجميع ؟,not_defined,not_defined,Arabic,common_voice_ar_19220386.wav
4,4,أتعرف كيف تلعب الشطرنج ؟,not_defined,not_defined,Arabic,common_voice_ar_19803329.wav


###MFCCs

In [5]:
def get_Normalized_Mfccs(data, sample_rate):
    mfccs = librosa.feature.mfcc(y=data, sr=sample_rate, n_mfcc=25)
    mfccs_mean = np.mean(mfccs, axis=1)
    mfccs_std = np.std(mfccs, axis=1)
    mfccs_normalized = ((mfccs.T - mfccs_mean).T) / mfccs_std[:, np.newaxis]
    return mfccs_normalized

###Spectral measurements

In [6]:
def get_spectral_measurements(data, sample_rate):
    spectral_centroids = librosa.feature.spectral_centroid(y=data, sr=sample_rate)[0]
    spectral_rolloff = librosa.feature.spectral_rolloff(y=data, sr=sample_rate)[0]
    spectral_bandwidth = librosa.feature.spectral_bandwidth(y=data, sr=sample_rate)[0]
    spectral_flatness = librosa.feature.spectral_flatness(y=data)[0]
    spectral_contrast = librosa.feature.spectral_contrast(y=data, sr=sample_rate)
    return (spectral_centroids, spectral_rolloff, spectral_bandwidth, spectral_flatness, spectral_contrast)


###Pitch sequence

In [7]:
# Extract the pitch sequence
def get_pitch_sequences(data, sample_rate):
    pitches, magnitudes = librosa.core.piptrack(y=data, sr=sample_rate)
    # Select the dominant pitch at each frame
    pitch_track = []
    for t in range(pitches.shape[1]):
        index = magnitudes[:, t].argmax()
        pitch = pitches[index, t]
        pitch_track.append(pitch)

    pitch_track = np.array(pitch_track)

    # Remove zeros values (unvoiced frames)
    pitch_track = pitch_track[pitch_track > 0]
    return pitch_track

###Formants

In [8]:
### Get formants data ###
def get_formants(path):
    audio = parselmouth.Sound(path)
    formants = audio.to_formant_burg()
    number_points = int(audio.duration / 0.01) + 1
    formant_data = {'time': [], 'F1': [], 'F2': [], 'F3': []}
    for i in range(number_points):
        time = i * 0.01
        formant_data['time'].append(time)
        formant_data['F1'].append(formants.get_value_at_time(1, time))
        formant_data['F2'].append(formants.get_value_at_time(2, time))
        formant_data['F3'].append(formants.get_value_at_time(3, time))

    return formant_data


###RMS and ZCR

In [9]:
### Energy and Amplitude Features ###

def get_rms_energy(data):
    # Root Mean Square (RMS) Energy - with a frame length of 2048 (default)
    return librosa.feature.rms(y=data, frame_length=2048, hop_length=512)

def get_ZCR(data):
    # Zero-Crossing Rate (ZCR) - with a frame length of 2048 (default)
    return librosa.feature.zero_crossing_rate(y=data, frame_length=2048, hop_length=512)

###HNR

In [10]:
### Voice Quality Features ###
def get_HNR(data, sample_rate):
    # Load the cleaned sound into parselmouth.Sound
    snd = parselmouth.Sound(data, sample_rate)
    # Harmonics-to-Noise Ratio (HNR)
    hnr = call(snd, "To Harmonicity (cc)", 0.01, 75, 0.1, 1.0)
    #print(hnr)
    hnr_mean = call(hnr, "Get mean", 0, 0)
    return hnr_mean

### Features matrix

In [11]:
training_audios = train_df['paths'].tolist()
test_audios = test_df['paths'].tolist()
validation_audios = validation_df['paths'].tolist()

def extract_zcr_features(zcr_vector, hnr_mean): 
    # Calculate aggregated features
    features = {
        "mean_zcr": np.mean(zcr_vector),
        "std_dev_zcr": np.std(zcr_vector),
        "median_zcr": np.median(zcr_vector),
        "min_zcr": np.min(zcr_vector),
        "max_zcr": np.max(zcr_vector),
        "q25_zcr": np.percentile(zcr_vector, 25),
        "q75_zcr": np.percentile(zcr_vector, 75),
        "iqr_zcr": np.percentile(zcr_vector, 75) - np.percentile(zcr_vector, 25),
        "hnr_mean": hnr_mean
    }

    # Convert the features dictionary to a pandas DataFrame
    features_df = pd.DataFrame([features])
    
    return features_df    

def extract_rms_features(rms_energy): 
    features = {
            "mean_energy": np.mean(rms_energy),
            "std_dev_energy": np.std(rms_energy),
            "median_energy": np.median(rms_energy),
            "min_energy": np.min(rms_energy),
            "max_energy": np.max(rms_energy),
            "energy_range": np.max(rms_energy) - np.min(rms_energy),
            "q25_energy": np.percentile(rms_energy, 25),
            "q75_energy": np.percentile(rms_energy, 75),
            "energy_iqr": np.percentile(rms_energy, 75) - np.percentile(rms_energy, 25),
            "energy_variability": np.sum(np.abs(np.diff(rms_energy))),
            "zero_crossing_rate": np.sum(np.diff(np.sign(np.diff(rms_energy))) != 0) / len(rms_energy),
            "low_energy_frame_rate": np.sum(rms_energy < (0.5 * np.mean(rms_energy))) / len(rms_energy)
        }

    features_df = pd.DataFrame([features])

    return features_df


def extract_formants(f1, f2, f3): 
    f1, f2, f3 = map(lambda x: np.nan_to_num(np.asarray(x)), [f1, f2, f3])
    features = {}
    for formant, name in zip([f1, f2, f3], ['F1', 'F2', 'F3']):
        features[f'{name}_mean'] = np.nanmean(formant)
        features[f'{name}_std_dev'] = np.nanstd(formant)
        features[f'{name}_median'] = np.nanmedian(formant)
        features[f'{name}_min'] = np.nanmin(formant)
        features[f'{name}_max'] = np.nanmax(formant)
        features[f'{name}_range'] = np.nanmax(formant) - np.nanmin(formant)
        features[f'{name}_q25'] = np.nanpercentile(formant, 25)
        features[f'{name}_q75'] = np.nanpercentile(formant, 75)
        features[f'{name}_iqr'] = np.nanpercentile(formant, 75) - np.nanpercentile(formant, 25)
        # Ensure there are at least 2 elements to compute diff, otherwise default to 0
        features[f'{name}_delta_sum'] = np.sum(np.abs(np.diff(formant))) if len(formant) > 1 else 0

    if all(len(formant) > 0 for formant in [f1, f2, f3]):
        features['F2_F1_diff_mean'] = np.nanmean(f2 - f1)
        features['F3_F2_diff_mean'] = np.nanmean(f3 - f2)
    else:
        features['F2_F1_diff_mean'], features['F3_F2_diff_mean'] = 0, 0

    features_df = pd.DataFrame([features])
    return features_df

def extract_spectre(data, type):
    # Define feature names
    feature_names = ['mean_' + type, 'std_' + type, 'median_' + type, 'min_' + type, 'max_' + type, 'q25_' + type, 'q75_' + type]
    
    # Compute the features
    mean_val = np.mean(data)
    std_val = np.std(data)
    median_val = np.median(data)
    min_val = np.min(data)
    max_val = np.max(data)
    q25, q75 = np.percentile(data, [25, 75])
    
    # Collect features into a list
    features = [mean_val, std_val, median_val, min_val, max_val, q25, q75]
    
    # Create a DataFrame from the features list
    features_df = pd.DataFrame([features], columns=feature_names)
    
    return features_df

# Outputs 3 features per band (Contrast peak, temporal evolution, rate of change) = 21 
# Mean and std of spectral contrast
def extract_contrast(spectral_contrast):
    features = []
    feature_names = []

    # Iterate over each frequency band to calculate band-specific features
    for band in range(spectral_contrast.shape[0]):
        contrast_band = spectral_contrast[band, :]

        # Count significant peaks
        peaks, _ = find_peaks(contrast_band, height=np.mean(contrast_band))
        features.append(len(peaks))
        feature_names.append(f'band_{band}_peaks')

        # Temporal evolution: difference between means of the first and second halves
        mid_point = len(contrast_band) // 2
        mean_diff = np.mean(contrast_band[mid_point:]) - np.mean(contrast_band[:mid_point])
        features.append(mean_diff)
        feature_names.append(f'band_{band}_mean_diff')

        # Rate of change (derivative)
        derivative = np.mean(np.abs(np.diff(contrast_band)))
        features.append(derivative)
        feature_names.append(f'band_{band}_derivative')

    # Add overall statistical measures for the entire spectral contrast matrix
    overall_mean = np.mean(spectral_contrast)
    features.append(overall_mean)
    feature_names.append('overall_mean')

    overall_std = np.std(spectral_contrast)
    features.append(overall_std)
    feature_names.append('overall_std')

    # Convert the features list into a DataFrame
    features_df = pd.DataFrame([features], columns=feature_names)

    return features_df

def mfccs_to_df(mfcc_means,mfcc_stds): 
    # Ensure mfcc_means and mfcc_stds are flat arrays
    mfcc_means = mfcc_means.flatten()
    mfcc_stds = mfcc_stds.flatten()
    
    # Generate column names
    mean_col_names = [f'MFCC_mean_{i+1}' for i in range(len(mfcc_means))]
    std_col_names = [f'MFCC_std_{i+1}' for i in range(len(mfcc_stds))]
    
    # Combine the MFCC means and stds into a single DataFrame
    mfcc_features_df = pd.DataFrame([np.concatenate([mfcc_means, mfcc_stds])],
                                    columns=mean_col_names + std_col_names)
    
    return mfcc_features_df

type = [(training_audios, 'train'), (test_audios, 'test'), (validation_audios, 'validation')]
c=0
for i in range(len(type)):
    attributes_df = pd.DataFrame()
    for j in range(len(type[i][0])):
        c+=1
        print(c)
        path = './wav_files_clean/' + type[i][1] + '/' + type[i][0][j]
        print(type[i][0][j] + " " + path)
        #sample_rate represent the number of samples per seconds in the original signal.
        data, sample_rate = librosa.load(path, sr=None)
        #clean data
        data = nr.reduce_noise(y=data, sr=sample_rate)

        #Get the attributes
        mfccs = get_Normalized_Mfccs(data, sample_rate)
        specs_measurements = get_spectral_measurements(data, sample_rate)
        pitch_track = get_pitch_sequences(data, sample_rate)
        formants_data = get_formants(path)
        rms_energy = get_rms_energy(data)
        zcr = get_ZCR(data)
        hnr_mean = get_HNR(data, sample_rate)
        # mfccs: Get mean and std atributes
        mfccs = mfccs_to_df(np.mean(mfccs, axis = 1),np.std(mfccs, axis = 1))

        # Spec measurements: Use extract specter to produce features
        spectre_centroid_df = extract_spectre(specs_measurements[0], "centroid")
        spectre_rollof_df = extract_spectre(specs_measurements[1], "rollof")
        spectre_bandwidth_df = extract_spectre(specs_measurements[2], "bandwidth")
        spectre_flatness_df = extract_spectre(specs_measurements[3], "flatness")
        

        # Spectre contrast
        spectre_contrast_df = extract_contrast(specs_measurements[4])

        # Pitch track: use extract spectre method
        # Add IQR and pitch delta sum -> 9 features
        #print(pitch_track.shape)
        pitch_track_df = extract_spectre(pitch_track, "pitch_track")
        # Formants : Call extract_formants with f1, f2, f3
        # Returns dataframe with 32 features
        formants_df = extract_formants(formants_data["F1"], formants_data["F2"], formants_data["F3"])
        
        # RMS
        rms_energy_df = extract_rms_features(rms_energy[0])

        if type[i][1] == 'train':
            label = pd.Series(train_df['language'][j])
        elif type[i][1] == 'test':
            label = pd.Series(test_df['language'][j])
        elif type[i][1] == 'validation':
            label = pd.Series(validation_df['language'][j])


        #ZCR and HNR mean
        combined_features_row = pd.concat([mfccs, spectre_centroid_df,\
                                            spectre_rollof_df, \
                                                spectre_bandwidth_df,\
                                                      spectre_flatness_df, \
                                                        spectre_contrast_df,\
                                                            pitch_track_df,\
                                                                formants_df, \
                                                                    rms_energy_df, label], axis = 1)

        """
        row = pd.DataFrame({'Audio': audio ,'MFCCs': [np.array(mfccs)],
                                    'Spec Centroid': [specs_measurements[0]], 'Spec Rollof': [specs_measurements[1]],
                                    'Spec Bandwidth': [specs_measurements[2]], 'Spec Flatness': [specs_measurements[3]],
                                    'Spec Contrast': [specs_measurements[4]], 'Pitch Track': [pitch_track],
                                    'Formants': [formants_data], 'RMS Energy': [rms_energy],
                                    'ZCR': [zcr], 'HNR Mean': [hnr_mean]})
        """
        attributes_df = pd.concat([attributes_df, combined_features_row], ignore_index=True)
        
    """
    if type[i][1] == 'train':
        attributes_df['label'] = train_df['language']
    elif type[i][1] == 'test':
        attributes_df['label'] = test_df['language']
    elif type[i][1] == 'validation':
        attributes_df['label'] = validation_df['language']
      
    attributes_df.to_csv(type[i][1] + "_preprocessed_data")
    """

1
common_voice_ar_20401372.wav ./wav_files_clean/train/common_voice_ar_20401372.wav
2
common_voice_ar_19216539.wav ./wav_files_clean/train/common_voice_ar_19216539.wav
3
common_voice_ar_19375914.wav ./wav_files_clean/train/common_voice_ar_19375914.wav
4
common_voice_ar_19220386.wav ./wav_files_clean/train/common_voice_ar_19220386.wav
5
common_voice_ar_19803329.wav ./wav_files_clean/train/common_voice_ar_19803329.wav
6
common_voice_ar_20026829.wav ./wav_files_clean/train/common_voice_ar_20026829.wav
7
common_voice_ar_19529991.wav ./wav_files_clean/train/common_voice_ar_19529991.wav
8
common_voice_ar_19083375.wav ./wav_files_clean/train/common_voice_ar_19083375.wav
9
common_voice_ar_19380209.wav ./wav_files_clean/train/common_voice_ar_19380209.wav
10
common_voice_ar_19476981.wav ./wav_files_clean/train/common_voice_ar_19476981.wav
11
common_voice_ar_19205882.wav ./wav_files_clean/train/common_voice_ar_19205882.wav
12
common_voice_ar_19204113.wav ./wav_files_clean/train/common_voice_ar_19



311
common_voice_ar_19541056.wav ./wav_files_clean/train/common_voice_ar_19541056.wav
312
common_voice_ar_19843022.wav ./wav_files_clean/train/common_voice_ar_19843022.wav
313
common_voice_ar_20732070.wav ./wav_files_clean/train/common_voice_ar_20732070.wav
314
common_voice_ar_21204173.wav ./wav_files_clean/train/common_voice_ar_21204173.wav
315
common_voice_ar_19963353.wav ./wav_files_clean/train/common_voice_ar_19963353.wav
316
common_voice_ar_19222349.wav ./wav_files_clean/train/common_voice_ar_19222349.wav


: 

In [None]:
attributes_df

Unnamed: 0,MFCC_mean_1,MFCC_mean_2,MFCC_mean_3,MFCC_mean_4,MFCC_mean_5,MFCC_mean_6,MFCC_mean_7,MFCC_mean_8,MFCC_mean_9,MFCC_mean_10,...,min_energy,max_energy,energy_range,q25_energy,q75_energy,energy_iqr,energy_variability,zero_crossing_rate,low_energy_frame_rate,0
0,2.157688e-07,-1.192093e-09,-4.827976e-08,1.072884e-08,-4.798174e-08,-1.585484e-07,-1.907349e-08,8.940697e-10,-2.145767e-08,1.192093e-08,...,0.003027612,0.068259,0.065231,0.02856073,0.048825,0.020265,0.345372,0.3,0.18,Arabic
1,2.880891e-07,-4.967054e-08,-3.973643e-08,-1.241763e-09,0.0,-3.973643e-08,-7.202228e-08,-3.973643e-08,0.0,1.490116e-08,...,0.02278906,0.108572,0.085782,0.04604545,0.079658,0.033613,0.269362,0.208333,0.041667,Arabic
2,2.037395e-07,4.118139e-08,2.844767e-08,-6.502325e-09,-2.384186e-08,7.586046e-08,-2.004884e-08,1.950697e-08,3.251163e-08,-1.408837e-08,...,0.001313695,0.094589,0.093276,0.01617604,0.038534,0.022358,0.377113,0.272727,0.2,Arabic
3,-1.739811e-07,7.732494e-08,-1.822372e-08,8.054681e-09,-9.665618e-09,1.288749e-08,-2.094217e-08,4.832809e-09,-2.416404e-08,-8.86015e-09,...,0.0009262349,0.077696,0.07677,0.02581857,0.063184,0.037365,0.235225,0.297297,0.162162,Arabic
4,2.78155e-07,-1.655685e-08,3.311369e-08,9.106265e-09,2.317958e-08,2.649095e-08,-1.324548e-08,-1.986822e-08,2.649095e-08,-1.986822e-08,...,0.001195703,0.038037,0.036842,0.00612662,0.019686,0.013559,0.27394,0.208333,0.277778,Arabic
5,2.242082e-07,-2.52629e-08,1.263145e-08,-6.315724e-09,-3.157862e-08,-1.815771e-08,1.263145e-08,-3.157862e-09,1.263145e-08,-1.263145e-08,...,1.748592e-18,0.114691,0.114691,0.00141677,0.040171,0.038755,0.947285,0.231788,0.503311,Arabic
6,6.00762e-08,-3.168855e-08,1.35808e-08,9.05387e-09,2.037121e-08,-4.376037e-08,-1.810774e-08,-7.544892e-09,-1.448619e-07,2.716161e-08,...,0.0,0.105521,0.105521,0.002374881,0.040885,0.03851,0.485798,0.227848,0.392405,Arabic
7,-1.383679e-07,-5.676633e-08,1.986822e-08,2.829447e-08,9.650276e-08,-5.676633e-09,-6.244296e-08,-1.135327e-08,-7.379623e-08,-5.676633e-09,...,0.0004492012,0.107352,0.106903,0.03935456,0.054963,0.015608,0.262174,0.333333,0.142857,Arabic
8,4.638088e-07,-3.647935e-08,1.5634e-08,1.042267e-08,-4.169068e-08,-4.559918e-08,0.0,0.0,1.042267e-08,-7.817002e-09,...,4.260098e-12,0.035795,0.035795,0.001227819,0.009033,0.007805,0.309905,0.180328,0.469945,Arabic
9,8.871389e-08,5.821849e-08,-2.772309e-08,5.544618e-09,-5.310704e-08,-3.742617e-08,-3.04954e-08,-8.057023e-08,-1.108924e-08,-2.772309e-09,...,0.0008926935,0.164103,0.16321,0.01071744,0.10641,0.095693,0.421306,0.255814,0.465116,Arabic


## Features visualisation

### Initialisation

In [None]:
data, sample_rate = librosa.load(path=f'./wav_files/validation/{validation_df['paths'][0]}')
data = nr.reduce_noise(y=data, sr=sample_rate)

SyntaxError: f-string: unmatched '[' (3879951603.py, line 1)

### MFCCs visualisation

In [None]:
normalized_mfccs = get_Normalized_Mfccs(data, sample_rate)

plt.figure(figsize=(12,6))
librosa.display.specshow(normalized_mfccs, x_axis='time')
plt.colorbar()
plt.title('MFCCs')
plt.tight_layout()
plt.show()

### Spectral measurements visualisation

In [None]:
specs = get_spectral_measurements(data, sample_rate)

t = librosa.frames_to_time(range(len(specs[0])), sr=sample_rate)

# Plotting the Spectral Features
plt.figure(figsize=(12, 4))
plt.plot(t, specs[0], color='red', label='Centroid')
plt.plot(t, specs[1], color='blue', label='Rolloff')
plt.plot(t, specs[2], color='green', label='Bandwidth')

# For spectral flatness, there's no need to convert to dB since it's a ratio and typically small.
#plt.plot(t, spectral_flatness, color='orange', label='Flatness')

# For spectral contrast, it's common to average over the frequency bands since it returns an array of shape (n_bands, n_frames).
#spectral_contrast_avg = np.mean(spectral_contrast, axis=0)
#plt.plot(t, spectral_contrast_avg, color='black', label='Contrast')

plt.xlabel("Time (s)")
plt.ylabel("Spectral Feature Value")
plt.title("Spectral Features Over Time")
plt.legend(loc='best')
plt.show()

### Pitches visualisation

In [None]:
pitch_track = get_pitch_sequences(data, sample_rate)
plt.figure(figsize=(12, 6))
plt.plot(pitch_track)
plt.xlabel('Time (frames)')
plt.ylabel('Frequency (Hz)')
plt.title('Pitch Track')
plt.show()

### RMS Visualisation

In [None]:
rms_energy = get_rms_energy(data)
frames = range(len(rms_energy[0]))
t = librosa.frames_to_time(frames, sr=sample_rate)

plt.figure(figsize=(12, 4))
plt.plot(t, rms_energy[0], label='RMS Energy')
plt.xlabel("Time (s)")
plt.ylabel("Energy")
plt.title("RMS Energy Over Time")
plt.legend()
plt.show()