In [6]:
import librosa
import librosa.display
import os
import numpy as np
import math
import re 

In [7]:
#torgoPath = r"C:\Users\Salma\Downloads\GP\Torgodata"
UASpeech = r"D:\GP\UASpeech\data"

In [8]:
def readDataSetFiles(DataSetPath,labels=True):
    dataSetAllMainFolders= sorted(os.listdir(DataSetPath)) #[f_audio,f_label,m_audio,m_label]
    allDataFilesInDataSetPaths =[DataSetPath + '\\' + Folder+'\\'+File for Folder in dataSetAllMainFolders for File in sorted(os.listdir(DataSetPath + '\\' + Folder)) if re.search('_audio',Folder)] #get all wav files in one list
    if labels:
        allLabelFilesInDataSetPaths=[DataSetPath + '\\' + Folder+'\\'+File for Folder in dataSetAllMainFolders         for File in sorted(os.listdir(DataSetPath + '/' + Folder)) if re.search('_label',Folder)] #get all txt files in one list
    else:allLabelFilesInDataSetPaths=[]
    del(dataSetAllMainFolders)
    return allDataFilesInDataSetPaths,allLabelFilesInDataSetPaths

In [9]:
allDataFilesInDataSetPaths,allLabelFilesInDataSetPaths=readDataSetFiles(UASpeech)

In [10]:
print(len(allDataFilesInDataSetPaths))
print(len(allLabelFilesInDataSetPaths))
print('first data file',allDataFilesInDataSetPaths[:1],'last data file',allDataFilesInDataSetPaths[-1:])
print('first label file',allLabelFilesInDataSetPaths[:1],'last label file',allLabelFilesInDataSetPaths[-1:])

7526
0
first data file ['D:\\GP\\UASpeech\\data\\CF02_audio\\CF02_0001.wav'] last data file ['D:\\GP\\UASpeech\\data\\F02_audio\\F02_3701.wav']
first label file [] last label file []


In [11]:
def readLabelsLinesFromTxt(dataSetPath):
    errorReadLabelTxtFile,allLabelFilesInDataSetPaths=[],[]
    labelsFiles=[labelFiles for labelFiles in os.listdir(dataSetPath+'\\..') if labelFiles.endswith('.txt')] 
    for labelsFileName in labelsFiles:
        try:
            with open(dataSetPath+'\\..\\'+labelsFileName) as labelTxtFile:
                allLabelFilesInDataSetPaths=sorted(labelTxtFile.readlines())
                allLabelFilesInDataSetPaths=[line.split(':')[-1].strip() for line in allLabelFilesInDataSetPaths]
        except:
            errorReadLabelTxtFile.append(labelsFileName)
    return allLabelFilesInDataSetPaths,errorReadLabelTxtFile

In [12]:
allLabelFilesInDataSetPaths,errorReadLabelTxtFile=readLabelsLinesFromTxt(UASpeech)

In [13]:
print(len(allLabelFilesInDataSetPaths))
print(len(errorReadLabelTxtFile))

7526
0


# Exctract Features Using PNCC
### you need to install librosa version !pip install librosa==0.6.0

In [14]:
from librosa.core import stft
from librosa import filters
from librosa import to_mono
import scipy

In [15]:
def medium_time_power_calculation(power_stft_signal, M=2):
    medium_time_power = np.zeros_like(power_stft_signal)
    power_stft_signal = np.pad(power_stft_signal, [(M, M), (0, 0)], 'constant')
    for i in range(medium_time_power.shape[0]):
        medium_time_power[i, :] = sum([1 / float(2 * M + 1) *power_stft_signal[i + k - M, :] for k in range(2 * M + 1)])
    return medium_time_power

In [16]:
def asymmetric_lawpass_filtering(rectified_signal, lm_a=0.999, lm_b=0.5):
    floor_level = np.zeros_like(rectified_signal)
    floor_level[0, ] = 0.9 * rectified_signal[0, ]
    for m in range(floor_level.shape[0]):
        floor_level[m, ] = np.where(rectified_signal[m, ] >=
                                    floor_level[m - 1, ],
                                    lm_a * floor_level[m - 1, ] +
                                    (1 - lm_a) * rectified_signal[m, ],
                                    lm_b * floor_level[m - 1, ] +
                                    (1 - lm_b) * rectified_signal[m, ])

    return floor_level

In [17]:
def halfwave_rectification(subtracted_lower_envelope, th=0):
    return np.where(subtracted_lower_envelope < th,
                    np.zeros_like(subtracted_lower_envelope),
                    subtracted_lower_envelope)

In [18]:
def temporal_masking(rectified_signal, lam_t=0.85, myu_t=0.2):
        # rectified_signal[m, l]
    temporal_masked_signal = np.zeros_like(rectified_signal)
    online_peak_power = np.zeros_like(rectified_signal)
    temporal_masked_signal[0, :] = rectified_signal[0, ]
    online_peak_power[0, :] = rectified_signal[0, :]
    for m in range(1, rectified_signal.shape[0]):
        online_peak_power[m, :] = np.maximum(lam_t * online_peak_power[m-1, :],
                                             rectified_signal[m, :])
        temporal_masked_signal[m, :] = np.where(
            rectified_signal[m, :] >= lam_t * online_peak_power[m - 1, :],
            rectified_signal[m, :],
            myu_t * online_peak_power[m - 1, :])

    return temporal_masked_signal


In [19]:
def switch_excitation_or_non_excitation(temporal_masked_signal,
                                        floor_level, lower_envelope,
                                        medium_time_power, c=2):
    return np.where(medium_time_power >= c * lower_envelope,
                    temporal_masked_signal, floor_level)

In [20]:
def weight_smoothing(final_output, medium_time_power, N=4, L=128):

    spectral_weight_smoothing = np.zeros_like(final_output)
    for m in range(final_output.shape[0]):
        for l in range(final_output.shape[1]):
            l_1 = max(l - N, 1)
            l_2 = min(l + N, L)
            spectral_weight_smoothing[m, l] = (1/float(l_2 - l_1 + 1)) * \
            sum([(final_output[m, l_] / medium_time_power[m, l_])
                 for l_ in range(l_1, l_2)])
    return spectral_weight_smoothing

In [21]:
def time_frequency_normalization(power_stft_signal,
                                 spectral_weight_smoothing):
    return power_stft_signal * spectral_weight_smoothing


In [22]:
def mean_power_normalization(transfer_function,
                             final_output, lam_myu=0.999, L=80, k=1):
    myu = np.zeros(shape=(transfer_function.shape[0]))
    myu[0] = 0.0001
    normalized_power = np.zeros_like(transfer_function)
    for m in range(1, transfer_function.shape[0]):
        myu[m] = lam_myu * myu[m - 1] + \
            (1 - lam_myu) / L * \
            sum([transfer_function[m, s] for s in range(0, L - 1)])
    normalized_power = k * transfer_function / myu[:, None]

    return normalized_power

In [23]:
def power_function_nonlinearity(normalized_power, n=15):
    return normalized_power ** float(1 / n)

In [24]:
def pncc(audio_wave, n_fft=512, sr=16000, winlen=0.020, winstep=0.010,
         n_mels=128, n_pncc=40, weight_N=4, power=2):

    pre_emphasis_signal = scipy.signal.lfilter([1.0, -0.97], 1, audio_wave)
    mono_wave = to_mono(pre_emphasis_signal.T)
    stft_pre_emphasis_signal = np.abs(stft(mono_wave,
                                           n_fft=n_fft,
                                           hop_length=int(sr * winstep),
                                           win_length=int(sr * winlen),
                                           window=np.ones(int(sr * winlen)),
                                           center=False)) ** power

    mel_filter = np.abs(filters.mel(sr, n_fft=n_fft, n_mels=n_mels)) ** power
    power_stft_signal = np.dot(stft_pre_emphasis_signal.T, mel_filter.T)

    medium_time_power = medium_time_power_calculation(power_stft_signal)

    lower_envelope = asymmetric_lawpass_filtering(
        medium_time_power, 0.999, 0.5)

    subtracted_lower_envelope = medium_time_power - lower_envelope

    rectified_signal = halfwave_rectification(subtracted_lower_envelope)

    floor_level = asymmetric_lawpass_filtering(rectified_signal)

    temporal_masked_signal = temporal_masking(rectified_signal)

    final_output = switch_excitation_or_non_excitation(
        temporal_masked_signal, floor_level, lower_envelope,
        medium_time_power)

    spectral_weight_smoothing = weight_smoothing(
        final_output, medium_time_power, L=n_mels)

    transfer_function = time_frequency_normalization(
        power_stft_signal,
        spectral_weight_smoothing)

    normalized_power = mean_power_normalization(
        transfer_function, final_output, L=n_mels)

    power_law_nonlinearity = power_function_nonlinearity(normalized_power)

    dct = np.dot(power_law_nonlinearity, filters.dct(
        n_pncc, power_law_nonlinearity.shape[1]).T)

    return dct

# set variables as default values

In [25]:
SAMPLE_RATE = 16000
n_fft=512
winlen=0.020
winstep=0.010
n_mels=128
n_pncc=40
weight_N=4
power=2
AUDIO_DURATION = 10 

In [26]:
print(len(allDataFilesInDataSetPaths))
print(allDataFilesInDataSetPaths[0])

7526
D:\GP\UASpeech\data\CF02_audio\CF02_0001.wav


In [50]:
#print(PnccFeaturesAsNumpyArray[0])
#print(len(PnccFeaturesAsNumpyArray))

In [27]:
def PnccFeatureExtraction(allAudioFilesPaths, sampleRate, audioDuration):
    allWavesInDataAsNumpyArray, pnccFeaturesNumpyArray,errorFilesInDataset = [],[],[]
    for dataFilePath in allAudioFilesPaths:
        print(dataFilePath.split('\\')[-1])
        try:
            signal, sampleRate = librosa.load(dataFilePath , sr = sampleRate , duration = audioDuration , res_type='kaiser_fast')#get signals and sampleRate in ane wave (dataFilePath)   
            pnccs = np.mean(pncc(signal, n_fft, sampleRate, winlen, winstep,n_mels, n_pncc, weight_N, power),axis=0)
  
        except:
            errorFilesInDataset.append(dataFilePath)
            print('there an error occurred in this file' + dataFilePath)
        feature = np.array(pnccs).reshape([-1,1]) #get pncc array featture for one wave after reshape it
        pnccFeaturesNumpyArray.append(feature) 
        allWavesInDataAsNumpyArray.append(signal)    
    return pnccFeaturesNumpyArray,allWavesInDataAsNumpyArray,errorFilesInDataset        

In [31]:
%%time
pnccFeaturesNumpyArray, allWavesInDataAsNumpyArray, errorFilesInDataset = PnccFeatureExtraction(allDataFilesInDataSetPaths[:], SAMPLE_RATE, AUDIO_DURATION)

CF02_0001.wav
there an error occurred in this fileD:\GP\UASpeech\data\CF02_audio\CF02_0001.wav


UnboundLocalError: local variable 'pnccs' referenced before assignment

In [24]:
print(pnccFeaturesNumpyArray[0])
print(len(pnccFeaturesNumpyArray))

[[ 8.78475089e+00]
 [ 6.99593424e-01]
 [-2.61023081e-01]
 [-2.78614445e-01]
 [-3.33414782e-01]
 [-2.35515399e-01]
 [-5.92686233e-02]
 [-6.61384345e-02]
 [-5.71070842e-02]
 [ 1.03512546e-01]
 [-3.18943151e-03]
 [-1.50290095e-02]
 [-8.22147816e-03]]
9433
