In [44]:
import librosa
import librosa.display
import IPython.display as ipd
import os
import numpy as np
import itertools
import pandas as pd
import math
from scipy.signal import get_window
import scipy
from sklearn import preprocessing
from scipy.io.wavfile import read
import python_speech_features as mfcc
from sklearn.mixture import GaussianMixture 
import pickle


## Feartures Extraction

##### MFCC Feature

In [45]:
# Normalization in order to get rid of amplification levels and differences between mics
def normalize_audio(audio):
    audio = audio / np.max(np.abs(audio))
    return audio

In [46]:
# we devide the signal into short frames. Each audio frame will be the same size as the FFT
def frame_audio(audio, FFT_size=2048, hop_size=10, sample_rate=44100):
    audio = np.pad(audio, int(FFT_size / 2), mode='reflect')
    frame_len = np.round(sample_rate * hop_size / 1000).astype(int)
    frame_num = int((len(audio) - FFT_size) / frame_len) + 1
    frames = np.zeros((frame_num,FFT_size))
    
    for n in range(frame_num):
        frames[n] = audio[n*frame_len:n*frame_len+FFT_size]
    
    return frames

In [47]:
def freq_to_mel(freq):
    return 2595.0 * np.log10(1.0 + freq / 700.0)

def met_to_freq(mels):
    return 700.0 * (10.0**(mels / 2595.0) - 1.0)

In [48]:
def get_filter_points(fmin, fmax, mel_filter_num, FFT_size, sample_rate=44100):
    fmin_mel = freq_to_mel(fmin)
    fmax_mel = freq_to_mel(fmax)
    
    mels = np.linspace(fmin_mel, fmax_mel, num=mel_filter_num+2)
    freqs = met_to_freq(mels)
    
    return np.floor((FFT_size + 1) / sample_rate * freqs).astype(int), freqs

In [49]:
def get_filters(filter_points, FFT_size):
    filters = np.zeros((len(filter_points)-2,int(FFT_size/2+1)))
    
    for n in range(len(filter_points)-2):
        filters[n, filter_points[n] : filter_points[n + 1]] = np.linspace(0, 1, filter_points[n + 1] - filter_points[n])
        filters[n, filter_points[n + 1] : filter_points[n + 2]] = np.linspace(1, 0, filter_points[n + 2] - filter_points[n + 1])
    
    return filters

In [50]:
def dct(dct_filter_num, filter_len):
    basis = np.empty((dct_filter_num,filter_len))
    basis[0, :] = 1.0 / np.sqrt(filter_len)
    
    samples = np.arange(1, 2 * filter_len, 2) * np.pi / (2.0 * filter_len)

    for i in range(1, dct_filter_num):
        basis[i, :] = np.cos(i * samples) * np.sqrt(2.0 / filter_len)
        
    return basis

In [51]:
def extract_MFCC(file_path):
    hop_size = 15 #ms
    FFT_size = 2048
    audio , sample_rate = librosa.load(file_path, res_type='kaiser_fast')
    audio = normalize_audio(audio)
    audio_framed = frame_audio(audio, FFT_size=FFT_size, hop_size=hop_size, sample_rate=sample_rate)
    window = get_window("hann", FFT_size, fftbins=True)
    audio_win = audio_framed * window
    audio_winT = np.transpose(audio_win)
    
    audio_fft = np.empty((int(1 + FFT_size // 2), audio_winT.shape[1]), dtype=np.complex64, order='F')

    for n in range(audio_fft.shape[1]):
        audio_fft[:, n] = scipy.fft.fft(audio_winT[:, n], axis=0)[:audio_fft.shape[0]]

    audio_fft = np.transpose(audio_fft)

    audio_power = np.square(np.abs(audio_fft))

    freq_min = 0
    freq_high = sample_rate / 2
    mel_filter_num = 10
    filter_points, mel_freqs = get_filter_points(freq_min, freq_high, mel_filter_num, FFT_size, sample_rate=44100)
    filters = get_filters(filter_points, FFT_size)

    enorm = 2.0 / (mel_freqs[2:mel_filter_num+2] - mel_freqs[:mel_filter_num])
    filters *= enorm[:, np.newaxis]

    audio_filtered = np.dot(filters, np.transpose(audio_power))
    audio_log = 10.0 * np.log10(audio_filtered)

    dct_filter_num = 40

    dct_filters = dct(dct_filter_num, mel_filter_num)

    cepstral_coefficents = np.dot(dct_filters, audio_log)
    
    # cepstral_coefficents = list(itertools.chain.from_iterable(cepstral_coefficents))

    return cepstral_coefficents

In [52]:
def calculate_delta(array):
	
    rows,cols = array.shape
    print(rows)
    print(cols)
    deltas = np.zeros((rows,20))
    N = 2
    for i in range(rows):
        index = []
        j = 1
        while j <= N:
            if i-j < 0:
              first =0
            else:
             first = i-j
            if i+j > rows-1:
                second = rows-1
            else:
                second = i+j 
            index.append((second,first))
            j+=1
        deltas[i] = ( array[index[0][0]]-array[index[0][1]] + (2 * (array[index[1][0]]-array[index[1][1]])) ) / 10
    return deltas


def extract_features(file_path):
    audio , sample_rate = librosa.load(file_path, res_type='kaiser_fast')
    mfcc_feature = mfcc.mfcc(audio,sample_rate, 0.025, 0.01,20,nfft = 1200, appendEnergy = True)    
    mfcc_feature = preprocessing.scale(mfcc_feature)
    print(mfcc_feature)
    delta = calculate_delta(mfcc_feature)
    combined = np.hstack((mfcc_feature,delta)) 
    return combined

In [53]:
# featuresYahia = []
# directory = './files/yahia/'
# for audio in os.listdir('./files/yahia/'):
#     audio_path = directory + audio
#     featuresYahia1=np.vstack(extract_features(audio_path) )

featuresAhmed = []
directory = './files/Ahmed/'
for audio in os.listdir('./files/Ahmed/'):
    audio_path = directory + audio
    featuresAhmed=np.vstack(extract_features(audio_path) )

featuresMo = []
directory = './files/Moustafa/'
for audio in os.listdir('./files/Moustafa/'):
    audio_path = directory + audio
    featuresMo=np.vstack(extract_features(audio_path) )

featuresMagdy = []
directory = './files/magdy/'
for audio in os.listdir('./files/magdy/'):
    audio_path = directory + audio
    featuresMagdy=np.vstack(extract_features(audio_path) )

featuresMahmoud = []
directory = './files/mostafa/'
for audio in os.listdir('./files/mostafa/'):
    audio_path = directory + audio
    featuresMahmoud=np.vstack(extract_features(audio_path) )

featuresMayar = []
directory = './files/mayar/'
for audio in os.listdir('./files/mayar/'):
    audio_path = directory + audio
    featuresMayar=np.vstack(extract_features(audio_path) )

[[-1.03799033 -1.08202199 -0.70504352 ...  0.72891269  0.71736599
  -0.08164243]
 [-1.01039356 -1.01866903 -0.6064167  ...  0.69666927  0.76103493
   0.23152329]
 [-0.99369511 -0.97220825 -0.49206718 ...  1.22628553  1.09938004
   0.11052633]
 ...
 [-1.19239167 -1.20951341  0.16874981 ...  1.06566786  0.52298832
   0.60747366]
 [-1.18338908 -1.30394756  0.08473117 ...  0.63878236  0.1971698
  -0.05223802]
 [-1.25469927 -1.29412423  0.12991359 ...  0.62772926  0.50660075
   0.24389224]]
298
20
[[-7.89323589  1.21869445  0.54783541 ... -0.23966704  0.64636887
  -0.82594121]
 [-0.14108271 -0.15265947  1.81178274 ...  2.05606611  1.09836282
   0.54705531]
 [ 0.14151781 -0.03958597  1.77102502 ...  1.25142315 -0.4783785
  -1.36132453]
 ...
 [-0.46492662 -0.46580099  0.2898985  ...  0.60145573 -1.40575436
  -0.3618373 ]
 [-0.46881333 -0.5310469   0.254768   ...  0.82519692 -0.09576848
  -0.17068502]
 [-0.52717411 -0.65828304  0.32454877 ...  1.11469259  0.40785072
  -0.64702002]]
574
20
[[-7

In [54]:
# yahia_gmm = GaussianMixture(n_components = 6, max_iter = 2000, covariance_type='diag',n_init = 3)
# yahia_gmm.fit(featuresYahia1)

ahmed_gmm = GaussianMixture(n_components = 6, max_iter = 2000, covariance_type='diag',n_init = 3)
ahmed_gmm.fit(featuresAhmed)

mostafa_gmm = GaussianMixture(n_components = 6, max_iter = 2000, covariance_type='diag',n_init = 3)
mostafa_gmm.fit(featuresMo)


magdy_gmm = GaussianMixture(n_components = 6, max_iter = 2000, covariance_type='diag',n_init = 3)
magdy_gmm.fit(featuresMagdy)

mahmoud_gmm = GaussianMixture(n_components = 6, max_iter = 2000, covariance_type='diag',n_init = 3)
mahmoud_gmm.fit(featuresMahmoud)

mayar_gmm = GaussianMixture(n_components = 6, max_iter = 2000, covariance_type='diag',n_init = 3)
mayar_gmm.fit(featuresMayar)


In [55]:
pickle.dump(mayar_gmm,open('mayar.gmm','wb'))

In [56]:
pickle.dump(mahmoud_gmm,open('mahmoud_gmm.gmm','wb'))

In [57]:

pickle.dump(magdy_gmm,open('magdy.gmm','wb'))

In [58]:
pickle.dump(ahmed_gmm,open('Ahmed.gmm','wb'))

In [59]:

# pickle.dump(yahia_gmm,open('yahia.gmm','wb'))

In [60]:
pickle.dump(mostafa_gmm,open('mostafa.gmm','wb'))

In [61]:
call_ahmed_model=pickle.load(open('Ahmed.gmm','rb'))

In [78]:
# test = extract_features('./static/assets/recordedAudio.wav')
test = extract_features('./files/test/ahmed-30-mic.wav')

[[-6.42412236  1.26222091  0.57105633 ... -0.02386914  0.65234395
  -0.56150957]
 [-0.05243789 -0.78662525  0.28055002 ...  0.78044006  2.76570367
   1.3789962 ]
 [ 0.07223511 -0.21985995  1.53010019 ... -0.04891922  0.90548104
  -0.42276089]
 ...
 [-0.41772291 -0.7084763  -0.04745201 ...  1.97546789  2.07001611
   0.06555905]
 [-0.4203558  -0.58961071  0.37859337 ...  1.9906608   2.66509367
   1.0908988 ]
 [-0.41515481 -0.60793988  0.52391672 ...  1.15358981  2.06884301
   1.212996  ]]
586
20


In [79]:
scores_1 = np.array(ahmed_gmm.score(test))
# scores_11 = np.array(call_ahmed_model.score(b))
# scores_2 = np.array(yahia_gmm.score(test))
scores_3 =np.array(mostafa_gmm.score(test))
scores_4=np.array(magdy_gmm.score(test))
scores_5 =np.array(mahmoud_gmm.score(test))
scores_6=np.array(mayar_gmm.score(test))


print(scores_1)
# print(scores_2)
print(scores_3)
print(scores_4)
print(scores_5)
print(scores_6)
# print(scores_11)


# if (scores_1 > scores_2):
#     print("Ahmed")
# else:
#     print("Yahia")

-33.731818312350235
-35.711069025521695
-29.60195785603506
-37.82767300614577
-37.08680070985235
