# MFCC

## Imports

In [2]:
import librosa
import os

import pickle

import pandas as pd
import numpy as np
from tqdm import tqdm

## Extract data

### Creation of speaker_id -> label map

We recover the labels from the "SPEAKERS.TXT" file and create a map from speaker ids to their gender, encoded 0 for Male and 1 for Female.

In [5]:
## We first read the file with the labels line by line
label_path = os.path.join('LibriSpeech', 'SPEAKERS.TXT')
with open(label_path, 'r') as f:
    lines = f.readlines()

## We create a list of id-gender
lines = lines[12:]

## selects the subsets that belong to the dev-clean dataset
subsets = (np.array([l[11:20] for l in lines]) == 'dev-clean')
lines = np.array([l[:9].split('|') for l in lines])[subsets]

### We map male to 0 and female to 1
labels_map = {
    'M' : 0,
    'F' : 1
}
lines = [[int(l[0].strip()), labels_map[l[1].strip()]] for l in lines]

## We change it to a map for convenience 
id_to_labels_map = {}

for l in lines:
    id_to_labels_map[l[0]] = l[1]

### Creation of the clean dataset

In this section I compute the envelope of the signal and filter out the parts that have a too low amplitude because they don't add any relevant information for the classification. We create a new binary file with all the clean audio.

In [163]:
FILE_NAME = 'data_clean.txt'

## We also downsample as we only need low frequencies for speech recognition
sampling_rate = 16000

In [164]:
def filter_envelope(x, sr, threshold = 0.0005):
    """
    This function filters out part of the signal that has a too low amplitude according to its envelope
    """
    y = pd.Series(np.abs(x))
    x_mean = y.rolling(window = int(sr/10), min_periods = 1, center = True).mean()
    return x[x > x_mean]

def create_clean_file(file_name):
    """
    fetches all the audio, converts them into mfccs,
    combines them with according labels 
    and stores them in the specified file name
    """
    data = []

    main_path = os.path.join('LibriSpeech', 'dev-clean')
    for speaker_id in id_to_labels_map.keys():
        speaker_path = os.path.join(main_path, str(speaker_id))
        speaker_label = id_to_labels_map[speaker_id]
        for folder in os.listdir(speaker_path):
            speech_path = os.path.join(speaker_path, folder)
            for e in os.listdir(speech_path):
                if e.endswith('.flac'):
                    x, sr = librosa.load(os.path.join(speech_path, e), sr = sampling_rate)
                    x_filtered = filter_envelope(x, sr)
                    data.append([x_filtered, speaker_label])
                    
    with open(file_name, 'wb') as f:
        pickle.dump(data, f)
        
create_clean_file(FILE_NAME)

## MFCCs computation

In this section we fetch the clean data and cut each data element in chunk according to the minimal frame length so that we have a consistent input length for our future models.
We then compute the MFCCs for each of the chunks and store them in a file.

In [140]:
def get_lengths(data, sr):
    """
    computes the total number of frames in the data
    """
    l = []
    for d in data:
        l.append(len(d[0]))
    return l

def get_clean(file_name):
    """
    fetches the cleaned data as well as the total time (in sec) of audio
    """
    with open('data.txt', 'rb') as f:
        data = pickle.load(f)
    lengths = get_lengths(data, sampling_rate)
    return data, lengths

In [143]:
clean, lengths = get_clean(FILE_NAME)
min_ = np.min(lengths)
max_ = np.max(lengths)
print(r"""the shortest audio is of {} frames while the longest of {} and
the avertage number of frame is {}.""".format(np.min(lengths), np.max(lengths), np.mean(lengths)))

sample_size = min_

the shortest audio is of 4384 frames while the longest of 100402 and
the avertage number of frame is 20970.730299667037.


In [236]:
def build_features(sample_size, sampling_rate):
    """
    separate the samples in chunks of equal size and compute their mfcc
    """
    X = []
    y = []
#     min_ = np.float('inf')
#     max_ = -np.float('inf')
    for c in tqdm(clean):
        signal = c[0]
        signal_label = c[1]
        signal_length = len(c[0])
        n_samples = int(signal_length / sample_size)
        for n in range(n_samples):
            a = int(n * sample_size)
            b = int((n+1) * sample_size)
            s = signal[a:b]
            mfcc = librosa.feature.mfcc(
                s,
                sr = sampling_rate
            )
            X.append(mfcc)
            y.append(signal_label)

            # inplace normalizaton but it actually perfomed worse
#             M = np.max(mfcc)
#             m = np.min(mfcc)
#             if M > max_:
#                 max_ = M
#             if m < min_:
#                 min_ = m
                
    X, y = np.array(X), np.array(y)
    #X = (X - min_) / (max_ - min_)
            
    return X, y

In [237]:
X, y = build_features(min_, sampling_rate)

100%|██████████████████████████████████████████████████████████████████████████████| 2703/2703 [00:35<00:00, 75.27it/s]


In [241]:
data = {}
data['X'] = X
data['y'] = y

with open('mfccs.txt', 'wb') as f:
    pickle.dump(data, f)