# Pre-Processing of LibriSpeech's audio files

This is the link to the source: http://www.openslr.org/12/

In [75]:
import pandas as pd
import random

from pathlib import Path
import os

In [None]:
# Move into the project directory; the relative dataset path below is
# resolved against it.
project_root = Path("/home/adriel_martins/Documents/voice_recognition")
os.chdir(project_root)
data_folder = Path('LibriSpeech')

## Extracting meta-data for each audio file

In [None]:
# Print the name of every entry found directly inside the dataset folder.
for entry in data_folder.iterdir():
    print(entry.name)

In [73]:
# SPEAKERS.TXT separates the fields of each row with "|".
# So let us build a function to extract the target data.
"""
;ID  |SEX| SUBSET           |MINUTES| NAME
14   | F | train-clean-360  | 25.03 | Kristin LeMoine
16   | F | train-clean-360  | 25.11 | Alys AtteWater
"""

def line_data_extraction(line, proj):
    """ Extracting id and sex from one line of SPEAKERS.TXT

    Args:
    line [str]: one raw line of the file, fields separated by "|"
    proj [str]: which subproject we are interested in; a line is only
        used when this text occurs in it

    Returns:
    [id, sex] as a list of two strings with all whitespace removed, or
    None when the line does not mention 'proj', is a ";" comment line,
    or has fewer than two "|"-separated fields.
    """

    if proj not in line:
        return None

    # Header/comment lines start with ";" (e.g. ";ID  |SEX| SUBSET ...");
    # they are not speaker rows even if they mention the subproject.
    if line.lstrip().startswith(';'):
        return None

    # Drop every whitespace character (spaces, tabs, the trailing newline)
    # from each field, not only plain spaces.
    fields = [''.join(field.split()) for field in line.split('|')]

    # Without at least an id and a sex field there is nothing to return;
    # a shorter list would make callers fail when they index it.
    if len(fields) < 2:
        return None

    return fields[:2]


speakers = data_folder / 'SPEAKERS.TXT'

sub_proj = 'train-clean-100'

# Collect the parsed rows first and build the frame once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame
# on every call.
speaker_rows = []

with speakers.open() as f:
    for line in f:
        data = line_data_extraction(line, sub_proj)
        # None means the line does not belong to 'sub_proj'; the length
        # check protects the indexing below against incomplete lines.
        if data is None or len(data) < 2:
            continue
        speaker_rows.append({'id': data[0], 'sex': data[1]})

# Passing 'columns' keeps the 'id'/'sex' layout even when no row matched.
df_speaker = pd.DataFrame(speaker_rows, columns=['id', 'sex'])
df_speaker.head(10)

Unnamed: 0,id,sex
0,19,F
1,26,M
2,27,M
3,32,F
4,39,F
5,40,F
6,60,M
7,78,M
8,83,F
9,87,F


## Organizing audio files

In [72]:
# Print the name of every entry found directly inside the dataset folder.
for entry in data_folder.iterdir():
    print(entry.name)

train-clean-100
BOOKS.TXT
SPEAKERS.TXT
LICENSE.TXT
CHAPTERS.TXT
README.TXT


In [98]:
def randomly_select_speakers(df_speaker, n_each, random_state=None):
    """ Randomly select 'n_each' speakers of each sex.

    Args:
    df_speaker [DataFrame]: one row per speaker, with a 'sex' column
    n_each [int]: how many rows to draw, without replacement, from each
        'sex' group
    random_state: optional seed forwarded to pandas so the selection can
        be reproduced; the default (None) leaves the sampling unseeded

    Returns:
    A single DataFrame holding the sampled rows of 'df_speaker'
    ('n_each' rows per sex), not separate lists of men and women.

    Raises:
    ValueError (from pandas) when a group has fewer than 'n_each' rows.
    """
    by_sex = df_speaker.groupby(['sex'])

    return by_sex.sample(n=n_each, replace=False, random_state=random_state)

# Draw 50 speakers per sex, then show 5 random rows of that selection.
speaker_preview = randomly_select_speakers(df_speaker, 50)
speaker_preview.sample(5)

Unnamed: 0,id,sex
91,2159,M
192,6437,M
69,1363,F
87,2002,M
54,909,M


In [118]:
def id_soundfiles_df(df_speaker, sound_files_folder, number_of_audio_files):
    """ Create the dataframe of soundfiles.

    For every speaker folder inside 'sound_files_folder' whose name is an
    id listed in 'df_speaker', randomly picks .flac files from one of its
    chapter folders.

    Args:
    df_speaker [DataFrame]: selected speakers, with an 'id' column
    sound_files_folder [Path]: folder holding one sub-folder per speaker id
    number_of_audio_files [int]: how many files to pick per speaker; when
        the chapter folder holds fewer .flac files, all of them are taken

    Returns:
    DataFrame with columns 'id' (speaker folder name) and 'soundfile'
    (file name), one row per picked file.
    """
    # Read the ids from the argument (not from a notebook-level variable)
    # and build the lookup once instead of once per folder.
    wanted_ids = set(df_speaker['id'].astype(str))

    rows = []
    for speaker_id_folder in sound_files_folder.iterdir():
        if speaker_id_folder.name not in wanted_ids:
            continue
        if not speaker_id_folder.is_dir():
            continue

        # iterdir() yields entries in arbitrary order; sorting makes
        # "the first chapter folder" the same one on every run.
        chapters_id_folders = sorted(
            x for x in speaker_id_folder.iterdir() if x.is_dir()
        )
        if not chapters_id_folders:
            # Speaker folder without chapters: nothing to pick from.
            continue

        # Always pick the first folder and keep just the .flac sound files
        audio_files = sorted(
            x.name for x in chapters_id_folders[0].iterdir()
            if x.suffix == '.flac'
        )

        # random.sample raises if asked for more items than available,
        # so never request more files than the folder holds.
        how_many = min(number_of_audio_files, len(audio_files))
        rows.extend(
            {'id': speaker_id_folder.name, 'soundfile': file_name}
            for file_name in random.sample(audio_files, k=how_many)
        )

    # Build the frame in one go; 'columns' keeps the layout when empty.
    return pd.DataFrame(rows, columns=['id', 'soundfile'])


# Sampling setup: 50 speakers of each sex, 5 audio files per speaker.
number_of_audio_files = 5
sound_files_folder = data_folder / 'train-clean-100'
df_speaker_sample = randomly_select_speakers(df_speaker, 50)

# Build the id/soundfile table for the sampled speakers and preview it.
df = id_soundfiles_df(
    df_speaker_sample,
    sound_files_folder,
    number_of_audio_files,
)
df.head(10)

Unnamed: 0,id,soundfile
0,1624,1624-142933-0029.flac
1,1624,1624-142933-0045.flac
2,1624,1624-142933-0014.flac
3,1624,1624-142933-0000.flac
4,1624,1624-142933-0009.flac
5,8770,8770-295465-0001.flac
6,8770,8770-295465-0015.flac
7,8770,8770-295465-0011.flac
8,8770,8770-295465-0020.flac
9,8770,8770-295465-0025.flac
