In [1]:
import pandas as pd
import numpy as np 
import tensorflow as tf
from tensorflow import keras
import tensorflow_io as tfio
import matplotlib.pyplot as plt
import seaborn as sns
import os
import numpy as np


In [21]:
# Reproducibility and training hyper-parameters.
SEED = 1337
EPOCHS = 100
BATCH_SIZE = 64
VALIDATION_RATIO = 0.1  # share of the samples held out for validation

# Base URL under which the index CSV and the audio archives are published.
URL_PATH = "https://www.openslr.org/resources/83/"

# Archive file names, keyed by their position in the list (0..10).
zip_files = dict(
    enumerate(
        [
            "irish_english_male.zip",
            "midlands_english_female.zip",
            "midlands_english_male.zip",
            "northern_english_female.zip",
            "northern_english_male.zip",
            "scottish_english_female.zip",
            "scottish_english_male.zip",
            "southern_english_female.zip",
            "southern_english_male.zip",
            "welsh_english_female.zip",
            "welsh_english_male.zip",
        ]
    )
)

# Two-letter filename prefixes; a prefix's position in this list is its label.
# Order: Irish, Midlands, Northern, Scottish, Southern, Welsh.
gender_agnostic_categories = ["ir", "mi", "no", "sc", "so", "we"]

# Human-readable class names; the last entry is the extra non-speech class.
class_names = [
    "Irish",
    "Midlands",
    "Northern",
    "Scottish",
    "Southern",
    "Welsh",
    "Not a speech",
]

# Optional override for the download cache root (None selects the default).
CACHE_DIR = None

In [7]:
# Fix the global random seed so shuffling and training are repeatable.
keras.utils.set_random_seed(SEED)


# Directory the audio files are read from: "<cache root>/datasets".
# `os.path.join` does not expand "~", so the default root is expanded here;
# otherwise every path built from this constant would contain a literal "~"
# component that file readers do not resolve to the home directory.
DATASET_DESTINATION = os.path.join(
    CACHE_DIR if CACHE_DIR else os.path.expanduser("~/.keras/"), "datasets"
)

Importing Data

In [9]:
# Download the CSV index that lists every recording.
# `cache_dir=CACHE_DIR` keeps the download location consistent with
# DATASET_DESTINATION, which is derived from the same CACHE_DIR; passing None
# leaves Keras' default cache location in effect.
line_index_file = keras.utils.get_file(
    fname="line_index_file",
    origin=URL_PATH + "line_index_all.csv",
    cache_dir=CACHE_DIR,
)

# Download and extract each archive, then delete the downloaded file so only
# the extracted contents remain on disk.
for archive_name in zip_files.values():
    fname = archive_name.split(".")[0]
    url = URL_PATH + archive_name
    zip_file = keras.utils.get_file(
        fname=fname, origin=url, extract=True, cache_dir=CACHE_DIR
    )
    # NOTE(review): assumes get_file returns the path of the downloaded
    # archive file (not a directory) — confirm for the installed Keras version.
    os.remove(zip_file)

Downloading data from https://www.openslr.org/resources/83/irish_english_male.zip
Downloading data from https://www.openslr.org/resources/83/midlands_english_female.zip
Downloading data from https://www.openslr.org/resources/83/midlands_english_male.zip
Downloading data from https://www.openslr.org/resources/83/northern_english_female.zip
Downloading data from https://www.openslr.org/resources/83/northern_english_male.zip
Downloading data from https://www.openslr.org/resources/83/scottish_english_female.zip
Downloading data from https://www.openslr.org/resources/83/scottish_english_male.zip
Downloading data from https://www.openslr.org/resources/83/southern_english_female.zip
Downloading data from https://www.openslr.org/resources/83/southern_english_male.zip
Downloading data from https://www.openslr.org/resources/83/welsh_english_female.zip
Downloading data from https://www.openslr.org/resources/83/welsh_english_male.zip


In [20]:
# Column names are supplied explicitly and only "filename" is kept; the id and
# transcript columns are not used by the rest of the notebook.
data = pd.read_csv(
    line_index_file,
    names=["id", "filename", "transcript"],
    usecols=["filename"],
)
data

Unnamed: 0,filename
0,wef_12484_01482829612
1,wef_12484_01345932698
2,wef_12484_00999757777
3,wef_12484_00036278823
4,wef_12484_00458512623
...,...
17872,som_06592_00422956963
17873,som_06136_01223762368
17874,som_03349_00420644955
17875,som_03397_02006793154


In [22]:
def preprocess_dataframe(dataframe):
    """Clean filenames, derive accent labels, build file paths and shuffle.

    Args:
        dataframe: DataFrame with a string "filename" column holding the bare
            recording names (surrounding whitespace is tolerated).

    Returns:
        A new DataFrame, shuffled with ``SEED`` and re-indexed from 0, where
        "filename" is the full ``.wav`` path under ``DATASET_DESTINATION`` and
        "label" is the position of the filename's two-letter prefix in
        ``gender_agnostic_categories``. The input frame is left unmodified.

    Raises:
        ValueError: If a filename's first two letters are not listed in
            ``gender_agnostic_categories``.
    """
    # Work on a copy so the caller's frame is not modified and the function
    # can safely be applied to the same raw frame more than once.
    dataframe = dataframe.copy()

    # Remove surrounding whitespace from the filename column.
    dataframe["filename"] = dataframe["filename"].str.strip()

    # Gender-agnostic label: index of the two-letter prefix in the category
    # list. `list.index` raises ValueError for an unknown prefix.
    dataframe["label"] = dataframe["filename"].str[:2].map(gender_agnostic_categories.index)

    # Turn the bare name into the path of the extracted .wav file.
    dataframe["filename"] = dataframe["filename"].map(
        lambda name: os.path.join(DATASET_DESTINATION, name + ".wav")
    )

    # Shuffle the samples deterministically and rebuild a clean 0..n-1 index.
    return dataframe.sample(frac=1, random_state=SEED).reset_index(drop=True)

# Replace the raw index frame with its preprocessed, shuffled version and
# preview the first rows.
data = data.pipe(preprocess_dataframe)
data.head(5)

Unnamed: 0,filename,label
0,D:\Data Science\ml\notebooks\datasets\som_0385...,4
1,D:\Data Science\ml\notebooks\datasets\som_0431...,4
2,D:\Data Science\ml\notebooks\datasets\sof_0613...,4
3,D:\Data Science\ml\notebooks\datasets\som_0248...,4
4,D:\Data Science\ml\notebooks\datasets\nom_0613...,2


In [24]:
# Rows were shuffled during preprocessing, so a positional cut is enough:
# the leading share is used for training, the trailing VALIDATION_RATIO share
# for validation.
split = int(len(data) * (1 - VALIDATION_RATIO))
train_df, valid_df = data.iloc[:split], data.iloc[split:]

print(f"We have {train_df.shape[0]} training samples & {valid_df.shape[0]} validation ones")

We have 16089 training samples & 1788 validation ones


In [26]:
def load_16k_audio(filename):
    """Read a WAV file and return its single-channel waveform resampled to 16 kHz.

    Args:
        filename: Path of the WAV file to read.

    Returns:
        The decoded audio with the channel axis squeezed away, resampled from
        the file's own sample rate to 16000 Hz.
    """
    raw_bytes = tf.io.read_file(filename)

    # Decode as one channel, then drop the trailing channel axis.
    waveform, native_rate = tf.audio.decode_wav(raw_bytes, desired_channels=1)
    mono_waveform = tf.squeeze(waveform, axis=-1)

    # The resampler is given the input rate as int64.
    # NOTE(review): this call needs tensorflow-io's native library to be
    # loadable; the recorded run failed here with "libtensorflow_io.so not found".
    return tfio.audio.resample(
        mono_waveform,
        rate_in=tf.cast(native_rate, dtype=tf.int64),
        rate_out=16000,
    )

def filepath_to_embeddings(filename, label):
    """Convert one audio file into YAMNet embeddings with one-hot labels.

    Args:
        filename: Path of the WAV file to load.
        label: Integer class index of the recording.

    Returns:
        A tuple ``(embeddings, one_hot_labels)``. ``one_hot_labels`` has depth
        ``len(class_names)`` and one entry per row of ``scores`` (presumably
        one per time slot of the audio — confirm against the model's output).
    """
    # NOTE(review): `yamnet_model` is not defined in the visible cells; it has
    # to exist before this function is traced — confirm where it is loaded.
    audio_wav = load_16k_audio(filename)
    scores, embeddings, _ = yamnet_model(audio_wav)

    # Time slots whose top-scoring class is index 0 (treated as speech) keep
    # the recording's label; every other slot is moved to the last class,
    # "Not a speech". `tf.where` broadcasts the scalar label across slots, so
    # no explicit per-slot repetition of the label is required.
    labels = tf.where(tf.argmax(scores, axis=1) == 0, label, len(class_names) - 1)

    return (embeddings, tf.one_hot(labels, len(class_names)))

def dataframe_to_dataset(dataframe, batch_size=64):
    """Build a cached, batched ``tf.data`` pipeline from a frame of files.

    Args:
        dataframe: DataFrame with "filename" and "label" columns.
        batch_size: Number of samples per batch after the per-file results
            have been flattened.

    Returns:
        A ``tf.data.Dataset`` of ``(embeddings, one_hot_labels)`` batches that
        is cached after the first pass and prefetched.
    """
    dataset = tf.data.Dataset.from_tensor_slices((dataframe["filename"], dataframe["label"]))

    # Each file maps to a stack of samples; unbatch() flattens those stacks so
    # samples from different files can share a batch.
    dataset = dataset.map(
        filepath_to_embeddings,
        num_parallel_calls=tf.data.AUTOTUNE,
    ).unbatch()

    # Cache before batching so the embedding computation runs only once.
    return dataset.cache().batch(batch_size).prefetch(tf.data.AUTOTUNE)


# Pass the configured BATCH_SIZE explicitly so the configuration constant,
# not the function's default, controls the batch size.
train_ds = dataframe_to_dataset(train_df, batch_size=BATCH_SIZE)
valid_ds = dataframe_to_dataset(valid_df, batch_size=BATCH_SIZE)

NotImplementedError: in user code:

    File "C:\Users\Lenovo\AppData\Local\Temp\ipykernel_19304\1715577312.py", line 24, in None  *
        lambda x, y: filepath_to_embeddings(x, y)
    File "C:\Users\Lenovo\AppData\Local\Temp\ipykernel_19304\1715577312.py", line 11, in filepath_to_embeddings  *
        audio_wav = load_16k_audio(filename)
    File "C:\Users\Lenovo\AppData\Local\Temp\ipykernel_19304\1715577312.py", line 6, in load_16k_audio  *
        audio_wav = tfio.audio.resample(audio_wav, rate_in=sample_rate, rate_out=16000)
    File "c:\Python310\lib\site-packages\tensorflow_io\python\ops\audio_ops.py", line 462, in resample  **
        value = tf.vectorized_map(f, input)
    File "c:\Python310\lib\site-packages\tensorflow_io\python\ops\audio_ops.py", line 458, in f
        return core_ops.io_audio_resample(
    File "c:\Python310\lib\site-packages\tensorflow_io\python\ops\__init__.py", line 88, in __getattr__
        return getattr(self._load(), attrb)
    File "c:\Python310\lib\site-packages\tensorflow_io\python\ops\__init__.py", line 84, in _load
        self._mod = _load_library(self._library)
    File "c:\Python310\lib\site-packages\tensorflow_io\python\ops\__init__.py", line 69, in _load_library
        raise NotImplementedError(

    NotImplementedError: unable to open file: libtensorflow_io.so, from paths: ['c:\\Python310\\lib\\site-packages\\tensorflow_io\\python\\ops\\libtensorflow_io.so']
    caused by: ['c:\\Python310\\lib\\site-packages\\tensorflow_io\\python\\ops\\libtensorflow_io.so not found']
