In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import matplotlib.pyplot as plt
from IPython import display
from jiwer import wer
import librosa
import numpy as np
import soundfile
import json

import random
from python_speech_features import mfcc
import librosa
import scipy.io.wavfile as wav
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable

from keras import backend as K
from keras.models import Model
from keras.layers import (BatchNormalization, Conv1D, Dense, Input, 
    TimeDistributed, Activation, Bidirectional, SimpleRNN, GRU, LSTM)
from keras.utils.vis_utils import plot_model


import _pickle as pickle
from numpy.lib.stride_tricks import as_strided

from keras.layers import (Input, Lambda)
from tensorflow.keras.optimizers import SGD
from keras.callbacks import ModelCheckpoint   
import os

In [2]:
# loading scripts
# `sys` is not imported by the imports cell, so import it here before touching
# sys.path; without it this cell raises NameError on a fresh kernel.
import sys

# Make the project's script directory (and the parent / current directory)
# importable so the local modules below can be resolved.
sys.path.insert(1, '../scripts')
sys.path.append("..")
sys.path.append(".")

from data_cleaning import DataCleaner
from data_viz import Data_Viz
import acoustic_modeling as AM

# Shared helper instances used by the rest of the notebook.
# DataCleaner is constructed with a log-file path (presumably where it writes
# its log — confirm against data_cleaning.py).
DC = DataCleaner("../logs/preprocessing_notebook.log")
DV = Data_Viz()





In [3]:
# defining parameters

# --- model / training run ---
MODEL_NAME = "RNN_model"
EPOCHS = 50
MIN_BATCH_SIZE = 40

# --- audio feature extraction ---
MFCC_DIME = 13
MAX_FREQ = 8000            # in Hz
WINDOW, STEP = 20, 10      # both in ms

In [4]:
# loading meta data
# Both metadata tables are read through the same DC.meta_loader call, in the
# order train -> validation.
# NOTE(review): the "validation" table is loaded from test_meta.csv — confirm
# this file is really meant to serve as the validation split.
train_meta, valid_meta = (
    DC.meta_loader(meta_path, "csv")
    for meta_path in ("../data/train_meta.csv", "../data/test_meta.csv")
)

# Report how many rows each split contains.
print(
    f"Size of the training set: {len(train_meta)}",
    f"Size of the validation set: {len(valid_meta)}",
    sep="\n",
)

Size of the training set: 800
Size of the validation set: 200


In [5]:
# replace redundant letters
# Pass every "Target" transcript of both splits through AM.replacer, writing the
# result back into the same column.
for split_meta in (train_meta, valid_meta):
    split_meta["Target"] = split_meta["Target"].apply(AM.replacer)


In [6]:
# Feature-extraction settings handed to the generator; the values are the
# constants defined in the parameters cell.
feature_settings = {
    "window": WINDOW,
    "step": STEP,
    "max_freq": MAX_FREQ,
    "mfcc_dim": MFCC_DIME,
}

audio_gen = AM.AudioGenerator(
    train_meta,
    valid_meta,
    minibatch_size=MIN_BATCH_SIZE,
    **feature_settings,
)

# Load both partitions up front, training first and then validation.
audio_gen.load_train_data()
audio_gen.load_validation_data()

In [7]:
# Build the acoustic model.
# input_dim reuses MFCC_DIME (the same value passed to the AudioGenerator as
# mfcc_dim) instead of a second hard-coded 13, so the model's input width cannot
# drift from the feature dimension if the constant is changed.
model = AM.model_1(input_dim=MFCC_DIME,
                units=5,
                activation='relu',
                # one output per entry in AM.char_map plus one extra class
                # (presumably the CTC blank — TODO confirm in acoustic_modeling)
                output_dim=len(AM.char_map)+1)

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
the_input (InputLayer)       [(None, None, 13)]        0         
_________________________________________________________________
rnn (GRU)                    (None, None, 5)           300       
_________________________________________________________________
batch_normalization (BatchNo (None, None, 5)           20        
_________________________________________________________________
time_distributed (TimeDistri (None, None, 223)         1338      
_________________________________________________________________
softmax (Activation)         (None, None, 223)         0         
Total params: 1,658
Trainable params: 1,648
Non-trainable params: 10
_________________________________________________________________
None


In [8]:
# Train the model built above on the generator's data, using the run settings
# from the parameters cell.
AM.train(
    audio_gen,
    input_to_softmax=model,
    model_name=MODEL_NAME,
    epochs=EPOCHS,
    minibatch_size=MIN_BATCH_SIZE,
)

Epoch 1/50
20/20 - 268s - loss: 10031.3916 - val_loss: 10067.7979
Epoch 2/50
20/20 - 211s - loss: 9788.7871 - val_loss: 9837.4434
Epoch 3/50
20/20 - 195s - loss: 9504.2236 - val_loss: 9541.1230
Epoch 4/50
20/20 - 190s - loss: 9181.6250 - val_loss: 9208.2998
Epoch 5/50
20/20 - 189s - loss: 8815.4062 - val_loss: 8777.6055
Epoch 6/50
20/20 - 188s - loss: 8405.6279 - val_loss: 8237.0898
Epoch 7/50
20/20 - 189s - loss: 7929.9326 - val_loss: 7596.1973
Epoch 8/50
20/20 - 189s - loss: 7429.4780 - val_loss: 7028.2588
Epoch 9/50
20/20 - 188s - loss: 6903.9346 - val_loss: 6456.1636
Epoch 10/50
20/20 - 189s - loss: 6367.2158 - val_loss: 5855.8735
Epoch 11/50
20/20 - 188s - loss: 5803.6587 - val_loss: 5219.4839
Epoch 12/50
20/20 - 189s - loss: 5235.6733 - val_loss: 4675.0933
Epoch 13/50
20/20 - 190s - loss: 4713.3882 - val_loss: 4161.4941
Epoch 14/50
20/20 - 189s - loss: 4231.5278 - val_loss: 3607.8831
Epoch 15/50
20/20 - 189s - loss: 3796.4541 - val_loss: 3330.9231
Epoch 16/50
20/20 - 189s - loss:

In [9]:
# Inspect one utterance: name the sample index and partition once so the decoded
# prediction and the raw prediction are guaranteed to refer to the same sample.
sample_index, partition = 14, 'train'

AM.predict(audio_gen, sample_index, partition, model)

# Only the fourth value returned by predict_raw is kept.
_, _, _, raw_pred = AM.predict_raw(audio_gen, sample_index, partition, model)

# Uncomment to stack a header row (characters ordered by their char_map index,
# followed by 'BLANK') on top of raw_pred:
#raw_pred_char = np.vstack([sorted(AM.char_map.keys(), key=lambda k: AM.char_map[k]) + ['BLANK'], raw_pred])

Truth: ለ መሆኑ የ ቁንጅና መለኪያ ው ምንድነው
Predicted: ሀዎሀዎሀዎኡዎኡዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሶዎሀ
wer: 235
