In [15]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import matplotlib.pyplot as plt
from IPython import display
from jiwer import wer
import librosa
import numpy as np
import soundfile
import json

import random
from python_speech_features import mfcc
import librosa
import scipy.io.wavfile as wav
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable

from keras import backend as K
from keras.models import Model
from keras.layers import (BatchNormalization, Conv1D, Dense, Input, 
    TimeDistributed, Activation, Bidirectional, SimpleRNN, GRU, LSTM)
from keras.utils.vis_utils import plot_model


import _pickle as pickle
from numpy.lib.stride_tricks import as_strided

from keras.layers import (Input, Lambda)
from tensorflow.keras.optimizers import SGD
from keras.callbacks import ModelCheckpoint   
import os

In [16]:
# Loading project scripts.
# `sys` must be imported here: the notebook's import cell does not bring it in,
# so on a fresh kernel the sys.path edits below would raise NameError.
import sys

# Extend the module search path so the helper modules imported below resolve
# when the notebook is run from its own directory.
sys.path.insert(1, '../scripts')
sys.path.append("..")
sys.path.append(".")

from data_cleaning import DataCleaner
from data_viz import Data_Viz
import acoustic_modeling as AM

# Shared helper instances used by later cells.
# NOTE(review): the argument looks like a log-file path and is named after the
# preprocessing notebook -- confirm this is the intended log for this notebook.
DC = DataCleaner("../logs/preprocessing_notebook.log")
DV = Data_Viz()


In [17]:
# Notebook-wide settings; every later cell reads these instead of literals.

# --- training run ---
MODEL_NAME = "RNN_model"   # forwarded to AM.train as model_name
EPOCHS = 5                 # forwarded to AM.train as epochs
MIN_BATCH_SIZE = 20        # forwarded as minibatch_size to the generator and AM.train

# --- audio feature extraction (forwarded to AM.AudioGenerator) ---
MFCC_DIME = 13             # passed as mfcc_dim
WINDOW = 20                # in ms
STEP = 10                  # in ms
MAX_FREQ = 8000            # in Hz

In [18]:
# Read the metadata tables for both splits through the shared DataCleaner.
# NOTE(review): the frame named `valid_meta` is read from test_meta.csv --
# confirm that using the test split for validation is intentional.
train_meta, valid_meta = (
    DC.meta_loader(meta_path, "csv")
    for meta_path in ("../data/train_meta.csv", "../data/test_meta.csv")
)

# Report how many rows each split contains.
print(f"Size of the training set: {len(train_meta)}")
print(f"Size of the validation set: {len(valid_meta)}")

Size of the training set: 800
Size of the validation set: 200


In [19]:
# Run every transcript in the "Target" column through AM.replacer, for both
# splits, and store the result back in the same column.
# NOTE: this overwrites "Target" in place, so re-running the cell applies
# AM.replacer to already-processed text.
train_meta["Target"] = train_meta["Target"].apply(AM.replacer)
valid_meta["Target"] = valid_meta["Target"].apply(AM.replacer)


In [20]:
# Build the audio batch generator from the two metadata tables and the
# feature-extraction settings defined in the parameters cell.
audio_gen = AM.AudioGenerator(
    train_meta,
    valid_meta,
    minibatch_size=MIN_BATCH_SIZE,
    window=WINDOW,
    step=STEP,
    max_freq=MAX_FREQ,
    mfcc_dim=MFCC_DIME,
)

# Load both splits into the generator before any training or prediction.
audio_gen.load_train_data()
audio_gen.load_validation_data()

In [21]:
# Build the acoustic model via AM.model_1.
# input_dim is tied to MFCC_DIME (the same constant given to AudioGenerator as
# mfcc_dim) so the model's input width cannot drift from the generated features.
# output_dim is one more than the number of entries in AM.char_map
# (presumably the extra class is the CTC blank label -- TODO confirm).
model = AM.model_1(input_dim=MFCC_DIME,
                units=5,
                activation='relu',
                output_dim=len(AM.char_map)+1)

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
the_input (InputLayer)       [(None, None, 13)]        0         
_________________________________________________________________
rnn (GRU)                    (None, None, 5)           300       
_________________________________________________________________
batch_normalization_1 (Batch (None, None, 5)           20        
_________________________________________________________________
time_distributed_1 (TimeDist (None, None, 223)         1338      
_________________________________________________________________
softmax (Activation)         (None, None, 223)         0         
Total params: 1,658
Trainable params: 1,648
Non-trainable params: 10
_________________________________________________________________
None


In [22]:
# Train the model on the generator's data using the run settings from the
# parameters cell.
AM.train(
    audio_gen,
    input_to_softmax=model,
    model_name=MODEL_NAME,
    epochs=EPOCHS,
    minibatch_size=MIN_BATCH_SIZE,
)

Epoch 1/5
40/40 - 273s - loss: 9914.3125 - val_loss: 9820.0371
Epoch 2/5
40/40 - 227s - loss: 9334.2529 - val_loss: 9163.5469
Epoch 3/5
40/40 - 222s - loss: 8526.9434 - val_loss: 8111.6938
Epoch 4/5
40/40 - 220s - loss: 7464.6177 - val_loss: 6501.9727
Epoch 5/5
40/40 - 218s - loss: 6256.3687 - val_loss: 5185.6133


In [25]:
# Inspect the model on a single utterance from the training partition.
sample_index = 14
partition = 'train'

# The recorded output of this cell shows a Truth / Predicted / wer printout.
AM.predict(audio_gen, sample_index, partition, model)

# Keep only the fourth value returned by predict_raw; the first three are discarded.
_, _, _, raw_pred = AM.predict_raw(audio_gen, sample_index, partition, model)
# Disabled: would prepend a header row of the char_map keys (ordered by their
# mapped value) plus 'BLANK' to raw_pred.
#raw_pred_char = np.vstack([sorted(AM.char_map.keys(), key=lambda k: AM.char_map[k]) + ['BLANK'], raw_pred])

Truth: አንቺ ልጅ ያ የ ሽውን ነገር ሁሉ ለ ማግኘት አት ጓጉ
Predicted: ሀዩኡዩኡዩኡዩኡዩኡዩኡዩኡዩኡዩኡዩኡዩኡዩኡዩኡዩኡዩኡዩኡዩኡዩኡዩኡዩኡዩኡዩኡዩኡዩኡዩኡዩኡዩኡዩኡዩኡዩኡዩኡዩኡዩኡዩኡኖኡኖኡቶዎቶጆኖኡኖረኖፆዢፆፑኖኛሱቶሱሞሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀቦሀ
wer: 34
