<a href="https://colab.research.google.com/github/GreihMurray/KriolTranscriber/blob/master/lstm_chars.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Colab setup: install the packages this notebook needs beyond the preinstalled ones.
# `evaluate` provides the `load` function imported below; `jiwer` is presumably the
# backend of the 'cer'/'wer' metrics loaded in evaluate() — TODO confirm.
# NOTE(review): versions are not pinned, so a fresh run may resolve different releases.
!pip install evaluate
!pip install jiwer

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting evaluate
  Downloading evaluate-0.3.0-py3-none-any.whl (72 kB)
[K     |████████████████████████████████| 72 kB 1.4 MB/s 
Collecting huggingface-hub>=0.7.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 28.2 MB/s 
Collecting datasets>=2.0.0
  Downloading datasets-2.7.1-py3-none-any.whl (451 kB)
[K     |████████████████████████████████| 451 kB 68.2 MB/s 
Collecting xxhash
  Downloading xxhash-3.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 68.0 MB/s 
[?25hCollecting multiprocess
  Downloading multiprocess-0.70.14-py38-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 72.7 MB/s 
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1
  Downloa

In [None]:
from keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from keras.utils.vis_utils import plot_model
import tensorflow as tf
from tensorflow.keras import activations
import numpy as np
from evaluate import load
import itertools
from scipy.io import wavfile
from keras import backend as K
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional, Reshape, GRU, Flatten, Activation

In [None]:
import math, random
import torch
import torchaudio
from torchaudio import transforms
import pandas as pd
from tqdm import tqdm
import os
import unicodedata
import re
from keras.models import load_model

In [None]:
# Mount Google Drive at /content/gdrive; DEFAULT_DIR and the model checkpoint
# paths used further down all live under this mount point.
from google.colab import drive

drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Project root on the mounted drive. The loaders append 'html/<split>/' and
# 'audio/<split>/' to it, so the trailing slash is required.
DEFAULT_DIR = '/content/gdrive/MyDrive/Colab_Notebooks/NLP/project/'
# Only used to derive the maximum clip length (SR//1000 * 15000 samples) in
# load_dataset() and pad_audio(). Presumably the audio sample rate in Hz —
# TODO confirm against the .wav files (wavfile.read's own rate is ignored).
SR = 44100

In [None]:
def load_html_data(dir_ext):
    """Extract lower-cased sentences from the HTML files of one data split.

    Every file in ``DEFAULT_DIR + 'html/' + dir_ext + '/'`` is read in sorted
    file-name order. ``<span class=...>`` and ``<a ...>`` elements are removed,
    then text is collected from two groups of ``<div>`` elements: first those
    with class ``mt``, ``mt2`` or ``ip``, then those matched by the class
    pattern ``[p,s]``. Each group is stripped of a fixed set of symbols and
    split into sentences on ``.``, ``,``, ``?``, ``!`` and newlines.

    Args:
        dir_ext: Name of the sub-directory under ``html/`` (the notebook
            passes ``'train'`` or ``'test'``).

    Returns:
        list[str]: All non-empty, stripped, lower-cased sentences in file
        order. An empty list is returned when the directory has no files
        (previously this raised ``UnboundLocalError``).
    """
    directory = DEFAULT_DIR + 'html/' + dir_ext + '/'
    # Characters deleted outright before splitting (the class also contains a
    # literal comma because commas are used as separators inside it).
    symbols = r'[\,,@,#,$,%,^,&,*,(,),\[,\],\',\",;,:,“,”,‘,’]'
    # Sentence boundaries. Raw string avoids the invalid-escape warnings the
    # non-raw form produced; the compiled pattern is the same.
    boundaries = r'[\.,\?,!,\n]'

    all_data = []
    for name in tqdm(sorted(os.listdir(directory)), desc='Loading HTML Data'):
        with open(directory + name, 'r', encoding='UTF-8') as in_file:
            data = ' '.join(in_file.readlines())

        data = unicodedata.normalize('NFC', data)
        data = re.sub(r'<span class=.*?</span>', '', data)
        data = re.sub(r'<a.*?</a>', '', data)
        data = data.replace(u'\xa0', u' ')

        # First group: 'mt', 'mt2' and 'ip' divs, only processed when present.
        check_divs = re.findall(r'<div class=\'mt\'.*?>(.*?) </div>', data)
        check_divs.extend(re.findall(r'<div class=\'mt2\'.*?>(.*?) </div>', data))
        check_divs.extend(re.findall(r'<div class=\'ip\'>(.*)', data))
        if len(check_divs) > 0:
            full = '!'.join(check_divs)
            full = re.sub(symbols, '', full)
            full = re.sub(r'^\s+', ' ', full).strip('\u00A0')
            all_data.extend(s.strip() for s in re.split(boundaries, str(full)))

        # Second group: divs matched by the class pattern [p,s]. Joined with
        # '!' so that each div ends a sentence when split below.
        divs = re.findall(r'<div class=\'[p,s]\'.*?>(.*?) </div>', data)
        full_data = '!'.join(divs)
        full_data = re.sub(symbols, '', full_data)
        full_data = re.sub(' +', ' ', full_data)
        all_data.extend(s.strip() for s in re.split(boundaries, str(full_data)))

    # Filter once after all files are read instead of rebuilding the full
    # list on every iteration.
    return [row.lower() for row in all_data if len(row) >= 1]

In [None]:
def load_dataset(dir_ext):
    """Pair each audio clip of a split with a transcript sentence.

    Audio files in ``DEFAULT_DIR + 'audio/' + dir_ext + '/'`` are read in
    sorted file-name order. The clip at position ``i`` is paired with
    ``transcripts[i]`` from ``load_html_data(dir_ext)``; clips longer than
    ``SR//1000 * 15000`` samples are skipped (their transcript index is
    skipped with them).

    NOTE(review): the pairing is purely positional — it assumes the sorted
    audio files line up one-to-one with the extracted sentences; confirm.

    Args:
        dir_ext: Split sub-directory name (``'train'`` / ``'test'``).

    Returns:
        pandas.DataFrame with columns ``'audio'`` (sample array returned by
        ``wavfile.read``) and ``'transcription'`` (str).

    Raises:
        IndexError: if a kept clip has no transcript at the same position.
    """
    transcripts = load_html_data(dir_ext)
    directory = DEFAULT_DIR + 'audio/' + dir_ext + '/'
    # Loop-invariant length limit, in samples.
    max_len = SR // 1000 * 15000

    all_x = []
    all_y = []
    dir_files = sorted(os.listdir(directory))

    # Wrapping the list (not the enumerate object) lets tqdm show a total.
    for i, file in enumerate(tqdm(dir_files, desc='Loading Audio Data & Creating Dataset')):
        # The file's own sample rate is not used.
        _, data = wavfile.read(directory + file)

        if len(data) > max_len:
            continue

        if i >= len(transcripts):
            raise IndexError(
                'audio file #%d (%s) has no transcript: only %d sentences were extracted'
                % (i, file, len(transcripts))
            )

        all_x.append(data)
        all_y.append(transcripts[i])

    return pd.DataFrame(list(zip(all_x, all_y)), columns=['audio', 'transcription'])

In [None]:
def one_hot(data, test_data, map_use):
    """Encode every symbol of every sequence as an integer id.

    Despite the name, this returns integer ids (starting at 1), not one-hot
    vectors; the caller converts them afterwards.

    Args:
        data: Iterable of sequences (strings or tuples of characters).
        test_data: When falsy, a fresh mapping is built from ``data``. When
            truthy, ``map_use`` is used as the starting vocabulary.
        map_use: Existing ``{symbol: id}`` mapping for test data. It is
            extended in place with symbols it does not contain yet.
            ``None`` is treated as an empty mapping.

    Returns:
        tuple: ``(mapped, mapping)`` where ``mapped`` is a list of lists of
        ids and ``mapping`` is the ``{symbol: id}`` dict that was actually
        used (the new one for training data, ``map_use`` for test data).
    """
    if test_data:
        mapping = map_use if map_use is not None else {}
    else:
        mapping = {}

    # Continue numbering after the largest id already present, so symbols
    # first seen in test data never collide with ids from the training map.
    next_id = max(mapping.values(), default=0)

    mapped = []
    for sentence in data:
        cur_map = []
        for symbol in sentence:
            if symbol not in mapping:
                next_id += 1
                mapping[symbol] = next_id
            cur_map.append(mapping[symbol])
        mapped.append(cur_map)

    return mapped, mapping

In [None]:
def vectorize(data):
    """Collapse each sequence of one-hot rows into a single count vector.

    For every sequence in ``data`` the position of the first ``1`` in each
    row is counted. Slot 2 of the resulting vector is then overwritten with
    ``1`` regardless of its count.

    Args:
        data: Iterable of sequences; each sequence is a non-empty list of
            equally long rows that each contain a ``1``.

    Returns:
        list[list[int]]: One count vector per sequence.
    """
    def _count_hits(sequence):
        width = len(sequence[0])
        tally = [0] * width
        for one_hot_row in sequence:
            hot_index = list(one_hot_row).index(1)
            tally[hot_index] += 1
        # Slot 2 is always forced to 1, as in the original implementation.
        tally[2] = 1
        return tally

    return [_count_hits(sequence) for sequence in data]

In [None]:
def pad_audio(data):
    """Right-pad every clip shorter than the maximum length with zeros.

    The target length is ``SR//1000 * 15000`` samples. Short clips are
    replaced in ``data`` by a new array made of the clip followed by
    ``np.zeros`` (so padded clips become float arrays); clips that already
    have the target length or more are left untouched.

    Args:
        data: Indexable collection of sample arrays; modified in place.

    Returns:
        The same ``data`` object.
    """
    target_len = SR // 1000 * 15000

    for idx, clip in tqdm(enumerate(data), desc='Padding audio'):
        missing = target_len - len(clip)
        if missing > 0:
            filler = np.zeros(missing)
            data[idx] = np.array(np.append(clip, filler))

    return data

In [None]:
def get_min(audio):
    """Return the smallest sample over all clips, capped at 0 from above.

    The running minimum starts at 0, so the result is never positive.

    Args:
        audio: Iterable of non-empty sample sequences.

    Returns:
        The minimum sample value found, or ``0`` if no sample is negative
        (or ``audio`` is empty).
    """
    minim = 0

    for row in tqdm(audio, desc='Finding min'):
        # One vectorised reduction per clip instead of two passes with the
        # Python builtin over every sample.
        row_min = np.min(row)
        if row_min < minim:
            minim = row_min

    return minim

In [None]:
def get_max(audio):
    """Return the largest sample over all clips, capped at 0 from below.

    The running maximum starts at 0, so the result is never negative.

    Args:
        audio: Iterable of non-empty sample sequences.

    Returns:
        The maximum sample value found, or ``0`` if no sample is positive
        (or ``audio`` is empty).
    """
    maxim = 0

    for row in tqdm(audio, desc='Finding max'):
        # One vectorised reduction per clip instead of two passes with the
        # Python builtin over every sample.
        row_max = np.max(row)
        if row_max > maxim:
            maxim = row_max

    return maxim

In [None]:
def adjust_audio(audio, minim):
    """Shift every sample by ``abs(minim)`` and truncate it to a whole number.

    With ``minim`` being the global minimum from ``get_min`` this makes all
    samples non-negative.

    Each clip is processed as one float64 array operation instead of a
    Python loop over every sample (the captured run shows the loop taking
    about ten minutes for 306 clips). Working in float64 also avoids the
    int16 overflow the element-wise version could hit on integer clips or
    an integer ``minim``.

    Args:
        audio: Indexable collection of 1-D sample sequences; each element is
            replaced by its shifted float64 array.
        minim: Offset source; its absolute value is added to every sample.

    Returns:
        The same ``audio`` object.
    """
    offset = abs(float(minim))

    for i, row in enumerate(tqdm(audio, desc='Adjusting Audio')):
        # np.trunc mirrors the original int(...) truncation toward zero.
        audio[i] = np.trunc(np.asarray(row, dtype=np.float64) + offset)

    return audio

In [None]:
def advanced_relu(x):
    """ReLU activation whose output is clipped at 10,000,000."""
    ceiling = 10000000
    clipped = K.relu(x, max_value=ceiling)
    return clipped

In [None]:
def build_model(input_len, output_len, maxim):
    """Build and compile the audio-to-character network.

    Layer stack: ``Dense(output_len)`` with the clipped ReLU, an
    ``Embedding`` with 34 output dimensions, a bidirectional LSTM (50 units
    per direction, sequences returned) and a 34-way softmax ``Dense``. The
    model is compiled with Adam and categorical cross-entropy.

    The first layer's width now follows ``output_len`` instead of the
    hard-coded 321. The model emits one softmax per unit of that layer, so
    its width has to equal the target sequence length that callers pass as
    ``output_len``. Any run that trained successfully before therefore
    already had ``output_len == 321``.

    Args:
        input_len: Number of samples per (padded) audio clip.
        output_len: Target sequence length (characters per transcript).
        maxim: Unused; kept so existing call sites keep working.

    Returns:
        The compiled ``Sequential`` model.
    """
    model = Sequential()
    model.add(Dense(output_len, input_shape=(input_len,), activation=advanced_relu))
    # NOTE(review): an Embedding fed by a Dense layer, with a 9,999,999-row
    # table, is unusual — kept as-is so saved checkpoints stay compatible.
    model.add(Embedding(input_dim=9999999, output_dim=34))
    model.add(Bidirectional(LSTM(units=50, return_sequences=True, recurrent_dropout=0.1)))
    # 34 is the number of output classes; it is not derivable from the arguments.
    model.add(Dense(34, activation='softmax'))

    model.compile(optimizer="adam", metrics=["accuracy"], loss='categorical_crossentropy')

    return model

In [None]:
def load_and_process_data(folder_path, test_data=False, map_use=None):
    """Load one split and turn it into model-ready arrays.

    Transcripts are right-padded with ``'!'`` to a common length, encoded to
    integer ids by ``one_hot`` and expanded with ``to_categorical``; class 7
    is then zeroed everywhere in the targets. Audio clips are zero-padded,
    shifted by the absolute global minimum and stacked into one array.

    Args:
        folder_path: Split name passed on to ``load_dataset``.
        test_data: Forwarded to ``one_hot``.
        map_use: Forwarded to ``one_hot``.

    Returns:
        tuple: ``(clean_y, cat_y, padded_x, minim, maxim, mapping)`` — raw
        transcripts, categorical targets, stacked audio, the minimum found
        before shifting, the maximum found after shifting, and the mapping
        returned by ``one_hot``.
    """
    frame = load_dataset(folder_path)
    clean_y = list(frame['transcription'])

    # zip_longest pads the shorter transcripts; the outer zip transposes back
    # to one tuple of characters per transcript.
    char_columns = itertools.zip_longest(*clean_y, fillvalue='!')
    padded_y = list(zip(*char_columns))
    enc_y, mapping = one_hot(padded_y, test_data, map_use)
    cat_y = np.array(to_categorical(enc_y))

    padded_x = pad_audio(frame['audio'])
    minim = get_min(padded_x)
    padded_x = adjust_audio(padded_x, minim)
    maxim = get_max(padded_x)
    padded_x = np.stack(padded_x)

    # Clear class 7 in every timestep of every target sequence.
    cat_y[:, :, 7] = 0

    return clean_y, cat_y, padded_x, minim, maxim, mapping

In [None]:
def evaluate(model, clean_y, padded_x, mapping):
    """Decode the model's predictions and report CER / WER against references.

    For every sample the most probable class of each timestep is mapped back
    to a symbol, trailing spaces are removed and runs of spaces are
    collapsed. The number of predictions and both scores are printed.

    Two fixes compared with the earlier version:

    * Exactly one decoded string is produced per sample (an empty string if
      nothing but spaces was predicted). Previously such samples were
      dropped, which shifted every later prediction against the references.
    * Class 0, which ``one_hot`` never assigns, decodes to no character
      instead of wrapping around to the last vocabulary entry through a
      negative list index.

    Args:
        model: Object with a ``predict`` method returning, per sample, an
            array of per-timestep class scores.
        clean_y: Reference transcripts, in the same order as ``padded_x``.
        padded_x: Model inputs.
        mapping: ``{symbol: id}`` dict; id ``k`` is decoded as the ``k``-th
            key in insertion order.

    Returns:
        tuple: ``(cer_score, wer_score)``.
    """
    map_key = list(mapping.keys())

    preds = model.predict(padded_x)

    print(len(preds))

    all_words = []
    for pred in preds:
        # First maximum per timestep, same tie-breaking as list.index(max(...)).
        class_ids = np.argmax(pred, axis=-1)
        chars = [map_key[idx - 1] if idx > 0 else '' for idx in class_ids]
        joined = ''.join(chars).rstrip(' ')
        all_words.append(re.sub(' +', ' ', joined))

    references = clean_y[:len(all_words)]

    cer = load('cer')
    cer_score = cer.compute(predictions=all_words, references=references)
    print('Character Error Rate:', cer_score)

    wer = load('wer')
    wer_score = wer.compute(predictions=all_words, references=references)
    print('Word Error Rate:', wer_score)

    return cer_score, wer_score

In [None]:
# Module-level placeholders. None of the functions visible in this notebook
# reads or writes them.
# NOTE(review): presumably scratch storage for keeping processed data between
# cell runs; `hold_padded` starts as the string 'NA' while the others start as
# empty lists — confirm whether they are still needed.
hold_padded = 'NA'
hold_test = []
hold_map = []
hold_y_test = []
hold_clean_y = []
hold_cat_y = []

In [None]:
def multi_run():
    """Train in 5-epoch rounds, scoring and checkpointing after each round.

    Loads the train and test splits (the test split reuses the training
    mapping), builds a new model and runs 19 rounds labelled 5, 10, ..., 95.
    After every round the model is evaluated on both splits and saved under
    the checkpoint prefix plus the round label. All collected
    ``(label, (train_cer, train_wer), (test_cer, test_wer))`` tuples are
    printed at the end.
    """
    print('\033[95m' + 'LOADING TRAINING DATA\n')
    train_text, train_targets, train_audio, _, peak, char_ids = load_and_process_data('train')
    print('\n\n', '\033[95m' + 'LOADING TESTING DATA\n', sep='')
    test_text, _, test_audio, _, _, _ = load_and_process_data('test', test_data=True, map_use=char_ids)

    checkpoint_prefix = '/content/gdrive/MyDrive/Colab_Notebooks/NLP/project/lstm_model_big_'

    net = build_model(len(train_audio[0]), len(train_targets[0]), peak)

    history = []
    for epochs_done in range(5, 100, 5):
        net.fit(train_audio, train_targets, epochs=5, verbose=1, batch_size=1)

        train_cer, train_wer = evaluate(net, train_text, train_audio, char_ids)
        held_out_cer, held_out_wer = evaluate(net, test_text, test_audio, char_ids)
        history.append((epochs_done, (train_cer, train_wer), (held_out_cer, held_out_wer)))

        net.save(checkpoint_prefix + str(epochs_done))

    for record in history:
        print(record)

In [None]:
# Run the full train / evaluate / checkpoint loop defined in multi_run().
multi_run()

[95mLOADING TRAINING DATA



Loading HTML Data: 100%|██████████| 290/290 [00:04<00:00, 67.31it/s] 
  sr, data = wavfile.read(file)
Loading Audio Data & Creating Dataset: 356it [00:06, 52.44it/s] 
Padding audio: 306it [00:00, 461.11it/s]
Finding min: 100%|██████████| 306/306 [00:17<00:00, 17.87it/s]
Adjusting Audio: 306it [09:55,  1.95s/it]
Finding max: 100%|██████████| 306/306 [00:16<00:00, 18.35it/s]




[95mLOADING TESTING DATA



Loading HTML Data: 100%|██████████| 3/3 [00:01<00:00,  1.82it/s]
Loading Audio Data & Creating Dataset: 51it [00:25,  2.01it/s]
Padding audio: 41it [00:00, 694.39it/s]
Finding min: 100%|██████████| 41/41 [00:02<00:00, 16.11it/s]
Adjusting Audio: 41it [01:19,  1.94s/it]
Finding max: 100%|██████████| 41/41 [00:02<00:00, 17.50it/s]


Epoch 1/5




Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
306


Downloading builder script:   0%|          | 0.00/5.60k [00:00<?, ?B/s]

Character Error Rate: 0.9786222402417898


Downloading builder script:   0%|          | 0.00/4.49k [00:00<?, ?B/s]

Word Error Rate: 1.0
41
Character Error Rate: 0.978601997146933
Word Error Rate: 1.0
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
306
Character Error Rate: 0.9769636209502046
Word Error Rate: 0.9935458634852338
41
Character Error Rate: 0.9789586305278174
Word Error Rate: 1.0
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
306
Character Error Rate: 0.9728723600309609
Word Error Rate: 0.9888519460199492
41
Character Error Rate: 0.9768188302425107
Word Error Rate: 1.0
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
306
Character Error Rate: 0.8724706055803325
Word Error Rate: 0.9884607862311754
41
Character Error Rate: 0.9629101283880172
Word Error Rate: 0.9980353634577603
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
306
Character Error Rate: 0.7711842541741919
Word Error Rate: 0.9880696264424017
41
Character Error Rate: 0.9033523537803139
Word Error Rate: 1.0
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
306
Character Error Rate: 0.6945191846964727
Word Error Rate: 0.98

In [None]:
def main(train=False, model=''):
    """Optionally train a model for 5 more epochs, then score it on both splits.

    Args:
        train: When true, the model is fitted for 5 epochs on the training
            split and saved as ``lstm_model_20e`` in the project folder
            before evaluation. When false, the model is only loaded.
        model: Path of a saved model to load. With ``train=True`` and an
            empty path a fresh model is built with ``build_model`` instead
            (previously this combination failed inside ``load_model('')``).
            With ``train=False`` a valid path is required.

    Returns:
        None. Scores are printed by ``evaluate`` — first for the test split,
        then, after the line ``Train set``, for the training split.
    """
    print('\033[95m' + 'LOADING TRAINING DATA\n')
    clean_y, cat_y, padded_x, minim, maxim, mapping = load_and_process_data('train')
    print('\n\n', '\033[95m' + 'LOADING TESTING DATA\n', sep='')
    y_test, _, x_test, _, _, _ = load_and_process_data('test', test_data=True, map_use=mapping)

    # Keep the path argument and the model object under separate names.
    if train:
        if model:
            net = load_model(model)
        else:
            net = build_model(len(padded_x[0]), len(cat_y[0]), maxim)
        plot_model(net, show_shapes=True)

        net.fit(padded_x, cat_y, epochs=5, verbose=1, batch_size=1)
        net.save('/content/gdrive/MyDrive/Colab_Notebooks/NLP/project/lstm_model_20e')
    else:
        net = load_model(model)

    evaluate(net, y_test, x_test, mapping)

    print("Train set")

    evaluate(net, clean_y, padded_x, mapping)

In [None]:
# Load the saved 'lstm_model_15e' checkpoint, train it for 5 more epochs
# (main() saves the result as 'lstm_model_20e'), then score test and train splits.
main(train=True, model='/content/gdrive/MyDrive/Colab_Notebooks/NLP/project/lstm_model_15e')

[95mLOADING TRAINING DATA



Loading HTML Data: 100%|██████████| 290/290 [00:02<00:00, 115.27it/s]
  sr, data = wavfile.read(file)
Loading Audio Data & Creating Dataset: 356it [00:01, 199.10it/s]
Padding audio: 306it [00:03, 97.18it/s]
Finding min: 100%|██████████| 306/306 [00:14<00:00, 20.83it/s]
Adjusting Audio: 306it [10:43,  2.10s/it]
Finding max: 100%|██████████| 306/306 [00:14<00:00, 21.39it/s]




[95mLOADING TESTING DATA



Loading HTML Data: 100%|██████████| 3/3 [00:00<00:00, 362.51it/s]
Loading Audio Data & Creating Dataset: 51it [00:00, 253.53it/s]
Padding audio: 41it [00:00, 206.92it/s]
Finding min: 100%|██████████| 41/41 [00:02<00:00, 19.08it/s]
Adjusting Audio: 41it [01:32,  2.25s/it]
Finding max: 100%|██████████| 41/41 [00:01<00:00, 20.76it/s]


Epoch 1/5




Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5




41
Character Error Rate: 0.8209700427960057
Word Error Rate: 1.0412573673870333
Train set
306
Character Error Rate: 0.7161151824709453
Word Error Rate: 0.9929286977018268


In [None]:
# NOTE(review): `hold_test` is initialised to [] earlier and never reassigned in
# the cells shown here, yet the saved output below is a populated array —
# presumably filled by a cell that is no longer in the notebook. On a fresh
# kernel this cell would display []; confirm and either restore the assignment
# or drop the cell.
hold_test

array([[26478., 26483., 26489., ..., 26478., 26478., 26478.],
       [26544., 26540., 26535., ..., 26478., 26478., 26478.],
       [26432., 26449., 26473., ..., 26478., 26478., 26478.],
       ...,
       [26453., 26452., 26451., ..., 26478., 26478., 26478.],
       [26468., 26443., 26453., ..., 26478., 26478., 26478.],
       [26480., 26481., 26479., ..., 26478., 26478., 26478.]])

In [None]:
# 10 epochs CER - 0.9186 WER - 0.9918
# 15 epochs CER - 0.8466 WER - 1.0059
# 20 epochs CER - 0.8219 WER - 1.0412
# 45 epochs CER - 0.8638 WER - 1.0 