<a href="https://colab.research.google.com/github/GreihMurray/KriolTranscriber/blob/master/lstm_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install evaluate
!pip install jiwer

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting evaluate
  Downloading evaluate-0.3.0-py3-none-any.whl (72 kB)
[K     |████████████████████████████████| 72 kB 611 kB/s 
Collecting xxhash
  Downloading xxhash-3.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 9.8 MB/s 
Collecting multiprocess
  Downloading multiprocess-0.70.14-py38-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 52.0 MB/s 
Collecting datasets>=2.0.0
  Downloading datasets-2.7.1-py3-none-any.whl (451 kB)
[K     |████████████████████████████████| 451 kB 52.0 MB/s 
[?25hCollecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting huggingface-hub>=0.7.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 48.8 MB/s 
Collecting urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1
  Download

In [None]:
from keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from keras.utils.vis_utils import plot_model
import tensorflow as tf
from tensorflow.keras import activations
import numpy as np
import itertools
from scipy.io import wavfile
from evaluate import load
from keras import backend as K
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional, Reshape, GRU, Flatten, Activation

In [None]:
import math, random
import torch
import torchaudio
from torchaudio import transforms
import pandas as pd
from tqdm import tqdm
import os
import unicodedata
import re

from keras.models import load_model

In [None]:
from google.colab import drive

# Mount Google Drive at /content/gdrive; DEFAULT_DIR below points inside this mount.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Project root on the mounted Drive; the loaders append 'html/<split>/' and
# 'audio/<split>/' to it.
DEFAULT_DIR = '/content/gdrive/MyDrive/Colab_Notebooks/NLP/project/'
# Sample-rate constant used to derive the maximum clip length
# (SR//1000 * 15000 samples). The rate stored in each WAV file is not checked
# against it.
SR = 44100

In [None]:
def load_html_data(dir_ext):
    """Extract lower-cased text fragments from the HTML transcripts of one split.

    Every file in ``DEFAULT_DIR + 'html/' + dir_ext + '/'`` is read in sorted
    filename order. ``<span class=...>`` and ``<a ...>`` elements are removed,
    then the contents of the ``mt`` / ``mt2`` / ``ip`` divs and of the divs
    matching ``class='[p,s]'`` are collected, stripped of punctuation and split
    into fragments on ``.``, ``,``, ``?``, ``!`` and newlines.

    Args:
        dir_ext: Name of the sub-directory under ``html/`` to read.

    Returns:
        list[str]: Non-empty, stripped, lower-cased fragments in file order.
        Within a file the ``mt`` / ``mt2`` / ``ip`` fragments come before the
        ``[p,s]`` fragments. An empty directory yields an empty list.
    """
    # Characters deleted from the extracted text before it is split.
    strip_pattern = r'[\,,@,#,$,%,^,&,*,(,),\[,\],\',\",;,:,“,”,‘,’]'
    # Fragment delimiters.
    split_pattern = r'[\.,\?,!,\n]'

    directory = DEFAULT_DIR + 'html/' + dir_ext + '/'
    all_data = []

    for file in tqdm(sorted(os.listdir(directory)), desc='Loading HTML Data'):
        with open(directory + file, 'r', encoding='UTF-8') as in_file:
            data = ' '.join(in_file.readlines())

        data = unicodedata.normalize('NFC', data)
        data = re.sub(r'<span class=.*?</span>', '', data)
        data = re.sub(r'<a.*?</a>', '', data)
        # Non-breaking spaces would otherwise defeat the ' </div>' anchors below.
        data = data.replace(u'\xa0', u' ')

        check_divs = re.findall(r'<div class=\'mt\'.*?>(.*?) </div>', data)
        check_divs.extend(re.findall(r'<div class=\'mt2\'.*?>(.*?) </div>', data))
        check_divs.extend(re.findall(r'<div class=\'ip\'>(.*)', data))

        if len(check_divs) > 0:
            # '!' is one of the split delimiters, so joining on it keeps the
            # individual divs separate after the split.
            full = '!'.join(check_divs)
            full = re.sub(strip_pattern, '', full)
            full = re.sub(r'^\s+', ' ', full).strip('\u00A0')
            all_data.extend(s.strip() for s in re.split(split_pattern, full))

        divs = re.findall(r'<div class=\'[p,s]\'.*?>(.*?) </div>', data)
        full_data = '!'.join(divs)
        full_data = re.sub(strip_pattern, '', full_data)
        full_data = re.sub(' +', ' ', full_data)
        all_data.extend(s.strip() for s in re.split(split_pattern, full_data))

    # Filter once, after all files are read: rebuilding this list per file was
    # quadratic and left the result undefined for an empty directory.
    return [fragment.lower() for fragment in all_data if len(fragment) >= 1]

In [None]:
def load_dataset(dir_ext):
    """Pair the audio clips of one split with their transcript fragments.

    WAV files in ``DEFAULT_DIR + 'audio/' + dir_ext + '/'`` are read in sorted
    filename order. The clip at position ``i`` of that listing is paired with
    ``load_html_data(dir_ext)[i]``; the pairing is purely positional. Clips
    longer than ``SR//1000 * 15000`` samples are skipped.

    Args:
        dir_ext: Name of the sub-directory (under both ``html/`` and
            ``audio/``) to load.

    Returns:
        pandas.DataFrame: One row per kept clip, with columns ``audio`` (the
        sample array returned by ``wavfile.read``) and ``transcription``.

    Raises:
        IndexError: If a kept clip's position has no corresponding transcript.
    """
    transcripts = load_html_data(dir_ext)
    directory = DEFAULT_DIR + 'audio/' + dir_ext + '/'
    dir_files = sorted(os.listdir(directory))

    # Longest clip that is kept; pad_audio pads up to this same length.
    max_len = SR//1000 * 15000

    all_x = []
    all_y = []

    progress = tqdm(enumerate(dir_files), total=len(dir_files),
                    desc='Loading Audio Data & Creating Dataset')
    for i, file in progress:
        # The sample rate stored in the file is read but not used.
        _, data = wavfile.read(directory + file)

        if len(data) > max_len:
            continue

        all_x.append(data)
        all_y.append(transcripts[i])

    return pd.DataFrame(list(zip(all_x, all_y)), columns=['audio', 'transcription'])

In [None]:
def one_hot(data, test_data, map_use):
    """Replace every symbol of every sequence by an integer index.

    Despite its name this function returns integer indices, not one-hot
    vectors. Indices start at 1 and are handed out in order of first
    appearance.

    Args:
        data: Iterable of sequences (strings or tuples of characters).
        test_data: When false, a fresh mapping is built. When true, ``map_use``
            is reused so that known symbols keep their index.
        map_use: Existing ``{symbol: index}`` dict, used only when
            ``test_data`` is true. It is extended in place with any symbol it
            does not contain yet. ``None`` is treated as an empty mapping.

    Returns:
        tuple: ``(mapped, mapping)`` where ``mapped`` is a list with one list
        of indices per input sequence and ``mapping`` is the dict that was
        used (the fresh one, or ``map_use`` itself in test mode).
    """
    if test_data and map_use is not None:
        mapping = map_use
    else:
        mapping = {}

    # Continue numbering after the largest index already in use, so unseen
    # symbols can never collide with an existing entry.
    next_index = max(mapping.values(), default=0)

    mapped = []
    for sentence in data:
        cur_map = []
        for symbol in sentence:
            if symbol not in mapping:
                next_index += 1
                mapping[symbol] = next_index
            cur_map.append(mapping[symbol])
        mapped.append(cur_map)

    return mapped, mapping

In [None]:
def vectorize(data):
    """Collapse each sequence of one-hot vectors into a single count vector.

    For every row, the position of the first ``1`` in each one-hot vector is
    counted. Slot 2 of the resulting count vector is then overwritten with 1,
    regardless of how often it occurred.

    Args:
        data: Iterable of rows; each row is a non-empty sequence of one-hot
            vectors of equal length (at least 3).

    Returns:
        list[list[int]]: One count vector per row.
    """
    def tally(one_hot_rows):
        counts = [0 for _ in one_hot_rows[0]]
        for encoded in one_hot_rows:
            hot = list(encoded).index(1)
            counts[hot] = counts[hot] + 1
        # Slot 2 is pinned to 1 whatever its real count was.
        counts[2] = 1
        return counts

    return [tally(row) for row in data]

In [None]:
def pad_audio(data):
    """Zero-pad every clip shorter than ``SR//1000 * 15000`` samples.

    Short clips are replaced, inside ``data`` itself, by a new array consisting
    of the clip followed by zeros. Clips that already have the target length
    (or more) are left as they are.

    Args:
        data: Indexable collection of sample arrays; modified in place.

    Returns:
        The same ``data`` object.
    """
    target_len = SR//1000 * 15000

    for idx, clip in tqdm(enumerate(data), desc='Padding audio'):
        shortfall = target_len - len(clip)
        if shortfall > 0:
            data[idx] = np.append(clip, np.zeros(shortfall))

    return data

In [None]:
def get_min(audio):
    """Return the smallest sample over all clips, but never more than 0.

    The running minimum starts at 0, so data without negative samples yields 0.

    Args:
        audio: Iterable of sample arrays.

    Returns:
        The smallest value found, or 0 if no value is below 0.
    """
    minim = 0

    for row in tqdm(audio, desc='Finding min'):
        # One vectorised reduction per clip instead of two Python-level scans.
        row_min = np.min(row)
        if row_min < minim:
            minim = row_min

    return minim

In [None]:
def get_max(audio):
    """Return the largest sample over all clips, but never less than 0.

    The running maximum starts at 0, so data without positive samples yields 0.

    Args:
        audio: Iterable of sample arrays.

    Returns:
        The largest value found, or 0 if no value is above 0.
    """
    maxim = 0

    for row in tqdm(audio, desc='Finding max'):
        # One vectorised reduction per clip instead of two Python-level scans.
        row_max = np.max(row)
        if row_max > maxim:
            maxim = row_max

    return maxim

In [None]:
def adjust_audio(audio, minim):
    """Shift every sample up by ``abs(minim)`` and truncate it to a whole number.

    Each clip in ``audio`` is replaced, inside ``audio`` itself, by a float64
    array holding the shifted values. Doing the shift per clip with NumPy
    avoids a Python-level loop over every single sample.

    Args:
        audio: Indexable collection of sample arrays; modified in place.
        minim: Offset reference, normally the value returned by ``get_min``.

    Returns:
        The same ``audio`` object.
    """
    # Convert first: abs() of the most negative int16 would overflow.
    offset = abs(float(minim))

    for i, row in tqdm(enumerate(audio), desc='Adjusting Audio'):
        # np.trunc mirrors the int() truncation of the per-sample version.
        audio[i] = np.trunc(np.asarray(row, dtype=np.float64) + offset)

    return audio

In [None]:
def advanced_relu(x):
    """Apply ``K.relu`` to ``x`` with ``max_value`` set to 500000."""
    ceiling = 500000
    return K.relu(x, max_value=ceiling)

In [None]:
def build_model(input_len, output_len, maxim, num_classes=34):
    """Build and compile the Dense -> Embedding -> BiLSTM -> softmax model.

    Args:
        input_len: Number of samples in one (padded) audio clip.
        output_len: Number of output time steps, i.e. the width of the first
            Dense layer. The callers in this notebook pass ``len(cat_y[0])``.
            This replaces a hard-coded 321.
        maxim: Accepted for call compatibility; not used by the model.
        num_classes: Width of the final softmax layer. Defaults to 34, the
            previously hard-coded value.

    Returns:
        A compiled Keras ``Sequential`` model (Adam optimizer, categorical
        cross-entropy loss, accuracy metric).
    """
    model = Sequential()
    model.add(Dense(output_len, input_shape=(input_len,), activation=advanced_relu))
    # NOTE(review): advanced_relu caps activations at 500000 while input_dim is
    # 499999, so the largest activations would fall outside the embedding
    # table -- confirm which of the two values is intended.
    model.add(Embedding(input_dim=499999, output_dim=34))
    model.add(Bidirectional(LSTM(units=50, return_sequences=True, recurrent_dropout=0.1)))
    model.add(Dense(num_classes, activation='softmax'))

    model.compile(optimizer="adam", metrics=["accuracy"], loss='categorical_crossentropy')

    return model

In [None]:
def load_and_process_data(folder_path, test_data=False, map_use=None):
    """Load one split and turn it into model-ready arrays.

    Transcripts are right-padded with ``'!'`` to a common length,
    index-encoded with ``one_hot`` and expanded with ``to_categorical``.
    Audio clips are zero-padded, shifted by the absolute value of their global
    minimum and stacked into one array.

    Args:
        folder_path: Split name passed on to ``load_dataset``.
        test_data: Passed on to ``one_hot``; true to reuse ``map_use``.
        map_use: Existing symbol-to-index mapping, passed on to ``one_hot``.

    Returns:
        tuple: ``(clean_y, cat_y, padded_x, minim, maxim, mapping)`` -- the raw
        transcript strings, the categorical targets, the stacked audio array,
        the minimum found before shifting, the maximum found after shifting,
        and the mapping returned by ``one_hot``.
    """
    df = load_dataset(folder_path)
    clean_y = list(df['transcription'])

    # zip_longest pads the transposed characters; zipping back gives one
    # equal-length tuple of characters per transcript.
    padded_y = list(zip(*itertools.zip_longest(*clean_y, fillvalue='!')))
    enc_y, mapping = one_hot(padded_y, test_data, map_use)
    cat_y = np.array(to_categorical(enc_y))

    # Work on a plain list so the helpers do not write back into a DataFrame
    # column.
    padded_x = pad_audio(list(df['audio']))

    minim = get_min(padded_x)
    padded_x = adjust_audio(padded_x, minim)
    maxim = get_max(padded_x)

    padded_x = np.stack(padded_x)

    # Zero class 7 at every time step of every sample.
    # NOTE(review): which symbol has index 7 is not visible here -- presumably
    # the padding character; confirm.
    cat_y[:, :, 7] = 0

    return clean_y, cat_y, padded_x, minim, maxim, mapping

In [None]:
def evaluate(model, clean_y, padded_x, mapping):
    """Decode the model's predictions and print character/word error rates.

    For every sample the most probable class per time step is decoded to a
    character, trailing spaces are removed and runs of spaces are collapsed.
    Exactly one prediction is produced per sample, so prediction ``k`` is
    always scored against ``clean_y[k]``. Previously, predictions consisting
    only of spaces after the first character were dropped, which shifted every
    later prediction onto the wrong reference.

    Args:
        model: Object with a ``predict`` method returning, per sample, one
            score vector per time step.
        clean_y: Reference transcripts, in the same order as ``padded_x``.
        padded_x: Model input.
        mapping: ``{symbol: index}`` dict. Class ``k`` is decoded to the
            ``k-1``-th key in insertion order.

    Returns:
        tuple: ``(cer_score, wer_score)``.
    """
    map_key = list(mapping.keys())

    preds = model.predict(padded_x)

    print(len(preds))

    all_words = []
    for pred in preds:
        class_ids = np.argmax(pred, axis=-1)
        # NOTE(review): class 0 decodes to map_key[-1], i.e. the last symbol,
        # as before -- confirm whether class 0 should decode to nothing.
        decoded = ''.join(map_key[idx - 1] for idx in class_ids)
        all_words.append(re.sub(' +', ' ', decoded.rstrip(' ')))

    references = clean_y[:len(all_words)]

    cer = load('cer')
    cer_score = cer.compute(predictions=all_words, references=references)
    print('Character Error Rate:', cer_score)

    wer = load('wer')
    wer_score = wer.compute(predictions=all_words, references=references)
    print('Word Error Rate:', wer_score)

    return cer_score, wer_score

In [None]:
def multi_run():
    """Train one model in rounds, evaluating and saving it after every round.

    A model is built once and then fitted repeatedly. After each round it is
    evaluated on the training and test splits and saved under
    ``DEFAULT_DIR + 'lstm_model_500000_<n>'``, where ``<n>`` is the cumulative
    number of epochs trained so far. All collected metrics are printed at the
    end as ``(n, (train_cer, train_wer), (test_cer, test_wer))`` tuples.
    """
    print('\033[95m' + 'LOADING TRAINING DATA\n')
    clean_y, cat_y, padded_x, minim, maxim, mapping = load_and_process_data('train')
    print('\n\n', '\033[95m' + 'LOADING TESTING DATA\n', sep='')
    y_test, _, x_test, _, _, _ = load_and_process_data('test', test_data=True, map_use=mapping)

    base_model_name = DEFAULT_DIR + 'lstm_model_500000_'

    # The loop step and the epochs per fit() call share one value so that the
    # checkpoint suffix always equals the cumulative epoch count.
    epochs_per_round = 5
    total_epochs_limit = 100

    all_metrics = []

    model = build_model(len(padded_x[0]), len(cat_y[0]), maxim)

    for i in range(epochs_per_round, total_epochs_limit, epochs_per_round):
        model.fit(padded_x, cat_y, epochs=epochs_per_round, verbose=1, batch_size=1)

        cer, wer = evaluate(model, clean_y, padded_x, mapping)
        test_cer, test_wer = evaluate(model, y_test, x_test, mapping)

        all_metrics.append((i, (cer, wer), (test_cer, test_wer)))

        model.save(base_model_name + str(i))

    for row in all_metrics:
        print(row)

In [None]:
# BIG
# NOTE(review): what "BIG" refers to is not evident from the code -- presumably
# it labels the model configuration used for this run; confirm.

multi_run()

[95mLOADING TRAINING DATA



Loading HTML Data: 100%|██████████| 290/290 [00:02<00:00, 125.69it/s]
  sr, data = wavfile.read(file)
Loading Audio Data & Creating Dataset: 356it [00:01, 238.06it/s]
Padding audio: 306it [00:02, 115.45it/s]
Finding min: 100%|██████████| 306/306 [00:22<00:00, 13.68it/s]
Adjusting Audio: 306it [10:52,  2.13s/it]
Finding max: 100%|██████████| 306/306 [00:14<00:00, 21.12it/s]




[95mLOADING TESTING DATA



Loading HTML Data: 100%|██████████| 3/3 [00:00<00:00,  5.29it/s]
Loading Audio Data & Creating Dataset: 51it [00:10,  4.73it/s]
Padding audio: 41it [00:00, 157.42it/s]
Finding min: 100%|██████████| 41/41 [00:02<00:00, 19.21it/s]
Adjusting Audio: 41it [01:23,  2.04s/it]
Finding max: 100%|██████████| 41/41 [00:01<00:00, 20.71it/s]


Epoch 1/5




Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
306
Character Error Rate: 0.97747963584092
Word Error Rate: 1.0
41
Character Error Rate: 0.9554208273894437
Word Error Rate: 1.0
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
306
Character Error Rate: 0.9777745014927574
Word Error Rate: 1.0
41
Character Error Rate: 0.9586305278174037
Word Error Rate: 1.0
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
306
Character Error Rate: 0.9795436954037816
Word Error Rate: 1.0
41
Character Error Rate: 0.9661198288159771
Word Error Rate: 1.0
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
306
Character Error Rate: 0.9793962625778629
Word Error Rate: 1.0
41
Character Error Rate: 0.9661198288159771
Word Error Rate: 1.0
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
306
Character Error Rate: 0.9785116656223508
Word Error Rate: 1.0
41
Character Error Rate: 0.9661198288159771
Word Error Rate: 1.0
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
306
Character Error Rate: 0.735136928237072
Word Error Ra

In [None]:
# MAX (should be 5,000,000)
# NOTE(review): presumably refers to the activation cap in advanced_relu, which
# is 500000 in this notebook -- confirm.

multi_run()

[95mLOADING TRAINING DATA



Loading HTML Data: 100%|██████████| 290/290 [00:02<00:00, 116.91it/s]
  sr, data = wavfile.read(file)
Loading Audio Data & Creating Dataset: 356it [00:03, 100.11it/s]
Padding audio: 306it [00:01, 162.67it/s]
Finding min: 100%|██████████| 306/306 [00:14<00:00, 20.72it/s]
Adjusting Audio: 306it [10:42,  2.10s/it]
Finding max: 100%|██████████| 306/306 [00:14<00:00, 21.39it/s]




[95mLOADING TESTING DATA



Loading HTML Data: 100%|██████████| 3/3 [00:00<00:00, 92.65it/s]
Loading Audio Data & Creating Dataset: 51it [00:00, 70.58it/s]
Padding audio: 41it [00:00, 148.94it/s]
Finding min: 100%|██████████| 41/41 [00:03<00:00, 11.71it/s]
Adjusting Audio: 41it [01:30,  2.21s/it]
Finding max: 100%|██████████| 41/41 [00:01<00:00, 21.10it/s]


/content/gdrive/MyDrive/Colab_Notebooks/NLP/project/lstm_model_max_5
306
Character Error Rate: 0.9805067429505517
Word Error Rate: 1.0
41
Character Error Rate: 0.9718259629101283
Word Error Rate: 1.0



/content/gdrive/MyDrive/Colab_Notebooks/NLP/project/lstm_model_max_10
306
Character Error Rate: 0.9723555702698229
Word Error Rate: 0.9982475355969331
41
Character Error Rate: 0.9625534950071327
Word Error Rate: 1.0



/content/gdrive/MyDrive/Colab_Notebooks/NLP/project/lstm_model_max_15
306
Character Error Rate: 0.9582760381880329
Word Error Rate: 0.9950020824656394
41
Character Error Rate: 0.9629101283880172
Word Error Rate: 1.0



/content/gdrive/MyDrive/Colab_Notebooks/NLP/project/lstm_model_max_20
306
Character Error Rate: 0.8887429150557411
Word Error Rate: 0.9852707006369427
41
Character Error Rate: 0.9561340941512125
Word Error Rate: 1.0



/content/gdrive/MyDrive/Colab_Notebooks/NLP/project/lstm_model_max_25
306
Character Error Rate: 0.7336625999778851
Word Error Rate: 0.964991

In [None]:
def main(train=False, model=''):
    """Optionally train a model, then evaluate it on the test and train splits.

    Args:
        train: When true, the model is fitted for 5 more epochs and saved to
            ``DEFAULT_DIR + 'lstm_model_max_10e'`` before evaluation.
        model: Path of a saved model to load. With ``train=True`` and an empty
            path a new model is built with ``build_model`` instead; this call
            previously always failed in ``load_model``.
    """
    print('\033[95m' + 'LOADING TRAINING DATA\n')
    clean_y, cat_y, padded_x, minim, maxim, mapping = load_and_process_data('train')
    print('\n\n', '\033[95m' + 'LOADING TESTING DATA\n', sep='')
    y_test, _, x_test, _, _, _ = load_and_process_data('test', test_data=True, map_use=mapping)

    if train:
        if model:
            net = load_model(model)
        else:
            net = build_model(len(padded_x[0]), len(cat_y[0]), maxim)
        display(plot_model(net, show_shapes=True))

        net.fit(padded_x, cat_y, epochs=5, verbose=1, batch_size=1)

        net.save(DEFAULT_DIR + 'lstm_model_max_10e')
    else:
        net = load_model(model)

    evaluate(net, y_test, x_test, mapping)
    print("train data")
    evaluate(net, clean_y, padded_x, mapping)

In [None]:
# Evaluation only: load the saved model at this path and score it on the test
# and train splits without further training.
main(train=False, model='/content/gdrive/MyDrive/Colab_Notebooks/NLP/project/lstm_model_max_80')

[95mLOADING TRAINING DATA



Loading HTML Data: 100%|██████████| 290/290 [00:02<00:00, 134.23it/s]
  sr, data = wavfile.read(file)
Loading Audio Data & Creating Dataset: 356it [00:01, 337.67it/s]
Padding audio: 306it [00:01, 235.15it/s]
Finding min: 100%|██████████| 306/306 [00:14<00:00, 21.51it/s]
Adjusting Audio: 306it [10:30,  2.06s/it]
Finding max: 100%|██████████| 306/306 [00:14<00:00, 21.32it/s]




[95mLOADING TESTING DATA



Loading HTML Data: 100%|██████████| 3/3 [00:00<00:00, 252.82it/s]
Loading Audio Data & Creating Dataset: 51it [00:00, 138.63it/s]
Padding audio: 41it [00:00, 121.69it/s]
Finding min: 100%|██████████| 41/41 [00:02<00:00, 19.33it/s]
Adjusting Audio: 41it [01:25,  2.07s/it]
Finding max: 100%|██████████| 41/41 [00:01<00:00, 21.03it/s]


41


Downloading builder script:   0%|          | 0.00/5.60k [00:00<?, ?B/s]

Character Error Rate: 0.8769614835948645


Downloading builder script:   0%|          | 0.00/4.49k [00:00<?, ?B/s]

Word Error Rate: 1.0019646365422397
train data
306
Character Error Rate: 0.23946776749843351
Word Error Rate: 0.5865441032661842


In [None]:
# 5 epochs CER - 0.9822 WER - 1.0