<a href="https://colab.research.google.com/github/GreihMurray/KriolTranscriber/blob/master/char_net.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting evaluate
  Downloading evaluate-0.3.0-py3-none-any.whl (72 kB)
[K     |████████████████████████████████| 72 kB 582 kB/s 
Collecting xxhash
  Downloading xxhash-3.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 10.4 MB/s 
[?25hCollecting multiprocess
  Downloading multiprocess-0.70.14-py38-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 19.6 MB/s 
[?25hCollecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting huggingface-hub>=0.7.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 30.3 MB/s 
[?25hCollecting datasets>=2.0.0
  Downloading datasets-2.7.1-py3-none-any.whl (451 kB)
[K     |████████████████████████████████| 451 kB 11.4 MB/s 
Collecting urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21

In [None]:
from keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import tensorflow as tf
import numpy as np
import itertools
from scipy.io import wavfile
from keras.models import load_model
from evaluate import load

In [None]:
import math, random
import torch
import torchaudio
from torchaudio import transforms
import pandas as pd
from tqdm import tqdm
import os
import unicodedata
import re

In [None]:
from google.colab import drive

drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Project root on the mounted Google Drive; the loaders append 'html/<split>/' and 'audio/<split>/' to it.
DEFAULT_DIR = '/content/gdrive/MyDrive/Colab_Notebooks/NLP/project/'
# Presumably the audio sample rate in Hz; the clip-length cap is derived from it as SR//1000 * 15000.
# NOTE(review): 44100//1000 truncates to 44, so that cap is 660000 samples — confirm this is intended.
SR = 44100

In [None]:
def load_html_data(dir_ext):
    """Collect lower-cased text fragments from the HTML files of one data split.

    Files in ``DEFAULT_DIR + 'html/' + dir_ext + '/'`` are processed in sorted
    filename order. For each file the text is NFC-normalised, ``<span class=...>``
    and ``<a ...>`` elements are removed, and non-breaking spaces become plain
    spaces. Divs with class ``mt``, ``mt2`` or ``ip`` are collected first, then
    the divs matched by the ``[p,s]`` class pattern. Each group is stripped of
    punctuation and split into fragments on sentence-ending characters and
    newlines.

    Args:
        dir_ext: Name of the sub-folder below ``html/`` to read, e.g. ``'train'``.

    Returns:
        list[str]: every non-empty fragment, stripped and lower-cased, in the
        order it was found. An empty list when the folder contains no files
        (this case used to raise ``UnboundLocalError``).
    """
    directory = DEFAULT_DIR + 'html/' + dir_ext + '/'

    # Raw strings keep the regex escapes away from Python's own escape handling
    # (the non-raw originals relied on invalid escape sequences); the patterns
    # match exactly what they matched before.
    strip_chars = r'[\,,@,#,$,%,^,&,*,(,),\[,\],\',\",;,:,“,”,‘,’]'
    sentence_breaks = r'[\.,\?,!,\n]'

    all_data = []

    for file in tqdm(sorted(os.listdir(directory)), desc='Loading HTML Data'):
        with open(directory + file, 'r', encoding='UTF-8') as in_file:
            data = ' '.join(in_file.readlines())

        data = unicodedata.normalize('NFC', data)
        data = re.sub(r'<span class=.*?</span>', '', data)
        data = re.sub(r'<a.*?</a>', '', data)
        data = data.replace(u'\xa0', u' ')

        # Heading-like divs are emitted before the body text of the same file.
        headings = re.findall(r'<div class=\'mt\'.*?>(.*?) </div>', data)
        headings.extend(re.findall(r'<div class=\'mt2\'.*?>(.*?) </div>', data))
        headings.extend(re.findall(r'<div class=\'ip\'>(.*)', data))

        if len(headings) > 0:
            full = '!'.join(headings)
            full = re.sub(strip_chars, '', full)
            # NOTE(review): only *leading* whitespace is collapsed here, whereas the
            # body text below collapses every run of spaces — confirm this is intended.
            full = re.sub(r'^\s+', ' ', full).strip('\u00A0')
            all_data.extend(s.strip() for s in re.split(sentence_breaks, full))

        # NOTE(review): the character class [p,s] also matches a literal comma.
        divs = re.findall(r'<div class=\'[p,s]\'.*?>(.*?) </div>', data)
        full_data = '!'.join(divs)
        full_data = re.sub(strip_chars, '', full_data)
        full_data = re.sub(' +', ' ', full_data)
        all_data.extend(s.strip() for s in re.split(sentence_breaks, full_data))

    # Filter once after the loop: the result is the same as rebuilding the list
    # after every file, without the quadratic cost, and it is defined even when
    # the folder is empty.
    return [fragment.lower() for fragment in all_data if len(fragment) >= 1]

In [None]:
def load_dataset(dir_ext):
    """Pair each sufficiently short audio clip of a split with a transcript line.

    Audio files in ``DEFAULT_DIR + 'audio/' + dir_ext + '/'`` are read in sorted
    filename order. The file at position ``i`` is paired with ``transcripts[i]``
    from ``load_html_data(dir_ext)``, so the pairing is purely positional.
    Clips longer than ``SR//1000 * 15000`` samples are skipped.

    Args:
        dir_ext: Split name used for both the ``html/`` and ``audio/`` folders.

    Returns:
        pandas.DataFrame with the columns ``'audio'`` (sample arrays as returned
        by ``wavfile.read``) and ``'transcription'`` (str).

    Raises:
        IndexError: if a kept audio file has no transcript at its position.
    """
    transcripts = load_html_data(dir_ext)
    directory = DEFAULT_DIR + 'audio/' + dir_ext + '/'
    dir_files = sorted(os.listdir(directory))

    # The length cap does not depend on the file, so compute it once.
    max_len = SR//1000 * 15000

    all_x = []
    all_y = []

    # enumerate() has no length, so pass the total explicitly to get a real progress bar.
    for i, file in tqdm(enumerate(dir_files), total=len(dir_files),
                        desc='Loading Audio Data & Creating Dataset'):
        # The file's own sample rate is ignored.
        # NOTE(review): max_len assumes every file is recorded at SR — confirm.
        _, data = wavfile.read(directory + file)

        if len(data) > max_len:
            continue

        if i >= len(transcripts):
            raise IndexError(
                'no transcript for audio file #%d (%s): only %d transcript lines were loaded'
                % (i, file, len(transcripts))
            )

        all_x.append(data)
        all_y.append(transcripts[i])

    return pd.DataFrame(list(zip(all_x, all_y)), columns=['audio', 'transcription'])

In [None]:
def one_hot(data):
    """Replace every symbol of every sequence by an integer code.

    Despite the name, no one-hot vectors are produced: each distinct symbol is
    given the next free integer, starting at 1, in order of first appearance
    across all sequences.

    Args:
        data: Iterable of sequences (strings, tuples, lists) of hashable symbols.

    Returns:
        tuple: ``(mapped, mapping)`` where ``mapped`` is a list holding one list
        of integer codes per input sequence and ``mapping`` is the
        symbol -> code dictionary that was built.
    """
    mapping = {}
    mapped = []

    for sentence in data:
        # setdefault returns the existing code, or stores and returns the next
        # free one; len(mapping) is evaluated before the symbol is inserted.
        codes = [mapping.setdefault(symbol, len(mapping) + 1) for symbol in sentence]
        mapped.append(codes)

    return mapped, mapping

In [None]:
def vectorize(data, pad_index=7):
    """Collapse one-hot encoded sequences into per-class count vectors.

    Args:
        data: Iterable of rows; each row is a non-empty sequence of one-hot
            vectors of equal length (each containing a value equal to 1).
        pad_index: Position whose count is overwritten with 1 after counting.
            Defaults to 7, the value that used to be hard-coded here —
            presumably the code of the '!' padding symbol; verify against the
            mapping in use. Pass ``None`` to keep the raw count.

    Returns:
        list[list[int]]: one count vector per row, as long as the row's vectors.

    Raises:
        ValueError: if a one-hot vector contains no 1.
        IndexError: if a row is empty or ``pad_index`` is outside the vector.
    """
    vecs = []
    for row in data:
        counts = [0] * len(row[0])
        for piece in row:
            # list() lets array-like vectors use list.index to find the hot position
            counts[list(piece).index(1)] += 1
        if pad_index is not None:
            # However many padding symbols a row holds, they count as exactly one.
            counts[pad_index] = 1
        vecs.append(counts)

    return vecs

In [None]:
def pad_audio(data):
    """Zero-pad every clip shorter than ``SR//1000 * 15000`` samples.

    The container is modified in place: each short clip ``data[i]`` is replaced
    by a new array with zeros appended up to the target length. Clips that
    already reach the target length are left untouched (nothing is truncated).

    Args:
        data: Indexable collection of 1-D sample arrays; positions produced by
            ``enumerate`` must be valid keys for ``data[i] = ...``.

    Returns:
        The same ``data`` object that was passed in.
    """
    target_len = SR//1000 * 15000

    for idx, clip in tqdm(enumerate(data), desc='Padding audio'):
        missing = target_len - len(clip)
        if missing > 0:
            # np.zeros defaults to float64, so the padded clip becomes a float array
            data[idx] = np.append(clip, np.zeros(missing))

    return data

In [None]:
def build_model(input_len, output_len):
    """Create and compile the fully connected audio-to-character-code network.

    Two sigmoid hidden layers (256 and 128 units) feed a ReLU output layer with
    one unit per output position. The model is compiled with Poisson loss and an
    accuracy metric; no optimizer is passed, so Keras' default is used.

    NOTE(review): the targets used in this notebook are integer character codes,
    so 'accuracy' on the ReLU outputs is presumably not a meaningful score — confirm.

    Args:
        input_len: Number of input features (samples per padded clip).
        output_len: Number of output units (length of a padded transcription).

    Returns:
        The compiled ``Sequential`` model.
    """
    layers = [
        Dense(256, input_shape=(input_len,), activation="sigmoid"),
        Dense(128, activation="sigmoid"),
        Dense(output_len, activation="relu"),
    ]
    model = Sequential(layers)

    model.compile(loss='poisson', metrics=['accuracy'])

    return model

In [None]:
# Build the ('audio', 'transcription') DataFrame from the 'train' split on Drive.
df = load_dataset('train')

Loading HTML Data: 100%|██████████| 293/293 [00:04<00:00, 71.55it/s] 
  sr, data = wavfile.read(file)
Loading Audio Data & Creating Dataset: 356it [00:06, 54.64it/s] 


In [None]:
# Plain list of the reference transcriptions; used further down as the references for CER/WER.
clean_y = list(df['transcription'])

In [None]:
# Sanity check: first rows of the paired audio / transcription frame.
print(df.head())

                                               audio  \
0  [-41, -35, -10, -4, -31, -67, -63, -41, -15, -...   
1  [-38, -37, -36, -34, -36, -32, -32, -29, -30, ...   
2  [1, -1, 1, -3, 1, 0, -2, 0, -3, 2, 1, 0, 3, 1,...   
3  [-4, -4, 1, -4, 0, -1, -7, -1, -7, -2, -1, -4,...   
4  [7, 4, 8, 5, 6, 10, 4, 9, 8, 6, 10, 7, 8, 9, 3...   

                                       transcription  
0                                            jenasis  
1                                ola basdamwan stori  
2  wal dijan im stat garram det stori blanga god ...  
3                            god bin meigim ebrijing  
4  orait longtaim wen god bin stat meigimbat ebri...  


In [None]:
# Pad every transcription with '!' to the length of the longest one and split it into characters:
# zip_longest walks the strings column by column while filling the short ones, and the outer zip
# transposes the columns back into one tuple of characters per transcription.
padded_y = list(zip(*itertools.zip_longest(*list(df['transcription']), fillvalue='!')))
# Replace each character by its integer code; `mapping` is the character -> code dictionary.
enc_y, mapping = one_hot(padded_y)
enc_y = np.array(enc_y)
# Disabled alternative target: one-hot encode the codes and collapse them into per-class counts.
# cat_y = to_categorical(enc_y)
# vec_y = np.array(vectorize(cat_y))

In [None]:
# Inspect the integer-encoded form of the first transcription, padding included.
print(enc_y[0])

[1 2 3 4 5 6 5 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7
 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7
 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7
 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7
 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7
 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7
 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7
 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7
 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7]


In [None]:
# Show the first encoded transcription next to the character -> code table needed to read it.
print(enc_y[0])
print(mapping)

[1 2 3 4 5 6 5 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7
 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7
 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7
 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7
 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7
 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7
 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7
 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7
 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7]
{'j': 1, 'e': 2, 'n': 3, 'a': 4, 's': 5, 'i': 6, '!': 7, 'o': 8, 'l': 9, ' ': 10, 'b': 11, 'd': 12, 'm': 13, 'w': 14, 't': 15, 'r': 16, 'g': 17, 'y': 18, 'f': 19, 'u': 20, 'k': 21, 'p': 22, 'h': 23, '4': 24, '5': 25, '7': 26, '3': 27, '9': 28, '0': 29, '1': 30, '2': 31, '6': 32, '8': 33}


In [None]:
# Zero-pad every clip to the common length. pad_audio assigns into the Series it receives
# and returns that same object.
padded_x = pad_audio(df['audio'])

Padding audio: 306it [00:01, 187.82it/s]


In [None]:
# Sanity check: the first clip after padding.
print(padded_x[0])

[-41. -35. -10. ...   0.   0.   0.]


In [None]:
# Stack the equally long clips into one 2-D array (one row per clip).
# Note that the name is rebound from the per-clip collection to that array.
padded_x = np.stack(padded_x)

In [None]:
# Fresh network sized to one padded clip in and one padded transcription out.
# NOTE(review): `model` is rebound by the load_model cell further down, so this instance is
# only trained if that cell is skipped.
model = build_model(len(padded_x[0]), len(enc_y[0]))

In [None]:
# Confirm that inputs and targets line up: (clips, samples), (samples,), (clips, characters).
print(padded_x.shape)
print(padded_x[0].shape)
print(enc_y.shape)

(306, 660000)
(660000,)
(306, 321)


In [None]:
# Resume from the stored checkpoint; this rebinds `model`, replacing any model built earlier.
# The path is derived from DEFAULT_DIR so the project folder is configured in one place only
# (it resolves to the same location as the previously hard-coded absolute path).
model = load_model(DEFAULT_DIR + 'charnet_model_350e')

In [None]:
# Continue training for up to 50 more epochs, one clip per gradient step (batch_size=1).
# NOTE(review): the recorded output stops in epoch 43 with a KeyboardInterrupt, so the model
# saved afterwards saw fewer than 50 additional epochs — confirm before trusting the '400e' name.
model.fit(padded_x, enc_y, epochs=50, verbose=1, batch_size=1)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/IPython/core/interactiveshell.py", line 3326, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-35-9c56ac32f251>", line 1, in <module>
    model.fit(padded_x, enc_y, epochs=50, verbose=1, batch_size=1)
  File "/usr/local/lib/python3.8/dist-packages/keras/utils/traceback_utils.py", line 64, in error_handler
    return fn(*args, **kwargs)
  File "/usr/local/lib/python3.8/dist-packages/keras/engine/training.py", line 1409, in fit
    tmp_logs = self.train_function(iterator)
  File "/usr/local/lib/python3.8/dist-packages/tensorflow/python/util/traceback_utils.py", line 150, in error_handler
    return fn(*args, **kwargs)
  File "/usr/local/lib/python3.8/dist-packages/tensorflow/python/eager/def_function.py", line 915, in __call__
    result = self._call(*args, **kwds)
  File "/usr/local/lib/python3.8/dist-packages/tensorflow/python/eager/def_function.py", line 947, in _c

KeyboardInterrupt: ignored

In [None]:
# Persist the further-trained model next to the other project files. The path is derived from
# DEFAULT_DIR (same location as the previously hard-coded absolute path).
model.save(DEFAULT_DIR + 'charnet_model_400e')

In [None]:
# Quick look at the raw network outputs for the first five training clips.
preds = model.predict(padded_x[:5])



In [None]:
# Raw (unrounded) outputs for the first clip; each position is later rounded to a character code.
print(list(preds[0]))

[0.95105976, 1.0968852, 4.293931, 3.8894193, 5.786033, 5.3364453, 7.1176314, 7.5195417, 8.647802, 5.954364, 8.40326, 5.833742, 8.433533, 5.940993, 8.03411, 6.0828767, 6.9753113, 7.164442, 7.277256, 4.775851, 5.5980344, 8.994245, 0.0, 5.912291, 7.121748, 5.3818455, 7.263736, 7.0212383, 7.6859345, 8.5756035, 7.463218, 7.9209685, 6.984968, 7.75491, 6.0806026, 6.1755543, 6.841129, 7.1973977, 10.0632305, 6.488615, 7.7286544, 7.56567, 6.4215956, 8.72812, 5.5774846, 8.433198, 8.908519, 0.0, 8.291846, 8.215379, 7.9059825, 8.101697, 6.9118705, 8.041801, 7.856848, 7.032808, 8.586787, 6.534698, 7.404859, 8.38932, 5.8053036, 7.1542487, 0.0, 0.0, 6.7600245, 7.101625, 5.3133206, 7.1035504, 7.8313527, 0.0, 6.5645547, 6.4634595, 6.7056785, 7.249283, 6.0019584, 6.6703873, 8.826771, 6.5105515, 6.674313, 8.069011, 5.2594595, 7.087092, 7.5515194, 6.8225694, 6.0771217, 6.80709, 7.329076, 9.30311, 7.2455993, 7.3225827, 7.563947, 6.3112345, 6.509956, 7.8165574, 6.9766436, 8.065083, 8.296447, 7.4648714, 9.229

In [None]:
# Character -> code table, for reading the raw predictions by hand.
print(mapping)

{'j': 1, 'e': 2, 'n': 3, 'a': 4, 's': 5, 'i': 6, '!': 7, 'o': 8, 'l': 9, ' ': 10, 'b': 11, 'd': 12, 'm': 13, 'w': 14, 't': 15, 'r': 16, 'g': 17, 'y': 18, 'f': 19, 'u': 20, 'k': 21, 'p': 22, 'h': 23, '4': 24, '5': 25, '7': 26, '3': 27, '9': 28, '0': 29, '1': 30, '2': 31, '6': 32, '8': 33}


In [None]:
# Score the model on the same arrays that were passed to fit; no held-out split is visible
# in this notebook, so this is a training-set score.
model.evaluate(padded_x, enc_y, verbose=1)



[-0.9588984847068787, 0.33006536960601807]

In [None]:
def decode_prediction(pred, mapping, pad_symbol='!'):
    """Turn one row of network outputs back into a string.

    Each output is rounded to the nearest integer code and looked up in the
    inverse of ``mapping``. Outputs that are not strictly positive are skipped.
    Values that round to 0 are read as code 1. Decoding stops at the first
    value that rounds to the code of ``pad_symbol``. Codes above the largest
    known code are clamped to it (the list-based lookup used before raised
    ``ValueError`` in that case).

    Args:
        pred: Iterable of numeric outputs for one clip.
        mapping: Character -> integer code dictionary, as returned by one_hot.
        pad_symbol: Character that marks padding; if it is absent from
            ``mapping`` no early stop happens.

    Returns:
        str: the decoded text.
    """
    index_to_char = {index: char for char, index in mapping.items()}
    if not index_to_char:
        return ''

    pad_index = mapping.get(pad_symbol)
    highest = max(index_to_char)

    chars = []
    for val in pred:
        # `not val > 0` (rather than `val <= 0`) also skips NaN, matching the `if val > 0` test used before
        if not val > 0:
            continue
        index = min(max(round(val), 1), highest)
        if index == pad_index:
            break
        chars.append(index_to_char[index])

    return ''.join(chars)


# Predict on every padded clip and decode each output row into text.
preds = model.predict(padded_x)
all_words = [decode_prediction(pred, mapping) for pred in preds]

print(all_words)
print(enc_y[0])

['jjaais', '', 'mndbwsjanm', 'tlb msnlmeit', 'dws', 'orb', 'ie', 'brlbl efabriblmsalwofoi', 'en agweiwadns mewlos', 'nemfea w b danbotrbdewls', 'mrodl efalw blmiiohst iro nbibaoddimob baw', 'wg bl eyilnaiwnsb mieb', 'wmlmbdeuadyob', 'nndirbma mjfin bay', 'dw r megaoy bdbindd', 'ne amlinbrefsnlbimajswnsoenm lmodoeddpltbbdamlwab asotawldm', 'jnlwenbtom mlndlyy', 'byoroweyilt ml ield', 'eemsdbla ynuonolawsjatnioei meblfrnys oo bdm', 'je genbgodm isligfbeltjanrl iaomytbsnl', 'dwowdmnrabnniga', 'rtbmlmotsor', 'it sab', 'bwldomagalnn', 'bw', 'endt', 'iw', 'bgbdllntadrom oialwog awmat   fainlnn', ' swmiwwjsrnnwl', 'nndibm', 'ne mea ', ' rob  aynbtl looaltim', 'nnldomlagriidniwim bl', 'lwabnwdgsywsljsdandboimoangiolonmglitwnsoosgmlnl  aataoidaokibrsneiibrlleaoi s basomaa snidg ', 'itwindueiwsdolidtan eaweltsn na ', 'enbdjdl', 'nnlboanra', 'tallelg', 'nadwi ', 'wwbmldera sssws', 'rnl osmojawlnsly  wb', 'set anua dat sndda g', 'tabbsdb nbgn ', 'dew nabmabhbdeibnsodemonsban beslns ', 'tlbobsn bor

In [None]:
# Character -> code table again, for comparing the decoded strings above by hand.
print(mapping)

{'j': 1, 'e': 2, 'n': 3, 'a': 4, 's': 5, 'i': 6, '!': 7, 'o': 8, 'l': 9, ' ': 10, 'b': 11, 'd': 12, 'm': 13, 'w': 14, 't': 15, 'r': 16, 'g': 17, 'y': 18, 'f': 19, 'u': 20, 'k': 21, 'p': 22, 'h': 23, '4': 24, '5': 25, '7': 26, '3': 27, '9': 28, '0': 29, '1': 30, '2': 31, '6': 32, '8': 33}


In [None]:
!pip install evaluate
!pip install jiwer

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting jiwer
  Downloading jiwer-2.5.1-py3-none-any.whl (15 kB)
Collecting levenshtein==0.20.2
  Downloading Levenshtein-0.20.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.4 MB)
[K     |████████████████████████████████| 1.4 MB 7.7 MB/s 
[?25hCollecting rapidfuzz<3.0.0,>=2.3.0
  Downloading rapidfuzz-2.13.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.2 MB)
[K     |████████████████████████████████| 2.2 MB 16.9 MB/s 
[?25hInstalling collected packages: rapidfuzz, levenshtein, jiwer
Successfully installed jiwer-2.5.1 levenshtein-0.20.2 rapidfuzz-2.13.3


In [None]:
from evaluate import load

In [None]:
# Character error rate of the decoded predictions against the reference transcriptions.
cer = load('cer')
# Slice the references to the number of predictions instead of a hard-coded 306, so the cell
# keeps working when the dataset size changes.
cer_score = cer.compute(predictions=all_words, references=clean_y[:len(all_words)])
print(cer_score)

0.8919685968080793


In [None]:
# Word error rate of the decoded predictions against the reference transcriptions.
wer = load('wer')
# Slice the references to the number of predictions instead of a hard-coded 306, so the cell
# keeps working when the dataset size changes.
wer_score = wer.compute(predictions=all_words, references=clean_y[:len(all_words)])
print(wer_score)

Downloading builder script:   0%|          | 0.00/4.49k [00:00<?, ?B/s]

0.9990221005280657
