<a href="https://colab.research.google.com/github/Jaidon-Smith/AI-Karaoke/blob/main/Japanese%20STT%20Version%201.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Exploration of Tensorflow CTC Loss

# Sentence piece for grapheme based BPE https://github.com/google/sentencepiece

Upon reading the github docs, there appears to be some tensorflow integration if you search 'Sentencepiece'

# Exploring pretained tokenisations

In [None]:
#@title Install dependencies
!pip install --quiet tensorflow-text

In [None]:
#@title Import dependencies

In [None]:
#@title Original hub code
!pip install tensorflow-text

import numpy as np
import tensorflow.compat.v1 as tf
import tensorflow_hub as hub
import tensorflow_text as tf_text
tf.disable_eager_execution()

n_layer = 12
d_model = 768
max_gen_len = 128

def generate(module, inputs, mems):
  """Generate text."""
  inputs = tf.dtypes.cast(inputs, tf.int64)
  generation_input_dict = dict(input_tokens=inputs)
  mems_dict = {}
  for i in range(n_layer):
    mems_dict["mem_{}".format(i)] = mems[i]
  generation_input_dict.update(mems_dict)

  generation_outputs = module(generation_input_dict, signature="prediction",
                              as_dict=True)
  probs = generation_outputs["probs"]

  new_mems = []
  for i in range(n_layer):
    new_mems.append(generation_outputs["new_mem_{}".format(i)])

  return probs, new_mems

g = tf.Graph()
with g.as_default():
  module = hub.Module("https://tfhub.dev/google/wiki40b-lm-ja/1")
  text = ["\n_START_ARTICLE_\nしのぶ・まさみshow'05 恋してラララ\n_START_SECTION_\n概要\n_START_PARAGRAPH_\n『上海ルーキーSHOW』の打ち切り後に放送された年末特番で、同番組MCの大竹しのぶと久本雅美が恋愛にまつわるテーマでトークや音楽企画を展開していた。基本は女"]

  # Word embeddings.
  embeddings = module(dict(text=text), signature="word_embeddings",
                      as_dict=True)
  embeddings = embeddings["word_embeddings"]

  # Activations at each layer.
  activations = module(dict(text=text),signature="activations", as_dict=True)
  activations = activations["activations"]

  # Negative log likelihood of the text, and perplexity.
  neg_log_likelihood = module(dict(text=text), signature="neg_log_likelihood",
                              as_dict=True)
  neg_log_likelihood = neg_log_likelihood["neg_log_likelihood"]
  ppl = tf.exp(tf.reduce_mean(neg_log_likelihood, axis=1))

  # Tokenization and detokenization with the sentencepiece model.
  token_ids = module(dict(text=text), signature="tokenization", as_dict=True)
  token_ids = token_ids["token_ids"]

  detoken_text = module(dict(token_ids=token_ids), signature="detokenization",
                        as_dict=True)
  detoken_text = detoken_text["text"]

  # Generation
  mems_np = [np.zeros([1, 0, d_model], dtype=np.float32) for _ in range(n_layer)]
  inputs_np = token_ids
  sampled_ids = []
  for step in range(max_gen_len):
    probs, mems_np = generate(module, inputs_np, mems_np)
    sampled_id = tf.random.categorical(tf.math.log(probs[0]), num_samples=1, dtype=tf.int32)
    sampled_id = tf.squeeze(sampled_id)

    sampled_ids.append(sampled_id)
    inputs_np = tf.reshape(sampled_id, [1, 1])

  sampled_ids = tf.expand_dims(sampled_ids, axis=0)
  generated_text = module(dict(token_ids=sampled_ids),
                          signature="detokenization", as_dict=True)
  generated_text = generated_text["text"]

  init_op = tf.group([tf.global_variables_initializer(),
                      tf.tables_initializer()])

# Initialize session.
with tf.Session(graph=g) as session:
  session.run(init_op)
  embeddings, neg_log_likelihood, ppl, activations, token_ids, detoken_text, generated_text = session.run([
    embeddings, neg_log_likelihood, ppl, activations, token_ids, detoken_text, generated_text])

In [None]:
#@title Original hub code without generation
!pip install tensorflow-text

import numpy as np
import tensorflow.compat.v1 as tf
import tensorflow_hub as hub
import tensorflow_text as tf_text
tf.disable_eager_execution()

n_layer = 12
d_model = 768
max_gen_len = 128


g = tf.Graph()
with g.as_default():
  module = hub.Module("https://tfhub.dev/google/wiki40b-lm-ja/1")
  text = ["\n_START_ARTICLE_\nしのぶ・まさみshow'05 恋してラララ\n_START_SECTION_\n概要\n_START_PARAGRAPH_\n『上海ルーキーSHOW』の打ち切り後に放送された年末特番で、同番組MCの大竹しのぶと久本雅美が恋愛にまつわるテーマでトークや音楽企画を展開していた。基本は女"]

  # Word embeddings.
  embeddings = module(dict(text=text), signature="word_embeddings",
                      as_dict=True)
  embeddings = embeddings["word_embeddings"]

  # Activations at each layer.
  activations = module(dict(text=text),signature="activations", as_dict=True)
  activations = activations["activations"]

  # Negative log likelihood of the text, and perplexity.
  neg_log_likelihood = module(dict(text=text), signature="neg_log_likelihood",
                              as_dict=True)
  neg_log_likelihood = neg_log_likelihood["neg_log_likelihood"]
  ppl = tf.exp(tf.reduce_mean(neg_log_likelihood, axis=1))

  # Tokenization and detokenization with the sentencepiece model.
  token_ids = module(dict(text=text), signature="tokenization", as_dict=True)
  token_ids = token_ids["token_ids"]

  detoken_text = module(dict(token_ids=token_ids), signature="detokenization",
                        as_dict=True)
  detoken_text = detoken_text["text"]



  init_op = tf.group([tf.global_variables_initializer(),
                      tf.tables_initializer()])

# Initialize session.
with tf.Session(graph=g) as session:
  session.run(init_op)
  embeddings, neg_log_likelihood, ppl, activations, token_ids, detoken_text = session.run([
    embeddings, neg_log_likelihood, ppl, activations, token_ids, detoken_text])

In [None]:
token_ids

array([[   13,     3,    13,    32,     7,  1060,    12,  6708,   198,
         4888,  6824,   577,  8469,    13,  1824,    65,   125, 12974,
           13,     4,    13,    54,    13,     5,    13,    33,  3322,
         9505, 20236,    35,     7, 16745,   219,  3174,  6761, 11421,
           19,     8, 10110,  3549,     7,    53,  1202,    32,     7,
         1060,    20,   546,    84,  1967,   315,    15,  4871, 13775,
         2624,    19,  2702,    27,   296,   736, 14103,   175,     9,
         2819,    10,   542]], dtype=int32)

In [None]:
detoken_text[0].decode()

"_START_ARTICLE_ しのぶ・まさみshow'05 恋してラララ _START_SECTION_ 概要 _START_PARAGRAPH_ 『上海ルーキーSHOW』の打ち切り後に放送された年末特番で、同番組MCの大竹しのぶと久本雅美が恋愛にまつわるテーマでトークや音楽企画を展開していた。基本は女"

In [None]:
token_ids.shape
num_tokens = 10000
token_explorer = list(range(num_tokens))
for i in range(num_tokens - 1):
  token_explorer.insert(num_tokens - 1 - i, 0)
token_explorer = np.array([token_explorer])
token_explorer

array([[   0,    0,    1, ..., 9998,    0, 9999]])

In [None]:
g = tf.Graph()
with g.as_default():
  module = hub.Module("https://tfhub.dev/google/wiki40b-lm-ja/1")
  detoken_text = module(dict(token_ids=token_explorer), signature="detokenization",
                        as_dict=True)
  detoken_text = detoken_text["text"]

  init_op = tf.group([tf.global_variables_initializer(),
                      tf.tables_initializer()])

# Initialize session.
with tf.Session(graph=g) as session:
  session.run(init_op)
  detoken_text = session.run([
    detoken_text])

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


In [None]:
detoken_text[0][0].decode().split('⁇')

[' ',
 '  ',
 '  ',
 '  ',
 ' _START_ARTICLE_ ',
 ' _START_SECTION_ ',
 ' _START_PARAGRAPH_ ',
 ' _NEWLINE_ ',
 ' の ',
 ' 、 ',
 ' 。 ',
 ' は ',
 ' 年 ',
 ' ・ ',
 '   ',
 ' ) ',
 ' が ',
 ' ( ',
 ' に ',
 ' を ',
 ' で ',
 ' と ',
 ' 月 ',
 ' 」 ',
 ' 「 ',
 ' 2 ',
 ' 1 ',
 ' から ',
 ' や ',
 ' 3 ',
 ' 日 ',
 ' である ',
 ' した ',
 ' し ',
 ' 『 ',
 ' も ',
 ' 』 ',
 ' として ',
 ' 4 ',
 ' する ',
 ' 年に ',
 ' では ',
 ' 5 ',
 ' た ',
 ' 6 ',
 ' には ',
 ' また ',
 ' 7 ',
 ' 10 ',
 ' など ',
 ' 第 ',
 ' 8 ',
 ' 9 ',
 ' この ',
 ' 大 ',
 ' 概要 ',
 ' る ',
 ' 12 ',
 ' という ',
 ' ス ',
 '  ( ',
 ' された ',
 ' その ',
 ' て ',
 ' - ',
 ' 日に ',
 ' して ',
 ' している ',
 ' 11 ',
 ' 人 ',
 ' となった ',
 ' な ',
 ' 市 ',
 '  - ',
 ' ている ',
 ' 日本 ',
 ' : ',
 ' 中 ',
 ' 山 ',
 ' 町 ',
 ' 一 ',
 ' により ',
 ' 回 ',
 ' であった ',
 ' による ',
 ' 本 ',
 ' 昭和 ',
 ' . ',
 ' され ',
 ' その後 ',
 ' となる ',
 ' によって ',
 ' 後 ',
 ' ア ',
 ' であり ',
 ' 子 ',
 ' , ',
 ' 月に ',
 ' 15 ',
 ' 長 ',
 ' より ',
 ' ト ',
 ' 17 ',
 ' 上 ',
 ' 川 ',
 ' 新 ',
 ' か ',
 ' 部 ',
 ' がある ',
 ' 同 ',
 ' ズ ',
 ' り '

# Training our own tokenisation with google SentencePiece

---



By reading the paper for Wiki-40b (https://storage.googleapis.com/pub-tools-public-publication-data/pdf/18cd66cc7d31ce4c724cef1d2755b417f74de27c.pdf), it is clear that they do not use anything extra except the statistics based SentencePiece for tokenisation.

https://github.com/google/sentencepiece

---


https://github.com/google/sentencepiece/tree/master/python


---



https://github.com/google/sentencepiece/blob/master/python/sentencepiece_python_module_example.ipynb

---

All options for training can be viewed here: https://github.com/google/sentencepiece/blob/master/doc/options.md Note this was found by reading the readme on the main page


In [1]:
#@title Install
!pip install sentencepiece
!wget https://raw.githubusercontent.com/google/sentencepiece/master/data/botchan.txt

Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/14/67/e42bd1181472c95c8cda79305df848264f2a7f62740995a46945d9797b67/sentencepiece-0.1.95-cp36-cp36m-manylinux2014_x86_64.whl (1.2MB)
[K     |▎                               | 10kB 15.3MB/s eta 0:00:01[K     |▌                               | 20kB 21.3MB/s eta 0:00:01[K     |▉                               | 30kB 26.4MB/s eta 0:00:01[K     |█                               | 40kB 20.1MB/s eta 0:00:01[K     |█▍                              | 51kB 16.1MB/s eta 0:00:01[K     |█▋                              | 61kB 18.3MB/s eta 0:00:01[K     |██                              | 71kB 15.1MB/s eta 0:00:01[K     |██▏                             | 81kB 13.3MB/s eta 0:00:01[K     |██▌                             | 92kB 12.4MB/s eta 0:00:01[K     |██▊                             | 102kB 12.1MB/s eta 0:00:01[K     |███                             | 112kB 12.1MB/s eta 0:00:01[K     |███▎        

In [12]:
#@title Download the test folder from github
!mkdir test
!wget https://raw.githubusercontent.com/google/sentencepiece/master/data/botchan.txt -O /content/test/botchan.txt
!wget https://raw.githubusercontent.com/google/sentencepiece/master/python/test/test_model.model -O /content/test/test_model.model


mkdir: cannot create directory ‘test’: File exists
--2021-02-11 02:28:12--  https://raw.githubusercontent.com/google/sentencepiece/master/data/botchan.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 278779 (272K) [text/plain]
Saving to: ‘/content/test/botchan.txt’


2021-02-11 02:28:12 (16.6 MB/s) - ‘/content/test/botchan.txt’ saved [278779/278779]

--2021-02-11 02:28:12--  https://raw.githubusercontent.com/google/sentencepiece/master/python/test/test_model.model
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 253165 (247K) [a

## Tokenisation exploration of pretrained example

In [12]:
import sentencepiece as spm
sp = spm.SentencePieceProcessor(model_file='test/test_model.model')

In [13]:
sp.encode('This is a test')

[284, 47, 11, 4, 15, 400]

In [14]:
sp.encode(['This is a test', 'Hello world'], out_type=int)

[[284, 47, 11, 4, 15, 400], [151, 88, 21, 887]]

In [15]:
sp.encode('This is a test', out_type=str)

['▁This', '▁is', '▁a', '▁', 't', 'est']

In [16]:
sp.encode(['This is a test', 'Hello world'], out_type=str)

[['▁This', '▁is', '▁a', '▁', 't', 'est'], ['▁He', 'll', 'o', '▁world']]

In [18]:
for _ in range(10):
  print(sp.encode('This is a test', out_type=str, enable_sampling=True, alpha=0.1, nbest_size=-1))

['▁T', 'h', 'is', '▁is', '▁a', '▁', 't', 'est']
['▁', 'This', '▁is', '▁a', '▁', 'te', 's', 't']
['▁T', 'h', 'i', 's', '▁is', '▁', 'a', '▁', 'te', 'st']
['▁', 'T', 'h', 'i', 's', '▁', 'i', 's', '▁a', '▁', 't', 'e', 'st']
['▁', 'This', '▁', 'i', 's', '▁', 'a', '▁', 'te', 's', 't']
['▁This', '▁is', '▁', 'a', '▁', 'te', 's', 't']
['▁This', '▁', 'i', 's', '▁', 'a', '▁', 't', 'e', 's', 't']
['▁T', 'h', 'i', 's', '▁', 'is', '▁a', '▁', 'te', 's', 't']
['▁', 'T', 'h', 'is', '▁', 'i', 's', '▁a', '▁', 'te', 'st']
['▁This', '▁', 'is', '▁a', '▁', 'te', 's', 't']


In [20]:
sp.decode([284, 47, 11, 4, 15, 400])

'This is a test'

In [21]:
sp.decode([[284, 47, 11, 4, 15, 400], [151, 88, 21, 887]])

['This is a test', 'Hello world']

In [22]:
sp.decode(['▁', 'This', '▁', 'is', '▁a', '▁', 't', 'e', 'st'])

'This is a test'

In [23]:
sp.decode([['▁This', '▁is', '▁a', '▁', 't', 'est'], ['▁He', 'll', 'o', '▁world']])

['This is a test', 'Hello world']

In [24]:
sp.get_piece_size()

1000

In [25]:
sp.id_to_piece(2)


'</s>'

In [26]:
sp.id_to_piece([2, 3, 4])

['</s>', '\r', '▁']

In [27]:
sp.piece_to_id('<s>')

1

In [28]:
sp.piece_to_id(['</s>', '\r', '▁'])

[2, 3, 4]

In [29]:
len(sp)

1000

In [30]:
sp['</s>']

2

## Training example

In [19]:
import sentencepiece as spm
#spm.SentencePieceTrainer.train(input='test/botchan.txt', model_prefix='m', vocab_size=1000, user_defined_symbols=['foo', 'bar'], hard_vocab_limit = False)
#spm.SentencePieceTrainer.train(input='test/botchan.txt', model_prefix='m', vocab_size=1000, user_defined_symbols=['foo', 'bar'])
spm.SentencePieceTrainer.train(input='test/botchan.txt', model_prefix='m', vocab_size=1000)


In [21]:
sp = spm.SentencePieceProcessor(model_file='m.model')

In [15]:
sp.vocab_size()

1000

In [16]:
len(sp)

1000

In [22]:
sp.id_to_piece([0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17])

['<unk>',
 '<s>',
 '</s>',
 ',',
 '▁',
 '.',
 '▁the',
 's',
 '▁I',
 '▁to',
 '▁a',
 'ed',
 'e',
 't',
 '▁and',
 '▁of',
 'ing',
 'a']

In [25]:
spm.SentencePieceTrainer.train(input='test/botchan.txt', model_prefix='m', vocab_size=1000, user_defined_symbols=[' ', 'bar'])

RuntimeError: ignored

# Training our own tokenisation with tensorflow text encoders

https://github.com/tensorflow/text/blob/master/docs/api_docs/python/text.md

https://blog.tensorflow.org/2019/06/introducing-tftext.html

# Training an english model on Librispeech subset

In [27]:
import tensorflow_datasets as tfds

In [28]:
ds = tfds.load('mnist', split='train', shuffle_files=True)

[1mDownloading and preparing dataset mnist/3.0.1 (download: 11.06 MiB, generated: 21.00 MiB, total: 32.06 MiB) to /root/tensorflow_datasets/mnist/3.0.1...[0m


local data directory. If you'd instead prefer to read directly from our public
GCS bucket (recommended if you're running on GCP), you can instead pass
`try_gcs=True` to `tfds.load` or set `data_dir=gs://tfds-data/datasets`.



HBox(children=(FloatProgress(value=0.0, description='Dl Completed...', max=4.0, style=ProgressStyle(descriptio…



[1mDataset mnist downloaded and prepared to /root/tensorflow_datasets/mnist/3.0.1. Subsequent calls will reuse this data.[0m


In [29]:
tfds.list_builders()

['abstract_reasoning',
 'accentdb',
 'aeslc',
 'aflw2k3d',
 'ag_news_subset',
 'ai2_arc',
 'ai2_arc_with_ir',
 'amazon_us_reviews',
 'anli',
 'arc',
 'bair_robot_pushing_small',
 'bccd',
 'beans',
 'big_patent',
 'bigearthnet',
 'billsum',
 'binarized_mnist',
 'binary_alpha_digits',
 'blimp',
 'bool_q',
 'c4',
 'caltech101',
 'caltech_birds2010',
 'caltech_birds2011',
 'cars196',
 'cassava',
 'cats_vs_dogs',
 'celeb_a',
 'celeb_a_hq',
 'cfq',
 'chexpert',
 'cifar10',
 'cifar100',
 'cifar10_1',
 'cifar10_corrupted',
 'citrus_leaves',
 'cityscapes',
 'civil_comments',
 'clevr',
 'clic',
 'clinc_oos',
 'cmaterdb',
 'cnn_dailymail',
 'coco',
 'coco_captions',
 'coil100',
 'colorectal_histology',
 'colorectal_histology_large',
 'common_voice',
 'coqa',
 'cos_e',
 'cosmos_qa',
 'covid19sum',
 'crema_d',
 'curated_breast_imaging_ddsm',
 'cycle_gan',
 'deep_weeds',
 'definite_pronoun_resolution',
 'dementiabank',
 'diabetic_retinopathy_detection',
 'div2k',
 'dmlab',
 'downsampled_imagenet',
 

In [None]:
# >>> ds2 = tfds.load('librispeech', split='train_clean100', shuffle_files=True)
# Not enough disk space. Needed: 361.62 GiB (download: 57.14 GiB, generated: 304.47 GiB)

In [32]:
tfds.is_dataset_on_gcs('librispeech')

False

In [2]:
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [None]:
with open('/gdrive/My Drive/foo.txt', 'w') as f:
  f.write('Hello Google Drive!')
!cat '/gdrive/My Drive/foo.txt'

In [35]:
ds2 = tfds.load('librispeech', split='train_clean100', shuffle_files=True, data_dir='/gdrive/My Drive/datasets/librispeech')

OSError: ignored

In [36]:
ds2 = tfds.load('mnist', split='train', shuffle_files=True, data_dir='/gdrive/My Drive/datasets/mnist')

[1mDownloading and preparing dataset mnist/3.0.1 (download: 11.06 MiB, generated: 21.00 MiB, total: 32.06 MiB) to /gdrive/My Drive/datasets/mnist/mnist/3.0.1...[0m


local data directory. If you'd instead prefer to read directly from our public
GCS bucket (recommended if you're running on GCP), you can instead pass
`try_gcs=True` to `tfds.load` or set `data_dir=gs://tfds-data/datasets`.



HBox(children=(FloatProgress(value=0.0, description='Dl Completed...', max=4.0, style=ProgressStyle(descriptio…



[1mDataset mnist downloaded and prepared to /gdrive/My Drive/datasets/mnist/mnist/3.0.1. Subsequent calls will reuse this data.[0m


In [4]:
import shutil
import os

In [5]:
shutil.disk_usage(os.path.abspath('/gdrive/My Drive/datasets/mnist')).free

77183885312