<a href="https://colab.research.google.com/github/Jaidon-Smith/AI-Karaoke/blob/main/Japanese%20STT%20Version%201.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Exploration of Tensorflow CTC Loss

# SentencePiece for grapheme-based BPE: https://github.com/google/sentencepiece

According to the GitHub docs, there appears to be some TensorFlow integration (search the docs for 'SentencePiece').

# Exploring pretrained tokenisations

In [6]:
#@title Install dependencies
!pip install --quiet tensorflow-text

In [7]:
#@title Import dependencies

In [None]:
#@title Original hub code
!pip install tensorflow-text

import numpy as np
import tensorflow.compat.v1 as tf
import tensorflow_hub as hub
import tensorflow_text as tf_text
# This cell uses the TF1 graph/session API (tensorflow.compat.v1), so eager
# execution is switched off before any graph is built.
tf.disable_eager_execution()

# Number of per-layer memory tensors ("mem_<i>" in, "new_mem_<i>" out) that are
# exchanged with the module's "prediction" signature in generate().
n_layer = 12
# Size of the last axis of the initial (empty) memory arrays fed to generate().
d_model = 768
# Number of tokens sampled by the generation loop.
max_gen_len = 128

def generate(module, inputs, mems):
  """Run one step of the hub module's "prediction" signature.

  Args:
    module: Callable hub module; it is invoked with a feed dict,
      signature="prediction" and as_dict=True.
    inputs: Token ids for this step; cast to int64 before being fed as
      "input_tokens".
    mems: Indexable collection of memories; entries 0 .. n_layer - 1 are fed
      as "mem_<i>".

  Returns:
    A (probs, new_mems) pair: the module's "probs" output and a list with its
    "new_mem_<i>" outputs for i in 0 .. n_layer - 1, in that order.
  """
  feed = {"input_tokens": tf.dtypes.cast(inputs, tf.int64)}
  feed.update(
      {"mem_{}".format(layer): mems[layer] for layer in range(n_layer)})

  outputs = module(feed, signature="prediction", as_dict=True)

  probs = outputs["probs"]
  next_mems = [outputs["new_mem_{}".format(layer)] for layer in range(n_layer)]
  return probs, next_mems

# Build a graph that queries every signature of the wiki40b-lm-ja hub module
# for one sample text and then samples max_gen_len continuation tokens.
# Nothing is evaluated here; the tensors are fetched later in a tf.Session.
g = tf.Graph()
with g.as_default():
  module = hub.Module("https://tfhub.dev/google/wiki40b-lm-ja/1")
  # Single-element batch: one article prefix using the _START_*_ markers.
  text = ["\n_START_ARTICLE_\nしのぶ・まさみshow'05 恋してラララ\n_START_SECTION_\n概要\n_START_PARAGRAPH_\n『上海ルーキーSHOW』の打ち切り後に放送された年末特番で、同番組MCの大竹しのぶと久本雅美が恋愛にまつわるテーマでトークや音楽企画を展開していた。基本は女"]

  # Word embeddings.
  embeddings = module(dict(text=text), signature="word_embeddings",
                      as_dict=True)
  embeddings = embeddings["word_embeddings"]

  # Activations at each layer.
  activations = module(dict(text=text),signature="activations", as_dict=True)
  activations = activations["activations"]

  # Negative log likelihood of the text, and perplexity.
  neg_log_likelihood = module(dict(text=text), signature="neg_log_likelihood",
                              as_dict=True)
  neg_log_likelihood = neg_log_likelihood["neg_log_likelihood"]
  # Perplexity = exp(mean NLL), with the mean taken over axis 1.
  ppl = tf.exp(tf.reduce_mean(neg_log_likelihood, axis=1))

  # Tokenization and detokenization with the sentencepiece model.
  token_ids = module(dict(text=text), signature="tokenization", as_dict=True)
  token_ids = token_ids["token_ids"]

  # Round-trip the ids back to text so the tokenization can be inspected.
  detoken_text = module(dict(token_ids=token_ids), signature="detokenization",
                        as_dict=True)
  detoken_text = detoken_text["text"]

  # Generation
  # Start from empty memories: n_layer arrays of shape [1, 0, d_model].
  mems_np = [np.zeros([1, 0, d_model], dtype=np.float32) for _ in range(n_layer)]
  # NOTE(review): despite the _np suffix, inputs_np starts as the token_ids
  # graph output and mems_np is replaced by generate()'s outputs after step 0.
  inputs_np = token_ids
  sampled_ids = []
  # Eager execution is disabled, so this Python loop is unrolled at graph
  # construction time: each iteration adds another "prediction" call.
  for step in range(max_gen_len):
    probs, mems_np = generate(module, inputs_np, mems_np)
    # tf.random.categorical takes log-probabilities (logits), hence the log.
    sampled_id = tf.random.categorical(tf.math.log(probs[0]), num_samples=1, dtype=tf.int32)
    sampled_id = tf.squeeze(sampled_id)

    sampled_ids.append(sampled_id)
    # The sampled id, reshaped to [1, 1], is the only input of the next step.
    inputs_np = tf.reshape(sampled_id, [1, 1])

  # Turn the Python list of sampled tensors into one tensor with a leading
  # axis of size 1 before detokenizing it.
  sampled_ids = tf.expand_dims(sampled_ids, axis=0)
  generated_text = module(dict(token_ids=sampled_ids),
                          signature="detokenization", as_dict=True)
  generated_text = generated_text["text"]

  # One op that runs both the variable and the lookup-table initializers.
  init_op = tf.group([tf.global_variables_initializer(),
                      tf.tables_initializer()])

# Open a session on graph g, run the initializers, then evaluate all requested
# tensors in a single run call.
# The results are assigned back to the same names, so after this cell those
# names hold fetched values instead of graph tensors.
with tf.Session(graph=g) as session:
  session.run(init_op)
  embeddings, neg_log_likelihood, ppl, activations, token_ids, detoken_text, generated_text = session.run([
    embeddings, neg_log_likelihood, ppl, activations, token_ids, detoken_text, generated_text])

In [None]:
#@title Original hub code without generation
!pip install tensorflow-text

import numpy as np
import tensorflow.compat.v1 as tf
import tensorflow_hub as hub
import tensorflow_text as tf_text
# This cell uses the TF1 graph/session API (tensorflow.compat.v1), so eager
# execution is switched off before any graph is built.
tf.disable_eager_execution()

# NOTE(review): these three constants are carried over from the cell that
# includes generation; nothing in this cell reads them.
n_layer = 12
d_model = 768
max_gen_len = 128


# Build a graph that queries the non-generation signatures of the
# wiki40b-lm-ja hub module for one sample text. Nothing is evaluated here;
# the tensors are fetched later in a tf.Session.
g = tf.Graph()
with g.as_default():
  module = hub.Module("https://tfhub.dev/google/wiki40b-lm-ja/1")
  # Single-element batch: one article prefix using the _START_*_ markers.
  text = ["\n_START_ARTICLE_\nしのぶ・まさみshow'05 恋してラララ\n_START_SECTION_\n概要\n_START_PARAGRAPH_\n『上海ルーキーSHOW』の打ち切り後に放送された年末特番で、同番組MCの大竹しのぶと久本雅美が恋愛にまつわるテーマでトークや音楽企画を展開していた。基本は女"]

  # Word embeddings.
  embeddings = module(dict(text=text), signature="word_embeddings",
                      as_dict=True)
  embeddings = embeddings["word_embeddings"]

  # Activations at each layer.
  activations = module(dict(text=text),signature="activations", as_dict=True)
  activations = activations["activations"]

  # Negative log likelihood of the text, and perplexity.
  neg_log_likelihood = module(dict(text=text), signature="neg_log_likelihood",
                              as_dict=True)
  neg_log_likelihood = neg_log_likelihood["neg_log_likelihood"]
  # Perplexity = exp(mean NLL), with the mean taken over axis 1.
  ppl = tf.exp(tf.reduce_mean(neg_log_likelihood, axis=1))

  # Tokenization and detokenization with the sentencepiece model.
  token_ids = module(dict(text=text), signature="tokenization", as_dict=True)
  token_ids = token_ids["token_ids"]

  # Round-trip the ids back to text so the tokenization can be inspected.
  detoken_text = module(dict(token_ids=token_ids), signature="detokenization",
                        as_dict=True)
  detoken_text = detoken_text["text"]



  # One op that runs both the variable and the lookup-table initializers.
  init_op = tf.group([tf.global_variables_initializer(),
                      tf.tables_initializer()])

# Open a session on graph g, run the initializers, then evaluate all requested
# tensors in a single run call.
# The results are assigned back to the same names, so after this cell those
# names hold fetched values instead of graph tensors.
with tf.Session(graph=g) as session:
  session.run(init_op)
  embeddings, neg_log_likelihood, ppl, activations, token_ids, detoken_text = session.run([
    embeddings, neg_log_likelihood, ppl, activations, token_ids, detoken_text])

In [10]:
# Show the ids that the module's "tokenization" signature produced for the
# sample text (the value fetched by session.run, not the graph tensor).
token_ids

array([[   13,     3,    13,    32,     7,  1060,    12,  6708,   198,
         4888,  6824,   577,  8469,    13,  1824,    65,   125, 12974,
           13,     4,    13,    54,    13,     5,    13,    33,  3322,
         9505, 20236,    35,     7, 16745,   219,  3174,  6761, 11421,
           19,     8, 10110,  3549,     7,    53,  1202,    32,     7,
         1060,    20,   546,    84,  1967,   315,    15,  4871, 13775,
         2624,    19,  2702,    27,   296,   736, 14103,   175,     9,
         2819,    10,   542]], dtype=int32)

In [15]:
# The first entry of the fetched detokenization result is .decode()d to str so
# the round-tripped sample text can be read directly.
detoken_text[0].decode()

"_START_ARTICLE_ しのぶ・まさみshow'05 恋してラララ _START_SECTION_ 概要 _START_PARAGRAPH_ 『上海ルーキーSHOW』の打ち切り後に放送された年末特番で、同番組MCの大竹しのぶと久本雅美が恋愛にまつわるテーマでトークや音楽企画を展開していた。基本は女"

In [41]:
# Build a (1, 2 * num_tokens - 1) batch of ids in which the ids
# 0 .. num_tokens - 1 sit at the even positions and id 0 fills every odd
# position in between:
#   [0, 0, 1, 0, 2, ..., num_tokens - 2, 0, num_tokens - 1]
# The interleaved 0s act as separators, so the detokenized string can later be
# split back into one piece per id.
num_tokens = 10000

# A zero-filled array plus one strided assignment builds the interleaving in a
# single pass; inserting the separators one by one into a Python list shifts
# the tail on every insert and is quadratic in num_tokens.
token_explorer = np.zeros((1, 2 * num_tokens - 1), dtype=int)
token_explorer[0, ::2] = np.arange(num_tokens)
token_explorer

array([[   0,    0,    1, ..., 9998,    0, 9999]])

In [42]:
# Fresh graph containing only the module's "detokenization" signature, fed
# with the interleaved id array token_explorer instead of a tokenized text.
g = tf.Graph()
with g.as_default():
  module = hub.Module("https://tfhub.dev/google/wiki40b-lm-ja/1")
  detoken_text = module(dict(token_ids=token_explorer), signature="detokenization",
                        as_dict=True)
  detoken_text = detoken_text["text"]

  # One op that runs both the variable and the lookup-table initializers.
  init_op = tf.group([tf.global_variables_initializer(),
                      tf.tables_initializer()])

# Open a session on graph g, run the initializers, then evaluate the
# detokenized text.
# The fetch is passed as a one-element list, so session.run returns a
# one-element list: detoken_text[0] is the fetched array afterwards.
with tf.Session(graph=g) as session:
  session.run(init_op)
  detoken_text = session.run([
    detoken_text])

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


In [46]:
# detoken_text is the one-element list returned by session.run: the first [0]
# picks the fetched array, the second [0] its first entry (.decode()d to str).
# Splitting on '⁇' separates the pieces of the explored ids. Presumably '⁇' is
# how the interleaved separator id 0 is rendered; confirm against the vocabulary.
detoken_text[0][0].decode().split('⁇')

[' ',
 '  ',
 '  ',
 '  ',
 ' _START_ARTICLE_ ',
 ' _START_SECTION_ ',
 ' _START_PARAGRAPH_ ',
 ' _NEWLINE_ ',
 ' の ',
 ' 、 ',
 ' 。 ',
 ' は ',
 ' 年 ',
 ' ・ ',
 '   ',
 ' ) ',
 ' が ',
 ' ( ',
 ' に ',
 ' を ',
 ' で ',
 ' と ',
 ' 月 ',
 ' 」 ',
 ' 「 ',
 ' 2 ',
 ' 1 ',
 ' から ',
 ' や ',
 ' 3 ',
 ' 日 ',
 ' である ',
 ' した ',
 ' し ',
 ' 『 ',
 ' も ',
 ' 』 ',
 ' として ',
 ' 4 ',
 ' する ',
 ' 年に ',
 ' では ',
 ' 5 ',
 ' た ',
 ' 6 ',
 ' には ',
 ' また ',
 ' 7 ',
 ' 10 ',
 ' など ',
 ' 第 ',
 ' 8 ',
 ' 9 ',
 ' この ',
 ' 大 ',
 ' 概要 ',
 ' る ',
 ' 12 ',
 ' という ',
 ' ス ',
 '  ( ',
 ' された ',
 ' その ',
 ' て ',
 ' - ',
 ' 日に ',
 ' して ',
 ' している ',
 ' 11 ',
 ' 人 ',
 ' となった ',
 ' な ',
 ' 市 ',
 '  - ',
 ' ている ',
 ' 日本 ',
 ' : ',
 ' 中 ',
 ' 山 ',
 ' 町 ',
 ' 一 ',
 ' により ',
 ' 回 ',
 ' であった ',
 ' による ',
 ' 本 ',
 ' 昭和 ',
 ' . ',
 ' され ',
 ' その後 ',
 ' となる ',
 ' によって ',
 ' 後 ',
 ' ア ',
 ' であり ',
 ' 子 ',
 ' , ',
 ' 月に ',
 ' 15 ',
 ' 長 ',
 ' より ',
 ' ト ',
 ' 17 ',
 ' 上 ',
 ' 川 ',
 ' 新 ',
 ' か ',
 ' 部 ',
 ' がある ',
 ' 同 ',
 ' ズ ',
 ' り '

# Training our own tokenisation with google SentencePiece

---



From the Wiki-40B paper (https://storage.googleapis.com/pub-tools-public-publication-data/pdf/18cd66cc7d31ce4c724cef1d2755b417f74de27c.pdf), it is clear that the authors use nothing beyond the statistics-based SentencePiece model for tokenisation.

https://github.com/google/sentencepiece

---


https://github.com/google/sentencepiece/tree/master/python


---



https://github.com/google/sentencepiece/blob/master/python/sentencepiece_python_module_example.ipynb

---



# Training our own tokenisation with tensorflow text encoders

https://github.com/tensorflow/text/blob/master/docs/api_docs/python/text.md

https://blog.tensorflow.org/2019/06/introducing-tftext.html