In [22]:
# Install tensorflow_text, if executed in google colab
if 'google.colab' in str(get_ipython()):
  !pip install -q -U "tensorflow-text==2.8.*"
  !pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.97


In [23]:
# disable compiler warnings
import os

# imports 
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_text as tf_text
from tensorflow.python.ops.numpy_ops import np_config
np_config.enable_numpy_behavior()
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.python.keras.layers import Dense
from typing import List
import datetime
from tqdm.notebook import tqdm
import sentencepiece as sp


from tensorflow.python.client import device_lib
#os.environ['TF_CPP_MIN_LOG_LEVEL'] = '0'  # FATAL
print("Num GPUs Available: ", tf.config.list_physical_devices('GPU'))

Num GPUs Available:  []


# Load data

In [65]:
import requests

# Load file from remote, if notebook is executed inside google colab, otherwise it gets loaded from the local file system
if 'google.colab' in str(get_ipython()):
  bible_url = "https://raw.githubusercontent.com/IANNwTF-Group-3/homework11/main/bible.txt"
  response = requests.get(bible_url)
  text = response.text
else:
  file_path = "bible.txt"
  with open(file_path, "r") as f:
      text = f.read()

# Preprocessing

In [66]:
import re

# Lowercase the text
text = text.lower()

# Remove sentence numeration
text = re.sub('[0-9]+:[0-9]+ ', '', text)

# Remove special characters
for c in "!'()*,-.0123456789:;?":
  text = text.replace(c, '')

# Replace multiple spaces with a single space
text = re.sub(' +', ' ', text)

sentence_separator = "sentence-separator-placeholder"
# Remember double line breaks
text = re.sub('\n\n+', sentence_separator, text)
# Remove line breaks
text = text.replace('\n', '')
# Substitute sentence line breaks back into text
text = text.replace(sentence_separator, '\n')

Write text to file for later processing

In [67]:
bible_file_name = "bible_preprocessed.txt"

if os.path.exists(bible_file_name):
  os.remove(bible_file_name)

bible_file = open(bible_file_name, 'xb')
bible_file.write(text.encode(encoding='UTF-8'))
bible_file.close()

# Tokenize

Create tokenizer model

In [70]:
# Pretrained model
# sp_model_url = "https://github.com/tensorflow/text/blob/master/tensorflow_text/python/ops/test_data/fast_sentencepiece.model?raw=true"
# sp_model = requests.get(sp_model_url).content

# Self trained model
sp_model_name = "sp_tokenizer"
sp.SentencePieceTrainer.train(input=bible_file_name, model_prefix=sp_model_name, model_type="unigram", vocab_size=3000)
sp_model = tf.io.gfile.GFile(f"{sp_model_name}.model", "rb").read()

Create tokenizer and bible tokens

In [75]:
sp_tokenizer = tf_text.SentencepieceTokenizer(sp_model)
sp_tokens = sp_tokenizer.tokenize(text)

Test tokenizer

In [76]:
test_tokens = sp_tokenizer.tokenize("I Jesus have sent mine angel".lower())
for t in test_tokens:
  print(sp_tokenizer.detokenize([t]))

tf.Tensor(b'i', shape=(), dtype=string)
tf.Tensor(b'jesus', shape=(), dtype=string)
tf.Tensor(b'have', shape=(), dtype=string)
tf.Tensor(b'sent', shape=(), dtype=string)
tf.Tensor(b'mine', shape=(), dtype=string)
tf.Tensor(b'angel', shape=(), dtype=string)


# Prepare dataset

Create training data using sliding window

In [96]:
SLIDING_WINDOW_SIZE = 64
BATCH_SIZE = 64
# We now split each sentence. We concatenate each sentence to retrieve a better context
words = tf.constant(text.replace('\n', ' ').split(' '))

sliding_window = tf_text.sliding_window(data=words, width=SLIDING_WINDOW_SIZE + 1, axis=0)

# Visualize sliding window
print(sliding_window)

tf.Tensor(
[[b'the' b'first' b'book' ... b'was' b'good' b'and']
 [b'first' b'book' b'of' ... b'good' b'and' b'god']
 [b'book' b'of' b'moses' ... b'and' b'god' b'divided']
 ...
 [b'any' b'man' b'shall' ... b'be' b'with' b'you']
 [b'man' b'shall' b'take' ... b'with' b'you' b'all']
 [b'shall' b'take' b'away' ... b'you' b'all' b'amen']], shape=(740430, 65), dtype=string)


Create dataset

In [99]:
dataset = tf.data.Dataset.from_tensor_slices((sliding_window[:,:-1], sliding_window[:,-1]))
dataset = dataset.shuffle(4096)
dataset = dataset.batch(BATCH_SIZE)
dataset = dataset.prefetch(tf.data.AUTOTUNE)