<a href="https://colab.research.google.com/github/MaCoZu/NLP/blob/main/05_classical_lan_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [13]:
import os
import glob # file, path etc.
import re
import random
import shutil # shell utilities
import tensorflow as tf
from tensorflow.keras import preprocessing
from tensorflow.keras import models, layers
from tqdm import tqdm # progress indication
import matplotlib.pyplot as plt


In [14]:
%cd /content/drive/MyDrive/Colab Notebooks/NLP/classic_texts
!ls

/content/drive/MyDrive/Colab Notebooks/NLP/classic_texts
 artofwar.txt		     dataset
'aurel - meditations.txt'   'herodotus - history.txt'
'confucius - analects.txt'  'On War.txt'
'confucius - doctmean.txt'  'tacitus - annals.txt'
'confucius - learning.txt'  'tacitus - histories.txt'


In [15]:
# Where the text files are going to live.
dataset_path = "dataset"
dataset_path_all = os.path.join(dataset_path, "all")
dataset_path_train = os.path.join(dataset_path, "train")
dataset_path_valid = os.path.join(dataset_path, "valid")

# Just use 20 files.
file_number = 20

# Gather the corpus if it has not been gathered yet.
if not os.path.exists(dataset_path):

    # Create all the folders.
    for path in [dataset_path, dataset_path_all, dataset_path_train, dataset_path_valid]:
        if not os.path.exists(path):
            os.mkdir(path)

    # Find all the files.
    paths_all = glob.glob("*.txt")
    print(sorted(paths_all))

    # Do not use all.
    paths_all = paths_all[:file_number]

    # Split 80/20.
    split_index = int(len(paths_all) * 0.8)
    paths_train = paths_all[:split_index]
    paths_valid = paths_all[split_index:]

    # Copy files.
    def copy(paths, destination):
        for path in paths:
            shutil.copy2(path, destination)
    copy(paths_all, dataset_path_all)
    copy(paths_train, dataset_path_train)
    copy(paths_valid, dataset_path_valid)

    # Done.
    print("Corpus downloaded.")

In [16]:
def create_dataset(dataset_path):
  dataset = preprocessing.text_dataset_from_directory(
      dataset_path,
      labels=None,
      seed=42,
      batch_size=32, # not training
      )
  return dataset

dataset_original_all = create_dataset(dataset_path_all)
dataset_original_train = create_dataset(dataset_path_train)
dataset_original_valid = create_dataset(dataset_path_valid)

Found 9 files belonging to 1 classes.
Found 7 files belonging to 1 classes.
Found 2 files belonging to 1 classes.


In [17]:
for batch in dataset_original_all:
  # print(batch.shape)
  for sample in batch[:2]:
    print(sample)
    # break

tf.Tensor(b'The Doctrine of the Mean\nBy Confucius\n\n\n\nWhat Heaven has conferred is called The Nature; an accordance with\nthis nature is called The Path of duty; the regulation of this path\nis called Instruction. \n\nThe path may not be left for an instant. If it could be left, it would\nnot be the path. On this account, the superior man does not wait till\nhe sees things, to be cautious, nor till he hears things, to be apprehensive.\n\nThere is nothing more visible than what is secret, and nothing more\nmanifest than what is minute. Therefore the superior man is watchful\nover himself, when he is alone. \n\nWhile there are no stirrings of pleasure, anger, sorrow, or joy, the\nmind may be said to be in the state of Equilibrium. When those feelings\nhave been stirred, and they act in their due degree, there ensues\nwhat may be called the state of Harmony. This Equilibrium is the great\nroot from which grow all the human actings in the world, and this\nHarmony is the universal path 

In [18]:
vocabulary_size= 10_000

def custom_tokenizer(text):
    tokens = tf.strings.regex_replace(text, r'([.,!?()])', r' \1 ')  # Add spaces around punctuation
    tokens = tf.strings.regex_replace(tokens, "http\S+", "")
    tokens = tf.strings.strip(tokens)  # Remove leading/trailing spaces
    return tf.strings.split(tokens)

encoder = layers.TextVectorization(
    max_tokens = vocabulary_size,
    standardize = None, # TODO: reconsider this
    split = custom_tokenizer,
    output_mode ='int',
)

encoder.adapt(dataset_original_all)

In [19]:
vocabulary = encoder.get_vocabulary()
print(vocabulary[-100:])

['roaming', 'road;', 'river-bank', 'ripen', 'riotous', 'rigorously', 'rigid', 'rife', 'revolting', 'reviving', 'reviewed', 'reverently', 'reverential', 'reverence;', 'revels', 'retrograde', 'retrieve', 'restitution', 'respond', 'respect;', 'resolves', 'resolutely', 'resists', 'residing', 'research', 'republic', 'reproaching', 'reposed', 'replacement', 'replace', 'repairing', 'renounce', 'renders', 'remotest', 'remonstrance', 'remnants', 'relaxation', 'rejoicings', 'rejection', 'reins', 'reigning', 'regiments', 'regained', 'regain', 'refused;', 'refreshment', 'references', 'reduction', 'recur', 'rectified', 'recoiled', 'reclining', 'receipt', 'rebuilt', 'rebellious', 'rebelling', 'reared', 'reappeared', 'realized', 'realities', 'reads', 'ravage', 'rapine', 'ransom', 'rank;', 'ranges', 'raging', 'raged', 'radius', 'questionings', 'questioning', 'quell', 'quantity;', 'qualified', 'quail', 'purpose;', 'purify', 'publish', 'prowess', 'proving', 'provinces;', 'province;', 'proscribed', 'prop

# Dataset for Autoregression

In [20]:
sequence_length = 32
padding_token_id = 0

def create_dataset_for_autoregression(dataset):
  x_inputs=[]
  y_outputs=[]

  for books in dataset:
    books = encoder(books).numpy()

    for book in tqdm(books):
      book = [x for x in list(book) if x != padding_token_id]
      # time.sleep(1.0)

      # add padding at the beginning
      padding = [padding_token_id] * sequence_length
      book = padding + book
      # print(list(books[0][-100:]))

      # create inputs and
      for start_index in range(0, len(book)-sequence_length):
        x = book[start_index:start_index+sequence_length]
        assert len(x)==sequence_length
        y = book[start_index + sequence_length]

        x_inputs +=[x]
        y_outputs +=[y]

  return tf.data.Dataset.from_tensor_slices((x_inputs, y_outputs))

dataset_train = create_dataset_for_autoregression(dataset_original_train)
dataset_valid = create_dataset_for_autoregression(dataset_original_valid)


print('Done')

100%|██████████| 7/7 [00:01<00:00,  3.60it/s]
100%|██████████| 2/2 [00:01<00:00,  1.19it/s]


Done


In [21]:
def decode(indicies):
  return "".join([vocabulary[index] for index in indicies if index !=0])


for input, output in dataset_train.take(20):
  print('input :',  ", ".join([str(x) for x in input.numpy()]))
  print('output:', output.numpy())
  print('input decoded :', decode(input))
  print('output decoded:', decode([output]) )
  print("")

input : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
output: 30
input decoded : 
output decoded: The

input : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 30
output: 732
input decoded : The
output decoded: Great

input : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 30, 732
output: 1
input decoded : TheGreat
output decoded: [UNK]

input : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 30, 732, 1
output: 516
input decoded : TheGreat[UNK]
output decoded: By

input : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 30, 732, 1, 516
output: 1113
input decoded : TheGreat[UNK]By
output decoded: Confucius

input : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 30, 732, 1, 516, 1113
output: 405
input decoded : TheGreat[UNK]ByConfucius
output decode

In [23]:
embedding_size = 128

model = models.Sequential()
model.add(layers.Embedding(
    vocabulary_size,
    embedding_size,
    input_length=sequence_length))
model.add(layers.LSTM(256))
model.add(layers.Dense(vocabulary_size, activation='softmax'))

model.summary()


model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

history = model.fit(dataset_train.cache().shuffle(200_000).batch(2**10),
                    epochs=2,
                    validation_data=dataset_valid.cache().batch(2**10))

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 32, 128)           1280000   
                                                                 
 lstm_1 (LSTM)               (None, 256)               394240    
                                                                 
 dense_1 (Dense)             (None, 10000)             2570000   
                                                                 
Total params: 4,244,240
Trainable params: 4,244,240
Non-trainable params: 0
_________________________________________________________________
Epoch 1/2
Epoch 2/2
Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 32, 128)           1280000   
                                                                 
 l

In [None]:
def render_history(history):
    plt.title("Training loss vs. validation loss")
    plt.plot(history.history["loss"], label="train_loss")
    plt.plot(history.history["val_loss"], label="val_loss")
    plt.legend()
    plt.show()
    plt.close()

    plt.title("Training accuracy vs. validation accuracy")
    plt.plot(history.history["accuracy"], label="accuracy")
    plt.plot(history.history["val_accuracy"], label="val_accuracy")
    plt.legend()
    plt.show()
    plt.close()

In [None]:
render_history(history)

In [None]:
model.save("/content/drive/MyDrive/Colab Notebooks/NLP/classic_model.h5")
# loaded_model = models.load_model("language_model.h5")

In [None]:
import numpy as np

def generate(model, seed_text, generated_sequence_length, temperature):

    input_sequence = encoder(seed_text).numpy()

    generated_sequence = list(input_sequence[::])

    # Pad.
    padding = [0] * (sequence_length - len(input_sequence))
    input_sequence = padding + list(input_sequence)

    # Generate the sequence by repeatedly predicting.
    while len(generated_sequence) < generated_sequence_length:
        prediction = model.predict(np.expand_dims(input_sequence, axis=0), verbose=0)
        predicted_index = get_index_from_prediction(prediction[0], temperature)
        generated_sequence.append(predicted_index)
        input_sequence = input_sequence[1:]
        input_sequence.append(predicted_index)

    # Convert the generated sequence to a string.
    text = decode(generated_sequence)
    print(text)
    print("")


def get_index_from_prediction(prediction, temperature=0.7):
    """ Gets an index from a prediction. """

    # Zero temperature - use the argmax.
    if temperature == 0.0:
        return np.argmax(prediction)

    # Non-zero temperature - do some random magic.
    else:
        prediction = np.asarray(prediction).astype('float64')

        prediction = np.log(prediction) / temperature

        # Softmax
        exp_prediction= np.exp(prediction)
        prediction = exp_prediction / np.sum(exp_prediction)  # prob-dist
        probabilities = np.random.multinomial(1, prediction, 1)
        return np.argmax(probabilities)


generate(model, "we are all doomed", 100, temperature=1.0)