<a href="https://colab.research.google.com/github/NID123-CH/NLP--Hugging-Face-Project/blob/main/DSR_40_NLP_Language_Generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install --upgrade tensorflow==2.8.0
import glob
import os
import random
import shutil

import numpy as np
import tensorflow as tf
from tensorflow.keras import models, layers
from tensorflow.keras import preprocessing
from tqdm import tqdm



In [2]:
# Where the text files are going to live.
dataset_path = "dataset"
dataset_path_all = os.path.join(dataset_path, "all")
dataset_path_train = os.path.join(dataset_path, "train")
dataset_path_valid = os.path.join(dataset_path, "valid")

# Just use 20 files.
file_number = 20

# Gather the corpus if it has not been gathered yet.
if not os.path.exists(dataset_path):

    # Create all the folders.
    for path in [dataset_path, dataset_path_all, dataset_path_train, dataset_path_valid]:
        if not os.path.exists(path):
            os.mkdir(path)

    # Clone the repo.
    !git clone https://github.com/vilmibm/lovecraftcorpus

    # Find all the files.
    paths_all = glob.glob("lovecraftcorpus/*.txt")
    print(sorted(paths_all))

    # Standardize.
    for path in paths_all:
        content = open(path).read()
        content = content.lower()
        for punctuation in ".,:;?!":
            content = content.replace(punctuation, " " + punctuation)
        open(path, "w").write(content)

    # Do not use all.
    paths_all = paths_all[:file_number]

    # Split 80/20.
    split_index = int(len(paths_all) * 0.8)
    paths_train = paths_all[:split_index]
    paths_valid = paths_all[split_index:]

    # Copy files.
    def copy(paths, destination):
        for path in paths:
            shutil.copy2(path, destination)
    copy(paths_all, dataset_path_all)
    copy(paths_train, dataset_path_train)
    copy(paths_valid, dataset_path_valid)

    # Delete repo.
    !rm -rf lovecraftcorpus

    # Done.
    print("Corpus downloaded.")

In [3]:
batch_size = 32  # number of text files per batch when reading from disk (not the training batch size)
seed = 42


def create_dataset(dataset_path):
    """Return an unlabeled text dataset built from the files under `dataset_path`.

    Uses the notebook-level `batch_size` and `seed` settings.
    """
    return preprocessing.text_dataset_from_directory(
        dataset_path, labels=None, batch_size=batch_size, seed=seed
    )


# One dataset per folder, created in the order: all, train, valid.
dataset_original_all, dataset_original_train, dataset_original_valid = (
    create_dataset(folder)
    for folder in (dataset_path_all, dataset_path_train, dataset_path_valid)
)

Found 20 files belonging to 1 classes.
Found 16 files belonging to 1 classes.
Found 4 files belonging to 1 classes.


In [4]:
# Peek at the raw data: take the first batch of the full dataset and print
# its shape followed by its first element.
for x in dataset_original_all.take(1):
    print(x.shape)
    print(x[0])

(20,)
tf.Tensor(b'cool air\n\nyou ask me to explain why i am afraid of a draught of cool air ; why i shiver more than others upon entering a cold room , and seem nauseated and repelled when the chill of evening creeps through the heat of a mild autumn day . there are those who say i respond to cold as others do to a bad odour , and i am the last to deny the impression . what i will do is to relate the most horrible circumstance i ever encountered , and leave it to you to judge whether or not this forms a suitable explanation of my peculiarity .\n\nit is a mistake to fancy that horror is associated inextricably with darkness , silence , and solitude . i found it in the glare of mid-afternoon , in the clangour of a metropolis , and in the teeming midst of a shabby and commonplace rooming-house with a prosaic landlady and two stalwart men by my side . in the spring of 1923 i had secured some dreary and unprofitable magazine work in the city of new york ; and being unable to pay any substa

In [5]:
# Upper bound on the number of distinct tokens the encoder keeps.
vocabulary_size = 10_000

# Text-to-integer encoder. The corpus files were already lower-cased and had
# their punctuation spaced out on disk, so no further standardization is
# applied and a plain whitespace split yields the tokens.
encoder = layers.TextVectorization(
    max_tokens=vocabulary_size,
    standardize=None, # keep the punctuation as tokens
    split='whitespace',
    output_mode='int'
)

# Build the vocabulary from the full dataset, then show its first 30 entries.
encoder.adapt(dataset_original_all)
vocabulary = encoder.get_vocabulary()
print(vocabulary[:30])

['', '[UNK]', 'the', ',', 'and', '.', 'of', 'a', 'to', 'in', 'was', 'that', 'i', 'he', 'had', 'it', 'his', 'as', 'with', 'on', ';', 'at', 'from', 'for', 'but', 'which', 'not', 'were', 'they', 'my']


In [6]:
sequence_length = 32  # tokens per training window; kept short for a tiny neural network
padding_token_id = 0  # index used for padding when texts of different length are batched


def create_dataset_for_autoregression(dataset, hop_length=1):
    """Turn a dataset of raw texts into (input, target) windows for next-token prediction.

    Every text is encoded with the notebook-level `encoder`, stripped of padding
    tokens and cut into windows of `sequence_length` tokens. The target of each
    window is the same window shifted one token to the right.

    Args:
        dataset: Iterable of batches of raw text accepted by `encoder`.
        hop_length: Distance, in tokens, between the starts of consecutive windows.

    Returns:
        A `tf.data.Dataset` of `(x, y)` pairs, each of length `sequence_length`.
        If no text is long enough to yield a window, the dataset is empty but
        keeps the `(sequence_length,)` element shape and integer dtype.
    """
    x_inputs = []
    y_outputs = []

    for books in dataset:
        for book in encoder(books).numpy():

            # Remove padding tokens with a boolean mask instead of a per-token Python loop.
            book = book[book != padding_token_id]

            for start_index in range(0, len(book) - sequence_length, hop_length):
                # One slice of sequence_length + 1 tokens holds both the input and its target.
                window = book[start_index:start_index + sequence_length + 1]
                x_inputs.append(window[:-1])
                y_outputs.append(window[1:])

    if not x_inputs:
        # np.stack cannot stack an empty list; build correctly shaped empty arrays instead.
        empty = np.empty((0, sequence_length), dtype=np.int64)
        return tf.data.Dataset.from_tensor_slices((empty, empty))

    return tf.data.Dataset.from_tensor_slices((np.stack(x_inputs), np.stack(y_outputs)))


dataset_train = create_dataset_for_autoregression(dataset_original_train)
dataset_valid = create_dataset_for_autoregression(dataset_original_valid)

In [7]:
def decode(indices):
    """Map token indices back to their vocabulary words, joined by single spaces."""
    words = []
    for token_id in indices:
        words.append(vocabulary[token_id])
    return " ".join(words)

# Show a few (input, target) pairs to confirm that each target is its input
# shifted by one token. The loop variables are deliberately not called
# `input`/`output`, which would shadow the `input` builtin for the rest of the notebook.
for input_ids, output_ids in dataset_train.take(8):
    print("in: ", decode(input_ids))
    print("out: ", decode(output_ids))
    print("")



in:  the white ship i am [UNK] elton , keeper of the north point light that my father and grandfather kept before me . far from the shore stands the gray lighthouse ,
out:  white ship i am [UNK] elton , keeper of the north point light that my father and grandfather kept before me . far from the shore stands the gray lighthouse , above

in:  white ship i am [UNK] elton , keeper of the north point light that my father and grandfather kept before me . far from the shore stands the gray lighthouse , above
out:  ship i am [UNK] elton , keeper of the north point light that my father and grandfather kept before me . far from the shore stands the gray lighthouse , above sunken

in:  ship i am [UNK] elton , keeper of the north point light that my father and grandfather kept before me . far from the shore stands the gray lighthouse , above sunken
out:  i am [UNK] elton , keeper of the north point light that my father and grandfather kept before me . far from the shore stands the gray lighthouse 

In [8]:
import matplotlib.pyplot as plt

def render_history(history):
    """Plot training-versus-validation curves for loss and accuracy, one figure each.

    Args:
        history: Object whose `history` attribute maps "loss", "val_loss",
            "accuracy" and "val_accuracy" to sequences of per-epoch values.
    """
    for metric in ("loss", "accuracy"):
        validation_metric = "val_" + metric
        plt.title(f"Training {metric} vs. validation {metric}")
        plt.plot(history.history[metric], label=metric)
        plt.plot(history.history[validation_metric], label=validation_metric)
        plt.legend()
        plt.show()
        plt.close()

**Train the model**

In [9]:
embedding_size = 128
shuffle_buffer_size = 10_000
training_batch_size = 1024

# Next-token model: embedding -> LSTM -> softmax over the vocabulary at every
# position (return_sequences=True), matching the shifted-by-one targets.
model = models.Sequential()
model.add(layers.Embedding(vocabulary_size, embedding_size, input_length=sequence_length))
model.add(layers.Dropout(0.1))
model.add(layers.LSTM(embedding_size, return_sequences=True))
model.add(layers.Dropout(0.1))
model.add(layers.Dense(vocabulary_size, activation="softmax"))

model.summary()

# Targets are integer token ids, hence the sparse variant of the loss.
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)

# Keep the History object returned by fit so the learning curves can be plotted.
history = model.fit(
    dataset_train.shuffle(shuffle_buffer_size).batch(training_batch_size),
    validation_data=dataset_valid.batch(training_batch_size),
    epochs=2,
)

render_history(history)


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 32, 128)           1280000   
                                                                 
 dropout (Dropout)           (None, 32, 128)           0         
                                                                 
 lstm (LSTM)                 (None, 32, 128)           131584    
                                                                 
 dropout_1 (Dropout)         (None, 32, 128)           0         
                                                                 
 dense (Dense)               (None, 32, 10000)         1290000   
                                                                 
Total params: 2,701,584
Trainable params: 2,701,584
Non-trainable params: 0
_________________________________________________________________
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7a7738bcf430>

In [12]:
def generate(model, seed_text, generated_sequence_length, temperature):
    """Generate text autoregressively from a seed and print the result.

    The seed is lower-cased, a space is inserted before each punctuation mark,
    and the result is encoded with the notebook-level `encoder`. The sequence
    is then extended one token at a time until it holds
    `generated_sequence_length` tokens, decoded and printed.

    Args:
        model: Model whose `predict` output is indexed as `[0][-1]` to obtain
            the distribution for the next token.
        seed_text: Text to start the generation from.
        generated_sequence_length: Total number of tokens (seed included) at
            which generation stops.
        temperature: Sampling temperature passed to `get_index_from_prediction`.

    Raises:
        ValueError: If `seed_text` contains no tokens.
    """
    punctuation_marks = ".,:;?!"

    seed_text = seed_text.lower()
    for punctuation in punctuation_marks:
        seed_text = seed_text.replace(punctuation, " " + punctuation)
    print("Seed text: " + seed_text)

    input_sequence = encoder(seed_text).numpy().tolist()
    print(input_sequence)

    # Without at least one token there is no last position to predict from.
    if not input_sequence:
        raise ValueError("seed_text must contain at least one token")

    # Generate the sequence by repeatedly predicting.
    while len(input_sequence) < generated_sequence_length:
        prediction = model.predict(np.expand_dims(input_sequence, axis=0), verbose=0)
        predicted_index = get_index_from_prediction(prediction[0][-1], temperature)
        # Store a plain int so the sequence stays a homogeneous list of Python ints.
        input_sequence.append(int(predicted_index))

    # Convert the generated sequence to a string and undo the punctuation spacing.
    text = decode(input_sequence)
    for punctuation in punctuation_marks:
        text = text.replace(" " + punctuation, punctuation)
    print(text)
    print("")


def get_index_from_prediction(prediction, temperature=0.0):
    """Pick an index from a predicted probability distribution.

    Args:
        prediction: Sequence of probabilities, one per candidate index.
        temperature: 0.0 selects the most probable index deterministically.
            Positive values rescale the log-probabilities by 1/temperature
            and sample an index from the re-normalized distribution.

    Returns:
        The chosen index as a Python int.

    Raises:
        ValueError: If `temperature` is negative.
    """
    if temperature < 0.0:
        raise ValueError("temperature must be non-negative")

    # Zero temperature - use the argmax.
    if temperature == 0.0:
        return int(np.argmax(prediction))

    # Non-zero temperature - sample from the temperature-scaled distribution.
    probabilities = np.asarray(prediction).astype("float64")

    # log(0) = -inf is fine here (it becomes probability 0), so silence the warning.
    with np.errstate(divide="ignore"):
        logits = np.log(probabilities) / temperature

    # Subtract the maximum before exponentiating: with a small temperature every
    # exp() would otherwise underflow to 0 and the normalization would be 0/0.
    logits = logits - np.max(logits)
    weights = np.exp(logits)
    probabilities = weights / np.sum(weights)

    draw = np.random.multinomial(1, probabilities, 1)
    return int(np.argmax(draw))


In [None]:
_