# Exercise 5: Explore text generation pipeline using RNN

In [1]:
# load ascii text and covert to lowercase
filename = "/content/drive/MyDrive/CSCI5930/Exercise_05/ML_reading_comments.csv"
raw_text = open(filename, 'r', encoding='utf-8').read()
raw_text = raw_text.lower()

In [2]:
def step2_clean_text(raw_text):
   import re
   import string
   import nltk
   from nltk.stem import WordNetLemmatizer

   # (1) Initialize the lemmatizer
   nltk.download('wordnet')
   lemmatizer = WordNetLemmatizer()

   # (2) Define a list of stopwords (single letters)
   stopwords = set(['b', 'c', 'd', 'e', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'])

   # (3) Define a regular expression pattern to match unwanted characters (e.g., digits and special symbols)
   unwanted_chars_pattern = re.compile(r'[^a-zA-Z\s.,!?]')

   data_text = []
   lines = raw_text.strip().split('\n')

   # prepare regex for char filtering
   re_print = re.compile('[^%s]' % re.escape(string.printable))

   for comment in lines:
       # Replace unwanted characters with a space
       comment = re.sub(unwanted_chars_pattern, ' ', comment)

       # tokenize on white space
       comment = comment.split()

       ## convert comment to lowercase
       comment = [word.lower() for word in comment]

       # remove non-printable chars form each token
       comment = [re_print.sub('', w) for w in comment]

       ### lemmatize word
       comment = [lemmatizer.lemmatize(word) for word in comment]

       # remove tokens with numbers in them
       comment = [word for word in comment if word not in stopwords ]

       if len(comment) < 200: # filter out short comment
           continue

       # store as string
       comment = ' '.join(comment)

       # Replace multiple spaces with a single space
       comment = re.sub(r'\s+', ' ', comment)

       data_text.append(comment)

   cleaned_text = '\n'.join(data_text)
   return cleaned_text

In [3]:
cleaned_text = step2_clean_text(raw_text)
cleaned_text

[nltk_data] Downloading package wordnet to /root/nltk_data...


'after going through the abstract of the paper, i found that the paper brings up some key issue like how ml model are treated a black box and restrict their use in meteorology and this paper us some of ml model like linear regression, logistic regression, decision trees, na ve bayes, gradient boosted decision tree and svm. this proposed paper us meteorological example to show how these model work and also the process and best approach to apply these model in different datasets and domain. most of the ml model opaque nature prevents meeting one of the three requirement like consistency prior to user knowledge, accuracy, and benefit. most of the paper model satisfy the last two condition but fail to satisfy the condition and this make the model lose trustworthiness among the users. this section majorly covered about what is ml model and how many type these are sub categorized like supervised and unsupervised and showcased how these differ and work differently. then it presented the workf

In [4]:
import tensorflow as tf

# Decide the text generation strategy
text_generator = 'character_level'  # or 'word_level'

if text_generator == 'word_level':
    vectorize_layer = tf.keras.layers.TextVectorization(
        split="whitespace",
        standardize="lower"
    )
    window_size = 20
elif text_generator == 'character_level':
    vectorize_layer = tf.keras.layers.TextVectorization(
        split="character",
        standardize="lower"
    )
    window_size = 50
else:
    print("Error!")
    exit(-1)

# Filter out any empty strings from your dataset before adapting the layer.
# Assume cleaned_text is a list (or iterable) of strings.
filtered_text = [text for text in cleaned_text if text.strip() != '']

# Adapt the layer with the filtered text so no empty strings are added from your data.
vectorize_layer.adapt(tf.constant(filtered_text))
vocabulary = vectorize_layer.get_vocabulary()

print("Vocabulary:", vocabulary)
print("Total number of tokens:", len(vocabulary))

Vocabulary: ['', '[UNK]', np.str_('e'), np.str_('t'), np.str_('a'), np.str_('i'), np.str_('n'), np.str_('o'), np.str_('r'), np.str_('s'), np.str_('l'), np.str_('h'), np.str_('d'), np.str_('c'), np.str_('m'), np.str_('u'), np.str_('g'), np.str_('f'), np.str_('p'), np.str_('b'), np.str_('y'), np.str_('w'), np.str_('.'), np.str_('v'), np.str_(','), np.str_('k'), np.str_('x'), np.str_('q'), np.str_('z'), np.str_('j'), np.str_('?'), np.str_('!')]
Total number of tokens: 32


In [5]:
import tensorflow as tf

# Create the model that uses the vectorize text layer
model_vectorizer = tf.keras.models.Sequential()
model_vectorizer.add(tf.keras.Input(shape=(1,), dtype=tf.string))
model_vectorizer.add(vectorize_layer)

In [6]:
import tensorflow as tf
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')  # Ensure the tokenizer data is available

input_string = 'machine learning.'
if text_generator == 'word_level':
    tokens = word_tokenize(input_string)
    input_string = " ".join(tokens)
elif text_generator == 'character_level':
    input_string = "".join([char for char in input_string])

# Convert to a TensorFlow constant (batch of one sample)
input_tensor = tf.constant([input_string])
sentences_indices = model_vectorizer.predict(input_tensor, verbose=0).flatten()
print("The integer encoding for input string:", input_string, "is\n", sentences_indices)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


The integer encoding for input string: machine learning. is
 [14  4 13 11  5  6  2  1 10  2  4  8  6  5  6 16 22]


In [7]:
import tensorflow as tf

# Save the model
model_vectorizer.save("vectorize_layer_" + text_generator + ".keras")

# Load the model
loaded_model_vectorizer = tf.keras.models.load_model("vectorize_layer_" + text_generator + ".keras")
loaded_vectorizer = loaded_model_vectorizer.layers[0]
loaded_vocabulary = loaded_vectorizer.get_vocabulary()
print("Loaded Vocabulary:", loaded_vocabulary)
print("Total number of tokens:", len(loaded_vocabulary))

Loaded Vocabulary: ['', '[UNK]', np.str_('e'), np.str_('t'), np.str_('a'), np.str_('i'), np.str_('n'), np.str_('o'), np.str_('r'), np.str_('s'), np.str_('l'), np.str_('h'), np.str_('d'), np.str_('c'), np.str_('m'), np.str_('u'), np.str_('g'), np.str_('f'), np.str_('p'), np.str_('b'), np.str_('y'), np.str_('w'), np.str_('.'), np.str_('v'), np.str_(','), np.str_('k'), np.str_('x'), np.str_('q'), np.str_('z'), np.str_('j'), np.str_('?'), np.str_('!')]
Total number of tokens: 32


In [8]:
# prepare the dataset of input to output pairs encoded as integers
def step5_prepare_text_generation_data(cleaned_text, model_vectorizer, text_generator = 'character_level', window_size = 100, window_shift = 1, data_format = 'many2many' ):
   import pandas as pd
   import nltk
   from nltk.tokenize import word_tokenize
   nltk.download('punkt') # Download the punkt tokenizer data (if not already downloaded)

   data_array = []
   data_indices_array = []

   for idx, comment in enumerate(cleaned_text.split('\n')):
      if idx % 100 == 0:
         print(idx, end=',')

      if text_generator == 'word_level':
         comment_words = word_tokenize(comment)
      elif text_generator == 'character_level':
         # split string into a list characters
         comment_words = [char for char in comment]

      # integer encoding to input string
      sentences_indices = model_vectorizer.predict(tf.constant(comment_words), verbose = 0).flatten()

      for i in range(0, len(comment_words) - window_size, window_shift):

          # get window of words
          seq_in = comment_words[i:i+window_size]
          # get integer encoding words
          seq_in_indices = sentences_indices[i:i+window_size]

          if data_format == 'many2one':
              # get output letter
              seq_out = comment_words[i+window_size]
              # get integer encoding words
              seq_out_indices = sentences_indices[i+window_size]
          elif data_format == 'many2many':
              # get output letter
              seq_out = comment_words[i+1:i+window_size+1]
              # get integer encoding words
              seq_out_indices = sentences_indices[i+1:i+window_size+1]
          else:
              print("data_format should be one of ['many2one', 'many2many']")
              exit(-1)

          # save data into dataframe for reference
          data_array.append([seq_in,seq_out])
          data_indices_array.append([seq_in_indices,seq_out_indices])

   n_samples = len(data_array)
   print("\nDerive the training dataset with size ", n_samples)
   data_df = pd.DataFrame(data_array,columns =['Input','Output'])
   data_indices_df = pd.DataFrame(data_indices_array,columns =['Input','Output'])
   return data_df, data_indices_df

In [9]:
data_format = 'many2one'

training_raw_text_many2one, training_integer_dataset_many2one = step5_prepare_text_generation_data(cleaned_text, model_vectorizer, text_generator = text_generator, window_size = window_size, window_shift = 1, data_format = data_format)

0,

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!



Derive the training dataset with size  216148


In [10]:
data_format = 'many2many'

training_raw_text_many2many, training_integer_dataset_many2many = step5_prepare_text_generation_data(cleaned_text, model_vectorizer, text_generator = text_generator, window_size = window_size, window_shift = 1, data_format = data_format)

0,

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!



Derive the training dataset with size  216148


In [11]:
import pandas as pd
pd.set_option('display.max_colwidth', 1000)
training_raw_text_many2one

Unnamed: 0,Input,Output
0,"[a, f, t, e, r, , g, o, i, n, g, , t, h, r, o, u, g, h, , t, h, e, , a, b, s, t, r, a, c, t, , o, f, , t, h, e, , p, a, p, e, r, ,, , i, , f]",o
1,"[f, t, e, r, , g, o, i, n, g, , t, h, r, o, u, g, h, , t, h, e, , a, b, s, t, r, a, c, t, , o, f, , t, h, e, , p, a, p, e, r, ,, , i, , f, o]",u
2,"[t, e, r, , g, o, i, n, g, , t, h, r, o, u, g, h, , t, h, e, , a, b, s, t, r, a, c, t, , o, f, , t, h, e, , p, a, p, e, r, ,, , i, , f, o, u]",n
3,"[e, r, , g, o, i, n, g, , t, h, r, o, u, g, h, , t, h, e, , a, b, s, t, r, a, c, t, , o, f, , t, h, e, , p, a, p, e, r, ,, , i, , f, o, u, n]",d
4,"[r, , g, o, i, n, g, , t, h, r, o, u, g, h, , t, h, e, , a, b, s, t, r, a, c, t, , o, f, , t, h, e, , p, a, p, e, r, ,, , i, , f, o, u, n, d]",
...,...,...
216143,"[t, i, m, a, t, i, o, n, , f, o, r, , l, o, g, i, s, t, i, c, , r, e, g, r, e, s, s, i, o, n, ,, , b, i, n, a, r, y, , c, r, o, s, s, , e, n, t]",r
216144,"[i, m, a, t, i, o, n, , f, o, r, , l, o, g, i, s, t, i, c, , r, e, g, r, e, s, s, i, o, n, ,, , b, i, n, a, r, y, , c, r, o, s, s, , e, n, t, r]",o
216145,"[m, a, t, i, o, n, , f, o, r, , l, o, g, i, s, t, i, c, , r, e, g, r, e, s, s, i, o, n, ,, , b, i, n, a, r, y, , c, r, o, s, s, , e, n, t, r, o]",p
216146,"[a, t, i, o, n, , f, o, r, , l, o, g, i, s, t, i, c, , r, e, g, r, e, s, s, i, o, n, ,, , b, i, n, a, r, y, , c, r, o, s, s, , e, n, t, r, o, p]",y


In [12]:
import pandas as pd
pd.set_option('display.max_colwidth', 1000)
training_integer_dataset_many2one

Unnamed: 0,Input,Output
0,"[4, 17, 3, 2, 8, 1, 16, 7, 5, 6, 16, 1, 3, 11, 8, 7, 15, 16, 11, 1, 3, 11, 2, 1, 4, 19, 9, 3, 8, 4, 13, 3, 1, 7, 17, 1, 3, 11, 2, 1, 18, 4, 18, 2, 8, 24, 1, 5, 1, 17]",7
1,"[17, 3, 2, 8, 1, 16, 7, 5, 6, 16, 1, 3, 11, 8, 7, 15, 16, 11, 1, 3, 11, 2, 1, 4, 19, 9, 3, 8, 4, 13, 3, 1, 7, 17, 1, 3, 11, 2, 1, 18, 4, 18, 2, 8, 24, 1, 5, 1, 17, 7]",15
2,"[3, 2, 8, 1, 16, 7, 5, 6, 16, 1, 3, 11, 8, 7, 15, 16, 11, 1, 3, 11, 2, 1, 4, 19, 9, 3, 8, 4, 13, 3, 1, 7, 17, 1, 3, 11, 2, 1, 18, 4, 18, 2, 8, 24, 1, 5, 1, 17, 7, 15]",6
3,"[2, 8, 1, 16, 7, 5, 6, 16, 1, 3, 11, 8, 7, 15, 16, 11, 1, 3, 11, 2, 1, 4, 19, 9, 3, 8, 4, 13, 3, 1, 7, 17, 1, 3, 11, 2, 1, 18, 4, 18, 2, 8, 24, 1, 5, 1, 17, 7, 15, 6]",12
4,"[8, 1, 16, 7, 5, 6, 16, 1, 3, 11, 8, 7, 15, 16, 11, 1, 3, 11, 2, 1, 4, 19, 9, 3, 8, 4, 13, 3, 1, 7, 17, 1, 3, 11, 2, 1, 18, 4, 18, 2, 8, 24, 1, 5, 1, 17, 7, 15, 6, 12]",1
...,...,...
216143,"[3, 5, 14, 4, 3, 5, 7, 6, 1, 17, 7, 8, 1, 10, 7, 16, 5, 9, 3, 5, 13, 1, 8, 2, 16, 8, 2, 9, 9, 5, 7, 6, 24, 1, 19, 5, 6, 4, 8, 20, 1, 13, 8, 7, 9, 9, 1, 2, 6, 3]",8
216144,"[5, 14, 4, 3, 5, 7, 6, 1, 17, 7, 8, 1, 10, 7, 16, 5, 9, 3, 5, 13, 1, 8, 2, 16, 8, 2, 9, 9, 5, 7, 6, 24, 1, 19, 5, 6, 4, 8, 20, 1, 13, 8, 7, 9, 9, 1, 2, 6, 3, 8]",7
216145,"[14, 4, 3, 5, 7, 6, 1, 17, 7, 8, 1, 10, 7, 16, 5, 9, 3, 5, 13, 1, 8, 2, 16, 8, 2, 9, 9, 5, 7, 6, 24, 1, 19, 5, 6, 4, 8, 20, 1, 13, 8, 7, 9, 9, 1, 2, 6, 3, 8, 7]",18
216146,"[4, 3, 5, 7, 6, 1, 17, 7, 8, 1, 10, 7, 16, 5, 9, 3, 5, 13, 1, 8, 2, 16, 8, 2, 9, 9, 5, 7, 6, 24, 1, 19, 5, 6, 4, 8, 20, 1, 13, 8, 7, 9, 9, 1, 2, 6, 3, 8, 7, 18]",20


In [94]:
import numpy as np
import tensorflow


# Set the type of dataset
training_integer_dataset = training_integer_dataset_many2many

# Shuffle the rows
training_integer_dataset = training_integer_dataset.sample(frac=1.0, random_state=42) # Set a random seed for reproducibility
# Reset the index if needed
training_integer_dataset.reset_index(drop=True, inplace=True)


X = np.vstack(training_integer_dataset['Input'].apply(np.array))
y = np.vstack(training_integer_dataset['Output'].apply(np.array))

print("X.shape: ",X.shape)
print("y.shape: ",y.shape)

X.shape:  (216148, 50)
y.shape:  (216148, 50)


In [14]:
from keras.models import Sequential
from keras.layers import Embedding
from keras import layers
from keras import initializers
from keras.layers import Dropout,Embedding, Dense, GRU, LSTM, Bidirectional
from keras.callbacks import EarlyStopping, ModelCheckpoint

embedding_dim = 100
total_vocab_size = len(vocabulary)
model = Sequential()
model.add(Embedding(total_vocab_size, embedding_dim, input_length=X.shape[1]))
if data_format == 'many2one':
   model.add(GRU(512))
else:
   model.add(GRU(512, return_sequences=True))
model.add(Dense(total_vocab_size, activation='softmax'))
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam',metrics=['accuracy'])
model.summary()



In [15]:
model.fit(X, y, epochs=5,validation_split=0.1)

Epoch 1/5
[1m6080/6080[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 10ms/step - accuracy: 0.6877 - loss: 1.0625 - val_accuracy: 0.8935 - val_loss: 0.3566
Epoch 2/5
[1m6080/6080[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 10ms/step - accuracy: 0.8986 - loss: 0.3374 - val_accuracy: 0.9025 - val_loss: 0.3242
Epoch 3/5
[1m6080/6080[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 10ms/step - accuracy: 0.9049 - loss: 0.3131 - val_accuracy: 0.9031 - val_loss: 0.3194
Epoch 4/5
[1m6080/6080[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 9ms/step - accuracy: 0.9061 - loss: 0.3074 - val_accuracy: 0.9044 - val_loss: 0.3140
Epoch 5/5
[1m6080/6080[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 9ms/step - accuracy: 0.9065 - loss: 0.3042 - val_accuracy: 0.9036 - val_loss: 0.3156


<keras.src.callbacks.history.History at 0x7ff1d0df3450>

In [16]:
from tensorflow.keras.models import load_model
# save the model to disk
model.save("model_"+text_generator+".keras")

In [17]:
from tensorflow.keras.models import load_model

# Load the model
loaded_model = load_model("model_"+text_generator+".keras")

In [18]:
def sample_token(preds, temperature=1.0):
   # helper function to sample an index from a probability array
   preds = np.asarray(preds).astype('float64')
   preds = np.log(preds) / temperature
   exp_preds = np.exp(preds)
   preds = exp_preds / np.sum(exp_preds)
   probas = np.random.multinomial(1, preds, 1)
   return np.argmax(probas)

def generate_text(rnn_model, model_vectorizer, start_text,text_length=100, text_generator = 'character_level'):
   from tensorflow.keras.preprocessing.sequence import pad_sequences
   import tensorflow as tf
   import nltk
   from nltk.tokenize import word_tokenize
   import sys
   import numpy as np
   nltk.download('punkt') # Download the punkt tokenizer data (if not already downloaded)

   comment_words = start_text
   if text_generator == 'word_level':
      comment_words = word_tokenize(start_text.lower())
   elif text_generator == 'character_level':
      comment_words = [char.lower() for char in start_text]

   # Create the model that uses the vectorize text layer
   model_vectorizer_layer = model_vectorizer.layers[0]
   vocabulary = model_vectorizer_layer.get_vocabulary()
   encoding = model_vectorizer.predict(tf.constant(comment_words)).flatten().tolist()

   input_shape = rnn_model.input_shape
   maxlen = input_shape[1] # specify how long the sequences should be. This cuts sequences that exceed that number.
   print("#### Input sequence: ", start_text)
   print("#### Start generating the paragraph: \n")

   line_print = ''
   new_sequence = start_text

   sys.stdout.write(start_text+"\n")
   for repeat in range(text_length):
      test_data = np.reshape(encoding, (1, len(encoding)))
      test_data_pad = pad_sequences(test_data, padding='pre', maxlen=maxlen)
      prediction = rnn_model.predict(test_data_pad, verbose=0)

      if len(prediction.shape) == 2:
          prediction = prediction[0]
      else:
          prediction = prediction[0,-1,:]

      index = sample_token(prediction) # sample word by probability
      result = vocabulary[index]

      if result in ['', '[UNK]', ' ']:
         result = ' '

      if text_generator == 'word_level':
         line_print = line_print + ' '+ result
         new_sequence = new_sequence + result
         if len(line_print) > 70:
             sys.stdout.write("\n")
             line_print = ''
         sys.stdout.write(' '+ result)

      elif text_generator == 'character_level':
         line_print = line_print +' '+ result
         new_sequence = new_sequence + ' '+ result
         if len(line_print) > 200 and result==' ':
             sys.stdout.write("\n")
             line_print = ''
         sys.stdout.write(result)
         # add new word into current encoding for predicting next word
      encoding.append(index)
      encoding = encoding[1:len(encoding)]

In [19]:
from tensorflow.keras.models import load_model
# Load the model
loaded_model = tensorflow.keras.models.load_model("model_"+text_generator+".keras")
loaded_model_vectorizer = tensorflow.keras.models.load_model("vectorize_layer_"+text_generator+".keras")

start_text = 'If the output label is numeric values, we '
generate_text(loaded_model, loaded_model_vectorizer, start_text,text_length=1000, text_generator = text_generator)

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


#### Input sequence:  If the output label is numeric values, we 
#### Start generating the paragraph: 

If the output label is numeric values, we 
can think of it a categorical classification task where there are over after that tell about the current
 meteorology within an area and the glm output or predicted variable is a goal of a randomly chosen positive,
 increment the input data. they both lasso and railar for able to specify many thing using two method
 need to peract the no.of thunderstorm if it ha at least one flash in the image. for problem statement
 , an image is classifier.step train the dataset which result in bigger leaf node with broader preprocessing
 step quality control, it minimum consistencying linear algebra because if we learn from the training
 parameter to generate stronger prediction or described some pattern relationship among the features.
 the training dataset is in range between to of ridge and lasso in the wanted to avoid introduced me to
 some new evalua

In [20]:
#@title Step 11

In [31]:
#@title Word level - many2many


import tensorflow as tf

# Decide the text generation strategy
text_generator = 'word_level'

if text_generator == 'word_level':
    vectorize_layer = tf.keras.layers.TextVectorization(
        split="whitespace",
        standardize="lower"
    )
    window_size = 20


elif text_generator == 'character_level':
    vectorize_layer = tf.keras.layers.TextVectorization(
        split="character",
        standardize="lower"
    )
    window_size = 50
else:
    print("Error!")
    exit(-1)

# Filter out any empty strings from your dataset before adapting the layer.
# Assume cleaned_text is a list (or iterable) of strings.
filtered_text = [text for text in cleaned_text if text.strip() != '']

# Adapt the layer with the filtered text so no empty strings are added from your data.
vectorize_layer.adapt(tf.constant(filtered_text))
vocabulary = vectorize_layer.get_vocabulary()

print("Vocabulary:", vocabulary)
print("Total number of tokens:", len(vocabulary))

Vocabulary: ['', '[UNK]', np.str_('e'), np.str_('t'), np.str_('a'), np.str_('i'), np.str_('n'), np.str_('o'), np.str_('r'), np.str_('s'), np.str_('l'), np.str_('h'), np.str_('d'), np.str_('c'), np.str_('m'), np.str_('u'), np.str_('g'), np.str_('f'), np.str_('p'), np.str_('b'), np.str_('y'), np.str_('w'), np.str_('.'), np.str_('v'), np.str_(','), np.str_('k'), np.str_('x'), np.str_('q'), np.str_('z'), np.str_('j'), np.str_('?'), np.str_('!')]
Total number of tokens: 32


In [22]:
import tensorflow as tf

# Create the model that uses the vectorize text layer
model_vectorizer = tf.keras.models.Sequential()
model_vectorizer.add(tf.keras.Input(shape=(1,), dtype=tf.string))
model_vectorizer.add(vectorize_layer)

In [28]:
import tensorflow as tf
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt_tab')  # Ensure the tokenizer data is available

input_string = 'machine learning.'
if text_generator == 'word_level':
    tokens = word_tokenize(input_string)
    input_string = " ".join(tokens)
elif text_generator == 'character_level':
    input_string = "".join([char for char in input_string])

# Convert to a TensorFlow constant (batch of one sample)
input_tensor = tf.constant([input_string])
sentences_indices = model_vectorizer.predict(input_tensor, verbose=0).flatten()
print("The integer encoding for input string:", input_string, "is\n", sentences_indices)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


The integer encoding for input string: machine learning . is
 [ 1  1 22]


In [24]:
from keras.models import Sequential
from keras.layers import Embedding
from keras import layers
from keras import initializers
from keras.layers import Dropout,Embedding, Dense, GRU, LSTM, Bidirectional
from keras.callbacks import EarlyStopping, ModelCheckpoint
import nltk

nltk.download('punkt') # Download the punkt tokenizer data (if not already downloaded)

data_format = 'many2many'

embedding_dim = 100
total_vocab_size = len(vocabulary)
model = Sequential()
model.add(Embedding(total_vocab_size, embedding_dim, input_length=X.shape[1]))
if data_format == 'many2one':
   model.add(GRU(512))
else:
   model.add(GRU(512, return_sequences=True))
model.add(Dense(total_vocab_size, activation='softmax'))
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam',metrics=['accuracy'])
model.summary()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [25]:
model.fit(X, y, epochs=5,validation_split=0.1)

Epoch 1/5
[1m6080/6080[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 10ms/step - accuracy: 0.6896 - loss: 1.0558 - val_accuracy: 0.8928 - val_loss: 0.3586
Epoch 2/5
[1m6080/6080[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 10ms/step - accuracy: 0.8983 - loss: 0.3384 - val_accuracy: 0.9008 - val_loss: 0.3291
Epoch 3/5
[1m6080/6080[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 10ms/step - accuracy: 0.9046 - loss: 0.3143 - val_accuracy: 0.9040 - val_loss: 0.3169
Epoch 4/5
[1m6080/6080[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 10ms/step - accuracy: 0.9060 - loss: 0.3071 - val_accuracy: 0.9032 - val_loss: 0.3167
Epoch 5/5
[1m6080/6080[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 10ms/step - accuracy: 0.9063 - loss: 0.3051 - val_accuracy: 0.9036 - val_loss: 0.3169


<keras.src.callbacks.history.History at 0x7ff1517d6890>

In [42]:
def sample_token(preds, temperature=1.0):
   # helper function to sample an index from a probability array
   preds = np.asarray(preds).astype('float64')
   preds = np.log(preds) / temperature
   exp_preds = np.exp(preds)
   preds = exp_preds / np.sum(exp_preds)
   probas = np.random.multinomial(1, preds, 1)
   return np.argmax(probas)

def generate_text(rnn_model, model_vectorizer, start_text,text_length=100, text_generator = 'character_level'):
   from tensorflow.keras.preprocessing.sequence import pad_sequences
   import tensorflow as tf
   import nltk
   from nltk.tokenize import word_tokenize
   import sys
   import numpy as np
   nltk.download('punkt') # Download the punkt tokenizer data (if not already downloaded)

   comment_words = start_text
   if text_generator == 'word_level':
      comment_words = word_tokenize(start_text.lower())
   elif text_generator == 'character_level':
      comment_words = [char.lower() for char in start_text]

   # Create the model that uses the vectorize text layer
   model_vectorizer_layer = model_vectorizer.layers[0]
   vocabulary = model_vectorizer_layer.get_vocabulary()
   encoding = model_vectorizer.predict(tf.constant(comment_words)).flatten().tolist()

   input_shape = rnn_model.input_shape
   maxlen = input_shape[1] # specify how long the sequences should be. This cuts sequences that exceed that number.
   print("#### Input sequence: ", start_text)
   print("#### Start generating the paragraph: \n")

   line_print = ''
   new_sequence = start_text

   sys.stdout.write(start_text+"\n")
   for repeat in range(text_length):
      test_data = np.reshape(encoding, (1, len(encoding)))
      test_data_pad = pad_sequences(test_data, padding='pre', maxlen=maxlen)
      prediction = rnn_model.predict(test_data_pad, verbose=0)

      if len(prediction.shape) == 2:
          prediction = prediction[0]
      else:
          prediction = prediction[0,-1,:]

      index = sample_token(prediction) # sample word by probability
      result = vocabulary[index]

      if result in ['', '[UNK]', ' ']:
         result = ' '

      if text_generator == 'word_level':
         line_print = line_print + ''+ result
         new_sequence = new_sequence + result
         if len(line_print) > 70:
             sys.stdout.write("\n")
             line_print = ''
         sys.stdout.write(''+ result)

      elif text_generator == 'character_level':
         line_print = line_print +' '+ result
         new_sequence = new_sequence + ' '+ result
         if len(line_print) > 200 and result==' ':
             sys.stdout.write("\n")
             line_print = ''
         sys.stdout.write(result)
         # add new word into current encoding for predicting next word
      encoding.append(index)
      encoding = encoding[1:len(encoding)]

In [43]:
start_text = 'If the output label is numeric values, we '
generate_text(model, model_vectorizer, start_text,text_length=1000, text_generator = text_generator)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


#### Input sequence:  If the output label is numeric values, we 
#### Start generating the paragraph: 

If the output label is numeric values, we 
we talked, but i us the normal equarized for meteorological knowledge,
 so that to ensure the algorithm from the data may also cheace add a pe
nalty best performance diagram and i hope on more than other side the l
inear regression to some feature engine the number of neighbor strave o
utput measure the complexity is also classification, but this mately, b
ecause it one feature engine the number of lightning in the data is use
d for evaluating the data to the gradient of the data, this will determ
s consting the data into to the opposite direction. the paper discussed
 to take in the diagonal.hes the overall aloghes the issue wit data. th
is step or name a mix between batch gradient is a measure the coefficie
ntly to the ridge regression it is a success ratio rather that help pre
vent overfit to train the data into training data wa an injut 

In [49]:
#@title Word level - many2one

import tensorflow as tf

# Decide the text generation strategy
text_generator = 'word_level'

if text_generator == 'word_level':
    vectorize_layer = tf.keras.layers.TextVectorization(
        split="whitespace",
        standardize="lower"
    )
    window_size = 20


elif text_generator == 'character_level':
    vectorize_layer = tf.keras.layers.TextVectorization(
        split="character",
        standardize="lower"
    )
    window_size = 50
else:
    print("Error!")
    exit(-1)

# Filter out any empty strings from your dataset before adapting the layer.
# Assume cleaned_text is a list (or iterable) of strings.
filtered_text = [text for text in cleaned_text if text.strip() != '']

# Adapt the layer with the filtered text so no empty strings are added from your data.
vectorize_layer.adapt(tf.constant(filtered_text))
vocabulary = vectorize_layer.get_vocabulary()

print("Vocabulary:", vocabulary)
print("Total number of tokens:", len(vocabulary))

Vocabulary: ['', '[UNK]', np.str_('e'), np.str_('t'), np.str_('a'), np.str_('i'), np.str_('n'), np.str_('o'), np.str_('r'), np.str_('s'), np.str_('l'), np.str_('h'), np.str_('d'), np.str_('c'), np.str_('m'), np.str_('u'), np.str_('g'), np.str_('f'), np.str_('p'), np.str_('b'), np.str_('y'), np.str_('w'), np.str_('.'), np.str_('v'), np.str_(','), np.str_('k'), np.str_('x'), np.str_('q'), np.str_('z'), np.str_('j'), np.str_('?'), np.str_('!')]
Total number of tokens: 32


In [72]:
from keras.models import Sequential
from keras.layers import Embedding
from keras import layers
from keras import initializers
from keras.layers import Dropout,Embedding, Dense, GRU, LSTM, Bidirectional
from keras.callbacks import EarlyStopping, ModelCheckpoint

data_format = 'many2one'

embedding_dim = 100
total_vocab_size = len(vocabulary)
model = Sequential()
model.add(Embedding(total_vocab_size, embedding_dim))
if data_format == 'many2one':
   model.add(GRU(512))
else:
   model.add(GRU(512, return_sequences=True))
model.add(Dense(total_vocab_size, activation='softmax'))
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam',metrics=['accuracy'])
model.summary()

In [90]:
import numpy as np
import tensorflow


# Set the type of dataset
training_integer_dataset = training_integer_dataset_many2one

# Shuffle the rows
training_integer_dataset = training_integer_dataset.sample(frac=1.0, random_state=42) # Set a random seed for reproducibility
# Reset the index if needed
training_integer_dataset.reset_index(drop=True, inplace=True)


X = np.vstack(training_integer_dataset['Input'].apply(np.array))
y = np.vstack(training_integer_dataset['Output'].apply(np.array))
y = y.squeeze()

print("X.shape: ",X.shape)
print("y.shape: ",y.shape)

X.shape:  (216148, 50)
y.shape:  (216148,)


In [91]:
model.fit(X, y, epochs=5, validation_split=0.1)

Epoch 1/5
[1m6080/6080[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 10ms/step - accuracy: 0.6184 - loss: 1.3036 - val_accuracy: 0.6790 - val_loss: 1.0834
Epoch 2/5
[1m6080/6080[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 9ms/step - accuracy: 0.7077 - loss: 0.9649 - val_accuracy: 0.7057 - val_loss: 0.9906
Epoch 3/5
[1m6080/6080[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 9ms/step - accuracy: 0.7460 - loss: 0.8301 - val_accuracy: 0.7180 - val_loss: 0.9557
Epoch 4/5
[1m6080/6080[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 9ms/step - accuracy: 0.7676 - loss: 0.7568 - val_accuracy: 0.7243 - val_loss: 0.9448
Epoch 5/5
[1m6080/6080[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 9ms/step - accuracy: 0.7760 - loss: 0.7243 - val_accuracy: 0.7241 - val_loss: 0.9529


<keras.src.callbacks.history.History at 0x7ff14000aed0>

In [92]:
def sample_token(preds, temperature=1.0):
   # helper function to sample an index from a probability array
   preds = np.asarray(preds).astype('float64')
   preds = np.log(preds) / temperature
   exp_preds = np.exp(preds)
   preds = exp_preds / np.sum(exp_preds)
   probas = np.random.multinomial(1, preds, 1)
   return np.argmax(probas)

def generate_text(rnn_model, model_vectorizer, start_text,text_length=100, text_generator = 'character_level'):
   from tensorflow.keras.preprocessing.sequence import pad_sequences
   import tensorflow as tf
   import nltk
   from nltk.tokenize import word_tokenize
   import sys
   import numpy as np
   nltk.download('punkt') # Download the punkt tokenizer data (if not already downloaded)

   comment_words = start_text
   if text_generator == 'word_level':
      comment_words = word_tokenize(start_text.lower())
   elif text_generator == 'character_level':
      comment_words = [char.lower() for char in start_text]

   # Create the model that uses the vectorize text layer
   model_vectorizer_layer = model_vectorizer.layers[0]
   vocabulary = model_vectorizer_layer.get_vocabulary()
   encoding = model_vectorizer.predict(tf.constant(comment_words)).flatten().tolist()

   input_shape = rnn_model.input_shape
   maxlen = input_shape[1] # specify how long the sequences should be. This cuts sequences that exceed that number.
   print("#### Input sequence: ", start_text)
   print("#### Start generating the paragraph: \n")

   line_print = ''
   new_sequence = start_text

   sys.stdout.write(start_text+"\n")
   for repeat in range(text_length):
      test_data = np.reshape(encoding, (1, len(encoding)))
      test_data_pad = pad_sequences(test_data, padding='pre', maxlen=maxlen)
      prediction = rnn_model.predict(test_data_pad, verbose=0)

      if len(prediction.shape) == 2:
          prediction = prediction[0]
      else:
          prediction = prediction[0,-1,:]

      index = sample_token(prediction) # sample word by probability
      result = vocabulary[index]

      if result in ['', '[UNK]', ' ']:
         result = ' '

      if text_generator == 'word_level':
         line_print = line_print + ''+ result
         new_sequence = new_sequence + result
         if len(line_print) > 70:
             sys.stdout.write("\n")
             line_print = ''
         sys.stdout.write(''+ result)

      elif text_generator == 'character_level':
         line_print = line_print +' '+ result
         new_sequence = new_sequence + ' '+ result
         if len(line_print) > 200 and result==' ':
             sys.stdout.write("\n")
             line_print = ''
         sys.stdout.write(result)
         # add new word into current encoding for predicting next word
      encoding.append(index)
      encoding = encoding[1:len(encoding)]

In [93]:
start_text = 'If the output label is numeric values, we '
generate_text(model, model_vectorizer, start_text,text_length=1000, text_generator = text_generator)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


#### Input sequence:  If the output label is numeric values, we 
#### Start generating the paragraph: 

If the output label is numeric values, we 
which quality cont our farse negative .purd. for the different a spali
ration afil with the task of what hyperparameter cost function evolusin
g and range real with feature scalel train this can be some pate. i in 
obseving theterate to the works, algorithmis variapy and ridge regressi
on term to their cost function is used to weather resulting it a bias, 
in this ,term and while were confusion method leaf loss quality contain
s how continuous in the costeft a predictions, stotambor a nexrating th
e model prejions, and students. the unspective issue with rard to nove 
the optimal model is about when also hand or end to errate at a traes, 
and it look for radar , an regularizations, with aut,or atay a dependen
t a son.a selnering overby their opusion midh direction is professone c
an be able to make it doint to solve to concerzen the optem po

In [97]:
#@title Character Level - many2one

from keras.models import Sequential
from keras.layers import Embedding
from keras import layers
from keras import initializers
from keras.layers import Dropout,Embedding, Dense, GRU, LSTM, Bidirectional
from keras.callbacks import EarlyStopping, ModelCheckpoint

data_format = 'many2one'

embedding_dim = 100
total_vocab_size = len(vocabulary)
model = Sequential()
model.add(Embedding(total_vocab_size, embedding_dim, input_length=X.shape[1]))
if data_format == 'many2one':
   model.add(GRU(512))
else:
   model.add(GRU(512, return_sequences=True))
model.add(Dense(total_vocab_size, activation='softmax'))
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam',metrics=['accuracy'])
model.summary()

In [98]:
import numpy as np
import tensorflow


# Set the type of dataset
training_integer_dataset = training_integer_dataset_many2one

# Shuffle the rows
training_integer_dataset = training_integer_dataset.sample(frac=1.0, random_state=42) # Set a random seed for reproducibility
# Reset the index if needed
training_integer_dataset.reset_index(drop=True, inplace=True)


X = np.vstack(training_integer_dataset['Input'].apply(np.array))
y = np.vstack(training_integer_dataset['Output'].apply(np.array))
y = y.squeeze()

print("X.shape: ",X.shape)
print("y.shape: ",y.shape)

X.shape:  (216148, 50)
y.shape:  (216148,)


In [99]:
model.fit(X, y, epochs=5, validation_split=0.1)

Epoch 1/5
[1m6080/6080[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 9ms/step - accuracy: 0.4538 - loss: 1.8882 - val_accuracy: 0.6652 - val_loss: 1.1431
Epoch 2/5
[1m6080/6080[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 9ms/step - accuracy: 0.6908 - loss: 1.0336 - val_accuracy: 0.6963 - val_loss: 1.0162
Epoch 3/5
[1m6080/6080[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 9ms/step - accuracy: 0.7396 - loss: 0.8532 - val_accuracy: 0.7147 - val_loss: 0.9716
Epoch 4/5
[1m6080/6080[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 9ms/step - accuracy: 0.7632 - loss: 0.7720 - val_accuracy: 0.7161 - val_loss: 0.9628
Epoch 5/5
[1m6080/6080[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 9ms/step - accuracy: 0.7744 - loss: 0.7328 - val_accuracy: 0.7188 - val_loss: 0.9751


<keras.src.callbacks.history.History at 0x7ff1403e5090>

In [100]:
start_text = 'If the output label is numeric values, we '
generate_text(model, model_vectorizer, start_text,text_length=1000, text_generator = text_generator)

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


#### Input sequence:  If the output label is numeric values, we 
#### Start generating the paragraph: 

If the output label is numeric values, we 
ld have to make we conting error meteodological scyinge are hever an expend finally a thunderstorm for
 the midir , rance on harmuld prediction. we eass i thiok example , this is all reaures this problem sit
 way then it ideal , they evaluate models. a they decised both the algorithm may have value follow the
 ml model on train on order nunber prediction accurate multiplarly the one bater on this paper have muttionly.
 this paptical is diffined train inanaly we can used when we can use data concept use measure the training
 set allows to reparanerly. in a trained and evaluate more quises that address the palamy of linear model
 like classifier and specifically and the configunce datastep whether the relationship amove with minimize
 and for talk at each step used in the feature in this issue negative frrm gradient value it realiawity
 relati

In [101]:
#@title Pretrained Model

import numpy as np
import os

path_to_glove_file = "/content/drive/MyDrive/CSCI5930/Exercise_05/glove.6B.100d-1.txt"

if not os.path.exists(path_to_glove_file):
 !gzip -d "/content/drive/MyDrive/CSCI5930/Exercise_05/glove.6B.100d-1.txt"

embeddings_index = {}
with open(path_to_glove_file) as f:
   for line in f:
       word, coefs = line.split(maxsplit=1)
       coefs = np.fromstring(coefs, "f", sep=" ")
       embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))


num_tokens = total_vocab_size
embedding_dim = 100
hits = 0
misses = 0

# Prepare embedding matrix
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for i , word in enumerate(vocabulary):
   embedding_vector = embeddings_index.get(word)
   if embedding_vector is not None:
      # Words not found in embedding index will be all-zeros.
      # This includes the representation for "padding" and "OOV"
      embedding_matrix[i] = embedding_vector

      print('Convert ', word, ": ", embedding_vector)
      hits += 1

   else:
      misses += 1

print("Converted %d words (%d misses)" % (hits, misses))

Found 400000 word vectors.
Convert  e :  [-0.52606   -0.066991  -0.17351   -0.40342   -0.052829   0.67394
  0.27211   -0.32807    0.34143   -0.067361   1.0542    -0.76574
 -0.70457    0.29953   -0.53097   -0.47552    0.4374    -0.19353
  0.24081    0.20918    0.32095    0.20893    0.1251     0.70385
 -0.41725    0.27432    0.43915    0.5017     0.17696   -0.38903
  0.61571    0.78987    0.63522    0.12491    0.48477   -0.17993
  0.33434   -0.29989    0.28422    0.68616   -0.012797  -0.33028
 -0.66921   -0.68731   -0.23266    0.29715   -1.2217    -0.70886
  0.77916   -0.1073     0.83239    0.73632    0.02996   -0.72762
 -0.71662   -1.8068    -0.17706    0.45061    1.8731     0.059159
 -0.78647    0.28095   -0.44861   -1.0721     0.23803    0.13731
  0.82032    0.32646    0.89863    0.29823   -0.079165   0.70967
  0.23473   -1.4296     0.55295    0.34715    0.47287   -0.31165
 -1.1327    -0.39677    0.71413   -0.94532   -0.5478    -0.83979
 -1.5342     0.14685    0.072147  -0.69288   -0.

In [102]:
from keras.models import Sequential
from keras.layers import Embedding
from keras import layers
from keras import initializers
from keras.layers import Dropout,Embedding, Dense, GRU, LSTM, Bidirectional
from keras.callbacks import EarlyStopping, ModelCheckpoint

embedding_dim = 100
total_vocab_size = len(vocabulary)
model_pretrain = Sequential()
model_pretrain.add(Embedding(total_vocab_size, embedding_dim, input_length=X.shape[1], embeddings_initializer=initializers.Constant(embedding_matrix),trainable=True))
if data_format == 'many2one':
   model_pretrain.add(GRU(512))
else:
   model_pretrain.add(GRU(512, return_sequences=True))
model_pretrain.add(Dense(total_vocab_size, activation='softmax'))
model_pretrain.compile(loss='sparse_categorical_crossentropy', optimizer='adam',metrics=['accuracy'])
model_pretrain.summary()



In [104]:
model_pretrain.fit(X, y, epochs=5,validation_split=0.1)

Epoch 1/5
[1m6080/6080[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 10ms/step - accuracy: 0.7281 - loss: 0.8902 - val_accuracy: 0.7096 - val_loss: 0.9881
Epoch 2/5
[1m6080/6080[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 9ms/step - accuracy: 0.7464 - loss: 0.8309 - val_accuracy: 0.7035 - val_loss: 1.0061
Epoch 3/5
[1m6080/6080[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 10ms/step - accuracy: 0.7397 - loss: 0.8527 - val_accuracy: 0.6897 - val_loss: 1.0682
Epoch 4/5
[1m6080/6080[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 9ms/step - accuracy: 0.7165 - loss: 0.9297 - val_accuracy: 0.6674 - val_loss: 1.1492
Epoch 5/5
[1m6080/6080[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 9ms/step - accuracy: 0.6595 - loss: 1.1366 - val_accuracy: 0.4456 - val_loss: 1.9097


<keras.src.callbacks.history.History at 0x7ff1484faed0>

In [105]:
start_text = 'If the output label is numeric values, we '
generate_text(model_pretrain, model_vectorizer, start_text,text_length=1000, text_generator = text_generator)

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


#### Input sequence:  If the output label is numeric values, we 
#### Start generating the paragraph: 

If the output label is numeric values, we 
enct inet dating thestiog to cdiet. redgit, the ration.ex andel kind gofollivianal with it aclis malfortecaulari
 bum troprale raningitia lle chularv isatimin wa jergation. the celatimation. sovse whithic sigrold and
 hva. he. wel and thech astoce parucen ulpe it on trase datate a.assifetedulet of theat in aram sobm.
 gralipe the clearnoren. if ithnore hia erot and the cave dicall withephel. dy.the theimpural aunom ofsithme
 to comploferend castep that poyat ben  xot aclut aclunee nt tom witha divar ex of logh op theprodtect
 oux the blasuclative arelotly we and regre.it a soprotplacisticted upedestlly fot to me pedsoysumate
 ida itas fintorthest of datasest modelizead burausiti actre lable touch blim an imumet at usinves. laink
 no bet. them trarimal. and the caling guald ope. lows rigiasty forn quralkarnic. tripce offus and the
 too mighas