# Importing Libs and data

In [55]:
import os
import string
import re
import math
import random
import shutil
import tqdm
import joblib

import tensorflow as tf
import numpy as np
import pandas as pd
from sklearnex import patch_sklearn
patch_sklearn()

import sklearn
from sklearn.model_selection import RandomizedSearchCV, cross_val_score

import matplotlib.pyplot as plt 
import seaborn as sns
sns.set()

import visualkeras

import keras
from keras import Sequential
from keras.layers import Dropout, BatchNormalization, Embedding, TextVectorization
from keras.layers import Dense, GRU, SimpleRNN, LSTM
from keras.utils import text_dataset_from_directory

from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from keras.optimizers import Adam
from keras.losses import SparseCategoricalCrossentropy, CategoricalCrossentropy
# from keras.wrappers.scikit_learn import KerasClassifier

print(tf.config.list_physical_devices())

seed = 42

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [2]:
fp = './train.csv'
data = pd.read_csv(fp)
data.head()
for i in range(50):
    print(data.Quotes.iloc[i])
    print(list(data.Quotes.iloc[i]))

Embrace the beauty of every sunrise; it's a fresh chance to paint your world with joy.
['E', 'm', 'b', 'r', 'a', 'c', 'e', ' ', 't', 'h', 'e', ' ', 'b', 'e', 'a', 'u', 't', 'y', ' ', 'o', 'f', ' ', 'e', 'v', 'e', 'r', 'y', ' ', 's', 'u', 'n', 'r', 'i', 's', 'e', ';', ' ', 'i', 't', "'", 's', ' ', 'a', ' ', 'f', 'r', 'e', 's', 'h', ' ', 'c', 'h', 'a', 'n', 'c', 'e', ' ', 't', 'o', ' ', 'p', 'a', 'i', 'n', 't', ' ', 'y', 'o', 'u', 'r', ' ', 'w', 'o', 'r', 'l', 'd', ' ', 'w', 'i', 't', 'h', ' ', 'j', 'o', 'y', '.']
Embrace challenges; they are the stepping stones to your greatest victories.
['E', 'm', 'b', 'r', 'a', 'c', 'e', ' ', 'c', 'h', 'a', 'l', 'l', 'e', 'n', 'g', 'e', 's', ';', ' ', 't', 'h', 'e', 'y', ' ', 'a', 'r', 'e', ' ', 't', 'h', 'e', ' ', 's', 't', 'e', 'p', 'p', 'i', 'n', 'g', ' ', 's', 't', 'o', 'n', 'e', 's', ' ', 't', 'o', ' ', 'y', 'o', 'u', 'r', ' ', 'g', 'r', 'e', 'a', 't', 'e', 's', 't', ' ', 'v', 'i', 'c', 't', 'o', 'r', 'i', 'e', 's', '.']
Embrace the rhythm of li

# Preprocessing

## Vocab

For this case we will use the full stop "." as the EOS

In [3]:
vocab = set(' '.join(data.Quotes.to_numpy()).lower().split(' '))
vocab

{'fruits',
 'joyous',
 'guidance',
 'comeback.',
 'have',
 'another.',
 'tapestry,',
 'skyline',
 'sun.',
 'freedom.',
 'sense',
 'miracle',
 'too.',
 'flight.',
 'learn',
 'brightness.',
 'farewells.',
 'nurtures',
 'small,',
 'setup',
 'peak',
 'traditions',
 'extraordinary.',
 'glistens',
 'inhale',
 'connections.',
 'illuminate',
 'spreads',
 'prelude',
 'night',
 'good.',
 'miracles',
 'transcends',
 'contentment.',
 'reside',
 'shine',
 'soothes',
 'connections',
 'midst',
 'emotions,',
 'our',
 'welcome',
 'tended',
 'tapestry.',
 'balm',
 'mirrors',
 'transform.',
 'kindness',
 'witnesses',
 'souls.',
 'nurseries',
 'shores',
 'way,',
 'reminder',
 'others',
 'part',
 'is.',
 "tekong's",
 'find',
 'worthy,',
 "city's",
 'barriers.',
 'steps',
 'most',
 'keeps',
 'gift.',
 'how',
 'underwater',
 'bitterness.',
 'influence',
 'sky',
 'merlion',
 'bird',
 'longer',
 'existence.',
 "islands'",
 'hope',
 'relationships.',
 'gentle',
 'breezes',
 'monsoons',
 'embraces',
 'reside.',


In [4]:
def clean_sentence(arr):
    arr = map(lambda x: x[:-1] if x[-1] in [',','.',';'] else x, arr)
    arr = map(lambda x: x[1:] if x[0] in ['"'] else x, arr)
    return list(arr)

clean_vocab = pd.Series(np.unique(clean_sentence(vocab)))
clean_vocab

0                a
1          ability
2        abundance
3       acceptance
4          achieve
           ...    
1200           you
1201        you'll
1202          your
1203      yourself
1204         zoo's
Length: 1205, dtype: object

## Dataset

In [5]:
def write_txt(fp, data):
    f = open(fp, 'w')
    f.write(data)
    f.close()

In [7]:
window = 3
quotes = data.Quotes.to_numpy()
if os.path.exists('./dataset'):
    shutil.rmtree('./dataset')

os.mkdir('./dataset')

if not os.path.exists('./dataset/train'):
    os.mkdir('./dataset/train')
if not os.path.exists('./dataset/next'):
    os.mkdir('./dataset/next')

for i, sentence in tqdm.tqdm(enumerate(quotes), desc='Creating Text Dataset', total=len(quotes)):
    arr = sentence.split(' ')
    window = len(arr)

    while window != 0:
        for ii in range(len(arr[:-window])):
            write_txt(f'./dataset/train/text-{i}-{window}-{ii}.txt', ' '.join(arr[ii:ii+window]))
            write_txt(f'./dataset/next/text-{i}-{window}-{ii}.txt', ' '.join(arr[ii+window:]))

            # print(arr[ii:ii+window], arr[ii+window:])
        window -= 1

   



Creating Text Dataset: 100%|██████████| 1000/1000 [03:12<00:00,  5.20it/s]


In [47]:
train_ds, val_ds = text_dataset_from_directory('./dataset/train', labels=None, seed=seed, validation_split=0.1, subset='both', batch_size=None)
train_next_ds, val_next_ds = text_dataset_from_directory('./dataset/next', labels=None, seed=seed, validation_split=0.1, subset='both', batch_size=None)

Found 68669 files belonging to 1 classes.
Using 61803 files for training.
Using 6866 files for validation.
Found 68669 files belonging to 1 classes.
Using 61803 files for training.
Using 6866 files for validation.


In [49]:
for x in train_ds:
    print(x)
    break

tf.Tensor(b'compassion, letting kindness', shape=(), dtype=string)


In [50]:
for train, train_next in zip(val_ds, val_next_ds):
    print(train, train_next)
    break

tf.Tensor(b'for it is the heartbeat of', shape=(), dtype=string) tf.Tensor(b'a life well-lived.', shape=(), dtype=string)


# Model Building

In [80]:
max_features = len(clean_vocab)+2 # Max vocab size
max_len = 20 # Seq length to pad to 
def custom_standardize(data):
    return tf.strings.regex_replace(tf.strings.lower(data), '[%s]' % re.escape(string.punctuation), '')

vectorize_layer = TextVectorization(
    input_shape=(1,),
    standardize=custom_standardize,
    max_tokens = max_features,
    output_mode= 'int',
    output_sequence_length= max_len,
    vocabulary=clean_vocab.to_numpy()
)

vectorize_layer.get_vocabulary()

['',
 '[UNK]',
 'a',
 'ability',
 'abundance',
 'acceptance',
 'achieve',
 'achievements',
 'achieving',
 'across',
 'act',
 'action',
 'actions',
 'acts',
 'adaptability',
 'adaptation',
 'adventure',
 'adventures',
 'adversity',
 'affection',
 'affirmations',
 'against',
 'agent',
 'ages',
 'ahead',
 'air',
 "airport's",
 'alchemy',
 'alive',
 'all',
 'alleys',
 'allows',
 'alone',
 'alter',
 'amazing',
 'ambition',
 'an',
 'anchor',
 'ancient',
 'and',
 'another',
 'anthem',
 'any',
 'anything',
 'appreciation',
 'archipelagos',
 'architect',
 'architecture',
 'are',
 'arid',
 'armor',
 'arms',
 'around',
 'art',
 'artistry',
 'as',
 'asia',
 'aspirations',
 'assurance',
 'at',
 'atmosphere',
 'atolls',
 'attract',
 'attractions',
 'authenticity',
 'away',
 'bad',
 'balance',
 'balm',
 'barrier',
 'barriers',
 'batok',
 'bay',
 'be',
 'beaches',
 'beacon',
 'beam',
 'beat',
 'beats',
 'beautiful',
 'beauty',
 'become',
 'becomes',
 'becoming',
 'bedrock',
 'bees',
 'beginnings',
 'b

In [13]:
# Hyperparameters
lr = 1e-3
batch_size = 32
embedding_dim = 128

In [86]:
#Simple LSTM Model
model = Sequential()
# model.add(vectorize_layer)
model.add(Embedding(input_dim=max_len, output_dim=embedding_dim))
model.add(LSTM(128))
model.add(Dense(20))

optimizer = Adam(learning_rate=lr)
# loss = SparseCategoricalCrossentropy(from_logits=True)
loss = CategoricalCrossentropy(from_logits=True)

model.compile(optimizer=optimizer, loss=loss)

In [87]:

model.summary()

Model: "sequential_14"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_13 (Embedding)    (None, None, 128)         2560      
                                                                 
 lstm_9 (LSTM)               (None, 128)               131584    
                                                                 
 dense_13 (Dense)            (None, 20)                2580      
                                                                 
Total params: 136,724
Trainable params: 136,724
Non-trainable params: 0
_________________________________________________________________


In [75]:
final_train_df = zip(train_ds, train_next_ds)

<zip at 0x25f233c5780>

In [85]:
for x,y in final_train_df:
    print(x,y)
    input_sentence = vectorize_layer(x)
    print(input_sentence)
    lstm = LSTM(128)
    output = lstm(input_sentence)
    print(output)
    break

tf.Tensor(b'Embrace kindness, for it has the power', shape=(), dtype=string) tf.Tensor(b'a unique symphony.', shape=(), dtype=string)
tf.Tensor(
[ 309  564  384  542  473 1030  765    0    0    0    0    0    0    0
    0    0    0    0    0    0], shape=(20,), dtype=int64)


ValueError: Input 0 of layer "lstm_8" is incompatible with the layer: expected ndim=3, found ndim=1. Full shape received: (20,)

In [88]:
model.fit(zip(train_ds, train_next_ds))

IndexError: tuple index out of range