# Generative Models for Text

DSCI 552 | Machine Learning for Data Science

Homework 7

Matheus Schmitz

USC ID: 5039286453

In [1]:
# Need tensorflow_addons to use AdamW optimizer
!pip install tensorflow_addons



In [7]:
# OS
import os
import sys

# Py Data Stack
import numpy as np
import pandas as pd

# Scikit-Learn
from sklearn.preprocessing import OneHotEncoder

# Tensor Flow & Keras
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Activation, Dropout
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, CSVLogger, EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow_addons.optimizers import AdamW

# String Manipulation
import string

# Progress Bar
from tqdm import tqdm

# Disable warnings
import warnings
warnings.filterwarnings("ignore")

In [8]:
# Making sure Tensor Flow is properly working with GPU
print('Available Devices:')
for device in tf.config.experimental.list_physical_devices():
    print(device)
print()
print(f'TensorFlow using GPU: {tf.test.is_gpu_available()}')
print(f'TensorFlow using CUDA: {tf.test.is_built_with_cuda()}')
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))
if tf.test.gpu_device_name():
    print('Default GPU Device: {}'.format(tf.test.gpu_device_name()))
else:
    print("Oh boy, there's no GPU, so prepare yourself for a long wait :(")
print()
try:
    !nvcc --version
except:
    print('ooops, watch out, something went wrong!')
print()
try:
    !nvidia-smi
except:
    print('ooops, watch out, something went wrong!')

Available Devices:
PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')
PhysicalDevice(name='/physical_device:XLA_CPU:0', device_type='XLA_CPU')
PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')
PhysicalDevice(name='/physical_device:XLA_GPU:0', device_type='XLA_GPU')

Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.
TensorFlow using GPU: True
TensorFlow using CUDA: True
Num GPUs Available:  1
Default GPU Device: /device:GPU:0

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2019 NVIDIA Corporation
Built on Sun_Jul_28_19:12:52_Pacific_Daylight_Time_2019
Cuda compilation tools, release 10.1, V10.1.243

Sun Nov 08 14:00:57 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 446.14       Driver Version: 446.14       CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp

## (a) Project Goal

(a) print

## (b) Source Data

(b) print

In [3]:
BOOKS_PATH = '../data/books/'
BOOK_TXTS = os.listdir(BOOKS_PATH)
BOOK_TXTS

['AIIMAT.txt',
 'MLOE.txt',
 'OKEWFSMP.txt',
 'TAM.txt',
 'TAMatter.txt',
 'THWP.txt',
 'TPP.txt']

## (c) LSTM Mimicking Russell's Style and Thoughts


(c) print

### (i) Create Corpus

(i) print

In [4]:
# Path to the corpus 
CORPUS_PATH = 'Corpus.txt'

# Create a single corpus with all files
with open(CORPUS_PATH, 'w') as concatenated_corpus:
    for book in BOOK_TXTS:
        with open(BOOKS_PATH+book, 'r', encoding='ascii', errors='ignore') as txt_file:
            for line in txt_file:
                concatenated_corpus.write(line)
                
# Store the corpus in a file
CORPUS = open(CORPUS_PATH, 'r').read()
print(f'There are {len(CORPUS)} characters in the corpus')

There are 5095252 characters in the corpus


### (ii) Character-Level Representation

ii print

note 2 print

In [9]:
# Clean the corpus
CORPUS_CLEAN = np.copy(CORPUS).tolist()
CORPUS_CLEAN = CORPUS_CLEAN.lower()
CORPUS_CLEAN = CORPUS_CLEAN.translate(str.maketrans('', '', string.punctuation))

# Clean some memory - it'll be needed
del CORPUS

# Get all unique characters for mapping to extended ENCODING
UNIQUE_CHARS = set(CORPUS_CLEAN)

# Dictionaries to store the character-ENCODING mappings
CHAR_to_ENCODING = {}
ENCODING_to_CHAR = {}
CHAR_to_ENCODING_normalized = {}
ENCODING_to_CHAR_normalized = {}

# Create ENCODING encodings
VECTOR_LENGTH = len(UNIQUE_CHARS)-1
for index, char in enumerate(sorted(UNIQUE_CHARS)):
    CHAR_to_ENCODING[char] = index
    ENCODING_to_CHAR[index] = char
    CHAR_to_ENCODING_normalized[char] = index/VECTOR_LENGTH
    ENCODING_to_CHAR_normalized[index/VECTOR_LENGTH] = char

### (iii) Define Window Size

iii print

In [10]:
WINDOW_SIZE = 100

### (iv) Generating Features

iv print

In [11]:
# Lists for the features and labels
X = []
Y = []

# Loop through the CORPUS_CLEAN generating features
for i in tqdm(range(len(CORPUS_CLEAN)-WINDOW_SIZE)):
    
    # Extract the X and Y characters
    x_chars = CORPUS_CLEAN[i: i+WINDOW_SIZE-1]
    y_char = CORPUS_CLEAN[i+WINDOW_SIZE-1]
    
    # Encode X to the normalized [0,1] range and Y to the unnormalized range (so that Y can later be one-hot encoded)
    x_encodings = [CHAR_to_ENCODING_normalized[char] for char in x_chars]
    y_encoding = [CHAR_to_ENCODING[char] for char in y_char]
    
    # Add the feature and label to X and Y
    X.append(x_encodings)
    Y.append(y_encoding)

100%|█████████████████████████████| 4942255/4942255 [02:13<00:00, 36911.60it/s]


### (v) One-Hot Encoding

v print

In [12]:
# Onehot encode Y
Y_onehot = tf.keras.utils.to_categorical(Y)
Y_onehot.shape

(4942255, 38)

In [13]:
# Need to reshape X so it matches Y's shape
X = np.reshape(X, (Y_onehot.shape[0], WINDOW_SIZE-1, 1))
X.shape


KeyboardInterrupt



### (vi) / (vii) Neural Network

vii
note 3

In [None]:
NEURONS = 52

# Instantiate Keras Sequential Model
model = keras.models.Sequential()

# LSTM Layer
model.add(LSTM(units=NEURONS, input_shape=(X.shape[1], X.shape[2]), return_sequences=False))

# Softmax layer
model.add(Dense(Y_onehot.shape[1], activation='softmax'))

# Objective Function
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy', 'categorical_crossentropy'])

# Check model
model.summary()

### (viii) / (ix) / (x) Train Model

viii

ix 

x

note 4

In [None]:
EPOCHS = 10
BATCH_SIZE = 64

# Define directory for model checkpoints
BACKUP_DIR = './checkpoints'
if not os.path.exists(BACKUP_DIR):
    os.mkdir(BACKUP_DIR)
    
# Define file to store checkpoint
BACKUP_FILE = os.path.join(BACKUP_DIR, 'model_1.hdf5')

# Callbacks
checkpoint = ModelCheckpoint(BACKUP_FILE, 
                             monitor='loss',
                             save_best_only=True,
                             save_weights_only=True,
                             verbose=1)
plateauLRreduce = ReduceLROnPlateau(factor = 0.1,
                                    patience = 5,
                                    monitor='loss',
                                    min_lr = 0.0000001,
                                    verbose=1)
stopearly = EarlyStopping(monitor='loss',
                          patience=15,
                          verbose=1)
logCSV = CSVLogger(filename='model_log',
                   separator=',', 
                   append=False)

model_callbacks = [checkpoint, plateauLRreduce, stopearly, logCSV]

# Train model and save history
model_history = model.fit(X,
                          Y_onehot,
                          epochs=EPOCHS,
                          batch_size=BATCH_SIZE,
                          callbacks=model_callbacks)

### (xi) Generate 1000 Characters

xi print

In [None]:
# Load best model weights
model.load_weights('./checkpoints/model_1.hdf5')

# Objective Function
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy', 'categorical_crossentropy'])

# Check model
model.summary()

In [None]:
# Base text
base_text = 'There are those who take mental phenomena naively, just as they would physical phenomena. This school of psychologists tends not to emphasize the object.'
print(f'Lenght of base test before pre-processing: {len(base_text)}')

### PRE-PROCESSING ###

# Clean the corpus
base_text = np.copy(base_text).tolist()
base_text = base_text.lower()
base_text = base_text.translate(str.maketrans('', '', string.punctuation))

# Encode
x_chars = base_text[0: len(base_text)]

# Encode X to the normalized [0,1] range and Y to the unnormalized range (so that Y can later be one-hot encoded)
x_encodings = [CHAR_to_ENCODING_normalized[char] for char in x_chars]

# Add the feature to X_test
X_text = x_encodings

# Reshape
# Need to reshape X so it matches Y's shape
X_text = np.reshape(x_encodings, (1, len(x_encodings), 1))

print(f'Lenght of base test after pre-processing: {X_text.shape[1]}')

In [None]:
# Generating text
synthetic_text = X_text.copy()
synthetic_text = synthetic_text.flatten()

# Generate 1000 words
for i in tqdm(range(1000)):
    
    # Get synthetic_text to the needed input shape
    input_text = np.reshape(synthetic_text[-99:], (1, WINDOW_SIZE-1, 1))
    
    # Predict a character
    pred = model.predict(input_text)
    
    # Select the index of the most likely character
    pred_idx = np.argmax(pred)
    
    # Use that index to retrieve the character
    pred_normalized = pred_idx/VECTOR_LENGTH
    
    # Add the prediction to the synthetic text
    synthetic_text = np.append(synthetic_text, pred_normalized)

# Once all characters have been predited, transform back to letters
predicted_text = [ENCODING_to_CHAR_normalized[encoding] for encoding in synthetic_text]

# Then concatenate all letters to form the text
predicted_text = ''.join(predicted_text)

# Result
print(predicted_text)

### (xii) Deeper Neural Network

xii print

In [None]:
NEURONS = 256

# Instantiate Keras Sequential Model
model2 = keras.models.Sequential()

# LSTM Layer 1
model2.add(LSTM(units=NEURONS, input_shape=(X.shape[1], X.shape[2]), return_sequences=True))

# Dropout Layer
model2.add(Dropout(0.1))

# LSTM Layer 2
model2.add(LSTM(units=NEURONS, return_sequences=True))

# LSTM Layer 3
model2.add(LSTM(units=NEURONS, return_sequences=False))

# Dropout Layer
model2.add(Dropout(0.25))

# Softmax layer
model2.add(Dense(Y_onehot.shape[1], activation='softmax'))

# Objective Function
model2.compile(loss='categorical_crossentropy', optimizer='nadam', metrics=['accuracy', 'categorical_crossentropy'])

# Check model
model2.summary()

In [None]:
EPOCHS = 150
BATCH_SIZE = 256

# Define directory for model checkpoints
BACKUP_DIR = './checkpoints'
if not os.path.exists(BACKUP_DIR):
    os.mkdir(BACKUP_DIR)
    
# Define file to store checkpoint
BACKUP_FILE = os.path.join(BACKUP_DIR, 'model_2.hdf5')

# Callbacks
checkpoint = ModelCheckpoint(BACKUP_FILE, 
                             monitor='val_loss',
                             save_best_only=True,
                             save_weights_only=True,
                             verbose=1)
plateauLRreduce = ReduceLROnPlateau(factor = 0.1,
                                    patience = 5,
                                    monitor='loss',
                                    min_lr = 0.0000001,
                                    verbose=1)
stopearly = EarlyStopping(monitor='val_loss',
                          patience=15,
                          verbose=1)
logCSV = CSVLogger(filename='model2_log',
                   separator=',', 
                   append=False)

model_callbacks = [checkpoint, plateauLRreduce, stopearly, logCSV]

# Train model and save history
#model_history = model2.fit(X,
                           Y_onehot,
                           epochs=EPOCHS,
                           batch_size=BATCH_SIZE,
                           callbacks=model_callbacks,
                           validation_split=0.2,
                           shuffle=True)