# Uppercase Assignment

In [11]:
# TEAM MEMBERS:
# Antonio Krizmanic - 2b193238-8e3c-11ec-986f-f39926f24a9c
# Janek Putz - e31a3cae-8e6c-11ec-986f-f39926f24a9c

In [1]:
import argparse
import datetime
import os
import re
os.environ.setdefault("TF_CPP_MIN_LOG_LEVEL", "2") # Report only TF errors by default

import numpy as np
import tensorflow as tf

from uppercase_data import UppercaseData

In [2]:
# Hyperparameters of the experiment. When the notebook runs interactively the
# defaults below are used; when exported as a script they can be overridden
# on the command line.
# TODO: Set reasonable values for the hyperparameters, notably
# for `alphabet_size` and `window` and others.
def _parse_bool(value):
    """Parse a command-line boolean such as "true"/"false", "1"/"0", "yes"/"no".

    `type=bool` would treat every non-empty string (including "False") as True,
    so an explicit parser is needed.

    Raises:
        argparse.ArgumentTypeError: if the text is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    normalized = str(value).strip().lower()
    if normalized in ("true", "t", "yes", "y", "1"):
        return True
    if normalized in ("false", "f", "no", "n", "0"):
        return False
    raise argparse.ArgumentTypeError("Expected a boolean value, got {!r}.".format(value))


parser = argparse.ArgumentParser()
parser.add_argument("--alphabet_size", default=100, type=int, help="If nonzero, limit alphabet to this many most frequent chars.")
parser.add_argument("--batch_size", default=64, type=int, help="Batch size.")
parser.add_argument("--epochs", default=1, type=int, help="Number of epochs.")
parser.add_argument("--seed", default=42, type=int, help="Random seed.")
parser.add_argument("--threads", default=1, type=int, help="Maximum number of threads to use.")
parser.add_argument("--window", default=3, type=int, help="Window size to use.")
# additional
# Layer count and layer width are used as `range()` bound and `Dense` units,
# so they must be integers.
parser.add_argument("--layers", default=2, type=int, help="Number of hidden layers")
parser.add_argument("--h_size", default=400, type=int, help="Size of hidden layers")
parser.add_argument("--dropout", default=0.5, type=float, help="Dropout rate.")
parser.add_argument("--save_model", default=True, type=_parse_bool, help="Flag if model should be saved.")
parser.add_argument("--model", default="uppercase_model.h5", type=str, help="Output model path.")

# In a notebook there is no `__file__`, so an empty argument list is parsed
# (defaults only); as a script, `None` makes argparse read `sys.argv`.
args = parser.parse_args([] if "__file__" not in globals() else None)
print(args)

# Fix random seeds and threads
tf.keras.utils.set_random_seed(args.seed)
tf.config.threading.set_inter_op_parallelism_threads(args.threads)
tf.config.threading.set_intra_op_parallelism_threads(args.threads)

Namespace(alphabet_size=100, batch_size=64, epochs=1, seed=42, threads=1, window=3, layers=2, h_size=400, dropout=0.5, save_model=True, model='uppercase_model.h5')


In [3]:
# Load data
# Build the dataset object from the configured window size and alphabet limit.
# NOTE(review): what `UppercaseData` does internally (download, caching, how the
# alphabet is chosen) is not visible in this notebook — see `uppercase_data.py`.
uppercase_data = UppercaseData(args.window, args.alphabet_size)

In [4]:
# Show the size of the training alphabet and then the alphabet itself,
# each on its own line.
print(len(uppercase_data.train.alphabet), uppercase_data.train.alphabet, sep="\n")

100
['<pad>', '<unk>', ' ', 'o', 'e', 'a', 'n', 's', 't', 'i', 'l', 'r', 'v', 'k', 'd', 'm', 'u', 'p', 'c', 'í', 'h', 'z', 'á', 'j', 'b', 'y', '.', 'ě', 'é', ',', '\n', 'ř', 'ý', 'č', 'ž', 'š', '1', 'g', 'f', 'ů', '0', '9', '(', ')', '2', '8', '5', '3', '4', '6', 'ú', 'w', '7', '-', 'x', 'ň', '–', 'ó', '„', '“', 'ť', ':', '"', '/', 'ď', ';', 'q', "'", '%', 'ö', '*', 'ü', 'ä', '°', '+', 'а', 'с', 'о', 'ł', '&', '=', 'н', 'и', '!', '²', 'е', '’', 'ë', 'ć', 'р', '?', 'к', 'т', '…', 'è', 'в', 'ľ', '´', 'ç', '†']


In [None]:
# Preview the first 20 training examples: the row number, the raw window of
# character indices, the same window decoded through the training alphabet,
# and the label stored for that window.
train_data = uppercase_data.train.data
for row, (window, label) in enumerate(zip(train_data['windows'], train_data['labels'])):
    # Stop as soon as the preview is complete instead of walking the whole
    # training set just to skip every remaining row.
    if row >= 20:
        break
    print(row, window, [uppercase_data.train.alphabet[char_index] for char_index in window], label)

## Train Model

In [5]:
# Create logdir name
# The name is built from the script name, a timestamp and the abbreviated
# hyperparameters. The `logdir` entry itself is skipped so that re-running
# this cell does not embed the previous log directory path (including its
# path separators) into the new name.
args.logdir = os.path.join("logs-u", "{}-{}-{}".format(
    os.path.basename(globals().get("__file__", "notebook")),
    datetime.datetime.now().strftime("%Y-%m-%d_%H%M%S"),
    ",".join(
        "{}={}".format(re.sub("(.)[^_]*_?", r"\1", key), value)
        for key, value in sorted(vars(args).items())
        if key != "logdir"
    )
))


# TODO: Implement a suitable model, optionally including regularization, select
# good hyperparameters and train the model.
#
# The inputs are _windows_ of fixed size (`args.window` characters on left,
# the character in question, and `args.window` characters on right), where
# each character is represented by a `tf.int32` index. To suitably represent
# the characters, you can:
# - Convert the character indices into _one-hot encoding_. There is no
#   explicit Keras layer, but you can
#   - use a Lambda layer which can encompass any function:
#       tf.keras.Sequential([
#         tf.keras.layers.Input(shape=[2 * args.window + 1], dtype=tf.int32),
#         tf.keras.layers.Lambda(lambda x: tf.one_hot(x, len(uppercase_data.train.alphabet))),
#   - or use Functional API and then any TF function can be used
#     as a Keras layer:
#       inputs = tf.keras.layers.Input(shape=[2 * args.window + 1], dtype=tf.int32)
#       encoded = tf.one_hot(inputs, len(uppercase_data.train.alphabet))
#   You can then flatten the one-hot encoded windows and follow with a dense layer.
# - Alternatively, you can use `tf.keras.layers.Embedding` (which is an efficient
#   implementation of one-hot encoding followed by a Dense layer) and flatten afterwards.

# `range()` and `Dense(units=...)` need integers; coerce here so that values
# parsed as floats (e.g. "2.0") still work.
hidden_layer_count = int(args.layers)
hidden_layer_size = int(args.h_size)

# Architecture: one-hot encoded window -> flatten -> dropout ->
# `hidden_layer_count` x (dense ReLU + dropout) -> single sigmoid unit.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Input(shape=[2 * args.window + 1], dtype=tf.int32))
model.add(tf.keras.layers.Lambda(lambda x: tf.one_hot(x, len(uppercase_data.train.alphabet))))
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dropout(rate=args.dropout))
for _ in range(hidden_layer_count):
    model.add(tf.keras.layers.Dense(hidden_layer_size, activation=tf.nn.relu))
    model.add(tf.keras.layers.Dropout(rate=args.dropout))
# variant 1
# model.add(tf.keras.layers.Dense(2, activation=tf.nn.softmax))
# variant 2: binary classification
model.add(tf.keras.layers.Dense(1, activation=tf.nn.sigmoid))

# TODO: try with one large dense layer (400)


model.compile(
    optimizer=tf.optimizers.Adam(),
    # loss=tf.losses.SparseCategoricalCrossentropy(),
    loss=tf.losses.BinaryCrossentropy(),
    # metrics=[tf.metrics.SparseCategoricalAccuracy("accuracy")],
    metrics=[tf.keras.metrics.BinaryAccuracy()]
)

model.summary()

# To inspect the logs, run from the notebook directory: tensorboard --logdir logs-u
tb_callback = tf.keras.callbacks.TensorBoard(args.logdir, histogram_freq=1, update_freq=100, profile_batch=0)
tb_callback._close_writers = lambda: None # A hack allowing to keep the writers open.

history = model.fit(
    uppercase_data.train.data["windows"], uppercase_data.train.data["labels"],
    batch_size=args.batch_size,
    epochs=args.epochs,
    validation_data=(uppercase_data.dev.data["windows"], uppercase_data.dev.data["labels"]),
    callbacks=[tb_callback]
)

if args.save_model:
    # Make sure the target directory exists before writing the model file.
    os.makedirs(args.logdir, exist_ok=True)
    model.save(os.path.join(args.logdir, args.model), include_optimizer=False)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lambda (Lambda)             (None, 7, 100)            0         
                                                                 
 flatten (Flatten)           (None, 700)               0         
                                                                 
 dropout (Dropout)           (None, 700)               0         
                                                                 
 dense (Dense)               (None, 400)               280400    
                                                                 
 dropout_1 (Dropout)         (None, 400)               0         
                                                                 
 dense_1 (Dense)             (None, 400)               160400    
                                                                 
 dropout_2 (Dropout)         (None, 400)               0

In [6]:
# Experiment note: one dense layer of 64 units (presumably a single hidden layer — confirm).
# NOTE(review): this prints whatever `history` currently holds, so the output
# matches the note only if the training cell was last run with that setup.
print(history.history)

{'loss': [0.1172260046005249], 'binary_accuracy': [0.9646154642105103], 'val_loss': [0.09283903986215591], 'val_binary_accuracy': [0.965552568435669]}


In [10]:
# Experiment note: "6 x 64" dense layers (presumably 6 hidden layers of 64 units each — confirm).
# NOTE(review): this prints whatever `history` currently holds, so the output
# matches the note only if the training cell was last run with that setup.
print(history.history)

{'loss': [0.1370404213666916], 'binary_accuracy': [0.9584030508995056], 'val_loss': [0.15306855738162994], 'val_binary_accuracy': [0.9572327733039856]}


In [13]:
# Experiment note: "3 x 64" dense layers (presumably 3 hidden layers of 64 units each — confirm).
# The original note adds "Same with 5 epochs" (presumably a similar result after 5 epochs — confirm).
# NOTE(review): this prints whatever `history` currently holds, so the output
# matches the note only if the training cell was last run with that setup.
print(history.history)

{'loss': [0.12071765959262848], 'binary_accuracy': [0.9639292359352112], 'val_loss': [0.0895470455288887], 'val_binary_accuracy': [0.965847373008728]}


In [16]:
# Experiment note: "3 x 128" dense layers (presumably 3 hidden layers of 128 units each — confirm).
# NOTE(review): this prints whatever `history` currently holds, so the output
# matches the note only if the training cell was last run with that setup.
print(history.history)

{'loss': [0.11876511573791504], 'binary_accuracy': [0.9643425345420837], 'val_loss': [0.09459041804075241], 'val_binary_accuracy': [0.9656572937965393]}


## Apply Model to Test Set

In [6]:
# Score the trained model on the test windows and print the returned values
# (the recorded output shows a two-element list; presumably loss followed by
# the compiled accuracy metric — confirm).
result = model.evaluate(
    x=uppercase_data.test.data['windows'],
    y=uppercase_data.test.data['labels'],
)
print(result)

[0.13965097069740295, 0.9912923574447632]


In [7]:
# Raw model outputs for every test window; these are turned into upper-/lower-case
# decisions in the cells below.
# NOTE(review): with the single sigmoid output unit defined in the training cell
# this is presumably an array of shape (num_windows, 1) holding probabilities — confirm.
window_preds_raw = model.predict(uppercase_data.test.data['windows'])

### New approach: capitalize the original test text in place

In [10]:
# Write the test text with predicted capitalization to `uppercase_test.txt`
# inside `args.logdir`. The text length is printed before and after so that any
# length change caused by upper-casing is visible.
os.makedirs(args.logdir, exist_ok=True)
with open(os.path.join(args.logdir, "uppercase_test.txt"), "w", encoding="utf-8") as predictions_file:
    text = uppercase_data.test.text
    print(len(text))

    # Turn the raw model outputs into one boolean decision per character.
    # Comparing a probability with `== 1` only matches fully saturated outputs,
    # so threshold at 0.5 instead.
    predictions = np.asarray(window_preds_raw)
    if predictions.ndim == 2 and predictions.shape[1] == 2:
        # Two-output (softmax) variant: column 1 is the "uppercase" class.
        uppercase_mask = predictions[:, 1] > 0.5
    else:
        # Single-output (sigmoid) variant.
        uppercase_mask = np.reshape(predictions, -1) > 0.5

    # Edit a list of characters and join once: this avoids rebuilding the whole
    # string for every capital letter and keeps positions aligned even if
    # `.upper()` of some character yields more than one character.
    characters = list(text)
    for position in np.flatnonzero(uppercase_mask):
        characters[position] = characters[position].upper()
    text = "".join(characters)

    print(len(text))
    predictions_file.write(text)

363932
363932


### Old approach: rebuild the text from the test windows

In [32]:
# Generate the capitalized test set by rebuilding the text from the test
# windows and write it to predictions_file (which is `uppercase_test.txt`
# in the `args.logdir` directory).
# NOTE(review): the rebuilt text starts with the complete first window and ends
# with the last character of the last window, so padding positions become
# spaces in the output, and characters outside the alphabet are written as
# spaces as well; the result is therefore not identical in length/content to
# `uppercase_data.test.text`.
os.makedirs(args.logdir, exist_ok=True)
with open(os.path.join(args.logdir, "uppercase_test.txt"), "w", encoding="utf-8") as predictions_file:
    raw_predictions = np.asarray(window_preds_raw)

    # check if model predicts binary or two class and transform preds to 0 and 1
    outputs_per_window = raw_predictions[0].shape[0]
    if outputs_per_window == 1:
        window_preds = np.where(np.reshape(raw_predictions, -1) > 0.5, 1, 0)
    elif outputs_per_window == 2:
        # Use the probability of class 1 for every window; indexing the
        # flattened array with `[1]` would only look at a single scalar.
        window_preds = np.where(raw_predictions[:, 1] > 0.5, 1, 0)
    else:
        raise ValueError("Expected 1 or 2 model outputs per window, got {}.".format(outputs_per_window))
    print(np.bincount(window_preds))

    # process windows
    alphabet = uppercase_data.train.alphabet
    special_tokens = ('<pad>', '<unk>')
    final_letters = []
    uppercase_indexes = set()  # set gives constant-time membership tests below
    for i, (window, label) in enumerate(zip(uppercase_data.test.data['windows'], window_preds)):
        # extend final list of letters with last letter from new window
        if i == 0:
            final_letters = [alphabet[char_index] for char_index in window]
        else:
            final_letters.append(alphabet[window[-1]])
        # if label is 1, remember the position of the middle letter of the window
        if label == 1:
            uppercase_indexes.add(i + args.window)
    # replace uppercase letters; the special tokens are left untouched so that
    # they are still recognised by the counts and replacements below
    final_letters = [
        letter.upper() if position in uppercase_indexes and letter not in special_tokens else letter
        for position, letter in enumerate(final_letters)
    ]

    # concat text
    text = ''.join(final_letters)
    print(text.count("<pad>"))
    print(text.count("<unk>"))
    text = text.replace('<pad>', ' ')
    text = text.replace('<unk>', ' ')
    predictions_file.write(text)
    
            

[360608   3324]
6
60
