# Uppercase Assignment

## Contributors:

<b>Antonio Krizmanic</b> - 2b193238-8e3c-11ec-986f-f39926f24a9c <br>
<b>Janek Putz</b> - e31a3cae-8e6c-11ec-986f-f39926f24a9c

In [2]:
import argparse
import datetime
import os
import re
os.environ.setdefault("TF_CPP_MIN_LOG_LEVEL", "2") # Report only TF errors by default

import numpy as np
import tensorflow as tf

from uppercase_data import UppercaseData

In [3]:
# TODO: Set reasonable values for the hyperparameters, notably
# for `alphabet_size` and `window` and others.
def _str2bool(value):
    """Parse a command-line boolean such as `true`/`false`, `yes`/`no` or `1`/`0`.

    `type=bool` cannot be used for this purpose: argparse hands the raw string
    to the type callable and `bool("False")` is `True`, because every non-empty
    string is truthy.

    Raises:
        argparse.ArgumentTypeError: if `value` is not a recognised spelling.
    """
    if isinstance(value, bool):
        return value
    normalized = value.strip().lower()
    if normalized in ("1", "true", "t", "yes", "y", "on"):
        return True
    # The empty string is kept falsy, as it was under `type=bool`.
    if normalized in ("", "0", "false", "f", "no", "n", "off"):
        return False
    raise argparse.ArgumentTypeError("Expected a boolean value, got {!r}.".format(value))


parser = argparse.ArgumentParser()
parser.add_argument("--alphabet_size", default=1000, type=int, help="If nonzero, limit alphabet to this many most frequent chars.")
parser.add_argument("--batch_size", default=256, type=int, help="Batch size.")
parser.add_argument("--epochs", default=4, type=int, help="Number of epochs.")
parser.add_argument("--seed", default=42, type=int, help="Random seed.")
parser.add_argument("--threads", default=2, type=int, help="Maximum number of threads to use.")
parser.add_argument("--window", default=13, type=int, help="Window size to use.")
# additional
parser.add_argument("--dropout", default=0.38, type=float, help="Dropout rate.")
parser.add_argument("--save_model", default=True, type=_str2bool, help="Flag if model should be saved.")
parser.add_argument("--model", default="uppercase_model.h5", type=str, help="Output model path.")
# NOTE: the default is a single int, whereas values given on the command line
# arrive as a list of ints because of `nargs="*"`.
parser.add_argument("--hidden_layers", default=3, nargs="*", type=int, help="Hidden layer sizes.")
parser.add_argument("--l2", default=0.0, type=float, help="L2 regularization.")

# Some of these are not used in the models themselves, but after 32+ hours of my laptop fighting
# the models, I don't want to test my luck by removing some of them :(

# A notebook has no `__file__` global; in that case parse an empty argument
# list so that the defaults are used instead of the process command line.
running_as_script = "__file__" in globals()
args = parser.parse_args(None if running_as_script else [])

# Fix random seeds and threads
tf.keras.utils.set_random_seed(args.seed)
tf.config.threading.set_inter_op_parallelism_threads(args.threads)
tf.config.threading.set_intra_op_parallelism_threads(args.threads)

# Create logdir name: <script>-<timestamp>-<abbreviated arguments>
script_name = os.path.basename(globals().get("__file__", "notebook"))
timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H%M%S")
# Each argument name is shortened to the first letter of its
# underscore-separated parts (e.g. `batch_size` -> `bs`).
abbreviated_args = ",".join(
    "{}={}".format(re.sub("(.)[^_]*_?", r"\1", key), value)
    for key, value in sorted(vars(args).items())
)
# limit is necessary because of limited windows path length
args.logdir = os.path.join("logs-u", f"{script_name}-{timestamp}-{abbreviated_args}")[:99]

In [4]:
# Load the dataset, passing the `--window` and `--alphabet_size` argument values.
# The later cells read `.train`, `.dev` and `.test` from this object.
uppercase_data = UppercaseData(args.window, args.alphabet_size)

Downloading dataset uppercase_data.zip...


In [5]:
# L2 weight regulariser driven by the `--l2` argument (default 0.0) instead of
# a hard-coded constant.
# NOTE(review): neither `l2` nor `schedule` is referenced by the model cells
# visible in this file; they appear to be leftovers from earlier experiments.
l2 = tf.keras.regularizers.L2(l2=args.l2)

# Exponential decay from 0.01 with a decay rate of 0.001 / 0.01 = 0.1 per
# `decay_steps`, where `decay_steps` is the number of batches in the whole run.
schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    0.01,
    decay_steps=(uppercase_data.train.size / args.batch_size) * args.epochs,
    decay_rate=0.001 / 0.01,
    staircase=False,
)

## Train and save the model

In [6]:
# Feed-forward classifier over a window of `2 * window + 1` integer inputs:
# one-hot encode, flatten, three ReLU layers of 256 units and a single
# sigmoid output unit.
window_length = 2 * args.window + 1

model = tf.keras.Sequential()
model.add(tf.keras.layers.Input(shape=[window_length], dtype=tf.int32))
model.add(tf.keras.layers.Lambda(lambda x: tf.one_hot(x, len(uppercase_data.train.alphabet))))
model.add(tf.keras.layers.Flatten())
for _ in range(3):
    model.add(tf.keras.layers.Dense(256, activation=tf.nn.relu))
model.add(tf.keras.layers.Dense(1, activation=tf.nn.sigmoid))

# Linear (power=1) learning-rate decay from 0.01 to 0.001 over all training batches.
training_steps = (uppercase_data.train.size / args.batch_size) * args.epochs
learning_rate = tf.keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate=0.01,
    decay_steps=training_steps,
    end_learning_rate=0.001,
    power=1,
)

model.compile(
    optimizer=tf.optimizers.Adam(learning_rate=learning_rate),
    loss=tf.losses.BinaryCrossentropy(),
    metrics=[tf.keras.metrics.BinaryAccuracy()],
)
model.summary()

tb_callback = tf.keras.callbacks.TensorBoard(args.logdir, histogram_freq=1, update_freq=100, profile_batch=0)
# Replaces the callback's private writer-closing hook with a no-op.
tb_callback._close_writers = lambda: None

train_data = uppercase_data.train.data
dev_data = uppercase_data.dev.data
history = model.fit(
    x=train_data["windows"],
    y=train_data["labels"],
    batch_size=args.batch_size,
    epochs=args.epochs,
    validation_data=(dev_data["windows"], dev_data["labels"]),
    callbacks=[tb_callback],
    verbose=2,
)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lambda (Lambda)             (None, 27, 579)           0         
                                                                 
 flatten (Flatten)           (None, 15633)             0         
                                                                 
 dense (Dense)               (None, 256)               4002304   
                                                                 
 dense_1 (Dense)             (None, 256)               65792     
                                                                 
 dense_2 (Dense)             (None, 256)               65792     
                                                                 
 dense_3 (Dense)             (None, 1)                 257       
                                                                 
Total params: 4,134,145
Trainable params: 4,134,145
Non-

In [7]:
# Save the trained model to the `--model` path (default "uppercase_model.h5"),
# and only when `--save_model` is set, matching how the second model is saved.
if args.save_model:
    model.save(args.model, include_optimizer=True)

## Trying other models
The first model is the one used to produce the solution .txt file; this one just tries other options with regularization.

In [None]:
# Second experiment: wider hidden layers (512 units) with dropout after each one.
# The batch size and epoch count are defined once so that the learning-rate
# schedule and `fit` cannot drift apart. Previously the schedule assumed
# batch 1024 / 10 epochs while training ran with batch 516 / 8 epochs, so the
# decay finished well before training did.
model2_batch_size = 516
model2_epochs = 8

model2 = tf.keras.Sequential()
model2.add(tf.keras.layers.Input(shape=[2 * args.window + 1], dtype=tf.int32))
model2.add(tf.keras.layers.Lambda(lambda x: tf.one_hot(x, len(uppercase_data.train.alphabet))))
model2.add(tf.keras.layers.Flatten())
for _ in range(3):
    model2.add(tf.keras.layers.Dense(512, activation=tf.nn.relu))
    # Rate comes from `--dropout` (default 0.38) rather than a hard-coded literal.
    model2.add(tf.keras.layers.Dropout(rate=args.dropout))
model2.add(tf.keras.layers.Dense(1, activation=tf.nn.sigmoid))

# Linear (power=1) decay from 0.01 to 0.001 over exactly the batches `fit` will run.
model2_steps = (uppercase_data.train.size / model2_batch_size) * model2_epochs
model2.compile(
    optimizer=tf.optimizers.Adam(
        learning_rate=tf.keras.optimizers.schedules.PolynomialDecay(0.01, model2_steps, 0.001, power=1)
    ),
    loss=tf.losses.BinaryCrossentropy(),
    metrics=[tf.keras.metrics.BinaryAccuracy()],
)

model2.summary()

tb_callback = tf.keras.callbacks.TensorBoard(args.logdir, histogram_freq=1, update_freq=100, profile_batch=0)
tb_callback._close_writers = lambda: None  # A hack allowing to keep the writers open.

history = model2.fit(
    uppercase_data.train.data["windows"], uppercase_data.train.data["labels"],
    batch_size=model2_batch_size,
    epochs=model2_epochs,
    validation_data=(uppercase_data.dev.data["windows"], uppercase_data.dev.data["labels"]),
    callbacks=[tb_callback],
    verbose=2,
)

if args.save_model:
    model2.save('uppercase_model2.h5', include_optimizer=True)

In [None]:
# Show the metrics recorded during training. `history` is rebound by every
# `fit` call, so this reflects whichever model was trained most recently.
print(history.history)

## The creation of .txt solution

In [None]:
# Reload the second model from the file written by its training cell.
# NOTE(review): `loaded_model` is not referenced by the later cells visible
# here; the test-set predictions below are made with `model`.
loaded_model = tf.keras.models.load_model('uppercase_model2.h5')

## Apply Model to Test Set

In [8]:
# Run the first model on the test windows; the outputs come from the model's
# final sigmoid unit.
window_preds_raw = model.predict(uppercase_data.test.data['windows'])

In [9]:
# Binarise the predictions in place with a single vectorised operation:
# values below 0.3 become 0, everything else becomes 1. Assigning through
# `[...]` keeps the array object and its dtype.
# NOTE(review): the 0.3 cut-off (rather than 0.5) is presumably a tuned
# choice; confirm before changing it.
window_preds_raw[...] = np.where(window_preds_raw < 0.3, 0, 1)

In [10]:
# Uppercase every test character whose prediction equals 1 and write the text.
# The characters are edited in a list and joined once. Re-slicing the whole
# string per character is quadratic, and `str.upper()` can change the length
# (e.g. "ß" -> "SS"), which would shift every later index.
chars = list(uppercase_data.test.text)
uppercase_positions = np.flatnonzero(np.asarray(window_preds_raw).reshape(-1) == 1)
for i in uppercase_positions:
    chars[i] = chars[i].upper()
text = "".join(chars)

with open("uppercase_test.txt", "w", encoding="utf-8") as predictions_file:
    predictions_file.write(text)