# Isaac Schatia - DS 225

# Setup

In [None]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Is this notebook running on Colab or Kaggle?
IS_COLAB = "google.colab" in sys.modules
IS_KAGGLE = "kaggle_secrets" in sys.modules

if IS_COLAB:
    %pip install -q -U tensorflow-addons
    %pip install -q -U transformers

# Scikit-Learn ≥0.20 is required
import re
import sklearn


def _major_minor(version_string):
    """Return the leading ``(major, minor)`` numbers of a version string as ints.

    Comparing version strings directly is lexicographic ("0.9" >= "0.20" is
    True and "10.0" >= "2.0" is False), so the checks below compare integer
    tuples instead. Anything after the minor number (patch level, "rc1",
    ".dev0", ...) is ignored.

    Raises:
        ValueError: if the string does not start with ``<digits>.<digits>``.
    """
    match = re.match(r"(\d+)\.(\d+)", version_string)
    if match is None:
        raise ValueError(f"Unrecognized version string: {version_string!r}")
    return int(match.group(1)), int(match.group(2))


assert _major_minor(sklearn.__version__) >= (0, 20)

# TensorFlow ≥2.0 is required
import tensorflow as tf
from tensorflow import keras
assert _major_minor(tf.__version__) >= (2, 0)

if not tf.config.list_physical_devices('GPU'):
    print("No GPU was detected. LSTMs and CNNs can be very slow without a GPU.")
    if IS_COLAB:
        print("Go to Runtime > Change runtime and select a GPU hardware accelerator.")
    if IS_KAGGLE:
        print("Go to Settings > Accelerator and select GPU.")

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)
tf.random.set_seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "nlp"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Write the current matplotlib figure into ``IMAGES_PATH``.

    Args:
        fig_id: Base file name (without extension); also echoed to stdout.
        tight_layout: When true, ``plt.tight_layout()`` is applied first.
        fig_extension: File extension, also passed to ``savefig`` as format.
        resolution: DPI passed to ``savefig``.
    """
    file_name = fig_id + "." + fig_extension
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    destination = os.path.join(IMAGES_PATH, file_name)
    plt.savefig(destination, format=fig_extension, dpi=resolution)

# For/While Generation

In [None]:
import random
# Token pools for the synthetic C-style loops: genLoopPairs draws one entry
# from each tuple at random to fill the matching slot of a loop.
# NOTE(review): `vars` shadows the Python builtin of the same name; the name is
# kept here because genLoopPairs reads it.
vars = ("x", "y", "z", "current", "head", "curr", "i", "idx", "index", "j")  # loop variable names
inits = (" 0", "0", " -1000", "-42", "list->head")  # initial values (some carry a leading space)
conds = (">", "<=", "!=", "==", "<", " > ", " <= ", " != ", " == ", " < ")  # comparisons, with and without surrounding spaces
limits = ("10", "42", "NULL", "100000", "-45", "LIMIT")  # right-hand side of the loop condition
incrs = ("++", "+=2", "+=100", "-=20")  # update applied to the loop variable

def genForLoop(var, init, cond, limit, incr, body):
    """Render a C-style ``for`` loop as a three-line string.

    The header is ``for(<var> = <init>; <var><cond><limit>; <var><incr>) {``,
    followed by the tab-indented ``body`` and a closing brace.
    """
    header = f"for({var} = {init}; {var}{cond}{limit}; {var}{incr}) {{"
    indented_body = f"\t{body}"
    return "\n".join([header, indented_body, "}"])

def genWhileLoop(var, init, cond, limit, incr, body):
    """Render the ``while`` form of a C-style loop as a five-line string.

    Emits the initialisation statement, the ``while(...) {`` header, the
    tab-indented ``body``, the tab-indented update statement and a closing
    brace, joined by newlines.
    """
    lines = [
        f"{var} = {init};",
        f"while({var}{cond}{limit}) {{",
        f"\t{body}",
        f"\t{var}{incr};",
        "}",
    ]
    return "\n".join(lines)

def genLoopPairs(count, body):
    """Generate ``count`` random loops, each rendered as a for and a while loop.

    Args:
        count: Number of loop pairs to generate.
        body: Statement text placed inside every loop.

    Returns:
        A ``(for_loops, while_loops)`` tuple of two lists of strings. Entry
        ``i`` of both lists is built from the same randomly chosen variable,
        initial value, condition, limit and increment.
    """
    for_loops = []
    while_loops = []
    for _ in range(count):
        # Draw order is kept (var, init, cond, limit, incr) so a seeded
        # `random` yields the same loops as before.
        var = random.choice(vars)
        init = random.choice(inits)
        cond = random.choice(conds)
        limit = random.choice(limits)
        incr = random.choice(incrs)
        # Delegate to the single-loop helpers so both renderings share one
        # definition of the text format instead of duplicated format strings.
        for_loops.append(genForLoop(var, init, cond, limit, incr, body))
        while_loops.append(genWhileLoop(var, init, cond, limit, incr, body))
    return for_loops, while_loops

# genLoopPairs returns (for_loops, while_loops): pairs[0] is the list of
# for-loops and pairs[1] the list of while-loops, so the two prints below show
# the for and while renderings of the first generated loop.
pairs = genLoopPairs(2, "sum += x;")
print(pairs[0][0])
print()
print(pairs[1][0])

for(y =  0; y == -45; y-=20) {
	sum += x;
}

y =  0;
while(y == -45) {
	sum += x;
	y-=20;
}


# Creating the Dataset

In [None]:
def loop_str_to_ids(loop_str):
  """Return the list of ``ord()`` code points for the characters of ``loop_str``."""
  return list(map(ord, loop_str))

def prepare_loop_strs(loop_strs):
  """Encode a batch of loop strings as one dense tensor of character ids.

  Each string is converted with ``loop_str_to_ids``; the rows are assembled
  into a ragged tensor and densified with ``to_tensor()``, so every row ends
  up as long as the longest string in the batch. Ids are used as-is, without
  any offset.
  """
  id_rows = list(map(loop_str_to_ids, loop_strs))
  ragged_ids = tf.ragged.constant(id_rows, ragged_rank=1)
  return ragged_ids.to_tensor()

# Width every encoded batch is padded up to. 76 is the length of the longest
# while-loop the generator's token tuples can produce with the body
# "sum += x;" — revisit this value if the tokens or the body change.
max_input_length = 76

def prepare_loop_strs_padded(loop_strs):
  """Encode loop strings and right-pad the result to ``max_input_length`` columns.

  Batches that are already at least ``max_input_length`` wide are returned
  unchanged (they are not truncated).
  """
  encoded = prepare_loop_strs(loop_strs)
  missing_columns = max_input_length - encoded.shape[1]
  if missing_columns > 0:
    # Pad only on the right of the sequence axis; the batch axis is untouched.
    encoded = tf.pad(encoded, [[0, 0], [0, missing_columns]])
  return encoded

def create_dataset(n_loops):
  """Build ``n_loops`` random (for-loop, while-loop) examples as padded id tensors.

  Returns a pair ``(inputs, targets)``: the encoded for-loops and the encoded
  while-loops, both produced by ``prepare_loop_strs_padded``.
  """
  for_loops, while_loops = genLoopPairs(n_loops, "sum += x;")
  inputs = prepare_loop_strs_padded(for_loops)
  targets = prepare_loop_strs_padded(while_loops)
  return inputs, targets

In [None]:
# Three independently generated samples: 10000 training, 2000 validation and
# 2000 test loop pairs.
# NOTE(review): the loops are drawn with the stdlib `random` module, which the
# visible code never seeds (only NumPy and TensorFlow are seeded), so the data
# differs between runs. The generator's token pools are small, so the splits
# can contain identical loops — confirm whether that is acceptable.
X_train, Y_train = create_dataset(10000)
X_valid, Y_valid = create_dataset(2000)
X_test, Y_test = create_dataset(2000)

# Very Basic Seq2Seq Model

In [None]:
# Character-level encoder/decoder: the encoder reads the for-loop ids and
# summarises them in one vector, which is repeated once per output position and
# decoded into a sequence of per-character probability distributions.
embedding_size = 48 # 32
# The decoder emits as many time steps as the padded targets have columns.
max_output_length = Y_train.shape[1]

np.random.seed(42)
tf.random.set_seed(42)

# input_dim=128: ids are ord() values of the generated loop text, which is
# plain ASCII. input_shape=[None] accepts sequences of any length.
# NOTE(review): no masking (e.g. mask_zero) is configured, so padded positions
# (id 0) appear to count toward the loss and the accuracy metric — confirm.
encoder = keras.models.Sequential([
    keras.layers.Embedding(input_dim=128,
                           output_dim=embedding_size,
                           input_shape=[None]),
    keras.layers.LSTM(128)
])

# return_sequences=True yields one output per time step; the softmax layer
# turns each into a distribution over the 128 possible character ids.
decoder = keras.models.Sequential([
    keras.layers.LSTM(128, return_sequences=True),
    keras.layers.Dense(128, activation="softmax")
])

# RepeatVector feeds the same encoder output to every decoder time step.
model = keras.models.Sequential([
    encoder,
    keras.layers.RepeatVector(max_output_length),
    decoder
])

# Targets are integer ids (not one-hot), hence the sparse cross-entropy loss.
optimizer = keras.optimizers.Nadam()
model.compile(loss="sparse_categorical_crossentropy", optimizer=optimizer,
              metrics=["accuracy"])

model.summary()
print()

# save_best_only=True overwrites the file only when the callback's monitored
# metric improves (presumably the Keras default, val_loss — confirm).
checkpoint_cb = keras.callbacks.ModelCheckpoint("my_loop_model.h5", save_best_only=True)

history = model.fit(X_train, Y_train, epochs=50,
                    validation_data=(X_valid, Y_valid),
                    callbacks=[checkpoint_cb])

Model: "sequential_17"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 sequential_15 (Sequential)  (None, 128)               96768     
                                                                 
 repeat_vector_5 (RepeatVect  (None, 76, 128)          0         
 or)                                                             
                                                                 
 sequential_16 (Sequential)  (None, 76, 128)           148096    
                                                                 
Total params: 244,864
Trainable params: 244,864
Non-trainable params: 0
_________________________________________________________________

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
E

In [None]:
# Roll back to the best model and evaluate on test data: reload the file
# written by the ModelCheckpoint callback, replacing the in-memory `model`
# (which holds the weights of the last epoch).
model = keras.models.load_model("my_loop_model.h5")
# Returns [loss, accuracy], matching the metrics passed to compile().
model.evaluate(X_test, Y_test)

# current best accuracy : 0.9755
# NOTE(review): it is not recorded which run or data split this figure refers
# to; because the data is regenerated randomly, individual runs will differ.



[0.13523422181606293, 0.9560986757278442]

# Tests

In [None]:
def ids_to_loop_strs(ids):
  """Decode each sequence of character ids in ``ids`` back into a string.

  Every id is converted with ``chr()``; one string is returned per sequence,
  in the same order.
  """
  decoded = []
  for sequence in ids:
    decoded.append("".join(map(chr, sequence)))
  return decoded

def convert_loop_strs(loop_strs):
  """Translate loop strings with the global ``model`` and return the decoded text.

  The inputs are encoded and padded, the model predicts a distribution over
  character ids for every output position, and the most probable id at each
  position is decoded back into characters.
  """
  encoded = prepare_loop_strs_padded(loop_strs)
  probabilities = model.predict(encoded)
  best_ids = np.argmax(probabilities, axis=-1)
  return ids_to_loop_strs(best_ids)

In [None]:
# Translate the first generated for-loop. convert_loop_strs expects a list of
# strings, so the single loop is wrapped in a list.
test_loop = [pairs[0][0]]
print(test_loop[0])
print()
# Trailing '\x00' characters are decoded padding (id 0) and are stripped
# before printing.
print(convert_loop_strs(test_loop)[0].rstrip('\x00'))

for(y =  0; y == -45; y-=20) {
	sum += x;
}

i =  0;
while(i == -45) {
	sum += x;
	x-=20;
}


In [None]:
# Translate the second generated for-loop (pairs[0] is the list of for-loops).
test_loop = [pairs[0][1]]
print(test_loop[0])
print()
# Trailing '\x00' characters are decoded padding (id 0) and are stripped
# before printing.
print(convert_loop_strs(test_loop)[0].rstrip('\x00'))

for(curr = 0; curr!=10; curr+=2) {
	sum += x;
}

curr = 0;
while(curr<=10) {
	sum += x;
	curr+=2;
}


# Progress Notes

## Problem 1

The create_dataset function from the chapter 16 notebook ran into an error when trying to get the values from the genLoopPairs function. I edited the genLoopPairs function so it would work.

## Problem 2

When trying to run the code for the 'very basic seq2seq model', I ran into this error:

ValueError: Dimensions must be equal, but are 75 and 76 for '{{node Equal}} = Equal[T=DT_FLOAT, incompatible_shape_error=true](Cast_1, Cast_2)' with input shapes: [?,75], [?,76].

It took me a while to find what this was referring to, but eventually I found that it was because the validation and training sets had different sequence lengths (75 vs. 76). This was fixed by adding padding during 'create_dataset'.

## Problem 3

Once I got my model to actually run, it had problems increasing its accuracy. I expected it to start low and then climb, but it stayed around 40%. By extending the number of epochs and experimenting with some of the hyperparameters, I got it working with a highest accuracy of about 97%.

## Problem 4

When trying to run some tests the output of the model was complete gibberish. This was fixed by changing an input into a list and eventually removing the 'X+1' padding in 'prepare_loop_strs' because I noticed that my outputs were one character over.

## Problem 5

Due to my excessive testing, I found out that Google Colab will actually disable your ability to use GPUs for a period of time, so I had to switch email accounts to finish this assignment.

## Investigation 1

To make sure that my model was operating at its best capacity, I added a checkpoint callback so that at the end it would roll back to its best weights.

## Investigation 2

When my model was actually working I was able to run some tests. In my first test I noticed that when the variable in the for loop was just a single letter there was a chance it would get replaced with a different letter in the while loop.

