## Notebook contains code that checks how the loaded model deals with a bugged sample

In [None]:
from diffusion_libs import *
from samples_generators import fill_vocabulary_c_v1, convert_back_to_code_c_v1, vocabulary_c_v1
import tensorflow as tf
from tensorflow import keras
import numpy as np
import tensorflow as tf

In [None]:
# Fill the shared vocabulary (presumably mutated in place by the helper --
# verify in samples_generators) and report how many tokens it contains.
fill_vocabulary_c_v1()
vocabulary_size = len(vocabulary_c_v1)
print(vocabulary_size)

In [None]:
# Configuration shared by the cells that rebuild the model and load its
# checkpoint.  The values presumably have to match the ones used when the
# checkpoint was trained -- verify against the training notebook.

# sampling
min_signal_rate = 0.02
max_signal_rate = 0.95

# architecture
embedding_dims = 32
embedding_max_frequency = 1000.0
embedding_min_frequency = 1.0

# optimization
batch_size = 16
ema = 0.999
learning_rate = 1e-3

# dictionary related
# NOTE(review): DICTIONARY_SIZE is hard-coded while other cells use
# len(vocabulary_c_v1) -- confirm that both describe the same vocabulary.
DICTIONARY_SIZE = 37
TOKENS_CAPACITY = 256

widths = [64, 64, 96, 128]
block_depth = 2

# Checkpoint locations.  Raw strings keep the Windows backslashes literal;
# the previous non-raw literals depended on unrecognised escapes (\S, \m,
# \d, \s, \c) being passed through unchanged, which emits a SyntaxWarning on
# recent Python versions.  The resulting paths are character-for-character
# the same as before.
lang_base = r"E:\Studies\master_thesis\diffusion_models_checpoints_savespace\final_checkpoints\simple_c_v1"
model_path = rf"{lang_base}\cp-0032\model"

In [None]:
# Rebuild the model from the configured hyper-parameters and restore the
# saved checkpoint weights into it.

# Create the network from the token capacity, embedding frequency range,
# embedding size, layer widths and block depth.
network = get_network(
      TOKENS_CAPACITY, embedding_min_frequency, embedding_max_frequency, 
      embedding_dims, widths=widths, block_depth=block_depth, name="complicated"
  )

# Wrap the network in the diffusion model together with the sampling and
# EMA settings.
model = DiffusionModel(
      TOKENS_CAPACITY, DICTIONARY_SIZE, network, batch_size, max_signal_rate, 
      min_signal_rate, ema
  )

# Compile with Adam and mean-absolute-error loss.
# NOTE(review): no training happens in the visible cells; compiling is
# presumably needed so the checkpoint can be restored -- TODO confirm.
model.compile(
    optimizer = keras.optimizers.experimental.Adam(
        learning_rate=learning_rate
    ),
    loss = keras.losses.mean_absolute_error
)

# Normalizer: recreate the Normalization layer from the saved statistics;
# element 0 of the loaded array is used as the mean, element 1 as the variance.
# NOTE(review): allow_pickle=True permits loading pickled objects -- only use
# it with files from a trusted source.
n_w = np.load(f"{lang_base}/normalizer_weights.npy", allow_pickle=True)
normalizer = keras.layers.Normalization(mean=n_w[0], variance=n_w[1])
# NOTE(review): (TOKENS_CAPACITY) is a plain int, not a 1-tuple -- confirm
# whether (TOKENS_CAPACITY,) was intended.
normalizer.build((TOKENS_CAPACITY))
# The normalizer is attached to the model before the weights are loaded.
model.normalizer = normalizer
model.load_weights(model_path)

### First, generate some samples to see if the model still generates something that looks like code

In [None]:
%%script false --no-raise-error
# Generate samples with the model and print each one as readable code.
# The arguments (5, 100) are presumably the number of samples and the number
# of diffusion steps -- verify against DiffusionModel.generate.
raw, denormalized = model.generate(5, 100)
for sample in denormalized:
    scaled = scale_dataset(sample, DICTIONARY_SIZE)
    code_text = " ".join(convert_back_to_code_c_v1(scaled))
    # Start a new line after every ';' and every '{'.
    code_text = code_text.replace(";", ";\n").replace("{", "{\n")
    # One blank line separates consecutive samples.
    print(code_text, end="\n\n")

### Let's see what we get when we just pass a few samples to denoise

In [None]:
# Fill the vocabulary, then build the reverse lookup: token -> position of
# that token in vocabulary_c_v1.
# NOTE(review): fill_vocabulary_c_v1() is also called in an earlier cell --
# confirm that calling it repeatedly does not duplicate entries.
fill_vocabulary_c_v1()
cv1Dict = {token: position for position, token in enumerate(vocabulary_c_v1)}
def tokens_to_vals(dict):
    """Return a converter that maps a token sequence to its mapped values.

    Args:
        dict: Mapping from token to value.  The name shadows the builtin but
            is kept because it is part of the function's signature.

    Returns:
        A function ``apply(sample)`` that returns a list holding
        ``dict[token]`` for every token of ``sample``, in order.  A token
        missing from the mapping raises ``KeyError``.
    """
    def apply(sample):
        lookup = dict.__getitem__
        return list(map(lookup, sample))
    return apply
# Reference sample: a hand-written program, one statement per entry, whose
# tokens are separated by single spaces.
pure_sample_lines = (
    "int ID0 ( int ID1 ) {",
    "int ID2 = NUM - ID1 * NUM ;",
    "int ID3 ;",
    "printf ( STRING , ID2 ) ;",
    "ID3 = ID1 / ID2 / NUM ;",
    "ID2 = ID1 * NUM - ID1 / NUM * NUM ;",
    "ID3 = ID0 + NUM + NUM * NUM ;",
    "printf ( STRING , ID3 ) ;",
    "return NUM ;",
    "}",
)
# Flatten all statements into one token list.
pure_sample = " ".join(pure_sample_lines).split(" ")

# Pad with "EMPTY" tokens up to TOKENS_CAPACITY entries.
padding_length = TOKENS_CAPACITY - len(pure_sample)
filled_with_empty_sample = pure_sample + ["EMPTY"] * padding_length

# Convert tokens to vocabulary indices, then divide each index by the
# vocabulary length.
tokenized_sample = tokens_to_vals(cv1Dict)(filled_with_empty_sample)
vocabulary_length = len(vocabulary_c_v1)
scaled_sample = np.asarray(
    [token_id / vocabulary_length for token_id in tokenized_sample]
)

In [None]:
from samples_generators import remove_token_and_shift_sample_randomized
# Build the error introducer from the candidate tokens, the value 0.5
# (presumably the removal probability -- verify in samples_generators), the
# token -> index mapping and the sample capacity.
removable_tokens = [";", "+", "-", "/", "=", "(", "}"]
remove_tokens_introducer = remove_token_and_shift_sample_randomized(
    removable_tokens, 0.5, cv1Dict, TOKENS_CAPACITY
)

# The introducer is called with a list of samples; pass the single sample
# in a one-element list and take the first result.
corrupted_batch = remove_tokens_introducer([tokenized_sample])
tokenized_error_sample = corrupted_batch[0]

# Show the corrupted sample as text.
errored_sample = [vocabulary_c_v1[token_id] for token_id in tokenized_error_sample]
print(" ".join(errored_sample))

# Scale the indices the same way as the clean sample: index / vocabulary length.
vocabulary_length = len(vocabulary_c_v1)
scaled_error_sample = np.asarray(
    [token_id / vocabulary_length for token_id in tokenized_error_sample]
)

In [None]:
# Pass the corrupted sample to the model's fix_sample (200 is presumably the
# number of diffusion steps -- verify) and print the first result as code.
raw, denormalized = model.fix_sample(scaled_error_sample, 200)
sample = denormalized[0]
scaled = scale_dataset(sample, DICTIONARY_SIZE)
fixed_tokens = convert_back_to_code_c_v1(scaled)
fixed_code = " ".join(fixed_tokens)
# Start a new line after every ';' and every '{'.
for old, new in ((";", ";\n"), ("{", "{\n")):
    fixed_code = fixed_code.replace(old, new)
print(fixed_code)