In [3]:
# Block 1
# Imports


import tensorflow as tf
import numpy as np

import json
import glob
import os
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import tensorboard
import matplotlib.pyplot as plt

import datetime

2023-11-22 16:55:52.785641: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-22 16:55:52.785700: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-22 16:55:52.785723: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-11-22 16:55:52.791512: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Block 2
# Loader


class Loader:
    def __init__(self, problems_path, submissions_dir, log_dir, max_length, batch_size):
        self.problems_path = problems_path
        self.submissions_dir = submissions_dir
        self.max_length = max_length
        self.batch_size = batch_size

        self.tokenizer = Tokenizer(filters='')
        self.dataset = None
        
        self.writer = tf.summary.create_file_writer(log_dir)

    def _load_problems(self):
        with open(self.problems_path, 'r') as f:
            problems_list = json.load(f)

        problems = {}
        for problem in problems_list:
            problem_id = problem['problem_id']
            concatenated_problem = "XXSTATEMENT {} XXINPUT {} XXOUTPUT {} XXNOTES {} XXEXAMPLES {}".format(
                problem.get('problem_statement', ''),
                problem.get('problem_input', ''),
                problem.get('problem_output', ''),
                problem.get('problem_notes', ''),
                problem.get('examples', '')
            )
            problems[problem_id] = concatenated_problem

        return problems

    def _load_solutions(self, problems):
        # Appends solutions to problems
        solutions = []
        for filepath in glob.glob(os.path.join(self.submissions_dir, "*.py")):
            problem_number = int(re.findall(r'^\d+', os.path.basename(filepath))[0])
            if problem_number in problems:
                with open(filepath, "r") as f:
                    solutions.append((problems[problem_number], f.read()))

        return solutions

    def _tokenize_and_pad(self, problems_solutions):
        # Tokenize and pad at the same time for efficiency
        texts = [problem for problem, _ in problems_solutions] + [solution for _, solution in problems_solutions]
        self.tokenizer.fit_on_texts(texts)

        sequences = [self.tokenizer.texts_to_sequences([text])[0] for text in texts]
        padded_sequences = pad_sequences(sequences, padding='post', maxlen=self.max_length)
        
        # Split back into problems and solutions
        midpoint = len(padded_sequences) // 2
        problem_padded = padded_sequences[:midpoint]
        solution_padded = padded_sequences[midpoint:]
        
        # Log dataset samples
        with self.writer.as_default():
            for i, (problem, solution) in enumerate(zip(problem_padded, solution_padded)):
                if i >= 5:  # 5 samples
                    break
                # Log problem
                problem_text = self.tokenizer.sequences_to_texts([problem])[0]
                tf.summary.text(name=f"Problem_{i}", data=problem_text, step=0)

                # Log solution
                solution_text = self.tokenizer.sequences_to_texts([solution])[0]
                tf.summary.text(name=f"Solution_{i}", data=solution_text, step=0)
            
            self.writer.flush()
                
        return problem_padded, solution_padded
    
    def _tokenize_and_pad(self, problems_solutions):
        # Tokenize and pad at the same time for efficiency
        texts = [problem for problem, _ in problems_solutions] + [solution for _, solution in problems_solutions]
        self.tokenizer.fit_on_texts(texts)

        sequences = [self.tokenizer.texts_to_sequences([text])[0] for text in texts]
        padded_sequences = pad_sequences(sequences, padding='post', maxlen=self.max_length)

        # Split back into problems and solutions
        mid_point = len(padded_sequences) // 2
        return padded_sequences[:mid_point], padded_sequences[mid_point:]

    def _create_tf_dataset(self, problem_padded, solution_padded):
        # Prepare decoder input (shifted solution)
        decoder_input = tf.pad(solution_padded, [[0, 0], [1, 0]])[:, :-1]  # Shift left

        # Target is the original solution
        target = solution_padded

        # Create the dataset
        dataset = tf.data.Dataset.from_tensor_slices(((problem_padded, decoder_input), target))
        return dataset.shuffle(buffer_size=1024).batch(self.batch_size).prefetch(buffer_size=tf.data.AUTOTUNE)
                
    def load_data(self):
        problems = self._load_problems()
        solutions = self._load_solutions(problems)

        problem_padded, solution_padded = self._tokenize_and_pad(solutions)
        self.dataset = self._create_tf_dataset(problem_padded, solution_padded)

In [3]:
# Block 3
# Positional Encoder

def positional_encoder(seq_length, d_model):
    # Generate positions
    positions = tf.range(seq_length, dtype=tf.float32)[..., tf.newaxis]

    # Indices for div_terms calculation
    i = tf.range(d_model, dtype=tf.float32)
    div_terms = 1 / tf.pow(10000.0, (2 * (i // 2)) / tf.cast(d_model, tf.float32))

    # Calculate sinusoidal encodings
    angle_rates = positions * div_terms
    sine = tf.sin(angle_rates[:, 0::2])
    cosine = tf.cos(angle_rates[:, 1::2])

    # Interlace
    pos_encoding = tf.reshape(tf.concat([sine, cosine], axis=-1), [1, seq_length, d_model])

    return pos_encoding

In [4]:
# Block 4
# Encoder/Decoder Layer classes

class EncoderLayer(tf.keras.layers.Layer):
    def __init__(self, dim_ff, dim, num_heads, dropout_rate, name="EncoderLayer"):
        super(EncoderLayer, self).__init__(name=name)

        # Multi-Head Self-Attention layer
        self.mha = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=key_dim)

        # Feed-Forward Network Layers
        self.ffn = tf.keras.Sequential([
            tf.keras.layers.Dense(dim_ff, activation='relu', kernel_initializer='he_normal', name="encoder_ffn_dense1"),
            tf.keras.layers.Dense(dim, kernel_initializer='he_normal', name="encoder_ffn_dense2")
        ], name="encoder_ffn")

        # Normalization Layers
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6, name="encoder_layernorm1")
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6, name="encoder_layernorm2")

        # Dropout
        self.dropout_mha = tf.keras.layers.Dropout(dropout_rate)
        self.dropout_ffn = tf.keras.layers.Dropout(dropout_rate)

    def call(self, x, training=False):
        # Self-Attention
        attn_output = self.mha(x, x)  # Self attention
        attn_output = self.dropout_mha(attn_output, training=training)
        out1 = self.layernorm1(x + attn_output)  # Residual connection

        # Feed-Forward Network
        ffn_input = self.layernorm2(out1)
        ffn_out = self.ffn(ffn_input)
        ffn_out = self.dropout_ffn(ffn_out, training=training)
        out2 = out1 + ffn_out  # Residual connection

        return out2

    def get_config(self):
        config = super(EncoderLayer, self).get_config()
        mha_config = self.mha.get_config()  # Won't work if mha1 and mha2 are different
        config.update({
            "dim_ff": self.ffn.layers[0].units,
            "num_heads": mha_config['num_heads'], 
            "key_dim": mha_config['key_dim'], 
            "dropout_rate": self.dropout_mha.rate
        })
        return config


class DecoderLayer(tf.keras.layers.Layer):
    def __init__(self, dim_ff, dim, num_heads, dropout_rate, name="DecoderLayer"):
        super(DecoderLayer, self).__init__(name=name)

        # Self-Attention and Cross-Attention layers
        self.mha1 = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=key_dim)
        self.mha2 = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=key_dim)

        # Feed Forward Network Layers
        self.ffn = tf.keras.Sequential([
            tf.keras.layers.Dense(dim_ff, activation='relu', kernel_initializer='he_normal', name="decoder_ffn_dense1"),
            tf.keras.layers.Dense(dim, kernel_initializer='he_normal', name="decoder_ffn_dense2")
        ], name="decoder_ffn")

        # Normalization Layers
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6, name="decoder_layernorm1")
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6, name="decoder_layernorm2")
        self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6, name="decoder_layernorm3")

        # Dropout
        self.dropout_self_attn = tf.keras.layers.Dropout(dropout_rate)
        self.dropout_cross_attn = tf.keras.layers.Dropout(dropout_rate)
        self.dropout_ffn = tf.keras.layers.Dropout(dropout_rate)

    def call(self, x, enc_output, training=False, look_ahead_mask=None, padding_mask=None):
        # Self-Attention
        attn1_output = self.mha1(x, x, attention_mask=look_ahead_mask)
        attn1_output = self.dropout_self_attn(attn1_output, training=training)
        out1 = self.layernorm1(x + attn1_output)  # Residual connection

        # Cross-Attention
        attn2_output = self.mha2(out1, enc_output, attention_mask=padding_mask)
        attn2_output = self.dropout_cross_attn(attn2_output, training=training)
        out2 = self.layernorm2(out1 + attn2_output)  # Residual connection

        # Feed-Forward Network
        ffn_out = self.ffn(out2)
        ffn_out = self.dropout_ffn(ffn_out, training=training)
        out3 = self.layernorm3(ffn_out + out2)  # Residual connection

        return out3

    def get_config(self):
        config = super(DecoderLayer, self).get_config()
        mha1_config = self.mha1.get_config()  # Won't work if mha1 and mha2 are different
        config.update({
            "dim_ff": self.ffn.layers[0].units,
            "num_heads": mha1_config['num_heads'], 
            "key_dim": mha1_config['key_dim'], 
            "dropout_rate": self.dropout_self_attn.rate
        })
        return config

In [5]:
# Block 5
# Transformer

class TransformerEncoder(tf.keras.layers.Layer):
    def __init__(self, dim, dim_ff, key_dim, num_heads, num_layers, dropout_rate, name="TransformerEncoder"):
        super(TransformerEncoder, self).__init__(name=name)
        self.num_layers = num_layers
        self.enc_layers = [EncoderLayer(dim_ff, dim, num_heads, dropout_rate, name=f"encoder_layer_{i}") for i in range(num_layers)]

    def call(self, x: tf.Tensor, training=False) -> tf.Tensor:
        for layer in self.enc_layers:
            x = layer(x, training=training)
        return x


class TransformerDecoder(tf.keras.layers.Layer):
    def __init__(self, dim, dim_ff, key_dim, num_heads, num_layers, dropout_rate, name="TransformerDecoder"):
        super(TransformerDecoder, self).__init__(name=name)
        self.num_layers = num_layers
        self.dec_layers = [DecoderLayer(dim_ff, dim, num_heads, dropout_rate, name=f"decoder_layer_{i}") for i in range(num_layers)]

    def call(self, x: tf.Tensor, enc_output: tf.Tensor, training=False) -> tf.Tensor:
        for layer in self.dec_layers:
            x = layer(x, enc_output, training=training)
        return x


class Transformer(tf.keras.Model):
    def __init__(self, dim, dim_ff, key_dim, vocab_size, num_heads, num_layers, dropout_rate):
        super(Transformer, self).__init__()
        self.dim = dim

        self.embedding_layer = tf.keras.layers.Embedding(vocab_size, dim)

        self.encoder = TransformerEncoder( dim, dim_ff, key_dim, num_heads, num_layers, dropout_rate, name="encoder")
        self.decoder = TransformerDecoder( dim, dim_ff, key_dim, num_heads, num_layers, dropout_rate, name="decoder")

        self.final_layer = tf.keras.layers.Dense(vocab_size, name="output_layer")

    def call(self, encoder_input, decoder_input, training=False):
        encoder_emb = self.embedding_layer(encoder_input)
        decoder_emb = self.embedding_layer(decoder_input)

        seq_length_enc = tf.shape(encoder_input)[1]
        seq_length_dec = tf.shape(decoder_input)[1]
        pos_encoding_enc = positional_encoder(seq_length_enc, self.dim)
        pos_encoding_dec = positional_encoder(seq_length_dec, self.dim)

        encoder_emb += pos_encoding_enc
        decoder_emb += pos_encoding_dec

        encoder_output = self.encoder(encoder_emb, training=training)
        decoder_output = self.decoder(decoder_emb, encoder_output, training=training)

        final_output = self.final_layer(decoder_output)

        return final_output
    

In [6]:
# Block 6
# Build and Compile

def build_and_compile(dim, dim_ff, key_dim, nhead, num_layers, vocab_size, dropout_rate, learning_rate=1e-4):
    # Define model inputs
    encoder_input = tf.keras.Input(shape=(None,), dtype='int32', name='encoder_input')
    decoder_input = tf.keras.Input(shape=(None,), dtype='int32', name='decoder_input')

    # Initialize and call the Transformer
    transformer = Transformer(dim, dim_ff, key_dim, vocab_size, nhead, num_layers, dropout_rate)
    decoder_output = transformer(encoder_input, decoder_input)

    # Final Dense layer for classification
    outputs = tf.keras.layers.Dense(vocab_size, name='output_layer')(decoder_output)

    # Create the model
    model = tf.keras.Model(inputs=[encoder_input, decoder_input], outputs=outputs)

    # Compile the model
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )

    return model

In [7]:
# Block 7
# Define Training Steps

def calculate_loss(model_output, tokenized_code, mask):
    loss = tf.keras.losses.sparse_categorical_crossentropy(tokenized_code, model_output, from_logits=True)
    loss *= mask  # Apply mask
    return tf.reduce_sum(loss) / tf.reduce_sum(mask)

@tf.function
def train_step(model, optimizer, tokenized_question, tokenized_code, clip_norm=1.0):
    with tf.GradientTape() as tape:
        model_output = model([tokenized_question, tokenized_code], training=True)

        # Mask PAD tokens
        mask = tf.cast(tf.math.logical_not(tf.math.equal(tokenized_code, 0)), dtype=model_output.dtype)
        
        # Calculate loss
        average_loss = calculate_loss(model_output, tokenized_code, mask)

    # Compute and clip gradients
    gradients = tape.gradient(average_loss, model.trainable_variables)
    clipped_gradients, _ = tf.clip_by_global_norm(gradients, clip_norm)

    # Apply gradients to update model weights
    optimizer.apply_gradients(zip(clipped_gradients, model.trainable_variables))

    return average_loss

In [10]:
# Block 8
# Execution

if __name__ == "__main__":
    # Paths to data
    problems_path = "/workspace/Training_Data/A_Problems.json"
    submissions_dir = "/workspace/Training_Data/A_Submissions_4"
    log_dir = "/workspace/logs"

    # Set hyperparameters
    dim = 256
    dim_ff = dim * 4
    num_layers = 6
    num_heads = 8
    key_dim = dim // num_heads

    max_length = 530 # Set to cover about 85% of inputs
    dropout_rate = 0.01 # Lowered temporarily

    batch_size = 16
    learning_rate = 0.002
    epochs = 1

    assert dim % num_heads == 0, "dim % num_heads != 0"

    # Initialize the Loader
    loader = Loader(problems_path, submissions_dir, log_dir, max_length, batch_size)
    loader.load_data()
    vocab_size = len(loader.tokenizer.word_index) + 1
    
    # Build the model
    model = build_and_compile(dim, dim_ff, key_dim, num_heads, num_layers, vocab_size, dropout_rate, learning_rate)

    # Setup TensorBoard callback
    log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

    # Train the model
    history = model.fit(loader.dataset, epochs=epochs, callbacks=[tensorboard_callback])

    # Save the model
    model.save("/workspace")

2023-11-22 15:34:44.032533: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7fb2f4a64650 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2023-11-22 15:34:44.032581: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA GeForce RTX 2080 SUPER, Compute Capability 7.5
2023-11-22 15:34:44.037554: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2023-11-22 15:34:44.049115: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:442] Loaded cuDNN version 8700
2023-11-22 15:34:44.128935: I ./tensorflow/compiler/jit/device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


INFO:tensorflow:Assets written to: /workspace/assets


INFO:tensorflow:Assets written to: /workspace/assets


In [4]:
# Block 10
# Evaluation Class

class Evaluator:
    def __init__(self, model, loader):
        self.model = model
        self.loader = loader

    def plot_loss(self, history):
        plt.figure(figsize=(10, 6))
        plt.plot(history.history['loss'])
        plt.title('Loss Curve')
        plt.xlabel('Epochs')
        plt.ylabel('Loss')
        plt.grid(True)
        plt.show()

    def plot_token_probabilities(self, token_index, n_samples):
        # Get n_samples from the dataset
        for (encoder_input, decoder_input), _ in self.loader.dataset.take(n_samples):
            prediction = self.model.predict([encoder_input, decoder_input])
            token_logits = prediction[0, token_index]

            # Convert logits to probabilities
            token_probabilities = tf.nn.softmax(token_logits).numpy()

            sorted_indices = np.argsort(token_probabilities)[::-1]
            sorted_probabilities = token_probabilities[sorted_indices]

            plt.figure(figsize=(20, 5))
            plt.bar(range(len(sorted_probabilities)), sorted_probabilities)
            plt.xlabel('Word Indices (sorted by probability)')
            plt.ylabel('Probability')
            plt.title(f'Word Prediction Probabilities for Token {token_index}')
            plt.show()

    def generate_sample_predictions(self, n_samples=1):
        # Get n_samples from the dataset
        for (encoder_input, decoder_input), _ in self.loader.dataset.take(n_samples):
            print(encoder_input[1][:10])
            print(decoder_input[1][:10])
            prediction = self.model.predict([encoder_input, decoder_input])
            predicted_sequence = np.argmax(prediction, axis=-1)
            predicted_text = self.loader.tokenizer.sequences_to_texts(predicted_sequence)

            print("Predicted sequence: ", predicted_sequence)
            print("Predicted text: ", predicted_text)

    def evaluate(self, command, *args, **kwargs):
        if command == 'loss':
            self.plot_loss(*args, **kwargs)
        elif command == 'token_prob':
            self.plot_token_probabilities(*args, **kwargs)
        elif command == 'sample_pred':
            self.generate_sample_predictions(*args, **kwargs)
        else:
            print(f"Unknown command: {command}")

# Load the model if it's not
model = tf.keras.models.load_model('/workspace/saved_model.pb')

# Uncomment what you want to run
evaluator = Evaluator(model, loader)
#evaluator.evaluate('loss', history)
#evaluator.evaluate('token_prob', token_index=10, n_samples=4)
evaluator.evaluate('sample_pred', n_samples=4)


OSError: Unable to synchronously open file (file signature not found)

In [11]:
!tensorboard --logdir logs --bind_all

2023-11-22 16:19:16.422553: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-22 16:19:16.422617: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-22 16:19:16.422668: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-11-22 16:19:16.427938: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-11-22 16:19:17.936294: I tensorflow/compiler/

In [9]:
!pwd

/home/rapids/notebooks
