In [2]:
!pip install --quiet tensorflow-text
# Clone the entire repo.
%cd /content/
!rm -r -f dl4tm
!git clone git://github.com/Jeilef/DL4TM_Text2Python.git dl4tm
%cd dl4tm
!ls

/content
Cloning into 'dl4tm'...
remote: Enumerating objects: 67, done.[K
remote: Counting objects: 100% (67/67), done.[K
remote: Compressing objects: 100% (51/51), done.[K
remote: Total 67 (delta 29), reused 39 (delta 13), pack-reused 0[K
Receiving objects: 100% (67/67), 385.77 KiB | 1.89 MiB/s, done.
Resolving deltas: 100% (29/29), done.
/content/dl4tm
conala-corpus-v1.1  README.md	      text2python.ipynb  transformer_util.py
google.ipynb	    requirements.txt  transformer.ipynb


In [3]:
import numpy as np
import json
import logging
from tokenize import tokenize
from io import BytesIO
import tensorflow_hub as hub
import tensorflow as tf
import tensorflow.keras as keras
import tensorflow_text # See https://github.com/tensorflow/hub/issues/463

In [4]:
logging.getLogger('tensorflow').setLevel(logging.ERROR)  # suppress warnings

In [5]:
train_file = open("conala-corpus-v1.1/conala-corpus/conala-train.json", "r")
train_json = json.load(train_file)

print(len(train_json))
print(train_json[0])

2379
{'intent': 'How to convert a list of multiple integers into a single integer?', 'rewritten_intent': "Concatenate elements of a list 'x' of multiple integers to a single integer", 'snippet': 'sum(d * 10 ** i for i, d in enumerate(x[::-1]))', 'question_id': 41067960}


In [76]:
base_sentences = np.array([item["rewritten_intent"] for item in train_json])
codes = np.array([item["snippet"] for item in train_json])

print(base_sentences[0])
print(codes[0])

Concatenate elements of a list 'x' of multiple integers to a single integer
sum(d * 10 ** i for i, d in enumerate(x[::-1]))


In [None]:
# Use custom tokenize function for code snippets
def tokenize_python(s):
    tokenized = tokenize(BytesIO(s.numpy()).readline)
    tokens = [token[1] for token in tokenized]
    tensor = tf.convert_to_tensor(tokens, dtype=tf.string)
    
    return tensor

example = tf.constant(codes[0])
tokenize_python(example)

In [8]:
data = tf.data.Dataset.from_tensor_slices(codes)
data = data.map(lambda x : tf.py_function(func=tokenize_python, inp=[x], Tout=tf.string))

In [95]:
def create_look_ahead_mask(size):
  mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
  return mask

def create_padding_mask(seq):
  seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
  return seq[:, tf.newaxis, tf.newaxis, :]


class DecoderMasking(tf.keras.layers.Layer):
  def call(self, targets):
    padded_targets = create_padding_mask(targets)
    look_ahead_mask = create_look_ahead_mask(tf.shape(targets)[1])
    return tf.maximum(padded_targets, look_ahead_mask)

class DecoderPaddingMasked(tf.keras.layers.Layer):
  def call(self, inputs):
    expanded_inputs = inputs[:, tf.newaxis, tf.newaxis, :]
    return tf.cast(expanded_inputs, tf.float32)

example = tf.constant(codes[0])
tokenized_example = tokenize_python(example)

vectorize_layer = keras.layers.experimental.preprocessing.StringLookup(max_tokens=8000)
vectorize_layer.adapt(data)

vectorized_example = vectorize_layer(tokenized_example)

masking = DecoderMasking()
lam = masking(tf.convert_to_tensor(([vectorized_example])))
print(lam.shape)

(1, 1, 26, 26)


In [82]:
# Pretrained BERT encoder
encoder_inputs = tf.keras.layers.Input(shape=(None,), dtype=tf.string)
preprocessor = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
encoder_inputs = preprocessor(text_input)
encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-128_A-2/1", trainable=True)
outputs = encoder(encoder_inputs)
pooled_output = outputs["pooled_output"]      # [batch_size, 128].
sequence_output = outputs["sequence_output"]  # [batch_size, seq_length, 128].

In [97]:
# Transformer Decoder  taken from https://www.tensorflow.org/tutorials/text/transformer
from transformer_util import Decoder
output_tokens = 8000
sample_decoder = Decoder(num_layers=2, d_model=128, num_heads=2,
                         dff=128, target_vocab_size=output_tokens,
                         maximum_position_encoding=5000)

decoder_input = tf.keras.layers.Input(shape=(None,), dtype=tf.int32)

# Apply masking layer here
look_ahead_mask = DecoderMasking()(decoder_input)
dec_padding_mask = DecoderPaddingMasked()(encoder_inputs['input_mask'])

decoder_output, attn = sample_decoder(decoder_input,
                              enc_output=sequence_output,
                              training=False,
                              look_ahead_mask=look_ahead_mask,
                              padding_mask=dec_padding_mask)
final_layer = tf.keras.layers.Dense(output_tokens)
output = final_layer(decoder_output)


embedding_model = tf.keras.Model((text_input, decoder_input), output)
embedding_model.summary()

Model: "model_7"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_6 (InputLayer)            [(None,)]            0                                            
__________________________________________________________________________________________________
input_16 (InputLayer)           [(None, None)]       0                                            
__________________________________________________________________________________________________
keras_layer_7 (KerasLayer)      {'input_word_ids': ( 0           input_6[0][0]                    
__________________________________________________________________________________________________
keras_layer_8 (KerasLayer)      {'encoder_outputs':  4385921     keras_layer_7[0][0]              
                                                                 keras_layer_7[0][1]        

In [14]:
sentence = tf.constant([train_json[0]["rewritten_intent"]])
snippet = tf.constant(train_json[0]["snippet"])
snippet = tf.py_function(func=tokenize_python, inp=[snippet], Tout=tf.string)
snippet = tf.expand_dims(snippet, axis=0)
print(sentence.shape, snippet.shape)
result = embedding_model((sentence, snippet))
result

(1,) (1, 26)


<tf.Tensor: shape=(1, 26, 8000), dtype=float32, numpy=
array([[[-0.02071353,  0.22273576,  0.23580809, ..., -0.05850026,
         -0.03325614,  0.130977  ],
        [-0.05592096,  0.24954008,  0.17787357, ..., -0.07673593,
         -0.04091293,  0.13660514],
        [ 0.00259175,  0.2463955 ,  0.1591225 , ..., -0.04003742,
         -0.03151663,  0.1337425 ],
        ...,
        [ 0.06853933,  0.29817224,  0.18724772, ..., -0.04172705,
          0.02607824,  0.04938851],
        [ 0.04189403,  0.24589247,  0.10972903, ..., -0.04485242,
          0.0418922 ,  0.03732567],
        [ 0.04658419,  0.24851313,  0.11427581, ..., -0.02627425,
          0.05173177,  0.03634437]]], dtype=float32)>

In [None]:
# TODO: Preprocess code samples
# TODO: Set up masking
# TODO: Run training

In [98]:
# Taken from https://www.tensorflow.org/tutorials/text/transformer
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
  mask = tf.math.logical_not(tf.math.equal(real, 0))
  loss_ = loss_object(real, pred)

  mask = tf.cast(mask, dtype=loss_.dtype)
  loss_ *= mask

  return tf.reduce_sum(loss_)/tf.reduce_sum(mask)

optimizer = tf.keras.optimizers.Adam(learning_rate=0.001, beta_1=0.9, 
                                     beta_2=0.98, epsilon=1e-9)
embedding_model.compile(optimizer, loss_function)

In [99]:
tokenized_codes = [tf.py_function(func=tokenize_python, inp=[snippet], Tout=tf.string) for snippet in codes]
vectorize_layer = keras.layers.experimental.preprocessing.StringLookup(max_tokens=output_tokens)
vectorize_layer.adapt(data)
vectorized_inputs = [vectorize_layer(code) for code in tokenized_codes]
tokenized_codes = tf.keras.preprocessing.sequence.pad_sequences(vectorized_inputs, value=0)


In [100]:
base_sentences = np.array(['' if v is None else v for v in base_sentences])

In [101]:
history = embedding_model.fit([base_sentences, tokenized_codes], tokenized_codes, batch_size=64, epochs=10)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
  1/149 [..............................] - ETA: 1:14 - loss: 0.0045

KeyboardInterrupt: ignored