In [1]:
!pip install --quiet tensorflow-text
# Clone the entire repo.
%cd /content/
!rm -r -f dl4tm
!git clone git://github.com/Jeilef/DL4TM_Text2Python.git dl4tm
%cd dl4tm
!ls

[Errno 2] No such file or directory: '/content/'
/mnt/c/Users/Jona/PycharmProjects/DL4TM_Text2Python
Cloning into 'dl4tm'...
remote: Enumerating objects: 91, done.[K
remote: Counting objects: 100% (91/91), done.[K
remote: Compressing objects: 100% (73/73), done.[K
remote: Total 91 (delta 45), reused 43 (delta 15), pack-reused 0[K
Receiving objects: 100% (91/91), 547.43 KiB | 320.00 KiB/s, done.
Resolving deltas: 100% (45/45), done.
/mnt/c/Users/Jona/PycharmProjects/DL4TM_Text2Python/dl4tm
README.md	    google.ipynb      text2python.ipynb  transformer_util.py
conala-corpus-v1.1  requirements.txt  transformer.ipynb


In [2]:
import numpy as np
import json
import logging
from tokenize import tokenize
from io import BytesIO
import tensorflow_hub as hub
import tensorflow as tf
import tensorflow.keras as keras
import tensorflow_text # See https://github.com/tensorflow/hub/issues/463

In [3]:
logging.getLogger('tensorflow').setLevel(logging.ERROR)  # suppress warnings

In [4]:
train_file = open("conala-corpus-v1.1/conala-corpus/conala-train.json", "r")
train_json = json.load(train_file)

print(len(train_json))
print(train_json[0])

2379
{'intent': 'How to convert a list of multiple integers into a single integer?', 'rewritten_intent': "Concatenate elements of a list 'x' of multiple integers to a single integer", 'snippet': 'sum(d * 10 ** i for i, d in enumerate(x[::-1]))', 'question_id': 41067960}


In [5]:
base_sentences = np.array([item["rewritten_intent"] for item in train_json])
codes = np.array([item["snippet"] for item in train_json])

print(base_sentences[0])
print(codes[0])

Concatenate elements of a list 'x' of multiple integers to a single integer
sum(d * 10 ** i for i, d in enumerate(x[::-1]))


In [6]:
# Use custom tokenize function for code snippets
def tokenize_python(s):
    tokenized = tokenize(BytesIO(s.numpy()).readline)
    tokens = [token[1] for token in tokenized]
    tensor = tf.convert_to_tensor(tokens, dtype=tf.string)
    
    return tensor

def detokenize_python(s):
  array = s.numpy()
  code = b"".join(array)
  code = code.encode("utf-8")
  return code

example = tf.constant(codes[0])
tokenize_python(example)

<tf.Tensor: shape=(26,), dtype=string, numpy=
array([b'utf-8', b'sum', b'(', b'd', b'*', b'10', b'**', b'i', b'for',
       b'i', b',', b'd', b'in', b'enumerate', b'(', b'x', b'[', b':',
       b':', b'-', b'1', b']', b')', b')', b'', b''], dtype=object)>

In [7]:
data = tf.data.Dataset.from_tensor_slices(codes)
data = data.map(lambda x : tf.py_function(func=tokenize_python, inp=[x], Tout=tf.string))

In [8]:
from tensorflow.keras.layers.experimental.preprocessing import StringLookup

def preprocess_codes(codes):
  tokenized_codes = [tf.py_function(func=tokenize_python, inp=[snippet], Tout=tf.string) for snippet in codes]
  vectorize_layer = StringLookup(max_tokens=8000)
  vectorize_layer.adapt(data)
  vectorized_codes = [vectorize_layer(code) for code in tokenized_codes]
  return tf.keras.preprocessing.sequence.pad_sequences(vectorized_codes, value=0), vectorize_layer.get_vocabulary()

def postprocess_codes(codes, vocabulary):
  codes = codes.reshape((-1, len(vocabulary) + 2,))
  codes = codes[-1]
  word = np.argmax(codes)
  return vocabulary[word - 2], word # because start and stop are first predictions in first prediction
  #codes = []
  #tokenized_codes = [tf.py_function(func=tokenize_python, inp=[snippet], Tout=tf.string) for snippet in codes]
  #i_layer = StringLookup(vocabulary=vocabulary, invert=True)
  #vectorized_codes = [vectorize_layer(code) for code in tokenized_codes]
  #return tf.keras.preprocessing.sequence.pad_sequences(vectorized_codes, value=0)

tokenized_codes, vocabulary = preprocess_codes(codes)

In [9]:
def create_look_ahead_mask(size):
  mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
  return mask

def create_padding_mask(seq):
  seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
  return seq[:, tf.newaxis, tf.newaxis, :]


class DecoderMasking(tf.keras.layers.Layer):
  def call(self, targets):
    padded_targets = create_padding_mask(targets)
    look_ahead_mask = create_look_ahead_mask(tf.shape(targets)[1])
    return tf.maximum(padded_targets, look_ahead_mask)

class DecoderPaddingMasked(tf.keras.layers.Layer):
  def call(self, inputs):
    expanded_inputs = inputs[:, tf.newaxis, tf.newaxis, :]
    return tf.cast(expanded_inputs, tf.float32)

example = tf.constant(codes[0])
tokenized_example = tokenize_python(example)

vectorize_layer = keras.layers.experimental.preprocessing.StringLookup(max_tokens=8000)
vectorize_layer.adapt(data)

vectorized_example = vectorize_layer(tokenized_example)

masking = DecoderMasking()
lam = masking(tf.convert_to_tensor(([vectorized_example])))
print(lam.shape)

(1, 1, 26, 26)


In [10]:
# Pretrained BERT encoder
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string)
preprocessor = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
encoder_inputs = preprocessor(text_input)
encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-128_A-2/1", trainable=True)
outputs = encoder(encoder_inputs)
pooled_output = outputs["pooled_output"]      # [batch_size, 128].
sequence_output = outputs["sequence_output"]  # [batch_size, seq_length, 128].

In [11]:
# Transformer Decoder  taken from https://www.tensorflow.org/tutorials/text/transformer
from transformer_util import Decoder
output_tokens = len(vocabulary) + 2 # start and stop token 
sample_decoder = Decoder(num_layers=2, d_model=128, num_heads=2,
                         dff=128, target_vocab_size=output_tokens,
                         maximum_position_encoding=5000)

decoder_input = tf.keras.layers.Input(shape=(None,), dtype=tf.int32)

# Apply masking layer here
look_ahead_mask = DecoderMasking()(decoder_input)
dec_padding_mask = DecoderPaddingMasked()(encoder_inputs['input_mask'])

decoder_output, attn = sample_decoder(decoder_input,
                              enc_output=sequence_output,
                              training=True,
                              look_ahead_mask=look_ahead_mask,
                              padding_mask=dec_padding_mask)
final_layer = tf.keras.layers.Dense(output_tokens)
output = final_layer(decoder_output)


embedding_model = tf.keras.Model((text_input, decoder_input), output)
embedding_model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None,)]            0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
keras_layer (KerasLayer)        {'input_word_ids': ( 0           input_1[0][0]                    
__________________________________________________________________________________________________
keras_layer_1 (KerasLayer)      {'sequence_output':  4385921     keras_layer[0][0]                
                                                                 keras_layer[0][1]            

In [12]:
sentence = tf.constant([train_json[0]["rewritten_intent"]])
snippet = tf.constant(train_json[0]["snippet"])
snippet = tf.py_function(func=tokenize_python, inp=[snippet], Tout=tf.string)
snippet = tf.expand_dims(snippet, axis=0)
print(sentence.shape, snippet.shape)
# result = embedding_model((sentence, snippet))
# result

(1,) (1, 26)


In [13]:
# Taken from https://www.tensorflow.org/tutorials/text/transformer
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
  mask = tf.math.logical_not(tf.math.equal(real, 0))
  loss_ = loss_object(real, pred)

  mask = tf.cast(mask, dtype=loss_.dtype)
  loss_ *= mask

  return tf.reduce_sum(loss_)/tf.reduce_sum(mask)

optimizer = tf.keras.optimizers.Adam(learning_rate=0.001, beta_1=0.9, 
                                     beta_2=0.98, epsilon=1e-9)
embedding_model.compile(optimizer, loss_function)

In [14]:
base_sentences = np.array(['' if v is None else v for v in base_sentences])

In [15]:
history = embedding_model.fit([base_sentences, tokenized_codes], tokenized_codes, batch_size=64, epochs=15)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [16]:
def generate_result(test_sentence, model):
  prediction = ""
  words = [0]
  while len(prediction) < 100:
    # prediction_prep = tokenize_python(tf.constant([prediction]))
    word_pred = model.predict((np.array([test_sentence]), np.array([words])))
    result, word_idx = postprocess_codes(word_pred, vocabulary)

    if word_idx == 1:
      break
    elif word_idx == 0:
      prediction += "[UNK]"
    else:
      prediction += result
    words.append(word_idx)
  return prediction

print(len(vocabulary))
print(generate_result("add 5 and 6.", embedding_model))
print(generate_result("read the file fun.txt.", embedding_model))
print(generate_result("Concatenate elements of a list 'x' of multiple integers to a single integer", embedding_model))

2840
urlurlurlurlurlurlurlurlurlurlurlurlurlurlurlurlurlurlurlurlurlurlurlurlurlurlurlurlurlurlurlurlurlurl
minminminminminminminminminminminminminminminminminminminminminminminminminminminminminminminminminmin
urlurlurlurlurlurlurlurlurlurlurlurlurlurlurlurlurlurlurlurlurlurlurlurlurlurlurlurlurlurlurlurlurlurl
