In [11]:
!pip install --quiet tensorflow-text
# Clone the entire repo.
%cd /content/
!rm -r -f dl4tm
!git clone git://github.com/Jeilef/DL4TM_Text2Python.git dl4tm
%cd dl4tm
!ls

/content
Cloning into 'dl4tm'...
remote: Enumerating objects: 64, done.[K
remote: Counting objects: 100% (64/64), done.[K
remote: Compressing objects: 100% (48/48), done.[K
remote: Total 64 (delta 27), reused 40 (delta 13), pack-reused 0[K
Receiving objects: 100% (64/64), 383.39 KiB | 5.64 MiB/s, done.
Resolving deltas: 100% (27/27), done.
/content/dl4tm
conala-corpus-v1.1  README.md	      text2python.ipynb  transformer_util.py
google.ipynb	    requirements.txt  transformer.ipynb


In [12]:
import numpy as np
import json
import logging
from tokenize import tokenize
from io import BytesIO
import tensorflow_hub as hub
import tensorflow as tf
import tensorflow.keras as keras
import tensorflow_text # See https://github.com/tensorflow/hub/issues/463

In [13]:
# Only let TensorFlow log records of severity ERROR or above through (hides warnings).
logging.getLogger('tensorflow').setLevel(level=logging.ERROR)

In [14]:
# Read the CoNaLa training split. The context manager closes the file handle
# as soon as the JSON has been parsed instead of leaving it open for the
# lifetime of the kernel.
with open("conala-corpus-v1.1/conala-corpus/conala-train.json", "r", encoding="utf-8") as train_file:
    train_json = json.load(train_file)

# Quick look at the size of the split and at one record.
print(len(train_json))
print(train_json[0])

2379
{'intent': 'How to convert a list of multiple integers into a single integer?', 'rewritten_intent': "Concatenate elements of a list 'x' of multiple integers to a single integer", 'snippet': 'sum(d * 10 ** i for i, d in enumerate(x[::-1]))', 'question_id': 41067960}


In [15]:
# Walk the training records once, collecting the natural-language side
# ("rewritten_intent") and the code side ("snippet") into parallel lists.
base_sentences = []
codes = []
for record in train_json:
    base_sentences.append(record["rewritten_intent"])
    codes.append(record["snippet"])

# Show the first pair as a sanity check.
print(base_sentences[0])
print(codes[0])

Concatenate elements of a list 'x' of multiple integers to a single integer
sum(d * 10 ** i for i, d in enumerate(x[::-1]))


In [16]:
# Use custom tokenize function for code snippets
def tokenize_python(s):
    """Split a Python snippet into its lexical tokens.

    `s` is a string tensor whose `.numpy()` value is the snippet's bytes.
    Those bytes are fed to the standard-library `tokenize` generator and the
    string of every token it yields is collected, in order, into a 1-D
    `tf.string` tensor. Nothing is filtered out, so the bookkeeping tokens
    the tokenizer emits (such as the leading encoding marker) are part of
    the result.
    """
    source_bytes = s.numpy()
    token_strings = []
    for token_info in tokenize(BytesIO(source_bytes).readline):
        token_strings.append(token_info.string)

    return tf.convert_to_tensor(token_strings, dtype=tf.string)

# Try the tokenizer on the first snippet; the resulting tensor is the cell's displayed value.
example = tf.constant(value=codes[0])
tokenize_python(example)

<tf.Tensor: shape=(26,), dtype=string, numpy=
array([b'utf-8', b'sum', b'(', b'd', b'*', b'10', b'**', b'i', b'for',
       b'i', b',', b'd', b'in', b'enumerate', b'(', b'x', b'[', b':',
       b':', b'-', b'1', b']', b')', b')', b'', b''], dtype=object)>

In [None]:
# One dataset element per snippet, each mapped to its token tensor.
# tokenize_python calls `.numpy()` on its argument, so it is wrapped in
# tf.py_function rather than passed to `map` directly.
data = tf.data.Dataset.from_tensor_slices(codes).map(
    lambda snippet: tf.py_function(func=tokenize_python, inp=[snippet], Tout=tf.string)
)

# Materialise the tokenised snippets once, so the vocabulary and the one-hot
# tensors below are built from exactly the same token sequences.
code_token_rows = [[token.numpy() for token in tokens] for tokens in data]

# Vocabulary of distinct code tokens (bytes, as returned by `.numpy()`),
# sorted so that the token <-> index assignment is reproducible.
code_chars = sorted({token for row in code_token_rows for token in row})
code_idx_to_char = dict(enumerate(code_chars))
code_char_to_idx = {token: idx for idx, token in enumerate(code_chars)}

# Sequence length is measured in tokens, the same unit the vocabulary uses.
# (Indexing the token vocabulary with single characters of the raw snippet
# raises KeyError, because the keys are bytes tokens, not str characters.)
max_code_len = max((len(row) for row in code_token_rows), default=0)
tokenized_codes = np.zeros(shape=(len(codes), max_code_len, len(code_chars)))
targets = np.zeros(shape=(len(codes), max_code_len, len(code_chars)), dtype="float32")

for i, row in enumerate(code_token_rows):
  for k, token in enumerate(row):
    token_idx = code_char_to_idx[token]
    tokenized_codes[i, k, token_idx] = 1
    if k > 0:
      # Teacher forcing: the target at step k - 1 is the token fed in at
      # step k, i.e. targets run one step ahead of the decoder input.
      targets[i, k - 1, token_idx] = 1

In [None]:
# Pretrained BERT encoder
# Sentences enter the graph as scalar strings; the hub preprocessing layer's
# output is what the encoder layer consumes.
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string)
preprocessor = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
)
encoder_inputs = preprocessor(text_input)

# trainable=True, so the encoder weights are updated along with the rest of the model.
encoder = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-128_A-2/1",
    trainable=True,
)
outputs = encoder(encoder_inputs)
sequence_output = outputs["sequence_output"]  # [batch_size, seq_length, 128].
pooled_output = outputs["pooled_output"]      # [batch_size, 128].

In [None]:
# Transformer decoder, taken from https://www.tensorflow.org/tutorials/text/transformer
from transformer_util import Decoder

# Single size shared by the lookup layer (max_tokens), the decoder
# (target_vocab_size) and the width of the final projection.
output_tokens = 8000
sample_decoder = Decoder(
    num_layers=2,
    d_model=128,
    num_heads=2,
    dff=128,
    target_vocab_size=output_tokens,
    maximum_position_encoding=5000,
)

# Code enters the model as a variable-length sequence of token strings.
decoder_input = tf.keras.layers.Input(shape=(None,), dtype=tf.string)

# Token string -> integer id; the vocabulary is learned from the tokenised snippets in `data`.
vectorize_layer = keras.layers.experimental.preprocessing.StringLookup(
    max_tokens=output_tokens
)
vectorize_layer.adapt(data)

vectorized_input = vectorize_layer(decoder_input)

# NOTE(review): training is fixed to False and no look-ahead / padding mask is
# supplied here — presumably placeholders until masking is set up; confirm
# before training with this graph.
decoder_output, attn = sample_decoder(
    vectorized_input,
    enc_output=sequence_output,
    training=False,
    look_ahead_mask=None,
    padding_mask=None,
)

# Linear projection (no activation) to one score per vocabulary slot.
final_layer = tf.keras.layers.Dense(output_tokens)
output = final_layer(decoder_output)


# Full model: (sentence strings, code-token strings) -> per-position token scores.
embedding_model = tf.keras.Model((text_input, decoder_input), output)

In [None]:
# Smoke test: push the first training pair through the model once.
# Both inputs get a leading batch dimension of size 1.
sentence = tf.constant([train_json[0]["rewritten_intent"]])
snippet = tf.expand_dims(
    tf.py_function(
        func=tokenize_python,
        inp=[tf.constant(train_json[0]["snippet"])],
        Tout=tf.string,
    ),
    axis=0,
)
print(sentence.shape, snippet.shape)
result = embedding_model((sentence, snippet))
result

In [None]:
# TODO: Preprocess code samples
# TODO: Set up masking
# TODO: Run training

In [None]:
# `final_layer` is a Dense layer without an activation, so the model emits raw
# logits. The loss must be told so; the plain "categorical_crossentropy"
# string would treat those logits as probabilities.
embedding_model.compile(
    optimizer="adam",
    loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
)
# NOTE(review): `decoder_input` is declared with dtype tf.string, whereas
# `tokenized_codes` is a one-hot float array — the inputs (and the target
# width vs. `output_tokens`) still need to be reconciled before this call
# can succeed; see the TODO cell.
embedding_model.fit(x=[base_sentences, tokenized_codes], y=[targets], batch_size=16)