In [None]:
!pip install --quiet tensorflow-text
# Clone the entire repo.
%cd /content/
!rm -r -f dl4tm
!git clone git://github.com/Jeilef/DL4TM_Text2Python.git dl4tm
%cd dl4tm
!ls

In [1]:
import numpy as np
import json
import logging
from tokenize import tokenize
from io import BytesIO
import tensorflow_hub as hub
import tensorflow as tf
import tensorflow.keras as keras
import tensorflow_text # See https://github.com/tensorflow/hub/issues/463

In [2]:
# Only let TensorFlow log records at ERROR level or above through (hides warnings).
tf_logger = logging.getLogger('tensorflow')
tf_logger.setLevel(logging.ERROR)

In [3]:
# Load the CoNaLa training corpus. The `with` block guarantees the file handle
# is closed as soon as parsing finishes (or fails), instead of leaking it for
# the lifetime of the kernel.
with open("conala-corpus-v1.1/conala-corpus/conala-train.json", "r", encoding="utf-8") as train_file:
    train_json = json.load(train_file)

# Quick look at the corpus size and the structure of one record.
print(len(train_json))
print(train_json[0])

2379
{'intent': 'How to convert a list of multiple integers into a single integer?', 'rewritten_intent': "Concatenate elements of a list 'x' of multiple integers to a single integer", 'snippet': 'sum(d * 10 ** i for i, d in enumerate(x[::-1]))', 'question_id': 41067960}


In [4]:
# Build two parallel lists from the corpus: the rewritten natural-language
# intents and the matching code snippets (same index = same training example).
# NOTE(review): any record whose "rewritten_intent" is null would put None into
# base_sentences — verify against the dataset before feeding it to the encoder.
base_sentences = [record["rewritten_intent"] for record in train_json]
codes = [record["snippet"] for record in train_json]

# Show the first pair as a sanity check.
print(base_sentences[0], codes[0], sep="\n")

Concatenate elements of a list 'x' of multiple integers to a single integer
sum(d * 10 ** i for i, d in enumerate(x[::-1]))


In [5]:
# Custom tokenizer for code snippets, built on the standard-library `tokenize` module.
def tokenize_python(s):
    """Split a Python snippet into its lexical tokens.

    Args:
        s: scalar string tensor holding the snippet; its bytes are read via
            ``s.numpy()``.

    Returns:
        A 1-D ``tf.string`` tensor with the text of every token the tokenizer
        yields, in order. This includes the leading encoding marker
        (``utf-8``) and the trailing empty-string tokens, as the sample
        output of this cell shows.
    """
    source_bytes = s.numpy()
    token_strings = []
    # `tokenize` pulls the source line by line through a readline callable.
    for tok in tokenize(BytesIO(source_bytes).readline):
        token_strings.append(tok.string)

    return tf.convert_to_tensor(token_strings, dtype=tf.string)

# Sanity check: tokenize the first training snippet and display the resulting tensor.
first_snippet = codes[0]
example = tf.constant(first_snippet)
tokenize_python(example)

<tf.Tensor: shape=(26,), dtype=string, numpy=
array([b'utf-8', b'sum', b'(', b'd', b'*', b'10', b'**', b'i', b'for',
       b'i', b',', b'd', b'in', b'enumerate', b'(', b'x', b'[', b':',
       b':', b'-', b'1', b']', b')', b')', b'', b''], dtype=object)>

In [6]:
# Dataset of tokenized snippets: one element per snippet, each a 1-D string
# tensor of tokens. `tokenize_python` is plain Python code, so it has to be
# wrapped in tf.py_function to run inside the tf.data pipeline.
raw_code_dataset = tf.data.Dataset.from_tensor_slices(codes)
data = raw_code_dataset.map(
    lambda snippet: tf.py_function(func=tokenize_python, inp=[snippet], Tout=tf.string)
)

In [7]:
# Pretrained BERT encoder (TF Hub): raw strings -> preprocessing -> small BERT.
BERT_PREPROCESS_URL = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
BERT_ENCODER_URL = "https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-128_A-2/1"

# The model consumes one raw sentence string per batch element.
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string)

# The preprocessing layer turns the strings into the inputs the encoder expects.
preprocessor = hub.KerasLayer(BERT_PREPROCESS_URL)
encoder_inputs = preprocessor(text_input)

# trainable=True so the encoder weights are updated along with the rest of the model.
encoder = hub.KerasLayer(BERT_ENCODER_URL, trainable=True)
outputs = encoder(encoder_inputs)

pooled_output = outputs["pooled_output"]      # [batch_size, 128].
sequence_output = outputs["sequence_output"]  # [batch_size, seq_length, 128].

In [8]:
# Transformer decoder, taken from https://www.tensorflow.org/tutorials/text/transformer
# This cell wires the decoder on top of the BERT encoder output and assembles
# the full text -> code-token model.
from transformer_util import Decoder
# Size of the code-token vocabulary; shared by the lookup layer, the decoder's
# target vocabulary and the width of the final projection below.
output_tokens = 8000
# d_model=128 matches the 128-wide sequence output of the BERT encoder above.
sample_decoder = Decoder(num_layers=2, d_model=128, num_heads=2,
                         dff=128, target_vocab_size=output_tokens,
                         maximum_position_encoding=5000)

# Decoder-side input: a variable-length sequence of code tokens (as strings) per example.
decoder_input = tf.keras.layers.Input(shape=(None, ), dtype=tf.string)

# Map token strings to integer ids; the vocabulary is learned from the
# tokenized training snippets in `data`.
vectorize_layer = keras.layers.experimental.preprocessing.StringLookup(max_tokens=output_tokens)
vectorize_layer.adapt(data)

vectorized_input = vectorize_layer(decoder_input)

# NOTE(review): training=False is baked into the graph here; presumably this
# keeps any dropout inside the decoder disabled even during fit() — confirm
# against transformer_util.Decoder before running training.
# Both masks are None for now (see the "Set up masking" TODO at the end of the notebook).
# `attn` is not used anywhere else in the visible cells.
decoder_output, attn = sample_decoder(vectorized_input,
                              enc_output=sequence_output,
                              training=False,
                              look_ahead_mask=None,
                              padding_mask=None)
# Project each decoder position onto the vocabulary; no activation is given,
# so the model emits raw scores rather than probabilities.
final_layer = tf.keras.layers.Dense(output_tokens)
output = final_layer(decoder_output)


# Full model: (sentence strings, code-token strings) -> per-position vocabulary scores.
embedding_model = tf.keras.Model((text_input, decoder_input), output)

In [9]:
# Smoke test: push the first training example through the untrained model.
first_example = train_json[0]

# Encoder input: a batch containing one sentence string.
sentence = tf.constant([first_example["rewritten_intent"]])

# Decoder input: the tokenized snippet with a leading batch axis added.
snippet_source = tf.constant(first_example["snippet"])
snippet_tokens = tf.py_function(func=tokenize_python, inp=[snippet_source], Tout=tf.string)
snippet = tf.expand_dims(snippet_tokens, axis=0)

print(sentence.shape, snippet.shape)
result = embedding_model((sentence, snippet))
result

(1,) (1, 26)


<tf.Tensor: shape=(1, 26, 8000), dtype=float32, numpy=
array([[[ 0.06685135,  0.23810183, -0.32703936, ..., -0.18657616,
          0.09967032, -0.05479187],
        [ 0.03707373,  0.19308451, -0.32838443, ..., -0.2231875 ,
          0.167663  , -0.04584953],
        [ 0.12639177,  0.18056276, -0.32306534, ..., -0.23309465,
          0.20716378, -0.10085246],
        ...,
        [-0.00758903,  0.23075674, -0.2615829 , ..., -0.26313978,
          0.23697908, -0.07802384],
        [ 0.01387751,  0.22539218, -0.31019554, ..., -0.24952956,
          0.19790062, -0.06827004],
        [ 0.00928868,  0.23075947, -0.30909893, ..., -0.2601089 ,
          0.1849049 , -0.06817085]]], dtype=float32)>

In [10]:
# TODO: Preprocess code samples
# TODO: Set up masking
# TODO: Run training