In [97]:
import os
import re
import string

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_hub as tfh
from bert.tokenization import FullTokenizer
from tqdm.notebook import tqdm

# Data acquisition

In [2]:
# Corpus language; the folder and file names below are all derived from it.
LANGUAGE = "java" # alternative value: "python"
# Root data directory, given as a relative path.
DATA_PATH = "../../Data/code2desc"
# Sub-folder of DATA_PATH holding the files for the selected language.
DATA_FOLDER = f"{LANGUAGE}/short"
# JSON-lines file names for the train / test / validation splits.
TRAIN_FILE  = f"{LANGUAGE}_train_0.jsonl"
TEST_FILE   = f"{LANGUAGE}_test_0.jsonl"
VALID_FILE  = f"{LANGUAGE}_valid_0.jsonl"

In [117]:
# Read the training split (JSON lines) and keep only the two columns used here:
# the pre-tokenized source code and the plain-text docstring.
# The docstrings are tokenized later with BERT's own 'FullTokenizer'.
use_cols = ["code_tokens", "docstring"]
train_path = f"{DATA_PATH}/{DATA_FOLDER}/{TRAIN_FILE}"
train_df = pd.read_json(train_path, lines=True)[use_cols]

In [16]:
# Sanity check: column names, non-null counts and dtypes of the loaded frame.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   code_tokens  30000 non-null  object
 1   docstring    30000 non-null  object
dtypes: object(2)
memory usage: 468.9+ KB


This TF Hub model uses the implementation of BERT from the TensorFlow Models repository on GitHub at <a href="https://github.com/tensorflow/models/tree/master/official/nlp/bert">tensorflow/models/official/nlp/bert</a>. It uses L=12 hidden layers (i.e., Transformer blocks), a hidden size of H=768, and A=12 attention heads.

This model has been pre-trained for English on Wikipedia and the BooksCorpus, using the code published on GitHub. Inputs are "uncased": the text is lower-cased before it is tokenized into word pieces, and any accent markers are stripped. For training, random input masking has been applied independently to word pieces (as in the original BERT paper).

All parameters in the module are trainable, and fine-tuning all parameters is the recommended practice.

In [None]:
# TF Hub handle of the pre-trained, uncased English BERT model described above.
model_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1"
# Wrap the hub module as a Keras layer; trainable=True lets its weights be
# fine-tuned together with the rest of the model.
bert_layer = tfh.KerasLayer(model_url,
                            trainable=True)

In [14]:
# Take the vocabulary file and the lower-casing flag from the loaded hub module
# itself, so the tokenizer is configured from the same assets as `bert_layer`.
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
# Tokenizer used below to turn docstrings into BERT token ids.
tokenizer = FullTokenizer(vocab_file, do_lower_case)

# Data preprocessing

In [118]:
def cleaning(text):
    '''Normalize a piece of text before tokenization.

    Steps, in order:
      1. every run of whitespace (including Unicode whitespace) becomes one space;
      2. characters outside ``string.printable`` are dropped;
      3. whitespace is collapsed again, because step 2 can leave two spaces
         next to each other (e.g. "a é b" -> "a  b");
      4. the result is lower-cased and stripped of leading/trailing spaces.

    Punctuation is left untouched.

    Args:
        text: the string to clean.

    Returns:
        The cleaned, lower-cased string.
    '''
    printable = set(string.printable)

    # Collapse first so that non-ASCII whitespace is turned into a plain space
    # instead of being deleted by the printable filter below.
    text = re.sub(r'\s+', " ", text)
    text = ''.join(character for character in text if character in printable)
    # Removing characters may have created new runs of spaces; collapse them too.
    text = re.sub(r'\s+', " ", text)

    return text.lower().strip()

In [119]:
# Clean every docstring; the cleaned text replaces the original column.
train_df["docstring"] = train_df["docstring"].apply(cleaning)

In [184]:
def generate_bert_input(text, max_seq_length):
    '''Convert raw texts into the three fixed-length integer inputs fed to BERT.

    Each text is tokenized with the module-level ``tokenizer``, truncated so
    that it still fits once "[CLS]" and "[SEP]" are added, converted to ids
    and right-padded with zeros up to ``max_seq_length``.

    Args:
        text: an iterable of strings (e.g. a list or a pandas Series). A single
            string is treated as one text rather than iterated per character.
        max_seq_length: length of every output row, including the "[CLS]" and
            "[SEP]" tokens. Must be at least 2.

    Returns:
        A tuple ``(input_ids, input_mask, segment_ids)`` of int32 NumPy arrays,
        each of shape ``(number_of_texts, max_seq_length)``:
          * input_ids   - token ids, zero-padded on the right;
          * input_mask  - 1 where input_ids holds a real token, 0 on padding;
          * segment_ids - all zeros (every text is a single segment).

    Raises:
        ValueError: if ``max_seq_length`` is smaller than 2.
    '''
    if max_seq_length < 2:
        # With less than two slots the slice below would use a negative bound
        # and silently cut tokens from the wrong end.
        raise ValueError("max_seq_length must be >= 2 to hold [CLS] and [SEP], "
                         "got {}".format(max_seq_length))
    if isinstance(text, str):
        text = [text]

    token_budget = max_seq_length - 2  # room left after [CLS] and [SEP]
    id_sequences = [
        tokenizer.convert_tokens_to_ids(
            ["[CLS]"] + tokenizer.tokenize(seq)[:token_budget] + ["[SEP]"])
        for seq in text
    ]

    # Build the padded arrays directly: zeros everywhere, then fill the prefix
    # of each row that corresponds to real tokens.
    input_ids = np.zeros((len(id_sequences), max_seq_length), dtype=np.int32)
    input_mask = np.zeros_like(input_ids)
    segment_ids = np.zeros_like(input_ids)
    for row, ids in enumerate(id_sequences):
        input_ids[row, :len(ids)] = ids
        input_mask[row, :len(ids)] = 1

    return input_ids, input_mask, segment_ids

In [185]:
# Fixed length of every BERT input row (the [CLS] and [SEP] tokens count towards it).
max_seq_length = 128
# Encode the cleaned docstrings into the three arrays BERT expects.
train_word_ids, train_input_mask, train_segment_ids = generate_bert_input(train_df.docstring, max_seq_length)

# Model definition

In [187]:
# Number of units of the Dense layer applied on top of the description branch.
dense_units = 256

### Description branch

In [188]:
# The three symbolic int32 inputs of the description branch, one row of
# `max_seq_length` values each. They are created in the same order as they
# are unpacked: word ids, attention mask, segment ids.
input_word_ids, input_mask, segment_ids = (
    tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name=layer_name)
    for layer_name in ("input_word_ids", "input_mask", "segment_ids")
)

# Dense projection applied to the description branch.
desc_dense = tf.keras.layers.Dense(dense_units)

In [None]:
# Run BERT on the three symbolic inputs; the layer returns two outputs, which
# are unpacked here as a pooled output and a sequence output.
# Only the pooled output is used below; `sequence_output` is left unused in this cell.
pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
# Project the pooled output to `dense_units` dimensions.
desc_output = desc_dense(pooled_output)

### Source code branch

In [None]:
# Restore the most recent checkpoint of the source-code model, if one exists.
#
# NOTE(review): `optimizer` is only assigned in a later cell of this notebook,
# and `encoderNetwork` / `decoderNetwork` are not defined anywhere in the visible
# cells. On a fresh kernel this cell raises NameError unless they are created
# before it runs - confirm where they are supposed to come from.
checkpointdir = "java14_model"
chkpoint_prefix = os.path.join(checkpointdir, "saved_model_iter8.release")
# exist_ok avoids the check-then-create race of `os.path.exists` + `os.mkdir`.
os.makedirs(checkpointdir, exist_ok=True)

checkpoint = tf.train.Checkpoint(optimizer = optimizer, encoderNetwork = encoderNetwork, 
                                 decoderNetwork = decoderNetwork)

# `latest_checkpoint` returns None when the directory holds no checkpoint, and
# `restore(None)` does not raise, so the outcome has to be tested explicitly
# instead of relying on an exception.
latest_checkpoint = tf.train.latest_checkpoint(checkpointdir)
try:
    status = checkpoint.restore(latest_checkpoint)
except (tf.errors.OpError, ValueError) as restore_error:
    # Keep the notebook running, but report the real cause instead of hiding it.
    status = None
    print("Could not restore checkpoint {}: {}".format(latest_checkpoint, restore_error))
else:
    if latest_checkpoint is None:
        print("No checkpoint found at {}".format(checkpointdir))
    else:
        print("Checkpoint found at {}".format(latest_checkpoint))

### Branches junction

### Full Model

In [None]:
# Assemble and compile the similarity model.
inputs = [input_word_ids, input_mask, segment_ids]
# TODO: the "Branches junction" section is still empty, so no output tensor is
# wired in yet. The model cannot be used until `outputs` is filled in.
outputs = []

sim_model = tf.keras.Model(inputs=inputs, outputs=outputs)

# `optimizers` was never imported; use the optimizer through the `tf.keras`
# namespace that is already in scope. `learning_rate` replaces the deprecated
# `lr` alias.
# NOTE(review): the `decay` argument is kept unchanged; confirm that the
# installed TensorFlow version still accepts it.
optimizer = tf.keras.optimizers.SGD(learning_rate=0.01, decay=1e-6, momentum=0.9, nesterov=True)
loss_func = tf.keras.losses.CosineSimilarity()

# NOTE(review): 'accuracy' is tracked next to a cosine-similarity loss - confirm
# that this metric is meaningful for the final outputs.
sim_model.compile(loss=loss_func, 
                   optimizer=optimizer, 
                   metrics=['accuracy'])

In [None]:
# Print the layer-by-layer overview of the compiled model.
sim_model.summary()

# Model Training