In [1]:
import re
import string
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_hub as tfh
from bert.tokenization import FullTokenizer
from gensim.models import KeyedVectors as word2vec
from sklearn.model_selection import train_test_split

# Data acquisition

In [2]:
# Corpus to work with; switch to "python" for the Python code/description pairs.
LANGUAGE = "java"
# Root of the code2desc dataset, relative to this notebook.
DATA_PATH = "../../Data/code2desc"
# Sub-folder holding the "short" variant of the selected language.
DATA_FOLDER = f"{LANGUAGE}/short"
# One JSON-lines shard per split.
TRAIN_FILE, TEST_FILE, VALID_FILE = (
    f"{LANGUAGE}_train_0.jsonl",
    f"{LANGUAGE}_test_0.jsonl",
    f"{LANGUAGE}_valid_0.jsonl",
)

In [3]:
# Load the pre-tokenized source code together with the plain docstrings.
# Docstrings are kept as raw text because BERT tokenizes its inputs with
# its own 'FullTokenizer'.
use_cols = ["code_tokens", "docstring"]
train_df = pd.read_json(
    f"{DATA_PATH}/{DATA_FOLDER}/{TRAIN_FILE}",
    lines=True,
)[use_cols]

In [4]:
# Sanity check: column dtypes and non-null counts of the loaded training frame.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   code_tokens  30000 non-null  object
 1   docstring    30000 non-null  object
dtypes: object(2)
memory usage: 468.9+ KB


In [5]:
# Preview the first rows of the training frame.
train_df.head()

Unnamed: 0,code_tokens,docstring
0,"[protected, final, void, bindIndexed, (, Confi...",Bind indexed elements to the supplied collecti...
1,"[public, void, setServletRegistrationBeans, (,...",Set {@link ServletRegistrationBean}s that the ...
2,"[public, void, addServletRegistrationBeans, (,...",Add {@link ServletRegistrationBean}s for the f...
3,"[public, void, setServletNames, (, Collection,...",Set servlet names that the filter will be regi...
4,"[public, void, addServletNames, (, String, ......",Add servlet names for the filter.\n@param serv...


This TF Hub model uses the implementation of BERT from the TensorFlow Models repository on GitHub at <a href="https://github.com/tensorflow/models/tree/master/official/nlp/bert">tensorflow/models/official/nlp/bert</a>. It uses L=12 hidden layers (i.e., Transformer blocks), a hidden size of H=768, and A=12 attention heads.

This model has been pre-trained for English on Wikipedia and BooksCorpus using the code published on GitHub. Inputs have been "uncased", meaning that the text has been lower-cased before tokenization into word pieces, and any accent markers have been stripped. For training, random input masking has been applied independently to word pieces (as in the original BERT paper).

All parameters in the module are trainable, and fine-tuning all parameters is the recommended practice.

### Description embeddings

In [8]:
# TF Hub handle of the uncased English BERT (L=12, H=768, A=12).
model_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1"
# trainable=True: the BERT weights are fine-tuned together with the rest of the model.
bert_layer = tfh.KerasLayer(model_url, trainable=True)

In [None]:
# Read the vocabulary file path and the lower-casing flag stored inside the hub
# module, then build the word-piece tokenizer from them.
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)

### Source code embeddings

In [None]:
# Location of the source-code embedding files.
EMBEDDINGS_FOLDER, TOKEN_EMBEDDINGS, TARGET_EMBEDDINGS = (
    "source-code-embeddings",
    "token_vecs.txt",
    "target_vecs.txt",
)

# Only the token vectors are loaded here; they are read as a text-format
# (binary=False) word2vec file.
vectors_text_path = f'{EMBEDDINGS_FOLDER}/{TOKEN_EMBEDDINGS}'
model = word2vec.load_word2vec_format(fname=vectors_text_path, binary=False)

# Data preprocessing

In [None]:
# Characters kept by `cleaning`; a frozenset gives constant-time membership checks.
_PRINTABLE_CHARS = frozenset(string.printable)


def cleaning(text):
    '''Normalizes a piece of text: removes unwanted symbols and
    excessive spaces and converts it to lower-case.

    Steps, in order:
      1. every run of whitespace is collapsed into a single space;
      2. characters outside ``string.printable`` are dropped;
      3. runs of spaces are collapsed again, because step 2 can leave two
         spaces side by side when a dropped character sat between them;
      4. the text is lower-cased and stripped of leading/trailing whitespace.

    Args:
        text: the input string.

    Returns:
        The cleaned string.
    '''
    text = re.sub(r'\s+', " ", text)
    text = ''.join(character for character in text if character in _PRINTABLE_CHARS)
    # Removing a non-printable character between two spaces leaves a double space.
    text = re.sub(r' {2,}', " ", text)

    return text.lower().strip()

In [None]:
# Overwrites the docstring column with its cleaned version; re-running this
# cell applies `cleaning` to the already-cleaned text again.
train_df["docstring"] = train_df["docstring"].apply(cleaning)

In [None]:
def generate_bert_input(text, max_seq_length):
    '''Converts raw strings into the three padded integer arrays fed to BERT.

    Every string is tokenized with the module-level ``tokenizer``, truncated so
    that it still fits into ``max_seq_length`` once "[CLS]" and "[SEP]" are
    added, and post-padded with zeros up to ``max_seq_length``.

    Args:
        text: iterable of strings.
        max_seq_length: length of every row in the returned arrays.

    Returns:
        Tuple ``(input_ids, input_mask, segment_ids)`` as produced by
        ``pad_sequences``: the mask holds 1 for real tokens and 0 for padding,
        the segment ids are all zeros.
    '''
    pad = tf.keras.preprocessing.sequence.pad_sequences
    body_budget = max_seq_length - 2  # room left after "[CLS]" and "[SEP]"

    ids_rows, mask_rows, segment_rows = [], [], []
    for seq in text:
        pieces = ["[CLS]"] + tokenizer.tokenize(seq)[:body_budget] + ["[SEP]"]
        piece_ids = tokenizer.convert_tokens_to_ids(pieces)
        ids_rows.append(piece_ids)
        mask_rows.append([1] * len(piece_ids))
        segment_rows.append([0] * max_seq_length)

    input_ids = pad(ids_rows, maxlen=max_seq_length, padding='post')
    input_mask = pad(mask_rows, maxlen=max_seq_length, padding='post')
    segment_ids = pad(segment_rows, maxlen=max_seq_length, padding='post')

    return input_ids, input_mask, segment_ids

In [None]:
# Length (in word pieces, including "[CLS]"/"[SEP]") of every description row.
desc_max_seq_length = 256
desc_word_ids, desc_input_mask, desc_segment_ids = generate_bert_input(
    train_df.docstring,
    desc_max_seq_length,
)

# Model definition

In [None]:
# Output width of the dense projection at the end of each branch
# (used by both `desc_dense` and `sc_dense`).
dense_units = 128

### Description branch

In [None]:
# The three int32 inputs of the description branch, each desc_max_seq_length long.
input_word_ids, input_mask, segment_ids = (
    tf.keras.layers.Input(shape=(desc_max_seq_length,), dtype=tf.int32, name=input_name)
    for input_name in ("desc_input_word_ids", "desc_input_mask", "desc_segment_ids")
)

# Projection applied to the BERT output of the description branch.
desc_dense = tf.keras.layers.Dense(units=dense_units, name="desc_dense")

In [None]:
# Run the three description inputs through the BERT layer. It returns two
# tensors, bound here as pooled_output and sequence_output; only pooled_output
# is used in the cells shown here.
pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
# Project the pooled BERT output to dense_units dimensions.
desc_output = desc_dense(pooled_output)

### Source code branch

In [None]:
# Maximum number of source-code tokens considered per snippet; also the padded
# sequence length of the source-code input.
max_sc_seq_length = 256

In [None]:
# NOTE(review): this cell repeats the earlier "Source code embeddings" cell
# (same constants, same file) and loads the vectors a second time - consider
# keeping only one of the two.
# Location of the source-code embedding files.
EMBEDDINGS_FOLDER = "source-code-embeddings"
TOKEN_EMBEDDINGS  = "token_vecs.txt"
TARGET_EMBEDDINGS = "target_vecs.txt"

# Token vectors are read as a text-format (binary=False) word2vec file.
vectors_text_path = f'{EMBEDDINGS_FOLDER}/{TOKEN_EMBEDDINGS}' # or: `models/java14_model/tokens.txt'
model = word2vec.load_word2vec_format(vectors_text_path, binary=False)

In [None]:
def vectorize_inputs(inputs, max_seq_length):
    '''Maps token sequences to zero-padded arrays of embedding vectors.

    Each sequence is cut to its first ``max_seq_length`` tokens, tokens that
    are missing from the module-level word2vec ``model`` are dropped, and the
    remaining vectors are post-padded with zeros up to ``max_seq_length``.

    Args:
        inputs: iterable of token sequences.
        max_seq_length: number of leading tokens considered per sequence and
            the padded length of every returned row.

    Returns:
        Tuple ``(res, res_lengths)``:
          * ``res`` - float32 array produced by ``pad_sequences``;
          * ``res_lengths`` - list of floats with the number of vectors kept
            per sequence, floored at 1.0 so that dividing the (all-zero) sum
            of an empty sequence by its length yields zeros instead of NaN.
    '''
    vocab = model.vocab
    res = [[model.get_vector(token) for token in sc[:max_seq_length] if token in vocab]
           for sc in inputs]
    res_lengths = [float(max(len(res_vec), 1)) for res_vec in res]
    # Pad to the `max_seq_length` argument (not the global `max_sc_seq_length`).
    # A scalar pad value broadcasts over the embedding dimension.
    res = tf.keras.preprocessing.sequence.pad_sequences(res,
                                                        value=0.0,
                                                        dtype='float32',
                                                        maxlen=max_seq_length,
                                                        padding='post')
    return res, res_lengths

In [10]:
# Embed the code tokens of every training example; sc_lengths holds the
# per-example vector counts used later to average the embeddings.
sc_ids, sc_lengths = vectorize_inputs(train_df.code_tokens, max_sc_seq_length)

In [21]:
# Source-code branch inputs: the padded token embeddings and, per example,
# the number of real (non-padding) vectors.
input_sc_ids = tf.keras.layers.Input(
    shape=(max_sc_seq_length, model.vector_size),
    dtype=tf.float32,
    name="sc_input_ids",
)
input_sc_lengths = tf.keras.layers.Input(
    shape=(1,),
    dtype=tf.float32,
    name="sc_input_lengths",
)
# Projection applied to the averaged embedding of the source-code branch.
sc_dense = tf.keras.layers.Dense(units=dense_units, name="sc_dense")

In [22]:
# Mean of the token embeddings: padding rows are zero, so the sum over the
# sequence axis divided by the real length is the average over real tokens.
embedding_sum = tf.reduce_sum(input_sc_ids, axis=1)  # (bs, m_sc_seq_l, vec_sz) --> (bs, vec_sz)
lengths_column = tf.reshape(input_sc_lengths, (-1, 1))  # (bs, 1)
reduced_embeddings = tf.divide(embedding_sum, lengths_column)  # (bs, vec_sz) mean
sc_output = sc_dense(reduced_embeddings)

### Branches junction

In [None]:
# Cosine similarity between the description and source-code projections,
# computed per example: both tensors are (batch, dense_units), so the
# normalization and the dot product run over the last (feature) axis,
# not over the batch axis.
norm_desc = tf.nn.l2_normalize(desc_output, axis=-1)
norm_sc = tf.nn.l2_normalize(sc_output, axis=-1)
# keepdims=True keeps one similarity value per example, shape (batch, 1).
cos_similarity = tf.reduce_sum(tf.multiply(norm_desc, norm_sc), axis=-1, keepdims=True)

### Full Model

In [None]:
# Assemble the two-branch model: description inputs and source-code inputs in,
# similarity score out.
inputs = [input_word_ids, input_mask, segment_ids, input_sc_ids, input_sc_lengths]
outputs = cos_similarity

sim_model = tf.keras.Model(inputs=inputs, outputs=outputs)

# The optimizer is referenced through `tf.keras` because the import cell does
# not import a bare `optimizers` name; `learning_rate` replaces the deprecated
# `lr` keyword.
# NOTE(review): the `decay` keyword is not accepted by every TF release -
# confirm against the installed version.
optimizer = tf.keras.optimizers.SGD(learning_rate=0.01, decay=1e-6, momentum=0.9, nesterov=True)
# NOTE(review): BinaryCrossentropy expects probabilities in [0, 1], whereas a
# cosine similarity lies in [-1, 1] - confirm this loss is intended or rescale
# the model output.
loss_func = tf.keras.losses.BinaryCrossentropy()

sim_model.compile(loss=loss_func,
                  optimizer=optimizer,
                  metrics=['accuracy'])

In [None]:
# Print the layer-by-layer overview of the assembled model.
sim_model.summary()

# Model Training

In [None]:
# train_test_split takes the arrays as separate positional arguments and
# returns a (train, test) pair for each of them, in the order given.
# Passing them wrapped in one tuple would split the tuple itself.
# random_state is fixed so the split is reproducible across kernel restarts.
splitted_data = train_test_split(
    desc_word_ids, desc_input_mask, desc_segment_ids, sc_ids, sc_lengths,
    random_state=42,
)
(train_desc_word_ids, test_desc_word_ids,
 train_desc_input_mask, test_desc_input_mask,
 train_desc_segment_ids, test_desc_segment_ids,
 train_sc_ids, test_sc_ids,
 train_sc_lengths, test_sc_lengths) = splitted_data