In [1]:
import re
import time
import string
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_hub as tfh
from bert.tokenization import FullTokenizer
from gensim.models import KeyedVectors as word2vec
from sklearn.model_selection import train_test_split

# Data acquisition

In [2]:
LANGUAGE = "java" #"python"
DATA_PATH = "../../Data/code2desc"
DATA_FOLDER = f"{LANGUAGE}/short"
TRAIN_FILE  = f"{LANGUAGE}_train_0.jsonl"
TEST_FILE   = f"{LANGUAGE}_test_0.jsonl"
VALID_FILE  = f"{LANGUAGE}_valid_0.jsonl"

In [3]:
# acquire tokenized source code and plain docstrings.
# BERT uses its own 'FullTokenizer' for inputs.
use_cols = ["code_tokens", "docstring"]
train_df = pd.read_json(f"{DATA_PATH}/{DATA_FOLDER}/{TRAIN_FILE}", lines=True)[use_cols]

In [4]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   code_tokens  30000 non-null  object
 1   docstring    30000 non-null  object
dtypes: object(2)
memory usage: 468.9+ KB


In [5]:
train_df.head()

Unnamed: 0,code_tokens,docstring
0,"[protected, final, void, bindIndexed, (, Confi...",Bind indexed elements to the supplied collecti...
1,"[public, void, setServletRegistrationBeans, (,...",Set {@link ServletRegistrationBean}s that the ...
2,"[public, void, addServletRegistrationBeans, (,...",Add {@link ServletRegistrationBean}s for the f...
3,"[public, void, setServletNames, (, Collection,...",Set servlet names that the filter will be regi...
4,"[public, void, addServletNames, (, String, ......",Add servlet names for the filter.\n@param serv...


This TF Hub model uses the implementation of BERT from the TensorFlow Models repository on GitHub at <a href="https://github.com/tensorflow/models/tree/master/official/nlp/bert">tensorflow/models/official/nlp/bert</a>. It uses L=12 hidden layers (i.e., Transformer blocks), a hidden size of H=768, and A=12 attention heads.

This model has been pre-trained for English on the Wikipedia and BooksCorpus using the code published on GitHub. Inputs have been "uncased", meaning that the text has been lower-cased before tokenization into word pieces, and any accent markers have been stripped. For training, random input masking has been applied independently to word pieces (as in the original BERT paper).

All parameters in the module are trainable, and fine-tuning all parameters is the recommended practice.

### Descriptions embeddings

In [6]:
model_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1"
bert_layer = tfh.KerasLayer(model_url, trainable=False)

In [7]:
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = FullTokenizer(vocab_file, do_lower_case)

### Source code embeddings

In [8]:
EMBEDDINGS_FOLDER = "source-code-embeddings"
TOKEN_EMBEDDINGS  = "token_vecs.txt"
TARGET_EMBEDDINGS = "target_vecs.txt"

vectors_text_path = f'{EMBEDDINGS_FOLDER}/{TOKEN_EMBEDDINGS}'
model = word2vec.load_word2vec_format(vectors_text_path, binary=False)

# Data preprocessing

In [9]:
def cleaning(text):
    '''Performs cleaning of text of unwanted symbols, 
    excessive spaces and transfers to lower-case
    '''
#     punct_regxp = re.compile(f'([{string.punctuation}])')
#     text = re.sub(punct_regxp, r" \1 ", text)
    text = re.sub(r'\s+', " ", text)
    
    text = ''.join(character for character in text if character in string.printable)
    text = text.lower().strip()

    return text

In [10]:
train_df.docstring = train_df.docstring.apply(cleaning)

In [11]:
def generate_desc_input(text, max_seq_length):

    tokenized_text = [["[CLS]"] + tokenizer.tokenize(seq)[:max_seq_length-2] + ["[SEP]"] for seq in text]
    input_ids   = [tokenizer.convert_tokens_to_ids(tokens_seq) for tokens_seq in tokenized_text]
    input_mask  = [[1] * len(input_seq) for input_seq in input_ids]
    segment_ids = [[0] * max_seq_length for _ in range(len(input_ids))]
    input_ids = tf.keras.preprocessing.sequence.pad_sequences(input_ids, maxlen=max_seq_length, padding='post')
    input_mask = tf.keras.preprocessing.sequence.pad_sequences(input_mask, maxlen=max_seq_length, padding='post')
    segment_ids = tf.keras.preprocessing.sequence.pad_sequences(segment_ids, maxlen=max_seq_length, padding='post')

    return input_ids, input_mask, segment_ids

In [12]:
desc_max_seq_length = 256
desc_word_ids, desc_input_mask, desc_segment_ids = generate_desc_input(train_df.docstring, desc_max_seq_length)

In [13]:
def generate_sc_input(sc_inputs, emb_model, max_seq_length):
    
    def word_to_index(word):
        word_val = emb_model.vocab.get(word, None)
        word_index = word_val.index if word_val else None
        return word_index
    
    input_ids = [[word_to_index(word) for word in sc_input[:max_seq_length] if word in emb_model.vocab.keys()] \
             for sc_input in sc_inputs]
    input_ids = tf.keras.preprocessing.sequence.pad_sequences(input_ids, 
                                                        dtype='int32', 
                                                        maxlen=max_seq_length, 
                                                        padding='post')
    return input_ids

In [14]:
sc_max_seq_length = 256
sc_ids = generate_sc_input(train_df.code_tokens, model, sc_max_seq_length)

# Model definition

In [15]:
dense_units = 256

### Description branch

In [16]:
##### Migrated into `train_step` function #####


# input_word_ids = tf.keras.layers.Input(shape=(desc_max_seq_length,), 
#                                        dtype=tf.int32,
#                                        name="desc_input_word_ids")
# input_mask  = tf.keras.layers.Input(shape=(desc_max_seq_length,), 
#                                    dtype=tf.int32,
#                                    name="desc_input_mask")
# segment_ids = tf.keras.layers.Input(shape=(desc_max_seq_length,), 
#                                     dtype=tf.int32,
#                                     name="desc_segment_ids")

desc_dense = tf.keras.layers.Dense(dense_units, activation='sigmoid', name="desc_dense")

In [17]:
# @tf.function
def desc_propagate(input_word_ids, input_mask, segment_ids):
    pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    desc_output = desc_dense(pooled_output)
    return desc_output

In [18]:
def desc_get_trainable_parameters():
    tr_vars = desc_dense.trainable_variables
    return tr_vars

### Source code branch

In [19]:
sc_lstm_units = 256
sc_model = 'convolutional' # 'lstm'
conv_kernel_sizes = [2,3,5]
conv_n_filters = 100

In [20]:
##### Migrated into `train_step` function #####

# input_sc_ids = tf.keras.layers.Input(shape=(sc_max_seq_length,), 
#                                        dtype=tf.int32,
#                                        name="sc_input_ids")

sc_embedding = tf.keras.layers.Embedding(len(model.vocab),
                                         model.vector_size, 
                                         weights=[model.vectors],
                                         mask_zero=True,
                                         trainable=True,
                                         name="sc_embedding") # (vocab_size, vec_size) (1294891, 128)

if sc_model == 'convolutional':
    sc_convs = []
    sc_max_pools = []
    for kernel_size in conv_kernel_sizes:
        sc_convs.append(tf.keras.layers.Conv1D(conv_n_filters, kernel_size, activation='relu', name=f'conv_{kernel_size}'))
        sc_max_pools.append(tf.keras.layers.MaxPooling1D(sc_max_seq_length - kernel_size + 1, 1, name=f'max_pool_{kernel_size}'))
elif sc_model == 'lstm':
    sc_lstm = tf.keras.layers.LSTM(sc_lstm_units, name="sc_lstm")

sc_dense = tf.keras.layers.Dense(dense_units, activation='sigmoid', name="desc_dense")

In [21]:
# @tf.function
def sc_propagate(input_sc_ids):
    sc_embedded_input = sc_embedding(input_sc_ids) # (batch_size, sc_max_seq_length, emb_vec_size)
    if sc_model == 'convolutional':
        conv_outputs = []
        for sc_conv, sc_max_pool in zip(sc_convs, sc_max_pools):
            sc_conv_out = sc_conv(sc_embedded_input) 
            conv_outputs.append(sc_max_pool(sc_conv_out))
        sc_output = tf.concat(conv_outputs, 2) # (batch_size, 1, n_convs * conv_n_filters)
        sc_output = tf.reshape(sc_output, [-1, len(conv_kernel_sizes) * conv_n_filters]) # (batch_size, n_convs * conv_n_filters)
    elif sc_model == 'lstm':
        sc_output = sc_lstm(sc_embedded_input) #  (batch_size, sc_lstm_units)
    sc_output = sc_dense(sc_output) # (batch_size, dense_units)
    return sc_output

In [22]:
def sc_get_trainable_parameters():
    tr_vars = sc_dense.trainable_variables + sc_embedding.trainable_variables
    if sc_model == 'convolutional':
        for sc_conv in sc_convs:
            tr_vars += sc_conv.trainable_variables
    elif sc_model == 'lstm':
        tr_vars += sc_lstm.trainable_variables
    return tr_vars

### Branches junction

In [23]:
# @tf.function
def compute_similarity(desc_output, sc_output):
    norm_desc = tf.nn.l2_normalize(desc_output, axis=1, name="desc_output_norm")        
    norm_sc   = tf.nn.l2_normalize(sc_output, axis=1, name="sc_output_norm")
    cos_similarity = tf.reduce_sum(tf.multiply(norm_desc, norm_sc, name="b_outputs_dot"), 
                                   axis=1, 
                                   name="cos_similarity")
    return cos_similarity

### Full Model

In [24]:
##### Redundant #####

# inputs = [input_word_ids, input_mask, segment_ids, input_sc_ids]
# outputs = cos_similarity

# sim_model = tf.keras.Model(inputs=inputs, outputs=outputs)

optimizer = tf.optimizers.SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
loss_func = tf.keras.losses.BinaryCrossentropy()

# sim_model.compile(loss=loss_func, optimizer=optimizer, metrics=['accuracy'])

In [25]:
# sim_model.summary()

# Model Training

In [26]:
def loss_function(y_true, y_pred):

    loss = loss_func(y_true, y_pred)
    return tf.reduce_mean(loss)

def negative_sampling(desc_output, sc_output):
    neg_probs = tf.linalg.set_diag(tf.fill([batch_size, batch_size], 0.5),[0]*batch_size)
    neg_ids = tf.random.categorical(neg_probs, n_negatives)

    neg_desc = tf.reshape(tf.gather(desc_output, neg_ids), [-1, dense_units])
    neg_sc = tf.reshape(tf.gather(sc_output, [[i]*n_negatives for i in range(batch_size)]), [-1, dense_units])

    desc_output = tf.concat([desc_output,neg_desc], axis=0)
    sc_output = tf.concat([sc_output,neg_sc], axis=0)
    
    return desc_output, sc_output

@tf.function
def train_step(input_word_ids, input_mask, segment_ids, input_sc_ids, batch_size, n_negatives):

    with tf.GradientTape() as tape:
        
        desc_output = desc_propagate(input_word_ids, input_mask, segment_ids)
        sc_output = sc_propagate(input_sc_ids)
        
        desc_output, sc_output = negative_sampling(desc_output, sc_output)
        cos_similarity = compute_similarity(desc_output, sc_output)
        labels = [1] * batch_size + [0] * (batch_size * n_negatives)
        loss = loss_function(labels, cos_similarity)
    # Adjust the parameters of the model using the computed gradients
    variables = desc_get_trainable_parameters() + sc_get_trainable_parameters()
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))

    return loss

def validation_step(input_word_ids, input_mask, segment_ids, input_sc_ids, batch_size, n_negatives):

    desc_output = desc_propagate(input_word_ids, input_mask, segment_ids)
    sc_output = sc_propagate(input_sc_ids)

    desc_output, sc_output = negative_sampling(desc_output, sc_output)
    cos_similarity = compute_similarity(desc_output, sc_output)
    labels = [1] * batch_size + [0] * (batch_size * n_negatives)
    loss = loss_function(labels, cos_similarity)

    return loss

In [27]:
splitted_data = train_test_split(desc_word_ids, desc_input_mask, desc_segment_ids, sc_ids)
train_desc_word_ids, test_desc_word_ids = splitted_data[:2]
train_desc_input_mask, test_desc_input_mask = splitted_data[2:4]
train_desc_segment_ids, test_desc_segment_ids = splitted_data[4:6]
train_sc_ids, test_sc_ids = splitted_data[6:8]

In [28]:
batch_size = 256
train_samples = len(train_desc_word_ids)
valid_samples = len(test_desc_word_ids)
train_steps_per_epoch = train_samples // batch_size
valid_steps_per_epoch = valid_samples // batch_size
epochs = 5
n_negatives = 80

In [29]:
train_data = tf.data.Dataset.from_tensor_slices((train_desc_word_ids, train_desc_input_mask, train_desc_segment_ids, train_sc_ids)).shuffle(len(train_desc_word_ids), reshuffle_each_iteration=True).batch(batch_size, drop_remainder=True)
valid_data = tf.data.Dataset.from_tensor_slices((test_desc_word_ids, test_desc_input_mask, test_desc_segment_ids, test_sc_ids)).shuffle(len(test_desc_word_ids), reshuffle_each_iteration=True).batch(batch_size, drop_remainder=True)

In [None]:
# Start the training
batch_loss_frequency = train_steps_per_epoch // 5

for epoch in range(epochs):
    start = time.time()

    total_loss = 0.0
    total_val_loss = 0.0

    # Perform training steps on the training data batches
    for (batch, (bdesc_word_ids, bdesc_input_mask, bdesc_segment_ids, bsc_ids)) in enumerate(train_data.take(train_steps_per_epoch)):
        batch_loss = train_step(bdesc_word_ids, bdesc_input_mask, bdesc_segment_ids, bsc_ids, batch_size, n_negatives)
        total_loss += batch_loss
        if batch % batch_loss_frequency == 0:
            print(f"Epoch {epoch+1} Batch {batch} Loss {batch_loss.numpy()}")
            
    for (batch, (vbdesc_word_ids, vbdesc_input_mask, vbdesc_segment_ids, vbsc_ids)) in enumerate(train_data.take(train_steps_per_epoch)):
        vbatch_loss = validation_step(vbdesc_word_ids, vbdesc_input_mask, vbdesc_segment_ids, vbsc_ids, batch_size, n_negatives)
        total_val_loss += vbatch_loss
        
    print(f"Epoch {epoch+1} Train loss {total_loss/train_steps_per_epoch} Val loss {total_val_loss/valid_steps_per_epoch} Time spent {time.time()-start} sec")

Epoch 1 Batch 0 Loss 2.415342092514038
Epoch 1 Batch 17 Loss 0.8460800051689148
Epoch 1 Batch 34 Loss 0.3809548020362854
Epoch 1 Batch 51 Loss 0.23988837003707886
Epoch 1 Batch 68 Loss 0.1757964789867401
Epoch 1 Batch 85 Loss 0.15545348823070526
Epoch 1 Train loss 0.5574725270271301 Val loss 0.42419907450675964 Time spent 1250.1830506324768 sec
Epoch 2 Batch 0 Loss 0.14253267645835876
Epoch 2 Batch 17 Loss 0.1228240355849266
Epoch 2 Batch 34 Loss 0.10881292074918747
Epoch 2 Batch 51 Loss 0.10960756987333298
Epoch 2 Batch 68 Loss 0.10081747174263
Epoch 2 Batch 85 Loss 0.10182619839906693
Epoch 2 Train loss 0.11126609146595001 Val loss 0.2812860310077667 Time spent 1240.616108417511 sec
Epoch 3 Batch 0 Loss 0.09414134174585342
Epoch 3 Batch 17 Loss 0.08793734014034271
Epoch 3 Batch 34 Loss 0.08356796205043793
Epoch 3 Batch 51 Loss 0.08182909339666367
Epoch 3 Batch 68 Loss 0.0843687430024147
Epoch 3 Batch 85 Loss 0.08056549727916718
Epoch 3 Train loss 0.08676941692829132 Val loss 0.244901

# Evaluation

In [None]:
def evaluate(input_word_ids, input_mask, segment_ids, input_sc_ids):
    desc_output = desc_propagate(input_word_ids, input_mask, segment_ids)
    sc_output = sc_propagate(input_sc_ids)

    desc_output, sc_output = negative_sampling(desc_output, sc_output)

    neg_probs = tf.linalg.set_diag(tf.fill([batch_size, batch_size], 0.5),[0]*batch_size)
    neg_ids = tf.random.categorical(neg_probs, n_negatives)

    neg_desc = tf.reshape(tf.gather(desc_output, neg_ids), [-1, dense_units])
    neg_sc = tf.reshape(tf.gather(sc_output, [[i]*n_negatives for i in range(batch_size)]), [-1, dense_units])
    
    desc_output = tf.concat([desc_output,neg_desc], axis=0)
    sc_output = tf.concat([sc_output,neg_sc], axis=0)
    
    cos_similarity = compute_similarity(desc_output, sc_output)
    return cos_similarity

In [None]:
input_word_ids, input_mask, segment_ids, input_sc_ids = list(train_data.take(1).as_numpy_iterator())[0]
evaluate(input_word_ids, input_mask, segment_ids, input_sc_ids)