In [1]:
!pip install -q tokenizers
!pip install -q transformers

[K     |████████████████████████████████| 7.6 MB 11.2 MB/s 
[K     |████████████████████████████████| 5.8 MB 9.8 MB/s 
[K     |████████████████████████████████| 182 kB 50.8 MB/s 
[?25h

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Input, Dense, TimeDistributed, Activation, RepeatVector, Bidirectional, Dropout, LSTM, Embedding, Masking
from tokenizers import Tokenizer, decoders, models, normalizers, pre_tokenizers, trainers, processors
from sklearn.model_selection import train_test_split
import pickle

Mounted at /content/drive


In [None]:
url = 'https://raw.githubusercontent.com/microsoft/CodeXGLUE/main/Text-Code/text-to-code/dataset/concode/train.json'
train = pd.read_json(url,  dtype='dict',lines=True)
train=train[['nl','code']]
x=train['nl'].to_numpy()
y=train['code'].to_numpy()

In [None]:
nl_tokenizer = Tokenizer(models.WordPiece(unk_token='[UNK]'))
nl_tokenizer.normalizer = normalizers.BertNormalizer(clean_text = False)
nl_tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit()
nl_tokenizer.decoder = decoders.WordPiece(prefix='##')
trainer = trainers.WordPieceTrainer(
    vocab_size=8000,
    show_progress=True,
    special_tokens=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
    continuing_subword_prefix='##'
)
nl_tokenizer.train_from_iterator(x, trainer=trainer)
nl_tokenizer.post_processor = processors.BertProcessing(('[CLS]',nl_tokenizer.token_to_id('[CLS]')),('[SEP]',nl_tokenizer.token_to_id('[SEP]')))

In [None]:
code_tokenizer = Tokenizer(models.WordPiece(unk_token='[UNK]'))
code_tokenizer.normalizer = normalizers.BertNormalizer(clean_text = False)
code_tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit()
code_tokenizer.decoder = decoders.WordPiece(prefix='##')
trainer = trainers.WordPieceTrainer(
    vocab_size=8000,
    show_progress=True,
    special_tokens=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
    continuing_subword_prefix='##'
)
code_tokenizer.train_from_iterator(y, trainer=trainer)
code_tokenizer.post_processor = processors.BertProcessing(('[CLS]',code_tokenizer.token_to_id('[CLS]')),('[SEP]',code_tokenizer.token_to_id('[SEP]')))

In [None]:
tokenized_data=np.array(list(map(lambda item: item.ids, nl_tokenizer.encode_batch(x))))

nl_tokenizer.enable_padding()
code_tokenizer.enable_padding()
x=x[np.array([len(i) for i in tokenized_data])<150]
y=y[np.array([len(i) for i in tokenized_data])<150]
x_tokenized=np.array(list(map(lambda t: t.ids, nl_tokenizer.encode_batch(x))))
y_tokenized=np.array(list(map(lambda t: t.ids, code_tokenizer.encode_batch(y))))

  tokenized_data=np.array(list(map(lambda item: item.ids, nl_tokenizer.encode_batch(x))))


In [None]:
X_train, X_test, y_train, y_test = train_test_split(x_tokenized, y_tokenized, test_size=0.2, random_state=42)
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

path='/Data/java_train'
np.save(f'{path}/X_train.npy', X_train)
np.save(f'{path}/y_train.npy', y_train)
np.save(f'{path}/X_test.npy', X_test)
np.save(f'{path}/y_test.npy', y_test)
np.save(f'{path}/X_val.npy', X_val)
np.save(f'{path}/y_val.npy', y_val)

input_vocab_size=nl_tokenizer.get_vocab_size()
output_vocab_size=code_tokenizer.get_vocab_size()
input_vocab_size,output_vocab_size

(8000, 8000)

In [None]:
nl_tokenizer.save('/Data/java_train/nl_tokenizer.json')
code_tokenizer.save('/Data/java_train/code_tokenizer.json')

In [3]:
input_length=X_train.shape[1]
output_length=y_train.shape[1]
model = Sequential([
Masking(mask_value=0,input_shape=(input_length, 1)),
Bidirectional(LSTM(input_length, return_sequences=False)),
RepeatVector(output_length),
LSTM(512, return_sequences=True),
LSTM(1024, return_sequences=True),
Dropout(0.2),
TimeDistributed(Dense(1024)),
Dropout(0.2),
TimeDistributed(Dense(units=output_vocab_size))
])
model.compile()
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 masking (Masking)           (None, 149, 1)            0         
                                                                 
 bidirectional (Bidirectiona  (None, 298)              179992    
 l)                                                              
                                                                 
 repeat_vector (RepeatVector  (None, 153, 298)         0         
 )                                                               
                                                                 
 lstm_1 (LSTM)               (None, 153, 512)          1660928   
                                                                 
 lstm_2 (LSTM)               (None, 153, 1024)         6295552   
                                                                 
 dropout (Dropout)           (None, 153, 1024)         0

In [None]:
def loss_function(x, y):
  cross_entropy = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')
  loss = cross_entropy(y_true=y, y_pred=x)
  mask = tf.logical_not(tf.math.equal(y,0))
  mask = tf.cast(mask, dtype=loss.dtype)  
  loss = mask* loss
  loss = tf.reduce_mean(loss)
  return loss

buffer_size=32000
batch_size=240
Y_len = np.count_nonzero(y_train, axis=1)
train_ds = tf.data.Dataset.from_tensor_slices((X_train, y_train, Y_len)).shuffle(buffer_size=buffer_size).batch(batch_size=batch_size)
valid_ds = tf.data.Dataset.from_tensor_slices((X_val, y_val, np.count_nonzero(y_val, axis=1))).shuffle(buffer_size=buffer_size).batch(batch_size=batch_size)
optimizer = tf.keras.optimizers.Adam()

In [None]:
for epoch in range(5000):
  avg_loss = 0
  training_step = 0
  for x_train2, y_train2, data_len in train_ds:
      with tf.GradientTape() as tape:
          loss = loss_function(model(x_train2), y_train2)
      grads = tape.gradient(loss, model.trainable_variables)
      optimizer.apply_gradients(grads_and_vars=zip(grads, model.trainable_variables))
      avg_loss += loss
      training_step += 1
  avg_loss /= training_step
  
  if (epoch + 1) % 10 == 0:
      avg_val_loss = 0
      val_training_step = 0
      for x_valid2, y_valid2, data_len2 in valid_ds:
          val_loss = loss_function(model(x_valid2), y_valid2)
          avg_val_loss += loss
          val_training_step += 1
      avg_val_loss /= val_training_step
      print('val_loss: {:.3f}'.format(avg_val_loss))
      print('Epoch: {:3}, tr_loss: {:.3f}'.format((epoch+1)/100, avg_loss))
      model.save('/Data/java_model.h5')

In [None]:
sample_num=12
y_pred = model.predict(np.expand_dims(X_test[sample_num],axis=0),verbose=0)
y_pred = np.argmax(y_pred, axis=-1) 
print(nl_tokenizer.decode(X_test[sample_num]))
print(code_tokenizer.decode(y_pred[0]))
print(code_tokenizer.decode(y_test[sample_num]))

sets the iteration number. concode_field_sep int n_i concode_elem_sep int tau concode_elem_sep long serialversionuid concode_elem_sep double counter concode_elem_sep int tau_t concode_elem_sep int m concode_elem_sep int n concode_field_sep void setn_t concode_elem_sep int getn_t concode_elem_sep double apply concode_elem_sep double apply concode_elem_sep int getm concode_elem_sep void settau_t concode_elem_sep int getn concode_elem_sep void setn concode_elem_sep int gettau_t concode_elem_sep int gettau concode_elem_sep void setm
void function ( ) { return ( ;. size. arg0 ; } if ( loc0 ; size ; } payload ; } ; ; ) payload ; payload ( put ( arg0 ) ; else else else else else ) else else else else else else else else else else else else else else else else } } } } } } } } } } } } } } } } }
void function ( int arg0 ) { this. tdet = arg0 ; }


In [None]:
sample_num=0
y_pred = model.predict(np.expand_dims(X_train[sample_num],axis=0),verbose=0)
y_pred = np.argmax(y_pred, axis=-1)
print(nl_tokenizer.decode(X_train[sample_num]))
print(code_tokenizer.decode(y_pred[0]))
print(code_tokenizer.decode(y_train[sample_num]))

thid method closes the buffered reader concode_field_sep string log_arcwivemun concode_field_sep bufferedwriter getnewfilewriter concode_elem_septemplatelink getnewstreamreader
void function ( bufferedreader arg0 ) { if ( arg0!= null ) { try ( arg0. arg0 ( ) ; } catch. ioexception ( ) { loc0. loc0 ( ) ) ; } ; }.....ext ( ) ) ) ) ) ) ) ) ) ) ) ) } } } } } } } } }
void function ( bufferedreader arg0 ) { if ( arg0!= null ) { try { arg0. close ( ) ; } catch ( ioexception loc0 ) { loc0. printstacktrace ( ) ; } } }
