<a href="https://colab.research.google.com/github/masies/CRA/blob/main/replication_package/Replication_package_FineTuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# T5 Fine_Tuning

In this notebook we fine-tune different models on the datasets we have already processed.

We start by setting up the environment: connecting Colab to the GCS bucket and configuring everything for the TPU. (This Colab uses the TPU and high-RAM runtime settings.)

In [1]:
# Authenticate this Colab session with Google (presumably required by the
# gcloud/gsutil commands below -- confirm).
from google.colab import auth
auth.authenticate_user()
#@title ## Set Your GCS credential
# Colab form fields: the GCP project id and the GCS bucket name used throughout the notebook.
project_id = 'prova-314912'#@param {type:"string"}
bucket_name = 'gatto_bucket'#@param {type:"string"}

# Select the project for subsequent gcloud commands.
!gcloud config set project {project_id}

# Copy the requirements file from the bucket into the current working directory.
!gsutil cp gs://{bucket_name}/replication_package/requirements/requirements_FineTuning.txt  requirements_FineTuning.txt

# Upgrade pip, install the requirements file, then install/upgrade t5.
# NOTE(review): the last command is unpinned and runs after the requirements
# install, so it may change versions chosen there -- confirm this is intended.
!pip3 install --upgrade pip
!pip install -r /content/requirements_FineTuning.txt
!pip install -qU t5

import functools
import os
import time
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

import tensorflow.compat.v1 as tf
import tensorflow_datasets as tfds

import t5

# Base directory of the run: the Google Cloud Storage bucket chosen above.
BASE_DIR = "gs://" + bucket_name

# An empty bucket name would leave only the bare scheme, which is unusable.
if not BASE_DIR or BASE_DIR == "gs://":
  raise ValueError("You must enter a BASE_DIR.")
ON_CLOUD = True


if ON_CLOUD:
  import tensorflow_gcs_config
  from google.colab import auth
  # Set credentials for GCS reading/writing from Colab and TPU.
  TPU_TOPOLOGY = "2x2"
  try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
    TPU_ADDRESS = tpu.get_master()
    print('Running on TPU:', TPU_ADDRESS)
  except ValueError as err:
    # Raise a regular Exception subclass (not a bare BaseException) so generic
    # `except Exception` handlers and tooling see it, and keep the resolver's
    # original error attached as the cause.
    raise RuntimeError('ERROR: Not connected to a TPU runtime; please see the previous cell in this notebook for instructions!') from err
  auth.authenticate_user()
  tf.config.experimental_connect_to_host(TPU_ADDRESS)
  tensorflow_gcs_config.configure_gcs_from_colab_auth()

# `tf` is tensorflow.compat.v1 here; turn off TF2 behaviour for the rest of the notebook.
tf.disable_v2_behavior()

# Improve logging.
from contextlib import contextmanager
import logging as py_logging

if ON_CLOUD:
  # Stop TensorFlow's logger from propagating records to the root logger
  # (presumably to avoid duplicated lines in Colab -- confirm), and set the
  # root logger to INFO.
  tf.get_logger().propagate = False
  py_logging.root.setLevel('INFO')

@contextmanager
def tf_verbosity_level(level):
  """Temporarily set TensorFlow's logging verbosity to `level`.

  Intended for use as `with tf_verbosity_level(...):`. The verbosity that was
  active on entry is restored on exit, including when the body raises.

  Args:
    level: verbosity value handed to `tf.logging.set_verbosity`.
  """
  og_level = tf.logging.get_verbosity()
  tf.logging.set_verbosity(level)
  try:
    yield
  finally:
    # Always put the previous verbosity back, even if the `with` body failed.
    tf.logging.set_verbosity(og_level)

Updated property [core/project].
Copying gs://code_review_automation/replication_package/requirements/requirements_FineTuning.txt...
/ [1 files][  7.0 KiB/  7.0 KiB]                                                
Operation completed over 1 objects/7.0 KiB.                                      
Collecting huggingface-hub==0.0.8
  Downloading https://files.pythonhosted.org/packages/a1/88/7b1e45720ecf59c6c6737ff332f41c955963090a18e72acbcbeac6b25e86/huggingface_hub-0.0.8-py3-none-any.whl
Collecting mesh-tensorflow==0.1.19
[?25l  Downloading https://files.pythonhosted.org/packages/ce/10/37df0bc87ebf84e1414613176340e3aadc3697d2bd112bf63d3d4b1e848a/mesh_tensorflow-0.1.19-py3-none-any.whl (366kB)
[K     |████████████████████████████████| 368kB 8.0MB/s 
Collecting portalocker==2.0.0
  Downloading https://files.pythonhosted.org/packages/89/a6/3814b7107e0788040870e8825eebf214d72166adf656ba7d4bf14759a06a/portalocker-2.0.0-py2.py3-none-any.whl
Collecting rouge-score==0.0.4
  Downloading https://

Instructions for updating:
non-resource variables are not supported in the long term


We specify the paths and the sizes of all our datasets to later build our tasks.

In [2]:
def _tsv_split_paths(task_dir):
  """Return the train/validation TSV locations of one fine-tuning task folder.

  Args:
    task_dir: folder of the task relative to the fine-tuning dataset root,
      e.g. 'large/code_code'.

  Returns:
    Dict with keys "train" and "validation" mapping to gs:// paths.
  """
  task_root = 'gs://' + bucket_name + '/replication_package/dataset/fine-tuning/' + task_dir
  return {
      "train": task_root + '/train.tsv',
      "validation": task_root + '/val.tsv',
  }


## tasks large dataset
nq_tsv_path_code_code_large = _tsv_split_paths('large/code_code')
num_nq_examples_code_code_large = {"train": 134442, "validation": 16805}

nq_tsv_path_code_comment_large = _tsv_split_paths('large/code_comment')
num_nq_examples_code_comment_large = {"train": 134442, "validation": 16805}

nq_tsv_path_codeANDcomment_code_large = _tsv_split_paths('large/codeANDcomment_code')
num_nq_examples_codeANDcomment_code_large = {"train": 134442, "validation": 16805}

nq_tsv_path_marked_code_large = _tsv_split_paths('large/marked_code')
num_nq_examples_marked_code_large = {"train": 134442, "validation": 16805}


## tasks small dataset v1
nq_tsv_path_code_code_small_v1 = _tsv_split_paths('small/v1/code_code')
num_nq_examples_code_code_small_v1 = {"train": 13671, "validation": 1714}

nq_tsv_path_codeANDcomment_code_small_v1 = _tsv_split_paths('small/v1/codeANDcomment_code')
num_nq_examples_codeANDcomment_code_small_v1 = {"train": 13671, "validation": 1714}

## tasks small dataset v2
nq_tsv_path_code_code_small_v2 = _tsv_split_paths('small/v2/code_code')
num_nq_examples_code_code_small_v2 = {"train": 13671, "validation": 1714}

nq_tsv_path_codeANDcomment_code_small_v2 = _tsv_split_paths('small/v2/codeANDcomment_code')
num_nq_examples_codeANDcomment_code_small_v2 = {"train": 13671, "validation": 1714}

We specify the paths of the previously trained SentencePiece model and vocabulary files in the GCS bucket.

In [3]:
from t5.data import postprocessors as t5_postprocessors
from t5.seqio import Feature,SentencePieceVocabulary

# Locations of the SentencePiece model and vocab files in the bucket.
# They must be the same files that were used for the pre-training phase.
_sentencepiece_dir = 'gs://' + bucket_name + '/replication_package/code_review_model/'
vocab_model_path = _sentencepiece_dir + 'TestModel.model'
vocab_path = _sentencepiece_dir + 'TestModel.vocab'

# Short aliases for the T5 registry and task classes.
TaskRegistry = t5.data.TaskRegistry
TfdsTask = t5.data.TfdsTask


def get_default_vocabulary():
  """Build a SentencePieceVocabulary from `vocab_model_path`.

  The second positional argument is 100.
  NOTE(review): presumably the number of extra sentinel ids -- confirm against
  the SentencePieceVocabulary signature of the installed t5 version.
  """
  return SentencePieceVocabulary(vocab_model_path, 100)


# Feature spec shared by every task below. Each feature gets its own
# vocabulary instance and has EOS appended; "inputs" is marked as not required.
DEFAULT_OUTPUT_FEATURES = dict(
    inputs=Feature(
        vocabulary=get_default_vocabulary(), add_eos=True, required=False),
    targets=Feature(
        vocabulary=get_default_vocabulary(), add_eos=True),
)

# Setting up all the tasks

We will set 8 tasks
- code prediction (large dataset)
- code prediction (small dataset v1)
- code prediction (small dataset v2)
- comment implementation (large dataset)
- comment implementation (small dataset v1)
- comment implementation (small dataset v2)
- code prediction, given marked code (large dataset)
- comment prediction (large dataset)

We will later choose which single task or which mixture to fine-tune on.


## FIRST TASK : CODE to CODE large_dataset
- task name = `code_code`
- task prefix = `code2code: `

In [4]:
def nq_dataset_code_code_large(split, shuffle_files=True):
  """Load one split of the large code-to-code TSV as raw examples.

  Args:
    split: key into `nq_tsv_path_code_code_large` ("train" or "validation").
    shuffle_files: accepted but discarded; each split is a single file.

  Returns:
    A dataset of dicts with string entries "input" and "output", taken from
    the two tab-separated columns of every line.
  """
  # Only one file backs each split, so there is nothing to shuffle.
  del shuffle_files

  # NOTE(review): record_defaults also supplies the per-column default value,
  # so an empty field would presumably become the literal text "string" --
  # confirm this is intended.
  parse_tsv_line = functools.partial(
      tf.io.decode_csv,
      record_defaults=["string","string"],
      field_delim="\t",
      use_quote_delim=False)

  def _name_columns(*columns):
    return dict(zip(["input", "output"], columns))

  text_lines = tf.data.TextLineDataset(nq_tsv_path_code_code_large[split])
  parsed_rows = text_lines.map(
      parse_tsv_line, num_parallel_calls=tf.data.experimental.AUTOTUNE)
  return parsed_rows.map(_name_columns)


# Show two raw rows from each split as a quick check of the TSV parsing.
for banner, split_name in (("A few raw validation examples...", "validation"),
                           ("A few raw training examples...", "train")):
  print(banner)
  for ex in tfds.as_numpy(nq_dataset_code_code_large(split_name).take(2)):
    print(ex)

def code_code_preprocessing(ds):
  """Turn raw {'input', 'output'} examples into T5 {'inputs', 'targets'} pairs.

  The 'code2code: ' task prefix is put in front of every input; the output is
  passed through as the target.

  Args:
    ds: dataset of dicts with "input" and "output" entries.

  Returns:
    `ds` mapped to dicts with "inputs" and "targets" entries.
  """
  def _add_task_prefix(example):
    prefixed_source = tf.strings.join(
        ['code2code: ' + example['input']], separator=' ')
    expected_output = tf.strings.join([example['output']], separator=' ')
    return dict(inputs=prefixed_source, targets=expected_output)

  return ds.map(
      _add_task_prefix, num_parallel_calls=tf.data.experimental.AUTOTUNE)
  
# Remove any task already registered under this name, then register it
# (presumably so the cell can be re-run without a duplicate-name error -- confirm).
t5.data.TaskRegistry.remove('code_code')
t5.data.TaskRegistry.add(
    "code_code",
    dataset_fn=nq_dataset_code_code_large,
    splits=["train", "validation"],
    text_preprocessor=[code_code_preprocessing],
    output_features = DEFAULT_OUTPUT_FEATURES,
    metric_fns=[t5.evaluation.metrics.accuracy],
    num_input_examples=num_nq_examples_code_code_large
)

# Look the task up again and build its training dataset with inputs and
# targets both limited to a sequence length of 512.
nq_task = t5.data.TaskRegistry.get("code_code")
ds = nq_task.get_dataset(split="train", sequence_length={"inputs": 512, "targets": 512})
# Optional preview of the preprocessed examples (disabled):
# print("A few preprocessed training examples...")
# for ex in tfds.as_numpy(ds.take(3)):
#   print(ex)

A few raw validation examples...
{'input': b'public void execute() throws BuildException { String generatedPassword=""; if (addproperty == null || addproperty.equals("")) { throw new BuildException("\\tThe output property is required for this task."); } if (password == null || password.equals("")) { throw new BuildException("\\tThe password property is required for this task."); } try { MessageDigest md = MessageDigest.getInstance("SHA-256"); md.update(password.getBytes()); byte[] bytes = md.digest(); generatedPassword = new String(Base64.encodeBase64(bytes)); } catch (NoSuchAlgorithmException e) { throw new BuildException("\\tThere is a problem encrypting the password with MD5 algorithm"); } if (addproperty != null && !addproperty.equals("")) { getProject().setProperty(addproperty, generatedPassword); } }', 'output': b'public void execute() throws BuildException { String generatedPassword=""; if (addproperty == null || addproperty.equals("")) { throw new BuildException("\\tThe output 

  _tokenize, num_parallel_calls=tf.data.experimental.AUTOTUNE)


## SECOND TASK : CODE to COMMENT large_dataset
- task name = `code_comment`
- task prefix = `code2comment: `

In [5]:
def nq_dataset_code_comment_large(split, shuffle_files=False):
  """Load one split of the large code-to-comment TSV as raw examples.

  Args:
    split: key into `nq_tsv_path_code_comment_large` ("train" or "validation").
    shuffle_files: accepted but discarded; each split is a single file.

  Returns:
    A dataset of dicts with string entries "input" and "output", taken from
    the two tab-separated columns of every line.
  """
  # Only one file backs each split, so there is nothing to shuffle.
  del shuffle_files

  # NOTE(review): record_defaults also supplies the per-column default value,
  # so an empty field would presumably become the literal text "string" --
  # confirm this is intended.
  parse_tsv_line = functools.partial(
      tf.io.decode_csv,
      record_defaults=["string","string"],
      field_delim="\t",
      use_quote_delim=False)

  def _name_columns(*columns):
    return dict(zip(["input", "output"], columns))

  text_lines = tf.data.TextLineDataset(nq_tsv_path_code_comment_large[split])
  parsed_rows = text_lines.map(
      parse_tsv_line, num_parallel_calls=tf.data.experimental.AUTOTUNE)
  return parsed_rows.map(_name_columns)


# Show two raw rows from each split as a quick check of the TSV parsing.
for banner, split_name in (("A few raw validation examples...", "validation"),
                           ("A few raw training examples...", "train")):
  print(banner)
  for ex in tfds.as_numpy(nq_dataset_code_comment_large(split_name).take(2)):
    print(ex)

def code_comment_preprocessing(ds):
  """Turn raw {'input', 'output'} examples into T5 {'inputs', 'targets'} pairs.

  The 'code2comment: ' task prefix is put in front of every input; the output
  is passed through as the target.

  Args:
    ds: dataset of dicts with "input" and "output" entries.

  Returns:
    `ds` mapped to dicts with "inputs" and "targets" entries.
  """
  def _add_task_prefix(example):
    prefixed_source = tf.strings.join(
        ['code2comment: ' + example['input']], separator=' ')
    expected_output = tf.strings.join([example['output']], separator=' ')
    return dict(inputs=prefixed_source, targets=expected_output)

  return ds.map(
      _add_task_prefix, num_parallel_calls=tf.data.experimental.AUTOTUNE)

# Create a new training task: remove any task already registered under this
# name, then register it (presumably so the cell can be re-run -- confirm).
t5.data.TaskRegistry.remove('code_comment')
t5.data.TaskRegistry.add(
    "code_comment",
    dataset_fn=nq_dataset_code_comment_large,
    splits=["train", "validation"],
    text_preprocessor=[code_comment_preprocessing],
    output_features = DEFAULT_OUTPUT_FEATURES,
    metric_fns=[t5.evaluation.metrics.accuracy],
    num_input_examples=num_nq_examples_code_comment_large
)

# Look the task up again and build its training dataset with inputs and
# targets both limited to a sequence length of 512.
nq_task = t5.data.TaskRegistry.get("code_comment")
ds = nq_task.get_dataset(split="train", sequence_length={"inputs": 512, "targets": 512})
# Optional preview of the preprocessed examples (disabled):
# print("A few preprocessed training examples...")
# for ex in tfds.as_numpy(ds.take(3)):
#   print(ex)

A few raw validation examples...
{'input': b'public void execute() throws BuildException { String generatedPassword=""; if (addproperty == null || addproperty.equals("")) { throw new BuildException("\\tThe output property is required for this task."); } if (password == null || password.equals("")) { throw new BuildException("\\tThe password property is required for this task."); } try { MessageDigest md = MessageDigest.getInstance("SHA-256"); md.update(password.getBytes()); byte[] bytes = md.digest(); generatedPassword = new String(Base64.encodeBase64(bytes)); } catch (NoSuchAlgorithmException e) { throw new BuildException("\\tThere is a problem encrypting the password with MD5 algorithm"); } if (addproperty != null && !addproperty.equals("")) { getProject().setProperty(addproperty, generatedPassword); } }', 'output': b"If we're going to include commons-codec as a dependency, then I think you should just replace the whole block of code above with this:  generatedPassword = DigestUtils.

  _tokenize, num_parallel_calls=tf.data.experimental.AUTOTUNE)


## THIRD TASK : CODE and COMMENT to CODE large_dataset
- task name = `codeANDcomment_code`
- task prefix = `code&comment2code: `

In [6]:
############### THIRD TASK : CODE&COMMENT2CODE ###############

def nq_dataset_codeANDcomment_code_large(split, shuffle_files=False):
  """Load one split of the large code&comment-to-code TSV as raw examples.

  Args:
    split: key into `nq_tsv_path_codeANDcomment_code_large` ("train" or
      "validation").
    shuffle_files: accepted but discarded; each split is a single file.

  Returns:
    A dataset of dicts with string entries "input" and "output", taken from
    the two tab-separated columns of every line.
  """
  # Only one file backs each split, so there is nothing to shuffle.
  del shuffle_files

  # NOTE(review): record_defaults also supplies the per-column default value,
  # so an empty field would presumably become the literal text "string" --
  # confirm this is intended.
  parse_tsv_line = functools.partial(
      tf.io.decode_csv,
      record_defaults=["string","string"],
      field_delim="\t",
      use_quote_delim=False)

  def _name_columns(*columns):
    return dict(zip(["input", "output"], columns))

  text_lines = tf.data.TextLineDataset(nq_tsv_path_codeANDcomment_code_large[split])
  parsed_rows = text_lines.map(
      parse_tsv_line, num_parallel_calls=tf.data.experimental.AUTOTUNE)
  return parsed_rows.map(_name_columns)


# Show two raw rows from each split as a quick check of the TSV parsing.
for banner, split_name in (("A few raw validation examples...", "validation"),
                           ("A few raw training examples...", "train")):
  print(banner)
  for ex in tfds.as_numpy(nq_dataset_codeANDcomment_code_large(split_name).take(2)):
    print(ex)

def codeANDcomment_code_preprocessing(ds):
  """Turn raw {'input', 'output'} examples into T5 {'inputs', 'targets'} pairs.

  The 'code&comment2code: ' task prefix is put in front of every input; the
  output is passed through as the target.

  Args:
    ds: dataset of dicts with "input" and "output" entries.

  Returns:
    `ds` mapped to dicts with "inputs" and "targets" entries.
  """
  def _add_task_prefix(example):
    prefixed_source = tf.strings.join(
        ['code&comment2code: ' + example['input']], separator=' ')
    expected_output = tf.strings.join([example['output']], separator=' ')
    return dict(inputs=prefixed_source, targets=expected_output)

  return ds.map(
      _add_task_prefix, num_parallel_calls=tf.data.experimental.AUTOTUNE)

# Create a new training task: remove any task already registered under this
# name, then register it (presumably so the cell can be re-run -- confirm).
t5.data.TaskRegistry.remove('codeANDcomment_code')
t5.data.TaskRegistry.add(
    "codeANDcomment_code",
    dataset_fn=nq_dataset_codeANDcomment_code_large,
    splits=["train", "validation"],
    text_preprocessor=[codeANDcomment_code_preprocessing],
    output_features = DEFAULT_OUTPUT_FEATURES,
    metric_fns=[t5.evaluation.metrics.accuracy],
    num_input_examples=num_nq_examples_codeANDcomment_code_large
)

# Look the task up again and build its training dataset with inputs and
# targets both limited to a sequence length of 512.
nq_task = t5.data.TaskRegistry.get("codeANDcomment_code")
ds = nq_task.get_dataset(split="train", sequence_length={"inputs": 512, "targets": 512})
# Optional preview of the preprocessed examples (disabled):
# print("A few preprocessed training examples...")
# for ex in tfds.as_numpy(ds.take(3)):
#   print(ex)

A few raw validation examples...
{'input': b'<code>public void execute() throws BuildException { String generatedPassword=""; if (addproperty == null || addproperty.equals("")) { throw new BuildException("\\tThe output property is required for this task."); } if (password == null || password.equals("")) { throw new BuildException("\\tThe password property is required for this task."); } try { MessageDigest md = MessageDigest.getInstance("SHA-256"); md.update(password.getBytes()); byte[] bytes = md.digest(); <START> generatedPassword = new String(Base64.encodeBase64(bytes)); <END> } catch (NoSuchAlgorithmException e) { throw new BuildException("\\tThere is a problem encrypting the password with MD5 algorithm"); } if (addproperty != null && !addproperty.equals("")) { getProject().setProperty(addproperty, generatedPassword); } }</code><technical_language>If include commons-codec a dependency, I replace block of code this: generatedPassword = DigestUtils.sha256Hex(password);</technical_lan

  _tokenize, num_parallel_calls=tf.data.experimental.AUTOTUNE)


## FOURTH TASK : MARKED CODE to CODE large_dataset
- task name = `marked_code`
- task prefix = `markedCode2code: `

In [7]:
def nq_dataset_marked_code_large(split, shuffle_files=False):
  """Load one split of the large marked-code-to-code TSV as raw examples.

  Args:
    split: key into `nq_tsv_path_marked_code_large` ("train" or "validation").
    shuffle_files: accepted but discarded; each split is a single file.

  Returns:
    A dataset of dicts with string entries "input" and "output", taken from
    the two tab-separated columns of every line.
  """
  # Only one file backs each split, so there is nothing to shuffle.
  del shuffle_files

  # NOTE(review): record_defaults also supplies the per-column default value,
  # so an empty field would presumably become the literal text "string" --
  # confirm this is intended.
  parse_tsv_line = functools.partial(
      tf.io.decode_csv,
      record_defaults=["string","string"],
      field_delim="\t",
      use_quote_delim=False)

  def _name_columns(*columns):
    return dict(zip(["input", "output"], columns))

  text_lines = tf.data.TextLineDataset(nq_tsv_path_marked_code_large[split])
  parsed_rows = text_lines.map(
      parse_tsv_line, num_parallel_calls=tf.data.experimental.AUTOTUNE)
  return parsed_rows.map(_name_columns)


# Show two raw rows from each split as a quick check of the TSV parsing.
for banner, split_name in (("A few raw validation examples...", "validation"),
                           ("A few raw training examples...", "train")):
  print(banner)
  for ex in tfds.as_numpy(nq_dataset_marked_code_large(split_name).take(2)):
    print(ex)

def marked_code_preprocessing(ds):
  """Turn raw {'input', 'output'} examples into T5 {'inputs', 'targets'} pairs.

  The 'markedCode2code: ' task prefix is put in front of every input; the
  output is passed through as the target.

  Args:
    ds: dataset of dicts with "input" and "output" entries.

  Returns:
    `ds` mapped to dicts with "inputs" and "targets" entries.
  """
  def _add_task_prefix(example):
    prefixed_source = tf.strings.join(
        ['markedCode2code: ' + example['input']], separator=' ')
    expected_output = tf.strings.join([example['output']], separator=' ')
    return dict(inputs=prefixed_source, targets=expected_output)

  return ds.map(
      _add_task_prefix, num_parallel_calls=tf.data.experimental.AUTOTUNE)

# Create a new training task: remove any task already registered under this
# name, then register it (presumably so the cell can be re-run -- confirm).
t5.data.TaskRegistry.remove('marked_code')
t5.data.TaskRegistry.add(
    "marked_code",
    dataset_fn=nq_dataset_marked_code_large,
    splits=["train", "validation"],
    text_preprocessor=[marked_code_preprocessing],
    output_features = DEFAULT_OUTPUT_FEATURES,
    metric_fns=[t5.evaluation.metrics.accuracy],
    num_input_examples=num_nq_examples_marked_code_large
)

# Look the task up again and build its training dataset with inputs and
# targets both limited to a sequence length of 512.
nq_task = t5.data.TaskRegistry.get("marked_code")
ds = nq_task.get_dataset(split="train", sequence_length={"inputs": 512, "targets": 512})
# Optional preview of the preprocessed examples (disabled):
# print("A few preprocessed training examples...")
# for ex in tfds.as_numpy(ds.take(3)):
#   print(ex)

A few raw validation examples...
{'input': b'public void execute() throws BuildException { String generatedPassword=""; if (addproperty == null || addproperty.equals("")) { throw new BuildException("\\tThe output property is required for this task."); } if (password == null || password.equals("")) { throw new BuildException("\\tThe password property is required for this task."); } try { MessageDigest md = MessageDigest.getInstance("SHA-256"); md.update(password.getBytes()); byte[] bytes = md.digest(); <START> generatedPassword = new String(Base64.encodeBase64(bytes)); <END> } catch (NoSuchAlgorithmException e) { throw new BuildException("\\tThere is a problem encrypting the password with MD5 algorithm"); } if (addproperty != null && !addproperty.equals("")) { getProject().setProperty(addproperty, generatedPassword); } }', 'output': b'public void execute() throws BuildException { String generatedPassword=""; if (addproperty == null || addproperty.equals("")) { throw new BuildException("

  _tokenize, num_parallel_calls=tf.data.experimental.AUTOTUNE)


## FIFTH TASK : CODE to CODE small_dataset_v1
- task name = `code_code_small_v1`
- task prefix = `code2code: `

In [8]:
def nq_dataset_code_code_small_v1(split, shuffle_files=False):
  """Load one split of the small (v1) code-to-code TSV as raw examples.

  Args:
    split: key into `nq_tsv_path_code_code_small_v1` ("train" or "validation").
    shuffle_files: accepted but discarded; each split is a single file.

  Returns:
    A dataset of dicts with string entries "input" and "output", taken from
    the two tab-separated columns of every line.
  """
  # Only one file backs each split, so there is nothing to shuffle.
  del shuffle_files

  # NOTE(review): record_defaults also supplies the per-column default value,
  # so an empty field would presumably become the literal text "string" --
  # confirm this is intended.
  parse_tsv_line = functools.partial(
      tf.io.decode_csv,
      record_defaults=["string","string"],
      field_delim="\t",
      use_quote_delim=False)

  def _name_columns(*columns):
    return dict(zip(["input", "output"], columns))

  text_lines = tf.data.TextLineDataset(nq_tsv_path_code_code_small_v1[split])
  parsed_rows = text_lines.map(
      parse_tsv_line, num_parallel_calls=tf.data.experimental.AUTOTUNE)
  return parsed_rows.map(_name_columns)


# Show two raw rows from each split as a quick check of the TSV parsing.
for banner, split_name in (("A few raw validation examples...", "validation"),
                           ("A few raw training examples...", "train")):
  print(banner)
  for ex in tfds.as_numpy(nq_dataset_code_code_small_v1(split_name).take(2)):
    print(ex)

def code_code_small_v1_preprocessing(ds):
  """Turn raw {'input', 'output'} examples into T5 {'inputs', 'targets'} pairs.

  The 'code2code: ' task prefix is put in front of every input; the output is
  passed through as the target. The function carries its own task-specific
  name so it does not rebind `marked_code_preprocessing`, which belongs to the
  marked-code task and uses a different prefix.

  Args:
    ds: dataset of dicts with "input" and "output" entries.

  Returns:
    `ds` mapped to dicts with "inputs" and "targets" entries.
  """
  def to_inputs_and_targets(ex):
    inputs = tf.strings.join(['code2code: ' + ex['input']], separator=' ')
    class_label = tf.strings.join([ex['output']], separator=' ')
    return {'inputs': inputs, 'targets': class_label}

  return ds.map(to_inputs_and_targets,
                num_parallel_calls=tf.data.experimental.AUTOTUNE)

# Create a new training task: remove any task already registered under this
# name, then register it.
t5.data.TaskRegistry.remove('code_code_small_v1')
t5.data.TaskRegistry.add(
    "code_code_small_v1",
    dataset_fn=nq_dataset_code_code_small_v1,
    splits=["train", "validation"],
    text_preprocessor=[code_code_small_v1_preprocessing],
    output_features=DEFAULT_OUTPUT_FEATURES,
    metric_fns=[t5.evaluation.metrics.accuracy],
    # Use the size table that belongs to this dataset (small v1, code_code).
    num_input_examples=num_nq_examples_code_code_small_v1
)

# Look the task up again and build its training dataset with inputs and
# targets both limited to a sequence length of 512.
nq_task = t5.data.TaskRegistry.get("code_code_small_v1")
ds = nq_task.get_dataset(split="train", sequence_length={"inputs": 512, "targets": 512})
# Optional preview of the preprocessed examples (disabled):
# print("A few preprocessed training examples...")
# for ex in tfds.as_numpy(ds.take(3)):
#   print(ex)

A few raw validation examples...
{'input': b'public void startRuntime() { String tempDir = AppConstants.getInstance().getString("log.dir", null); v8 = V8.createV8Runtime("J2V8Javascript", tempDir); }', 'output': b'public void startRuntime() { v8 = V8.createV8Runtime(); }'}
{'input': b'public GWCConfig getConfig() { if (gsEnvironment != null && gsEnvironment.isStale()) { syncEnvironment(); } return gwcConfigPersister.getConfig(); }', 'output': b'public GWCConfig getConfig() { return gwcConfigPersister.getConfig(); }'}
A few raw training examples...
{'input': b'protected static String commentFormat(String comment) { if (comment == null || comment.isEmpty()) return ""; while (comment.getBytes(ENCODING).length > 255) { comment = comment.substring(0, comment.length() - 1); } return comment; }', 'output': b'protected static String commentFormat(String comment) { if (comment == null || comment.length() == 0) return ""; while (comment.getBytes(ENCODING).length > 255) { comment = comment.substr

  _tokenize, num_parallel_calls=tf.data.experimental.AUTOTUNE)


## SIXTH TASK : CODE and COMMENT to CODE small_dataset_v1
- task name = `codeANDcomment_code_small_v1`
- task prefix = `code&comment2code: `

In [9]:
def nq_dataset_codeANDcomment_code_small_v1(split, shuffle_files=False):
  """Load one split of the small (v1) code&comment-to-code TSV as raw examples.

  Args:
    split: key into `nq_tsv_path_codeANDcomment_code_small_v1` ("train" or
      "validation").
    shuffle_files: accepted but discarded; each split is a single file.

  Returns:
    A dataset of dicts with string entries "input" and "output", taken from
    the two tab-separated columns of every line.
  """
  # Only one file backs each split, so there is nothing to shuffle.
  del shuffle_files

  # NOTE(review): record_defaults also supplies the per-column default value,
  # so an empty field would presumably become the literal text "string" --
  # confirm this is intended.
  parse_tsv_line = functools.partial(
      tf.io.decode_csv,
      record_defaults=["string","string"],
      field_delim="\t",
      use_quote_delim=False)

  def _name_columns(*columns):
    return dict(zip(["input", "output"], columns))

  text_lines = tf.data.TextLineDataset(nq_tsv_path_codeANDcomment_code_small_v1[split])
  parsed_rows = text_lines.map(
      parse_tsv_line, num_parallel_calls=tf.data.experimental.AUTOTUNE)
  return parsed_rows.map(_name_columns)


# Show two raw rows from each split as a quick check of the TSV parsing.
for banner, split_name in (("A few raw validation examples...", "validation"),
                           ("A few raw training examples...", "train")):
  print(banner)
  for ex in tfds.as_numpy(nq_dataset_codeANDcomment_code_small_v1(split_name).take(2)):
    print(ex)

def codeANDcomment_code_small_v1_preprocessing(ds):
  """Turn raw {'input', 'output'} examples into T5 {'inputs', 'targets'} pairs.

  The 'code&comment2code: ' task prefix is put in front of every input; the
  output is passed through as the target. The function carries its own
  task-specific name so it does not rebind `marked_code_preprocessing`, which
  belongs to the marked-code task and uses a different prefix.

  Args:
    ds: dataset of dicts with "input" and "output" entries.

  Returns:
    `ds` mapped to dicts with "inputs" and "targets" entries.
  """
  def to_inputs_and_targets(ex):
    inputs = tf.strings.join(['code&comment2code: ' + ex['input']], separator=' ')
    class_label = tf.strings.join([ex['output']], separator=' ')
    return {'inputs': inputs, 'targets': class_label}

  return ds.map(to_inputs_and_targets,
                num_parallel_calls=tf.data.experimental.AUTOTUNE)

# Create a new training task: remove any task already registered under this
# name, then register it.
t5.data.TaskRegistry.remove('codeANDcomment_code_small_v1')
t5.data.TaskRegistry.add(
    "codeANDcomment_code_small_v1",
    dataset_fn=nq_dataset_codeANDcomment_code_small_v1,
    splits=["train", "validation"],
    text_preprocessor=[codeANDcomment_code_small_v1_preprocessing],
    output_features=DEFAULT_OUTPUT_FEATURES,
    metric_fns=[t5.evaluation.metrics.accuracy],
    num_input_examples=num_nq_examples_codeANDcomment_code_small_v1
)

# Look the task up again and build its training dataset with inputs and
# targets both limited to a sequence length of 512.
nq_task = t5.data.TaskRegistry.get("codeANDcomment_code_small_v1")
ds = nq_task.get_dataset(split="train", sequence_length={"inputs": 512, "targets": 512})
# Optional preview of the preprocessed examples (disabled):
# print("A few preprocessed training examples...")
# for ex in tfds.as_numpy(ds.take(3)):
#   print(ex)

A few raw validation examples...
{'input': b'<code> public void startRuntime() { String tempDir = AppConstants.getInstance().getString("log.dir", null); <START> v8 = V8.createV8Runtime("J2V8Javascript", tempDir); <END> } </code><technical_language> Setting the alias and temp directory can be quite important. Perhaps make this an optional argument? So you can overwrite the alias when needed (i.e. to improve performance). Right now all instances will be cached under the same name. </technical_language>', 'output': b'public void startRuntime() { v8 = V8.createV8Runtime(); }'}
{'input': b'<code> public GWCConfig getConfig() { <START> if (gsEnvironment != null && gsEnvironment.isStale()) { <END> syncEnvironment(); } return gwcConfigPersister.getConfig(); } </code><technical_language> This bloc of code is repeated many times, should be centralized in a single method. </technical_language>', 'output': b'public GWCConfig getConfig() { return gwcConfigPersister.getConfig(); }'}
A few raw traini

  _tokenize, num_parallel_calls=tf.data.experimental.AUTOTUNE)


## SEVENTH TASK : CODE to CODE small_dataset_v2
- task name = `code_code_small_v2`
- task prefix = `code2code: `

In [10]:
def nq_dataset_code_code_small_v2(split, shuffle_files=False):
  """Load one split of the small (v2) code-to-code TSV as raw examples.

  Args:
    split: key into `nq_tsv_path_code_code_small_v2` ("train" or "validation").
    shuffle_files: accepted but discarded; each split is a single file.

  Returns:
    A dataset of dicts with string entries "input" and "output", taken from
    the two tab-separated columns of every line.
  """
  # Only one file backs each split, so there is nothing to shuffle.
  del shuffle_files

  # NOTE(review): record_defaults also supplies the per-column default value,
  # so an empty field would presumably become the literal text "string" --
  # confirm this is intended.
  parse_tsv_line = functools.partial(
      tf.io.decode_csv,
      record_defaults=["string","string"],
      field_delim="\t",
      use_quote_delim=False)

  def _name_columns(*columns):
    return dict(zip(["input", "output"], columns))

  text_lines = tf.data.TextLineDataset(nq_tsv_path_code_code_small_v2[split])
  parsed_rows = text_lines.map(
      parse_tsv_line, num_parallel_calls=tf.data.experimental.AUTOTUNE)
  return parsed_rows.map(_name_columns)


# Show two raw rows from each split as a quick check of the TSV parsing.
for banner, split_name in (("A few raw validation examples...", "validation"),
                           ("A few raw training examples...", "train")):
  print(banner)
  for ex in tfds.as_numpy(nq_dataset_code_code_small_v2(split_name).take(2)):
    print(ex)

def code_code_small_v2_preprocessing(ds):
  """Turn raw {'input', 'output'} examples into T5 {'inputs', 'targets'} pairs.

  The 'code2code: ' task prefix is put in front of every input; the output is
  passed through as the target. The function carries its own task-specific
  name so it does not rebind `marked_code_preprocessing`, which belongs to the
  marked-code task and uses a different prefix.

  Args:
    ds: dataset of dicts with "input" and "output" entries.

  Returns:
    `ds` mapped to dicts with "inputs" and "targets" entries.
  """
  def to_inputs_and_targets(ex):
    inputs = tf.strings.join(['code2code: ' + ex['input']], separator=' ')
    class_label = tf.strings.join([ex['output']], separator=' ')
    return {'inputs': inputs, 'targets': class_label}

  return ds.map(to_inputs_and_targets,
                num_parallel_calls=tf.data.experimental.AUTOTUNE)

# Create a new training task: remove any task already registered under this
# name, then register it.
t5.data.TaskRegistry.remove('code_code_small_v2')
t5.data.TaskRegistry.add(
    "code_code_small_v2",
    dataset_fn=nq_dataset_code_code_small_v2,
    splits=["train", "validation"],
    text_preprocessor=[code_code_small_v2_preprocessing],
    output_features=DEFAULT_OUTPUT_FEATURES,
    metric_fns=[t5.evaluation.metrics.accuracy],
    # Use the size table that belongs to this dataset (small v2, code_code).
    num_input_examples=num_nq_examples_code_code_small_v2
)

# Look the task up again and build its training dataset with inputs and
# targets both limited to a sequence length of 512.
nq_task = t5.data.TaskRegistry.get("code_code_small_v2")
ds = nq_task.get_dataset(split="train", sequence_length={"inputs": 512, "targets": 512})
# Optional preview of the preprocessed examples (disabled):
# print("A few preprocessed training examples...")
# for ex in tfds.as_numpy(ds.take(3)):
#   print(ex)

A few raw validation examples...
{'input': b'public void startRuntime() { String tempDir = AppConstants.getInstance().getString("log.dir", null); v8 = V8.createV8Runtime("J2V8Javascript", tempDir); }', 'output': b'public void startRuntime() { v8 = V8.createV8Runtime(); }'}
{'input': b'public GWCConfig getConfig() { if (gsEnvironment != null && gsEnvironment.isStale()) { syncEnvironment(); } return gwcConfigPersister.getConfig(); }', 'output': b'public GWCConfig getConfig() { return gwcConfigPersister.getConfig(); }'}
A few raw training examples...
{'input': b'protected static String commentFormat(String comment) { if (comment == null || comment.isEmpty()) return ""; while (comment.getBytes(ENCODING).length > 255) { comment = comment.substring(0, comment.length() - 1); } return comment; }', 'output': b'protected static String commentFormat(String comment) { if (comment == null || comment.length() == 0) return ""; while (comment.getBytes(ENCODING).length > 255) { comment = comment.substr

  _tokenize, num_parallel_calls=tf.data.experimental.AUTOTUNE)


## EIGHTH TASK : CODE and COMMENT to CODE small_dataset_v2
- task name = `codeANDcomment_code_small_v2`
- task prefix = `code&comment2code: `

In [11]:
def nq_dataset_codeANDcomment_code_small_v2(split, shuffle_files=False):
  """Load one split of the small (v2) code&comment-to-code TSV as raw examples.

  Args:
    split: key into `nq_tsv_path_codeANDcomment_code_small_v2` ("train" or
      "validation").
    shuffle_files: accepted but discarded; each split is a single file.

  Returns:
    A dataset of dicts with string entries "input" and "output", taken from
    the two tab-separated columns of every line.
  """
  # Only one file backs each split, so there is nothing to shuffle.
  del shuffle_files

  # NOTE(review): record_defaults also supplies the per-column default value,
  # so an empty field would presumably become the literal text "string" --
  # confirm this is intended.
  parse_tsv_line = functools.partial(
      tf.io.decode_csv,
      record_defaults=["string","string"],
      field_delim="\t",
      use_quote_delim=False)

  def _name_columns(*columns):
    return dict(zip(["input", "output"], columns))

  text_lines = tf.data.TextLineDataset(nq_tsv_path_codeANDcomment_code_small_v2[split])
  parsed_rows = text_lines.map(
      parse_tsv_line, num_parallel_calls=tf.data.experimental.AUTOTUNE)
  return parsed_rows.map(_name_columns)


# Show two raw rows from each split as a quick check of the TSV parsing.
for banner, split_name in (("A few raw validation examples...", "validation"),
                           ("A few raw training examples...", "train")):
  print(banner)
  for ex in tfds.as_numpy(nq_dataset_codeANDcomment_code_small_v2(split_name).take(2)):
    print(ex)

def codeANDcomment_code_small_v2_preprocessing(ds):
  """Turn raw {'input', 'output'} examples into T5 {'inputs', 'targets'} pairs.

  The 'code&comment2code: ' task prefix is put in front of every input; the
  output is passed through as the target. The function carries its own
  task-specific name so it does not rebind `marked_code_preprocessing`, which
  belongs to the marked-code task and uses a different prefix.

  Args:
    ds: dataset of dicts with "input" and "output" entries.

  Returns:
    `ds` mapped to dicts with "inputs" and "targets" entries.
  """
  def to_inputs_and_targets(ex):
    inputs = tf.strings.join(['code&comment2code: ' + ex['input']], separator=' ')
    class_label = tf.strings.join([ex['output']], separator=' ')
    return {'inputs': inputs, 'targets': class_label}

  return ds.map(to_inputs_and_targets,
                num_parallel_calls=tf.data.experimental.AUTOTUNE)

# Create a new training task: remove any task already registered under this
# name, then register it.
t5.data.TaskRegistry.remove('codeANDcomment_code_small_v2')
t5.data.TaskRegistry.add(
    "codeANDcomment_code_small_v2",
    dataset_fn=nq_dataset_codeANDcomment_code_small_v2,
    splits=["train", "validation"],
    text_preprocessor=[codeANDcomment_code_small_v2_preprocessing],
    output_features=DEFAULT_OUTPUT_FEATURES,
    metric_fns=[t5.evaluation.metrics.accuracy],
    num_input_examples=num_nq_examples_codeANDcomment_code_small_v2
)

# Look the task up again and build its training dataset with inputs and
# targets both limited to a sequence length of 512.
nq_task = t5.data.TaskRegistry.get("codeANDcomment_code_small_v2")
ds = nq_task.get_dataset(split="train", sequence_length={"inputs": 512, "targets": 512})
# Optional preview of the preprocessed examples (disabled):
# print("A few preprocessed training examples...")
# for ex in tfds.as_numpy(ds.take(3)):
#   print(ex)

A few raw validation examples...
{'input': b'<code> public void startRuntime() { String tempDir = AppConstants.getInstance().getString("log.dir", null); <START> v8 = V8.createV8Runtime("J2V8Javascript", tempDir); <END> } </code><technical_language> Setting alias temp directory important. this optional argument? overwrite alias needed (i.e. improve performance). Right instances cached name </technical_language>', 'output': b'public void startRuntime() { v8 = V8.createV8Runtime(); }'}
{'input': b'<code> public GWCConfig getConfig() { <START> if (gsEnvironment != null && gsEnvironment.isStale()) { <END> syncEnvironment(); } return gwcConfigPersister.getConfig(); } </code><technical_language> This bloc of code is repeated times, centralized in a single method </technical_language>', 'output': b'public GWCConfig getConfig() { return gwcConfigPersister.getConfig(); }'}
A few raw training examples...
{'input': b'<code> protected static String commentFormat(String comment) { <START> if (commen

  _tokenize, num_parallel_calls=tf.data.experimental.AUTOTUNE)


# Setting Up fine tuning tasks and mixtures

In [12]:
def _rate_num_input_examples(task):
  if "train" in task.splits:
    return float(task.num_input_examples("train"))
  elif "validation" in task.splits:
    return float(task.num_input_examples("validation"))
  else:
    raise ValueError("Task %s does not have a train or validation split." % (task.name))

In [13]:
# (mixture name, names of the tasks it is built from), in registration order.
# Every mixture is registered in exactly the same way, so the registrations
# are driven from this table instead of being repeated once per mixture.
MIXTURE_TASKS = [
    # *_large mixtures
    ("code_code_large", ["code_code"]),
    ("code_comment_large", ["code_comment"]),
    ("codeANDcomment_large", ["codeANDcomment_code"]),
    ("marked_code_large", ["marked_code"]),
    ("all_large", ["code_code", "code_comment", "codeANDcomment_code"]),
    # *_small_*_v1 mixtures
    ("code_code_small_dataset_v1", ["code_code_small_v1"]),
    ("codeANDcomment_code_small_dataset_v1", ["codeANDcomment_code_small_v1"]),
    ("all_small_v1", ["code_code_small_v1", "codeANDcomment_code_small_v1"]),
    # *_small_*_v2 mixtures
    ("code_code_small_dataset_v2", ["code_code_small_v2"]),
    ("codeANDcomment_code_small_dataset_v2", ["codeANDcomment_code_small_v2"]),
    ("all_small_v2", ["code_code_small_v2", "codeANDcomment_code_small_v2"]),
]

for mixture_name, task_names in MIXTURE_TASKS:
  # Unregister the name first, then (re-)register the mixture. Each task is
  # weighted by `_rate_num_input_examples`.
  t5.data.MixtureRegistry.remove(mixture_name)
  t5.data.MixtureRegistry.add(
      mixture_name,
      task_names,
      default_rate=_rate_num_input_examples,
  )

<seqio.dataset_providers.Mixture at 0x7fd545fc5790>

We specify the path of our pre-trained model, the model size (small), and the directory in the GCS bucket where we want to store our model checkpoints.

In [15]:
# Pre-trained directory: it must contain the pre-trained models, the
# operative_config.gin file and the checkpoint file as well.
PRETRAINED_DIR = (
    "gs://" + bucket_name + "/replication_package/model_dumps"
)

# The T5 architecture selected for this notebook.
MODEL_SIZE = "small"

#@title Select the task or the mixture you want to train the model on
Task_to_train = "all_small_v1" #@param ["code_code_large","code_comment_large","codeANDcomment_large","marked_code_large","all_large","code_code_small_dataset_v1","codeANDcomment_code_small_dataset_v1","all_small_v1","code_code_small_dataset_v2","codeANDcomment_code_small_dataset_v2","all_small_v2"]

############ output path ############
# One output directory per selected task/mixture.
MODEL_DIR = (
    "gs://" + bucket_name
    + "/replication_package/fine_tuning_model_dumps/" + Task_to_train
)

# Per-size settings, looked up by MODEL_SIZE:
# (model_parallelism, train_batch_size, keep_checkpoint_max).
model_parallelism, train_batch_size, keep_checkpoint_max = {
    "small": (1, 128, 200),
    "base": (2, 128, 8),
    "large": (8, 64, 4),
    "3B": (8, 16, 1),
    "11B": (8, 16, 1),
}[MODEL_SIZE]


We set the selected learning rate scheduler

In [16]:
from mesh_tensorflow.transformer.learning_rate_schedules import slanted_triangular 

from mesh_tensorflow.transformer.learning_rate_schedules import truncated_rsqrt
 
from tensorflow.keras.optimizers.schedules import PolynomialDecay

# Settings of the polynomial-decay schedule (used when "polynomial" is picked).
starter_learning_rate = 0.05
end_learning_rate = 0.001
decay_steps = 10000

learning_rate_fn = PolynomialDecay(
    starter_learning_rate,
    decay_steps,
    end_learning_rate,
    power=0.5)

#@title Select a learning rate scheduler
learning_rate_scheduler_picker = "slanted" #@param ["slanted", "isr", "polynomial", "constant"]

# Scheduler name -> value handed to the model as its learning-rate schedule.
LEARNING_RATE_SCHEDULERS = {
    "slanted": slanted_triangular,
    "isr": truncated_rsqrt,
    "polynomial": learning_rate_fn,
    "constant": 0.001,
}

# Fail here with a clear message instead of leaving the two variables below
# undefined (or stale from an earlier run of this cell) when the picker holds
# an unexpected value.
if learning_rate_scheduler_picker not in LEARNING_RATE_SCHEDULERS:
  raise ValueError(
      "Unknown learning rate scheduler %r; expected one of %s."
      % (learning_rate_scheduler_picker, sorted(LEARNING_RATE_SCHEDULERS)))

selected_learning_rate_scheduler = LEARNING_RATE_SCHEDULERS[learning_rate_scheduler_picker]

# Each scheduler has its own gin file: operative_config_<scheduler>.gin
PATH_GIN_FILE = ('gs://' + bucket_name
                 + '/replication_package/utils/operative_config_'
                 + learning_rate_scheduler_picker + '.gin')

#@title Select a learning rate scheduler
number_of_steps = 1000 #@param {type:"integer"}

tf.io.gfile.makedirs(MODEL_DIR)

model = t5.models.MtfModel(
    model_dir=MODEL_DIR,
    tpu=TPU_ADDRESS,
    tpu_topology=TPU_TOPOLOGY,
    model_parallelism=model_parallelism,
    batch_size=train_batch_size,
    learning_rate_schedule = selected_learning_rate_scheduler,
    sequence_length={"inputs": 512, "targets": 512},
    save_checkpoints_steps=10000,
    keep_checkpoint_max=keep_checkpoint_max if ON_CLOUD else None,
    iterations_per_loop=100,
)

!gsutil cp {PATH_GIN_FILE}  ./config.gin


Copying gs://code_review_automation/replication_package/utils/operative_config_slanted.gin...
/ [1 files][ 11.6 KiB/ 11.6 KiB]                                                
Operation completed over 1 objects/11.6 KiB.                                     


If the selected learning rate scheduler is Slanted, we need to modify the gin file according to our settings:

In particular, on line `197` of this file we have to set the number of pre-training steps already performed; in our original case this was 200000.

Then, on the next line, we have to set the number of steps for which we want to fine-tune the model. In our case we used a different setting for each configuration:
- small dataset single task 100K
- small dataset mixture 100K
- large dataset single task 300K
- large dataset mixture 600K


In [18]:
import gin

#@title Fine-tune the pre-trained model, or train without it
# False (default) keeps the previous behaviour of this cell: plain training.
# True runs fine-tuning starting from the model stored in PRETRAINED_DIR.
USE_PRETRAINED = False #@param {type:"boolean"}

with gin.unlock_config():
    # Both modes read the gin file copied to ./config.gin.
    gin.parse_config_file("./config.gin")
    if USE_PRETRAINED:
        # PRETRAINED: run fine-tuning.
        FINETUNE_STEPS = number_of_steps
        model.finetune(
            mixture_or_task_name=Task_to_train,
            pretrained_model_dir=PRETRAINED_DIR,
            finetune_steps=FINETUNE_STEPS
        )
    else:
        # NON PRETRAINED: train without loading the pre-trained model.
        TRAIN_STEPS = number_of_steps
        model.train(Task_to_train, steps=TRAIN_STEPS)

INFO:root:system_path_file_exists:gs://code_review_automation/replication_package/model_dumps/operative_config.gin
ERROR:root:Path not found: gs://code_review_automation/replication_package/model_dumps/operative_config.gin


INFO:tensorflow:Using config: {'_model_dir': 'gs://code_review_automation/replication_package/fine_tuning_model_dumps/all_small_v1', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 10000, '_save_checkpoints_secs': None, '_session_config': graph_options {
  rewrite_options {
    disable_meta_optimizer: true
  }
}
cluster_def {
  job {
    name: "worker"
    tasks {
      key: 0
      value: "10.43.199.66:8470"
    }
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_checkpoint_save_graph_def': True, '_service': None, '_cluster_spec': ClusterSpec({'worker': ['10.43.199.66:8470']}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': 'grpc://10.43.199.66:8470', '_eval

# Evaluation

In [None]:
# Evaluation runs with a larger batch size than training; per the original
# note, evaluation requires less memory.
model.batch_size = 1024

# checkpoint_steps=-1 evaluates the last checkpoint. A list of checkpoints
# can be given instead, in the format [10, 20, 30].
model.eval(mixture_or_task_name=Task_to_train, checkpoint_steps=-1)

INFO:root:system_path_file_exists:gs://code_review_automation/replication_package/fine_tuning_model_dumps/all_small_v1/operative_config.gin
ERROR:root:Path not found: gs://code_review_automation/replication_package/fine_tuning_model_dumps/all_small_v1/operative_config.gin
INFO:absl:Adding task 'codeANDcomment_code_small_v1' with predict metric_fn(s).
INFO:absl:Adding task 'code_code_small_v1' with predict metric_fn(s).
INFO:absl:Automatically caching small dataset in memory: 'codeANDcomment_code_small_v1:validation'
  _tokenize, num_parallel_calls=tf.data.experimental.AUTOTUNE)
INFO:absl:Skipping packing/padding for 'codeANDcomment_code_small_v1' since sequence length is None.
INFO:absl:Automatically caching small dataset in memory: 'code_code_small_v1:validation'
INFO:absl:Skipping packing/padding for 'code_code_small_v1' since sequence length is None.
INFO:absl:Setting sequence lengths to {'inputs': 913, 'targets': 179}
INFO:absl:Evaluating checkpoint step: 700


INFO:tensorflow:Using config: {'_model_dir': 'gs://code_review_automation/replication_package/fine_tuning_model_dumps/all_small_v1', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 10000, '_save_checkpoints_secs': None, '_session_config': graph_options {
  rewrite_options {
    disable_meta_optimizer: true
  }
}
cluster_def {
  job {
    name: "worker"
    tasks {
      key: 0
      value: "10.43.199.66:8470"
    }
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_checkpoint_save_graph_def': True, '_service': None, '_cluster_spec': ClusterSpec({'worker': ['10.43.199.66:8470']}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': 'grpc://10.43.199.66:8470', '_eval

INFO:absl:Automatically caching small dataset in memory: 'codeANDcomment_code_small_v1:validation'
INFO:absl:Padding 'codeANDcomment_code_small_v1' with sequence lengths: {'inputs': 913, 'targets': 179}
INFO:absl:Automatically caching small dataset in memory: 'code_code_small_v1:validation'
INFO:absl:Padding 'code_code_small_v1' with sequence lengths: {'inputs': 913, 'targets': 179}


INFO:tensorflow:num_cores_per_replica: 1
INFO:tensorflow:computation_shape: [1, 1, 1, 1]
INFO:tensorflow:num_replicas: 8
INFO:tensorflow:device_assignment.topology.device_coordinates: [[[0 0 0 0]
  [0 0 0 1]
  [1 0 0 0]
  [1 0 0 1]
  [0 1 0 0]
  [0 1 0 1]
  [1 1 0 0]
  [1 1 0 1]]]
INFO:tensorflow:device_assignment.core_assignment: [[[0 0 0 0]]

 [[0 0 0 1]]

 [[1 0 0 0]]

 [[1 0 0 1]]

 [[0 1 0 0]]

 [[0 1 0 1]]

 [[1 1 0 0]]

 [[1 1 0 1]]]
INFO:tensorflow:auto_logical_to_physical_tpu logical_shape=[8] physical_shape=[2, 2, 2]
INFO:tensorflow:auto_logical_to_physical_tpu logical_to_physical = [(0, 0, 0), (0, 0, 1), (0, 1, 0), (0, 1, 1), (1, 1, 0), (1, 1, 1), (1, 0, 0), (1, 0, 1)]
INFO:tensorflow:SimdMeshImpl init: Shape[batch=8] LayoutRules{('ensemble', 'ensemble'), ('experts', 'batch'), ('batch', 'batch'), ('d_ff', 'model'), ('vocab', 'model'), ('heads', 'model')}
INFO:tensorflow:Device Assignment: <tensorflow.python.tpu.device_assignment.DeviceAssignment object at 0x7fd54817ff50>
INF

# Prediction 

In [None]:
# Write the input to predict on. Replace the task prefix ("code2code: ")
# with the one of your choice; see the tasks defined above.
with open("./input.txt", "w") as prompt_file:
  prompt_file.write('code2code: "your code here"')

# Predict with the last checkpoint (checkpoint_steps=-1) and write the
# result to ./output.txt.
model.predict(
    input_file='./input.txt',
    output_file='./output.txt',
    checkpoint_steps=-1,
    vocabulary=get_default_vocabulary(),
    beam_size=1,
    temperature=1.0,
    keep_top_k=-1,
)

INFO:root:system_path_file_exists:gs://code_review_automation/replication_package/fine_tuning_model_dumps/all_small_v1/operative_config.gin
ERROR:root:Path not found: gs://code_review_automation/replication_package/fine_tuning_model_dumps/all_small_v1/operative_config.gin


INFO:tensorflow:Using config: {'_model_dir': 'gs://code_review_automation/replication_package/fine_tuning_model_dumps/all_small_v1', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 10000, '_save_checkpoints_secs': None, '_session_config': graph_options {
  rewrite_options {
    disable_meta_optimizer: true
  }
}
cluster_def {
  job {
    name: "worker"
    tasks {
      key: 0
      value: "10.125.53.98:8470"
    }
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_checkpoint_save_graph_def': True, '_service': None, '_cluster_spec': ClusterSpec({'worker': ['10.125.53.98:8470']}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': 'grpc://10.125.53.98:8470', '_eval