<a href="https://colab.research.google.com/github/masies/CRA/blob/main/Pre_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install the notebook's dependencies into the running environment.
# NOTE(review): none of these installs pin a version, so a fresh run may pull
# different releases than the ones shown in the recorded output below.
!pip3 install tensorflow
# Line magic requesting the TensorFlow 2.x runtime (presumably Colab-only — confirm).
%tensorflow_version 2.x
!pip3 install --upgrade pip
# Alternative install of the released t5 package, left disabled.
#!pip install -qU t5
# Install t5 straight from its GitHub repository.
!pip3 install git+https://github.com/google-research/text-to-text-transfer-transformer.git #extra_id_x support

import functools
import os
import time
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

import tensorflow.compat.v1 as tf
tf.enable_eager_execution()

import tensorflow_datasets as tfds

import t5

# Set the base directory (Google Cloud Storage bucket) used by this notebook.
BASE_DIR = "gs://code_review_automation"

if not BASE_DIR or BASE_DIR == "gs://":
  raise ValueError("You must enter a BASE_DIR.")
ON_CLOUD = True

# Bind the TPU settings up front so later cells that read them do not hit a
# NameError when ON_CLOUD is switched off.
# NOTE(review): assumes t5's MtfModel accepts tpu=None for non-TPU runs — confirm.
TPU_ADDRESS = None
TPU_TOPOLOGY = None

if ON_CLOUD:
  import tensorflow_gcs_config
  from google.colab import auth
  # Set credentials for GCS reading/writing from Colab and TPU.
  TPU_TOPOLOGY = "2x2"
  try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
    TPU_ADDRESS = tpu.get_master()
    print('Running on TPU:', TPU_ADDRESS)
  except ValueError as err:
    # Raise an ordinary exception (not BaseException) and keep the resolver's
    # original error attached as the cause.
    raise RuntimeError(
        'ERROR: Not connected to a TPU runtime; switch the Colab runtime '
        'type to TPU and re-run this cell.') from err
  auth.authenticate_user()
  tf.config.experimental_connect_to_host(TPU_ADDRESS)
  tensorflow_gcs_config.configure_gcs_from_colab_auth()

# Eager execution was enabled right after importing TensorFlow; from here on
# the TF2 behaviours are switched off again.
tf.disable_v2_behavior()

# Improve logging.
from contextlib import contextmanager
import logging as py_logging

# On cloud runs, keep TensorFlow's log records from propagating to ancestor
# loggers and set the root logger's level to INFO.
if ON_CLOUD:
  tensorflow_logger = tf.get_logger()
  tensorflow_logger.propagate = False
  py_logging.getLogger().setLevel('INFO')

@contextmanager
def tf_verbosity_level(level):
  """Temporarily switch TensorFlow's logging verbosity inside a `with` block.

  Args:
    level: verbosity value handed to `tf.logging.set_verbosity`.

  Yields:
    None. The verbosity that was active on entry is restored on exit, also
    when the body of the `with` block raises.
  """
  og_level = tf.logging.get_verbosity()
  tf.logging.set_verbosity(level)
  try:
    yield
  finally:
    # Without the finally clause an exception in the body would leave the
    # temporary verbosity in place for the rest of the session.
    tf.logging.set_verbosity(og_level)

Collecting pip
[?25l  Downloading https://files.pythonhosted.org/packages/fe/ef/60d7ba03b5c442309ef42e7d69959f73aacccd0d86008362a681c4698e83/pip-21.0.1-py3-none-any.whl (1.5MB)
[K     |████████████████████████████████| 1.5MB 6.8MB/s 
[?25hInstalling collected packages: pip
  Found existing installation: pip 19.3.1
    Uninstalling pip-19.3.1:
      Successfully uninstalled pip-19.3.1
Successfully installed pip-21.0.1
Collecting git+https://github.com/google-research/text-to-text-transfer-transformer.git
  Cloning https://github.com/google-research/text-to-text-transfer-transformer.git to /tmp/pip-req-build-lxejvb4v
  Running command git clone -q https://github.com/google-research/text-to-text-transfer-transformer.git /tmp/pip-req-build-lxejvb4v
Collecting mesh-tensorflow[transformer]>=0.1.13
  Downloading mesh_tensorflow-0.1.18-py3-none-any.whl (361 kB)
[K     |████████████████████████████████| 361 kB 6.9 MB/s 
Collecting rouge-score
  Downloading rouge_score-0.0.4-py2.py3-none-any

Instructions for updating:
non-resource variables are not supported in the long term


In [2]:
# Maps each dataset split name to the TSV file that backs it. The path is
# built from BASE_DIR so that changing the bucket in the setup cell is
# honoured here as well.
nq_tsv_path = {
    "train": os.path.join(BASE_DIR, 'dataset', 'pre-training.tsv')
}

In [3]:
from t5.data import postprocessors as t5_postprocessors
from t5.seqio import Feature,SentencePieceVocabulary

# Paths of the SentencePiece model and vocab files.
# They must be the same files that are used for the pre-training phase.
# Both are built from BASE_DIR so the bucket is configured in one place only.
vocab_model_path = os.path.join(BASE_DIR, 'models', 'TestModel.model')
vocab_path = os.path.join(BASE_DIR, 'vocab', 'TestModel.vocab')

# Short aliases for the t5 task registry and the TFDS-backed task class.
TaskRegistry = t5.data.TaskRegistry
TfdsTask = t5.data.TfdsTask

def get_default_vocabulary():
  """Build the SentencePiece vocabulary used by this notebook's task.

  Returns:
    A `SentencePieceVocabulary` created from `vocab_model_path` with 100 as
    its second argument (presumably the number of extra sentinel ids such as
    `<extra_id_0>` — verify against the seqio signature).
  """
  vocabulary = SentencePieceVocabulary(vocab_model_path, 100)
  return vocabulary

# Feature specification handed to the task: both sides are tokenized with the
# default vocabulary and no EOS token is appended by the library
# (presumably because the TSV text already ends in a literal `</s>` — verify).
DEFAULT_OUTPUT_FEATURES = dict(
    inputs=Feature(
        vocabulary=get_default_vocabulary(), add_eos=False, required=True),
    targets=Feature(
        vocabulary=get_default_vocabulary(), add_eos=False),
)

In [4]:
def nq_dataset_fn(split, shuffle_files=True):
  """Load one split of the pre-training TSV as a dataset of raw string pairs.

  Args:
    split: key into `nq_tsv_path` naming the split to read.
    shuffle_files: accepted for interface compatibility and ignored, because
      each split is backed by a single file.

  Returns:
    A dataset whose elements are dicts with the entries "input" and "output",
    taken from the first and second tab-separated column of each line.

  Raises:
    KeyError: if `split` has no entry in `nq_tsv_path`.
  """
  # We only have one file for each split.
  del shuffle_files

  # Load lines from the text file as examples.
  ds = tf.data.TextLineDataset(nq_tsv_path[split])
  # `record_defaults` holds the value substituted for an empty field (its
  # dtype also fixes the column type). Empty strings keep a blank column
  # blank instead of turning it into the literal text "string".
  ds = ds.map(
      functools.partial(tf.io.decode_csv, record_defaults=["", ""],
                        field_delim="\t", use_quote_delim=False),
      num_parallel_calls=tf.data.experimental.AUTOTUNE)
  # Name the two decoded columns.
  ds = ds.map(lambda *ex: dict(zip(["input", "output"], ex)))
  return ds


# Print the first five raw training examples, converted to NumPy values.
raw_preview = tfds.as_numpy(nq_dataset_fn("train").take(5))
for ex in raw_preview:
  print(ex)

{'input': b'Bind indexed elements<extra_id_0> the supplied<extra_id_1> .<extra_id_2> param name the name<extra_id_3> the property to bind @param target the target bindable @<extra_id_4> elementBinder the binder to use for elements<extra_id_5> aggregate<extra_id_6> the aggregate type<extra_id_7> may be<extra_id_8> collection or an array @<extra_id_9> elementType the element type @param result the destination for results protected<extra_id_10> void<extra_id_11> Indexed(ConfigurationPropertyName name<extra_id_12> able<?> target, AggregateElementBinder elementBinder,<extra_id_13> aggregateType,<extra_id_14> elementType, IndexedCollectionSupplier result) { for (ConfigurationPropertySource source :<extra_id_15> getSource<extra_id_16> ()) { bindIndexed(source, name, target, elementBinder,<extra_id_17> , aggregateType, elementType); if (<extra_id_18> .<extra_id_19> Supplied() && result.get() != null) { return; } } }</s>', 'output': b'<extra_id_0> to<extra_id_1> collection<extra_id_2> @<extra_i

In [5]:
def preprocessing(ds):
  """Rename the raw TSV columns to the feature names used by the task.

  Args:
    ds: dataset whose elements are dicts with the entries 'input' and
      'output'.

  Returns:
    The mapped dataset; each element is a dict with the keys 'inputs' and
    'targets'.
  """
  def to_inputs_and_targets(ex):
    # Each join receives a one-element list, so the separator is never
    # inserted and the text passes through unchanged.
    source_text = tf.strings.join([ex['input']], separator=' ')
    target_text = tf.strings.join([ex['output']], separator=' ')
    return dict(inputs=source_text, targets=target_text)

  autotune = tf.data.experimental.AUTOTUNE
  return ds.map(to_inputs_and_targets, num_parallel_calls=autotune)

In [6]:
# Create a new training task.
# Any earlier registration under the same name is removed first, so this cell
# can be run more than once.
t5.data.TaskRegistry.remove('pretraining')
t5.data.TaskRegistry.add(
    "pretraining",
    t5.data.Task,
    dataset_fn=nq_dataset_fn,
    # Advertise only the splits that have a backing file in `nq_tsv_path`.
    # Declaring a split without a file would fail with a KeyError inside
    # `nq_dataset_fn` as soon as that split is requested.
    splits=list(nq_tsv_path),
    text_preprocessor=[preprocessing],
    output_features = DEFAULT_OUTPUT_FEATURES,
    metric_fns=[t5.evaluation.metrics.accuracy],
)

<t5.data.dataset_providers.FunctionTask at 0x7f2a9455fe10>

In [7]:
# Look up the registered task and print five training examples after the
# task's preprocessing and tokenization have been applied.
nq_task = t5.data.TaskRegistry.get("pretraining")
preview_lengths = {"inputs": 512, "targets": 512}
ds = nq_task.get_dataset(split="train", sequence_length=preview_lengths)
print("A few preprocessed training examples...")
preview_examples = tfds.as_numpy(ds.take(5))
for ex in preview_examples:
  print(ex)

  _tokenize, num_parallel_calls=tf.data.experimental.AUTOTUNE)


A few preprocessed training examples...
{'inputs_pretokenized': b'Apply caching configuration when appropriate to the given invoker. @param invoker the invoker to wrap @param timeToLive the maximum<extra_id_0> in milliseconds that a response can be cached<extra_id_1> return a caching version<extra_id_2> the invoker or<extra_id_3> original instance<extra_id_4> is not required public<extra_id_5> OperationInvoker<extra_id_6> (OperationInvoker invoker, long timeToLive)<extra_id_7> if<extra_id_8> timeToLive<extra_id_9> 0)<extra_id_10> return new CachingOperationInvoker(invoker, timeToLive); }<extra_id_11> invoker; }</s>', 'inputs': array([ 7989,  6198,   507,   156,  1566,    10,     4,   240, 20670,
           3,    21,    45, 20670,     4, 20670,    10,  2272,    21,
          45,     7, 30247,     4,  1641,    41,  9105,    24,   164,
        3258,    18,    26,  2945,    38,    13,   300,    51,    42,
        2851,    41,  9105,    24,   164,  3332,    18,    36,    13,
        6198,  

In [8]:
from mesh_tensorflow.transformer.learning_rate_schedules import learning_rate_schedule_noam

# See https://github.com/google-research/text-to-text-transfer-transformer if you want to scale up the model.
MODEL_SIZE = "small"

# Directory handed to the model as `model_dir`. It is built from BASE_DIR so
# that changing the bucket in the setup cell is honoured here as well.
MODEL_DIR = os.path.join(BASE_DIR, 'model_dumps')

# Per-size settings: (model_parallelism, train_batch_size, keep_checkpoint_max).
# An unknown MODEL_SIZE raises KeyError here.
model_parallelism, train_batch_size, keep_checkpoint_max = {
    "small": (1, 512, 16),
    "base": (2, 128, 8),
    "large": (8, 64, 4),
    "3B": (8, 16, 1),
    "11B": (8, 16, 1)}[MODEL_SIZE]


tf.io.gfile.makedirs(MODEL_DIR)

model = t5.models.MtfModel(
    model_dir=MODEL_DIR,
    tpu=TPU_ADDRESS,
    tpu_topology=TPU_TOPOLOGY,
    model_parallelism=model_parallelism,
    batch_size=train_batch_size,
    sequence_length={"inputs": 512, "targets": 512},
    learning_rate_schedule = learning_rate_schedule_noam,
    save_checkpoints_steps=5000,
    # The checkpoint cap is only applied for cloud runs.
    keep_checkpoint_max=keep_checkpoint_max if ON_CLOUD else None
)

In [None]:
# Copy the gin configuration file from the bucket into the local working directory.
!gsutil cp gs://code_review_automation/config/operative_config.gin ./operative_config.gin 
PATH_GIN_FILE = './operative_config.gin'
import gin
# Parse the gin file and run training while gin's configuration is unlocked.
# NOTE(review): the training call sits inside the unlock block; whether it
# needs to is not visible from here — keep the order unless confirmed.
with gin.unlock_config():    
    gin.parse_config_file(PATH_GIN_FILE)
    # Step count passed to model.train for the "pretraining" task.
    TRAIN_STEPS = 200000
    model.train("pretraining", steps=TRAIN_STEPS)

Copying gs://code_review_automation/config/operative_config.gin...
/ [0 files][    0.0 B/ 11.5 KiB]                                                / [1 files][ 11.5 KiB/ 11.5 KiB]                                                
Operation completed over 1 objects/11.5 KiB.                                     
INFO:tensorflow:Using config: {'_model_dir': 'gs://code_review_automation/model_dumps', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': None, '_session_config': graph_options {
  rewrite_options {
    disable_meta_optimizer: true
  }
}
cluster_def {
  job {
    name: "worker"
    tasks {
      key: 0
      value: "10.94.57.122:8470"
    }
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_ses

  _tokenize, num_parallel_calls=tf.data.experimental.AUTOTUNE)


INFO:tensorflow:num_cores_per_replica: 1
INFO:tensorflow:computation_shape: [1, 1, 1, 1]
INFO:tensorflow:num_replicas: 8
INFO:tensorflow:device_assignment.topology.device_coordinates: [[[0 0 0 0]
  [0 0 0 1]
  [1 0 0 0]
  [1 0 0 1]
  [0 1 0 0]
  [0 1 0 1]
  [1 1 0 0]
  [1 1 0 1]]]
INFO:tensorflow:device_assignment.core_assignment: [[[0 0 0 0]]

 [[0 0 0 1]]

 [[1 0 0 0]]

 [[1 0 0 1]]

 [[0 1 0 0]]

 [[0 1 0 1]]

 [[1 1 0 0]]

 [[1 1 0 1]]]
INFO:tensorflow:auto_logical_to_physical_tpu logical_shape=[8] physical_shape=[2, 2, 2]
INFO:tensorflow:auto_logical_to_physical_tpu logical_to_physical = [(0, 0, 0), (0, 0, 1), (0, 1, 0), (0, 1, 1), (1, 1, 0), (1, 1, 1), (1, 0, 0), (1, 0, 1)]
INFO:tensorflow:SimdMeshImpl init: Shape[batch=8] LayoutRules{('batch', 'batch'), ('d_ff', 'model'), ('heads', 'model'), ('experts', 'batch'), ('ensemble', 'ensemble'), ('vocab', 'model')}
INFO:tensorflow:Device Assignment: <tensorflow.python.tpu.device_assignment.DeviceAssignment object at 0x7f295289d050>
INF

In [None]:
# myModel = model

In [None]:
# Predicts targets from the given inputs.

# Args:
#   input_file: str, path to a text file containing newline-separated input
#     prompts to predict from.
#   output_file: str, path prefix of output file to write predictions to. Note
#     the checkpoint step will be appended to the given filename.
#   checkpoint_steps: int, list of ints, or None. If an int or list of ints,
#     inference will be run on the checkpoint files in model_dir whose
#     global steps are closest to the global steps provided. If None, run
#     inference continuously waiting for new checkpoints. If -1, get the
#     latest checkpoint from the model directory.
#   beam_size: int, a number >= 1 specifying the number of beams to use for
#     beam search.
#   temperature: float, a value between 0 and 1 (must be 0 if beam_size > 1)
#     0.0 means argmax, 1.0 means sample according to predicted distribution.
#   keep_top_k: integer, a value between 1 and the vocabulary size. When
#     sampling, only pick tokens that are in the k most likely.
#   vocabulary: vocabularies.Vocabulary object to use for tokenization, or
#     None to use the default SentencePieceVocabulary.


# model.predict(input_file="gs://code_review_automation/dataset/data/test.source", output_file="gs://code_review_automation/dataset/data/test.target",checkpoint_steps=-1, beam_size=1, temperature=1.0, vocabulary=SentencePieceVocabulary(vocab_model_path, 100))

In [None]:
# # Load the TensorBoard notebook extension
# %load_ext tensorboard

# import tensorflow as tf
# import datetime

# # Clear any logs from previous runs
# !rm -rf ./logs/ 

# %tensorboard --logdir logs/fit