Copyright 2024 Google LLC.

SPDX-License-Identifier: Apache-2.0

This Colab example starts a tuning job on the base model `translation-llm-002` using a [translation dataset](https://cloud.google.com/translate/docs/advanced/automl-quickstart) that you provide (legacy datasets are not supported).

## Prerequisites

*   Enable the Vertex AI API on the APIs & Services page.
*   We recommend running this notebook in Colab, which best supports the authentication flow and the Cloud CLI.

Prerequisite: Enable Vertex AI API in APIs & services page.

## Authentication

In [None]:
from google.colab import auth

# Google Cloud project used for authentication here and for the Translation
# and Vertex AI calls in the cells below.
PROJECT_ID = "my-project"  # @param {type:"string"}
# Authenticate the current Colab user against that project.
auth.authenticate_user(project_id=PROJECT_ID)

In [None]:
# Set PROJECT_ID as the active project in the gcloud configuration.
!gcloud config set project {PROJECT_ID}

## Input parameters

Quick start: first import a native dataset in [AutoML](https://console.cloud.google.com/translation/datasets), then copy the dataset ID shown under its display name. Fill in the required parameters below.

By default, the name of the tuned model to use in translate-text requests is returned once tuning finishes. For reference, tuning takes less than 20 minutes for a dataset with 10k training examples.

In [None]:
# GCS directory (gs://bucket/path) that receives the exported dataset and the
# converted .jsonl files.
GCS_EXPORT_PATH = 'gs://my_bucket/dir' # @param {type:"string"}

# ID of the AutoML Translation dataset to export. Only native datasets are
# supported.
DATASET_ID = '123abc' # @param {type:"string"}

# Display name for the tuned model, as shown on the Vertex AI Online
# Prediction page.
TUNED_MODEL_DISPLAY_NAME = 'translation-llm-test' # @param {type:"string"}

# Maximum number of training examples to convert. Set to -1 to use all
# examples.
TRAIN_DATASET_SAMPLE_SIZE = -1 # @param {type:"integer"}

# Maximum number of validation examples to convert. Validation size limit is
# 1000.
VALIDATION_DATASET_SAMPLE_SIZE = 250 # @param {type:"integer"}


## Helper functions

In [None]:
# Region used for both the Translation dataset and the Vertex AI tuning job.
# Only us-central1 is supported for now.
LOCATION = "us-central1"

# Language code -> English language name. The name is what gets written into
# the tuning prompt, and check_dataset rejects any dataset whose source or
# target code is missing from this table.
language_map = {
    "en": "English",
    "es": "Spanish",
    "fr": "French",
    "de": "German",
    "it": "Italian",
    "pt": "Portuguese",
    "zh": "Chinese",
    "ja": "Japanese",
    "ko": "Korean",
    "ar": "Arabic",
    "hi": "Hindi",
    "ru": "Russian",
}

In [None]:
import csv
import json
import glob
import os
import time

from google.cloud import translate_v3
from google.cloud import storage

import vertexai
from vertexai.tuning import sft


# Builds one JSONL record (a single tuning example) from a sentence pair.
def convert_line_to_jsonl(source_language, target_language, source_sentence, target_sentence):
  """Serialize one sentence pair as a single-line JSON tuning example.

  The user turn is the prompt "<source_language>: <source_sentence>
  <target_language>: " and the model turn is the target sentence.

  Args:
    source_language: Name of the source language placed in the prompt.
    target_language: Name of the target language placed in the prompt.
    source_sentence: Text to translate.
    target_sentence: Reference translation.

  Returns:
    A JSON string; non-ASCII characters are written as-is, not escaped.
  """
  prompt = "".join((source_language, ": ", source_sentence, " ", target_language, ": "))
  user_turn = {"role": "user", "parts": [{"text": prompt}]}
  model_turn = {"role": "model", "parts": [{"text": target_sentence}]}
  return json.dumps({"contents": [user_turn, model_turn]}, ensure_ascii=False)


# Checks that the dataset exists and extracts its language pair.
def check_dataset(project_id, location, dataset_id):
  """Look up a Translation dataset and return its language codes.

  Args:
    project_id: Google Cloud project that owns the dataset.
    location: Region of the dataset.
    dataset_id: ID of the dataset to look up.

  Returns:
    A `(source_language_code, target_language_code)` tuple.

  Raises:
    ValueError: If the dataset lookup fails, or if either language code is
      not a key of `language_map`.
  """
  translation_client = translate_v3.TranslationServiceClient()
  request = translate_v3.GetDatasetRequest(
    name=f"projects/{project_id}/locations/{location}/datasets/{dataset_id}",
  )

  # Keep the try body to the API call only, so the language validation below
  # is not reported as a dataset lookup failure.
  try:
    response = translation_client.get_dataset(request=request)
  except Exception as e:
    raise ValueError(f"Error getting dataset: {e}") from e
  print(response)

  source_language_code = response.source_language_code
  target_language_code = response.target_language_code
  unsupported_codes = [
    code for code in (source_language_code, target_language_code)
    if code not in language_map
  ]
  if unsupported_codes:
    raise ValueError("Invalid language code: " + ", ".join(unsupported_codes))
  return source_language_code, target_language_code


# Exports the dataset to a GCS directory and waits for the operation to end.
def export_data(project_id, location, dataset_id, gcs_export_path):
  """Export a Translation dataset to Cloud Storage.

  Args:
    project_id: Google Cloud project that owns the dataset.
    location: Region of the dataset.
    dataset_id: ID of the dataset to export.
    gcs_export_path: GCS URI prefix (without trailing slash) to export under.

  Returns:
    The string `<gcs_export_path>/exported_<dataset_id>_<operation id>` when
    the finished operation's metadata carries no error message, otherwise
    the empty string. The path is assembled here by naming convention;
    presumably it matches the directory the service creates.
  """
  client = translate_v3.TranslationServiceClient()
  dataset_name = f"projects/{project_id}/locations/{location}/datasets/{dataset_id}"

  # Point the export at the requested GCS prefix.
  destination = translate_v3.DatasetOutputConfig()
  destination.gcs_destination.output_uri_prefix = gcs_export_path

  operation = client.export_data(
    request=translate_v3.ExportDataRequest(
      dataset=dataset_name,
      output_config=destination,
    )
  )

  print("Waiting for operation to complete...")

  # Poll every five seconds until the long-running operation reports done.
  while True:
    if operation.done():
      break
    time.sleep(5)

  error_message = operation.metadata.error.message
  if error_message:
    print("Dataset exported failed.")
    print(error_message)
    return ""

  print("Dataset exported successfully.")
  # The last path component of the operation name identifies this export.
  operation_id = operation.operation.name.rpartition('/')[2]
  exported_dir = gcs_export_path + '/exported_' + dataset_id + '_' + operation_id
  print(exported_dir)
  return exported_dir


# Converts one exported split (train or validation) from TSV to JSONL.
def _convert_split(colab_path, split, source_language, target_language, sample_size):
  """Write `<split>.jsonl` in `colab_path` from the `<split>*` TSV files.

  Stops as soon as `sample_size` examples have been written. A value the
  running count never reaches (such as -1) converts every row.
  """
  output_path = os.path.join(colab_path, split + '.jsonl')
  # Sort for a reproducible sample (glob order depends on the file system),
  # and drop the output file itself: it also matches `<split>*` once this
  # function has run before, and must not be read back as input.
  input_paths = sorted(
    path for path in glob.glob(colab_path + '/' + split + '*')
    if os.path.abspath(path) != os.path.abspath(output_path)
  )

  count = 0
  with open(output_path, 'w', encoding='utf-8') as outfile:
    for input_path in input_paths:
      # newline='' leaves line-ending handling to the csv module, as its
      # documentation recommends.
      with open(input_path, 'r', encoding='utf-8', newline='') as infile:
        for row in csv.reader(infile, delimiter='\t'):
          # Skip blank or malformed rows that lack a source/target pair.
          if len(row) < 2:
            continue
          outfile.write(convert_line_to_jsonl(source_language, target_language, row[0], row[1]))
          outfile.write('\n')
          count += 1
          if count == sample_size:
            return


# Format conversion function as part of the AutoML export workflow.
def convert_exported_files(colab_path, source_language_code, target_language_code, train_dataset_sample_size, validation_dataset_sample_size):
  """Convert the exported TSV files into `.jsonl` tuning files.

  Every file in `colab_path` whose name starts with `train` or `validation`
  is read as tab-separated text (column 0 = source sentence, column 1 =
  target sentence). The results are written to `train.jsonl` and
  `validation.jsonl` in the same directory.

  Args:
    colab_path: Local directory holding the exported files.
    source_language_code: Key of `language_map` for the source language.
    target_language_code: Key of `language_map` for the target language.
    train_dataset_sample_size: Stop after this many training examples; -1
      converts all of them.
    validation_dataset_sample_size: Stop after this many validation
      examples; -1 converts all of them.

  Raises:
    KeyError: If either language code is not in `language_map`.
  """
  # Resolve the prompt language names once instead of once per row.
  source_language = language_map[source_language_code]
  target_language = language_map[target_language_code]

  _convert_split(colab_path, 'train', source_language, target_language, train_dataset_sample_size)
  _convert_split(colab_path, 'validation', source_language, target_language, validation_dataset_sample_size)

  print("File conversion completed.")


# Starts the tuning job and waits for it to end.
def train_model(train_dataset_path, validation_dataset_path, tuned):
  """Tune `translation-llm-002` and return the custom translation model name.

  Uses the notebook globals `PROJECT_ID` and `LOCATION` for the Vertex AI
  session and for the returned model name.

  Args:
    train_dataset_path: Location of the training `.jsonl` file, passed to
      `sft.train` as `train_dataset`.
    validation_dataset_path: Location of the validation `.jsonl` file,
      passed to `sft.train` as `validation_dataset`.
    tuned: Display name to give the tuned model.

  Returns:
    `projects/<PROJECT_ID>/locations/<LOCATION>/models/translation-llm-custom/<endpoint id>`.

  Raises:
    RuntimeError: If the job ends without a tuned model endpoint.
  """
  vertexai.init(project=PROJECT_ID, location=LOCATION)

  sft_tuning_job = sft.train(
    source_model="translation-llm-002",
    train_dataset=train_dataset_path,
    validation_dataset=validation_dataset_path,
    # Honor the caller-supplied display name rather than the notebook global.
    tuned_model_display_name=tuned,
  )

  # Poll once a minute until the job reaches a terminal state.
  while not sft_tuning_job.has_ended:
    time.sleep(60)
    sft_tuning_job.refresh()

  # An ended job is not necessarily a successful one. Without an endpoint
  # there is no usable model name to build, so fail loudly instead of
  # returning a malformed one.
  endpoint_name = sft_tuning_job.tuned_model_endpoint_name
  if not endpoint_name:
    raise RuntimeError(
      f"Tuning job for '{tuned}' ended without a tuned model endpoint."
    )

  endpoint_short_name = endpoint_name.rsplit('/', 1)[-1]
  custom_model_name = f"projects/{PROJECT_ID}/locations/{LOCATION}/models/translation-llm-custom/{endpoint_short_name}"

  print("Model: ", custom_model_name)
  return custom_model_name


## Export dataset from AutoML

In [None]:
# Drop trailing slashes so paths built from the prefix never contain '//'.
GCS_EXPORT_PATH = GCS_EXPORT_PATH.rstrip('/')
# Confirms the dataset exists and that both of its languages are supported.
SOURCE_LANGUAGE_CODE, TARGET_LANGUAGE_CODE = check_dataset(PROJECT_ID, LOCATION, DATASET_ID)
exported_bucket = export_data(PROJECT_ID, LOCATION, DATASET_ID, GCS_EXPORT_PATH)
# export_data returns "" when the export fails. Stop here in that case: with
# an empty path the later cells would resolve the local working directory to
# '/content/' itself, and the optional cleanup cell would `rm -rf` it.
if not exported_bucket:
  raise RuntimeError("Dataset export failed; see the error printed above.")

In [None]:
# Copy the exported dataset directory from GCS into /content on the Colab VM.
!gsutil cp -r {exported_bucket} '/content/'

### Optional: Delete exported files in the GCS bucket

In [None]:
# Recursively delete the exported files from GCS. Run this only after the
# copy to /content has finished; the conversion step reads the local copy.
!gsutil rm -r {exported_bucket}

## Dataset Format Conversion

This step converts the data to `.jsonl` format for tuning.

In [None]:
# Local copy of the export: /content/<last path component of exported_bucket>.
colab_path = os.path.join('/content/', exported_bucket.rpartition('/')[2])
convert_exported_files(
  colab_path,
  SOURCE_LANGUAGE_CODE,
  TARGET_LANGUAGE_CODE,
  TRAIN_DATASET_SAMPLE_SIZE,
  VALIDATION_DATASET_SAMPLE_SIZE,
)
# Files written by convert_exported_files ...
train_jsonl = os.path.join(colab_path, 'train.jsonl')
validation_jsonl = os.path.join(colab_path, 'validation.jsonl')
# ... and the GCS locations they are uploaded to for tuning.
train_dataset_path = f"{GCS_EXPORT_PATH}/{DATASET_ID}_train.jsonl"
validation_dataset_path = f"{GCS_EXPORT_PATH}/{DATASET_ID}_validation.jsonl"

In [None]:
# Upload the converted JSONL files to GCS; these gs:// paths are the ones
# passed to train_model.
!gsutil cp {train_jsonl} {train_dataset_path}
!gsutil cp {validation_jsonl} {validation_dataset_path}

### Optional: Remove dataset copied to Colab

In [None]:
# Delete the local working directory, including the generated .jsonl files.
# Run this only after the upload to GCS has finished.
!rm -rf {colab_path}

## Initiate Vertex Tuning Request

When tuning finishes, the name of the tuned translation model is returned; use it in translation requests.

In [None]:
# Blocks until the tuning job has ended, then keeps the model name to use in
# translation requests.
custom_model_name = train_model(
  train_dataset_path=train_dataset_path,
  validation_dataset_path=validation_dataset_path,
  tuned=TUNED_MODEL_DISPLAY_NAME,
)