In [None]:
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Fine-tuning Gemini for Domain Specificity

This notebook demonstrates how to fine-tune Gemini to translate text from multiple languages (DE, ES, FR, IT, PL, PT, RU, SV, UK, ZH) into English.  
It follows these steps:  
  - Read the dataset from a GCS bucket
  - Save it in the Apache Iceberg format using Dataproc Serverless
  - Run a Vertex AI supervised fine-tuning job to fine-tune Gemini with the dataset
  - Register the tuned model in Vertex AI Model Registry

#### Setup

In [None]:
!pip3 install google-cloud-bigquery google-cloud-dataproc google-cloud-storage -q

In [None]:
from typing import List
import pandas as pd
import time
import json

from google.cloud import bigquery, storage
from pyspark.sql.types import LongType, StringType, DoubleType

# Spark SQL type classes mapped to the short type names used when the
# DataFrame schema is rendered as a column-definition string.
type_mapping = {
    LongType: 'long',
    StringType: 'string',
    DoubleType: 'double',
}

#### Config

In this notebook we are going to create Iceberg tables, so you need a kernel running PySpark with Iceberg support. You can get one by using a [Serverless Apache Spark runtime template](https://docs.cloud.google.com/dataproc-serverless/docs/quickstarts/jupyterlab-sessions).

To create the runtime template, go to:
- JupyterLab/VSCode Settings ->
- Google Cloud Settings ->
- Create Serverless Runtime Template ->
- Metastore ->
- BigLake Metastore ->
- Set a data warehouse GCS bucket and choose your ICEBERG_CATALOG name

Then select the runtime you created as the Jupyter kernel.

In [None]:
# Google Cloud project and region. LOCATION is used both for the GCS bucket
# and for the Vertex AI client.
PROJECT_ID = "<YOUR_PROJECT>"
LOCATION = "<YOUR_LOCATION>"

# Input dataset: the wildcard matches every file under the prefix.
JSON_FILES_GCS_URI = "gs://dataproc-metastore-public-binaries/wikipedia_translated_clusters/*"

# Bucket that receives the JSONL files used for fine-tuning.
BUCKET_NAME = "<YOUR_BUCKET_NAME>"

# Iceberg catalog / namespace / table that store the transformed dataset.
ICEBERG_CATALOG = "biglake"
ICEBERG_SCHEMA = "default"
ICEBERG_TABLE = "wikipedia_translated_docs"
ICEBERG_WAREHOUSE_GCS_PATH = "gs://<YOUR_ICEBERG_WAREHOUSE_GCS_BUCKET>/warehouse"

# Backward-compatible aliases for the original, misspelled constant names,
# which other cells still reference.
ICERBERG_TABLE = ICEBERG_TABLE
ICEBERG_WAREHOURSE_GCS_PATH = ICEBERG_WAREHOUSE_GCS_PATH

Make sure you have [gcloud](https://docs.cloud.google.com/sdk/docs/install-sdk) installed and authenticated by running `gcloud auth application-default login` in your terminal.

In [None]:
# Point the gcloud CLI at the project configured in PROJECT_ID.
!gcloud config set project $PROJECT_ID

Create a GCS bucket if you do not have one

In [None]:
from google.cloud import storage

# Use the configured project explicitly (as upload_gcs does) instead of
# whichever default project the environment's credentials resolve to.
storage_client = storage.Client(project=PROJECT_ID)
bucket = storage_client.bucket(BUCKET_NAME)

# Create the bucket only when it is missing, so the cell can be re-run.
if not bucket.exists():
    bucket = storage_client.create_bucket(BUCKET_NAME, location=LOCATION)
    print(f"Created bucket {BUCKET_NAME}")
else:
    print(f"Bucket {BUCKET_NAME} already exists")

#### Read input dataset

In [None]:
from pyspark.sql import SparkSession

# Get the active Spark session, or create one named "IcebergApp" if none exists.
session_builder = SparkSession.builder.appName("IcebergApp")
spark = session_builder.getOrCreate()

In [None]:
# Load every JSON file matched by the wildcard URI into a single DataFrame.
raw_dataset = spark.read.json(JSON_FILES_GCS_URI)

#### Transform the dataset

In [None]:
from pyspark.sql.functions import explode, array, lit, col, struct, desc, concat
from pyspark.sql.types import StringType

# Every top-level column of the raw dataset is read as a struct with
# `title`, `intro` and `translated_intro` fields; the column name itself is
# carried along as the "topic".
columns = raw_dataset.columns

# One struct expression per source column.
document_structs = [
    struct(
        lit(column).alias("topic"),
        col(f"`{column}`.title").alias("title"),
        col(f"`{column}`.intro").alias("intro"),
        col(f"`{column}`.translated_intro").alias("translated_intro"),
    )
    for column in columns
]

# Turn the array of structs into one row per struct.
exploded_df = raw_dataset.select(
    explode(array(document_structs)).alias("exploded")
)

# Flatten the struct, attach the constant instruction prompt, drop rows with
# any null value, and sort by title from Z to A.
transformed_df = (
    exploded_df
    .select(
        col("exploded.title").alias("title"),
        lit("Translate this intro to english: ").alias("prompt"),
        col("exploded.intro").alias("intro"),
        col("exploded.translated_intro").alias("translated_intro"),
    )
    .dropna(how="any")
    .orderBy(desc("title"))
)

In [None]:
# Inspect the column names and types of the transformed DataFrame.
transformed_df.printSchema()

In [None]:
# Display the schema of the resulting DataFrame
transformed_df.printSchema()

# Show a sample of the resulting DataFrame:
# up to 40 rows, with string values truncated to 50 characters
transformed_df.show(40, 50)

#### Create Apache Iceberg table in catalog

In [None]:
# Order matters: switch to the Iceberg catalog first, then make sure the
# namespace exists inside it, then make that namespace the current one.
spark.sql(f"USE `{ICEBERG_CATALOG}`;")
spark.sql(f"CREATE NAMESPACE IF NOT EXISTS `{ICEBERG_SCHEMA}`;")
spark.sql(f"USE `{ICEBERG_SCHEMA}`;")

In [None]:
# Render the DataFrame schema as a "(name type, name type, ...)" string.
# Types listed in type_mapping use their short name; any other type falls
# back to the string form of the Spark data type.
column_definitions = [
    f"{field.name} {type_mapping.get(type(field.dataType), str(field.dataType))}"
    for field in transformed_df.schema.fields
]
dataset_schema = f"({', '.join(column_definitions)})"
dataset_schema

In [None]:
# Create the Iceberg table from the transformed data, replacing it if it
# already exists. A plain `.save()` uses Spark's default "error if exists"
# save mode, so re-running the notebook would fail once the table was created.
(
    transformed_df
    .writeTo(f"{ICEBERG_CATALOG}.{ICEBERG_SCHEMA}.{ICERBERG_TABLE}")
    .using("iceberg")
    .createOrReplace()
)

In [None]:
# List the namespaces (schemas) available in the Iceberg catalog.
spark.sql(f"SHOW SCHEMAS IN {ICEBERG_CATALOG};").show()

In [None]:
# List the tables in the namespace the dataset was written to.
spark.sql(f"SHOW TABLES IN {ICEBERG_CATALOG}.{ICEBERG_SCHEMA};").show()

#### Read Apache Iceberg table

In [None]:
# Load the table back through the catalog and preview its first rows.
iceberg_table_name = f"{ICEBERG_CATALOG}.{ICEBERG_SCHEMA}.{ICERBERG_TABLE}"
iceberg_df = spark.read.table(iceberg_table_name)
iceberg_df.show()

#### Generate dataset for finetuning

In [None]:
from pyspark.sql.functions import concat, col

# Model input: the instruction prefix followed by the source-language intro.
input_prompt_col = concat(col("prompt"), col("intro")).alias("input_prompt")
# Expected output: the translated intro.
expected_output_col = col("translated_intro").alias("expected_model_output")

finetune_dataset = iceberg_df.select(input_prompt_col, expected_output_col)

# Preview 5 rows, truncating strings to 100 characters.
finetune_dataset.show(5, 100)

In [None]:
# Collect the Spark DataFrame into a pandas DataFrame.
finetune_dataset_pandas = finetune_dataset.toPandas()

# 80/20 split with a fixed seed: sample the training rows, then keep every
# row whose index label was not sampled as the held-out set.
train_set = finetune_dataset_pandas.sample(frac=0.8, random_state=42)
held_out_mask = ~finetune_dataset_pandas.index.isin(train_set.index)
test_set = finetune_dataset_pandas.loc[held_out_mask]

In [None]:
def generate_records(df: pd.DataFrame) -> List:
    """Convert prompt/response rows into chat-style tuning records.

    Args:
        df: DataFrame with an ``input_prompt`` column (text of the user turn)
            and an ``expected_model_output`` column (text of the model turn).

    Returns:
        One dict per row, in row order, shaped as
        ``{"contents": [{"role": "user", ...}, {"role": "model", ...}]}``.
        An empty DataFrame yields an empty list.
    """
    # Nothing to convert; this also covers a frame that has no columns at all.
    if df.empty:
        return []

    # Iterate the two columns directly instead of DataFrame.iterrows(), which
    # builds a new Series object for every row.
    return [
        {
            "contents": [
                {"role": "user", "parts": [{"text": input_prompt}]},
                {"role": "model", "parts": [{"text": expected_model_output}]},
            ]
        }
        for input_prompt, expected_model_output in zip(
            df["input_prompt"], df["expected_model_output"]
        )
    ]

In [None]:
# Cap the number of rows BEFORE building records, so records are only
# generated for the rows that are actually used (same result as slicing the
# full record list afterwards).
train_records = generate_records(train_set.head(1000))  # 1000 training records for fine-tuning
val_records = generate_records(test_set.head(300))      # 300 evaluation records for fine-tuning

In [None]:
# Object names (paths inside the bucket) for the training and validation JSONL files.
TRAIN_FILE_NAME = "wikipedia_translated/records/fine-tuning-train-dataset.jsonl"
VAL_FILE_NAME = "wikipedia_translated/records/fine-tuning-val-dataset.jsonl"

In [None]:
def upload_gcs(records: List, file_name: str, bucket_name: str = BUCKET_NAME, project_id: str = PROJECT_ID) -> str:
    """Serialize records as JSON Lines and upload them to Cloud Storage.

    Args:
        records: JSON-serializable items; each one becomes a line of the file.
        file_name: Object name to write inside the bucket.
        bucket_name: Destination bucket name.
        project_id: Project used to create the storage client.

    Returns:
        The ``gs://`` URI of the uploaded object.
    """
    target_blob = (
        storage.Client(project=project_id)
        .bucket(bucket_name)
        .blob(file_name)
    )

    # One JSON document per line, with no trailing newline.
    serialized_lines = [json.dumps(record) for record in records]
    target_blob.upload_from_string("\n".join(serialized_lines))

    return f"gs://{bucket_name}/{file_name}"

In [None]:
# Upload both splits and keep their gs:// URIs for the tuning job.
uri_train = upload_gcs(train_records, TRAIN_FILE_NAME)
uri_val = upload_gcs(val_records, VAL_FILE_NAME)

#### Run finetuning job on Vertex AI

In [None]:
from google import genai
from google.genai import types

In [None]:
# Gen AI SDK client that goes through Vertex AI for the configured project and location.
client = genai.Client(vertexai=True, project=PROJECT_ID, location=LOCATION)

In [None]:
# Base model that the tuning job starts from.
GEMINI_MODEL = "gemini-2.5-flash"

**Warning**: fine-tuning the model can take more than one hour.

In [None]:
tuned_model_name = "tuned_gemini_translation"

# Hyperparameters, display name and validation split for the tuning job.
tuning_config = types.CreateTuningJobConfig(
    epoch_count=8,
    tuned_model_display_name=tuned_model_name,
    adapter_size="ADAPTER_SIZE_FOUR",
    learning_rate_multiplier=0.8,
    validation_dataset=types.TuningDataset(gcs_uri=uri_val),
)

# Submit the tuning job for the base model with the uploaded training split.
tuning_job = client.tunings.tune(
    base_model=GEMINI_MODEL,
    training_dataset=types.TuningDataset(gcs_uri=uri_train),
    config=tuning_config,
)

In [None]:
# Poll every 30 seconds until the tuning job reaches a terminal state.
while not tuning_job.has_ended:
    time.sleep(30)
    tuning_job = client.tunings.get(name=tuning_job.name)

# A terminal state is not necessarily success. Fail here with a clear message
# instead of later, when the tuned model's attributes are read.
if tuning_job.state != types.JobState.JOB_STATE_SUCCEEDED:
    raise RuntimeError(
        f"Tuning job {tuning_job.name} ended in state {tuning_job.state}: {tuning_job.error}"
    )

In [None]:
# Print the tuned model's `model` and `endpoint` fields, and the job's `experiment` field.
print(tuning_job.tuned_model.model)
print(tuning_job.tuned_model.endpoint)
print(tuning_job.experiment)

In [None]:
def predict(prompt, model, temperature=0.5, max_output_tokens=128):
    """Generate a plain-text response for a single user prompt.

    Args:
        prompt: Text sent as the only user turn.
        model: Model identifier passed to ``client.models.generate_content``
            (for example a tuned model's endpoint).
        temperature: Sampling temperature. Defaults to 0.5.
        max_output_tokens: Upper bound on the number of generated tokens.
            Defaults to 128; raise it for long inputs, since generation
            stops at this limit.

    Returns:
        The ``text`` attribute of the generation response.
    """
    contents = [
        types.Content(role="user", parts=[types.Part.from_text(text=prompt)])
    ]

    generate_content_config = types.GenerateContentConfig(
        temperature=temperature,
        max_output_tokens=max_output_tokens,
        response_mime_type="text/plain",
        # NOTE(review): the threshold is attached to HARM_CATEGORY_UNSPECIFIED;
        # confirm this is the intended category.
        safety_settings=[
            types.SafetySetting(
                category='HARM_CATEGORY_UNSPECIFIED',
                threshold='BLOCK_ONLY_HIGH',
            )
        ],
    )

    response = client.models.generate_content(
        model=model,
        contents=contents,
        config=generate_content_config,
    )
    return response.text

In [None]:
# Sample request: the same instruction prefix used to build the training
# prompts, followed by a Portuguese intro to translate.
input_prompt = """Translate this intro to english: You (Estilizado como YOU - Sendo nomeado no Brasil como Você, em Portugal como Tu) é uma série de televisão americana de suspense psicológico desenvolvida por Greg Berlanti e Sera Gamble.
Produzido pela Warner Horizon Television, em associação com Alloy Entertainment e A&E Studios. A série é baseada no romance de 2014 de mesmo nome de Caroline Kepnes.
A primeira temporada segue Joe Goldberg, gerente de uma livraria de Nova York e um serial killer que se apaixona por uma cliente chamada Guinevere Beck e rapidamente desenvolve uma obsessão extrema, tóxica e delirante.
A segunda temporada segue Joe enquanto ele se muda para Los Angeles e se apaixona por Love Quinn, chef e sócia de uma rede de produtos naturais.
A primeira temporada, que foi lançada em 2018, é estrelada por Penn Badgley, Elizabeth Lail, Luca Padovan, Zach Cherry e Shay Mitchell.
Para a segunda temporada, Ambyr Childers foi promovida a regular na série, juntando-se aos recém-escalados Victoria Pedretti, James Scully, Jenna Ortega e Carmela Zumbado.
A série estreou na Lifetime em 9 de setembro de 2018 nos Estados Unidos e transmitida internacionalmente pela Netflix em 26 de dezembro de 2018.
A série atraiu um público limitado na Lifetime antes de se tornar mais popular e um sucesso crítico para a Netflix, com mais de 43 milhões de espectadores tendo transmitido a primeira temporada após sua estreia no serviço de streaming.
A Lifetime anunciou que You foi renovada para uma segunda temporada baseada no romance seguinte de Kepnes, Hidden Bodies, em 26 de julho de 2018, antes da estreia da série.
Em dezembro de 2018, foi anunciado que a série mudaria para a Netflix como um título Original Netflix.
A segunda temporada foi lançada exclusivamente na Netflix em 26 de dezembro de 2019.
Em janeiro de 2020, a série foi renovada para uma terceira temporada pela Netflix, que conta com Badgley e Pedretti reprisando seus papéis.
No dia 30 de agosto de 2021, foi confirmado que a terceira temporada irá estrear dia 15 de outubro de 2021.
Em outubro de 2021, antes da estreia da terceira temporada, a série foi renovada para uma quarta temporada."""

In [None]:
# Send the sample prompt to the tuned model's endpoint and display the response text.
predict(input_prompt, tuning_job.tuned_model.endpoint)