In [None]:
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Banner advertising understanding

<table align="left">

  <td>
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/ai-ml-recipes/blob/main/notebooks/generative_ai/content_generation/banner_advertising_understanding.ipynb">
      <img src="https://avatars.githubusercontent.com/u/33467679?s=200&v=4" width="32px" alt="Colab logo"> Run in Colab
    </a>
  </td>
  <td>
    <a href="https://github.com/GoogleCloudPlatform/ai-ml-recipes/blob/main/notebooks/generative_ai/content_generation/banner_advertising_understanding.ipynb">
      <img src="https://github.githubassets.com/assets/GitHub-Mark-ea2971cee799.png" width="32px" alt="GitHub logo">
      View on GitHub
    </a>
  </td>
  <td>
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/ai-ml-recipes/main/notebooks/generative_ai/content_generation/banner_advertising_understanding.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo">
      Open in Vertex AI Workbench
    </a>
  </td>
  <td>
    <a href="https://console.cloud.google.com/bigquery/import?url=https://github.com/GoogleCloudPlatform/ai-ml-recipes/blob/main/notebooks/generative_ai/content_generation/banner_advertising_understanding.ipynb">
      <img src="https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTW1gvOovVlbZAIZylUtf5Iu8-693qS1w5NJw&s" alt="BQ logo" width="35">
      Open in BQ Studio
    </a>
  </td>
  <td>
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fai-ml-recipes%2Fmain%2Fnotebooks%2Fgenerative_ai%2Fcontent_generation%2Fbanner_advertising_understanding.ipynb">
    <img width="32px" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" alt="Google Cloud Colab Enterprise logo">
    Open in Colab Enterprise
    </a>
  </td>

</table>

## Overview

This notebook shows how to process ad banner images and generate enriched information about them, including a classification against a product taxonomy, using BigFrames and Gemini.  
It reads ad banner images from a GCS bucket and processes them with BigFrames and Gemini, leveraging BigQuery's distributed processing capabilities.  

### Setup

Make sure you or the service account running this notebook has the required permissions:

In [None]:
# Optional: uncomment and run these once PROJECT_ID and SERVICE_ACCOUNT are defined in the kernel.
# SERVICE_ACCOUNT must be a full IAM member string, e.g. 'serviceAccount:NAME@PROJECT.iam.gserviceaccount.com'.
#!gcloud projects add-iam-policy-binding {PROJECT_ID} --condition=None --no-user-output-enabled --member={SERVICE_ACCOUNT} --role='roles/resourcemanager.projectIamAdmin'
#!gcloud projects add-iam-policy-binding {PROJECT_ID} --condition=None --no-user-output-enabled --member={SERVICE_ACCOUNT} --role='roles/cloudfunctions.developer'
#!gcloud projects add-iam-policy-binding {PROJECT_ID} --condition=None --no-user-output-enabled --member={SERVICE_ACCOUNT} --role='roles/serviceusage.serviceUsageConsumer'
#!gcloud projects add-iam-policy-binding {PROJECT_ID} --condition=None --no-user-output-enabled --member={SERVICE_ACCOUNT} --role='roles/storage.admin'

#### Enable required APIs

In [None]:
!gcloud services enable cloudfunctions.googleapis.com
!gcloud services enable cloudbuild.googleapis.com

### Imports

In [None]:
!pip install --upgrade google-cloud-bigquery google-cloud-storage google-cloud-bigquery-connection bigframes -q

In [None]:
import io

import bigframes.pandas as bpd
import bigframes.bigquery as bbq
bpd.options.display.progress_bar = None
import pandas as pd

from google.cloud import storage
from IPython.display import display
from PIL import Image

import requests

from tabulate import tabulate

In [None]:
# Registers the %%bigquery cell magic used by the SQL cells further down.
%load_ext google.cloud.bigquery

### Parameters

In [None]:
# Project ID used for the BigQuery dataset and the BigFrames session.
PROJECT_ID = "<PROJECT_ID>"
# Location used for the BigQuery dataset and the BigFrames session.
REGION = "<REGION>" # ex: us-central1

# Choose the name of the resources that will be created:
# GCS bucket that receives a copy of the sample images.
DATASET_BUCKET_NAME = "<DATASET_BUCKET_NAME>" # no gs://, just the name
# BigQuery dataset; created only if it does not already exist.
DATASET_ID = "<BIGQUERY_DATASET_NAME>"
# Object table over the bucket; created or replaced.
OBJECT_TABLE_NAME = "<BIGQUERY_OBJECT_TABLE_NAME>"
# Table that receives the predictions; an existing table is replaced.
OUTPUT_TABLE_NAME = "<BIGQUERY_OUTPUT_TABLE_NAME>"

In [None]:
# Point the BigFrames session at the project and location chosen in the parameters cell.
# Note: The project option is not required in all environments.
# On BigQuery Studio, the project ID is automatically detected.
bpd.options.bigquery.project = PROJECT_ID
# Note: The location option is not required.
# It defaults to the location of the first table or query
# passed to read_gbq(). For APIs where a location can't be
# auto-detected, the location defaults to the "US" location.
bpd.options.bigquery.location = REGION

### Copy data to your own bucket

In [None]:
def _split_gcs_path(path: str):
    """Split 'gs://bucket/some/folder' into ('bucket', 'some/folder'); the folder may be ''."""
    without_scheme = path[len("gs://"):] if path.startswith("gs://") else path
    bucket_name, _, folder = without_scheme.partition("/")
    return bucket_name, folder.strip("/")


def create_bucket_and_copy_data(source_path: str, destination_path: str, location: str):
    """
    Creates a GCS bucket and copies every object under a source path into it.

    Objects keep their name relative to the source folder, so
    'gs://src/folder/sub/a.png' is written as 'sub/a.png' in the destination.
    Zero-byte "folder" placeholder objects (names ending in '/') are skipped.

    Args:
        source_path (str): The 'gs://' path of the source bucket, optionally
            followed by a folder (e.g. 'gs://bucket' or 'gs://bucket/folder').
        destination_path (str): The 'gs://' path for the new bucket to be created.
            A trailing slash is tolerated; an optional folder after the bucket
            name is used as the destination folder for the copied objects.
        location (str): The location for the new bucket (e.g., 'us-central1').

    Raises:
        ValueError: If either path does not contain a bucket name.
    """
    source_bucket_name, source_folder = _split_gcs_path(source_path)
    destination_bucket_name, destination_folder = _split_gcs_path(destination_path)
    if not source_bucket_name or not destination_bucket_name:
        raise ValueError(
            f"Expected 'gs://<bucket>[/<folder>]' paths, got {source_path!r} and {destination_path!r}"
        )

    # The trailing '/' keeps the listing inside the folder: without it, the prefix
    # 'images' would also match a sibling such as 'images_v2/...'.
    source_prefix = f"{source_folder}/" if source_folder else ""
    destination_prefix = f"{destination_folder}/" if destination_folder else ""

    storage_client = storage.Client()

    # Create the new destination bucket
    print(f"Creating bucket: {destination_bucket_name} in location: {location}")
    try:
        destination_bucket = storage_client.create_bucket(destination_bucket_name, location=location)
        print(f"✅ Bucket '{destination_bucket.name}' created successfully.")
    except Exception as e:
        # Deliberate best effort so the cell can be re-run: report the error and
        # fall back to a handle on the (presumably existing) bucket.
        print(f"⚠️ Error creating bucket. It may already exist: {e}")
        destination_bucket = storage_client.bucket(destination_bucket_name)

    source_bucket = storage_client.bucket(source_bucket_name)

    print(f"\nCopying data from '{source_path}' to '{destination_bucket_name}'...")

    copied_count = 0
    for blob in source_bucket.list_blobs(prefix=source_prefix or None):
        # Skip folder placeholder objects.
        if blob.name.endswith('/'):
            continue

        # Name of the object relative to the source folder.
        relative_name = blob.name[len(source_prefix):].lstrip('/')
        if not relative_name:
            continue

        destination_blob_name = f"{destination_prefix}{relative_name}"
        source_bucket.copy_blob(blob, destination_bucket, destination_blob_name)
        print(f"   ➡️ Copied '{blob.name}' to '{destination_blob_name}'")
        copied_count += 1

    print(f"\n✅ Finished copying {copied_count} file(s).")

In [None]:
SOURCE_BUCKET_PATH = "gs://dataproc-metastore-public-binaries/ads_banners_images" # Public dataset
DATASET_PATH = f"gs://{DATASET_BUCKET_NAME}"
# Create the bucket in the same location as the BigQuery dataset (REGION) so the
# object table defined later can read it; a hard-coded region breaks when REGION differs.
DATASET_PATH_LOCATION = REGION

create_bucket_and_copy_data(SOURCE_BUCKET_PATH, DATASET_PATH, DATASET_PATH_LOCATION)

### Create a BigQuery dataset

In [None]:
# DDL that creates the dataset in REGION unless it already exists.
create_dataset_sql = (
    f"CREATE SCHEMA IF NOT EXISTS `{PROJECT_ID}.{DATASET_ID}` "
    f"OPTIONS ( location = '{REGION}' );"
)

In [None]:
%%bigquery
$create_dataset_sql

### Create a BigQuery object table from the GCS location

In [None]:
# DDL that creates (or replaces) an object table over every object under DATASET_PATH.
# It is executed by the %%bigquery cell that follows.
# NOTE(review): `CONNECTION DEFAULT` presumably relies on a default connection being
# available in the project — confirm before running in a fresh project.
create_object_table_sql = f"""
      CREATE OR REPLACE EXTERNAL TABLE
        `{PROJECT_ID}.{DATASET_ID}.{OBJECT_TABLE_NAME}`
      WITH
        CONNECTION DEFAULT
        OPTIONS
          (object_metadata = 'SIMPLE', uris = ['{DATASET_PATH}/*']);
      """

In [None]:
%%bigquery
$create_object_table_sql

### Read object tables using Bigframes

In [None]:
# Load the object table as a BigFrames DataFrame; later cells use its `uri` and
# `content_type` columns.
df = bpd.read_gbq(f"{PROJECT_ID}.{DATASET_ID}.{OBJECT_TABLE_NAME}", use_cache=True)

In [None]:
df

### Fetch the ads product taxonomy from the web (Interactive Advertising Bureau)

In [None]:
# Raw TSV of the IAB Ad Product Taxonomy 2.0, read from the `main` branch of the
# InteractiveAdvertisingBureau/Taxonomies GitHub repository.
TAXONOMY_URL = 'https://raw.githubusercontent.com/InteractiveAdvertisingBureau/Taxonomies/main/Ad%20Product%20Taxonomies/Ad%20Product%20Taxonomy%202.0.tsv'

In [None]:
# Download the taxonomy as a local pandas DataFrame: tab-separated, with the
# first row used as the column names.
ads_product_taxonomy = pd.read_csv(TAXONOMY_URL, sep='\t', header=0)
ads_product_taxonomy

In [None]:
# Every value of the 'Name' column, in file order; the first ten are previewed.
# NOTE(review): despite the variable name, no filtering to the lowest taxonomy
# tier happens here — the whole column is taken.
ads_product_taxonomy_lowest_rank_list = ads_product_taxonomy['Name'].to_list()
ads_product_taxonomy_lowest_rank_list[:10]

In [None]:
# One taxonomy name per line, ready to be embedded in the prompt; preview the
# first 100 characters.
ads_product_taxonomy_lowest_rank = '\n'.join(ads_product_taxonomy_lowest_rank_list)
print(ads_product_taxonomy_lowest_rank[:100])

### Create the prompt to analyze the ads banner image and generate the interpretations

In [None]:
# System instructions passed to the Gemini model on every request (see `predict`).
# They address the model in the second person throughout.
system_instructions = [
    "You are a marketing and advertising expert, and have in-depth knowledge of web advertising campaigns.",
    "Your task is to analyze a banner in image or video format, and return various information about it.",
    "You will respond in JSON format to the following fields: product, interpretation, intended_audience, classification and score.",
]

In [None]:
def master_prompt(ads_product_taxonomy_lowest_rank):
    """Build the user prompt that tells the model how to fill each response field.

    Args:
        ads_product_taxonomy_lowest_rank (str): The allowed classification labels,
            one per line. It is embedded verbatim between <taxonomy> tags.

    Returns:
        str: The prompt text, with one <h1> section per field (product,
        interpretation, intended_audience, classification, score).
    """
    return f"""
I will give you instructions on how to obtain each piece of information.

<h1>product</h1>
What is the brand and product being promoted on the banner? Be brief in your answer, just say the name of the brand and product separated by | and nothing else.
For example:
Sony|BRAVIA X90K

<h1>interpretation</h1>
Generate an interpretation of the main message that the banner conveys, the product being promoted and the target audience.
For example, for a banner showing an offer from Audible, the result would be:
The main message of the banner is that Audible is offering a 66% discount for the first 3 months of subscription. The product being promoted is Audible, an audiobook and podcast streaming service. The target audience is anyone interested in listening to audiobooks or podcasts.
DO NOT generate multi-line responses. Be concise, 2 to 3 sentences maximum.

<h1>intended_audience</h1>
What is the target audience for the ad? Focus on the target persona of the product being promoted.
For example, for a banner showing an offer from Audible, the result would be:
The target audience is anyone interested in listening to audiobooks or podcasts.
DO NOT generate multi-line responses. Be concise, with 1 sentence.

<h1>classification</h1>
I need you to classify the banner based on a taxonomy. For classification, use the exact term, even if it is in English.
Here is the taxonomy you should consider to classify the banner:
<taxonomy>
{ads_product_taxonomy_lowest_rank}
</taxonomy>
For example, for a banner selling an office chair, the classification would be:
Office Equipment and Supplies
For example, for a banner selling a dry cleaning promotion, the classification would be:
Laundry and Dry Cleaning Services
DO NOT generate multi-line responses. You must ONLY generate ONE EXISTING classification IN THE TAXONOMY LIST and NO MORE WORDS!

<h1>score</h1>
Evaluate the confidence you have in your analysis from 0 to 10.
If you are not sure what the banner means, you assign a lower score.
If it is very clear, you assign a higher score.
Consider this reasoning:
Are the classifications referring to an actual quote from the content?
Are the classifications correct, accurate and factual?

<h1>Response in JSON format</h1>
"""

### Define the BigQuery [remote function](https://cloud.google.com/python/docs/reference/bigframes/0.19.2/bigframes.session.Session#bigframes_session_Session_remote_function) to call the Gemini API

Defining the remote function below deploys a Cloud Function in your GCP project.

In [None]:
# Schema of the JSON object the model must return: four free-text fields plus an
# integer confidence score, all of them required.
response_schema = {
    "type": "OBJECT",
    "properties": {
        **{
            field: {"type": "STRING"}
            for field in ("product", "interpretation", "intended_audience", "classification")
        },
        "score": {"type": "INTEGER"},
    },
    "required": ["product", "interpretation", "intended_audience", "classification", "score"],
}

In [None]:
def predict(prompt, uri, content_type, temperature=0.5, model_name="gemini-2.0-flash"):
    """Send one banner plus the text prompt to a Gemini model and return its JSON text.

    Args:
        prompt (str): The user prompt describing the fields to produce.
        uri (str): Location of the banner (e.g. a 'gs://' URI). When empty or
            None, only the text prompt is sent.
        content_type (str): MIME type of the object behind `uri`.
        temperature (float): Sampling temperature passed to the model.
        model_name (str): Name of the Gemini model to call.

    Returns:
        str: `response.text`, requested as JSON that follows `response_schema`.
    """
    # Imported inside the function so the import happens wherever the function
    # actually runs (it is called from the remote function defined below).
    from vertexai.generative_models import (
        Content,
        GenerationConfig,
        GenerativeModel,
        HarmBlockThreshold,
        HarmCategory,
        Part,
    )

    model = GenerativeModel(model_name=model_name, system_instruction=system_instructions)

    # Attach the banner for every model, not only for one hard-coded model name:
    # otherwise a different `model_name` silently receives the prompt with no image.
    parts = []
    if uri:
        parts.append(Part.from_uri(uri, mime_type=content_type))
    parts.append(Part.from_text(prompt))

    prompt_content = Content(role="user", parts=parts)

    response = model.generate_content(
        prompt_content,
        generation_config=GenerationConfig(
            max_output_tokens=8192,
            temperature=temperature,
            response_mime_type="application/json",
            response_schema=response_schema,
        ),
        safety_settings={
            HarmCategory.HARM_CATEGORY_UNSPECIFIED: HarmBlockThreshold.BLOCK_ONLY_HIGH
        },
    )

    return response.text

In [None]:
# Registers the function as a BigFrames remote function taking two strings and
# returning a string.
# NOTE(review): `packages` presumably lists what gets installed in the deployed
# function's environment, and "default" presumably selects the default service
# account — confirm against the bigframes documentation.
@bpd.remote_function(
    [str, str],
    str,
    reuse=True,
    cloud_function_service_account="default",
    packages=["google-cloud-aiplatform"]
)
def generate_predictions(uri: str, mime_type: str) -> str:
    """Return the model's analysis of one banner.

    Builds the prompt with `master_prompt(ads_product_taxonomy_lowest_rank)` and
    forwards it, together with the object's URI and MIME type, to `predict`.

    Args:
        uri: Location of the banner object.
        mime_type: MIME type of that object.

    Returns:
        Whatever `predict` returns (the model's response text).
    """

    prediction = predict(master_prompt(ads_product_taxonomy_lowest_rank), uri, mime_type)

    return prediction

### Run predictions

In [None]:
# Two-column input for the remote function: `uri` and `mime_type`, in that order.
# `assign` is given a Series (single brackets) rather than a one-column DataFrame,
# so it does not rely on implicit DataFrame-to-column coercion.
input_remote_function = df[["uri"]].assign(mime_type=df["content_type"])

In [None]:
# Apply the remote function row-wise (axis=1) and store its output in a new `pred` column.
# NOTE(review): presumably each row's columns (uri, mime_type) are passed to the
# function positionally — confirm against the bigframes documentation.
result_df = df.assign(pred=input_remote_function.apply(generate_predictions, axis=1))

In [None]:
result_df

### Extract attributes

In [None]:
# Fields of the JSON response (the keys of response_schema) to expose as columns.
explode_columns = ["product", "interpretation", "intended_audience", "classification", "score"]

# Adds one column per field to result_df in place, each extracted from the JSON
# text stored in "pred".
# NOTE(review): json_extract presumably returns JSON-encoded values (strings keep
# their surrounding quotes, score stays text) — confirm whether a scalar extractor
# is wanted instead.
for col in explode_columns:
    result_df[col] = bbq.json_extract(result_df["pred"], json_path=f"$.{col}")

result_df

### Write the final results to BigQuery

In [None]:
# Persist the predictions and extracted fields; an existing output table is replaced.
result_df.to_gbq(destination_table=f"{PROJECT_ID}.{DATASET_ID}.{OUTPUT_TABLE_NAME}", if_exists="replace")

### Iterate on result df to display images and prediction

In [None]:
def display_image_predictions(df, bucket_loc):
    """Display each banner image followed by a table of its predicted fields.

    Images are downloaded with the authenticated Cloud Storage client rather than
    anonymously over public HTTPS, so they can be read from a private bucket.

    Args:
        df: A pandas DataFrame with the columns `uri`, `interpretation`,
            `intended_audience`, `classification` and `score`.
        bucket_loc (str): The 'gs://<bucket>' path of the bucket holding the images.
    """
    gcs_client = storage.Client()
    bucket_name = bucket_loc.split('/')[2]
    bucket = gcs_client.bucket(bucket_name)

    for _, row in df.iterrows():
        # 'gs://bucket/path/to/img.png' -> 'path/to/img.png'
        image_path = row['uri'].split('/', 3)[-1]

        try:
            image_bytes = bucket.blob(image_path).download_as_bytes()
            image = Image.open(io.BytesIO(image_bytes))
        except Exception as e:
            # Deliberate best effort: one missing or unreadable image is reported
            # and skipped instead of aborting the whole display loop.
            print(f"Error fetching image: {e}")
            continue

        print(f"Prediction for image: {row['uri']}")
        display(image)

        prediction_data = [
            ['Interpretation', row["interpretation"]],
            ['Intended Audience', row["intended_audience"]],
            ['Classification', row["classification"]],
            ['Score', row["score"]]
        ]

        print(tabulate(prediction_data, headers=["Field", "Value"]))

display_image_predictions(result_df.to_pandas(), DATASET_PATH)