In [None]:
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Generate descriptions from videos

## Overview

This notebook shows how to generate descriptions of videos stored in a GCS bucket.  
It reads the [YouTube UGC dataset](https://media.withyoutube.com/) and uses the [Gemini](https://cloud.google.com/vertex-ai/docs/generative-ai/model-reference/gemini) model to generate a description for each video.

#### **Steps**
Using Spark,
1) It reads the video files of the [YouTube UGC dataset](https://media.withyoutube.com/) from gs://dataproc-metastore-public-binaries/youtube_ucg/
2) It calls the Vertex AI Gemini Pro Vision API to generate a description of each video.

### Setup

Make sure the service account running this notebook has the required permissions:

- **Run the notebook**
  - AI Platform Notebooks Service Agent
  - Notebooks Admin
  - Vertex AI Administrator
- **Read files from bucket**
  - Storage Object Viewer
- **Run Dataproc jobs**
  - Dataproc Service Agent
  - Dataproc Worker
- **Call Google APIs**
  - Service Usage Consumer

#### Imports

In [None]:
import time

from pyspark.sql.functions import regexp_replace, concat
from pyspark.sql.functions import udf, col, lit

import google.auth
import google.auth.transport.requests
import requests

import pandas as pd
# pandas display options: no column-width limit and min_rows set to 20.
pd.options.display.max_colwidth = None
pd.options.display.min_rows = 20

In [None]:
# When using Dataproc Serverless, installed packages are automatically available on all nodes
!pip install --upgrade google-cloud-aiplatform
# When using a Dataproc cluster, you will need to install these packages during cluster creation: https://cloud.google.com/dataproc/docs/tutorials/python-configuration

#### Authentication

In [None]:
# Obtain the default credentials and the project ID associated with them.
credentials, project_id = google.auth.default()
# Refresh the credentials through a requests-based transport.
auth_req = google.auth.transport.requests.Request()
credentials.refresh(auth_req)

#### Setup Spark Session

In [None]:
from pyspark.sql import SparkSession

In [None]:
# Create (or reuse) the Spark session for this notebook, with Hive support enabled.
spark = (
    SparkSession.builder
    .appName("Video descriptions generation")
    .enableHiveSupport()
    .getOrCreate()
)

#### Read dataset

In [None]:
# Bucket path holding the YouTube UGC dataset videos.
BINARIES_BUCKET_PATH = "gs://dataproc-metastore-public-binaries/youtube_ucg/"

# Load every file under the path (recursively) through the binaryFile source.
binaries_df = (
    spark.read
    .format("binaryFile")
    .option("recursiveFileLookup", "true")
    .load(BINARIES_BUCKET_PATH)
)

In [None]:
# Let's select the paths of the first 5 YouTube videos
# NOTE(review): no ordering is applied before limit(5), so which 5 paths are
# returned may differ between runs — confirm whether that matters.
paths_df = binaries_df.select("path").limit(5)

#### Define UDF and call Gemini API to generate video descriptions

In [None]:
import vertexai
from vertexai.generative_models import GenerativeModel, Part , HarmCategory, HarmBlockThreshold

# Initialise the Vertex AI SDK with the project ID returned by google.auth.default()
# and a fixed location.
vertexai.init(project=project_id, location="us-central1")

def gemini_predict(gcs_uri, prompt):
    """Send a prompt and a video to Gemini Pro Vision and return the generated text.

    Args:
        gcs_uri: URI of the video; passed to Part.from_uri with MIME type "video/mp4".
        prompt: Text instruction placed ahead of the video part in the request.

    Returns:
        The text of every streamed response chunk, concatenated into one string.
    """
    model = GenerativeModel("gemini-1.0-pro-vision")

    generation_settings = {
        "max_output_tokens": 2048,
        "temperature": 0.4,
        "top_p": 1,
        "top_k": 32,
    }
    safety_thresholds = {
        HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_LOW_AND_ABOVE,
        HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_LOW_AND_ABOVE,
    }

    # stream=True makes generate_content yield the answer in chunks.
    streamed_chunks = model.generate_content(
        [prompt, Part.from_uri(gcs_uri, mime_type="video/mp4")],
        generation_config=generation_settings,
        safety_settings=safety_thresholds,
        stream=True,
    )

    # Stitch the chunk texts back together in arrival order.
    return "".join(chunk.text for chunk in streamed_chunks)

In [None]:
def generate_descriptions(gcs_uri):
    """Run the fixed video-description prompt against a single video.

    Args:
        gcs_uri: URI of the video, forwarded unchanged to gemini_predict.

    Returns:
        Whatever gemini_predict returns for this video and the prompt below.
    """
    # Plain string literal: the prompt contains no placeholders to interpolate.
    prompt = """
        Create a short description for this video with the following questions:
         1) Where the video was from? 
         2) How many people we have? 
         3) What people are doing? 
         4) Whats the proposition for the video?
         5) A sumary description from the itens 1,2,3 and 4
        Format the 5 items as attributes of a JSON object: Where, HowManyPeople, Task, Proposition and Description.
        The response should be a single valid formatted JSON object only.
        """

    return gemini_predict(gcs_uri, prompt)
    
# Wrap generate_descriptions as a Spark UDF; no return type is passed, so
# PySpark's default return type applies.
generate_descriptions_udf = udf(generate_descriptions)

In [None]:
# Order the selected paths and add a "data" column holding the text that the
# Gemini UDF generates for each video.
df_descriptions = (
    paths_df
    .sort(paths_df.path.asc())
    .withColumn("data", generate_descriptions_udf(paths_df.path))
)

In [None]:
# Mark df_descriptions for caching; presumably so later actions reuse the
# generated text instead of re-running the Gemini UDF — TODO confirm.
df_descriptions.cache()

In [None]:
# Convert the result to a pandas DataFrame so the notebook renders it as cell output.
df_descriptions.toPandas()

#### Extract features from generated text

In [None]:
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StructType, StructField, StringType
# Schema of the JSON object requested from Gemini in the prompt; every
# attribute is read as a nullable string.
schema = StructType(
    [
        StructField('Where', StringType(), True),
        StructField('HowManyPeople', StringType(), True),
        StructField('Proposition', StringType(), True),
        StructField('Description', StringType(), True),
        StructField('Task', StringType(), True)
    ]
)

# Strip only the Markdown code-fence markers (``` optionally followed by "json")
# that may wrap the model output. Removing every bare "json" substring would
# also alter attribute values that happen to contain that word.
fence_stripped_data = regexp_replace(col("data"), "```(?:json)?", "")

# Parse the cleaned text with the schema and expand its attributes into
# top-level columns next to the video path.
df_final = (
    df_descriptions
    .withColumn("exploded_data", from_json(fence_stripped_data, schema))
    .select(col('path'), col('exploded_data.*'))
)

In [None]:
# Convert the parsed attributes to a pandas DataFrame so the notebook renders them as cell output.
df_final.toPandas()

|                path|               Where|       HowManyPeople|         Proposition|         Description|                Task|
|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|
|gs://dataproc-met...|The video was rec...|There are two peo...|The video is abou...|The video is abou...|The people are pl...|
|gs://dataproc-met...|The video was tak...|There is one pers...|The video is abou...|The video is abou...|The person is mak...|
|gs://dataproc-met...|From a notebook p...|          One person|How to draw a bet...|This video shows ...|Drawing some line...|
|gs://dataproc-met...|The video was tak...|There is one pers...|The video is abou...|The video shows a...|The person is usi...|
|gs://dataproc-met...|From a couch in a...|There is one pers...|The video is abou...|A woman is sittin...|The person is dra...|