In [None]:
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Generate descriptions from videos

## Overview

This notebook shows how to generate descriptions of videos in a GCS bucket.  
It uses the [YouTube UGC dataset](https://media.withyoutube.com/) and the [Gemini](https://cloud.google.com/vertex-ai/docs/generative-ai/model-reference/gemini) model to generate a description for each video.

#### **Steps**
Using Spark,
1) It reads a metadata table of the [YouTube UGC dataset](https://media.withyoutube.com/) from the **public_datasets** dataset located in the [metastore](../../public_datasets/dataproc_metastore/metastore_public_datasets_quickstart.ipynb) (the notebook must be connected to the public metastore if you use this specific dataset).
This metadata table contains the paths of the video files in the bucket.
If you want to apply this to a different dataset, you can read the video files in your bucket with spark.read.format("binaryFile") (no metastore needed) - more details [here](../../public_datasets/dataproc_metastore/metastore_public_datasets_quickstart.ipynb).
2) It calls the Vertex AI Gemini API to generate a description of each video.

### Setup

Make sure the service account running this notebook has the required permissions:

- **Run the notebook**
  - AI Platform Notebooks Service Agent
  - Notebooks Admin
  - Vertex AI Administrator
- **Read tables from Dataproc Metastore**
  - Dataproc Metastore Editor
  - Dataproc Metastore Metadata Editor
  - Dataproc Metastore Metadata User
  - Dataproc Metastore Service Agent
- **Read files from bucket**
  - Storage Object Viewer
- **Run Dataproc jobs**
  - Dataproc Service Agent
  - Dataproc Worker
- **Call Google APIs**
  - Service Usage Consumer
- **BigQuery**
  - BigQuery Data Editor

#### Imports

In [1]:
import time

from pyspark.sql.functions import regexp_replace, concat
from pyspark.sql.functions import udf, col, lit

import google.auth
import google.auth.transport.requests
import requests

import pandas as pd
# Notebook display options: never truncate the text inside a cell, and keep
# at least 20 rows visible when pandas shortens a long frame.
pd.options.display.max_colwidth = None
pd.options.display.min_rows = 20

#### Authentication

In [2]:
# Resolve Application Default Credentials together with their default project.
credentials, project_id = google.auth.default()
# Refresh once so that `credentials.token` holds an access token; that token is
# sent later as the Bearer header of the Gemini REST call.
auth_req = google.auth.transport.requests.Request()
credentials.refresh(auth_req)
# NOTE(review): the token is fetched only here - presumably it expires after a
# while, so a long-running job may need to refresh it again; confirm.

#### Setup Spark Session

In [5]:
from pyspark.sql import SparkSession

In [6]:
# Reuse the active Spark session when there is one, otherwise start a new one.
# Hive support is enabled so that tables registered in the metastore are visible.
spark = (
    SparkSession.builder
    .appName("Video descriptions generation")
    .enableHiveSupport()
    .getOrCreate()
)

24/01/04 14:58:22 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


#### Read dataset

In [8]:
# Read the video metadata table through the Dataproc Metastore connected to
# this session; its `path` column holds the location of each video file.
binaries_df = spark.read.table("public_datasets.youtube_ucg")

In [9]:
### Another option is to read the files from a bucket directly (the path below is only an example - replace it with the bucket that holds your videos)
# BINARIES_BUCKET_PATH = "gs://dataproc-metastore-public-binaries/stanford_online_products/"
# binaries_df = spark.read.format("binaryFile").option("recursiveFileLookup", "true").load(BINARIES_BUCKET_PATH)

In [10]:
# Keep only the `path` column and cap the demo at 5 videos, since every row
# triggers one Gemini API call later on.
paths_df = binaries_df.select("path").limit(5)

In [11]:
# Preview the selected paths (show() shortens long strings, hence the "...").
paths_df.show()



+--------------------+
|                path|
+--------------------+
|gs://diogokato-us...|
|gs://diogokato-us...|
|gs://diogokato-us...|
|gs://diogokato-us...|
|gs://diogokato-us...|
+--------------------+



                                                                                

#### Define UDF and call Gemini API to generate video descriptions

In [12]:
def generate_descriptions(gcs_uri):
  """Generate a JSON-formatted description of one video with Gemini Pro Vision.

  Sends the video reference plus a fixed five-question prompt to the Vertex AI
  ``gemini-pro-vision:streamGenerateContent`` REST endpoint (region
  ``us-central1`` is hard-coded) and concatenates the text of the streamed
  response chunks.

  Relies on the notebook-level names ``requests``, ``credentials`` (its
  ``token`` is sent as the Bearer token) and ``project_id``.

  Args:
    gcs_uri: URI of the video, sent as ``fileData.fileUri`` with MIME type
      ``video/mp4``.

  Returns:
    The generated text as a single string. An empty string is returned when no
    chunk carries text, e.g. the API answered with an error payload or the
    candidate came back without content.

  Raises:
    requests.exceptions.RequestException: on connection problems or when the
      server does not answer within the timeout.
    ValueError: when the response body is not valid JSON.
  """

  # Without a timeout a stalled connection would block the Spark task forever.
  request_timeout_seconds = 300

  def collect_text(chunks):
    """Concatenate the text parts of the first candidate of every chunk."""
    # The streaming endpoint answers with a JSON array of chunks; a single
    # JSON object (for instance an error payload) is handled as one chunk.
    if isinstance(chunks, dict):
      chunks = [chunks]
    elif not isinstance(chunks, list):
      return ""

    pieces = []
    for chunk in chunks:
      if not isinstance(chunk, dict):
        continue
      candidates = chunk.get("candidates") or []
      if not candidates:
        continue
      # A candidate may come without "content"/"parts" (e.g. when it was
      # blocked); skip it instead of failing the whole task with a KeyError.
      content = candidates[0].get("content") or {}
      for part in content.get("parts") or []:
        text = part.get("text")
        if text:
          pieces.append(text)
    return "".join(pieces)

  def gemini_predict(gcs_uri, prompt):
    """POST the video and the prompt to Gemini and return the generated text."""
    model_url = f"https://us-central1-aiplatform.googleapis.com/v1/projects/{project_id}/locations/us-central1/publishers/google/models/gemini-pro-vision:streamGenerateContent"
    request_body = {
      "contents": {
        "role": "user",
        "parts": [
          {
            "fileData": {
              "mimeType": "video/mp4",
              "fileUri": gcs_uri
            }
          },
          {
            "text": prompt
          }
        ]
      },
      "safety_settings": {
        "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
        "threshold": "BLOCK_LOW_AND_ABOVE"
      },
      "generation_config": {
        "temperature": 0.4,
        "topP": 1.0,
        "topK": 32,
        "maxOutputTokens": 2048
      }
    }

    response = requests.post(
      model_url,
      headers={'Authorization': 'Bearer %s' % credentials.token,
               'Content-Type': 'application/json'},
      json=request_body,
      timeout=request_timeout_seconds,
    )
    return collect_text(response.json())

  # The prompt text is kept exactly as it was: it is runtime input to the model.
  prompt = """
        Create a short description for this video with the following questions:
         1-) Where the video was from? 
         2-) How many people we have? 
         3-) What pople are doing? 
         4-) whats the proposition for the video?
         5-) A sumary description from the itens 1,2,3 and 4
        Format the 5 descriptions in a JSON format with the KEYS: Where, HowManyPeople, Task, Proposition and Description.
    """

  return gemini_predict(gcs_uri, prompt)

In [13]:
# Wrap the Python function as a Spark UDF so it can be applied to a column;
# no return type is given, so the produced column is a string.
generate_descriptions_udf = udf(generate_descriptions)

In [14]:
# Order the rows by path, then store the raw Gemini answer for each video in
# a new "data" column (one UDF call, i.e. one API request, per row).
df_descriptions = (
    paths_df
    .orderBy(paths_df["path"].asc())
    .withColumn("data", generate_descriptions_udf(paths_df["path"]))
)

In [15]:
# Mark the result for caching so that the later actions on this DataFrame can
# reuse the generated descriptions rather than invoking the UDF again.
df_descriptions.cache()

DataFrame[path: string, data: string]

In [16]:
# Collect the rows to the driver as a pandas DataFrame for rich display.
df_descriptions.toPandas()

                                                                                

Unnamed: 0,path,data
0,gs://diogokato-us/youtube_ucg/original_videos/HowTo/360P/HowTo_360P-09f8.mp4,"```json\n {\n ""Where"": ""From a phone"",\n ""HowManyPeople"": ""One person"",\n ""Task"": ""Showing how to delete a folder"",\n ""Proposition"": ""If you want to delete a folder, you can do it by following the steps in this video"",\n ""Description"": ""The video shows how to delete a folder on a phone. The video is from a phone, and it shows one person deleting a folder. The video is short and easy to follow, and it provides clear instructions on how to delete a folder.""\n }\n ```"
1,gs://diogokato-us/youtube_ucg/original_videos/HowTo/360P/HowTo_360P-2074.mp4,"```json\n{\n ""Where"": ""The video was taken from a home."",\n ""HowManyPeople"": ""There is one person in the video."",\n ""Task"": ""The person is cutting out felt petals from a larger piece of felt."",\n ""Proposition"": ""The video is about how to make felt petals."",\n ""Description"": ""A person is cutting out felt petals from a larger piece of felt. The video is about how to make felt petals.""\n}\n```"
2,gs://diogokato-us/youtube_ucg/original_videos/HowTo/360P/HowTo_360P-2fd5.mp4,"```json\n {\n ""Where"": ""A classroom"",\n ""HowManyPeople"": ""One"",\n ""Task"": ""Writing on a notebook"",\n ""Proposition"": ""To show how to write on a notebook"",\n ""Description"": ""A person is writing on a notebook. The person is using a pencil. The person is writing in cursive. The person is writing a sentence. The sentence is 'I am writing on a notebook'.""\n }\n ```"
3,gs://diogokato-us/youtube_ucg/original_videos/HowTo/360P/HowTo_360P-6a0e.mp4,"```json\n{\n ""Where"": ""From a forest"",\n ""HowManyPeople"": ""There aren't people"",\n ""Task"": ""Showing an image processing"",\n ""Proposition"": ""The video shows how to process an image using a high pass filter to enhance the details."",\n ""Description"": ""The video shows how to process an image using a high pass filter to enhance the details. The video is from a forest, and there aren't people. The video shows how to use the high pass filter to enhance the details of the image.""\n}\n```"
4,gs://diogokato-us/youtube_ucg/original_videos/HowTo/360P/HowTo_360P-7fb1.mp4,"```json\n {\n ""Where"": ""From a house"",\n ""HowManyPeople"": ""One person"",\n ""Task"": ""Drawing Sailor Moon"",\n ""Proposition"": ""Watch the video to see the drawing"",\n ""Description"": ""A person is drawing Sailor Moon. The person is sitting on a couch and has a lot of manga on the table.""\n }\n ```"


#### Extract features from the generated text

In [18]:
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StructType, StructField, StringType
# Target schema for the JSON object requested in the prompt. Every field is a
# nullable string; the field order below is the column order of the result.
schema = StructType(
    [
        StructField('Where', StringType(), True),
        StructField('HowManyPeople', StringType(), True),
        StructField('Proposition', StringType(), True),
        StructField('Description', StringType(), True),
        StructField('Task', StringType(), True)
    ]
)

# The model wraps its answer in a Markdown code fence (```json ... ```).
# Strip only the opening and the closing fence, anchored to the start and the
# end of the text, so that the word "json" or backticks appearing inside a
# description are left untouched.
MARKDOWN_FENCE_PATTERN = r"(?i)^\s*```(?:json)?\s*|\s*```\s*$"

# Parse the cleaned text into a struct and expand it into one column per key.
df_final = (
    df_descriptions
    .withColumn(
        "parsed",
        from_json(regexp_replace(col("data"), MARKDOWN_FENCE_PATTERN, ""), schema),
    )
    .select(col('path'), col('parsed.*'))
)

In [19]:
# Display the parsed result: the video path plus one column per JSON key.
df_final.toPandas()

                                                                                

Unnamed: 0,path,Where,HowManyPeople,Proposition,Description,Task
0,gs://diogokato-us/youtube_ucg/original_videos/HowTo/360P/HowTo_360P-09f8.mp4,From a phone,One person,"If you want to delete a folder, you can do it by following the steps in this video","The video shows how to delete a folder on a phone. The video is from a phone, and it shows one person deleting a folder. The video is short and easy to follow, and it provides clear instructions on how to delete a folder.",Showing how to delete a folder
1,gs://diogokato-us/youtube_ucg/original_videos/HowTo/360P/HowTo_360P-2074.mp4,The video was taken from a home.,There is one person in the video.,The video is about how to make felt petals.,A person is cutting out felt petals from a larger piece of felt. The video is about how to make felt petals.,The person is cutting out felt petals from a larger piece of felt.
2,gs://diogokato-us/youtube_ucg/original_videos/HowTo/360P/HowTo_360P-2fd5.mp4,A classroom,One,To show how to write on a notebook,A person is writing on a notebook. The person is using a pencil. The person is writing in cursive. The person is writing a sentence. The sentence is 'I am writing on a notebook'.,Writing on a notebook
3,gs://diogokato-us/youtube_ucg/original_videos/HowTo/360P/HowTo_360P-6a0e.mp4,From a forest,There aren't people,The video shows how to process an image using a high pass filter to enhance the details.,"The video shows how to process an image using a high pass filter to enhance the details. The video is from a forest, and there aren't people. The video shows how to use the high pass filter to enhance the details of the image.",Showing an image processing
4,gs://diogokato-us/youtube_ucg/original_videos/HowTo/360P/HowTo_360P-7fb1.mp4,From a house,One person,Watch the video to see the drawing,A person is drawing Sailor Moon. The person is sitting on a couch and has a lot of manga on the table.,Drawing Sailor Moon
