In [2]:
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Generate descriptions from videos

## Overview

This notebook shows how to generate descriptions of videos in a GCS bucket.  
It uses the [YouTube UGC dataset](https://media.withyoutube.com/) and [Gemini](https://cloud.google.com/vertex-ai/docs/generative-ai/model-reference/gemini) on Vertex AI to generate a description for each video.

#### **Steps**
Using Spark,
1) It reads a metadata table of the [Youtube UGC dataset](https://media.withyoutube.com/) from the **public_datasets** dataset located in the [metastore](../../public_datasets/dataproc_metastore/metastore_public_datasets_quickstart.ipynb) (notebook should be connected with the public metastore if using this specific dataset).
This metadata table contains the paths of the video files in the bucket.
If you want to apply this to a different dataset, you can read the video files in your bucket with `spark.read.format("binaryFile")` (no metastore needed) - more details [here](../../public_datasets/dataproc_metastore/metastore_public_datasets_quickstart.ipynb).
2) It calls the Vertex AI Gemini API to generate a description of each video.

### Setup

Make sure the service account running this notebook has the required permissions:

- **Run the notebook**
  - AI Platform Notebooks Service Agent
  - Notebooks Admin
  - Vertex AI Administrator
- **Read tables from Dataproc Metastore**
  - Dataproc Metastore Editor
  - Dataproc Metastore Metadata Editor
  - Dataproc Metastore Metadata User
  - Dataproc Metastore Service Agent
- **Read files from bucket**
  - Storage Object Viewer
- **Run Dataproc jobs**
  - Dataproc Service Agent
  - Dataproc Worker
- **Call Google APIs**
  - Service Usage Consumer
- **BigQuery**
  - BigQuery Data Editor

#### Imports

In [1]:
import time

from pyspark.sql.functions import regexp_replace, concat
from pyspark.sql.functions import udf, col, lit

import google.auth
import google.auth.transport.requests
import requests

import pandas as pd
# Do not truncate long cell values, so the full generated descriptions are visible.
pd.set_option('display.max_colwidth', None)
# Number of rows pandas shows when a DataFrame repr is truncated.
pd.set_option('display.min_rows', 20)

#### Authentication

In [2]:
# Resolve Application Default Credentials and the project they belong to.
# Both names are read later by generate_descriptions().
credentials, project_id = google.auth.default()
# Refresh once so that credentials.token holds an access token for the REST calls.
auth_req = google.auth.transport.requests.Request()
credentials.refresh(auth_req)

#### Setup Spark Session

In [3]:
from pyspark.sql import SparkSession

In [4]:
# Get (or create) the Spark session for this notebook, with Hive support enabled.
spark = (
    SparkSession.builder
    .appName("Video descriptions generation")
    .enableHiveSupport()
    .getOrCreate()
)

24/01/08 19:06:44 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


#### Read dataset

In [5]:
### Read the dataset from the public Dataproc Metastore connected
# binaries_df = spark.read.table("public_datasets.youtube_ucg")

In [6]:
### Another option is to read from the bucket directly
BINARIES_BUCKET_PATH = "gs://dataproc-metastore-public-binaries/youtube_ucg/"
# binaryFile yields one row per file; recursiveFileLookup walks nested folders.
binaries_df = (
    spark.read.format("binaryFile")
    .option("recursiveFileLookup", "true")
    .load(BINARIES_BUCKET_PATH)
)

In [11]:
# Preview the loaded files (columns: path, modificationTime, length, content).
binaries_df.show()

                                                                                

+--------------------+--------------------+-------+--------------------+
|                path|    modificationTime| length|             content|
+--------------------+--------------------+-------+--------------------+
|gs://dataproc-met...|2024-01-04 20:08:...|5051568|[00 00 00 20 66 7...|
|gs://dataproc-met...|2024-01-04 20:08:...|4450939|[00 00 00 20 66 7...|
|gs://dataproc-met...|2024-01-04 20:08:...|2766749|[00 00 00 20 66 7...|
|gs://dataproc-met...|2024-01-04 20:08:...|2525019|[00 00 00 20 66 7...|
|gs://dataproc-met...|2024-01-04 20:08:...|2311945|[00 00 00 20 66 7...|
|gs://dataproc-met...|2024-01-04 20:08:...|1682359|[00 00 00 20 66 7...|
|gs://dataproc-met...|2024-01-04 20:08:...|1389832|[00 00 00 20 66 7...|
|gs://dataproc-met...|2024-01-04 20:08:...|1388558|[00 00 00 20 66 7...|
|gs://dataproc-met...|2024-01-04 20:08:...|1095245|[00 00 00 20 66 7...|
|gs://dataproc-met...|2024-01-04 20:08:...|1020169|[00 00 00 20 66 7...|
|gs://dataproc-met...|2024-01-04 20:08:...| 957007|

In [9]:
# Keep only the path column and limit to 5 videos; each selected row is later
# sent to the Gemini API by the UDF, so the sample is kept small.
paths_df = binaries_df.select("path").limit(5)

In [10]:
# Check the selected video paths.
paths_df.show()

                                                                                

+--------------------+
|                path|
+--------------------+
|gs://dataproc-met...|
|gs://dataproc-met...|
|gs://dataproc-met...|
|gs://dataproc-met...|
|gs://dataproc-met...|
+--------------------+



#### Define UDF and call Gemini API to generate video descriptions

In [11]:
def generate_descriptions(gcs_uri):
  """Ask Gemini Pro Vision to describe the video stored at ``gcs_uri``.

  Uses the notebook-level ``project_id`` and ``credentials`` objects and the
  ``requests`` module, all of which must already be defined/imported.

  Args:
    gcs_uri: URI of the video, sent as ``fileUri`` with MIME type ``video/mp4``.

  Returns:
    The text of all streamed response chunks concatenated in order. An empty
    string is returned when no chunk carries candidate text (for example when
    the payload is not a list of chunks, or a candidate has no text part).
  """

  def gemini_predict(gcs_uri, prompt):
    """POST the video and prompt to ``streamGenerateContent`` and join the text chunks."""

    model_url = f"https://us-central1-aiplatform.googleapis.com/v1/projects/{project_id}/locations/us-central1/publishers/google/models/gemini-pro-vision:streamGenerateContent"
    request_body = {
      "contents": {
        "role": "user",
        "parts": [
          {
            "fileData": {
              "mimeType": "video/mp4",
              "fileUri": gcs_uri
            }
          },
          {
            "text": prompt
          }
        ]
      },
      "safety_settings": {
        "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
        "threshold": "BLOCK_LOW_AND_ABOVE"
      },
      "generation_config": {
        "temperature": 0.4,
        "topP": 1.0,
        "topK": 32,
        "maxOutputTokens": 2048
      }
    }

    prediction = requests.post(
      model_url,
      headers={'Authorization': 'Bearer %s' % credentials.token,
               'Content-Type': 'application/json'},
      json=request_body,
      # (connect, read) seconds: without a timeout a stalled call would block
      # the Spark task indefinitely.
      timeout=(10, 600),
    ).json()

    # The text is collected from a list of chunk objects. Any other payload
    # shape carries no candidate text, so the result is empty.
    if not isinstance(prediction, list):
      return ""

    text_chunks = []
    for pred in prediction:
      if not isinstance(pred, dict):
        continue
      candidates = pred.get("candidates") or []
      if not candidates:
        continue
      # A candidate may come back without content/parts/text (e.g. when it was
      # blocked); skip it instead of raising and failing the whole Spark task.
      content = candidates[0].get("content") or {}
      parts = content.get("parts") or []
      if parts and "text" in parts[0]:
        text_chunks.append(parts[0]["text"])
    return "".join(text_chunks)

  # The keys requested here must match the schema used to parse the answer.
  prompt = """
        Create a short description for this video with the following questions:
         1-) Where the video was from? 
         2-) How many people we have? 
         3-) What pople are doing? 
         4-) whats the proposition for the video?
         5-) A sumary description from the itens 1,2,3 and 4
        Format the 5 descriptions in a JSON format with the KEYS: Where, HowManyPeople, Task, Proposition and Description.
    """

  return gemini_predict(gcs_uri, prompt)

In [12]:
# Wrap the Python function as a Spark UDF. No return type is passed, so udf()
# uses its default; the resulting column is a string.
generate_descriptions_udf = udf(generate_descriptions)

In [13]:
# Order the videos by path and add a "data" column holding the raw model answer.
df_descriptions = (
    paths_df
    .sort(col("path").asc())
    .withColumn("data", generate_descriptions_udf(col("path")))
)

In [14]:
# Mark the DataFrame for caching; presumably so that later actions reuse the
# generated descriptions instead of re-running the UDF (and its API calls).
df_descriptions.cache()

DataFrame[path: string, data: string]

In [15]:
# Collect the rows to the driver as a pandas DataFrame for display.
df_descriptions.toPandas()

                                                                                

Unnamed: 0,path,data
0,gs://dataproc-metastore-public-binaries/youtube_ucg/original_videos/HowTo/360P/HowTo_360P-09f8.mp4,"```json\n {\n ""Where"": ""The video was recorded in a park."",\n ""HowManyPeople"": ""There are two people in the video."",\n ""Task"": ""The people are playing with a dog."",\n ""Proposition"": ""The video is about two people playing with a dog in a park."",\n ""Description"": ""The video is about two people playing with a dog in a park. The people are throwing a ball for the dog to fetch. The dog is running around and having fun. The people are laughing and enjoying themselves. The video is short and sweet, and it captures the joy of playing with a dog.""\n }\n ```"
1,gs://dataproc-metastore-public-binaries/youtube_ucg/original_videos/HowTo/360P/HowTo_360P-1dba.mp4,"```json\n {\n ""Where"": ""From a table at home"",\n ""HowManyPeople"": ""One person"",\n ""Task"": ""Making a rainbow loom bracelet"",\n ""Proposition"": ""This video shows you how to make a rainbow loom bracelet."",\n ""Description"": ""A person is sitting at a table making a rainbow loom bracelet. The person is wearing a blue sweatshirt. The table is made of wood and is stained a dark brown color.""\n }\n ```"
2,gs://dataproc-metastore-public-binaries/youtube_ucg/original_videos/HowTo/360P/HowTo_360P-2fd5.mp4,"```json\n {\n ""Where"": ""From a notebook."",\n ""HowManyPeople"": ""One person."",\n ""Task"": ""The person is drawing."",\n ""Proposition"": ""The video is about drawing."",\n ""Description"": ""A person is drawing in a notebook.""\n }\n ```"
3,gs://dataproc-metastore-public-binaries/youtube_ucg/original_videos/HowTo/360P/HowTo_360P-5da7.mp4,"```json\n {\n ""Where"": ""The video was taken from a person's home."",\n ""HowManyPeople"": ""There is one person in the video."",\n ""Task"": ""The person is using a hot glue gun to create a design on a surface."",\n ""Proposition"": ""The video is about how to use a hot glue gun to create a design on a surface."",\n ""Description"": ""The video shows a person using a hot glue gun to create a design on a surface. The person is skilled in using the hot glue gun and is able to create a beautiful design. The video is informative and easy to follow.""\n }\n ```"
4,gs://dataproc-metastore-public-binaries/youtube_ucg/original_videos/HowTo/360P/HowTo_360P-7fb1.mp4,"```json\n {\n ""Where"": ""From a couch in a living room."",\n ""HowManyPeople"": ""There is one person in the video."",\n ""Task"": ""The person is drawing a picture."",\n ""Proposition"": ""The video is about drawing a picture."",\n ""Description"": ""A person is sitting on a couch in a living room, drawing a picture. The person is wearing glasses and has short brown hair. They are using a pencil and a piece of paper. The picture is of a anime character.""\n }\n ```"


#### Extract feature from generated text

In [16]:
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StructType, StructField, StringType
# JSON keys expected in the model answer; every field is a nullable string.
# The field order below is the column order of the resulting DataFrame.
schema = StructType(
    [
        StructField('Where', StringType(), True),
        StructField('HowManyPeople', StringType(), True),
        StructField('Proposition', StringType(), True),
        StructField('Description', StringType(), True),
        StructField('Task', StringType(), True)
    ]
)

# The model wraps its answer in a Markdown code fence (```json ... ```).
# Strip only the opening and the closing fence, anchored at the ends of the
# string, so that "json" or backticks inside the generated text are preserved.
FENCE_PATTERN = r"^\s*```(?:json)?\s*|\s*```\s*$"

df_final = (
    df_descriptions
    .withColumn("parsed", from_json(regexp_replace(col("data"), FENCE_PATTERN, ""), schema))
    .select(col('path'), col('parsed.*'))
)

In [17]:
# Collect the parsed fields to the driver as a pandas DataFrame for display.
df_final.toPandas()

                                                                                

Unnamed: 0,path,Where,HowManyPeople,Proposition,Description,Task
0,gs://dataproc-metastore-public-binaries/youtube_ucg/original_videos/HowTo/360P/HowTo_360P-09f8.mp4,The video was recorded in a park.,There are two people in the video.,The video is about two people playing with a dog in a park.,"The video is about two people playing with a dog in a park. The people are throwing a ball for the dog to fetch. The dog is running around and having fun. The people are laughing and enjoying themselves. The video is short and sweet, and it captures the joy of playing with a dog.",The people are playing with a dog.
1,gs://dataproc-metastore-public-binaries/youtube_ucg/original_videos/HowTo/360P/HowTo_360P-1dba.mp4,From a table at home,One person,This video shows you how to make a rainbow loom bracelet.,A person is sitting at a table making a rainbow loom bracelet. The person is wearing a blue sweatshirt. The table is made of wood and is stained a dark brown color.,Making a rainbow loom bracelet
2,gs://dataproc-metastore-public-binaries/youtube_ucg/original_videos/HowTo/360P/HowTo_360P-2fd5.mp4,From a notebook.,One person.,The video is about drawing.,A person is drawing in a notebook.,The person is drawing.
3,gs://dataproc-metastore-public-binaries/youtube_ucg/original_videos/HowTo/360P/HowTo_360P-5da7.mp4,The video was taken from a person's home.,There is one person in the video.,The video is about how to use a hot glue gun to create a design on a surface.,The video shows a person using a hot glue gun to create a design on a surface. The person is skilled in using the hot glue gun and is able to create a beautiful design. The video is informative and easy to follow.,The person is using a hot glue gun to create a design on a surface.
4,gs://dataproc-metastore-public-binaries/youtube_ucg/original_videos/HowTo/360P/HowTo_360P-7fb1.mp4,From a couch in a living room.,There is one person in the video.,The video is about drawing a picture.,"A person is sitting on a couch in a living room, drawing a picture. The person is wearing glasses and has short brown hair. They are using a pencil and a piece of paper. The picture is of a anime character.",The person is drawing a picture.
