In [None]:
# pip install --upgrade google-cloud-aiplatform
# pip install vertexai
# pip install langdetect

# Authorize
# gcloud auth application-default login

## Transcribes a single file

In [None]:
import base64
import json  # Import the json module
import vertexai
from vertexai.generative_models import GenerativeModel, Part, FinishReason
from vertexai import generative_models


def generate():
    """Transcribe the single audio clip configured at module level.

    Sends ``audio1`` and ``text1`` to the Gemini model together with the
    module-level ``generation_config`` and ``safety_settings``. Those globals
    are looked up when the function is *called*, so they only need to exist
    by the time ``generate()`` runs, not when it is defined.

    Returns:
        list[str]: the text of each streamed response chunk, in arrival
        order. Chunks that carry no text are skipped (a warning is issued).
    """
    import warnings

    vertexai.init(project="pawait-ccai-test", location="us-central1")
    model = GenerativeModel("gemini-1.5-flash-preview-0514")

    responses = model.generate_content(
        ["""Transcribe the audio""", audio1, text1],
        generation_config=generation_config,
        safety_settings=safety_settings,
        stream=True,
    )

    # Collect the streamed chunks instead of printing them.
    responses_list = []
    for response in responses:
        try:
            chunk_text = response.text
        except ValueError as exc:
            # `.text` raises ValueError when a chunk has no text to return
            # (for example when the candidate was blocked). Skipping it keeps
            # the chunks already received instead of aborting the whole call.
            warnings.warn(f"Skipping response chunk without text: {exc}")
            continue
        responses_list.append(chunk_text)

    return responses_list


# Audio clip to transcribe, referenced directly by its Cloud Storage URI.
audio1 = Part.from_uri(
    mime_type="audio/mpeg",
    uri="gs://test-audio-segments/ke_convos/final_dataset/10a5b3a2-3e35-4702-8e6c-d0e10006e44c_4.mp3",
)
# Instruction sent alongside the audio. The key name is spelled "transcript"
# (it previously read "tanscript") because the model is asked to use these
# words as the keys of its output.
text1 = """Transcribe the audio and write the content in the JSON file below without time stamps. And then translate. For the output, give it transcript and translate as key value pairs"""

# Sampling parameters passed to generate_content().
generation_config = {
    "max_output_tokens": 8192,
    "temperature": 1,
    "top_p": 0.95,
}

# Apply the same blocking threshold to every harm category.
safety_settings = {
    generative_models.HarmCategory.HARM_CATEGORY_HATE_SPEECH: generative_models.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
    generative_models.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: generative_models.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
    generative_models.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: generative_models.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
    generative_models.HarmCategory.HARM_CATEGORY_HARASSMENT: generative_models.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
}

# Call the generate function and get the list of streamed response chunks.
responses_list = generate()

# Write the chunks as UTF-8 and keep non-ASCII characters literal
# (ensure_ascii=False) so transcripts in non-English languages stay readable
# in the file rather than being written as \uXXXX escapes.
with open("transcript.json", "w", encoding="utf-8") as f:
    json.dump(responses_list, f, indent=4, ensure_ascii=False)

## Transcribes all audio files in a GCS bucket

In [None]:
import base64
import vertexai
from google.cloud import storage
from vertexai import generative_models
from vertexai.generative_models import GenerativeModel, Part, FinishReason


def init_vertex_ai():
    """Initialise the Vertex AI SDK with this notebook's project and location."""
    project_id = "pawait-ccai-test"
    region = "us-central1"
    vertexai.init(project=project_id, location=region)


def get_audio_uris(bucket_name, prefix):
    """List the ``.mp3`` objects stored under a prefix of a GCS bucket.

    Args:
        bucket_name: name of the Cloud Storage bucket to list.
        prefix: only objects whose name starts with this prefix are listed.

    Returns:
        list[str]: one ``gs://<bucket_name>/<object name>`` URI per object
        whose name ends with ``.mp3``, in listing order.
    """
    client = storage.Client()
    listing = client.bucket(bucket_name).list_blobs(prefix=prefix)

    uris = []
    for item in listing:
        # Anything that is not an .mp3 object is ignored.
        if not item.name.endswith(".mp3"):
            continue
        uris.append(f"gs://{bucket_name}/{item.name}")
    return uris


def generate(audio_uris):
    """Transcribe and translate each audio file, streaming the output to stdout.

    NOTE: the single-file cell earlier in this notebook defines a
    zero-argument ``generate()``; running this cell rebinds that name.

    Args:
        audio_uris: iterable of ``gs://`` URIs. Every URI is sent to the
            model with the MIME type ``audio/mpeg``.

    Returns:
        None. Text is printed as it is streamed; a chunk without text prints
        ``"No text found in the response."``. A newline is printed after each
        file so consecutive transcripts do not run together.
    """
    model = GenerativeModel("gemini-1.5-flash-preview-0514")

    # Request settings are identical for every file, so build them once
    # instead of on every loop iteration.
    text_prompt = """Transcribe the audio and translate to English. First give transcription and then the translation."""
    safety_settings = {
        generative_models.HarmCategory.HARM_CATEGORY_HATE_SPEECH: generative_models.HarmBlockThreshold.BLOCK_LOW_AND_ABOVE,
        generative_models.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: generative_models.HarmBlockThreshold.BLOCK_LOW_AND_ABOVE,
        generative_models.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: generative_models.HarmBlockThreshold.BLOCK_LOW_AND_ABOVE,
        generative_models.HarmCategory.HARM_CATEGORY_HARASSMENT: generative_models.HarmBlockThreshold.BLOCK_LOW_AND_ABOVE,
    }
    generation_config = {
        "max_output_tokens": 8192,
        "temperature": 1,
        "top_p": 0.95,
    }

    for audio_uri in audio_uris:
        audio_part = Part.from_uri(uri=audio_uri, mime_type="audio/mpeg")

        responses = model.generate_content(
            [text_prompt, audio_part],
            generation_config=generation_config,
            safety_settings=safety_settings,
            stream=True,
        )
        for response in responses:
            try:
                chunk_text = response.text
            except ValueError:
                # `.text` raises ValueError when the chunk has no text (for
                # example a blocked candidate); treat it as empty so the
                # "no text" branch below is actually reached.
                chunk_text = ""

            if chunk_text.strip():
                print(chunk_text, end="")
            else:
                print("No text found in the response.", end="")

        # Terminate this file's output before the next file starts streaming.
        print()


if __name__ == "__main__":
    # Bucket and object-name prefix that hold the audio segments to process.
    bucket_name, prefix = "test-audio-segments", "ke_convos/final_dataset/"

    # Configure the SDK, list the .mp3 objects, then transcribe each of them.
    init_vertex_ai()
    audio_uris = get_audio_uris(bucket_name, prefix)
    generate(audio_uris)