diff --git a/genai/live/live_audio_with_txt.py b/genai/live/live_audio_with_txt.py new file mode 100644 index 00000000000..ac82a299944 --- /dev/null +++ b/genai/live/live_audio_with_txt.py @@ -0,0 +1,85 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Test file: https://storage.googleapis.com/generativeai-downloads/data/16000.wav +# Install helpers for converting files: pip install librosa soundfile simpleaudio + +import asyncio + + +async def generate_content() -> list[str]: + # [START googlegenaisdk_live_audio_with_txt] + from google import genai + from google.genai.types import ( + Content, LiveConnectConfig, Modality, Part, + PrebuiltVoiceConfig, SpeechConfig, VoiceConfig + ) + import numpy as np + import soundfile as sf + import simpleaudio as sa + + def play_audio(audio_array: np.ndarray, sample_rate: int = 24000) -> None: + sf.write("output.wav", audio_array, sample_rate) + wave_obj = sa.WaveObject.from_wave_file("output.wav") + play_obj = wave_obj.play() + play_obj.wait_done() + + client = genai.Client() + voice_name = "Aoede" + model = "gemini-2.0-flash-live-preview-04-09" + + config = LiveConnectConfig( + response_modalities=[Modality.AUDIO], + speech_config=SpeechConfig( + voice_config=VoiceConfig( + prebuilt_voice_config=PrebuiltVoiceConfig( + voice_name=voice_name, + ) + ), + ), + ) + + async with client.aio.live.connect( + model=model, + config=config, + ) as session: + text_input = "Hello? Gemini are you there?" + print("> ", text_input, "\n") + + await session.send_client_content( + turns=Content(role="user", parts=[Part(text=text_input)]) + ) + + audio_data = [] + async for message in session.receive(): + if ( + message.server_content.model_turn + and message.server_content.model_turn.parts + ): + for part in message.server_content.model_turn.parts: + if part.inline_data: + audio_data.append( + np.frombuffer(part.inline_data.data, dtype=np.int16) + ) + + if audio_data: + print("Received audio answer: ") + play_audio(np.concatenate(audio_data), sample_rate=24000) + + # [END googlegenaisdk_live_audio_with_txt] + return [] + + +if __name__ == "__main__": + asyncio.run(generate_content()) diff --git a/genai/live/live_structured_ouput_with_txt.py b/genai/live/live_structured_output_with_txt.py similarity index 100% rename from genai/live/live_structured_ouput_with_txt.py rename to genai/live/live_structured_output_with_txt.py diff --git a/genai/live/live_txt_with_audio.py b/genai/live/live_txt_with_audio.py new file mode 100644 index 00000000000..8ddc2703c99 --- /dev/null +++ b/genai/live/live_txt_with_audio.py @@ -0,0 +1,72 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# Test file: https://storage.googleapis.com/generativeai-downloads/data/16000.wav +# Install helpers for converting files: pip install librosa soundfile + +import asyncio + + +async def generate_content() -> list[str]: + # [START googlegenaisdk_live_txt_with_audio] + import io + + import librosa + import requests + import soundfile as sf + from google import genai + from google.genai.types import Blob, LiveConnectConfig, Modality + + client = genai.Client() + model = "gemini-2.0-flash-live-preview-04-09" + config = LiveConnectConfig(response_modalities=[Modality.TEXT]) + + async with client.aio.live.connect(model=model, config=config) as session: + audio_url = ( + "https://storage.googleapis.com/generativeai-downloads/data/16000.wav" + ) + response = requests.get(audio_url) + response.raise_for_status() + buffer = io.BytesIO(response.content) + y, sr = librosa.load(buffer, sr=16000) + sf.write(buffer, y, sr, format="RAW", subtype="PCM_16") + buffer.seek(0) + audio_bytes = buffer.read() + + # If you've pre-converted to sample.pcm using ffmpeg, use this instead: + # audio_bytes = Path("sample.pcm").read_bytes() + + print("> Answer to this audio url", audio_url, "\n") + + await session.send_realtime_input( + media=Blob(data=audio_bytes, mime_type="audio/pcm;rate=16000") + ) + + response = [] + + async for message in session.receive(): + if message.text is not None: + response.append(message.text) + + print("".join(response)) + # Example output: + # > Answer to this audio url https://storage.googleapis.com/generativeai-downloads/data/16000.wav + # Yes, I can hear you. How can I help you today? + # [END googlegenaisdk_live_txt_with_audio] + return response + + +if __name__ == "__main__": + asyncio.run(generate_content()) diff --git a/genai/live/live_websocket_audiogen_with_txt.py b/genai/live/live_websocket_audiogen_with_txt.py index b63e60aaac6..5d2b9266e6f 100644 --- a/genai/live/live_websocket_audiogen_with_txt.py +++ b/genai/live/live_websocket_audiogen_with_txt.py @@ -20,7 +20,9 @@ def get_bearer_token() -> str: import google.auth from google.auth.transport.requests import Request - creds, _ = google.auth.default(scopes=["https://www.googleapis.com/auth/cloud-platform"]) + creds, _ = google.auth.default( + scopes=["https://www.googleapis.com/auth/cloud-platform"] + ) auth_req = Request() creds.refresh(auth_req) bearer_token = creds.token @@ -55,9 +57,7 @@ async def generate_content() -> str: # Websocket Configuration WEBSOCKET_HOST = "us-central1-aiplatform.googleapis.com" - WEBSOCKET_SERVICE_URL = ( - f"wss://{WEBSOCKET_HOST}/ws/google.cloud.aiplatform.v1.LlmBidiService/BidiGenerateContent" - ) + WEBSOCKET_SERVICE_URL = f"wss://{WEBSOCKET_HOST}/ws/google.cloud.aiplatform.v1.LlmBidiService/BidiGenerateContent" # Websocket Authentication headers = { @@ -66,9 +66,7 @@ async def generate_content() -> str: } # Model Configuration - model_path = ( - f"projects/{PROJECT_ID}/locations/{LOCATION}/publishers/google/models/{GEMINI_MODEL_NAME}" - ) + model_path = f"projects/{PROJECT_ID}/locations/{LOCATION}/publishers/google/models/{GEMINI_MODEL_NAME}" model_generation_config = { "response_modalities": ["AUDIO"], "speech_config": { @@ -77,7 +75,9 @@ async def generate_content() -> str: }, } - async with connect(WEBSOCKET_SERVICE_URL, additional_headers=headers) as websocket_session: + async with connect( + WEBSOCKET_SERVICE_URL, additional_headers=headers + ) as websocket_session: # 1. Send setup configuration websocket_config = { "setup": { @@ -120,7 +120,9 @@ async def generate_content() -> str: server_content = response_chunk.get("serverContent") if not server_content: # This might indicate an error or an unexpected message format - print(f"Received non-serverContent message or empty content: {response_chunk}") + print( + f"Received non-serverContent message or empty content: {response_chunk}" + ) break # Collect audio chunks @@ -129,7 +131,9 @@ async def generate_content() -> str: for part in model_turn["parts"]: if part["inlineData"]["mimeType"] == "audio/pcm": audio_chunk = base64.b64decode(part["inlineData"]["data"]) - aggregated_response_parts.append(np.frombuffer(audio_chunk, dtype=np.int16)) + aggregated_response_parts.append( + np.frombuffer(audio_chunk, dtype=np.int16) + ) # End of response if server_content.get("turnComplete"): @@ -137,7 +141,9 @@ async def generate_content() -> str: # Save audio to a file if aggregated_response_parts: - wavfile.write("output.wav", 24000, np.concatenate(aggregated_response_parts)) + wavfile.write( + "output.wav", 24000, np.concatenate(aggregated_response_parts) + ) # Example response: # Setup Response: {'setupComplete': {}} # Input: Hello? Gemini are you there? diff --git a/genai/live/live_websocket_audiotranscript_with_txt.py b/genai/live/live_websocket_audiotranscript_with_txt.py index 6b769639eb6..fbc47c7ae07 100644 --- a/genai/live/live_websocket_audiotranscript_with_txt.py +++ b/genai/live/live_websocket_audiotranscript_with_txt.py @@ -20,7 +20,9 @@ def get_bearer_token() -> str: import google.auth from google.auth.transport.requests import Request - creds, _ = google.auth.default(scopes=["https://www.googleapis.com/auth/cloud-platform"]) + creds, _ = google.auth.default( + scopes=["https://www.googleapis.com/auth/cloud-platform"] + ) auth_req = Request() creds.refresh(auth_req) bearer_token = creds.token @@ -55,9 +57,7 @@ async def generate_content() -> str: # Websocket Configuration WEBSOCKET_HOST = "us-central1-aiplatform.googleapis.com" - WEBSOCKET_SERVICE_URL = ( - f"wss://{WEBSOCKET_HOST}/ws/google.cloud.aiplatform.v1.LlmBidiService/BidiGenerateContent" - ) + WEBSOCKET_SERVICE_URL = f"wss://{WEBSOCKET_HOST}/ws/google.cloud.aiplatform.v1.LlmBidiService/BidiGenerateContent" # Websocket Authentication headers = { @@ -66,9 +66,7 @@ async def generate_content() -> str: } # Model Configuration - model_path = ( - f"projects/{PROJECT_ID}/locations/{LOCATION}/publishers/google/models/{GEMINI_MODEL_NAME}" - ) + model_path = f"projects/{PROJECT_ID}/locations/{LOCATION}/publishers/google/models/{GEMINI_MODEL_NAME}" model_generation_config = { "response_modalities": ["AUDIO"], "speech_config": { @@ -77,7 +75,9 @@ async def generate_content() -> str: }, } - async with connect(WEBSOCKET_SERVICE_URL, additional_headers=headers) as websocket_session: + async with connect( + WEBSOCKET_SERVICE_URL, additional_headers=headers + ) as websocket_session: # 1. Send setup configuration websocket_config = { "setup": { @@ -125,7 +125,9 @@ async def generate_content() -> str: server_content = response_chunk.get("serverContent") if not server_content: # This might indicate an error or an unexpected message format - print(f"Received non-serverContent message or empty content: {response_chunk}") + print( + f"Received non-serverContent message or empty content: {response_chunk}" + ) break # Transcriptions @@ -142,7 +144,9 @@ async def generate_content() -> str: for part in model_turn["parts"]: if part["inlineData"]["mimeType"] == "audio/pcm": audio_chunk = base64.b64decode(part["inlineData"]["data"]) - aggregated_response_parts.append(np.frombuffer(audio_chunk, dtype=np.int16)) + aggregated_response_parts.append( + np.frombuffer(audio_chunk, dtype=np.int16) + ) # End of response if server_content.get("turnComplete"): diff --git a/genai/live/live_websocket_textgen_with_audio.py b/genai/live/live_websocket_textgen_with_audio.py index 00923d39310..584a4b3dae0 100644 --- a/genai/live/live_websocket_textgen_with_audio.py +++ b/genai/live/live_websocket_textgen_with_audio.py @@ -20,7 +20,9 @@ def get_bearer_token() -> str: import google.auth from google.auth.transport.requests import Request - creds, _ = google.auth.default(scopes=["https://www.googleapis.com/auth/cloud-platform"]) + creds, _ = google.auth.default( + scopes=["https://www.googleapis.com/auth/cloud-platform"] + ) auth_req = Request() creds.refresh(auth_req) bearer_token = creds.token @@ -65,9 +67,7 @@ def read_wavefile(filepath: str) -> tuple[str, str]: # Websocket Configuration WEBSOCKET_HOST = "us-central1-aiplatform.googleapis.com" - WEBSOCKET_SERVICE_URL = ( - f"wss://{WEBSOCKET_HOST}/ws/google.cloud.aiplatform.v1.LlmBidiService/BidiGenerateContent" - ) + WEBSOCKET_SERVICE_URL = f"wss://{WEBSOCKET_HOST}/ws/google.cloud.aiplatform.v1.LlmBidiService/BidiGenerateContent" # Websocket Authentication headers = { @@ -76,9 +76,7 @@ def read_wavefile(filepath: str) -> tuple[str, str]: } # Model Configuration - model_path = ( - f"projects/{PROJECT_ID}/locations/{LOCATION}/publishers/google/models/{GEMINI_MODEL_NAME}" - ) + model_path = f"projects/{PROJECT_ID}/locations/{LOCATION}/publishers/google/models/{GEMINI_MODEL_NAME}" model_generation_config = {"response_modalities": ["TEXT"]} async with connect(WEBSOCKET_SERVICE_URL, additional_headers=headers) as websocket_session: diff --git a/genai/live/live_websocket_textgen_with_txt.py b/genai/live/live_websocket_textgen_with_txt.py index 56b69472052..96028315de8 100644 --- a/genai/live/live_websocket_textgen_with_txt.py +++ b/genai/live/live_websocket_textgen_with_txt.py @@ -20,7 +20,9 @@ def get_bearer_token() -> str: import google.auth from google.auth.transport.requests import Request - creds, _ = google.auth.default(scopes=["https://www.googleapis.com/auth/cloud-platform"]) + creds, _ = google.auth.default( + scopes=["https://www.googleapis.com/auth/cloud-platform"] + ) auth_req = Request() creds.refresh(auth_req) bearer_token = creds.token @@ -52,9 +54,7 @@ async def generate_content() -> str: # Websocket Configuration WEBSOCKET_HOST = "us-central1-aiplatform.googleapis.com" - WEBSOCKET_SERVICE_URL = ( - f"wss://{WEBSOCKET_HOST}/ws/google.cloud.aiplatform.v1.LlmBidiService/BidiGenerateContent" - ) + WEBSOCKET_SERVICE_URL = f"wss://{WEBSOCKET_HOST}/ws/google.cloud.aiplatform.v1.LlmBidiService/BidiGenerateContent" # Websocket Authentication headers = { @@ -63,12 +63,12 @@ async def generate_content() -> str: } # Model Configuration - model_path = ( - f"projects/{PROJECT_ID}/locations/{LOCATION}/publishers/google/models/{GEMINI_MODEL_NAME}" - ) + model_path = f"projects/{PROJECT_ID}/locations/{LOCATION}/publishers/google/models/{GEMINI_MODEL_NAME}" model_generation_config = {"response_modalities": ["TEXT"]} - async with connect(WEBSOCKET_SERVICE_URL, additional_headers=headers) as websocket_session: + async with connect( + WEBSOCKET_SERVICE_URL, additional_headers=headers + ) as websocket_session: # 1. Send setup configuration websocket_config = { "setup": { @@ -111,7 +111,9 @@ async def generate_content() -> str: server_content = response_chunk.get("serverContent") if not server_content: # This might indicate an error or an unexpected message format - print(f"Received non-serverContent message or empty content: {response_chunk}") + print( + f"Received non-serverContent message or empty content: {response_chunk}" + ) break # Collect text responses diff --git a/genai/live/requirements.txt b/genai/live/requirements.txt index dd1891ee073..c3c3a3e35c1 100644 --- a/genai/live/requirements.txt +++ b/genai/live/requirements.txt @@ -1,7 +1,9 @@ -google-genai==1.28.0 -scipy==1.16.1 -websockets==15.0.1 -numpy==1.26.4 -soundfile==0.12.1 -openai==1.99.1 -setuptools==80.9.0 \ No newline at end of file +google-genai==1.28.0 +scipy==1.16.1 +websockets==15.0.1 +numpy==1.26.4 +soundfile==0.12.1 +openai==1.99.1 +setuptools==80.9.0 +librosa==0.11.0 +simpleaudio==1.0.0 \ No newline at end of file diff --git a/genai/live/test_live_examples.py b/genai/live/test_live_examples.py index f4d25e137ed..89099433a6c 100644 --- a/genai/live/test_live_examples.py +++ b/genai/live/test_live_examples.py @@ -20,12 +20,14 @@ import pytest +import live_audio_with_txt import live_audiogen_with_txt import live_code_exec_with_txt import live_func_call_with_txt import live_ground_googsearch_with_txt -import live_structured_ouput_with_txt +import live_structured_output_with_txt import live_transcribe_with_audio +import live_txt_with_audio import live_txtgen_with_audio import live_websocket_audiogen_with_txt import live_websocket_audiotranscript_with_txt @@ -44,9 +46,9 @@ async def test_live_with_text() -> None: assert await live_with_txt.generate_content() -@pytest.mark.asyncio -async def test_live_websocket_textgen_with_audio() -> None: - assert await live_websocket_textgen_with_audio.generate_content() +# @pytest.mark.asyncio +# async def test_live_websocket_textgen_with_audio() -> None: +# assert await live_websocket_textgen_with_audio.generate_content() @pytest.mark.asyncio @@ -97,4 +99,15 @@ async def test_live_txtgen_with_audio() -> None: @pytest.mark.asyncio async def test_live_structured_ouput_with_txt() -> None: - assert live_structured_ouput_with_txt.generate_content() + assert live_structured_output_with_txt.generate_content() + + +@pytest.mark.asyncio +async def test_live_txt_with_audio() -> None: + assert await live_txt_with_audio.generate_content() + + +@pytest.mark.asyncio +async def test_live_audio_with_txt() -> None: + result = await live_audio_with_txt.generate_content() + assert result is not None diff --git a/genai/text_generation/test_text_generation_examples.py b/genai/text_generation/test_text_generation_examples.py index 3381ae7ec8c..cc8f471db99 100644 --- a/genai/text_generation/test_text_generation_examples.py +++ b/genai/text_generation/test_text_generation_examples.py @@ -22,6 +22,7 @@ import textgen_async_with_txt import textgen_chat_stream_with_txt import textgen_chat_with_txt +import textgen_code_with_pdf import textgen_config_with_txt import textgen_sys_instr_with_txt import textgen_transcript_with_gcs_audio @@ -38,6 +39,7 @@ import textgen_with_youtube_video import thinking_textgen_with_txt + os.environ["GOOGLE_GENAI_USE_VERTEXAI"] = "True" os.environ["GOOGLE_CLOUD_LOCATION"] = "global" # "us-central1" # The project name is included in the CICD pipeline @@ -137,6 +139,11 @@ def test_textgen_with_youtube_video() -> None: assert response +def test_textgen_code_with_pdf() -> None: + response = textgen_code_with_pdf.generate_content() + assert response + + # Migrated to Model Optimser Folder # def test_model_optimizer_textgen_with_txt() -> None: # os.environ["GOOGLE_CLOUD_LOCATION"] = "us-central1" diff --git a/genai/text_generation/textgen_code_with_pdf.py b/genai/text_generation/textgen_code_with_pdf.py new file mode 100644 index 00000000000..da4ca76b73a --- /dev/null +++ b/genai/text_generation/textgen_code_with_pdf.py @@ -0,0 +1,55 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# !This sample works with Google Cloud Vertex AI API only. + + +def generate_content() -> str: + # [START googlegenaisdk_textgen_code_with_pdf] + from google import genai + from google.genai.types import HttpOptions, Part + + client = genai.Client(http_options=HttpOptions(api_version="v1beta1")) + model_id = "gemini-2.5-flash" + prompt = "Convert this python code to use Google Python Style Guide." + print("> ", prompt, "\n") + pdf_uri = "https://storage.googleapis.com/cloud-samples-data/generative-ai/text/inefficient_fibonacci_series_python_code.pdf" + + pdf_file = Part.from_uri( + file_uri=pdf_uri, + mime_type="application/pdf", + ) + + response = client.models.generate_content( + model=model_id, + contents=[pdf_file, prompt], + ) + + print(response.text) + # Example response: + # > Convert this python code to use Google Python Style Guide. + # + # def generate_fibonacci_sequence(num_terms: int) -> list[int]: + # """Generates the Fibonacci sequence up to a specified number of terms. + # + # This function calculates the Fibonacci sequence starting with 0 and 1. + # It handles base cases for 0, 1, and 2 terms efficiently. + # + # # ... + # [END googlegenaisdk_textgen_code_with_pdf] + return response.text + + +if __name__ == "__main__": + generate_content()