# Integration e2e tests for Whisper - ChromaDB - Google

In [None]:
# Install the third-party packages for the recording section
# (-q: quiet output, -U: upgrade to the latest available version).
%pip install -qU sounddevice scipy pydub ipywidgets

## Recording

In [None]:
import sounddevice as sd
import numpy as np
from scipy.io.wavfile import write
from pydub import AudioSegment
from IPython.display import display, Audio
from uuid import uuid4
import ipywidgets as widgets
import threading
import queue

# --- Capture settings (passed to sd.InputStream and to the WAV writer) ---
channels = 1
samplerate = 44100

# --- Shared recording state ---
stream = None       # the input stream; assigned by start_recording()
recording = []      # audio chunks collected for the current take
q = queue.Queue()   # audio_callback() puts chunks here; the collector thread reads them

# --- Output location ---
# UID is generated once when this cell runs and is reused as the .wav file name.
UID = str(uuid4())
TMP_PATH = "../tmp/"

def audio_callback(indata, frames, time, status):
    """Print any stream status flag, then enqueue a copy of the incoming block.

    Passed as ``callback`` to ``sd.InputStream`` in ``start_recording``.
    ``frames`` and ``time`` are accepted but not used here.
    """
    if status:
        print(status)
    # Queue a copy rather than ``indata`` itself.
    block = indata.copy()
    q.put(block)

def start_recording(b):
    """Open the input stream and start collecting audio chunks.

    Click handler of the "Record" button; ``b`` is the clicked widget and is
    not used.

    Side effects: rebinds the module-level ``stream`` and ``recording`` and
    starts a daemon thread that moves chunks from ``q`` into ``recording``.
    Does nothing if a take is already in progress.
    """
    global stream, recording

    # A second click while a take is running would open another stream and
    # orphan the first one (still feeding ``q``, never closed).
    if stream is not None and stream.active:
        return

    # Let the previous take's collector finish so it cannot consume chunks
    # that belong to the new take. It exits shortly after its stream stops.
    previous = getattr(start_recording, "_collector", None)
    if previous is not None:
        previous.join(timeout=1.0)

    # Drop chunks left over from a previous take.
    while True:
        try:
            q.get_nowait()
        except queue.Empty:
            break

    chunks = recording = []
    active_stream = stream = sd.InputStream(
        samplerate=samplerate, channels=channels, callback=audio_callback
    )
    active_stream.start()

    def collect():
        # Works on this take's own stream and buffer, so a later take that
        # rebinds the globals is unaffected.
        # Poll with a timeout: a bare q.get() would block forever once the
        # stream stops, leaving one stuck thread behind per recording.
        while active_stream.active:
            try:
                chunks.append(q.get(timeout=0.1))
            except queue.Empty:
                continue

    worker = threading.Thread(target=collect, daemon=True)
    start_recording._collector = worker
    worker.start()

def stop_recording(b):
    """Stop the input stream, save the take as a WAV file and play it back.

    Click handler of the "Stop" button; ``b`` is the clicked widget and is
    not used. The file is written to ``TMP_PATH + UID + ".wav"``, so each take
    overwrites the previous one for the same ``UID``.
    """
    import os  # local import: this cell does not import os at module level

    # "Stop" clicked before any "Record": there is no stream to stop.
    if stream is None:
        print("No recording in progress.")
        return

    stream.stop()
    stream.close()

    # np.concatenate raises ValueError on an empty list (nothing captured).
    if not recording:
        print("No audio captured.")
        return

    audio_data = np.concatenate(recording, axis=0)

    # Save .wav (create the target directory if it does not exist yet)
    os.makedirs(TMP_PATH, exist_ok=True)
    wav_path = TMP_PATH + UID + ".wav"
    write(wav_path, samplerate, audio_data)

    display(Audio(filename=wav_path))

# Widgets: one button per action, each wired to its handler, shown side by side.
start_button = widgets.Button(description="Record")
start_button.on_click(start_recording)

stop_button = widgets.Button(description="Stop")
stop_button.on_click(stop_recording)

display(widgets.HBox(children=[start_button, stop_button]))

## STT

In [None]:
# Install the LangChain packages used by the speech-to-text cells
# (-q: quiet output, -U: upgrade to the latest available version).
# NOTE(review): pydub and librosa are presumably needed by the Whisper parser for audio handling — confirm.
%pip install -qU langchain-openai langchain-community pydub librosa

In [None]:
import os
from dotenv import load_dotenv, find_dotenv

# Load the variables from the .env file located by find_dotenv().
load_dotenv(dotenv_path=find_dotenv())

# None when the variable is not set.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

In [None]:
from langchain_community.document_loaders.parsers.audio import OpenAIWhisperParser

# Whisper transcription parser used by the loaders below.
parser = OpenAIWhisperParser(
    model="whisper-1",
    api_key=OPENAI_API_KEY,
    language="fr",                 # language of the audio
    response_format="text",
    temperature=0.0,               # deterministic
    chunk_duration_threshold=0.7,  # the parser disregards audio chunks shorter than 0.7 seconds
)

In [None]:
from langchain_community.document_loaders.generic import GenericLoader

# Example path to an audio file (currently unused):
# audio_path = "../data/audio/test_whisper_clovis.m4a"

# Pair the recorded .wav file with the Whisper parser through a GenericLoader.
# https://python.langchain.com/api_reference/community/document_loaders/langchain_community.document_loaders.generic.GenericLoader.html
loader = GenericLoader.from_filesystem(
    TMP_PATH + f"{UID}.wav",
    parser=parser,
)

# Run the loader and keep the resulting documents.
docs = loader.load()

## Save to DB

In [None]:
# Install the Chroma client, its LangChain integration and python-dotenv
# (-q: quiet output, -U: upgrade to the latest available version).
%pip install -qU chromadb python-dotenv langchain-chroma

In [None]:
from dotenv import load_dotenv
# Load variables from a .env file; the CHROMA_* settings are read with os.getenv below.
load_dotenv()

In [None]:
# Instantiate the Chroma client

import os
import chromadb
from chromadb.api import ClientAPI

def get_chroma_client() -> ClientAPI:
    """Build a Chroma Cloud client from environment settings.

    Reads ``CHROMA_API_KEY``, ``CHROMA_TENANT`` and ``CHROMA_DATABASE`` with
    ``os.getenv``; an unset variable is passed through as ``None``.
    """
    cloud_settings = {
        "api_key": os.getenv("CHROMA_API_KEY"),
        "tenant": os.getenv("CHROMA_TENANT"),
        "database": os.getenv("CHROMA_DATABASE"),
    }
    return chromadb.CloudClient(**cloud_settings)

In [None]:
# Instantiate the LangChain vector store

from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma

def get_chroma_collection(collection_name: str):
    """Return a LangChain ``Chroma`` vector store bound to ``collection_name``.

    Each call creates a new client through ``get_chroma_client()`` and a new
    ``OpenAIEmbeddings`` instance using the "text-embedding-3-small" model.
    """
    client = get_chroma_client()
    embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
    return Chroma(
        client=client,
        embedding_function=embeddings,
        collection_name=collection_name,
    )

In [None]:
# Add documents

from uuid import uuid4
from langchain_core.documents import Document

# Copy each transcript into a fresh Document tagged "test". Blank transcripts
# are skipped: they carry no text to embed or retrieve.
documents = [
    Document(page_content=doc.page_content, metadata={"tag": "test"})
    for doc in docs
    if doc.page_content.strip()
]

# One random id per document.
uuids = [str(uuid4()) for _ in documents]

# Always create the store: the retrieval cell further down reuses `collection`.
collection = get_chroma_collection("test_collection")

# Nothing transcribed: skip the write instead of sending an empty batch.
if documents:
    collection.add_documents(documents=documents, ids=uuids)
else:
    print("No transcription to store; nothing was added to the collection.")

## Retrieve from DB

In [None]:
# Widgets: fresh Record/Stop buttons; the next cell transcribes the saved take
# and uses it as the search query.
start_button = widgets.Button(description="Record")
stop_button = widgets.Button(description="Stop")

start_button.on_click(start_recording)
stop_button.on_click(stop_recording)

# Render the buttons. Without this call the cell creates and wires them but
# shows nothing, so no new take can be recorded from here.
display(widgets.HBox([start_button, stop_button]))

In [None]:
# Transcribe the saved take; its text is used as the search query.
loader = GenericLoader.from_filesystem(TMP_PATH + f"{UID}.wav", parser=parser)
docs = loader.load()

# Fail with an explicit message instead of a bare IndexError on docs[0].
if not docs:
    raise RuntimeError("Transcription returned no document; record a query first.")

# Query by retriever

retriever = collection.as_retriever(
    search_type="mmr", search_kwargs={"k": 1, "fetch_k": 5}
)

# NOTE(review): score_threshold looks unused with search_type="mmr" — confirm
# against the installed LangChain version before relying on it.
query_res = retriever.invoke(docs[0].page_content, filter=None, score_threshold=0)

# Same reasoning for query_res[0]: an empty result gets a clear error.
if not query_res:
    raise RuntimeError("The retriever returned no result for the transcribed query.")

query_res_parsed = query_res[0].page_content

print(query_res_parsed)

# TTS Google

Link to the [LangChain Google Cloud Text-to-Speech integration](https://python.langchain.com/docs/integrations/tools/google_cloud_texttospeech/)

## Set up a Google account

Link to [Google Cloud Console Tutorial](https://cloud.google.com/text-to-speech?hl=fr)

[Text-to-Speech API documentation](https://cloud.google.com/text-to-speech/docs/apis?hl=fr)

## Install the library

In [None]:
# Install the Google Cloud Text-to-Speech client library
# (-q: quiet output, -U: upgrade to the latest available version).
%pip install -qU google-cloud-texttospeech

In [None]:
import os

# Set GOOGLE_APPLICATION_CREDENTIALS to the local credentials JSON file.
# NOTE(review): presumably read by the Text-to-Speech client created below — confirm.
os.environ.update(GOOGLE_APPLICATION_CREDENTIALS="../secrets/google_tts.json")

In [None]:
from google.cloud import texttospeech

# Create a client
client = texttospeech.TextToSpeechClient()

# Define the text to synthesize and the voice / audio configuration
synthesis_input = texttospeech.SynthesisInput(text=query_res_parsed)

voice = texttospeech.VoiceSelectionParams(
    language_code="fr-FR",
    name="fr-FR-Chirp3-HD-Charon",
)

audio_config = texttospeech.AudioConfig(
    audio_encoding=texttospeech.AudioEncoding.MP3,
)

# Call the API
response = client.synthesize_speech(
    input=synthesis_input,
    voice=voice,
    audio_config=audio_config,
)

# Save the result. The path is built once so the file written and the file
# played back cannot drift apart; the directory is created if missing.
uid_2 = str(uuid4())
audio_dir = "../data/audio"
os.makedirs(audio_dir, exist_ok=True)
mp3_path = f"{audio_dir}/{uid_2}.mp3"
with open(mp3_path, "wb") as out:
    out.write(response.audio_content)

display(Audio(filename=mp3_path))