In [1]:
# Dependencies for this notebook. Comments are kept on their own lines because
# the text after `%pip` is handed to the shell as-is, and not every shell treats
# a trailing `# ...` as a comment.
# `uuid` is part of the Python standard library and is not installed from PyPI.

# Environment loading
%pip install -qU python-dotenv
# Speech-To-Text (microphone capture, WAV writing, transcription)
%pip install -qU langchain-openai langchain-community pydub librosa numpy scipy sounddevice
# ChromaDB
%pip install -qU chromadb langchain-chroma
# Text-To-Speech
%pip install -qU google-cloud-texttospeech
# Notebook UI (buttons)
%pip install -qU ipywidgets

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


## ChromaDB config

In [2]:
import os
import chromadb
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
from dotenv import load_dotenv, find_dotenv

load_dotenv(find_dotenv())

def instanciate_chroma_client():
    """Build a ChromaDB Cloud client from environment variables.

    Reads ``CHROMA_API_KEY``, ``CHROMA_TENANT`` and ``CHROMA_DATABASE`` with
    ``os.getenv``; a variable that is not set is forwarded as ``None``.

    Returns:
        The object returned by ``chromadb.CloudClient``.
    """
    cloud_settings = {
        "api_key": os.getenv("CHROMA_API_KEY"),
        "tenant": os.getenv("CHROMA_TENANT"),
        "database": os.getenv("CHROMA_DATABASE"),
    }
    return chromadb.CloudClient(**cloud_settings)

def get_chroma_collection(collection_name: str):
    """Return a LangChain ``Chroma`` vector store bound to ``collection_name``.

    A new ChromaDB client and a new ``text-embedding-3-small`` OpenAI embedding
    function are created on every call.

    Args:
        collection_name: Name of the collection the store operates on.

    Returns:
        The ``Chroma`` vector store instance.
    """
    chroma_client = instanciate_chroma_client()
    embedder = OpenAIEmbeddings(model="text-embedding-3-small")
    return Chroma(
        client=chroma_client,
        embedding_function=embedder,
        collection_name=collection_name,
    )

## Speech-to-Text config

In [3]:
from langchain_community.document_loaders.parsers.audio import OpenAIWhisperParser
from langchain_community.document_loaders.generic import GenericLoader

def instanciate_whisper():
    """Create the OpenAI Whisper parser used to transcribe recordings.

    The API key is read from the ``OPENAI_API_KEY`` environment variable; all
    other settings are fixed here.

    Returns:
        A configured ``OpenAIWhisperParser``.
    """
    whisper_options = dict(
        api_key=os.getenv('OPENAI_API_KEY'),
        # Audio chunks shorter than 0.7 seconds are disregarded.
        chunk_duration_threshold=0.7,
        # Language of the recorded audio.
        language="fr",
        response_format="text",
        # Temperature 0 keeps the transcription deterministic.
        temperature=0.0,
        model="whisper-1",
    )
    return OpenAIWhisperParser(**whisper_options)

def get_blob_parser(file_path: str):
    """Build a ``GenericLoader`` that parses ``file_path`` with Whisper.

    Args:
        file_path: Filesystem path handed to ``GenericLoader.from_filesystem``.

    Returns:
        The loader, wired to a freshly created Whisper parser.
    """
    whisper_parser = instanciate_whisper()
    audio_loader = GenericLoader.from_filesystem(path=file_path, parser=whisper_parser)
    return audio_loader

def get_transcription(file_path: str):
    """Transcribe the audio at ``file_path``.

    Args:
        file_path: Path of the audio file to load.

    Returns:
        Whatever the loader's ``load()`` returns (the transcribed documents).
    """
    return get_blob_parser(file_path).load()

## Save to ChromaDB

In [4]:
from langchain_core.documents import Document
from uuid import uuid4

def save_to_chroma(docs, collection_name: str):
    """Save documents to a ChromaDB collection.

    Each input document is copied into a new ``Document`` whose metadata holds
    the original ``source`` (as ``tag``, empty string when absent) and a freshly
    generated UUID (as ``id``). The same UUIDs are used as the stored ids.

    Args:
        docs: Iterable of objects exposing ``page_content`` and ``metadata``.
        collection_name: Name of the target collection.

    Returns:
        None. A one-line summary is printed.
    """
    documents = []
    doc_ids = []

    for doc in docs:
        # One UUID per document, shared by the metadata and the stored id.
        unique_id = str(uuid4())
        doc_ids.append(unique_id)
        documents.append(
            Document(
                page_content=doc.page_content,
                metadata={
                    "tag": doc.metadata.get("source", ""),
                    "id": unique_id,
                },
            )
        )

    # Nothing to store: skip opening the collection and the insert call.
    if not documents:
        print(f"No documents to save to ChromaDB collection '{collection_name}'")
        return None

    # Open the collection once, only when there is something to write.
    collection = get_chroma_collection(collection_name)
    collection.add_documents(documents=documents, ids=doc_ids)

    print(f"Saved {len(documents)} documents to ChromaDB collection '{collection_name}'")
    return None

## Retrieve from DB

In [5]:
def retrieve_from_chroma(query: str, collection_name: str, nResults: int = 1):
    """Look up the best match for ``query`` in a ChromaDB collection.

    The search uses the "mmr" search type with ``k = nResults`` and
    ``fetch_k = nResults * 2``. Only the first hit is returned.

    Args:
        query: Text to search for.
        collection_name: Name of the collection to search.
        nResults: Number of results requested from the retriever.

    Returns:
        The ``page_content`` of the first hit, or ``None`` (after printing a
        notice) when the retriever returns nothing.
    """
    store = get_chroma_collection(collection_name)
    search_options = {"k": nResults, "fetch_k": nResults * 2}
    matches = store.as_retriever(
        search_type="mmr", search_kwargs=search_options
    ).invoke(input=query)

    if matches:
        return matches[0].page_content

    return print("No results found for the query.")

## Text-to-Speech config

In [6]:
from google.cloud import texttospeech

def instanciate_tts_client(credentials_path: str = "../secrets/google_tts.json"):
    """Create a Google Text-to-Speech client.

    Side effect: sets the ``GOOGLE_APPLICATION_CREDENTIALS`` environment
    variable of the current process to ``credentials_path`` before the client
    is constructed.

    Args:
        credentials_path: Path stored in ``GOOGLE_APPLICATION_CREDENTIALS``.

    Returns:
        A new ``texttospeech.TextToSpeechClient``.
    """
    os.environ.update(GOOGLE_APPLICATION_CREDENTIALS=credentials_path)
    tts_client = texttospeech.TextToSpeechClient()
    return tts_client

def synthesize_speech(
    text: str,
    language_code: str = "fr-FR",
    voice_name: str = "fr-FR-Chirp3-HD-Charon",
):
    """Synthesize speech from text using Google Text-to-Speech.

    Args:
        text: Text to speak.
        language_code: Language code passed to ``VoiceSelectionParams``.
            Defaults to the French voice this notebook was written for.
        voice_name: Voice name passed to ``VoiceSelectionParams``.

    Returns:
        The ``audio_content`` of the service response, requested in MP3
        encoding.
    """
    client = instanciate_tts_client()

    synthesis_input = texttospeech.SynthesisInput(text=text)
    voice = texttospeech.VoiceSelectionParams(
        language_code=language_code,
        name=voice_name,
    )
    audio_config = texttospeech.AudioConfig(
        audio_encoding=texttospeech.AudioEncoding.MP3
    )

    response = client.synthesize_speech(
        input=synthesis_input,
        voice=voice,
        audio_config=audio_config,
    )
    return response.audio_content

def save_audio_to_file(audio_content, file_path: str):
    """Write ``audio_content`` to ``file_path`` in binary mode.

    Args:
        audio_content: Bytes to write.
        file_path: Destination path; an existing file is overwritten.

    Returns:
        ``file_path``, unchanged.
    """
    with open(file_path, "wb") as audio_file:
        audio_file.write(audio_content)
    return file_path

## Recording flow

In [7]:
import sounddevice as sd
import numpy as np
import threading
import queue
from scipy.io.wavfile import write
from uuid import uuid4

# Global variables: module-level state shared by the recording functions below.
stream = None  # current input stream; assigned by start_recording()
recording = []  # audio chunks of the current recording, appended by the collector thread
SAMPLERATE = 44100  # sample rate passed to the input stream and to the WAV writer
QUEUE = queue.Queue()  # hand-off buffer filled by audio_callback()
FINAL_PATH = None  # NOTE(review): not referenced by any code shown in this notebook

def audio_callback(indata, frames, time, status):
    """Callback function to handle audio input.

    Puts a copy of the incoming block on the global ``QUEUE``. A truthy
    ``status`` is printed instead of being silently dropped, so problems
    reported by the audio stream are visible.

    Args:
        indata: Incoming audio block; copied before it is queued.
        frames: Unused.
        time: Unused.
        status: Status value supplied with the block; printed when truthy.
    """
    if status:
        print(f"Audio input status: {status}")
    QUEUE.put(indata.copy())

def start_recording(button=None):
    """Start recording audio from the microphone.

    Resets the global ``recording`` list and ``QUEUE``, opens a mono input
    stream at ``SAMPLERATE`` and starts a daemon thread that moves queued
    chunks into ``recording``.

    If an earlier stream is still active it is stopped and closed first, so a
    second start does not leave the previous stream running.

    Args:
        button: Unused; present so the function can be used as a click handler.
    """
    global stream, recording, QUEUE

    # Release a stream left running by a previous start without a stop.
    if stream is not None and stream.active:
        stream.stop()
        stream.close()

    # Reset recording and queue
    recording = []
    QUEUE = queue.Queue()

    # Streaming start
    stream = sd.InputStream(samplerate=SAMPLERATE, channels=1, callback=audio_callback)
    stream.start()

    # Bind this session's objects locally: a collector from an older session
    # must never read the globals once they have been replaced.
    session_stream, session_queue, session_chunks = stream, QUEUE, recording

    def collect():
        """Move chunks from the queue into the recording list."""
        while session_stream.active:
            try:
                session_chunks.append(session_queue.get(timeout=0.1))
            except queue.Empty:
                continue
        # The stream is no longer active: pick up whatever is still queued.
        while True:
            try:
                session_chunks.append(session_queue.get_nowait())
            except queue.Empty:
                break

    # Start the recording
    threading.Thread(target=collect, daemon=True).start()
    print("Recording started...")

def stop_recording(recording_type: str, collection_name: str = "test_collection"):
    """Stop the audio recording, transcribe it and act on the transcription.

    The recorded chunks are written to a WAV file under ``../tmp`` and
    transcribed. Then, depending on ``recording_type``:

    * ``"create"``: the transcription is saved to ChromaDB and the WAV path is
      returned.
    * ``"query"``: the WAV file is deleted, the transcription is used to query
      ChromaDB, and the result (if any) is synthesized to an MP3 file whose
      path is returned.
    * any other value: the WAV path is returned without further processing.

    Args:
        recording_type: ``"create"`` or ``"query"``.
        collection_name: ChromaDB collection to write to or query.

    Returns:
        Path of the audio file to present, or ``None`` when there is no active
        recording, no audio data, an empty transcription, or no query result.
    """
    if not stream or not stream.active:
        print("No active recording to stop.")
        return None

    stream.stop()
    stream.close()

    # The collector thread stops polling once the stream is inactive, so the
    # last chunks may still be queued; pull them in so the tail is not dropped.
    while True:
        try:
            recording.append(QUEUE.get_nowait())
        except queue.Empty:
            break

    if not recording:
        print("No audio data recorded.")
        return None

    audio_data = np.concatenate(recording, axis=0)

    os.makedirs("../tmp", exist_ok=True)
    audio_uid = str(uuid4())
    audio_path = f"../tmp/{audio_uid}.wav"
    write(audio_path, SAMPLERATE, audio_data)

    docs = get_transcription(audio_path)

    if recording_type == "create":
        save_to_chroma(docs=docs, collection_name=collection_name)
    elif recording_type == "query":
        os.remove(audio_path)
        # The WAV file is gone: never hand its path back to the caller.
        audio_path = None

        query = " ".join(doc.page_content for doc in docs).strip()
        if not query:
            print("Transcription is empty; nothing to query.")
            return None

        print(f"Querying ChromaDB with: {query}")
        result = retrieve_from_chroma(query=query, collection_name=collection_name, nResults=1)
        if result:
            audio_content = synthesize_speech(result)
            audio_path = save_audio_to_file(audio_content, f"../tmp/{audio_uid}.mp3")

    return audio_path

## Buttons

In [10]:
# Mode passed to stop_recording() by the Stop button:
#   "create" - save the transcription of the recording to ChromaDB
#   "query"  - query ChromaDB with the transcription and synthesize the result to audio
RECORDING_TYPE = "query"

In [9]:
import glob
import ipywidgets as widgets
from IPython.display import display, Audio, clear_output

# Widgets
# Start/stop buttons; the stop button is disabled until a recording starts.
start_button = widgets.Button(description="Start recording", button_style='success')
stop_button = widgets.Button(description="Stop recording", button_style='danger', disabled=True)
# Status line updated by the click handlers.
status_label = widgets.HTML(value="<b>Ready to record</b>")
# Output area where the resulting audio player is displayed.
audio_output = widgets.Output()

def on_start_click(_):
    """Handle a click on the Start button.

    Removes leftover files from ``../tmp``, switches the buttons and status
    label to the recording state, clears the previous audio output and starts
    the recording. If starting the recording fails, the buttons and status are
    reset and the exception is re-raised.

    Args:
        _: The clicked button (unused).
    """
    # Empty the tmp folder. Only regular files are removed: os.remove() raises
    # on directories, which would abort the handler before recording starts.
    if os.path.exists("../tmp"):
        for leftover in glob.glob(os.path.join("../tmp", "*")):
            if os.path.isfile(leftover):
                os.remove(leftover)

    start_button.disabled = True
    stop_button.disabled = False
    status_label.value = "<b style='color: red;'>🔴 Recording...</b>"

    # Clear previous audio
    with audio_output:
        clear_output()

    try:
        start_recording()
    except Exception:
        # Do not leave the UI claiming that a recording is in progress.
        start_button.disabled = False
        stop_button.disabled = True
        status_label.value = "<b style='color: red;'>❌ Recording error</b>"
        raise

def on_stop_click(_):
    """Handle a click on the Stop button.

    Resets the buttons, shows a "processing" status, then calls
    ``stop_recording(RECORDING_TYPE)``. When a path is returned, an audio
    player for it is displayed and the status is set to finished; otherwise
    the status is set to error. If ``stop_recording`` raises, the status is
    set to error and the exception is re-raised, so the label never stays on
    "Processing...".

    Args:
        _: The clicked button (unused).
    """
    start_button.disabled = False
    stop_button.disabled = True
    status_label.value = "<b style='color: orange;'>⏳ Processing...</b>"

    try:
        audio_path = stop_recording(RECORDING_TYPE)
    except Exception:
        status_label.value = "<b style='color: red;'>❌ Recording error</b>"
        raise

    if not audio_path:
        status_label.value = "<b style='color: red;'>❌ Recording error</b>"
        return

    with audio_output:
        clear_output(wait=True)
        display(Audio(filename=audio_path, autoplay=False))
    status_label.value = "<b style='color: green;'>✅ Recording finished</b>"

# Wire each button to its click handler.
start_button.on_click(on_start_click)
stop_button.on_click(on_stop_click)

# Display interface
def show_ui():
    """Display the recorder UI.

    Layout, top to bottom: a row with the start and stop buttons, the status
    label, then the audio output area.
    """
    button_row = widgets.HBox([start_button, stop_button])
    layout = widgets.VBox([button_row, status_label, audio_output])
    display(layout)

show_ui()

VBox(children=(HBox(children=(Button(button_style='success', description='Start recording', style=ButtonStyle(…