# Agent 1 - Whisper + Llama3 (Ollama)

In [79]:
# imports 
import io
import json
import whisper
import requests
import webrtcvad
import numpy as np
import sounddevice as sd
from IPython.display import Audio
import scipy.io.wavfile as wavfile

## Audio

In [24]:
# audio configuration
# These values are the defaults of the recording/playback helpers defined below.
_DURATION = 5  # recording length in seconds (take_audio records duration * sample-rate frames)
_SAMPLE_RATE = 16000 # desired sample rate for models such as Whisper
_CHANNELS = 1  # number of input channels requested from sd.rec

### Getting audio from the microphone

In [19]:
def take_audio(duration:int=_DURATION, samprate:int=_SAMPLE_RATE, chns:int=_CHANNELS) -> np.ndarray:
    """Record a fixed-length clip from the microphone and return it.

    Parameters
    ----------
    duration : int
        Recording length in seconds.
    samprate : int
        Sampling rate in Hz, forwarded to ``sd.rec``.
    chns : int
        Number of input channels, forwarded to ``sd.rec``.

    Returns
    -------
    np.ndarray
        The array returned by ``sd.rec`` (requested with ``dtype='int16'``),
        covering ``int(duration * samprate)`` frames.
    """
    n_frames = int(duration * samprate)
    print(f'Recording the microphone audio for {duration} seconds')
    audio = sd.rec(n_frames, samplerate=samprate, channels=chns, dtype='int16')
    # block here so the returned array is completely filled before it is used
    sd.wait()
    print('Recording finished')
    return audio

In [31]:
def play_audio(_audio:np.ndarray, samprate:int=_SAMPLE_RATE) -> None:
    """Play an audio array through the output device and wait until it ends.

    Parameters
    ----------
    _audio : np.ndarray
        Samples to play (for example the array returned by ``take_audio``).
    samprate : int
        Sampling rate in Hz used for playback.
    """
    print('Playing the audio')
    sd.play(_audio, samprate)
    # block until playback is over so the final message is printed at the right time
    sd.wait()
    print('Audio finished')

**Note:** the function below does not work reliably yet — the voice detector reports speech almost all the time.

In [88]:
def automatic_audio_cap(samprate:int=_SAMPLE_RATE, aggress:int=3, silence_timeout:float=1.0, frame_duration_ms:int=30, min_speech_frames:int=5, max_duration:float=30.0) -> np.ndarray:
    """Record from the microphone, starting on speech and stopping on silence.

    Frames of ``frame_duration_ms`` are read from the input stream and classified
    by WebRTC VAD. Recording starts only after ``min_speech_frames`` consecutive
    voiced frames, so an isolated click or noise burst does not trigger a capture.
    It stops after ``silence_timeout`` seconds of consecutive non-speech, or when
    ``max_duration`` seconds have been recorded.

    Parameters
    ----------
    samprate : int
        Sampling rate in Hz. WebRTC VAD accepts 8000, 16000, 32000 or 48000.
    aggress : int
        VAD aggressiveness passed to ``Vad.set_mode``.
    silence_timeout : float
        Seconds of consecutive silence that end the recording.
    frame_duration_ms : int
        Frame length in milliseconds. WebRTC VAD accepts 10, 20 or 30.
    min_speech_frames : int
        Number of consecutive voiced frames required before recording starts.
        The frames that triggered the start are kept in the result.
    max_duration : float
        Upper bound, in seconds, on the recorded audio once recording has
        started. ``None`` disables the limit.

    Returns
    -------
    np.ndarray
        float32 samples scaled by the int16 maximum, with size-1 axes removed.

    Raises
    ------
    ValueError
        If ``samprate``, ``frame_duration_ms`` or ``min_speech_frames`` is invalid.
    """
    if frame_duration_ms not in (10, 20, 30):
        raise ValueError(f'frame_duration_ms must be 10, 20 or 30, got {frame_duration_ms}')
    if samprate not in (8000, 16000, 32000, 48000):
        raise ValueError(f'samprate must be 8000, 16000, 32000 or 48000, got {samprate}')
    if min_speech_frames < 1:
        raise ValueError(f'min_speech_frames must be >= 1, got {min_speech_frames}')

    vad = webrtcvad.Vad()
    vad.set_mode(aggress)

    frame_size = int(samprate * frame_duration_ms / 1000)
    # at least one silent frame is always required before stopping
    max_silent_chunks = max(1, int(silence_timeout * 1000 / frame_duration_ms))
    max_chunks = None
    if max_duration is not None:
        max_chunks = max(1, int(max_duration * 1000 / frame_duration_ms))

    _pending = []  # consecutive voiced frames seen while still waiting for speech
    _buffer = []   # frames that belong to the recording
    silent_chunks = 0
    _recording = False

    with sd.InputStream(samplerate=samprate, channels=1, dtype='int16') as stream:
        while True:
            audio_chunk, _ = stream.read(frame_size)
            is_speech = vad.is_speech(audio_chunk.tobytes(), samprate)

            if not _recording:
                if not is_speech:
                    # the voiced run was interrupted: treat it as noise and start over
                    _pending.clear()
                    continue
                _pending.append(audio_chunk)
                if len(_pending) >= min_speech_frames:
                    print('Voice detected. Starting recording')
                    _recording = True
                    _buffer.extend(_pending)
                    _pending.clear()
                continue

            _buffer.append(audio_chunk)
            silent_chunks = 0 if is_speech else silent_chunks + 1
            if silent_chunks >= max_silent_chunks:
                print('Stopping recording ...')
                break
            if max_chunks is not None and len(_buffer) >= max_chunks:
                print('Maximum duration reached. Stopping recording ...')
                break

    _audio = np.concatenate(_buffer, axis=0)
    _audio = _audio.astype(np.float32) / np.iinfo(np.int16).max
    _audio = np.squeeze(_audio)

    return _audio

#### Testing microphone

In [35]:
# Record a clip with the default settings, then play it back to check the microphone.
_audio_test = take_audio()
play_audio(_audio_test)

Recording the microphone audio for 5 seconds
Recording finished
Playing the audio
Audio finished


### Store audio in memory

In [23]:
def audio_buffer(_audio:np.ndarray, samprate:int=_SAMPLE_RATE) -> io.BytesIO:
    """Encode audio samples as WAV into an in-memory buffer.

    Parameters
    ----------
    _audio : np.ndarray
        Samples to encode (for example the array returned by ``take_audio``).
    samprate : int
        Sampling rate in Hz written to the WAV data.

    Returns
    -------
    io.BytesIO
        Buffer holding the WAV data, positioned at the start so it can be
        read immediately.
    """
    wav_bytes = io.BytesIO()
    wavfile.write(wav_bytes, samprate, _audio)
    # rewind: after writing, the position is at the end of the data
    wav_bytes.seek(0)
    return wav_bytes

#### Testing audio and buffer

In [43]:
# Encode the test recording as in-memory WAV and hand the bytes to IPython's Audio display.
_buf = audio_buffer(_audio_test)
_buf.seek(0)  # audio_buffer already rewinds the buffer; this second rewind is redundant but harmless
Audio(_buf.read(), rate=_SAMPLE_RATE)

## Models

In [62]:
def transcription_audio(audio, model) -> str:
    """Transcribe ``audio`` with ``model`` and return the recognised text.

    Parameters
    ----------
    audio
        Audio input forwarded unchanged to ``model.transcribe``.
    model
        Object exposing ``transcribe(audio, fp16=...)`` that returns a mapping
        with a ``'text'`` entry (for example the Whisper model loaded in this notebook).

    Returns
    -------
    str
        The ``'text'`` entry of the transcription result.
    """
    print('Getting the audio text ...')
    return model.transcribe(audio, fp16=False)['text']

#### [Whisper (OpenAI)](https://github.com/openai/whisper/blob/main/README.md)

Used to get the audio transcription.

In [44]:
_model_size = 'base'  # name of the Whisper model passed to whisper.load_model

In [45]:
# Load the Whisper model once; the transcription cells below reuse this object.
_model = whisper.load_model(_model_size)

In [54]:
# whisper expects float32
def _audio_to_model(_audio):
    _model_audio_in = _audio.astype(np.float32) / np.iinfo(_audio.dtype).max
    return np.squeeze(_model_audio_in)

In [56]:
# Convert the int16 test recording to the float32 form used for transcription.
_model_audio = _audio_to_model(_audio_test)

In [85]:
# Capture speech with the VAD-based recorder and transcribe it directly:
# automatic_audio_cap already returns squeezed float32 samples, so no
# _audio_to_model conversion is applied here.
lol = automatic_audio_cap()
res = _model.transcribe(lol, fp16=False)
print(res['text'])

Voice detected. Starting recording
Stopping recording ...



In [58]:
# transcribe the fixed-length test recording (fp16=False, same as transcription_audio)
_result = _model.transcribe(_model_audio, fp16=False)

In [60]:
# show the text recognised from the test recording
print(_result['text'])

 Testando o microfone, um, dois, três, ABC.


#### Ollama

In [66]:
_OLLAMA_MODEL = 'llama3'  # default model name sent in the request payload by question_ollama
_OLLAMA_URL = 'http://localhost:11434/api/generate'  # endpoint on localhost that question_ollama POSTs to

In [99]:
def question_ollama(prompt, model=_OLLAMA_MODEL, temp:float=0.7) -> str:
    """Send ``prompt`` to Ollama, print the streamed answer, and return it.

    Parameters
    ----------
    prompt : str
        Text sent as the ``prompt`` field of the request.
    model : str
        Name of the Ollama model to query.
    temp : float
        Sampling temperature, sent inside the request's ``options`` object.

    Returns
    -------
    str
        The complete answer, i.e. every streamed ``response`` fragment joined.

    Raises
    ------
    requests.HTTPError
        If the server replies with an HTTP error status.
    RuntimeError
        If a streamed line carries an ``error`` field.
    """
    print(f'Sending message to Ollama model {model} ...')
    _payload = {
        'model' : model,
        'prompt' : prompt,
        'stream' : True,
        # model parameters such as temperature are read from 'options',
        # not from the top level of the request body
        'options' : {'temperature' : temp},
    }

    _pieces = []
    # json= serialises the payload and sets the JSON Content-Type header;
    # the context manager releases the connection even if iteration stops early
    with requests.post(_OLLAMA_URL, json=_payload, stream=True) as response:
        response.raise_for_status()

        print(f'{model.upper()} answer:')
        for line in response.iter_lines():
            if not line:
                continue
            _data = json.loads(line.decode('utf-8'))
            if 'error' in _data:
                raise RuntimeError(f"Ollama error: {_data['error']}")
            _piece = _data.get('response', '')
            _pieces.append(_piece)
            print(_piece, end='', flush=True)
            if _data.get('done'):
                break
    # finish the line that the streamed fragments were printed on
    print()
    return ''.join(_pieces)

In [100]:
def llama3_conversation(model) -> None:
    """Run one voice round trip: record, transcribe, and ask the LLM.

    Parameters
    ----------
    model
        Transcription model handed to ``transcription_audio`` (the Whisper
        model in this notebook), not the Ollama model.
    """
    # get audio
    _audio = take_audio()
    # fix data
    _audio_fixed = _audio_to_model(_audio)
    # transcription
    _human_text = transcription_audio(_audio_fixed, model)
    if not _human_text.strip():
        # nothing was recognised, so there is no question to send
        print('No speech was transcribed; nothing to send.')
        return
    # send text; question_ollama prints the answer itself while it streams in,
    # so its return value is not printed again here
    question_ollama(_human_text)

In [101]:
# Run one voice -> text -> LLM round trip with the Whisper model loaded earlier.
llama3_conversation(_model)

Recording the microphone audio for 5 seconds
Recording finished
Getting the audio text ...
Sending message to Ollama model llama3 ...
LLAMA3 answer:
Uma pergunta interessante!

A quantidade de sangue no corpo humano varia dependendo do sexo e da idade. Em média, o volume total de sangue no corpo humano é cerca de 5 litros (5000 mililitros) para homens e aproximadamente 4,5 litros (4500 mililitros) para mulheres.

No entanto, é importante notar que essa quantidade pode variar ligeiramente dependendo do indivíduo. Por exemplo, pessoas mais magras ou com doenças crônicas podem ter volumes de sangue mais baixos, enquanto pessoas com obesidade ou doenças cardíacas podem ter volumes maiores.

Aqui está uma estimativa aproximada da distribuição do sangue no corpo humano:

* Coração: 0,2 litros (200 mililitros)
* Pulmões: 0,3 litros (300 mililitros)
* Braço esquerdo e direito: 0,5 litros (500 mililitros)
* Corpos cavernosos das artérias: 1,2 litros (1200 mililitros)
* Veias superficiais: 0,4 l