diff --git a/RELEASES.md b/RELEASES.md index 27c1ca86..d3490cb4 100644 --- a/RELEASES.md +++ b/RELEASES.md @@ -4,6 +4,10 @@ first release version. +## v0.1.8 + +* add speaker and listener with audio rate conversion + ## v0.1.7 * update speaker and listener with pyaudio device_index argument diff --git a/ghostos/framework/audio/pyaudio_io/__init__.py b/ghostos/framework/audio/pyaudio_io/__init__.py index d8f1ead1..32cd9551 100644 --- a/ghostos/framework/audio/pyaudio_io/__init__.py +++ b/ghostos/framework/audio/pyaudio_io/__init__.py @@ -4,6 +4,7 @@ def get_pyaudio_pcm16_listener( rate: int = 24000, + output_rate: int = 24000, interval: float = 0.5, channels: int = 1, chunk_size: int = 1024, @@ -15,7 +16,8 @@ def get_pyaudio_pcm16_listener( raise ImportError(f"pyaudio package is required. run `pip install ghostos[audio]`") from ghostos.framework.audio.pyaudio_io.listener import PyAudioPCM16Listener return PyAudioPCM16Listener( - rate=rate, + sample_rate=rate, + output_rate=output_rate, interval=interval, channels=channels, chunk_size=chunk_size, @@ -24,7 +26,8 @@ def get_pyaudio_pcm16_listener( def get_pyaudio_pcm16_speaker( - rate: int = 24000, + input_rate: int = 24000, + output_rate: int = 24000, buffer_size: int = 1024 * 5, channels: int = 1, output_device_index: Union[int, None] = None, @@ -35,7 +38,8 @@ def get_pyaudio_pcm16_speaker( raise ImportError(f"pyaudio package is required. 
run `pip install ghostos[audio]`") from ghostos.framework.audio.pyaudio_io.speaker import PyAudioPCM16Speaker return PyAudioPCM16Speaker( - rate=rate, + input_rate=input_rate, + output_rate=output_rate, buffer_size=buffer_size, channels=channels, output_device_index=output_device_index, diff --git a/ghostos/framework/audio/pyaudio_io/example.py b/ghostos/framework/audio/pyaudio_io/example.py index 39d9fe8a..df9eb2fa 100644 --- a/ghostos/framework/audio/pyaudio_io/example.py +++ b/ghostos/framework/audio/pyaudio_io/example.py @@ -9,7 +9,10 @@ if __name__ == '__main__': - listener = PyAudioPCM16Listener() + listener = PyAudioPCM16Listener( + sample_rate=44100, + output_rate=24000, + ) ticker = Timeleft(0) heard = BytesIO() @@ -46,7 +49,7 @@ def read() -> Union[bytes, None]: return heard.read(1024) - speaker = PyAudioPCM16Speaker() + speaker = PyAudioPCM16Speaker(input_rate=24000, output_rate=44100) print("start speaking, %f" % ticker.passed()) with speaker.speak(read) as speaking: speaking.wait() diff --git a/ghostos/framework/audio/pyaudio_io/listener.py b/ghostos/framework/audio/pyaudio_io/listener.py index c69907c5..63b48308 100644 --- a/ghostos/framework/audio/pyaudio_io/listener.py +++ b/ghostos/framework/audio/pyaudio_io/listener.py @@ -1,8 +1,10 @@ try: from pyaudio import PyAudio, paInt16 + from scipy.signal import resample except ImportError: raise ImportError(f"Pyaudio is required, please install pyaudio or ghostos[audio] first") +import numpy as np from typing import Callable, Optional from ghostos.abcd.realtime import Listener, Listening from threading import Thread, Event @@ -18,25 +20,28 @@ class PyAudioPCM16Listener(Listener): def __init__( self, - rate: int = 24000, + sample_rate: int = 24000, + output_rate: int = 24000, chunk_size: int = CHUNK, interval: float = 0.5, channels: int = CHANNELS, input_device_index: Optional[int] = None, ): - self.rate = rate + self.sample_rate = sample_rate + self.output_rate = output_rate self.chunk_size = chunk_size 
self.stream = PyAudio().open( format=paInt16, channels=channels, - rate=self.rate, + rate=self.sample_rate, input=True, input_device_index=input_device_index, ) self.interval = interval def listen(self, sender: Callable[[bytes], None]) -> Listening: - return PyAudioPCM16Listening(self.stream, sender, self.rate, self.chunk_size, self.interval) + return PyAudioPCM16Listening(self.stream, sender, self.sample_rate, self.output_rate, self.chunk_size, + self.interval) def __del__(self): self.stream.close() @@ -48,14 +53,16 @@ def __init__( self, stream, sender: Callable[[bytes], None], - rate: int = 24000, + sample_rate: int = 24000, + output_rate: int = 24000, chunk: int = CHUNK, interval: float = 0.5, ): self.sender = sender self.stream = stream self.interval = interval - self.rate = rate + self.sample_rate = sample_rate + self.output_rate = output_rate self.chunk = chunk self.stopped = Event() self.thread = Thread(target=self._listening) @@ -64,12 +71,26 @@ def _listening(self): self.stream.start_stream() while not self.stopped.is_set(): buffer = BytesIO() - for i in range(int((self.rate / self.chunk) * self.interval)): + for i in range(int((self.sample_rate / self.chunk) * self.interval)): data = self.stream.read(self.chunk, exception_on_overflow=False) buffer.write(data) - self.sender(buffer.getvalue()) + parsed = self._parse_output_data(buffer.getvalue()) + self.sender(parsed) self.stream.stop_stream() + def _parse_output_data(self, data: bytes) -> bytes: + """Resample captured PCM16 audio from sample_rate to output_rate and return it as raw bytes.""" + if not data or self.sample_rate == self.output_rate: + return data + audio_data = np.frombuffer(data, dtype=np.int16) + num_samples = int(len(audio_data) * self.output_rate / self.sample_rate) + + # Resample to the target output rate (scipy returns a float array) + resampled_audio = resample(audio_data, num_samples) + + # Clip to the int16 range so overshoot cannot wrap, then serialize to raw PCM16 bytes + return np.clip(resampled_audio, -32768, 32767).astype(np.int16).tobytes() + def __enter__(self): self.thread.start() diff --git a/ghostos/framework/audio/pyaudio_io/speaker.py b/ghostos/framework/audio/pyaudio_io/speaker.py index 9e937dfd..6d1de924 100644 --- 
a/ghostos/framework/audio/pyaudio_io/speaker.py +++ b/ghostos/framework/audio/pyaudio_io/speaker.py @@ -1,8 +1,10 @@ try: from pyaudio import PyAudio, paInt16 + from scipy.signal import resample except ImportError: raise ImportError(f"Pyaudio is required, please install pyaudio or ghostos[audio] first") +import numpy as np from typing import Callable, Union from ghostos.abcd.realtime import Speaker, Speaking from threading import Thread, Event @@ -12,23 +14,25 @@ class PyAudioPCM16Speaker(Speaker): def __init__( self, - rate: int = 24000, + input_rate: int = 24000, + output_rate: int = 24000, buffer_size: int = 4096, channels: int = 1, output_device_index: Union[int, None] = None, ): - self.rate = rate + self.input_rate = input_rate + self.output_rate = output_rate self.buffer_size = buffer_size self.stream = PyAudio().open( format=paInt16, channels=channels, - rate=self.rate, + rate=self.output_rate, output=True, output_device_index=output_device_index, ) def speak(self, queue: Callable[[], Union[bytes, None]]) -> Speaking: - return PyAudioPCM16Speaking(self.stream, queue, self.rate, self.buffer_size) + return PyAudioPCM16Speaking(self.stream, queue, self.input_rate, self.output_rate, self.buffer_size) def __del__(self): self.stream.close() @@ -36,9 +40,17 @@ def __del__(self): class PyAudioPCM16Speaking(Speaking): - def __init__(self, stream, queue: Callable[[], Union[bytes, None]], rate: int = 24000, buffer_size: int = 0): + def __init__( + self, + stream, + queue: Callable[[], Union[bytes, None]], + input_rate: int = 24000, + output_rate: int = 24000, + buffer_size: int = 0, + ): self.stream = stream - self.rate = rate + self.input_rate = input_rate + self.output_rate = output_rate self.buffer_size = buffer_size self.queue = queue self.stop = Event() @@ -52,9 +64,23 @@ def _speaking(self): data = self.queue() if not data: break - self.stream.write(data) + parsed = self._parse_output_data(data) + self.stream.write(parsed) self._done = True + def 
_parse_output_data(self, data: bytes) -> bytes: + """Resample queued PCM16 audio from input_rate to output_rate and return it as raw bytes.""" + if not data or self.input_rate == self.output_rate: + return data + audio_data = np.frombuffer(data, dtype=np.int16) + num_samples = int(len(audio_data) * self.output_rate / self.input_rate) + + # Resample to the device output rate (scipy returns a float array) + resampled_audio = resample(audio_data, num_samples) + + # Clip to the int16 range so overshoot cannot wrap, then serialize to raw PCM16 bytes + return np.clip(resampled_audio, -32768, 32767).astype(np.int16).tobytes() + def __enter__(self): self.thread.start() return self diff --git a/ghostos/framework/openai_realtime/client.py b/ghostos/framework/openai_realtime/client.py index 4e5b4f78..c99a3c6d 100644 --- a/ghostos/framework/openai_realtime/client.py +++ b/ghostos/framework/openai_realtime/client.py @@ -161,6 +161,7 @@ def save_audio_data(self, item_id: str, audio_data: bytes) -> None: with wave.open(buffer, 'wb') as f: f.setnchannels(1) f.setsampwidth(2) + # todo: save rate by configs f.setframerate(24000) f.writeframes(audio_data) diff --git a/ghostos/prototypes/streamlitapp/pages/chat_with_ghost.py b/ghostos/prototypes/streamlitapp/pages/chat_with_ghost.py index ce51a6a4..a2ebed10 100644 --- a/ghostos/prototypes/streamlitapp/pages/chat_with_ghost.py +++ b/ghostos/prototypes/streamlitapp/pages/chat_with_ghost.py @@ -259,13 +259,15 @@ def get_realtime_app(conversation: Conversation) -> Optional[RealtimeApp]: audio_input = app_conf.audio_input audio_output = app_conf.audio_output speaker = get_pyaudio_pcm16_speaker( - rate=audio_output.sample_rate, + input_rate=audio_output.input_rate, + output_rate=audio_output.output_rate, buffer_size=audio_output.buffer_size, channels=audio_output.channels, output_device_index=audio_output.output_device_index, ) listener = get_pyaudio_pcm16_listener( rate=audio_input.sample_rate, + output_rate=audio_input.output_rate, interval=audio_input.interval, channels=audio_input.channels, chunk_size=audio_input.chunk_size, diff --git a/ghostos/prototypes/streamlitapp/resources.py b/ghostos/prototypes/streamlitapp/resources.py index d935eb5d..c0849413 100644 --- 
a/ghostos/prototypes/streamlitapp/resources.py +++ b/ghostos/prototypes/streamlitapp/resources.py @@ -20,6 +20,7 @@ def get_container() -> Container: class AudioInputConf(BaseModel): sample_rate: int = Field(24000) + output_rate: int = Field(24000) interval: float = Field(0.5) channels: int = Field(1) chunk_size: int = Field(1024) @@ -27,7 +28,8 @@ class AudioInputConf(BaseModel): class AudioOutputConf(BaseModel): - sample_rate: int = Field(24000) + input_rate: int = Field(24000) + output_rate: int = Field(24000) channels: int = Field(1) buffer_size: int = Field(1024 * 5) output_device_index: Union[int, None] = Field(None) diff --git a/ghostos/prototypes/streamlitapp/widgets/messages.py b/ghostos/prototypes/streamlitapp/widgets/messages.py index 47fb2017..9b34a9f8 100644 --- a/ghostos/prototypes/streamlitapp/widgets/messages.py +++ b/ghostos/prototypes/streamlitapp/widgets/messages.py @@ -188,8 +188,6 @@ def render_message_item(msg: Message, debug: bool): render_user_message(msg, debug) elif msg.role == Role.SYSTEM.value: render_sys_message(msg, debug) - elif msg.role == Role.FUNCTION.value: - render_func_message(msg, debug) else: render_other_message(msg, debug) diff --git a/pyproject.toml b/pyproject.toml index 8bcca594..0951ed21 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -44,13 +44,14 @@ spherov2 = { version = "^0.12.1", optional = true } bleak = [ { version = "^0.22.3", python = ">=3.10,<3.14", optional = true } ] +scipy = { version = "^1.15.1", optional = true } [tool.poetry.scripts] ghostos = "ghostos.scripts.cli:main" [tool.poetry.extras] -realtime = ['pyaudio'] -sphero = ["spherov2", "bleak", "pyaudio"] +realtime = ['pyaudio', "scipy"] +sphero = ["spherov2", "bleak"] [tool.poetry.group.dev.dependencies]