# Speech-to-Text Application feat. Google Cloud Speech-to-Text API

In [15]:
import wave
import sys

import pyaudio

# Recording parameters for the WAV capture below.
CHUNK = 1024  # frames requested per stream.read() call
FORMAT = pyaudio.paInt16  # 16-bit signed integer samples
CHANNELS = 1 if sys.platform == 'darwin' else 2  # mono on macOS, stereo elsewhere
RATE = 44100  # sample rate in Hz
RECORD_SECONDS = 5

# Record roughly RECORD_SECONDS of audio from the input device into output.wav.
# The nested try/finally blocks make sure the stream and the PyAudio instance
# are released even if opening the stream or reading from it raises.
with wave.open('output.wav', 'wb') as wf:
    p = pyaudio.PyAudio()
    try:
        wf.setnchannels(CHANNELS)
        wf.setsampwidth(p.get_sample_size(FORMAT))
        wf.setframerate(RATE)

        stream = p.open(format=FORMAT, channels=CHANNELS, rate=RATE, input=True)
        try:
            print('Recording...')
            # Only whole chunks are read (RATE // CHUNK per second), so the
            # capture is marginally shorter than RECORD_SECONDS.
            for _ in range(0, RATE // CHUNK * RECORD_SECONDS):
                # `data` is kept as a top-level name: a later cell inspects it.
                data = stream.read(CHUNK)
                wf.writeframes(data)
            print('Done')
        finally:
            stream.close()
    finally:
        p.terminate()

Recording...
Done


In [16]:
# Byte length of the last chunk read by the recording cell above:
# CHUNK frames x channels x bytes per sample (e.g. 1024 x 2 x 2 = 4096).
len(data)

4096

In [26]:
from google.cloud import speech
import os
import pyaudio
import io

# Set the Google Cloud credentials environment variable. The value is a
# relative path, so the key file is expected in the working directory.
# NOTE(review): presumably this has to be set before the client below is
# constructed - keep the two statements in this order.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "gcpai.json"

# Speech-to-Text client used by record_and_recognize_audio().
client = speech.SpeechClient()

# Audio recording parameters. These rebind the RATE and CHUNK names that the
# earlier WAV-recording cell also defines.
RATE = 16000
CHUNK = int(RATE / 10)  # frames per read: 100 ms of audio at RATE

def record_and_recognize_audio(duration_seconds=25):
    """Record from the microphone and print the Speech-to-Text transcript.

    Opens a mono, 16-bit input stream at ``RATE`` Hz, captures
    ``duration_seconds`` of audio in ``CHUNK``-frame reads, sends the raw PCM
    bytes to the module-level ``client`` for synchronous recognition with
    language code ``zh-TW``, and prints the first alternative of each result.

    Args:
        duration_seconds: Length of the capture in seconds. Defaults to 25.
            NOTE(review): synchronous ``recognize`` is documented by Google as
            limited to short audio (about one minute) - confirm the current
            limit before raising this value.

    Returns:
        None. Transcripts are printed, not returned.
    """
    p = pyaudio.PyAudio()
    try:
        stream = p.open(format=pyaudio.paInt16,
                        channels=1,
                        rate=RATE,
                        input=True,
                        frames_per_buffer=CHUNK)
        try:
            print("请开始说话...")

            # Capture the clip as a list of CHUNK-frame byte strings.
            frames = []
            for _ in range(0, int(RATE / CHUNK * duration_seconds)):
                frames.append(stream.read(CHUNK))

            print("录音结束，正在识别...")

            stream.stop_stream()
        finally:
            # Release the audio device even if a read failed mid-recording.
            stream.close()
    finally:
        p.terminate()

    # Wrap the raw PCM bytes as the recognition request payload.
    audio = speech.RecognitionAudio(content=b''.join(frames))
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=RATE,
        language_code="zh-TW"  # the speech is assumed to be Chinese
    )

    # Synchronous recognition of the recorded clip.
    response = client.recognize(config=config, audio=audio)

    for result in response.results:
        # Skip a result that carries no alternatives instead of raising
        # IndexError on alternatives[0].
        if not result.alternatives:
            continue
        print("识别结果：{}".format(result.alternatives[0].transcript))

# Run one record-and-recognize pass when this cell executes; this opens the
# audio input device and calls the Speech-to-Text client.
record_and_recognize_audio()

请开始说话...
录音结束，正在识别...
识别结果：開始一想二里共三夫子不識四書五經都已經感覺七八九十分大概就是9瓶做的吧長7600分5號三心二意一等下就讀不下午就小書僮可笑可笑起來都不將就是你將軍提防提防車車禍


In [None]:
from google.cloud import speech
import os
import pyaudio
import io
from pydub import AudioSegment
import threading

# Point pydub at a specific ffmpeg build, but only when that binary is really
# present. On any other machine the hard-coded Windows path would replace
# pydub's own converter setting with a file that does not exist and break
# MP3 export, so in that case the existing setting is left alone.
FFMPEG_PATH = r"C:\ffmpeg-6.1.1-full_build\bin\ffmpeg.exe"
if os.path.isfile(FFMPEG_PATH):
    AudioSegment.converter = FFMPEG_PATH

# Set the Google Cloud credentials environment variable.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "gcpai.json"

# Audio recording parameters.
RATE = 16000  # sample rate in Hz
CHUNK = int(RATE / 10)  # frames per read: 100 ms of audio at RATE
FORMAT = pyaudio.paInt16  # 16-bit signed integer samples
CHANNELS = 1  # mono

# Global flag that controls recording: record_audio() keeps reading while it
# is True, and main() sets it to False to stop the capture.
is_recording = True

def record_audio(queue):
    """Capture microphone audio until ``is_recording`` becomes False.

    Reads ``CHUNK``-frame blocks from an input stream opened with ``FORMAT``,
    ``CHANNELS`` and ``RATE`` for as long as the module-level ``is_recording``
    flag is True, then appends the concatenated PCM bytes to ``queue`` as a
    single ``bytes`` object.

    Args:
        queue: A list (or any object with ``append``) that receives the
            recorded bytes. Exactly one item is appended per call.

    The audio device is released and the captured bytes are published even
    when opening or reading the stream raises. Without that, save_audio(),
    which waits for ``queue`` to become non-empty, would never return. The
    original exception still propagates afterwards.
    """
    frames = []
    try:
        p = pyaudio.PyAudio()
        try:
            stream = p.open(format=FORMAT,
                            channels=CHANNELS,
                            rate=RATE,
                            input=True,
                            frames_per_buffer=CHUNK)
            try:
                print("请开始说话...（按's'键然后回车停止录音）")

                # Keep reading until another thread clears the flag.
                while is_recording:
                    frames.append(stream.read(CHUNK))

                stream.stop_stream()
            finally:
                stream.close()
        finally:
            p.terminate()
    finally:
        # Hand over whatever was captured (possibly empty bytes).
        queue.append(b''.join(frames))

def save_audio(queue):
    """Wait for the recording to finish, then export it as an MP3 file.

    Blocks until the module-level ``is_recording`` flag is False and ``queue``
    holds at least one item, wraps ``queue[0]`` (raw PCM bytes) in a pydub
    ``AudioSegment`` described by ``FORMAT``, ``RATE`` and ``CHANNELS``, and
    writes it to ``recorded_audio2.mp3`` in the working directory.

    Args:
        queue: List whose first element is the recorded PCM data as bytes.
    """
    # Local import keeps this function self-contained; the cell's import
    # block does not bring in `time`.
    import time

    # Wait for the recording to end. Sleeping between polls avoids spinning a
    # CPU core and competing with the recorder thread for the interpreter.
    while is_recording or not queue:
        time.sleep(0.05)

    # Convert the raw bytes to an AudioSegment. The module-level
    # pyaudio.get_sample_size() gives the sample width without creating a
    # PyAudio instance that would then never be terminated.
    audio_segment = AudioSegment(
        data=queue[0],
        sample_width=pyaudio.get_sample_size(FORMAT),
        frame_rate=RATE,
        channels=CHANNELS
    )

    # Save as MP3.
    audio_segment.export("recorded_audio2.mp3", format="mp3")
    print("录音已保存为MP3")

def main():
    """Record until the user enters 's', then save the recording as MP3.

    Starts record_audio() on a background thread, waits on standard input
    until the user submits ``s`` (case-insensitive, surrounding whitespace
    ignored), stops and joins the recorder, and then calls save_audio() to
    write the MP3.

    Any other input is ignored and the prompt keeps waiting. Previously such
    input left the recorder running and the save step waiting forever. A
    closed standard input (EOF) is treated as a stop request.

    Note: the final message announces recognition, but this function does not
    perform any speech recognition itself.
    """
    global is_recording
    # Reset the flag so the cell can be re-run after an earlier run cleared it.
    is_recording = True
    audio_queue = []

    # Thread that performs the recording.
    record_thread = threading.Thread(target=record_audio, args=(audio_queue,))
    record_thread.start()

    # Wait for the user's stop command.
    try:
        while input().strip().lower() != 's':
            pass
    except EOFError:
        # No more input can arrive; stop instead of recording forever.
        pass
    finally:
        # Always stop and join the recorder, including on KeyboardInterrupt.
        is_recording = False
        record_thread.join()

    # The recorder has finished, so the save step can run directly; a thread
    # that is started and joined immediately adds nothing.
    save_audio(audio_queue)

    print("录音结束，正在识别...")

# Entry point: start the interactive recording session when this code runs as
# the main module (not when it is imported).
if __name__ == "__main__":
    main()