In [34]:
from fastapi import FastAPI, File, UploadFile, Form
from fastapi.responses import HTMLResponse
import uvicorn
import torch
from silero_vad import load_silero_vad, read_audio, get_speech_timestamps
import whisper
import tempfile
import os

app = FastAPI()

# Load models once
vad_model = load_silero_vad()
whisper_model = whisper.load_model("base")

HTML_FORM = """
<!DOCTYPE html>
<html>
<head>
    <title>Audio Transcription</title>
</head>
<body>
    <h2>Upload Audio for Transcription</h2>
    <form action="/transcribe" enctype="multipart/form-data" method="post">
        <input type="file" name="audiofile" accept="audio/*" required>
        <br><br>
        <button type="submit">Transcribe</button>
    </form>
    {result_section}
</body>
</html>
"""

def transcribe_audio(file_path: str) -> str:
    # Read audio for VAD
    wav = read_audio(file_path)

    # Get speech timestamps from Silero VAD
    speech_timestamps = get_speech_timestamps(wav, vad_model, return_seconds=True)

    if not speech_timestamps:
        return "No speech detected in audio."

    # For simplicity, transcribe whole file with Whisper (can improve by chunking by timestamps)
    result = whisper_model.transcribe(file_path)
    return result.get("text", "")

@app.get("/", response_class=HTMLResponse)
async def main():
    return HTML_FORM.format(result_section="")

@app.post("/transcribe", response_class=HTMLResponse)
async def transcribe(audiofile: UploadFile = File(...)):
    # Save uploaded file temporarily
    try:
        suffix = os.path.splitext(audiofile.filename)[1]
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
            tmp.write(await audiofile.read())
            tmp_path = tmp.name

        transcription = transcribe_audio(tmp_path)

    finally:
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

    result_html = f"<h3>Transcription Result:</h3><p>{transcription}</p>"
    return HTML_FORM.format(result_section=result_html)

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)


RuntimeError: asyncio.run() cannot be called from a running event loop

In [21]:
import torch
import torchaudio
print(torch.__version__)
print(torchaudio.__version__)


2.5.1
2.5.1


In [23]:
import torchaudio
print(torchaudio.list_audio_backends())  # Should now list ['sox_io', 'soundfile'] or at least one


[]


In [25]:
import torchaudio
torchaudio.set_audio_backend('sox_io')
print(torchaudio.get_audio_backend())


None


  torchaudio.set_audio_backend('sox_io')
  print(torchaudio.get_audio_backend())
