# STT (Speech-to-Text) Module Implementation


**Note:** This notebook runs the open-source Whisper model locally. The `ffmpeg` executable must be installed and available on the system `PATH` for audio processing.

In [1]:
# Install required packages
%pip install openai-whisper ffmpeg-python pydub numpy torch torchaudio

Note: you may need to restart the kernel to use updated packages.


In [2]:
# Import necessary libraries
import os
import tempfile
from typing import Optional, Tuple
import io
import numpy as np
import whisper
from pydub import AudioSegment
import ffmpeg

In [3]:
# Define SpeechTranscriber class
class SpeechTranscriber:
    """Speech-to-text helper built on a locally loaded Whisper model.

    Audio is supplied as raw bytes, written to a temporary file with a
    ``.wav`` suffix, and passed by path to ``self.model.transcribe``. The
    temporary file is always removed afterwards, including when the model
    call raises.
    """

    def __init__(self, model_size: str = "base"):
        """Initialize with local Whisper model.

        Args:
            model_size: Size of Whisper model ('tiny', 'base', 'small', 'medium', 'large')

        Raises:
            RuntimeError: If the model cannot be loaded.
        """
        try:
            self.model = whisper.load_model(model_size)
            print(f"Whisper model '{model_size}' loaded successfully.")
        except Exception as e:
            # Chain the original exception so the root cause stays in the traceback.
            raise RuntimeError(f"Failed to load Whisper model: {str(e)}") from e

    def _run_model(self, audio_bytes: bytes, language: Optional[str] = None) -> dict:
        """Write ``audio_bytes`` to a temp file, run the model on it, then delete the file.

        Args:
            audio_bytes: Encoded audio data to transcribe.
            language: Language code forwarded to the model, or None to leave
                the choice to the model.

        Returns:
            The result mapping returned by ``self.model.transcribe``.
        """
        temp_path = None
        try:
            # delete=False: the file is closed (and therefore flushed) when the
            # ``with`` block exits, before the model opens it by path.
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
                temp_path = temp_file.name
                temp_file.write(audio_bytes)

            return self.model.transcribe(
                temp_path,
                language=language,
                fp16=False,  # Disable fp16 for CPU compatibility
            )
        finally:
            # Runs on success and on failure, so a failed write or a failed
            # transcription no longer leaves the temporary file behind.
            if temp_path is not None:
                os.unlink(temp_path)

    def transcribe_audio(self, audio_bytes: bytes, language: Optional[str] = None) -> str:
        """Transcribe audio bytes to text using local Whisper model.

        Args:
            audio_bytes: Encoded audio data to transcribe.
            language: Optional language code; None lets Whisper detect it.

        Returns:
            The transcribed text with surrounding whitespace stripped.

        Raises:
            RuntimeError: If any step of the transcription fails.
        """
        try:
            result = self._run_model(audio_bytes, language=language)
            return result["text"].strip()
        except Exception as e:
            raise RuntimeError(f"Transcription failed: {str(e)}") from e

    def detect_language(self, audio_bytes: bytes) -> str:
        """Detect the language of the audio using local Whisper model.

        This runs a full transcription and reads the ``"language"`` entry of
        the result, so it costs as much as ``transcribe_audio``.

        Args:
            audio_bytes: Encoded audio data to analyse.

        Returns:
            The ``"language"`` value from the model result, or "unknown" when
            the result has no such key.

        Raises:
            RuntimeError: If any step of the detection fails.
        """
        try:
            result = self._run_model(audio_bytes)
            return result.get("language", "unknown")
        except Exception as e:
            raise RuntimeError(f"Language detection failed: {str(e)}") from e

    def transcribe_with_language(self, audio_bytes: bytes) -> Tuple[str, str]:
        """Transcribe audio and return both text and detected language.

        Args:
            audio_bytes: Encoded audio data to transcribe.

        Returns:
            A ``(text, language)`` tuple: the stripped transcription and the
            ``"language"`` value from the model result ("unknown" if absent).

        Raises:
            RuntimeError: If any step of the transcription fails.
        """
        try:
            result = self._run_model(audio_bytes)
            text = result["text"].strip()
            language = result.get("language", "unknown")
            return text, language
        except Exception as e:
            raise RuntimeError(f"Transcription with language detection failed: {str(e)}") from e

In [4]:
# Smoke test: build a SpeechTranscriber backed by the local Whisper model.
# Model size trade-off: 'tiny' loads faster, 'base' is more accurate.
try:
    transcriber = SpeechTranscriber(model_size="base")
    print("SpeechTranscriber initialized successfully with local Whisper model.")
except Exception as init_error:
    # Report the failure instead of raising so the cell itself still completes.
    print(f"Error: {init_error}")

Whisper model 'base' loaded successfully.
SpeechTranscriber initialized successfully with local Whisper model.


In [5]:
# Test transcription with sample audio file
# (os, io and AudioSegment come from the imports cell at the top of the notebook.)

# Load sample audio file
audio_file_path = "../audio_data/sample_speech.mp3"

if os.path.exists(audio_file_path):
    # Decode the MP3 file with pydub
    audio = AudioSegment.from_mp3(audio_file_path)

    # Re-encode as in-memory WAV bytes. No resampling or down-mixing happens
    # here: the export keeps the source sample rate and channel count.
    # Whisper's audio loader converts to 16 kHz mono itself when it reads the file.
    wav_buffer = io.BytesIO()
    audio.export(wav_buffer, format="wav")
    audio_bytes = wav_buffer.getvalue()

    print(f"Loaded audio file: {len(audio_bytes)} bytes")
    print(f"Audio duration: {len(audio)} ms")

    # Test transcription
    try:
        text = transcriber.transcribe_audio(audio_bytes)
        print(f"Transcription: {text}")
    except Exception as e:
        print(f"Transcription test failed: {e}")
else:
    print(f"Audio file not found: {audio_file_path}")
    print("SpeechTranscriber is ready for transcription. Provide WAV audio bytes to test.")

Loaded audio file: 442924 bytes
Audio duration: 10043 ms
Transcription: My thought I have nobody by a beauty and will as you poured. Mr. Rochester is sub in that so-don't find simplest, and devoted about, to let might in a-


In [6]:
# Test language detection with sample audio file
# (os, io and AudioSegment come from the imports cell at the top of the notebook.)

# Load sample audio file
audio_file_path = "../audio_data/sample_speech.mp3"

if os.path.exists(audio_file_path):
    # Decode the MP3 file with pydub
    audio = AudioSegment.from_mp3(audio_file_path)

    # Re-encode as in-memory WAV bytes. No resampling or down-mixing happens
    # here: the export keeps the source sample rate and channel count.
    # Whisper's audio loader converts to 16 kHz mono itself when it reads the file.
    wav_buffer = io.BytesIO()
    audio.export(wav_buffer, format="wav")
    audio_bytes = wav_buffer.getvalue()

    # Test language detection
    try:
        lang = transcriber.detect_language(audio_bytes)
        print(f"Detected language: {lang}")
    except Exception as e:
        print(f"Language detection test failed: {e}")
else:
    print(f"Audio file not found: {audio_file_path}")
    print("Please ensure the audio file exists for testing.")

Detected language: en


In [7]:
# Test combined transcription and language detection with sample audio file
# (os, io and AudioSegment come from the imports cell at the top of the notebook.)

# Load sample audio file
audio_file_path = "../audio_data/sample_speech.mp3"

if os.path.exists(audio_file_path):
    # Decode the MP3 file with pydub
    audio = AudioSegment.from_mp3(audio_file_path)

    # Re-encode as in-memory WAV bytes. No resampling or down-mixing happens
    # here: the export keeps the source sample rate and channel count.
    # Whisper's audio loader converts to 16 kHz mono itself when it reads the file.
    wav_buffer = io.BytesIO()
    audio.export(wav_buffer, format="wav")
    audio_bytes = wav_buffer.getvalue()

    # Test combined transcription and language detection
    try:
        text, lang = transcriber.transcribe_with_language(audio_bytes)
        print(f"Text: {text}")
        print(f"Language: {lang}")
    except Exception as e:
        print(f"Combined test failed: {e}")
else:
    print(f"Audio file not found: {audio_file_path}")
    print("Please ensure the audio file exists for testing.")

Text: My thought I have nobody by a beauty and will as you poured. Mr. Rochester is sub in that so-don't find simplest, and devoted about, to let might in a-
Language: en


In [8]:
# Test with the other sample file (sample.mp3)
# (os, io and AudioSegment come from the imports cell at the top of the notebook.)

# Load the other sample audio file
audio_file_path = "../audio_data/sample.mp3"

if os.path.exists(audio_file_path):
    # Decode the MP3 file with pydub
    audio = AudioSegment.from_mp3(audio_file_path)

    # Re-encode as in-memory WAV bytes. No resampling or down-mixing happens
    # here: the export keeps the source sample rate and channel count.
    # Whisper's audio loader converts to 16 kHz mono itself when it reads the file.
    wav_buffer = io.BytesIO()
    audio.export(wav_buffer, format="wav")
    audio_bytes = wav_buffer.getvalue()

    print(f"Loaded audio file: {len(audio_bytes)} bytes")
    print(f"Audio duration: {len(audio)} ms")

    # Test combined transcription and language detection
    try:
        text, lang = transcriber.transcribe_with_language(audio_bytes)
        print(f"Text: {text}")
        print(f"Language: {lang}")
    except Exception as e:
        print(f"Combined test failed: {e}")
else:
    print(f"Audio file not found: {audio_file_path}")

Loaded audio file: 563756 bytes
Audio duration: 3196 ms
Text: 
Language: en
