diff --git a/.env.example b/.env.example index 73366c5..c2cdd8e 100644 --- a/.env.example +++ b/.env.example @@ -4,9 +4,10 @@ OPENAI_API_KEY= HF_TOKEN= # Model preferences -LLM_BACKEND=ollama # Options: ollama, openai +LLM_BACKEND=ollama # Options: ollama, openai, groq WHISPER_MODEL=large-v3 WHISPER_BACKEND=local # Options: local, groq, openai +# Note: local uses faster-whisper, groq and openai use cloud APIs (requires API key) # Whisper model settings WHISPER_MODEL_SIZE=medium WHISPER_LANGUAGE=en diff --git a/CLOUD_INFERENCE_OPTIONS.md b/CLOUD_INFERENCE_OPTIONS.md index 901b295..2ecd10f 100644 --- a/CLOUD_INFERENCE_OPTIONS.md +++ b/CLOUD_INFERENCE_OPTIONS.md @@ -52,6 +52,50 @@ python test_api_keys.py --- +## 🟢 OpenAI Whisper API (PAY-PER-USE) + +**Best for:** High-quality transcription with official OpenAI support +**Cost:** Pay-per-use ($0.006 per minute of audio) +**Speed:** Fast cloud processing + +### Setup +1. Visit [https://platform.openai.com/api-keys](https://platform.openai.com/api-keys) +2. Create a new API key (requires payment method on file) +3. Add to your `.env` file: + ``` + OPENAI_API_KEY=your_openai_api_key_here + ``` + +### Supported Models +- **Whisper-1** - Official OpenAI Whisper model with excellent multilingual support + +### Configuration in UI +1. Navigate to **Step 2: Configure Session** +2. Expand **Advanced Backend Settings** accordion +3. 
Select backends: + - **Transcription:** `openai` + +### Pricing +- **$0.006 per minute** of audio +- Example: 4-hour D&D session = 240 minutes × $0.006 = **$1.44** +- Much cheaper than real-time transcription services +- No monthly minimums or subscription required + +### Features +- Verbose JSON response with segment and word-level timestamps +- Automatic language detection +- Excellent Dutch language support +- Built-in retry logic with exponential backoff +- Temporary file cleanup after processing + +### Testing +Run the API validation script: +```bash +python test_api_keys.py +``` + +--- + ## 🤗 HuggingFace Inference API (FREE TIER) **Best for:** Diarization (speaker identification) @@ -155,7 +199,7 @@ python test_api_keys.py | Task | Local Backend | Cloud Backend | Free Cloud Option | |------|--------------|---------------|-------------------| -| **Transcription** | Whisper (GPU/CPU) | Groq Whisper | ✅ Groq (unlimited) | +| **Transcription** | Whisper (GPU/CPU) | Groq / OpenAI | ✅ Groq (unlimited) / 💰 OpenAI ($0.006/min) | | **Diarization** | PyAnnote (GPU) | HF Inference | ✅ HuggingFace (~1000/day) | | **Classification** | Ollama (CPU/GPU) | Groq LLaMA | ✅ Groq (unlimited) | @@ -213,8 +257,16 @@ Diarization: pyannote (local - uses 8GB VRAM) Classification: groq (cloud - free, fast) ``` +**Alternative with OpenAI (for highest quality):** + +``` +Transcription: openai (cloud - paid, high quality) +Diarization: pyannote (local - uses 8GB VRAM) +Classification: groq (cloud - free, fast) +``` + **Why this works:** -- Groq handles transcription (no local VRAM usage) +- Cloud transcription handles audio processing (no local VRAM usage) - PyAnnote runs on GPU with 8GB VRAM (plenty of headroom) - Groq handles classification (no local VRAM usage) - No VRAM contention = no Ollama errors diff --git a/README.md b/README.md index 05f7194..aab553d 100644 --- a/README.md +++ b/README.md @@ -131,7 +131,7 @@ IC/OOC Classification (Ollama + Llama 3.1) 
|-----------|-----------|-----| | Audio Conversion | FFmpeg | Universal format support | | Voice Detection | Silero VAD | Best free VAD | -| Transcription | faster-whisper | 4x faster, excellent Dutch | +| Transcription | faster-whisper / Groq / OpenAI | Local or cloud options | | Diarization | PyAnnote.audio 3.1 | State-of-the-art | | Classification | Ollama (Llama 3.1) | Free, local, Dutch support | | UI | Gradio + Click + Rich | User-friendly interfaces | @@ -224,16 +224,17 @@ Speaker-labeled Chunks → Whisper API/Local → Dutch Transcription - Fully free, runs on your hardware - Supports `verbose_json` for detailed timestamps -2. **Groq API** (Alternative - fast & generous free tier): +2. **Groq API** (Fast & free cloud option): - Uses Whisper models with hardware acceleration - Much faster than local processing - Free tier: significant daily allowance - Good for testing/prototyping -3. **OpenAI Whisper API** (Fallback): - - 25MB file size limit per request - - Pay-per-use, but relatively cheap - - Most reliable Dutch support +3. **OpenAI Whisper API** (Official cloud option): + - Official OpenAI implementation (whisper-1 model) + - High quality, reliable results + - Pay-per-use pricing + - Excellent Dutch support **Process**: 1. Transcribe each chunk with `language="nl"` parameter for faster/better results diff --git a/src/transcriber.py b/src/transcriber.py index 9ed890a..40d503a 100644 --- a/src/transcriber.py +++ b/src/transcriber.py @@ -402,6 +402,139 @@ def _make_api_call(self, audio_file, language): ) +class OpenAITranscriber(BaseTranscriber): + """ + OpenAI Whisper API transcription - cloud-based Whisper. 
+ + Pros: + - Very fast (cloud-accelerated) + - High quality results + - No local GPU needed + - Official OpenAI implementation + + Cons: + - Requires API key + - Internet connection required + - Pay-per-use pricing + """ + + def __init__(self, api_key: str = None): + from openai import OpenAI + import tempfile + + self.api_key = api_key or Config.OPENAI_API_KEY + if not self.api_key: + raise ValueError("OpenAI API key required. Set OPENAI_API_KEY in .env") + + self.client = OpenAI(api_key=self.api_key) + self.temp_dir = Path(tempfile.gettempdir()) + self.logger = get_logger("transcriber.openai") + + def transcribe_chunk( + self, + chunk: AudioChunk, + language: str = Config.WHISPER_LANGUAGE + ) -> ChunkTranscription: + """Transcribe using OpenAI Whisper API""" + import soundfile as sf + + # OpenAI requires a file path, so save chunk temporarily + temp_path = self.temp_dir / f"chunk_{chunk.chunk_index}.wav" + sf.write(str(temp_path), chunk.audio, chunk.sample_rate) + self.logger.debug("Submitting chunk %d to OpenAI (temp file: %s)", chunk.chunk_index, temp_path) + + try: + # Call OpenAI API + with open(str(temp_path), "rb") as audio_file: + response = self._make_api_call(audio_file, language) + + # Parse response + segments = [] + response_words = getattr(response, "words", None) + for seg in response.segments: + # Adjust timestamps to absolute time + absolute_start = chunk.start_time + seg['start'] + absolute_end = chunk.start_time + seg['end'] + + words = None + if response_words: + words = [ + { + 'word': w['word'], + 'start': chunk.start_time + w['start'], + 'end': chunk.start_time + w['end'], + 'probability': w.get('probability', 1.0) + } + for w in response_words + if seg['start'] <= w['start'] <= seg['end'] + ] + + segments.append(TranscriptionSegment( + text=seg['text'].strip(), + start_time=absolute_start, + end_time=absolute_end, + words=words + )) + + return ChunkTranscription( + chunk_index=chunk.chunk_index, + chunk_start=chunk.start_time, + 
chunk_end=chunk.end_time,
+                segments=segments,
+                language=response.language
+            )
+
+        finally:
+            # Clean up temp file
+            if temp_path.exists():
+                temp_path.unlink()
+                self.logger.debug("Cleaned temporary chunk file %s", temp_path)
+
+    def preflight_check(self):
+        """Check OpenAI API availability and authentication."""
+        issues = []
+
+        if not self.api_key:
+            issues.append(
+                PreflightIssue(
+                    component="transcriber.openai",
+                    message="OpenAI API key not configured. Set OPENAI_API_KEY in .env file.",
+                    severity="error",
+                )
+            )
+            return issues
+
+        try:
+            # Cheap auth/connectivity check: retrieve the whisper-1
+            # model's metadata via the Models API instead of spending
+            # tokens on a chat completion. Raises AuthenticationError
+            # on a bad key / APIConnectionError when offline; both are
+            # caught below and reported as preflight issues.
+            self.client.models.retrieve("whisper-1")
+            self.logger.debug("OpenAI API preflight check passed")
+        except Exception as e:
+            issues.append(
+                PreflightIssue(
+                    component="transcriber.openai",
+                    message=f"OpenAI API test failed: {str(e)}. Check API key and internet connection.",
+                    severity="error",
+                )
+            )
+
+        return issues
+
+    @retry_with_backoff(retries=3, backoff_in_seconds=1)
+    def _make_api_call(self, audio_file, language):
+        """Make API call with retry logic."""
+        return self.client.audio.transcriptions.create(
+            file=audio_file,
+            model="whisper-1",
+            language=language if language != "auto" else None,
+            response_format="verbose_json",
+            timestamp_granularities=["segment", "word"]
+        )
+
+
 class TranscriberFactory:
     """Factory to create appropriate transcriber based on config"""
 
@@ -423,7 +556,6 @@ def create(backend: str = None) -> BaseTranscriber:
         elif backend == "groq":
             return GroqTranscriber()
         elif backend == "openai":
-            # TODO: Implement OpenAI transcriber if needed
-            raise NotImplementedError("OpenAI transcriber not yet implemented")
+            return OpenAITranscriber()
         else:
             raise ValueError(f"Unknown transcriber backend: {backend}")
diff --git a/tests/test_transcriber.py b/tests/test_transcriber.py
index c0b4f71..7ed1c81 100644
--- a/tests/test_transcriber.py
+++ 
b/tests/test_transcriber.py @@ -10,12 +10,14 @@ def mock_config(): MockConfig.WHISPER_BACKEND = 'local' MockConfig.WHISPER_MODEL = 'tiny' MockConfig.GROQ_API_KEY = 'test-groq-api-key' + MockConfig.OPENAI_API_KEY = 'test-openai-api-key' yield MockConfig from src.transcriber import ( TranscriberFactory, FasterWhisperTranscriber, GroqTranscriber, + OpenAITranscriber, BaseTranscriber, ChunkTranscription, TranscriptionSegment @@ -48,8 +50,14 @@ def test_create_unknown_backend_raises_error(self): with pytest.raises(ValueError, match="Unknown transcriber backend: unknown"): TranscriberFactory.create(backend='unknown') - def test_create_openai_backend_raises_not_implemented(self): - with pytest.raises(NotImplementedError): + def test_create_openai_backend(self, mock_config): + mock_config.OPENAI_API_KEY = 'test-openai-api-key' + transcriber = TranscriberFactory.create(backend='openai') + assert isinstance(transcriber, OpenAITranscriber) + + def test_create_openai_with_no_api_key_raises_error(self, mock_config): + mock_config.OPENAI_API_KEY = None + with pytest.raises(ValueError, match="OpenAI API key required"): TranscriberFactory.create(backend='openai') def test_create_groq_with_no_api_key_raises_error(self, mock_config): @@ -245,3 +253,76 @@ def test_groq_transcriber_handles_empty_segments(mock_path_exists, mock_unlink, assert isinstance(result, ChunkTranscription) assert result.language == 'nl' assert result.segments == [] + + +@patch('openai.OpenAI') +@patch('soundfile.write') +@patch('builtins.open', new_callable=mock_open) +@patch('pathlib.Path.unlink') +@patch('pathlib.Path.exists', return_value=True) +def test_openai_transcriber(mock_path_exists, mock_unlink, mock_file_open, mock_sf_write, MockOpenAI, dummy_audio_chunk): + """Tests the OpenAITranscriber logic with extensive mocking.""" + # Arrange: Mock the OpenAI client and its API response + mock_openai_client = MockOpenAI.return_value + mock_response = MagicMock() + mock_response.language = 'nl' + 
mock_response.segments = [ + {'start': 1.0, 'end': 3.0, 'text': ' OpenAI transcription '} + ] + mock_response.words = [ + {'word': 'OpenAI', 'start': 1.0, 'end': 1.5}, + {'word': 'transcription', 'start': 1.6, 'end': 2.8} + ] + mock_openai_client.audio.transcriptions.create.return_value = mock_response + + transcriber = OpenAITranscriber(api_key='fake-key') + + # Act + result = transcriber.transcribe_chunk(dummy_audio_chunk, language='nl') + + # Assert + # Verify a temporary file was written and then opened + mock_sf_write.assert_called_once() + mock_file_open.assert_called_with(mock_sf_write.call_args[0][0], 'rb') + + # Verify the API was called + mock_openai_client.audio.transcriptions.create.assert_called_once() + + # Verify the temporary file was cleaned up + mock_unlink.assert_called_once() + + # Verify the returned data structure + assert isinstance(result, ChunkTranscription) + assert result.language == 'nl' + assert len(result.segments) == 1 + + segment = result.segments[0] + assert segment.text == "OpenAI transcription" + assert segment.start_time == pytest.approx(10.0 + 1.0) + assert segment.end_time == pytest.approx(10.0 + 3.0) + + assert len(segment.words) == 2 + assert segment.words[0]['word'] == 'OpenAI' + assert segment.words[0]['start'] == pytest.approx(10.0 + 1.0) + assert segment.words[1]['word'] == 'transcription' + + +@patch('openai.OpenAI') +@patch('soundfile.write') +@patch('builtins.open', new_callable=mock_open) +@patch('pathlib.Path.unlink') +@patch('pathlib.Path.exists', return_value=True) +def test_openai_transcriber_handles_empty_segments(mock_path_exists, mock_unlink, mock_file_open, mock_sf_write, MockOpenAI, dummy_audio_chunk): + mock_openai_client = MockOpenAI.return_value + mock_response = MagicMock() + mock_response.language = 'nl' + mock_response.segments = [] + mock_response.words = [] + mock_openai_client.audio.transcriptions.create.return_value = mock_response + + transcriber = OpenAITranscriber(api_key='fake-key') + result = 
transcriber.transcribe_chunk(dummy_audio_chunk, language='nl') + + assert isinstance(result, ChunkTranscription) + assert result.language == 'nl' + assert result.segments == []