In [5]:
import logging
import os
import torch
import torchaudio
import torchaudio.transforms as T
import noisereduce as nr
import sounddevice as sd
import numpy as np
from scipy.io.wavfile import write
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.FileHandler("transcription.log"),
        logging.StreamHandler()
    ]
)

# Step 1: Audio Recorder
class AudioRecorder:
    def __init__(self):
        self.sample_rate = 16000
        self.channels = 1
        self.audio_queue = []

    def list_devices(self):
        """Lists all available audio devices."""
        devices = sd.query_devices()
        for i, device in enumerate(devices):
            print(f"{i}: {device['name']}")

    def audio_callback(self, indata, frames, time, status):
        self.audio_queue.append(indata.copy())

    def record_audio(self, duration=10, device_name="default"):
        logging.info(f"Recording for {duration} seconds at {self.sample_rate} Hz...")
        
        # Find the correct device by name
        device_id = None
        for i, device in enumerate(sd.query_devices()):
            if device_name in device['name']:
                device_id = i
                break

        if device_id is None:
            raise ValueError(f"Device '{device_name}' not found")
        
        with sd.InputStream(callback=self.audio_callback, channels=self.channels, samplerate=self.sample_rate, device=device_id):
            sd.sleep(int(duration * 1000))
        
        # Combine all the chunks and save as a single WAV file
        combined_audio = np.concatenate(self.audio_queue, axis=0)
        raw_audio_folder = "raw_audio"
        os.makedirs(raw_audio_folder, exist_ok=True)
        raw_file_path = os.path.join(raw_audio_folder, "raw_audio_combined.wav")
        write(raw_file_path, self.sample_rate, combined_audio.astype(np.float32))
        logging.info(f"Saved raw audio to {raw_file_path}")
        return raw_file_path

# Step 2: Audio Processing
class AudioProcessor:
    def __init__(self, output_dir="cleaned_audio"):
        self.output_dir = output_dir
        os.makedirs(self.output_dir, exist_ok=True)

    def clean_audio(self, raw_file_path):
        logging.info(f"Cleaning audio: {raw_file_path}")
        waveform, sample_rate = torchaudio.load(raw_file_path)

        # Convert to mono
        if waveform.shape[0] > 1:
            waveform = torch.mean(waveform, dim=0, keepdim=True)

        # Apply noise reduction
        waveform_np = waveform.numpy()
        reduced_noise_waveform = nr.reduce_noise(y=waveform_np, sr=sample_rate)

        # Convert back to tensor
        cleaned_waveform_tensor = torch.tensor(reduced_noise_waveform, dtype=torch.float32)

        # Save cleaned audio
        cleaned_file_path = os.path.join(self.output_dir, "cleaned_audio.wav")
        torchaudio.save(cleaned_file_path, cleaned_waveform_tensor, sample_rate)
        logging.info(f"Saved cleaned audio to {cleaned_file_path}")
        return cleaned_file_path

# Step 3: Speech-to-Text Transcription
class SpeechToTextTranscriber:
    def __init__(self):
        device = "cuda" if torch.cuda.is_available() else "cpu"
        torch_dtype = torch.float32
        model_id = "distil-whisper/distil-large-v3"

        # Load the Whisper model and processor
        self.model = AutoModelForSpeechSeq2Seq.from_pretrained(
            model_id, torch_dtype=torch_dtype, use_safetensors=True
        ).to(device)
        self.processor = AutoProcessor.from_pretrained(model_id)

        # Create a speech recognition pipeline
        self.pipe = pipeline(
            "automatic-speech-recognition",
            model=self.model,
            tokenizer=self.processor.tokenizer,
            feature_extractor=self.processor.feature_extractor,
            device=device
        )

    def transcribe(self, cleaned_audio_file):
        logging.info(f"Transcribing {cleaned_audio_file}...")
        result = self.pipe(cleaned_audio_file, return_timestamps="word")
        text = result['text']
        logging.info(f"Transcription: {text}")
        return text

# Main execution function
def main():
    # Step 1: Record audio
    recorder = AudioRecorder()

    # Call the list of devices to select the right device
    #recorder.list_devices()

    # Specify the correct device name found in the list
    raw_file_path = recorder.record_audio(duration=10, device_name="Microphone Array (Intel® Smart Sound Technology for Digital Microphones)")

    # Step 2: Clean the recorded audio
    audio_processor = AudioProcessor()
    cleaned_file_path = audio_processor.clean_audio(raw_file_path)

    # Step 3: Transcribe the cleaned audio
    transcriber = SpeechToTextTranscriber()
    transcription = transcriber.transcribe(cleaned_file_path)

    logging.info(f"Final Transcription: {transcription}")

if __name__ == "__main__":
    main()


  from .autonotebook import tqdm as notebook_tqdm


ModuleNotFoundError: No module named 'transformers.utils'

In [3]:
pip install transformers




SyntaxError: invalid syntax (3592654251.py, line 1)