## Objective
### Build a real-time or file-based transcription system.

### Install Required Libraries

In [20]:
!pip install -q git+https://github.com/openai/whisper.git
!pip install -q sounddevice scipy ffmpeg-python
!sudo apt-get update
!sudo apt-get install -y ffmpeg libportaudio2 libportaudiocpp0 portaudio19-dev
!pip install noisereduce librosa


  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Hit:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:2 http://security.ubuntu.com/ubuntu jammy-security InRelease
Hit:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:4 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:5 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Hit:7 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:8 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Get:9 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Fetched 255 kB in 1s (179 kB/s)
Reading package lists... Done
W: Skipping acquire of configured file

### Importing Required modules

In [27]:
# STEP 2: Imports and Whisper setup
from IPython.display import display, Audio
from google.colab import files
import sounddevice as sd
import scipy.io.wavfile as wav
import soundfile as sf
import librosa
import noisereduce as nr
import tempfile
import whisper
import tempfile
import os

### Upload Audio File or Record from Mic

In [12]:
# Load Whisper model
model = whisper.load_model("base")
print("✅ Whisper model loaded")

# Record from mic
def record_audio(duration=5, sample_rate=16000):
    print(f"🎤 Recording for {duration} seconds...")
    recording = sd.rec(int(duration * sample_rate), samplerate=sample_rate, channels=1, dtype='int16')
    sd.wait()
    temp_wav = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
    wav.write(temp_wav.name, sample_rate, recording)
    print(f"✅ Recording saved to {temp_wav.name}")
    return temp_wav.name

def upload_audio_file():
    print("📂 Please upload a .wav or .mp3 file...")
    uploaded = files.upload()
    filename = next(iter(uploaded))
    print(f"✅ File uploaded: {filename}")
    return filename

✅ Whisper model loaded


In [21]:
def reduce_noise(audio_path):
    y, sr = librosa.load(audio_path, sr=None)  # Load audio
    reduced_noise = nr.reduce_noise(y=y, sr=sr)  # Apply noise reduction
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
    librosa.output.write_wav(temp_file.name, reduced_noise, sr)  # Save cleaned audio
    print(f"✅ Noise reduced audio saved at {temp_file.name}")
    return temp_file.name


In [32]:
import sounddevice as sd

def check_input_device():
    devices = sd.query_devices()
    input_devices = [i for i, d in enumerate(devices) if d['max_input_channels'] > 0]
    if not input_devices:
        raise RuntimeError("No input audio device found. Please connect a microphone or upload a file.")
    print(f"Input devices available: {input_devices}")
    return input_devices[0]  # pick first available device

device_index = check_input_device()

def record_audio(duration=5, sample_rate=16000):
    print(f"🎤 Recording for {duration} seconds...")
    recording = sd.rec(int(duration * sample_rate), samplerate=sample_rate, channels=1, dtype='int16', device=device_index)
    sd.wait()
    # ... rest unchanged


RuntimeError: No input audio device found. Please connect a microphone or upload a file.

### Transcribe Function with Timestamps

In [22]:
# STEP 4: Transcribe with timestamps
def transcribe_audio(path):
    print("📝 Transcribing...")
    result = model.transcribe(path)
    print("✅ Transcription complete!\n")

    for segment in result['segments']:
        start = segment['start']
        end = segment['end']
        text = segment['text']
        print(f"[{start:.2f}s - {end:.2f}s]: {text}")

    if 'text' in result and isinstance(result['text'], str):
        return result['text']
    else:
        print("⚠️ Warning: Could not extract full transcribed text.")
        return "Transcription text not available."

### Choose Mode (Record or Upload)

In [29]:
mode = input("Choose mode: [1] Upload file, [2] Record from mic: ")

if mode == "1":
    file_path = upload_audio_file()
elif mode == "2":
    duration = int(input("Enter recording duration in seconds: "))
    file_path = record_audio(duration)
else:
    raise ValueError("Invalid option")

# Transcribe and play back
transcribed_text = transcribe_audio(file_path)
display(Audio(file_path))

Choose mode: [1] Upload file, [2] Record from mic: 2
Enter recording duration in seconds: 20
🎤 Recording for 20 seconds...


PortAudioError: Error querying device -1

In [31]:
import sounddevice as sd
print(sd.query_devices())


