In [5]:
import os
import sys
import argparse
import queue
import threading
import time
import logging
from logging.handlers import RotatingFileHandler
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
import csv
import numpy as np
import sounddevice as sd
import soundfile as sf
import webrtcvad
import torch
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from TTS.api import TTS
from dotenv import load_dotenv


In [6]:
# ----------------------
# Defaults & constants
# ----------------------
load_dotenv()
HF_TOKEN = os.getenv("HF_TOKEN")
DEFAULT_MODEL = "openai/whisper-medium"
DEFAULT_SAMPLE_RATE = 16000
DEFAULT_FRAME_MS = 20
DEFAULT_VAD_AGGR = 2
DEFAULT_MAX_SILENCE_FRAMES = 12  # ~240ms
TRANSCRIPT_CSV = "transcripts.csv"
LOG_FILE = "realtime_translator.log"



# Filler/short outputs to ignore
IGNORE_SET = {"thank you", "thanks", "ok", "okay", "hmm", "mm", "mhm", "yeah", "no", "nah"}


In [7]:
def setup_logger(logfile: str = LOG_FILE, level=logging.INFO):
    logger = logging.getLogger("realtime_translator")
    logger.setLevel(level)
    fmt = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")

    ch = logging.StreamHandler()
    ch.setFormatter(fmt)
    logger.addHandler(ch)

    fh = RotatingFileHandler(logfile, maxBytes=5 * 1024 * 1024, backupCount=3)
    fh.setFormatter(fmt)
    logger.addHandler(fh)

    return logger

logger = setup_logger()

def parse_args():
    class Args:
        model = DEFAULT_MODEL
        sample_rate = DEFAULT_SAMPLE_RATE
        frame_ms = DEFAULT_FRAME_MS
        vad_aggr = DEFAULT_VAD_AGGR
        lang = None
        task = "translate"
        max_silence_frames = DEFAULT_MAX_SILENCE_FRAMES
        num_beams = 5
        output_csv = TRANSCRIPT_CSV
        no_tts = False
    return Args()


In [8]:
def save_transcript(csv_path, timestamp, input_lang, task, text):
    write_header = not os.path.exists(csv_path)
    with open(csv_path, "a", newline="", encoding="utf-8") as f:
        w = csv.writer(f)
        if write_header:
            w.writerow(["timestamp", "input_lang", "task", "text"])
        w.writerow([timestamp, input_lang or "", task, text])


In [9]:
class RealtimeTranslator:
    def __init__(self, args):
        self.args = args
        self.sample_rate = args.sample_rate
        self.frame_ms = args.frame_ms
        self.chunk_samples = int(self.sample_rate * self.frame_ms / 1000)
        self.vad = webrtcvad.Vad(args.vad_aggr)
        self.q = queue.Queue()
        self.running = threading.Event()
        self.running.set()
        self.silence_threshold = args.max_silence_frames

        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.dtype = torch.float16 if self.device == "cuda" else torch.float32
        logger.info(f"Using device={self.device} dtype={self.dtype}")

        logger.info("Loading Whisper model and processor...")
        self.processor = WhisperProcessor.from_pretrained(args.model, use_auth_token=HF_TOKEN)
        self.model = WhisperForConditionalGeneration.from_pretrained(
            args.model, torch_dtype=self.dtype, use_auth_token=HF_TOKEN
        )
        self.model.to(self.device)
        self.model.eval()

        gen_kw = dict(
            task=args.task,
            language=args.lang,
            num_beams=args.num_beams,
            temperature=0.0,
            no_repeat_ngram_size=3,
            min_length=4,
            length_penalty=1.0,
            suppress_tokens=[],
        )
        self.gen_kw = gen_kw

        self.no_tts = args.no_tts
        if not self.no_tts:
            logger.info("Loading TTS model...")
            try:
                self.tts = TTS(
                    model_name="tts_models/multilingual/multi-dataset/your_tts",
                    progress_bar=False, gpu=(self.device == "cuda")
                )
            except Exception as e:
                logger.exception("Failed to load TTS model - continuing in no-tts mode.")
                self.no_tts = True
                self.tts = None
        else:
            self.tts = None

        self.default_speaker = self.tts.speakers[0] if (self.tts and len(self.tts.speakers) > 0) else None
        logger.info(f"Default speaker: {self.default_speaker}")

        self.executor = ThreadPoolExecutor(max_workers=2)
        self._warmup()

    def _warmup(self):
        try:
            dummy = np.zeros((1600,), dtype=np.float32)
            inputs = self.processor(dummy, sampling_rate=self.sample_rate, return_tensors="pt")
            input_features = inputs.input_features.to(self.device, dtype=self.dtype)
            with torch.no_grad():
                _ = self.model.generate(input_features, max_length=1, **self.gen_kw)
            logger.info("Warmup complete.")
        except Exception as e:
            logger.warning("Warmup failed: %s", e)

    def audio_callback(self, indata, frames, time_info, status):
        if status:
            logger.debug("Input status: %s", status)
        self.q.put(bytes(indata))

    def bytes_to_tensor(self, fr):
        arr = np.frombuffer(fr, dtype=np.int16).astype(np.float32) / 32768.0
        return torch.from_numpy(arr).to(self.device, dtype=self.dtype)

    def preemphasis_torch(self, x: torch.Tensor, coeff: float = 0.97):
        if x.numel() == 0:
            return x
        return torch.cat([x[:1], x[1:] - coeff * x[:-1]])

    def tts_playback_worker(self, text, out_path="output.wav"):
        try:
            tts_lang = "en" if self.args.task == "translate" else None
            self.tts.tts_to_file(text=text, file_path=out_path,
                                 speaker=self.default_speaker, language=tts_lang)
            data, sr = sf.read(out_path)
            sd.play(data, sr)
            sd.wait()
            os.remove(out_path)
        except Exception:
            logger.exception("TTS/playback failed for text: %s", text)

    def run(self):
        logger.info("Starting real-time loop. Press Stop to interrupt.")
        ring = []
        silence_count = 0
        try:
            with sd.RawInputStream(
                samplerate=self.sample_rate,
                blocksize=self.chunk_samples,
                dtype="int16",
                channels=1,
                callback=self.audio_callback,
            ):
                while self.running.is_set():
                    try:
                        frame = self.q.get(timeout=0.2)
                    except queue.Empty:
                        continue

                    is_speech = self.vad.is_speech(frame, self.sample_rate)
                    tensor_frame = self.bytes_to_tensor(frame)

                    if is_speech:
                        ring.append(tensor_frame)
                        silence_count = 0
                    else:
                        if len(ring) > 0:
                            silence_count += 1
                            if silence_count < self.silence_threshold:
                                ring.append(tensor_frame)
                                continue

                            speech = torch.cat(ring)
                            ring.clear()
                            silence_count = 0

                            speech = self.preemphasis_torch(speech)
                            inputs = self.processor(
                                speech.cpu().numpy(),
                                sampling_rate=self.sample_rate,
                                return_tensors="pt"
                            )
                            input_features = inputs.input_features.to(self.device, dtype=self.dtype)

                            with torch.no_grad():
                                generated_ids = self.model.generate(input_features, **self.gen_kw)

                            text = self.processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
                            if not text:
                                continue

                            tlow = text.lower().strip(" .!,?")
                            if tlow in IGNORE_SET or (len(tlow.split()) <= 1 and len(tlow) < 3):
                                continue

                            ts = datetime.utcnow().isoformat()
                            logger.info("Result [%s]: %s", ts, text)

                            save_transcript(self.args.output_csv, ts, self.args.lang, self.args.task, text)

                            if not self.no_tts and self.tts:
                                outpath = f"output_{int(time.time()*1000)}.wav"
                                self.executor.submit(self.tts_playback_worker, text, outpath)

        except Exception:
            logger.exception("Unhandled exception in main loop.")
        finally:
            self.shutdown()

    def shutdown(self):
        logger.info("Shutting down tasks...")
        self.running.clear()
        self.executor.shutdown(wait=True)
        logger.info("Shutdown complete.")


In [10]:
args = parse_args()
rt = RealtimeTranslator(args)
rt.run()

2025-09-10 19:25:05,044 - INFO - Using device=cuda dtype=torch.float16
2025-09-10 19:25:05,044 - INFO - Using device=cuda dtype=torch.float16
2025-09-10 19:25:05,047 - INFO - Loading Whisper model and processor...
2025-09-10 19:25:05,047 - INFO - Loading Whisper model and processor...
Fetching 1 files: 100%|██████████| 1/1 [00:00<?, ?it/s]
`torch_dtype` is deprecated! Use `dtype` instead!
2025-09-10 19:25:12,003 - INFO - Loading TTS model...
2025-09-10 19:25:12,003 - INFO - Loading TTS model...


 > tts_models/multilingual/multi-dataset/your_tts is already downloaded.
 > Using model: vits
 > Setting up Audio Processor...
 | > sample_rate:16000
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:0
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:None
 | > fft_size:1024
 | > power:None
 | > preemphasis:0.0
 | > griffin_lim_iters:None
 | > signal_norm:None
 | > symmetric_norm:None
 | > mel_fmin:0
 | > mel_fmax:None
 | > pitch_fmin:None
 | > pitch_fmax:None
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:1.0
 | > clip_norm:True
 | > do_trim_silence:False
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024


  return torch.load(f, map_location=map_location, **kwargs)


 > Model fully restored. 
 > Setting up Audio Processor...
 | > sample_rate:16000
 | > resample:False
 | > num_mels:64
 | > log_func:np.log10
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:512
 | > power:1.5
 | > preemphasis:0.97
 | > griffin_lim_iters:60
 | > signal_norm:False
 | > symmetric_norm:False
 | > mel_fmin:0
 | > mel_fmax:8000.0
 | > pitch_fmin:1.0
 | > pitch_fmax:640.0
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:False
 | > do_trim_silence:False
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:True
 | > db_level:-27.0
 | > stats_path:None
 | > base:10
 | > hop_length:160
 | > win_length:400
 > External Speaker Encoder Loaded !!
 > initialization of language-embedding layers.


2025-09-10 19:25:13,674 - INFO - Default speaker: female-en-5
2025-09-10 19:25:13,674 - INFO - Default speaker: female-en-5


 > Model fully restored. 
 > Setting up Audio Processor...
 | > sample_rate:16000
 | > resample:False
 | > num_mels:64
 | > log_func:np.log10
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:512
 | > power:1.5
 | > preemphasis:0.97
 | > griffin_lim_iters:60
 | > signal_norm:False
 | > symmetric_norm:False
 | > mel_fmin:0
 | > mel_fmax:8000.0
 | > pitch_fmin:1.0
 | > pitch_fmax:640.0
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:False
 | > do_trim_silence:False
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:True
 | > db_level:-27.0
 | > stats_path:None
 | > base:10
 | > hop_length:160
 | > win_length:400


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
2025-09-10 19:25:14,480 - INFO - Starting real-time loop. Press Stop to interrupt.
2025-09-10 19:25:14,480 - INFO - Starting real-time loop. Press Stop to interrupt.
2025-09-10 19:25:19,272 - INFO - Result [2025-09-10T13:55:19.272338]: Hi, hello. How are you?
2025-09-10 19:25:19,272 - INFO - Result [2025-09-10T13:55:19.272338]: Hi, hello. How are you?


 > Text splitted to sentences.
['Hi, hello.', 'How are you?']
 > Processing time: 0.9971110820770264
 > Real-time factor: 0.35183877278653014


2025-09-10 19:25:20,597 - INFO - Result [2025-09-10T13:55:20.597146]: [BLANK_AUDIO]
2025-09-10 19:25:20,597 - INFO - Result [2025-09-10T13:55:20.597146]: [BLANK_AUDIO]


 > Text splitted to sentences.
['[BLANK_AUDIO]']
 > Processing time: 0.5074193477630615
 > Real-time factor: 0.3138029361552638


2025-09-10 19:25:22,088 - INFO - Result [2025-09-10T13:55:22.088496]: [BLANK_AUDIO]
2025-09-10 19:25:22,088 - INFO - Result [2025-09-10T13:55:22.088496]: [BLANK_AUDIO]


 > Text splitted to sentences.
['[BLANK_AUDIO]']
 > Processing time: 0.14701080322265625
 > Real-time factor: 0.08745437431448914


2025-09-10 19:25:26,645 - INFO - Result [2025-09-10T13:55:26.645775]: NARIO!
2025-09-10 19:25:26,645 - INFO - Result [2025-09-10T13:55:26.645775]: NARIO!


 > Text splitted to sentences.
['NARIO!']
 > Processing time: 0.12394928932189941
 > Real-time factor: 0.09440159125811075


2025-09-10 19:25:31,058 - INFO - Result [2025-09-10T13:55:31.058498]: Tomorrow, I...
2025-09-10 19:25:31,058 - INFO - Result [2025-09-10T13:55:31.058498]: Tomorrow, I...


 > Text splitted to sentences.
['Tomorrow, I...']
 > Processing time: 0.11621379852294922
 > Real-time factor: 0.08640431116947897


2025-09-10 19:25:32,615 - INFO - Result [2025-09-10T13:55:32.615996]: It's very difficult.
2025-09-10 19:25:32,615 - INFO - Result [2025-09-10T13:55:32.615996]: It's very difficult.


 > Text splitted to sentences.
["It's very difficult."]
 > Processing time: 0.12345027923583984
 > Real-time factor: 0.05588514225253048


2025-09-10 19:25:39,725 - INFO - Result [2025-09-10T13:55:39.725347]: (speaking in foreign language)
2025-09-10 19:25:39,725 - INFO - Result [2025-09-10T13:55:39.725347]: (speaking in foreign language)


 > Text splitted to sentences.
['(speaking in foreign language)']
 > Processing time: 0.12999248504638672
 > Real-time factor: 0.051728008375004665


2025-09-10 19:25:41,698 - INFO - Shutting down tasks...
2025-09-10 19:25:41,698 - INFO - Shutting down tasks...
2025-09-10 19:25:42,719 - INFO - Shutdown complete.
2025-09-10 19:25:42,719 - INFO - Shutdown complete.


KeyboardInterrupt: 