In [30]:
!pip install transformers




In [1]:
import os
import torch
import soundfile as sf
import sounddevice as sd
from scipy.io.wavfile import write
from transformers import (
    Wav2Vec2Processor,
    Wav2Vec2ForCTC,
    MarianTokenizer,
    MarianMTModel
)
from TTS.api import TTS
import torchaudio

In [2]:
# ========== Configuration ==========
MODEL_DIR = "./models"
ASR_DIR = os.path.join(MODEL_DIR, "asr")
MT_DIR = os.path.join(MODEL_DIR, "mt")
TTS_CACHE = MODEL_DIR  # both TTS + vocoder model folders go here

AUDIO_FILENAME = "recorded.wav"
OUTPUT_WAV = "output_jp.wav"
DURATION = 5  # seconds
SAMPLE_RATE = 16000


In [3]:
# ========== ASR: Load or Save ==========
if not os.path.exists(ASR_DIR):
    print("🔽 Downloading ASR model...")
    processor_asr = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")
    model_asr = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h")
    processor_asr.save_pretrained(ASR_DIR)
    model_asr.save_pretrained(ASR_DIR)
else:
    print("✅ Loading ASR model from disk.")
    processor_asr = Wav2Vec2Processor.from_pretrained(ASR_DIR)
    model_asr = Wav2Vec2ForCTC.from_pretrained(ASR_DIR)
model_asr.eval()

✅ Loading ASR model from disk.


Wav2Vec2ForCTC(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2GroupNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
          (activation): GELUActivation()
          (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
        )
        (1-4): 4 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
        (5-6): 2 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): Wav2Vec2FeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (projection): Linear(in_features=512, out_features=1024, bias=True)
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder

In [4]:
# ========== MT: Load or Save ==========
if not os.path.exists(MT_DIR):
    print("🔽 Downloading MT model...")
    tokenizer_mt = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-jap")
    model_mt = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-en-jap")
    tokenizer_mt.save_pretrained(MT_DIR)
    model_mt.save_pretrained(MT_DIR)
else:
    print("✅ Loading MT model from disk.")
    tokenizer_mt = MarianTokenizer.from_pretrained(MT_DIR)
    model_mt = MarianMTModel.from_pretrained(MT_DIR)


✅ Loading MT model from disk.




In [None]:
# ========== TTS: Set Local Cache and Load ==========
print("🔽 Loading TTS model from local cache...")
os.environ["TTS_CACHE_PATH"] = TTS_CACHE
tts = TTS(model_name="tts_models/ja/kokoro/tacotron2-DDC", progress_bar=False)


🔽 Loading TTS model from local cache...
 > tts_models/ja/kokoro/tacotron2-DDC is already downloaded.
 > vocoder_models/ja/kokoro/hifigan_v1 is already downloaded.
 > Using model: Tacotron2
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:1024
 | > power:1.5
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:True
 | > symmetric_norm:True
 | > mel_fmin:50.0
 | > mel_fmax:7600.0
 | > pitch_fmin:0.0
 | > pitch_fmax:640.0
 | > spec_gain:1.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:C:\Users\ADMIN\AppData\Local\tts\tts_models--ja--kokoro--tacotron2-DDC\scale_stats.npy
 | > base:10
 | > hop_length:256
 | >

In [24]:
# ========== Step 1: Record from Mic ==========
print(f"🎙️ Recording for {DURATION} seconds...")
recording = sd.rec(int(DURATION * SAMPLE_RATE), samplerate=SAMPLE_RATE, channels=1, dtype='int16')
sd.wait()
write(AUDIO_FILENAME, SAMPLE_RATE, recording)
print("✅ Recording saved as", AUDIO_FILENAME)

🎙️ Recording for 5 seconds...
✅ Recording saved as recorded.wav


In [40]:
import spacy

# Load English model
nlp = spacy.load("en_core_web_sm")

def spacy_capitalize(text):
    doc = nlp(text)
    sentences = []

    for sent in doc.sents:
        capitalized = []
        for i, token in enumerate(sent):
            if i == 0 or token.ent_type_ or token.pos_ in ['PROPN']:
                capitalized.append(token.text.capitalize())
            else:
                capitalized.append(token.text.lower())

        sentence = ' '.join(capitalized)
        # Clean spacing
        sentence = sentence.replace(" ,", ",").replace(" .", ".").replace(" '", "'").replace(" n't", "n't")
        sentences.append(sentence)

    return ' '.join(sentences)


In [43]:
# ========== Step 2: ASR - English Speech → Text ==========
waveform_np, sr = sf.read(AUDIO_FILENAME)
waveform = torch.tensor(waveform_np).float().unsqueeze(0)

if sr != 16000:
    waveform = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)(waveform)

input_values = processor_asr(waveform.squeeze(), return_tensors="pt", sampling_rate=16000).input_values

with torch.no_grad():
    logits = model_asr(input_values).logits
predicted_ids = torch.argmax(logits, dim=-1)
# From ASR output
english_text = processor_asr.batch_decode(predicted_ids)[0].lower().strip()
print(english_text)
english_text = spacy_capitalize(english_text)
print("📝 Restored:", english_text)


my name is john
📝 Restored: My name is John


In [44]:

inputs_mt = tokenizer_mt(english_text, return_tensors="pt", padding=True)
translated = model_mt.generate(**inputs_mt)
japanese_text = tokenizer_mt.decode(translated[0], skip_special_tokens=True)
print("🌐 Japanese Translation:", japanese_text)

🌐 Japanese Translation: わたし の 名 は ヨハネ と い う .


In [None]:
# ========== Step 4: TTS - Synthesize Japanese ==========
tts.tts_to_file(text=japanese_text, file_path=OUTPUT_WAV)
print(f"🔊 Japanese speech saved to '{OUTPUT_WAV}'")

 > Text splitted to sentences.
['わたし の 名 は ヨハネ と い う .']
 > Processing time: 0.9275715351104736
 > Real-time factor: 0.4074942689907942
🔊 Japanese speech saved to 'output_jp.wav'
