<a href="https://colab.research.google.com/github/Gopi138942/Voice_translator/blob/main/voice_cloning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# First install required packages
!pip install openai-whisper moviepy pydub googletrans==4.0.0-rc1 TTS ffmpeg-python
!apt-get install -y ffmpeg

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 34 not upgraded.


In [None]:
# Reinstall torch / torchvision / torchaudio from the CUDA 11.8 wheel index.
# NOTE(review): --force-reinstall also reinstalls torch's dependencies (the recorded output
# shows filelock, typing-extensions, networkx, jinja2 being collected). If numpy is swapped
# this way, the "numpy.dtype size changed ... binary incompatibility" ValueError recorded in
# the import cell is the typical symptom — restart the runtime after this cell. TODO confirm
# whether this reinstall is needed at all on the target runtime.
!pip install --upgrade --force-reinstall torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

Looking in indexes: https://download.pytorch.org/whl/cu118
Collecting torch
  Downloading https://download.pytorch.org/whl/cu118/torch-2.6.0%2Bcu118-cp311-cp311-linux_x86_64.whl.metadata (27 kB)
Collecting torchvision
  Downloading https://download.pytorch.org/whl/cu118/torchvision-0.21.0%2Bcu118-cp311-cp311-linux_x86_64.whl.metadata (6.1 kB)
Collecting torchaudio
  Downloading https://download.pytorch.org/whl/cu118/torchaudio-2.6.0%2Bcu118-cp311-cp311-linux_x86_64.whl.metadata (6.6 kB)
Collecting filelock (from torch)
  Downloading https://download.pytorch.org/whl/filelock-3.13.1-py3-none-any.whl.metadata (2.8 kB)
Collecting typing-extensions>=4.10.0 (from torch)
  Downloading https://download.pytorch.org/whl/typing_extensions-4.12.2-py3-none-any.whl.metadata (3.0 kB)
Collecting networkx (from torch)
  Downloading https://download.pytorch.org/whl/networkx-3.3-py3-none-any.whl.metadata (5.1 kB)
Collecting jinja2 (from torch)
  Downloading https://download.pytorch.org/whl/Jinja2-3.1.4-p

In [None]:

import os
import tempfile

import torch
import whisper
from googletrans import Translator
from moviepy.editor import VideoFileClip, AudioFileClip
from pydub import AudioSegment
from TTS.api import TTS


ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

In [None]:


class SimpleVideoTranslator:
    """Dub a video into another language using a voice cloned from the original speaker.

    Pipeline: open the audio track (moviepy) -> transcribe it (Whisper) -> translate the
    text (googletrans) -> synthesize the translation with voice cloning (Coqui TTS) ->
    write a copy of the video with the new audio track.
    """

    def __init__(self, whisper_model_name="base",
                 tts_model_name="tts_models/multilingual/multi-dataset/your_tts"):
        """Pick the compute device and remember which models to use.

        Args:
            whisper_model_name: name passed to ``whisper.load_model``.
            tts_model_name: ``model_name`` passed to ``TTS``.
        """
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.whisper_model_name = whisper_model_name
        self.tts_model_name = tts_model_name
        # Models are loaded on first use and then reused, so processing several
        # videos with one translator does not reload them each time.
        self._whisper_model = None
        self._tts = None

    def _get_whisper_model(self):
        """Return the Whisper model, loading it on first call."""
        if self._whisper_model is None:
            self._whisper_model = whisper.load_model(
                self.whisper_model_name, device=self.device
            )
        return self._whisper_model

    def _get_tts(self):
        """Return the TTS model (moved to ``self.device``), loading it on first call."""
        if self._tts is None:
            self._tts = TTS(model_name=self.tts_model_name).to(self.device)
        return self._tts

    def extract_audio(self, video_path):
        """Open the audio track of ``video_path`` as an ``AudioFileClip``.

        The clip is opened directly from the file instead of being taken from a
        ``VideoFileClip`` inside a ``with`` block: leaving that block closes the
        video together with its audio reader, so the returned clip would be unusable.

        Returns:
            An open ``AudioFileClip``; the caller is responsible for ``close()``-ing it.
        """
        return AudioFileClip(video_path)

    def transcribe(self, audio_clip):
        """Transcribe ``audio_clip`` with Whisper and return the recognized text."""
        model = self._get_whisper_model()

        # Whisper reads from a file path, so the clip is written to a scratch WAV first.
        # A scratch directory is used (rather than an open NamedTemporaryFile) so that
        # the writer is not asked to overwrite a file this process still holds open.
        with tempfile.TemporaryDirectory() as workdir:
            wav_path = os.path.join(workdir, "audio.wav")
            audio_clip.write_audiofile(wav_path, logger=None)
            result = model.transcribe(wav_path)
        return result["text"]

    def translate_text(self, text, target_lang="es"):
        """Translate ``text`` into ``target_lang`` using googletrans."""
        translator = Translator()
        translation = translator.translate(text, dest=target_lang)
        return translation.text

    def synthesize_speech(self, text, reference_audio_clip, language=None, output_path=None):
        """Synthesize ``text`` in the voice of ``reference_audio_clip``.

        Args:
            text: text to speak.
            reference_audio_clip: clip whose audio is used as the ``speaker_wav`` reference.
            language: language code forwarded to the TTS model when the model reports
                itself as multilingual; ignored otherwise.
            output_path: where to write the synthesized WAV. When omitted, a temporary
                file is created and the caller must delete it (``clip.filename``)
                after closing the returned clip.

        Returns:
            An open ``AudioFileClip`` reading from the synthesized file. The file is
            deliberately NOT deleted here: the clip reads from it lazily, so it has to
            outlive this call.
        """
        tts = self._get_tts()

        if output_path is None:
            handle, output_path = tempfile.mkstemp(suffix=".wav")
            os.close(handle)

        synth_kwargs = {}
        if language is not None and getattr(tts, "is_multi_lingual", False):
            synth_kwargs["language"] = language

        # The reference recording is only needed while synthesizing, so it lives in a
        # scratch directory that is removed as soon as the output file is written.
        with tempfile.TemporaryDirectory() as workdir:
            reference_path = os.path.join(workdir, "reference.wav")
            reference_audio_clip.write_audiofile(reference_path, logger=None)
            tts.tts_to_file(
                text=text,
                speaker_wav=reference_path,
                file_path=output_path,
                **synth_kwargs
            )
        return AudioFileClip(output_path)

    def process_video(self, input_path, output_path, target_lang="es", tts_language=None):
        """Run the complete pipeline and write the dubbed video to ``output_path``.

        Args:
            input_path: source video file.
            output_path: destination video file.
            target_lang: googletrans language code to translate into.
            tts_language: language code for the TTS model; defaults to ``target_lang``.
                NOTE(review): TTS models use their own codes (the default your_tts
                model appears to accept only "en", "fr-fr" and "pt-br") — confirm the
                chosen model supports the requested language.

        Raises:
            ValueError: if no speech was recognized in the input.
            Exception: any error from the underlying libraries is printed and re-raised.
        """
        if tts_language is None:
            tts_language = target_lang

        try:
            # All intermediate files live in one scratch directory that is removed
            # once every clip reading from it has been closed.
            with tempfile.TemporaryDirectory() as workdir:
                original_audio = None
                new_audio = None
                try:
                    # 1. Extract audio
                    print("Extracting audio...")
                    original_audio = self.extract_audio(input_path)

                    # 2. Transcribe
                    print("Transcribing audio...")
                    original_text = self.transcribe(original_audio)
                    if not original_text or not original_text.strip():
                        raise ValueError("No speech detected in the input video")
                    print(f"Original: {original_text[:100]}...")

                    # 3. Translate
                    print("Translating text...")
                    translated_text = self.translate_text(original_text, target_lang)
                    print(f"Translated: {translated_text[:100]}...")

                    # 4. Synthesize
                    print("Synthesizing new audio...")
                    new_audio = self.synthesize_speech(
                        translated_text,
                        original_audio,
                        language=tts_language,
                        output_path=os.path.join(workdir, "dubbed.wav"),
                    )

                    # 5. Combine with video
                    print("Creating output video...")
                    with VideoFileClip(input_path) as video:
                        final = video.set_audio(new_audio)
                        final.write_videofile(
                            output_path,
                            codec="libx264",
                            audio_codec="aac",
                            threads=4,
                            verbose=False
                        )
                finally:
                    # Close readers before the scratch directory is deleted.
                    for clip in (new_audio, original_audio):
                        if clip is not None:
                            clip.close()

            print(f"Success! Output saved to {output_path}")

        except Exception as e:
            print(f"Error: {str(e)}")
            raise

# Example usage
if __name__ == "__main__":
    # Download sample video
    !wget -q -O input.mp4 https://sample-videos.com/video123/mp4/720/big_buck_bunny_720p_1mb.mp4

    translator = SimpleVideoTranslator()
    translator.process_video("input.mp4", "output.mp4", "es")  # Translate to Spanish

Collecting googletrans==4.0.0-rc1
  Downloading googletrans-4.0.0rc1.tar.gz (20 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting httpx==0.13.3 (from googletrans==4.0.0-rc1)
  Downloading httpx-0.13.3-py3-none-any.whl.metadata (25 kB)
Collecting hstspreload (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading hstspreload-2025.1.1-py3-none-any.whl.metadata (2.1 kB)
Collecting chardet==3.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading chardet-3.0.4-py2.py3-none-any.whl.metadata (3.2 kB)
Collecting idna==2.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading idna-2.10-py2.py3-none-any.whl.metadata (9.1 kB)
Collecting rfc3986<2,>=1.3 (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading rfc3986-1.5.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting httpcore==0.9.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading httpcore-0.9.1-py3-none-any.whl.metadata (4.6 kB)
Collecting h11<0.10,>=0.8 (from httpcore==0.9.*->httpx==0.13.3->googl

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
^C
