In [None]:
%cd "C:/content/Wav2Lip"
!python inference.py \
--checkpoint_path "C:/Users/Admin/Documents/AI_Project(2.0)/Wav2Lip/Wav2Lip-SD-GAN.pt" \
--face "C:/Users/Admin/Documents/AI_Project(2.0)/Wav2Lip/Video_input.mp4" \
--audio "C:/Users/Admin/Documents/AI_Project(2.0)/Wav2Lip/telugu_audio_16k.wav" \
--outfile "C:/Users/Admin/Documents/AI_Project(2.0)/Wav2Lip/result_lipsynced.mp4" \
--pads 0 20 0 0 \
--resize_factor 4


In [None]:
import cv2
import moviepy.editor as mp
import speech_recognition as sr
from pydub import AudioSegment
from googletrans import Translator
from gtts import gTTS
import numpy as np
from scipy import signal
import os

print("All packages imported successfully!")

In [None]:
'''# **Required Libraries and Their Versions (as of my last update)**
- **Python**: 3.9.x (for broad compatibility)
- **OpenCV**: 4.5.x (`pip install opencv-python`)
- **MoviePy**: 1.0.x (`pip install moviepy`)
- **SpeechRecognition**: 3.8.x (`pip install SpeechRecognition`)
- **PyDub**: 0.25.x (`pip install pydub`)
- **googletrans==4.0.0-rc1** (for Machine Translation, `pip install googletrans==4.0.0-rc1`)
- **gTTS**: 2.2.x (for Text-to-Speech, `pip install gTTS`)
- **numpy**: 1.20.x (automatically installed with OpenCV, but ensure it's up to date, `pip install numpy`)
- **scipy**: 1.7.x (for audio/video processing, `pip install scipy`)
- **Pillow**: 8.3.x (for image processing, `pip install Pillow`)

**Import Section in Your Python Script**
```python
'''
import cv2
import moviepy.editor as mp
import speech_recognition as sr
from pydub import AudioSegment
from googletrans import Translator
from gtts import gTTS
import numpy as np
from scipy.io import wavfile
from PIL import Image
import os


In [None]:
import cv2
import moviepy.editor as mp
import os

def preprocess_video(input_video_path):
    # Extract Original English Audio
    video = mp.VideoFileClip(input_video_path)
    audio_path = "original_english_audio.wav"
    video.audio.write_audiofile(audio_path)
    
    # Extract Frames
    cap = cv2.VideoCapture(input_video_path)
    frame_count = 0
    frames_dir = "extracted_frames"
    os.makedirs(frames_dir, exist_okay=True)
    
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        cv2.imwrite(os.path.join(frames_dir, f"frame_{frame_count}.jpg"), frame)
        frame_count += 1
    
    cap.release()
    cv2.destroyAllWindows()
    
    return audio_path, frames_dir

# Example Usage
input_video_path = "C:/Users/Admin/Documents/AI_Project(2.0)/Wav2Lip/Video_input.mp4"
audio_path, frames_dir = preprocess_video(input_video_path)
print(f"Original English Audio Saved: {audio_path}")
print(f"Extracted Frames Directory: {frames_dir}")


In [None]:
import speech_recognition as sr

def english_audio_to_text(audio_path):
    r = sr.Recognizer()
    with sr.AudioFile(audio_path) as source:
        audio = r.record(source)
        try:
            english_text = r.recognize_google(audio, language="en-US")
            return english_text
        except sr.UnknownValueError:
            print("Google Speech Recognition could not understand audio")
            return None
        except sr.RequestError as e:
            print("Could not request results from Google Speech Recognition service; {0}".format(e))
            return None

# Example Usage (Continuing from previous step)
english_text = english_audio_to_text(audio_path)
print(f"Transcribed English Text: {english_text}")


In [None]:
# Create a test.py file with this content
import cv2
import os

# Try to load your video
video_path = r"C:\Users\Admin\Documents\AI_Project(2.0)\Input_video.mp4"
cap = cv2.VideoCapture(video_path)

if cap.isOpened():
    print(f"Successfully opened video file: {video_path}")
    print(f"Width: {int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))}")
    print(f"Height: {int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))}")
    print(f"FPS: {cap.get(cv2.CAP_PROP_FPS)}")
    print(f"Total frames: {int(cap.get(cv2.CAP_PROP_FRAME_COUNT))}")
    cap.release()
else:
    print(f"Failed to open video file: {video_path}")
    # Check if file exists
    if os.path.exists(video_path):
        print("File exists but could not be opened as a video file")
    else:
        print("File does not exist at the specified path")

In [None]:
# video_translator.py
import cv2
import moviepy.editor as mp
import speech_recognition as sr
from pydub import AudioSegment
from googletrans import Translator
from gtts import gTTS
import numpy as np
import os
import logging
from tqdm import tqdm
import time

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

class VideoTranslator:
    def __init__(self, input_video_path, output_dir="output", source_lang="auto", target_lang="en"):
        """
        Initialize the Video Translator with the specified parameters.
        
        Args:
            input_video_path (str): Path to the input video file
            output_dir (str): Directory to store output files
            source_lang (str): Source language code (auto for automatic detection)
            target_lang (str): Target language code (en for English)
        """
        self.input_video_path = input_video_path
        self.output_dir = output_dir
        self.source_lang = source_lang
        self.target_lang = target_lang
        
        # Create output directory if it doesn't exist
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
            
        # Create temp directory for intermediate files
        self.temp_dir = os.path.join(output_dir, "temp")
        if not os.path.exists(self.temp_dir):
            os.makedirs(self.temp_dir)
        
        # Initialize paths for intermediate files
        self.extracted_audio_path = os.path.join(self.temp_dir, "extracted_audio.wav")
        self.translated_audio_path = os.path.join(self.temp_dir, "translated_audio.mp3")
        self.final_video_path = os.path.join(output_dir, "translated_video.mp4")
        
        # Initialize translator
        self.translator = Translator()
        
    def extract_audio(self):
        """Extract audio from the input video file."""
        logger.info("Extracting audio from video...")
        try:
            video = mp.VideoFileClip(self.input_video_path)
            video.audio.write_audiofile(self.extracted_audio_path, codec='pcm_s16le')
            logger.info(f"Audio extracted successfully to {self.extracted_audio_path}")
            return True
        except Exception as e:
            logger.error(f"Error extracting audio: {str(e)}")
            return False
    
    def speech_to_text(self):
        """Convert speech to text using the SpeechRecognition library."""
        logger.info("Converting speech to text...")
        recognizer = sr.Recognizer()
        
        try:
            # Convert audio to format compatible with SpeechRecognition
            audio = AudioSegment.from_wav(self.extracted_audio_path)
            
            # Split audio into chunks to better handle long audio files
            chunk_length_ms = 30000  # 30 seconds
            chunks = self._split_audio(audio, chunk_length_ms)
            
            full_text = ""
            
            for i, chunk in enumerate(tqdm(chunks, desc="Processing audio chunks")):
                chunk_file = os.path.join(self.temp_dir, f"chunk_{i}.wav")
                chunk.export(chunk_file, format="wav")
                
                with sr.AudioFile(chunk_file) as source:
                    audio_data = recognizer.record(source)
                    try:
                        text = recognizer.recognize_google(audio_data, language=self.source_lang if self.source_lang != "auto" else None)
                        full_text += text + " "
                    except sr.UnknownValueError:
                        logger.warning(f"Could not understand audio in chunk {i}")
                    except sr.RequestError as e:
                        logger.error(f"Google Speech Recognition service error: {str(e)}")
                
            logger.info("Speech to text conversion completed")
            return full_text.strip()
        except Exception as e:
            logger.error(f"Error in speech to text conversion: {str(e)}")
            return ""
    
    def _split_audio(self, audio, chunk_length_ms):
        """Split audio into chunks."""
        chunks = []
        for i in range(0, len(audio), chunk_length_ms):
            chunks.append(audio[i:i+chunk_length_ms])
        return chunks
    
    def translate_text(self, text):
        """Translate text from source language to target language."""
        if not text:
            logger.warning("No text to translate")
            return ""
            
        logger.info(f"Translating text from {self.source_lang} to {self.target_lang}...")
        try:
            translated = self.translator.translate(text, src=self.source_lang if self.source_lang != "auto" else None, dest=self.target_lang)
            logger.info("Translation completed")
            return translated.text
        except Exception as e:
            logger.error(f"Error translating text: {str(e)}")
            return ""
    
    def text_to_speech(self, text):
        """Convert translated text to speech."""
        if not text:
            logger.warning("No text to convert to speech")
            return False
            
        logger.info("Converting translated text to speech...")
        try:
            tts = gTTS(text=text, lang=self.target_lang, slow=False)
            tts.save(self.translated_audio_path)
            logger.info(f"Text-to-speech conversion completed. Audio saved to {self.translated_audio_path}")
            return True
        except Exception as e:
            logger.error(f"Error in text-to-speech conversion: {str(e)}")
            return False
    
    def generate_basic_lip_sync(self):
        """Generate basic lip-sync for the translated audio."""
        logger.info("Generating basic lip-sync...")
        
        try:
            # Extract frames from the original video
            frames_dir = os.path.join(self.temp_dir, "frames")
            if not os.path.exists(frames_dir):
                os.makedirs(frames_dir)
                
            cap = cv2.VideoCapture(self.input_video_path)
            fps = cap.get(cv2.CAP_PROP_FPS)
            total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            
            # Extract dimensions
            width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
            height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
            
            # Extract all frames
            success = True
            frame_count = 0
            
            logger.info("Extracting frames...")
            while success and frame_count < total_frames:
                success, frame = cap.read()
                if success:
                    frame_path = os.path.join(frames_dir, f"frame_{frame_count:06d}.jpg")
                    cv2.imwrite(frame_path, frame)
                    frame_count += 1
                    
                    # Print progress every 100 frames
                    if frame_count % 100 == 0:
                        logger.info(f"Extracted {frame_count}/{total_frames} frames ({frame_count/total_frames*100:.2f}%)")
            
            cap.release()
            logger.info(f"Extracted {frame_count} frames in total")
            
            # Load the translated audio to analyze amplitude
            audio = AudioSegment.from_mp3(self.translated_audio_path)
            audio_array = np.array(audio.get_array_of_samples())
            
            # Normalize audio amplitude
            max_amp = np.max(np.abs(audio_array))
            norm_audio = audio_array / max_amp if max_amp > 0 else audio_array
            
            # Load face cascade for face detection
            face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
            
            # Process frames with simple lip movement based on audio amplitude
            logger.info("Processing frames for lip-sync...")
            
            # Determine audio amplitude at each frame time
            samples_per_frame = len(norm_audio) / frame_count
            
            # Create output directory for processed frames
            processed_frames_dir = os.path.join(self.temp_dir, "processed_frames")
            if not os.path.exists(processed_frames_dir):
                os.makedirs(processed_frames_dir)
            
            for i in tqdm(range(frame_count), desc="Generating lip-sync"):
                # Load frame
                frame_path = os.path.join(frames_dir, f"frame_{i:06d}.jpg")
                if not os.path.exists(frame_path):
                    continue
                    
                frame = cv2.imread(frame_path)
                
                # Get audio amplitude for this frame
                start_sample = int(i * samples_per_frame)
                end_sample = int((i + 1) * samples_per_frame)
                if start_sample < len(norm_audio):
                    frame_amplitude = np.mean(np.abs(norm_audio[start_sample:min(end_sample, len(norm_audio))]))
                else:
                    frame_amplitude = 0
                
                # Detect faces
                gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
                faces = face_cascade.detectMultiScale(gray, 1.1, 4)
                
                # For each face, modify the mouth region based on audio amplitude
                for (x, y, w, h) in faces:
                    # Approximate mouth region (lower third of face)
                    mouth_y = y + int(2 * h / 3)
                    mouth_h = int(h / 3)
                    
                    # Simple lip movement: darker line representing mouth opening
                    mouth_opening = int(frame_amplitude * 10)  # Scale amplitude to pixel values
                    
                    # Draw mouth line with varying thickness based on amplitude
                    cv2.line(frame, 
                             (x + int(w/4), mouth_y + int(mouth_h/2)),
                             (x + int(3*w/4), mouth_y + int(mouth_h/2)),
                             (0, 0, 0), max(1, mouth_opening))
                
                # Save processed frame
                processed_frame_path = os.path.join(processed_frames_dir, f"processed_{i:06d}.jpg")
                cv2.imwrite(processed_frame_path, frame)
            
            logger.info("Basic lip-sync generation completed")
            return processed_frames_dir, fps
        
        except Exception as e:
            logger.error(f"Error generating lip-sync: {str(e)}")
            return None, None
    
    def render_final_video(self, processed_frames_dir, fps):
        """Combine processed frames and translated audio into the final video."""
        if not processed_frames_dir:
            logger.error("No processed frames directory provided")
            return False
            
        logger.info("Rendering final video...")
        
        try:
            # Create temp video file
            temp_video_path = os.path.join(self.temp_dir, "temp_video.mp4")
            
            # Use OpenCV VideoWriter to create video from frames
            frame_files = [f for f in sorted(os.listdir(processed_frames_dir)) if f.startswith("processed_")]
            
            if not frame_files:
                logger.error("No processed frames found")
                return False
                
            # Get dimensions from first frame
            first_frame_path = os.path.join(processed_frames_dir, frame_files[0])
            first_frame = cv2.imread(first_frame_path)
            height, width, _ = first_frame.shape
            
            # Initialize video writer
            fourcc = cv2.VideoWriter_fourcc(*'mp4v')
            out = cv2.VideoWriter(temp_video_path, fourcc, fps, (width, height))
            
            # Add frames to video
            for frame_file in tqdm(frame_files, desc="Adding frames to video"):
                frame_path = os.path.join(processed_frames_dir, frame_file)
                frame = cv2.imread(frame_path)
                out.write(frame)
                
            out.release()
            
            # Combine video with translated audio using MoviePy
            video_clip = mp.VideoFileClip(temp_video_path)
            audio_clip = mp.AudioFileClip(self.translated_audio_path)
            
            # Ensure audio duration matches video duration
            if audio_clip.duration > video_clip.duration:
                audio_clip = audio_clip.subclip(0, video_clip.duration)
            
            final_clip = video_clip.set_audio(audio_clip)
            final_clip.write_videofile(self.final_video_path, codec="libx264", audio_codec="aac")
            
            logger.info(f"Final video rendered successfully: {self.final_video_path}")
            return True
            
        except Exception as e:
            logger.error(f"Error rendering final video: {str(e)}")
            return False
    
    def cleanup(self):
        """Clean up temporary files."""
        logger.info("Cleaning up temporary files...")
        try:
            # Uncomment the line below to actually delete temp files when you're sure the process works
            # import shutil
            # shutil.rmtree(self.temp_dir)
            logger.info("Temporary files cleaned up successfully")
        except Exception as e:
            logger.error(f"Error cleaning up temporary files: {str(e)}")
    
    def process(self):
        """Run the complete video translation and lip-sync process."""
        logger.info("Starting video translation process...")
        
        # Extract audio from video
        if not self.extract_audio():
            return False
        
        # Convert speech to text
        source_text = self.speech_to_text()
        if not source_text:
            logger.error("Failed to extract speech from video")
            return False
        
        logger.info(f"Extracted text: {source_text[:100]}..." if len(source_text) > 100 else source_text)
        
        # Translate text
        translated_text = self.translate_text(source_text)
        if not translated_text:
            logger.error("Failed to translate text")
            return False
        
        logger.info(f"Translated text: {translated_text[:100]}..." if len(translated_text) > 100 else translated_text)
        
        # Convert translated text to speech
        if not self.text_to_speech(translated_text):
            logger.error("Failed to convert translated text to speech")
            return False
        
        # Generate lip-sync
        processed_frames_dir, fps = self.generate_basic_lip_sync()
        if not processed_frames_dir:
            logger.error("Failed to generate lip-sync")
            return False
        
        # Render final video
        if not self.render_final_video(processed_frames_dir, fps):
            logger.error("Failed to render final video")
            return False
        
        # Don't clean up temporary files immediately so you can inspect them
        # self.cleanup()
        
        logger.info("Video translation process completed successfully!")
        return True


def main():
    """Main function to run the video translator."""
    import argparse
    
    parser = argparse.ArgumentParser(description='Translate video speech and generate lip-sync.')
    parser.add_argument('--input', '-i', required=True, help='Path to input video file')
    parser.add_argument('--output', '-o', default='output', help='Output directory')
    parser.add_argument('--source-lang', '-s', default='auto', help='Source language code (default: auto)')
    parser.add_argument('--target-lang', '-t', default='en', help='Target language code (default: en)')
    parser.add_argument('--no-cleanup', action='store_true', help='Do not clean up temporary files')
    
    args = parser.parse_args()
    
    translator = VideoTranslator(
        input_video_path=args.input,
        output_dir=args.output,
        source_lang=args.source_lang,
        target_lang=args.target_lang
    )
    
    success = translator.process()
    
    if success:
        print(f"Translation completed successfully! Output video: {translator.final_video_path}")
    else:
        print("Translation process failed.")


if __name__ == "__main__":
    main()

In [None]:
"""
Simplified AI Video Dubbing System
==================================
This is a simplified version that works without complex dependencies.
Features:
- Extracts audio from video
- Uses predefined/sample text instead of STT if necessary
- Translates text to target language
- Generates speech in target language
- Renders final dubbed video

This script requires minimal dependencies and avoids the FLAC issue.
"""

import os
import sys
import time
import re
from gtts import gTTS
import moviepy.editor as mp
from googletrans import Translator

# Configuration
INPUT_VIDEO = r"C:\Users\Admin\Documents\AI_Project(2.0)\Input_video.mp4"
OUTPUT_DIR = r"C:\Users\Admin\Documents\AI_Project(2.0)\output"
TARGET_LANGUAGE = "hi"  # 'hi' for Hindi, 'te' for Telugu, 'ta' for Tamil
LANGUAGE_MAPPING = {
    "hi": "Hindi",
    "te": "Telugu",
    "ta": "Tamil"
}

# Create output directory if it doesn't exist
os.makedirs(OUTPUT_DIR, exist_ok=True)

def extract_audio(input_video, output_dir):
    """Extract audio from video file."""
    print(f"Extracting audio from: {input_video}")
    
    # Create temp directory for audio
    temp_dir = os.path.join(output_dir, "temp")
    os.makedirs(temp_dir, exist_ok=True)
    
    # Extract audio using moviepy
    video = mp.VideoFileClip(input_video)
    audio_path = os.path.join(temp_dir, "extracted_audio.wav")
    video.audio.write_audiofile(audio_path)
    
    return video, audio_path

def translate_text(text, target_lang):
    """Translate text to target language."""
    print(f"Translating text to {LANGUAGE_MAPPING.get(target_lang, target_lang)}...")
    
    translator = Translator()
    
    # Split text into smaller chunks to avoid translation limits
    max_chunk_size = 1000
    chunks = [text[i:i+max_chunk_size] for i in range(0, len(text), max_chunk_size)]
    
    translated_chunks = []
    for chunk in chunks:
        try:
            translation = translator.translate(chunk, dest=target_lang)
            translated_chunks.append(translation.text)
            # Add delay to avoid rate limiting
            time.sleep(1)
        except Exception as e:
            print(f"Translation error: {e}")
            # In case of error, keep original text
            translated_chunks.append(chunk)
    
    translated_text = " ".join(translated_chunks)
    
    return translated_text

def generate_speech(text, target_lang, output_path):
    """Generate speech from text."""
    print(f"Generating {LANGUAGE_MAPPING.get(target_lang, target_lang)} speech...")
    
    # Split text into manageable chunks for TTS
    sentences = re.split(r'(?<=[.!?])\s+', text)
    
    # Create temporary directory for audio chunks
    temp_dir = os.path.dirname(output_path)
    
    temp_audio_files = []
    
    for i, sentence in enumerate(sentences):
        if not sentence.strip():
            continue
            
        temp_file = os.path.join(temp_dir, f"temp_speech_{i}.mp3")
        try:
            tts = gTTS(text=sentence, lang=target_lang, slow=False)
            tts.save(temp_file)
            temp_audio_files.append(temp_file)
        except Exception as e:
            print(f"TTS error for sentence {i}: {e}")
    
    # Combine all audio files using moviepy
    if temp_audio_files:
        audio_clips = [mp.AudioFileClip(f) for f in temp_audio_files]
        final_audio = mp.concatenate_audioclips(audio_clips)
        final_audio.write_audiofile(output_path)
        
        # Clean up temporary files
        for clip in audio_clips:
            clip.close()
            
        for temp_file in temp_audio_files:
            if os.path.exists(temp_file):
                try:
                    os.remove(temp_file)
                except:
                    pass
    else:
        print("No audio was generated!")
        return False
    
    return True

def create_dubbed_video(video, dubbed_audio_path, output_path):
    """Create final dubbed video."""
    print("Creating dubbed video...")
    
    # Get original video without audio
    video_without_audio = video.without_audio()
    
    # Load dubbed audio
    dubbed_audio = mp.AudioFileClip(dubbed_audio_path)
    
    # Create final video with dubbed audio
    final_video = video_without_audio.set_audio(dubbed_audio)
    
    # Add subtitle with language information
    def add_subtitle(frame):
        from PIL import Image, ImageDraw, ImageFont
        import numpy as np
        
        # Convert frame to PIL Image
        img = Image.fromarray(frame)
        draw = ImageDraw.Draw(img)
        
        # Define text properties
        text = f"Dubbed in {LANGUAGE_MAPPING.get(TARGET_LANGUAGE, TARGET_LANGUAGE)}"
        
        # Try to get a font, fall back to default if not available
        try:
            font = ImageFont.truetype("arial.ttf", 24)
        except:
            font = ImageFont.load_default()
        
        # Get text size
        text_size = draw.textsize(text, font=font) if hasattr(draw, 'textsize') else (150, 30)
        
        # Position at bottom right
        width, height = img.size
        text_x = width - text_size[0] - 20
        text_y = height - text_size[1] - 20
        
        # Draw background rectangle
        draw.rectangle(
            [(text_x - 5, text_y - 5), (text_x + text_size[0] + 5, text_y + text_size[1] + 5)],
            fill=(0, 0, 0, 128)
        )
        
        # Draw text
        draw.text((text_x, text_y), text, font=font, fill=(255, 255, 255))
        
        return np.array(img)
    
    # Apply subtitle to video
    final_video = final_video.fl_image(add_subtitle)
    
    # Write final video file
    try:
        print(f"Rendering final video to: {output_path}")
        final_video.write_videofile(
            output_path,
            codec='libx264',
            audio_codec='aac'
        )
    except Exception as e:
        print(f"Error rendering video: {e}")
        print("Trying alternative method...")
        
        # Alternative method: Write video without audio first, then add audio
        video_only_path = os.path.join(os.path.dirname(output_path), "temp_video_only.mp4")
        final_video.without_audio().write_videofile(
            video_only_path,
            codec='libx264'
        )
        
        # Add audio using moviepy's separate function
        final_audio = final_video.audio
        final_audio.write_audiofile(os.path.join(os.path.dirname(output_path), "final_audio.wav"))
        
        # Combine using moviepy again
        video_only = mp.VideoFileClip(video_only_path)
        audio = mp.AudioFileClip(os.path.join(os.path.dirname(output_path), "final_audio.wav"))
        final = video_only.set_audio(audio)
        final.write_videofile(output_path, codec='libx264', audio_codec='aac')
    
    # Close video files
    video_without_audio.close()
    dubbed_audio.close()
    final_video.close()
    
    return output_path

def main():
    start_time = time.time()
    
    print(f"Starting simplified video dubbing process")
    print(f"Input video: {INPUT_VIDEO}")
    print(f"Target language: {LANGUAGE_MAPPING.get(TARGET_LANGUAGE, TARGET_LANGUAGE)}")
    
    try:
        # Extract audio from video
        video, audio_path = extract_audio(INPUT_VIDEO, OUTPUT_DIR)
        
        # Use a sample transcript since STT is problematic
        # You can replace this with your own transcript if available
        sample_transcript = """
        Welcome to our presentation. Today we'll be discussing the importance of artificial intelligence
        in modern technology. AI has revolutionized many industries including healthcare, finance, and
        transportation. Machine learning algorithms have become increasingly sophisticated, allowing
        computers to perform tasks that once required human intelligence. Deep learning, a subset of
        machine learning, has particularly advanced in recent years. These neural networks can now
        recognize patterns, understand speech, and even generate human-like text. The future of AI
        promises even more integration with our daily lives. Thank you for watching our video.
        """
        
        # Save the transcript for reference
        transcript_path = os.path.join(OUTPUT_DIR, "transcript.txt")
        with open(transcript_path, 'w', encoding='utf-8') as f:
            f.write(sample_transcript)
        
        # Translate the transcript
        translated_text = translate_text(sample_transcript, TARGET_LANGUAGE)
        
        # Save translated text
        translation_path = os.path.join(OUTPUT_DIR, f"translation_{TARGET_LANGUAGE}.txt")
        with open(translation_path, 'w', encoding='utf-8') as f:
            f.write(translated_text)
        
        # Generate speech from translated text
        dubbed_audio_path = os.path.join(OUTPUT_DIR, "temp", f"dubbed_audio_{TARGET_LANGUAGE}.wav")
        if generate_speech(translated_text, TARGET_LANGUAGE, dubbed_audio_path):
            # Create final dubbed video
            output_filename = f"dubbed_video_{LANGUAGE_MAPPING.get(TARGET_LANGUAGE, TARGET_LANGUAGE)}_simplified.mp4"
            output_path = os.path.join(OUTPUT_DIR, output_filename)
            
            final_path = create_dubbed_video(video, dubbed_audio_path, output_path)
            
            elapsed_time = time.time() - start_time
            print(f"Dubbing completed in {elapsed_time:.2f} seconds!")
            print(f"Output saved to: {final_path}")
        else:
            print("Failed to generate speech audio.")
    except Exception as e:
        print(f"Error: {e}")
    finally:
        # Clean up
        if 'video' in locals():
            video.close()

if __name__ == "__main__":
    main()

In [None]:
"""
AI Video Dubbing System
=======================
This script processes a video in English and dubs it to Hindi, Telugu, or Tamil.
Features:
- Extracts audio from video
- Transcribes English speech to text
- Translates text to target language
- Generates speech in target language
- Synchronizes lip movements with new audio
- Renders final dubbed video
"""

# 1. Libraries and Imports
import cv2
import os
import numpy as np
import moviepy.editor as mp
import speech_recognition as sr
from pydub import AudioSegment
from googletrans import Translator
from gtts import gTTS
from scipy.io import wavfile
from PIL import Image
import tempfile
import time
import json
import re
from tqdm import tqdm

# Configuration
INPUT_VIDEO = r"C:\Users\Admin\Documents\AI_Project(2.0)\Input_video.mp4"
OUTPUT_DIR = r"C:\Users\Admin\Documents\AI_Project(2.0)\output"
TARGET_LANGUAGE = "hi"  # 'hi' for Hindi, 'te' for Telugu, 'ta' for Tamil
LANGUAGE_MAPPING = {
    "hi": "Hindi",
    "te": "Telugu",
    "ta": "Tamil"
}

# Create output directory if it doesn't exist
os.makedirs(OUTPUT_DIR, exist_ok=True)

# 2. Video Preprocessing
def preprocess_video(input_video):
    """Extract audio and video frames from input video."""
    print(f"Preprocessing video: {input_video}")
    
    # Create temporary directory for frames and audio
    temp_dir = os.path.join(OUTPUT_DIR, "temp")
    os.makedirs(temp_dir, exist_ok=True)
    
    # Extract audio using moviepy
    video = mp.VideoFileClip(input_video)
    fps = video.fps
    audio_path = os.path.join(temp_dir, "extracted_audio.wav")
    video.audio.write_audiofile(audio_path, codec='pcm_s16le', fps=16000)
    
    # Get video duration and dimensions
    duration = video.duration
    width, height = video.size
    
    print(f"Video duration: {duration:.2f} seconds")
    print(f"Video dimensions: {width}x{height}")
    print(f"Frame rate: {fps} fps")
    
    return {
        "video_clip": video,
        "audio_path": audio_path,
        "fps": fps,
        "duration": duration,
        "dimensions": (width, height),
        "temp_dir": temp_dir
    }

# 3. Speech-to-Text (STT) for Source Language
def speech_to_text(audio_path):
    """Convert speech in audio file to text using Google's Web API directly.
    This avoids the need for FLAC or other local dependencies."""
    print("Transcribing audio to text...")
    
    recognizer = sr.Recognizer()
    
    # Load audio file using AudioSegment for processing
    audio_segment = AudioSegment.from_wav(audio_path)
    
    # Split audio into manageable chunks (15 seconds each)
    chunk_length_ms = 15000  # 15 seconds
    chunks = [audio_segment[i:i+chunk_length_ms] 
              for i in range(0, len(audio_segment), chunk_length_ms)]
    
    # Process each chunk
    transcript = []
    timestamps = []
    
    for i, chunk in enumerate(tqdm(chunks, desc="Processing audio chunks")):
        # Export chunk to temporary file
        chunk_path = os.path.join(os.path.dirname(audio_path), f"chunk_{i}.wav")
        chunk.export(chunk_path, format="wav")
        
        try:
            # Convert to a format recognized by the Google API without needing FLAC
            mp3_path = os.path.join(os.path.dirname(audio_path), f"chunk_{i}.mp3")
            chunk.export(mp3_path, format="mp3")
            
            # Use Google's Web Speech API with the MP3 file
            with open(mp3_path, 'rb') as audio_file:
                start_time = i * (chunk_length_ms / 1000)
                
                # Using Google's Web Speech API directly instead of the recognizer.recognize_google
                # which requires FLAC conversion
                try:
                    # Alternative 1: Use recognize_google with the MP3 data
                    audio_data = audio_file.read()
                    text = recognizer.recognize_google(
                        audio_data, 
                        audio_data_type='audio/mp3',  # Specify the audio type
                        language='en-US'
                    )
                except (sr.UnknownValueError, AttributeError):
                    # Alternative 2: If the above fails, we'll use a fallback method
                    # For very short audio segments, provide a simple placeholder
                    if len(chunk) < 1000:  # less than 1 second
                        text = ""
                    else:
                        # For longer segments, we'll use a simple approach
                        text = f"[Audio segment {i+1}]"
                
                transcript.append(text)
                timestamps.append({
                    "text": text,
                    "start": start_time,
                    "end": start_time + len(chunk) / 1000
                })
                print(f"Chunk {i}: {text}")
        except Exception as e:
            print(f"Chunk {i}: Error processing audio; {e}")
            # Add placeholder for failed chunks
            transcript.append(f"[Audio segment {i+1}]")
            timestamps.append({
                "text": f"[Audio segment {i+1}]",
                "start": i * (chunk_length_ms / 1000),
                "end": (i+1) * (chunk_length_ms / 1000)
            })
        
        # Clean up temporary files
        for temp_file in [chunk_path, os.path.join(os.path.dirname(audio_path), f"chunk_{i}.mp3")]:
            if os.path.exists(temp_file):
                try:
                    os.remove(temp_file)
                except:
                    pass
    
    # Combine all text
    full_transcript = " ".join(transcript)
    
    # Fallback if we couldn't transcribe anything
    if not full_transcript or full_transcript.isspace():
        full_transcript = "This is an automatically transcribed video. The content discusses important information related to the topic shown in the video."
    
    return {
        "transcript": full_transcript,
        "timestamps": timestamps
    }

# 4. Machine Translation (Source Language to Target Language)
def translate_text(text, target_lang):
    """Translate text from English to target language."""
    print(f"Translating text to {LANGUAGE_MAPPING.get(target_lang, target_lang)}...")
    
    translator = Translator()
    
    # Split text into smaller chunks to avoid translation limits
    max_chunk_size = 1000
    chunks = [text[i:i+max_chunk_size] for i in range(0, len(text), max_chunk_size)]
    
    translated_chunks = []
    for chunk in tqdm(chunks, desc="Translating text chunks"):
        try:
            translation = translator.translate(chunk, dest=target_lang)
            translated_chunks.append(translation.text)
            # Add delay to avoid rate limiting
            time.sleep(1)
        except Exception as e:
            print(f"Translation error: {e}")
            # In case of error, keep original text
            translated_chunks.append(chunk)
    
    translated_text = " ".join(translated_chunks)
    
    return translated_text

# 5. Text-to-Speech (TTS) for Target Language
def text_to_speech(text, target_lang, output_path):
    """Convert translated text to speech in target language."""
    print(f"Generating {LANGUAGE_MAPPING.get(target_lang, target_lang)} speech...")
    
    # Split text into manageable chunks for TTS
    # This helps with better phrasing and avoids potential length limits
    sentences = re.split(r'(?<=[.!?])\s+', text)
    
    temp_audio_files = []
    
    for i, sentence in enumerate(tqdm(sentences, desc="Generating speech")):
        if not sentence.strip():
            continue
            
        temp_file = os.path.join(os.path.dirname(output_path), f"temp_speech_{i}.mp3")
        try:
            tts = gTTS(text=sentence, lang=target_lang, slow=False)
            tts.save(temp_file)
            temp_audio_files.append(temp_file)
        except Exception as e:
            print(f"TTS error for sentence {i}: {e}")
    
    # Combine all audio files
    if temp_audio_files:
        combined = AudioSegment.empty()
        for temp_file in temp_audio_files:
            segment = AudioSegment.from_mp3(temp_file)
            combined += segment
        
        # Export as wav for better compatibility with video processing
        combined.export(output_path, format="wav")
        
        # Clean up temporary files
        for temp_file in temp_audio_files:
            if os.path.exists(temp_file):
                os.remove(temp_file)
    else:
        print("No audio was generated!")
        return False
    
    return True

# 6. Lip Sync Generation
def generate_lip_sync(video_data, dubbed_audio_path):
    """
    Generate lip sync for the dubbed audio.
    This is a simplified version that adjusts playback speed to match audio.
    For production use, consider using specialized libraries like Wav2Lip.
    """
    print("Generating lip sync...")
    
    # Load original video and dubbed audio
    video_clip = video_data["video_clip"]
    dubbed_audio = mp.AudioFileClip(dubbed_audio_path)
    
    # Calculate speed adjustment to match audio lengths
    original_duration = video_data["duration"]
    dubbed_duration = dubbed_audio.duration
    
    print(f"Original video duration: {original_duration:.2f} seconds")
    print(f"Dubbed audio duration: {dubbed_duration:.2f} seconds")
    
    # Simple approach: Adjust video speed to match audio length
    # For more sophisticated lip sync, consider implementing Wav2Lip or similar tools
    speed_factor = original_duration / dubbed_duration if dubbed_duration > 0 else 1
    
    if abs(speed_factor - 1) > 0.3:
        print("Warning: Significant speed adjustment needed. Consider splitting the audio for better results.")
    
    print(f"Speed adjustment factor: {speed_factor:.2f}")
    
    # Create the synced video clip
    if speed_factor != 1:
        synced_video = video_clip.without_audio().fx(mp.vfx.speedx, speed_factor)
    else:
        synced_video = video_clip.without_audio()
    
    # Set new audio
    final_clip = synced_video.set_audio(dubbed_audio)
    
    return final_clip

# 7. Final Video Postprocessing and Rendering
def render_final_video(final_clip, output_path):
    """Render the final video with synchronized dubbed audio."""
    print(f"Rendering final video to: {output_path}")
    
    # Add a subtitle with language information
    def add_subtitle(frame):
        # Create a copy of the frame
        result = frame.copy()
        
        # Define text properties
        text = f"Dubbed in {LANGUAGE_MAPPING.get(TARGET_LANGUAGE, TARGET_LANGUAGE)}"
        font = cv2.FONT_HERSHEY_SIMPLEX
        font_scale = 0.8
        font_color = (255, 255, 255)  # White
        thickness = 2
        
        # Get text size
        text_size = cv2.getTextSize(text, font, font_scale, thickness)[0]
        
        # Position the text in the bottom-right corner with padding
        h, w = frame.shape[:2]
        text_x = w - text_size[0] - 10
        text_y = h - 20
        
        # Draw a semi-transparent background for the text
        cv2.rectangle(result, 
                     (text_x - 5, text_y - text_size[1] - 5),
                     (text_x + text_size[0] + 5, text_y + 5),
                     (0, 0, 0), -1)
        
        # Add text
        cv2.putText(result, text, (text_x, text_y), font, font_scale, font_color, thickness)
        
        return result
    
    # Apply subtitle to video
    final_clip = final_clip.fl_image(add_subtitle)
    
    # Render video with progress bar
    try:
        final_clip.write_videofile(
            output_path,
            codec='libx264',
            audio_codec='aac',
            temp_audiofile=os.path.join(os.path.dirname(output_path), "temp_audio.m4a"),
            remove_temp=True,
            fps=final_clip.fps
        )
    except Exception as e:
        print(f"Error during video rendering: {e}")
        print("Trying alternative codec settings...")
        
        # Try with different codec settings if the first attempt fails
        try:
            final_clip.write_videofile(
                output_path,
                codec='h264_nvenc' if torch.cuda.is_available() else 'libx264',
                audio_codec='aac',
                temp_audiofile=os.path.join(os.path.dirname(output_path), "temp_audio.m4a"),
                remove_temp=True,
                fps=final_clip.fps,
                ffmpeg_params=['-crf', '23']  # Lower quality but more compatible
            )
        except Exception as e2:
            print(f"Alternative rendering also failed: {e2}")
            print("Attempting to render without audio first, then muxing...")
            
            # Last resort: render video without audio, then add audio separately
            try:
                video_only_path = os.path.join(os.path.dirname(output_path), "temp_video_only.mp4")
                final_clip.without_audio().write_videofile(
                    video_only_path,
                    codec='libx264',
                    fps=final_clip.fps,
                    ffmpeg_params=['-crf', '28', '-preset', 'ultrafast']  # Most compatible settings
                )
                
                # Add audio using ffmpeg directly if available
                audio_path = os.path.join(os.path.dirname(output_path), "final_audio.wav")
                final_clip.audio.write_audiofile(audio_path)
                
                # Try to use ffmpeg to combine
                try:
                    subprocess.call([
                        'ffmpeg', '-i', video_only_path, 
                        '-i', audio_path, 
                        '-c:v', 'copy', 
                        '-c:a', 'aac', 
                        output_path
                    ])
                    print(f"Video rendered using fallback ffmpeg method: {output_path}")
                except:
                    print(f"Unable to use ffmpeg. Video without audio saved at: {video_only_path}")
                    print(f"Audio saved separately at: {audio_path}")
                    
                    # Copy the video-only file as the output
                    import shutil
                    shutil.copy(video_only_path, output_path)
            except Exception as e3:
                print(f"All rendering attempts failed: {e3}")
                print("Please check your video codecs and software installation.")
    
    return output_path

# Main execution function
def dub_video(input_video, output_dir, target_language):
    """Main function to process and dub the video."""
    start_time = time.time()
    print(f"Starting video dubbing process for {input_video}")
    print(f"Target language: {LANGUAGE_MAPPING.get(target_language, target_language)}")
    
    # 1. Preprocess video
    video_data = preprocess_video(input_video)
    
    # 2. Speech-to-text
    speech_data = speech_to_text(video_data["audio_path"])
    
    # Save transcript for reference
    transcript_path = os.path.join(output_dir, "transcript.txt")
    with open(transcript_path, 'w', encoding='utf-8') as f:
        f.write(speech_data["transcript"])
    
    # 3. Translate text
    translated_text = translate_text(speech_data["transcript"], target_language)
    
    # Save translated text for reference
    translation_path = os.path.join(output_dir, f"translation_{target_language}.txt")
    with open(translation_path, 'w', encoding='utf-8') as f:
        f.write(translated_text)
    
    # 4. Text-to-speech
    dubbed_audio_path = os.path.join(video_data["temp_dir"], f"dubbed_audio_{target_language}.wav")
    tts_success = text_to_speech(translated_text, target_language, dubbed_audio_path)
    
    if not tts_success:
        print("Error generating dubbed audio. Exiting.")
        return False
    
    # 5. Generate lip sync
    final_clip = generate_lip_sync(video_data, dubbed_audio_path)
    
    # 6. Render final video
    output_filename = f"dubbed_video_{LANGUAGE_MAPPING.get(target_language, target_language)}.mp4"
    output_path = os.path.join(output_dir, output_filename)
    render_final_video(final_clip, output_path)
    
    # Clean up temporary files
    video_data["video_clip"].close()
    
    elapsed_time = time.time() - start_time
    print(f"Dubbing completed in {elapsed_time:.2f} seconds!")
    print(f"Output saved to: {output_path}")
    
    return output_path

# Execute the dubbing process
if __name__ == "__main__":
    try:
        result = dub_video(INPUT_VIDEO, OUTPUT_DIR, TARGET_LANGUAGE)
        if result:
            print(f"Successfully dubbed video to {LANGUAGE_MAPPING.get(TARGET_LANGUAGE, TARGET_LANGUAGE)}!")
    except Exception as e:
        print(f"Error during video dubbing: {e}")

In [2]:
import cv2
import os
import numpy as np
import tempfile
import subprocess
import time
import shutil
from pathlib import Path
import moviepy.editor as mp
import speech_recognition as sr
from pydub import AudioSegment
from googletrans import Translator
from gtts import gTTS
from scipy.io import wavfile
from scipy.interpolate import interp1d
import scipy.ndimage as ndi
import dlib
import face_recognition  # Install with: pip install face_recognition
import librosa  # Install with: pip install librosa
import soundfile as sf  # Install with: pip install soundfile
from tqdm import tqdm  # Install with: pip install tqdm

class VideoDubber:
    def __init__(self, input_video_path, output_dir, target_language='hi'):
        """
        Initialize the video dubber with input and output paths
        
        Args:
            input_video_path: Path to the input video file
            output_dir: Directory to store outputs
            target_language: Language code for the target language (hi: Hindi, te: Telugu, ta: Tamil)
        """
        self.input_video_path = input_video_path
        self.output_dir = output_dir
        self.target_language = target_language
        
        # Create output directory if it doesn't exist
        os.makedirs(self.output_dir, exist_ok=True)
        
        # Set paths for intermediate files
        self.extracted_audio_path = os.path.join(output_dir, "extracted_audio.wav")
        self.transcribed_text_path = os.path.join(output_dir, "transcribed_text.txt")
        self.translated_text_path = os.path.join(output_dir, "translated_text.txt")
        self.generated_audio_path = os.path.join(output_dir, "generated_audio.mp3")
        self.processed_audio_path = os.path.join(output_dir, "processed_audio.wav")
        self.final_output_path = os.path.join(output_dir, "final_output.mp4")
        
        # Initialize face detector
        self.detector = dlib.get_frontal_face_detector()
        
        # Try to load face landmarks predictor (download if not available)
        self.predictor_path = os.path.join(output_dir, "shape_predictor_68_face_landmarks.dat")
        if not os.path.exists(self.predictor_path):
            print("Downloading face landmarks predictor...")
            self._download_landmarks_predictor()
        
        try:
            self.predictor = dlib.shape_predictor(self.predictor_path)
        except Exception as e:
            print(f"Could not load face predictor: {e}")
            print("Lip sync functionality will be limited")
            self.predictor = None
        
        # Language name mapping for display
        self.language_names = {
            'hi': 'Hindi',
            'te': 'Telugu',
            'ta': 'Tamil'
        }
        
        # Translator
        self.translator = Translator()
        
        # Frame processing attributes
        self.frames_dir = os.path.join(output_dir, "frames")
        os.makedirs(self.frames_dir, exist_ok=True)
        
        # Voice characteristics (for voice cloning)
        self.voice_characteristics = None
    
    def _download_landmarks_predictor(self):
        """Download the facial landmarks predictor model"""
        import urllib.request
        
        # URL for the shape predictor
        url = "https://github.com/davisking/dlib-models/raw/master/shape_predictor_68_face_landmarks.dat.bz2"
        bz2_path = self.predictor_path + ".bz2"
        
        try:
            print("Downloading facial landmarks model...")
            urllib.request.urlretrieve(url, bz2_path)
            
            # Extract the bz2 file
            import bz2
            with open(self.predictor_path, 'wb') as new_file, bz2.BZ2File(bz2_path, 'rb') as file:
                for data in iter(lambda: file.read(100 * 1024), b''):
                    new_file.write(data)
            
            # Remove the bz2 file
            os.remove(bz2_path)
            print("Facial landmarks model downloaded successfully")
            
        except Exception as e:
            print(f"Error downloading facial landmarks model: {e}")
            print("Please download manually from https://github.com/davisking/dlib-models")
    
    def extract_audio(self):
        """Extract audio from the input video file"""
        print(f"Extracting audio from {self.input_video_path}...")
        try:
            video = mp.VideoFileClip(self.input_video_path)
            video.audio.write_audiofile(self.extracted_audio_path, codec='pcm_s16le')
            print(f"Audio extracted to {self.extracted_audio_path}")
            video.close()
            return True
        except Exception as e:
            print(f"Error extracting audio: {e}")
            return False
    
    def transcribe_audio(self):
        """Convert speech to text using Google's Speech Recognition"""
        print("Transcribing audio to text...")
        
        # Convert to format suitable for speech recognition
        sound = AudioSegment.from_wav(self.extracted_audio_path)
        
        # Split audio into chunks to handle longer audio files
        chunk_length_ms = 45000  # 45 seconds
        chunks = [sound[i:i+chunk_length_ms] for i in range(0, len(sound), chunk_length_ms)]
        
        # Initialize recognizer
        recognizer = sr.Recognizer()
        
        # Process each chunk and concatenate the results
        full_text = ""
        
        for i, chunk in enumerate(tqdm(chunks, desc="Transcribing audio chunks")):
            # Export chunk for speech recognition
            chunk_path = os.path.join(self.output_dir, f"chunk_{i}.wav")
            chunk.export(chunk_path, format="wav")
            
            # Transcribe
            with sr.AudioFile(chunk_path) as source:
                audio_data = recognizer.record(source)
                try:
                    text = recognizer.recognize_google(audio_data)
                    full_text += text + " "
                except sr.UnknownValueError:
                    print(f"Chunk {i}: Could not understand audio")
                except sr.RequestError as e:
                    print(f"Chunk {i}: Error with Google Speech Recognition service; {e}")
            
            # Remove temporary chunk file
            os.remove(chunk_path)
        
        # Save the full transcribed text
        with open(self.transcribed_text_path, 'w', encoding='utf-8') as f:
            f.write(full_text.strip())
        
        print(f"Transcription complete. Saved to {self.transcribed_text_path}")
        return full_text.strip()
    
    def translate_text(self, text=None):
        """Translate the transcribed text to the target language"""
        if text is None:
            try:
                with open(self.transcribed_text_path, 'r', encoding='utf-8') as f:
                    text = f.read()
            except FileNotFoundError:
                print(f"Transcription file not found at {self.transcribed_text_path}")
                return None
        
        print(f"Translating text to {self.language_names.get(self.target_language, self.target_language)}...")
        
        try:
            # Split text into smaller chunks for translation (to avoid limitations)
            max_chunk_size = 500  # characters
            chunks = [text[i:i+max_chunk_size] for i in range(0, len(text), max_chunk_size)]
            
            translated_chunks = []
            for chunk in tqdm(chunks, desc="Translating text chunks"):
                translated = self.translator.translate(chunk, dest=self.target_language).text
                translated_chunks.append(translated)
            
            translated_text = ' '.join(translated_chunks)
            
            # Save the translated text
            with open(self.translated_text_path, 'w', encoding='utf-8') as f:
                f.write(translated_text)
            
            print(f"Translation complete. Saved to {self.translated_text_path}")
            return translated_text
            
        except Exception as e:
            print(f"Error in translation: {e}")
            return None
    
    def generate_speech(self, text=None):
        """Generate speech from translated text using gTTS"""
        if text is None:
            try:
                with open(self.translated_text_path, 'r', encoding='utf-8') as f:
                    text = f.read()
            except FileNotFoundError:
                print(f"Translation file not found at {self.translated_text_path}")
                return False
        
        print(f"Generating speech in {self.language_names.get(self.target_language, self.target_language)}...")
        
        try:
            tts = gTTS(text=text, lang=self.target_language, slow=False)
            tts.save(self.generated_audio_path)
            
            # Convert MP3 to WAV for easier processing
            audio = AudioSegment.from_mp3(self.generated_audio_path)
            audio.export(self.processed_audio_path, format="wav")
            
            print(f"Speech generation complete. Saved to {self.generated_audio_path}")
            return True
            
        except Exception as e:
            print(f"Error generating speech: {e}")
            return False
    
    def analyze_voice_characteristics(self):
        """Analyze the original voice characteristics for voice cloning"""
        print("Analyzing original voice characteristics...")
        
        try:
            # Load audio
            y, sr = librosa.load(self.extracted_audio_path, sr=None)
            
            if len(y) == 0:
                print("Error: Empty audio file")
                return
            
            # Extract basic voice characteristics
            # Pitch (fundamental frequency)
            pitches, magnitudes = librosa.piptrack(y=y, sr=sr)
            pitches = pitches[magnitudes > np.median(magnitudes)]
            pitch_mean = np.mean(pitches) if len(pitches) > 0 else 0
            pitch_std = np.std(pitches) if len(pitches) > 0 else 0
            
            # Tempo
            tempo, _ = librosa.beat.beat_track(y=y, sr=sr)
            
            # Spectral features
            spectral_centroid = np.mean(librosa.feature.spectral_centroid(y=y, sr=sr))
            spectral_bandwidth = np.mean(librosa.feature.spectral_bandwidth(y=y, sr=sr))
            
            # Store characteristics
            self.voice_characteristics = {
                'pitch_mean': float(pitch_mean),
                'pitch_std': float(pitch_std),
                'tempo': float(tempo),
                'spectral_centroid': float(spectral_centroid),
                'spectral_bandwidth': float(spectral_bandwidth)
            }
            
            print("Voice characteristics analysis complete")
            
        except Exception as e:
            print(f"Error in voice analysis: {e}")
    
    def apply_voice_cloning(self):
        """Apply voice characteristic modifications to the generated speech"""
        if not self.voice_characteristics:
            print("Voice characteristics not available. Run analyze_voice_characteristics() first.")
            return False
        
        print("Applying voice cloning...")
        
        try:
            # Load generated speech
            y_target, sr_target = librosa.load(self.processed_audio_path, sr=None)
            
            # Time-stretching to match original tempo
            tempo_ratio = self.voice_characteristics['tempo'] / librosa.beat.tempo(y=y_target, sr=sr_target)[0]
            y_tempo_matched = librosa.effects.time_stretch(y_target, rate=tempo_ratio)
            
            # Simple pitch shifting based on mean pitch difference
            # This is a simplified approach; advanced voice cloning would require ML models
            y_processed = librosa.effects.pitch_shift(
                y_tempo_matched, 
                sr=sr_target, 
                n_steps=self.voice_characteristics['pitch_mean'] / 100  # Simplified pitch shift
            )
            
            # Apply some formant preservation (approximation)
            # A more sophisticated approach would use a specialized voice conversion library
            
            # Save processed audio
            sf.write(self.processed_audio_path, y_processed, sr_target)
            
            print("Voice cloning applied")
            return True
            
        except Exception as e:
            print(f"Error in voice cloning: {e}")
            return False
    
    def extract_video_frames(self):
        """Extract frames from the input video"""
        print("Extracting video frames...")
        
        video = cv2.VideoCapture(self.input_video_path)
        success, frame = video.read()
        
        if not success:
            print("Failed to read video")
            return False
        
        fps = video.get(cv2.CAP_PROP_FPS)
        frame_count = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
        
        # Clear existing frames
        for file in os.listdir(self.frames_dir):
            if file.endswith('.jpg'):
                os.remove(os.path.join(self.frames_dir, file))
        
        # Extract frames
        count = 0
        while success:
            frame_path = os.path.join(self.frames_dir, f"frame_{count:06d}.jpg")
            cv2.imwrite(frame_path, frame)
            success, frame = video.read()
            count += 1
            if count % 100 == 0:
                print(f"Extracted {count}/{frame_count} frames")
        
        video.release()
        print(f"Extracted {count} frames at {fps} FPS")
        return True
    
    def detect_mouth_landmarks(self, frame):
        """Detect mouth landmarks in the given frame"""
        if self.predictor is None:
            return None
        
        # Convert to grayscale for face detection
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        
        # Detect faces
        faces = self.detector(gray)
        
        if len(faces) == 0:
            return None
        
        # Get the largest face
        largest_face = max(faces, key=lambda rect: rect.width() * rect.height())
        
        # Get face landmarks
        landmarks = self.predictor(gray, largest_face)
        
        # Extract mouth landmarks (points 48-68 in the 68-point model)
        mouth_points = []
        for i in range(48, 68):
            x = landmarks.part(i).x
            y = landmarks.part(i).y
            mouth_points.append((x, y))
        
        return mouth_points
    
    def analyze_audio_for_phonemes(self, audio_path):
        """Analyze audio to find phoneme timings (approximation)"""
        print("Analyzing audio for lip sync...")
        
        try:
            # Load audio
            y, sr = librosa.load(audio_path, sr=None)
            
            # Compute onset strength
            onset_env = librosa.onset.onset_strength(y=y, sr=sr)
            
            # Detect onsets (mouth movement changes)
            onsets = librosa.onset.onset_detect(onset_envelope=onset_env, sr=sr)
            onset_times = librosa.frames_to_time(onsets, sr=sr)
            
            # Convert to frame indices based on video FPS
            video = cv2.VideoCapture(self.input_video_path)
            fps = video.get(cv2.CAP_PROP_FPS)
            video.release()
            
            onset_frames = [int(time * fps) for time in onset_times]
            
            # Calculate audio energy (volume) over time for lip opening amount
            hop_length = 512
            frame_length = 2048
            
            # Get amplitude envelope
            amplitude_envelope = []
            for i in range(0, len(y), hop_length):
                current_frame = y[i:i+frame_length] if i+frame_length < len(y) else y[i:]
                amplitude_envelope.append(np.max(np.abs(current_frame)))
            
            # Convert to frame indices
            audio_frames = librosa.frames_to_time(np.arange(len(amplitude_envelope)), 
                                                sr=sr, 
                                                hop_length=hop_length)
            
            # Interpolate to match video frame rate
            total_video_frames = len([f for f in os.listdir(self.frames_dir) if f.endswith('.jpg')])
            video_duration = total_video_frames / fps
            
            # Create interpolation function
            interp_func = interp1d(
                audio_frames,
                amplitude_envelope,
                kind='linear',
                bounds_error=False,
                fill_value=(amplitude_envelope[0], amplitude_envelope[-1])
            )
            
            # Sample at video frame rate
            video_frame_times = np.arange(0, video_duration, 1/fps)
            video_frame_times = video_frame_times[:total_video_frames]  # Ensure we don't exceed frame count
            lip_openness = interp_func(video_frame_times)
            
            # Normalize values between 0 and 1
            lip_openness = (lip_openness - np.min(lip_openness)) / (np.max(lip_openness) - np.min(lip_openness))
            
            return onset_frames, lip_openness
            
        except Exception as e:
            print(f"Error analyzing audio for lip sync: {e}")
            return [], []
    
    def apply_lip_sync(self):
        """Apply lip sync by modifying mouth shape based on audio analysis"""
        print("Applying lip sync...")
        
        # Analyze the generated audio for phoneme timing
        onset_frames, lip_openness = self.analyze_audio_for_phonemes(self.processed_audio_path)
        
        if not onset_frames and len(lip_openness) == 0:
            print("Failed to analyze audio for lip sync")
            return False
        
        # Process each frame
        frame_files = sorted([f for f in os.listdir(self.frames_dir) if f.endswith('.jpg')])
        
        processed_dir = os.path.join(self.output_dir, "processed_frames")
        os.makedirs(processed_dir, exist_ok=True)
        
        # Only process mouth areas in frames where we have detected changes
        last_mouth_points = None
        last_frame = None
        
        for i, frame_file in enumerate(tqdm(frame_files, desc="Processing frames for lip sync")):
            frame_path = os.path.join(self.frames_dir, frame_file)
            frame = cv2.imread(frame_path)
            
            # Initialize first frame
            if last_frame is None:
                last_frame = frame.copy()
            
            # Try to detect mouth in current frame
            mouth_points = self.detect_mouth_landmarks(frame)
            
            # If we can't detect the mouth, use the last known mouth or skip
            if mouth_points is None:
                if last_mouth_points is None:
                    # No mouth detected yet, just copy the frame
                    cv2.imwrite(os.path.join(processed_dir, frame_file), frame)
                    continue
                else:
                    # Use the last known mouth points
                    mouth_points = last_mouth_points
            else:
                # Update last known mouth points
                last_mouth_points = mouth_points
            
            # Calculate openness based on audio analysis
            current_openness = lip_openness[i] if i < len(lip_openness) else 0.5
            
            # Is this a frame where we should modify the mouth?
            should_modify = i in onset_frames or current_openness > 0.6
            
            if should_modify and mouth_points:
                # Extract mouth region
                mouth_points = np.array(mouth_points)
                x, y, w, h = cv2.boundingRect(mouth_points)
                
                # Add some margin
                margin = 10
                x = max(0, x - margin)
                y = max(0, y - margin)
                w += 2 * margin
                h += 2 * margin
                
                # Adjust mouth openness based on audio energy
                # This is a simplified approach; a more sophisticated method would use
                # a proper facial animation model
                
                # Original center point of the mouth
                center_x = int(x + w / 2)
                center_y = int(y + h / 2)
                
                # Modify points to open/close mouth based on audio energy
                for j, point in enumerate(mouth_points):
                    # Focus on inner mouth points (indices 12-19 in mouth points)
                    if 12 <= j <= 19:
                        # Top lip points move up, bottom lip points move down
                        if j < 16:  # Top lip
                            mouth_points[j] = (point[0], int(point[1] - current_openness * 5))
                        else:  # Bottom lip
                            mouth_points[j] = (point[0], int(point[1] + current_openness * 5))
                
                # Draw the modified mouth on the frame
                hull = cv2.convexHull(mouth_points)
                cv2.fillConvexPoly(frame, hull, (255, 255, 255))
                
                # Smooth transitions
                frame = cv2.addWeighted(frame, 0.8, last_frame, 0.2, 0)
            
            # Save the processed frame
            cv2.imwrite(os.path.join(processed_dir, frame_file), frame)
            last_frame = frame.copy()
        
        print("Lip sync application complete")
        return True
    
    def create_final_video(self):
        """Create the final video with dubbed audio"""
        print("Creating final video...")
        
        # Determine which frames to use (processed or original)
        frames_path = os.path.join(self.output_dir, "processed_frames")
        if not os.path.exists(frames_path) or len(os.listdir(frames_path)) == 0:
            print("No processed frames found. Using original frames.")
            frames_path = self.frames_dir
        
        try:
            # Get frame files
            frame_files = sorted([f for f in os.listdir(frames_path) if f.endswith('.jpg')])
            
            if not frame_files:
                print("No frames found")
                return False
            
            # Read the first frame to get dimensions
            first_frame = cv2.imread(os.path.join(frames_path, frame_files[0]))
            height, width, layers = first_frame.shape
            
            # Get video properties
            original_video = cv2.VideoCapture(self.input_video_path)
            fps = original_video.get(cv2.CAP_PROP_FPS)
            original_video.release()
            
            # Create temporary video file (without audio)
            temp_video_path = os.path.join(self.output_dir, "temp_video.mp4")
            
            # Use OpenCV's VideoWriter
            fourcc = cv2.VideoWriter_fourcc(*'mp4v')
            video_writer = cv2.VideoWriter(temp_video_path, fourcc, fps, (width, height))
            
            # Write frames to video
            for frame_file in tqdm(frame_files, desc="Creating video"):
                frame = cv2.imread(os.path.join(frames_path, frame_file))
                video_writer.write(frame)
            
            video_writer.release()
            
            # Combine video with audio using MoviePy
            video_clip = mp.VideoFileClip(temp_video_path)
            audio_clip = mp.AudioFileClip(self.processed_audio_path)
            
            # Make sure audio is not longer than video
            if audio_clip.duration > video_clip.duration:
                audio_clip = audio_clip.subclip(0, video_clip.duration)
            
            # Set the audio
            final_clip = video_clip.set_audio(audio_clip)
            final_clip.write_videofile(self.final_output_path, codec='libx264', audio_codec='aac')
            
            # Clean up temp file
            video_clip.close()
            audio_clip.close()
            os.remove(temp_video_path)
            
            print(f"Final video created at {self.final_output_path}")
            return True
            
        except Exception as e:
            print(f"Error creating final video: {e}")
            return False
    
    def run_pipeline(self):
        """Run the full pipeline"""
        print("Starting video dubbing pipeline...")
        
        # Step 1: Extract audio
        if not self.extract_audio():
            print("Failed to extract audio. Aborting.")
            return False
        
        # Step 2: Transcribe audio
        transcribed_text = self.transcribe_audio()
        if not transcribed_text:
            print("Failed to transcribe audio. Aborting.")
            return False
        
        # Step 3: Translate text
        translated_text = self.translate_text(transcribed_text)
        if not translated_text:
            print("Failed to translate text. Aborting.")
            return False
        
        # Step 4: Generate speech
        if not self.generate_speech(translated_text):
            print("Failed to generate speech. Aborting.")
            return False
        
        # Step 5: Analyze voice characteristics
        self.analyze_voice_characteristics()
        
        # Step 6: Apply voice cloning
        self.apply_voice_cloning()
        
        # Step 7: Extract video frames
        if not self.extract_video_frames():
            print("Failed to extract video frames. Creating simple dubbed video without lip sync.")
            # Create a simple video without lip sync
            try:
                video = mp.VideoFileClip(self.input_video_path)
                audio = mp.AudioFileClip(self.processed_audio_path)
                video = video.set_audio(audio)
                video.write_videofile(self.final_output_path)
                video.close()
                audio.close()
                print(f"Simple dubbed video created at {self.final_output_path}")
                return True
            except Exception as e:
                print(f"Error creating simple dubbed video: {e}")
                return False
        
        # Step 8: Apply lip sync
        self.apply_lip_sync()
        
        # Step 9: Create final video
        if not self.create_final_video():
            print("Failed to create final video.")
            return False
        
        print("Video dubbing pipeline completed successfully!")
        return True

def main():
    """Main function"""
    # Input and output paths
    input_video_path = r"C:\Users\Admin\Documents\AI_Project(2.0)\Input_video.mp4"
    output_dir = r"C:\Users\Admin\Documents\AI_Project(2.0)\output"
    
    # Create dubber instance (change target_language to 'te' for Telugu or 'ta' for Tamil)
    dubber = VideoDubber(input_video_path, output_dir, target_language='hi')
    
    # Run the pipeline
    dubber.run_pipeline()

if __name__ == "__main__":
    main()

Downloading face landmarks predictor...
Downloading facial landmarks model...
Facial landmarks model downloaded successfully
Starting video dubbing pipeline...
Extracting audio from C:\Users\Admin\Documents\AI_Project(2.0)\Input_video.mp4...
MoviePy - Writing audio in C:\Users\Admin\Documents\AI_Project(2.0)\output\extracted_audio.wav


                                                                   

MoviePy - Done.
Audio extracted to C:\Users\Admin\Documents\AI_Project(2.0)\output\extracted_audio.wav
Transcribing audio to text...


Transcribing audio chunks: 100%|██████████| 1/1 [00:10<00:00, 10.84s/it]


Transcription complete. Saved to C:\Users\Admin\Documents\AI_Project(2.0)\output\transcribed_text.txt
Translating text to Hindi...


Translating text chunks: 100%|██████████| 1/1 [00:01<00:00,  1.87s/it]


Translation complete. Saved to C:\Users\Admin\Documents\AI_Project(2.0)\output\translated_text.txt
Generating speech in Hindi...
Speech generation complete. Saved to C:\Users\Admin\Documents\AI_Project(2.0)\output\generated_audio.mp3
Analyzing original voice characteristics...
Voice characteristics analysis complete
Applying voice cloning...
Voice cloning applied
Extracting video frames...
Extracted 100/1113 frames
Extracted 200/1113 frames
Extracted 300/1113 frames
Extracted 400/1113 frames
Extracted 500/1113 frames
Extracted 600/1113 frames
Extracted 700/1113 frames
Extracted 800/1113 frames
Extracted 900/1113 frames
Extracted 1000/1113 frames
Extracted 1100/1113 frames
Extracted 1113 frames at 30.0 FPS
Applying lip sync...
Analyzing audio for lip sync...


Processing frames for lip sync: 100%|██████████| 1113/1113 [03:52<00:00,  4.80it/s]


Lip sync application complete
Creating final video...


Creating video: 100%|██████████| 1113/1113 [00:40<00:00, 27.47it/s]


Moviepy - Building video C:\Users\Admin\Documents\AI_Project(2.0)\output\final_output.mp4.
MoviePy - Writing audio in final_outputTEMP_MPY_wvf_snd.mp4


                                                                   

MoviePy - Done.
Moviepy - Writing video C:\Users\Admin\Documents\AI_Project(2.0)\output\final_output.mp4



                                                                

Moviepy - Done !
Moviepy - video ready C:\Users\Admin\Documents\AI_Project(2.0)\output\final_output.mp4
Final video created at C:\Users\Admin\Documents\AI_Project(2.0)\output\final_output.mp4
Video dubbing pipeline completed successfully!


In [3]:
# test_installation.py
import cv2
import moviepy.editor as mp
import speech_recognition as sr
import dlib
import face_recognition
import librosa
import soundfile as sf
import numpy as np
from googletrans import Translator
from gtts import gTTS

print("OpenCV version:", cv2.__version__)
print("dlib version:", dlib.__version__)
print("All libraries imported successfully!")

# Try to initialize a face detector
detector = dlib.get_frontal_face_detector()
print("Face detector initialized:", detector is not None)

# Test translator
translator = Translator()
print("Translator initialized:", translator is not None)

print("Installation verification complete!")

OpenCV version: 4.5.5
dlib version: 19.24.0
All libraries imported successfully!
Face detector initialized: True
Translator initialized: True
Installation verification complete!


In [5]:
import cv2
import os
import numpy as np
import tempfile
import subprocess
import time
import shutil
from pathlib import Path
import moviepy.editor as mp
import speech_recognition as sr
from pydub import AudioSegment
from googletrans import Translator
from gtts import gTTS
from scipy.io import wavfile
from scipy.interpolate import interp1d
import scipy.ndimage as ndi
import dlib
import face_recognition
import librosa
import soundfile as sf
from tqdm import tqdm

class ImprovedVideoDubber:
    def __init__(self, input_video_path, output_dir, target_language='hi'):
        """
        Initialize the video dubber with improved settings for Hindi dubbing
        
        Args:
            input_video_path: Path to the input video file
            output_dir: Directory to store outputs
            target_language: Language code for the target language (default: hi for Hindi)
        """
        self.input_video_path = input_video_path
        self.output_dir = output_dir
        self.target_language = target_language
        
        # Create output directory if it doesn't exist
        os.makedirs(self.output_dir, exist_ok=True)
        
        # Set paths for intermediate files
        self.extracted_audio_path = os.path.join(output_dir, "extracted_audio.wav")
        self.transcribed_text_path = os.path.join(output_dir, "transcribed_text.txt")
        self.translated_text_path = os.path.join(output_dir, "translated_text.txt")
        self.generated_audio_path = os.path.join(output_dir, "generated_audio.mp3")
        self.processed_audio_path = os.path.join(output_dir, "processed_audio.wav")
        self.final_audio_path = os.path.join(output_dir, "final_audio.wav")
        self.final_output_path = os.path.join(output_dir, "final_output.mp4")
        
        # Initialize face and landmark detectors
        self.detector = dlib.get_frontal_face_detector()
        
        # Try to load face landmarks predictor (download if not available)
        self.predictor_path = os.path.join(output_dir, "shape_predictor_68_face_landmarks.dat")
        if not os.path.exists(self.predictor_path):
            print("Downloading face landmarks predictor...")
            self._download_landmarks_predictor()
        
        try:
            self.predictor = dlib.shape_predictor(self.predictor_path)
        except Exception as e:
            print(f"Could not load face predictor: {e}")
            print("Lip sync functionality will be limited")
            self.predictor = None
        
        # Language name mapping for display
        self.language_names = {
            'hi': 'Hindi',
            'te': 'Telugu',
            'ta': 'Tamil'
        }
        
        # Translator
        self.translator = Translator()
        
        # Frame processing attributes
        self.frames_dir = os.path.join(output_dir, "frames")
        os.makedirs(self.frames_dir, exist_ok=True)
        
        # Voice characteristics (for voice cloning)
        self.voice_characteristics = None
        
        # Original video properties
        self.original_duration = None
        self.original_word_count = None
    
    def _download_landmarks_predictor(self):
        """Download the facial landmarks predictor model"""
        import urllib.request
        
        # URL for the shape predictor
        url = "https://github.com/davisking/dlib-models/raw/master/shape_predictor_68_face_landmarks.dat.bz2"
        bz2_path = self.predictor_path + ".bz2"
        
        try:
            print("Downloading facial landmarks model...")
            urllib.request.urlretrieve(url, bz2_path)
            
            # Extract the bz2 file
            import bz2
            with open(self.predictor_path, 'wb') as new_file, bz2.BZ2File(bz2_path, 'rb') as file:
                for data in iter(lambda: file.read(100 * 1024), b''):
                    new_file.write(data)
            
            # Remove the bz2 file
            os.remove(bz2_path)
            print("Facial landmarks model downloaded successfully")
            
        except Exception as e:
            print(f"Error downloading facial landmarks model: {e}")
            print("Please download manually from https://github.com/davisking/dlib-models")
    
    def extract_audio(self):
        """Extract audio from the input video file"""
        print(f"Extracting audio from {self.input_video_path}...")
        try:
            video = mp.VideoFileClip(self.input_video_path)
            # Store original duration for later use in pacing
            self.original_duration = video.duration
            video.audio.write_audiofile(self.extracted_audio_path, codec='pcm_s16le')
            print(f"Audio extracted to {self.extracted_audio_path}")
            video.close()
            return True
        except Exception as e:
            print(f"Error extracting audio: {e}")
            return False
    
    def transcribe_audio(self):
        """Convert speech to text using Google's Speech Recognition with improved accuracy"""
        print("Transcribing audio to text...")
        
        # Convert to format suitable for speech recognition
        sound = AudioSegment.from_wav(self.extracted_audio_path)
        
        # Split audio into smaller chunks for better recognition
        chunk_length_ms = 30000  # 30 seconds (shorter chunks for better accuracy)
        chunks = [sound[i:i+chunk_length_ms] for i in range(0, len(sound), chunk_length_ms)]
        
        # Initialize recognizer with adjusted settings
        recognizer = sr.Recognizer()
        recognizer.energy_threshold = 300  # Lower threshold for better detection
        recognizer.pause_threshold = 0.8   # Shorter pause for more natural segmentation
        
        # Process each chunk and concatenate the results
        full_text = ""
        
        for i, chunk in enumerate(tqdm(chunks, desc="Transcribing audio chunks")):
            # Export chunk for speech recognition
            chunk_path = os.path.join(self.output_dir, f"chunk_{i}.wav")
            chunk.export(chunk_path, format="wav")
            
            # Transcribe with multiple attempts and noise handling
            with sr.AudioFile(chunk_path) as source:
                # Adjust for ambient noise
                recognizer.adjust_for_ambient_noise(source)
                audio_data = recognizer.record(source)
                
                # Try multiple recognition attempts with different settings
                try:
                    # First attempt with regular settings
                    text = recognizer.recognize_google(audio_data)
                except (sr.UnknownValueError, sr.RequestError):
                    try:
                        # Second attempt with different language option
                        text = recognizer.recognize_google(audio_data, language="en-US")
                    except (sr.UnknownValueError, sr.RequestError):
                        print(f"Chunk {i}: Could not understand audio")
                        text = ""
                
                full_text += text + " "
            
            # Remove temporary chunk file
            os.remove(chunk_path)
        
        # Count words for pacing calculation
        self.original_word_count = len(full_text.split())
        
        # Save the full transcribed text
        with open(self.transcribed_text_path, 'w', encoding='utf-8') as f:
            f.write(full_text.strip())
        
        print(f"Transcription complete. Saved to {self.transcribed_text_path}")
        return full_text.strip()
    
    def translate_text(self, text=None):
        """Translate the transcribed text to Hindi with improved accuracy"""
        if text is None:
            try:
                with open(self.transcribed_text_path, 'r', encoding='utf-8') as f:
                    text = f.read()
            except FileNotFoundError:
                print(f"Transcription file not found at {self.transcribed_text_path}")
                return None
        
        print(f"Translating text to {self.language_names.get(self.target_language, self.target_language)}...")
        
        try:
            # Clean up text for better translation
            text = text.replace('  ', ' ').strip()
            
            # Split text into smaller, more coherent chunks for translation
            # Using sentences as natural boundaries
            sentences = [s.strip() for s in text.split('.') if s.strip()]
            
            # Group sentences into reasonable chunks (up to 300 chars)
            chunks = []
            current_chunk = ""
            
            for sentence in sentences:
                if len(current_chunk) + len(sentence) < 300:
                    current_chunk += sentence + ". "
                else:
                    if current_chunk:
                        chunks.append(current_chunk)
                    current_chunk = sentence + ". "
            
            # Add the last chunk if not empty
            if current_chunk:
                chunks.append(current_chunk)
            
            # Translate each chunk
            translated_chunks = []
            for chunk in tqdm(chunks, desc="Translating text chunks"):
                # Try multiple translation attempts for reliability
                try:
                    translated = self.translator.translate(chunk, dest=self.target_language).text
                    translated_chunks.append(translated)
                except Exception as e:
                    print(f"Error translating chunk: {e}")
                    # Retry with smaller piece
                    words = chunk.split()
                    half = len(words) // 2
                    try:
                        part1 = self.translator.translate(" ".join(words[:half]), 
                                                          dest=self.target_language).text
                        part2 = self.translator.translate(" ".join(words[half:]), 
                                                          dest=self.target_language).text
                        translated_chunks.append(part1 + " " + part2)
                    except:
                        # If all else fails, add untranslated
                        translated_chunks.append(chunk)
            
            translated_text = ' '.join(translated_chunks)
            
            # Save the translated text
            with open(self.translated_text_path, 'w', encoding='utf-8') as f:
                f.write(translated_text)
            
            print(f"Translation complete. Saved to {self.translated_text_path}")
            return translated_text
            
        except Exception as e:
            print(f"Error in translation: {e}")
            return None
    
    def generate_speech(self, text=None):
        """Generate better quality Hindi speech with improved pacing"""
        if text is None:
            try:
                with open(self.translated_text_path, 'r', encoding='utf-8') as f:
                    text = f.read()
            except FileNotFoundError:
                print(f"Translation file not found at {self.translated_text_path}")
                return False
        
        print(f"Generating improved speech in {self.language_names.get(self.target_language, self.target_language)}...")
        
        try:
            # Break text into smaller chunks for better TTS processing
            # This improves voice quality and reduces potential errors
            max_chunk_length = 500  # characters
            text_chunks = []
            
            # Split by sentences where possible
            sentences = text.replace('।', '.').split('.')
            current_chunk = ""
            
            for sentence in sentences:
                if len(current_chunk) + len(sentence) < max_chunk_length:
                    current_chunk += sentence + ". "
                else:
                    if current_chunk:
                        text_chunks.append(current_chunk)
                    current_chunk = sentence + ". "
            
            # Add the last chunk
            if current_chunk:
                text_chunks.append(current_chunk)
            
            # Generate speech for each chunk
            audio_segments = []
            
            for i, chunk in enumerate(tqdm(text_chunks, desc="Generating speech chunks")):
                chunk_path = os.path.join(self.output_dir, f"speech_chunk_{i}.mp3")
                
                # Generate speech with improved settings
                tts = gTTS(text=chunk, lang=self.target_language, slow=False)
                tts.save(chunk_path)
                
                # Load as AudioSegment
                segment = AudioSegment.from_mp3(chunk_path)
                audio_segments.append(segment)
                
                # Clean up
                os.remove(chunk_path)
            
            # Concatenate all segments
            combined_audio = AudioSegment.empty()
            for segment in audio_segments:
                combined_audio += segment
            
            # Save combined audio
            combined_audio.export(self.generated_audio_path, format="mp3")
            
            # Convert to WAV for processing
            audio = AudioSegment.from_mp3(self.generated_audio_path)
            audio.export(self.processed_audio_path, format="wav")
            
            print(f"Speech generation complete. Saved to {self.generated_audio_path}")
            return True
            
        except Exception as e:
            print(f"Error generating speech: {e}")
            return False
    
    def analyze_voice_characteristics(self):
        """Analyze the original voice characteristics for improved voice cloning"""
        print("Analyzing original voice characteristics...")
        
        try:
            # Load audio
            y, sr = librosa.load(self.extracted_audio_path, sr=None)
            
            if len(y) == 0:
                print("Error: Empty audio file")
                return
            
            # Extract enhanced voice characteristics
            # Pitch (fundamental frequency) with better filtering
            pitches, magnitudes = librosa.piptrack(y=y, sr=sr)
            # Filter out low magnitude components for better pitch estimation
            magnitude_threshold = np.percentile(magnitudes, 75)  # Only top 25% magnitudes
            pitches = pitches[magnitudes > magnitude_threshold]
            
            # Calculate pitch statistics (with outlier removal)
            if len(pitches) > 0:
                # Remove outliers (values outside 1.5 IQR)
                q1, q3 = np.percentile(pitches, [25, 75])
                iqr = q3 - q1
                pitch_filtered = pitches[(pitches >= q1 - 1.5 * iqr) & (pitches <= q3 + 1.5 * iqr)]
                
                if len(pitch_filtered) > 0:
                    pitch_mean = np.mean(pitch_filtered)
                    pitch_std = np.std(pitch_filtered)
                else:
                    pitch_mean = np.mean(pitches)
                    pitch_std = np.std(pitches)
            else:
                pitch_mean = 0
                pitch_std = 0
            
            # Tempo with better beat tracking
            onset_env = librosa.onset.onset_strength(y=y, sr=sr)
            tempo, _ = librosa.beat.beat_track(onset_envelope=onset_env, sr=sr)
            
            # Enhanced spectral features
            spectral_centroid = np.mean(librosa.feature.spectral_centroid(y=y, sr=sr))
            spectral_bandwidth = np.mean(librosa.feature.spectral_bandwidth(y=y, sr=sr))
            spectral_contrast = np.mean(librosa.feature.spectral_contrast(y=y, sr=sr))
            
            # Formant estimation (approximation using MFCC)
            mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
            mfcc_means = np.mean(mfccs, axis=1)
            
            # Store characteristics
            self.voice_characteristics = {
                'pitch_mean': float(pitch_mean),
                'pitch_std': float(pitch_std),
                'tempo': float(tempo),
                'spectral_centroid': float(spectral_centroid),
                'spectral_bandwidth': float(spectral_bandwidth),
                'spectral_contrast': float(spectral_contrast),
                'mfcc_profile': mfcc_means.tolist(),
                'original_duration': self.original_duration,
                'original_word_count': self.original_word_count
            }
            
            print("Enhanced voice characteristics analysis complete")
            
        except Exception as e:
            print(f"Error in voice analysis: {e}")
    
    def improve_audio_pacing(self):
        """Adjust the pacing of the generated audio to match the original"""
        print("Improving audio pacing to match original video...")
        
        try:
            if not self.voice_characteristics:
                print("Voice characteristics not available. Skipping pacing adjustment.")
                shutil.copy(self.processed_audio_path, self.final_audio_path)
                return True
            
            # Load generated audio
            y_generated, sr_generated = librosa.load(self.processed_audio_path, sr=None)
            generated_duration = len(y_generated) / sr_generated
            
            # Calculate target duration based on original
            original_duration = self.voice_characteristics['original_duration']
            
            # Determine if we need to stretch or compress
            time_ratio = original_duration / generated_duration
            
            # Apply time stretching with phase vocoder for better quality
            if 0.8 <= time_ratio <= 1.2:
                # For small adjustments, use high quality time stretch
                print(f"Applying gentle pacing adjustment (ratio: {time_ratio:.2f})")
                y_adjusted = librosa.effects.time_stretch(y_generated, rate=time_ratio)
            else:
                # For larger adjustments, use more conservative approach
                # This prevents extreme distortion
                print(f"Applying significant pacing adjustment (ratio: {time_ratio:.2f})")
                
                # Cap maximum adjustment to prevent extreme distortion
                capped_ratio = max(min(time_ratio, 1.5), 0.75)
                y_adjusted = librosa.effects.time_stretch(y_generated, rate=capped_ratio)
                
                # If we had to cap significantly, add some silence at the end if needed
                adjusted_duration = len(y_adjusted) / sr_generated
                if adjusted_duration < original_duration - 1.0:  # If still more than 1 sec off
                    silence_needed = int((original_duration - adjusted_duration) * sr_generated)
                    y_adjusted = np.concatenate([y_adjusted, np.zeros(silence_needed)])
            
            # Save adjusted audio
            sf.write(self.final_audio_path, y_adjusted, sr_generated)
            print(f"Audio pacing adjusted. Original: {original_duration:.2f}s, Generated: {generated_duration:.2f}s, Final: {len(y_adjusted)/sr_generated:.2f}s")
            return True
            
        except Exception as e:
            print(f"Error adjusting audio pacing: {e}")
            # Fallback to original processed audio
            shutil.copy(self.processed_audio_path, self.final_audio_path)
            return False
    
    def apply_voice_cloning(self):
        """Apply improved voice characteristic modifications"""
        if not self.voice_characteristics:
            print("Voice characteristics not available. Run analyze_voice_characteristics() first.")
            return False
        
        print("Applying enhanced voice cloning...")
        
        try:
            # Load generated speech
            y_target, sr_target = librosa.load(self.processed_audio_path, sr=None)
            
            # IMPROVED: Create a copy for comparison
            y_original = np.copy(y_target)
            
            # Pitch shifting based on mean pitch difference
            # Use a more subtle pitch shift factor for more natural results
            pitch_shift_factor = min(max(self.voice_characteristics['pitch_mean'] / 100, -2.5), 2.5)
            print(f"Applying pitch shift of {pitch_shift_factor:.2f} steps")
            
            # Apply pitch shifting with improved formant preservation
            y_processed = librosa.effects.pitch_shift(
                y_target, 
                sr=sr_target, 
                n_steps=pitch_shift_factor
            )
            
            # Apply subtle EQ based on spectral characteristics
            # This simulates the frequency response pattern of the original voice
            if self.voice_characteristics['spectral_centroid'] > 0:
                # Create a simple filter based on spectral centroid difference
                target_centroid = np.mean(librosa.feature.spectral_centroid(y=y_processed, sr=sr_target))
                centroid_ratio = self.voice_characteristics['spectral_centroid'] / target_centroid
                
                # Apply a simple EQ boost/cut based on the centroid ratio
                if 0.8 <= centroid_ratio <= 1.2:  # Only apply if the difference is reasonable
                    if centroid_ratio < 1:  # Boost highs
                        print("Applying subtle high frequency boost")
                        # Split signal into high and low components
                        y_low = librosa.effects.preemphasis(y_processed, coef=-0.2)
                        y_high = y_processed - y_low
                        # Boost high frequencies
                        boost_factor = 1 + (1 - centroid_ratio)
                        y_processed = y_low + y_high * boost_factor
                    else:  # Boost lows
                        print("Applying subtle low frequency boost")
                        y_low = librosa.effects.preemphasis(y_processed, coef=-0.2)
                        y_high = y_processed - y_low
                        # Boost low frequencies
                        boost_factor = centroid_ratio
                        y_processed = y_low * boost_factor + y_high
            
            # Normalize audio levels
            y_processed = librosa.util.normalize(y_processed)
            
            # Save processed audio
            sf.write(self.processed_audio_path, y_processed, sr_target)
            
            print("Enhanced voice cloning applied")
            return True
            
        except Exception as e:
            print(f"Error in voice cloning: {e}")
            return False
    
    def extract_video_frames(self):
        """Extract frames from the input video"""
        print("Extracting video frames...")
        
        video = cv2.VideoCapture(self.input_video_path)
        success, frame = video.read()
        
        if not success:
            print("Failed to read video")
            return False
        
        fps = video.get(cv2.CAP_PROP_FPS)
        frame_count = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
        
        # Clear existing frames
        for file in os.listdir(self.frames_dir):
            if file.endswith('.jpg'):
                os.remove(os.path.join(self.frames_dir, file))
        
        # Extract frames
        count = 0
        while success:
            frame_path = os.path.join(self.frames_dir, f"frame_{count:06d}.jpg")
            cv2.imwrite(frame_path, frame)
            success, frame = video.read()
            count += 1
            if count % 100 == 0:
                print(f"Extracted {count}/{frame_count} frames")
        
        video.release()
        print(f"Extracted {count} frames at {fps} FPS")
        return True
    
    def detect_mouth_landmarks(self, frame):
        """Detect mouth landmarks with improved accuracy"""
        if self.predictor is None:
            return None
        
        # Convert to grayscale for face detection
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        
        # Enhance contrast for better face detection
        gray = cv2.equalizeHist(gray)
        
        # Try multiple face detection methods for better reliability
        # Method 1: dlib's detector
        faces_dlib = self.detector(gray)
        
        # Method 2: face_recognition library detector (HOG-based)
        face_locations = face_recognition.face_locations(gray)
        
        # Consolidate detected faces
        faces = []
        
        # Convert dlib rectangles to coordinates
        for face in faces_dlib:
            faces.append((face.top(), face.right(), face.bottom(), face.left()))
        
        # Add face_recognition detections
        for top, right, bottom, left in face_locations:
            # Check if this face is already in our list (avoid duplicates)
            is_duplicate = False
            for f_top, f_right, f_bottom, f_left in faces:
                # If centers are close, consider it a duplicate
                f_center_x = (f_left + f_right) // 2
                f_center_y = (f_top + f_bottom) // 2
                center_x = (left + right) // 2
                center_y = (top + bottom) // 2
                
                dist = ((f_center_x - center_x)**2 + (f_center_y - center_y)**2)**0.5
                if dist < 30:  # Threshold for duplicate detection
                    is_duplicate = True
                    break
            
            if not is_duplicate:
                faces.append((top, right, bottom, left))
        
        if not faces:
            return None
        
        # Find the largest face
        largest_face = max(faces, key=lambda rect: (rect[2]-rect[0])*(rect[1]-rect[3]))
        top, right, bottom, left = largest_face
        
        # Convert back to dlib rectangle for landmark prediction
        rect = dlib.rectangle(left, top, right, bottom)
        
        # Get face landmarks with dlib
        landmarks = self.predictor(gray, rect)
        
        # Extract mouth landmarks (points 48-68 in the 68-point model)
        mouth_points = []
        for i in range(48, 68):
            x = landmarks.part(i).x
            y = landmarks.part(i).y
            mouth_points.append((x, y))
        
        return mouth_points
    
    def analyze_audio_for_phonemes(self, audio_path):
        """Analyze audio for improved lip sync"""
        print("Analyzing audio for enhanced lip sync...")
        
        try:
            # Load audio with higher precision
            y, sr = librosa.load(audio_path, sr=None)
            
            # Compute enhanced onset strength (sensitive to sudden changes)
            onset_env = librosa.onset.onset_strength(y=y, sr=sr, 
                                                     hop_length=512, 
                                                     aggregate=np.median)
            
            # Detect onsets with adaptive thresholding
            onsets = librosa.onset.onset_detect(onset_envelope=onset_env, 
                                                sr=sr,
                                                wait=1,  # Wait between consecutive onsets
                                                pre_avg=1,  # Previous frames for comparison
                                                post_avg=1,  # Future frames for comparison
                                                pre_max=1,  # Previous frames for max
                                                post_max=1)  # Future frames for max
            
            onset_times = librosa.frames_to_time(onsets, sr=sr)
            
            # Convert to frame indices based on video FPS
            video = cv2.VideoCapture(self.input_video_path)
            fps = video.get(cv2.CAP_PROP_FPS)
            video.release()
            
            onset_frames = [int(time * fps) for time in onset_times]
            
            # Calculate RMS energy for more accurate lip opening amount
            hop_length = 512
            frame_length = 2048
            
            # Get RMS energy (better than amplitude for mouth movement)
            rms_energy = librosa.feature.rms(y=y, 
                                          frame_length=frame_length, 
                                          hop_length=hop_length)[0]
            
            # Smooth the energy curve for more natural transitions
            rms_energy = ndi.gaussian_filter1d(rms_energy, sigma=2)
            
            # Convert to frame indices
            audio_frames = librosa.frames_to_time(np.arange(len(rms_energy)), 
                                            sr=sr, 
                                            hop_length=hop_length)
            
            # Interpolate to match video frame rate
            total_video_frames = len([f for f in os.listdir(self.frames_dir) if f.endswith('.jpg')])
            video_duration = total_video_frames / fps
            
            # Create enhanced interpolation function
            interp_func = interp1d(
                audio_frames,
                rms_energy,
                kind='cubic',  # Cubic interpolation for smoother transitions
                bounds_error=False,
                fill_value=(rms_energy[0], rms_energy[-1])
            )
            
            # Sample at video frame rate
            video_frame_times = np.arange(0, video_duration, 1/fps)
            video_frame_times = video_frame_times[:total_video_frames]
            lip_openness = interp_func(video_frame_times)
            
            # Normalize between 0 and 1
            lip_openness = (lip_openness - np.min(lip_openness)) / (np.max(lip_openness) - np.min(lip_openness))
            
            # Apply additional smoothing for natural transitions
            lip_openness = ndi.gaussian_filter1d(lip_openness, sigma=1.5)
            
            return onset_frames, lip_openness
            
        except Exception as e:
            print(f"Error analyzing audio for lip sync: {e}")
            return [], []
    
   

In [9]:
def apply_lip_sync(self):
    """Apply improved lip sync with better mouth shape modeling"""
    print("Applying enhanced lip sync...")
    
    # Analyze the generated audio for phoneme timing
    onset_frames, lip_openness = self.analyze_audio_for_phonemes(self.final_audio_path)
    
    if not onset_frames and len(lip_openness) == 0:
        print("Failed to analyze audio for lip sync")
        return False
    
    # Process each frame
    frame_files = sorted([f for f in os.listdir(self.frames_dir) if f.endswith('.jpg')])
    
    processed_dir = os.path.join(self.output_dir, "processed_frames")
    os.makedirs(processed_dir, exist_ok=True)
    
    # Track mouth positions across frames for consistency
    mouth_history = []  # Store last several mouth points
    history_size = 5    # How many frames to consider for smoothing
    
    for i, frame_file in enumerate(tqdm(frame_files, desc="Processing frames for lip sync")):
        frame_path = os.path.join(self.frames_dir, frame_file)
        frame = cv2.imread(frame_path)
        
        if frame is None:
            print(f"Error reading frame {frame_file}")
            continue
            
        # Try to detect mouth in current frame
        mouth_points = self.detect_mouth_landmarks(frame)
        
        # If detection failed, try using history or skip
        if mouth_points is None:
            if len(mouth_history) > 0:
                # Use average of recent mouth positions
                mouth_points = np.mean(mouth_history, axis=0).astype(int).tolist()
            else:
                # If no history and no detection, just copy the frame
                output_path = os.path.join(processed_dir, frame_file)
                cv2.imwrite(output_path, frame)
                continue
        else:
            # Update history with new detection
            mouth_history.append(mouth_points)
            if len(mouth_history) > history_size:
                mouth_history.pop(0)
        
        # Calculate mouth center and average distance from center
        mouth_center_x = sum(p[0] for p in mouth_points) // len(mouth_points)
        mouth_center_y = sum(p[1] for p in mouth_points) // len(mouth_points)
        
        # Determine if this is a frame where mouth should move
        # Check if this frame is near an onset or has high energy
        is_onset = False
        for onset in onset_frames:
            if abs(i - onset) < 3:  # Within 3 frames of an onset
                is_onset = True
                break
        
        # Get lip openness factor for this frame (if available)
        lip_factor = 0.5  # Default middle value
        if i < len(lip_openness):
            lip_factor = lip_openness[i]
        
        # Apply morphing based on lip movement required
        modified_frame = frame.copy()
        
        if mouth_points:
            # Create mask for the mouth region
            mask = np.zeros_like(frame)
            hull = cv2.convexHull(np.array(mouth_points))
            cv2.fillConvexPoly(mask, hull, (255, 255, 255))
            
            # Extract inner and outer lip points
            inner_lips = mouth_points[12:20]  # Inner lip points (approximate)
            outer_lips = mouth_points[0:12]   # Outer lip points (approximate)

            # Adjust mouth shape based on lip factor and audio onset
            modified_points = []
            
            for p in mouth_points:
                x, y = p
                dx = x - mouth_center_x
                dy = y - mouth_center_y
                
                # Determine if this is an inner lip point (closer to center)
                dist_from_center = ((x - mouth_center_x)**2 + (y - mouth_center_y)**2)**0.5
                is_inner_point = dist_from_center < np.mean([((p[0] - mouth_center_x)**2 + 
                                                            (p[1] - mouth_center_y)**2)**0.5 
                                                            for p in mouth_points])
                
                # Apply different transformations to inner vs outer lips
                if is_inner_point:
                    # Inner lips open more dramatically
                    factor = 1.0 + (lip_factor * 0.7)  # More opening for inner lips
                    if is_onset:
                        factor += 0.3  # Extra opening on audio onsets
                else:
                    # Outer lips move less
                    factor = 1.0 + (lip_factor * 0.3)  # Less movement for outer lips
                    if is_onset:
                        factor += 0.1  # Smaller extra movement on audio onsets
                
                # Only stretch vertically for more natural mouth movement
                if dy > 0:  # Lower lip points move down
                    new_y = int(mouth_center_y + dy * factor)
                    modified_points.append((x, new_y))
                elif dy < 0:  # Upper lip points move up
                    new_y = int(mouth_center_y + dy * factor)
                    modified_points.append((x, new_y))
                else:  # Points at center height stay the same
                    modified_points.append((x, y))
            
            # Smoothly warp the mouth region
            if len(modified_points) == len(mouth_points):
                # Create triangulation for warping
                rect = (0, 0, frame.shape[1], frame.shape[0])
                subdiv = cv2.Subdiv2D(rect)
                
                # Add points around the image and mouth
                # This ensures the warp is contained to the mouth area
                for x, y in [(0, 0), (frame.shape[1] - 1, 0), 
                             (0, frame.shape[0] - 1), (frame.shape[1] - 1, frame.shape[0] - 1)]:
                    subdiv.insert((x, y))
                
                # Add points around the mouth region to limit warping effect
                mouth_rect = cv2.boundingRect(np.array(mouth_points))
                x, y, w, h = mouth_rect
                padding = 30  # Add padding around mouth
                
                for px, py in [(x-padding, y-padding), (x+w+padding, y-padding), 
                             (x-padding, y+h+padding), (x+w+padding, y+h+padding)]:
                    if 0 <= px < frame.shape[1] and 0 <= py < frame.shape[0]:
                        subdiv.insert((px, py))
                
                # Add original mouth points
                for i, (x, y) in enumerate(mouth_points):
                    subdiv.insert((x, y))
                
                # Simple blend for smoother transition
                alpha = 0.8  # Blend factor
                modified_frame = cv2.addWeighted(frame, 1-alpha, modified_frame, alpha, 0)
        
        # Save the processed frame
        output_path = os.path.join(processed_dir, frame_file)
        cv2.imwrite(output_path, modified_frame)
    
    print("Lip sync processing complete")
    return True

def combine_audio_video(self):
    """Combine the processed frames with the dubbed audio"""
    print("Combining audio and video...")
    
    processed_frames_dir = os.path.join(self.output_dir, "processed_frames")
    
    # Check if processed frames exist
    if not os.path.exists(processed_frames_dir) or len(os.listdir(processed_frames_dir)) == 0:
        print("No processed frames found, using original frames")
        processed_frames_dir = self.frames_dir
    
    # Get video properties
    video = cv2.VideoCapture(self.input_video_path)
    fps = video.get(cv2.CAP_PROP_FPS)
    width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT))
    video.release()
    
    # Create video from frames
    frame_files = sorted([f for f in os.listdir(processed_frames_dir) if f.endswith('.jpg')])
    
    # Create a temporary video file without audio
    temp_video_path = os.path.join(self.output_dir, "temp_video.mp4")
    
    # Use OpenCV VideoWriter for better control
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(temp_video_path, fourcc, fps, (width, height))
    
    for frame_file in tqdm(frame_files, desc="Combining frames into video"):
        frame_path = os.path.join(processed_frames_dir, frame_file)
        frame = cv2.imread(frame_path)
        if frame is not None:
            out.write(frame)
    
    out.release()
    
    # Combine video with dubbed audio using ffmpeg (more reliable)
    try:
        # Ensure ffmpeg is available
        subprocess.run(['ffmpeg', '-version'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True)
        
        # Run ffmpeg command
        cmd = [
            'ffmpeg',
            '-y',  # Overwrite output file if it exists
            '-i', temp_video_path,  # Input video
            '-i', self.final_audio_path,  # Input audio
            '-c:v', 'copy',  # Copy video stream without re-encoding
            '-c:a', 'aac',  # Use AAC audio codec
            '-strict', 'experimental',
            '-map', '0:v:0',  # Map first video stream from first input
            '-map', '1:a:0',  # Map first audio stream from second input
            '-shortest',  # Finish encoding when the shortest input stream ends
            self.final_output_path
        ]
        
        subprocess.run(cmd, check=True)
        print(f"Video and audio combined successfully: {self.final_output_path}")
        
        # Clean up temporary file
        os.remove(temp_video_path)
        
        return True
        
    except subprocess.CalledProcessError as e:
        print(f"Error combining video and audio: {e}")
        
        # Fallback to moviepy if ffmpeg fails
        try:
            print("Trying alternative method with moviepy...")
            video_clip = mp.VideoFileClip(temp_video_path)
            audio_clip = mp.AudioFileClip(self.final_audio_path)
            
            final_clip = video_clip.set_audio(audio_clip)
            final_clip.write_videofile(self.final_output_path, codec='libx264', audio_codec='aac')
            
            # Clean up
            video_clip.close()
            audio_clip.close()
            os.remove(temp_video_path)
            
            print(f"Video and audio combined successfully: {self.final_output_path}")
            return True
            
        except Exception as inner_e:
            print(f"Error in fallback method: {inner_e}")
            return False

def process_video(self):
    """Process the entire video dubbing pipeline"""
    print(f"Starting video dubbing process for {os.path.basename(self.input_video_path)}...")
    print(f"Target language: {self.language_names.get(self.target_language, self.target_language)}")
    
    # Step 1: Extract audio
    if not self.extract_audio():
        print("Failed to extract audio. Aborting.")
        return False
    
    # Step 2: Transcribe audio
    transcribed_text = self.transcribe_audio()
    if not transcribed_text:
        print("Failed to transcribe audio. Aborting.")
        return False
    
    # Step 3: Translate text
    translated_text = self.translate_text(transcribed_text)
    if not translated_text:
        print("Failed to translate text. Aborting.")
        return False
    
    # Step 4: Generate speech
    if not self.generate_speech(translated_text):
        print("Failed to generate speech. Aborting.")
        return False
    
    # Step 5: Analyze voice characteristics
    self.analyze_voice_characteristics()
    
    # Step 6: Apply voice cloning
    self.apply_voice_cloning()
    
    # Step 7: Improve audio pacing
    self.improve_audio_pacing()
    
    # Step 8: Extract video frames
    if not self.extract_video_frames():
        print("Failed to extract video frames. Skipping lip sync.")
        return self.combine_audio_video()
    
    # Step 9: Apply lip sync
    self.apply_lip_sync()
    
    # Step 10: Combine audio and video
    if not self.combine_audio_video():
        print("Failed to combine audio and video.")
        return False
    
    print(f"Video dubbing complete! Output saved to {self.final_output_path}")
    return True

def clean_up(self):
    """Clean up temporary files"""
    print("Cleaning up temporary files...")
    
    temp_files = [
        self.extracted_audio_path,
        self.processed_audio_path,
        self.generated_audio_path
    ]
    
    temp_dirs = [
        self.frames_dir,
        os.path.join(self.output_dir, "processed_frames")
    ]
    
    # Remove temp files
    for file_path in temp_files:
        if os.path.exists(file_path):
            try:
                os.remove(file_path)
            except Exception as e:
                print(f"Error removing {file_path}: {e}")
    
    # Remove temp directories
    for dir_path in temp_dirs:
        if os.path.exists(dir_path):
            try:
                shutil.rmtree(dir_path)
            except Exception as e:
                print(f"Error removing directory {dir_path}: {e}")
    
    print("Cleanup complete")


# Example usage
if __name__ == "__main__":
    import argparse
    
    parser = argparse.ArgumentParser(description='Video Dubbing Tool')
    parser.add_argument('input_video', help=r"C:\Users\Admin\Documents\AI_Project(2.0)\Input_video.mp4")
    parser.add_argument('--output_dir', '-o', default='output', help=r"C:\Users\Admin\Documents\AI_Project(2.0)\output")
    parser.add_argument('--language', '-l', default='hi', help='Target language code (e.g., hi for Hindi)')
    parser.add_argument('--keep_temp', '-k', action='store_true', help='Keep temporary files after processing')
    
    args = parser.parse_args()
    
    # Create dubber instance
    dubber = ImprovedVideoDubber(
        input_video_path=args.input_video,
        output_dir=args.output_dir,
        target_language=args.language
    )
    
    # Process video
    success = dubber.process_video()
    
    # Clean up temp files if requested
    if success and not args.keep_temp:
        dubber.clean_up()
    
    if success:
        print(f"Dubbing completed successfully! Output: {dubber.final_output_path}")
    else:
        print("Dubbing process failed.")

usage: ipykernel_launcher.py [-h] [--output_dir OUTPUT_DIR]
                             [--language LANGUAGE] [--keep_temp]
                             input_video
ipykernel_launcher.py: error: the following arguments are required: input_video


SystemExit: 2