# Wav2Lip Setup Guide - Mini(1.0) Environment

## Step 1: Create Conda Environment

```bash
# Create conda environment with Python 3.6
conda create -n "Mini(1.0)" python=3.6 -y

# Activate the environment
conda activate "Mini(1.0)"
```

## Step 2: Install Core Dependencies

```bash
# Install PyTorch (compatible with Python 3.6)
conda install pytorch=1.7.1 torchvision=0.8.2 torchaudio=0.7.2 cpuonly -c pytorch

# For GPU support (if available)
# conda install pytorch=1.7.1 torchvision=0.8.2 torchaudio=0.7.2 cudatoolkit=10.2 -c pytorch

# Install OpenCV
pip install opencv-python==4.5.5.64

# Install other required packages
pip install numpy==1.19.5
pip install scipy==1.5.4
pip install scikit-image==0.17.2
pip install pillow==8.4.0
pip install librosa==0.8.1
pip install matplotlib==3.3.4
pip install tqdm==4.64.1
pip install numba==0.53.1
```

## Step 3: Additional Dependencies

```bash
# Install face detection and alignment
pip install dlib==19.22.1
pip install face-recognition==1.3.0

# Install audio processing
pip install soundfile==0.10.3
pip install resampy==0.2.2

# Install video processing
pip install imageio==2.9.0
pip install imageio-ffmpeg==0.4.7
```

## Step 4: Download Pre-trained Models

```bash
# Create models directory
mkdir -p models

# Download Wav2Lip model (you'll need to download this manually)
# wget -O models/wav2lip_gan.pth "https://iiitaphyd-my.sharepoint.com/personal/radrabha_m_research_iiit_ac_in/_layouts/15/download.aspx?share=EdjI7bZlgApMqsVoEUUXpLsBxqXbn5z8VTmoxp2pgHDc"

# Download face detection model
wget -O models/s3fd.pth "https://www.adrianbulat.com/downloads/python-fan/s3fd-619a316812.pth"
```

## Step 5: Verify Installation

```bash
# Test imports
python -c "import torch; print('PyTorch:', torch.__version__)"
python -c "import cv2; print('OpenCV:', cv2.__version__)"
python -c "import numpy; print('NumPy:', numpy.__version__)"
python -c "import librosa; print('Librosa:', librosa.__version__)"
```

## Important Notes

1. **Python 3.6 Compatibility**: Some newer packages might not be available for Python 3.6. The versions specified above are tested to work together.

2. **Model Downloads**: You'll need to manually download the Wav2Lip GAN model from the official repository due to licensing.

3. **GPU Support**: If you have a compatible GPU, replace the CPU-only PyTorch installation with the CUDA version.

4. **Face Detection**: The S3FD model is used for robust face detection in videos.

## Environment Management

```bash
# List all conda environments
conda env list

# Activate the environment
conda activate "Mini(1.0)"

# Deactivate when done
conda deactivate

# Remove environment (if needed)
conda env remove -n "Mini(1.0)"
```

## Next Steps

After setting up the environment, you can proceed with:
1. Implementing the Wav2Lip inference code
2. Setting up preprocessing utilities
3. Creating the main sync application

The environment "Mini(1.0)" is now ready for Wav2Lip development!

 Download and install CMake from the official website
##### https://cmake.org/download/
###### OR install via conda
conda install -c anaconda cmake


# Errors
dlib  pip install dlib==19.8.1
# Alternative OpenCV installation
conda install -c conda-forge opencv

In [7]:
import cv2
print("OpenCV version:", cv2.__version__)

# Test face detection
face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
print("Face cascade loaded:", face_cascade.empty() == False)

OpenCV version: 4.5.5
Face cascade loaded: False


In [2]:
import requests
import os
from tqdm import tqdm

def download_file(url, filename):
    """Download file with progress bar"""
    response = requests.get(url, stream=True)
    total_size = int(response.headers.get('content-length', 0))
    
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    
    with open(filename, 'wb') as file, tqdm(
        desc=filename,
        total=total_size,
        unit='iB',
        unit_scale=True,
        unit_divisor=1024,
    ) as pbar:
        for data in response.iter_content(chunk_size=1024):
            size = file.write(data)
            pbar.update(size)

# Download OpenCV face detection model (smaller alternative)
print("Downloading OpenCV face detection model...")
download_file(
    "https://github.com/opencv/opencv/raw/4.x/samples/dnn/face_detector/opencv_face_detector_uint8.pb",
    "models/opencv_face_detector_uint8.pb"
)

download_file(
    "https://github.com/opencv/opencv/raw/4.x/samples/dnn/face_detector/opencv_face_detector.pbtxt",
    "models/opencv_face_detector.pbtxt"
)

print("Models downloaded successfully!")

Downloading OpenCV face detection model...


models/opencv_face_detector_uint8.pb: 278kiB [00:00, 4.17MiB/s]
models/opencv_face_detector.pbtxt: 34.2kiB [00:00, 7.07MiB/s]                  

Models downloaded successfully!





In [14]:
import cv2
import numpy as np
import os
import torch
import librosa
import subprocess

def test_face_detection():
    """Test face detection"""
    print("=== Face Detection Test ===")
    
    # Test Haar Cascade
    face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
    if not face_cascade.empty():
        print("✅ Haar Cascade face detection is working")
        return True
    else:
        print("❌ Haar Cascade face detection failed")
        return False

# Simple Wav2Lip implementation
def run_wav2lip(face_video, audio_file, model_type="gan"):
    """Run Wav2Lip on a video and audio file"""
    # Setup paths
    external_checkpoint_dir = "E:\\checkpoints"
    input_video_dir = "input_videos"
    input_audio_dir = "input_audios"
    results_dir = "results"
    
    # Available models
    models = {
        "gan": "Wav2Lip-SD-GAN.pt",
        "nogan": "Wav2Lip-SD-NOGAN.pt"
    }
    
    # Create directories
    os.makedirs(input_video_dir, exist_ok=True)
    os.makedirs(input_audio_dir, exist_ok=True)
    os.makedirs(results_dir, exist_ok=True)
    
    # Get model path
    model_path = os.path.join(external_checkpoint_dir, models.get(model_type.lower(), "Wav2Lip-SD-GAN.pt"))
    
    # Set file paths
    video_path = os.path.join(input_video_dir, face_video)
    audio_path = os.path.join(input_audio_dir, audio_file)
    output_name = f"{os.path.splitext(face_video)[0]}_{os.path.splitext(audio_file)[0]}.mp4"
    output_path = os.path.join(results_dir, output_name)
    
    # Run Wav2Lip
    cmd = f"python Wav2Lip/inference.py --checkpoint_path {model_path} --face {video_path} --audio {audio_path} --outfile {output_path}"
    print(f"Running: {cmd}")
    subprocess.run(cmd, shell=True)
    
    if os.path.exists(output_path):
        print(f"✅ Output saved to {output_path}")
        return output_path
    else:
        print(f"❌ Failed to generate output")
        return None

# Test face detection
test_face_detection()

# Example usage
# run_wav2lip("input.mp4", "audio.wav", "gan")


=== Face Detection Test ===
❌ Haar Cascade face detection failed


False

#### Clone the official repository
git clone https://github.com/Rudrabha/Wav2Lip.git
cd Wav2Lip

#### Place it in: models/wav2lip_gan.pth

#### Clone the official repository
git clone https://github.com/Rudrabha/Wav2Lip.git
cd Wav2Lip

#### Place it in: models/wav2lip_gan.pth

#### Clone the official repository
git clone https://github.com/Rudrabha/Wav2Lip.git
cd Wav2Lip

#### Place it in: models/wav2lip_gan.pth

#### Clone the official repository
git clone https://github.com/Rudrabha/Wav2Lip.git
cd Wav2Lip

#### Place it in: models/wav2lip_gan.pth

#### Clone the official repository
git clone https://github.com/Rudrabha/Wav2Lip.git
cd Wav2Lip

#### Place it in: models/wav2lip_gan.pth

In [15]:
import cv2
import os
import subprocess

def test_face_detection():
    """Test face detection with proper path"""
    print("=== Face Detection Test ===")
    
    # Try to load Haar Cascade from standard location
    face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
    
    # If that fails, try to load from models directory
    if face_cascade.empty():
        model_path = "models/haarcascade_frontalface_default.xml"
        if os.path.exists(model_path):
            face_cascade = cv2.CascadeClassifier(model_path)
            
    if not face_cascade.empty():
        print("✅ Haar Cascade face detection is working")
        return face_cascade
    else:
        print("❌ Haar Cascade face detection failed")
        print("Downloading Haar Cascade model...")
        
        # Create models directory if it doesn't exist
        os.makedirs("models", exist_ok=True)
        
        # Download Haar Cascade model
        import urllib.request
        url = "https://raw.githubusercontent.com/opencv/opencv/master/data/haarcascades/haarcascade_frontalface_default.xml"
        model_path = "models/haarcascade_frontalface_default.xml"
        urllib.request.urlretrieve(url, model_path)
        
        # Try loading again
        face_cascade = cv2.CascadeClassifier(model_path)
        if not face_cascade.empty():
            print("✅ Haar Cascade face detection is now working")
            return face_cascade
        else:
            print("❌ Still failed to load Haar Cascade")
            return None

# Run Wav2Lip with face detection
def run_wav2lip(face_video, audio_file, model_type="gan"):
    """Run Wav2Lip on a video and audio file"""
    # Setup paths
    external_checkpoint_dir = "E:\\checkpoints"
    input_video_dir = "input_videos"
    input_audio_dir = "input_audios"
    results_dir = "results"
    
    # Available models
    models = {
        "gan": "Wav2Lip-SD-GAN.pt",
        "nogan": "Wav2Lip-SD-NOGAN.pt"
    }
    
    # Create directories
    os.makedirs(input_video_dir, exist_ok=True)
    os.makedirs(input_audio_dir, exist_ok=True)
    os.makedirs(results_dir, exist_ok=True)
    
    # Get model path
    model_path = os.path.join(external_checkpoint_dir, models.get(model_type.lower(), "Wav2Lip-SD-GAN.pt"))
    
    # Set file paths
    video_path = os.path.join(input_video_dir, face_video)
    audio_path = os.path.join(input_audio_dir, audio_file)
    output_name = f"{os.path.splitext(face_video)[0]}_{os.path.splitext(audio_file)[0]}.mp4"
    output_path = os.path.join(results_dir, output_name)
    
    # Run Wav2Lip
    cmd = f"python Wav2Lip/inference.py --checkpoint_path {model_path} --face {video_path} --audio {audio_path} --outfile {output_path}"
    print(f"Running: {cmd}")
    subprocess.run(cmd, shell=True)
    
    if os.path.exists(output_path):
        print(f"✅ Output saved to {output_path}")
        return output_path
    else:
        print(f"❌ Failed to generate output")
        return None

# Test face detection
face_detector = test_face_detection()

# Example usage
# run_wav2lip("input.mp4", "audio.wav", "gan")


=== Face Detection Test ===
✅ Haar Cascade face detection is working


In [1]:
import subprocess

# Define paths
input_face = r"E:\MINI_0.1\results\video_only.mp4"
input_audio = r"E:\MINI_0.1\input_audios\telugu_audio.wav"
output_video = r"E:\MINI_0.1\results\final_telugu_video.mp4"
checkpoint_path = r"E:\checkpoints\Wav2Lip-SD-GAN.pt"

# Run Wav2Lip
command = [
    "python", 
    "Wav2Lip/inference.py",
    "--checkpoint_path", checkpoint_path,
    "--face", input_face,
    "--audio", input_audio,
    "--outfile", output_video,
    "--pads", "0", "0", "0", "0",
    "--nosmooth"
]

print("Running Wav2Lip with custom checkpoint. This may take some time...")
subprocess.run(command)
print(f"Lip-synced video saved to {output_video}")


Running Wav2Lip with custom checkpoint. This may take some time...
Lip-synced video saved to E:\MINI_0.1\results\final_telugu_video.mp4


In [2]:
# test_env.py
import sys
print(f"Python version: {sys.version}")

try:
    import numpy
    print(f"NumPy version: {numpy.__version__}")
except ImportError:
    print("NumPy not installed. Installing...")
    import subprocess
    subprocess.check_call([sys.executable, "-m", "pip", "install", "numpy==1.19.5"])

try:
    import cv2
    print(f"OpenCV version: {cv2.__version__}")
except ImportError:
    print("OpenCV not installed or DLL error. Installing via conda...")
    import subprocess
    subprocess.check_call(["conda", "install", "-c", "conda-forge", "opencv=4.5.5", "-y"])

try:
    import scipy
    print(f"SciPy version: {scipy.__version__}")
except ImportError:
    print("SciPy not installed. Installing...")
    import subprocess
    subprocess.check_call([sys.executable, "-m", "pip", "install", "scipy==1.5.4"])

print("Environment check complete. If no errors above, you should be able to run Wav2Lip.")


Python version: 3.6.13 |Anaconda, Inc.| (default, Mar 16 2021, 11:37:27) [MSC v.1916 64 bit (AMD64)]
NumPy version: 1.19.5
OpenCV not installed or DLL error. Installing via conda...


FileNotFoundError: [WinError 2] The system cannot find the file specified

In [3]:
# fix_face_detection.py
import os
import cv2
import numpy as np
import subprocess

# Define paths
input_video = r"E:\MINI_0.1\results\video_only.mp4"
input_audio = r"E:\MINI_0.1\input_audios\telugu_audio.wav"
output_video = r"E:\MINI_0.1\results\final_telugu_video.mp4"
checkpoint_path = r"E:\checkpoints\Wav2Lip-SD-GAN.pt"

# Extract first frame to detect face manually
def extract_first_frame():
    cap = cv2.VideoCapture(input_video)
    ret, frame = cap.read()
    cap.release()
    
    if ret:
        # Save first frame
        first_frame_path = r"E:\MINI_0.1\results\first_frame.jpg"
        cv2.imwrite(first_frame_path, frame)
        print(f"First frame saved to {first_frame_path}")
        
        # Get frame dimensions
        height, width = frame.shape[:2]
        print(f"Frame dimensions: {width}x{height}")
        
        # Suggest a default face box (center of the frame)
        center_x, center_y = width // 2, height // 2
        box_size = min(width, height) // 2
        
        # Define box coordinates [y1, y2, x1, x2]
        box = [
            center_y - box_size//2,  # y1 (top)
            center_y + box_size//2,  # y2 (bottom)
            center_x - box_size//2,  # x1 (left)
            center_x + box_size//2   # x2 (right)
        ]
        
        return first_frame_path, box
    
    return None, None

# Run Wav2Lip with box parameter
def run_wav2lip_with_box(box):
    # Format box for command line
    box_str = " ".join(map(str, box))
    
    # Run Wav2Lip with box parameter
    command = [
        "python", 
        "inference.py",
        "--checkpoint_path", checkpoint_path,
        "--face", input_video,
        "--audio", input_audio,
        "--outfile", output_video,
        "--box", *map(str, box),
        "--nosmooth"
    ]
    
    print("Running Wav2Lip with manual face box...")
    print(" ".join(command))
    subprocess.run(command)

# Main execution
first_frame_path, suggested_box = extract_first_frame()
if first_frame_path:
    print(f"Suggested face box: {suggested_box}")
    print("Please verify this box in the first frame image.")
    print("If needed, adjust the box coordinates and run:")
    print(f"python inference.py --checkpoint_path \"{checkpoint_path}\" --face \"{input_video}\" --audio \"{input_audio}\" --outfile \"{output_video}\" --box {suggested_box[0]} {suggested_box[1]} {suggested_box[2]} {suggested_box[3]} --nosmooth")
    
    # Uncomment to run automatically with suggested box
    # run_wav2lip_with_box(suggested_box)


First frame saved to E:\MINI_0.1\results\first_frame.jpg
Frame dimensions: 848x480
Suggested face box: [120, 360, 304, 544]
Please verify this box in the first frame image.
If needed, adjust the box coordinates and run:
python inference.py --checkpoint_path "E:\checkpoints\Wav2Lip-SD-GAN.pt" --face "E:\MINI_0.1\results\video_only.mp4" --audio "E:\MINI_0.1\input_audios\telugu_audio.wav" --outfile "E:\MINI_0.1\results\final_telugu_video.mp4" --box 120 360 304 544 --nosmooth


In [5]:
# simple_wav2lip.py
import cv2
import numpy as np
import os
import subprocess

# Define paths
input_video = r"E:\MINI_0.1\results\video_only.mp4"
input_audio = r"E:\MINI_0.1\input_audios\telugu_audio.wav"
output_video = r"E:\MINI_0.1\results\final_telugu_video.mp4"
checkpoint_path = r"E:\checkpoints\Wav2Lip-SD-GAN.pt"

# Create a simple manual box for the face
def create_manual_box():
    # Extract first frame
    cap = cv2.VideoCapture(input_video)
    ret, frame = cap.read()
    cap.release()
    
    if ret:
        # Get frame dimensions
        height, width = frame.shape[:2]
        
        # Create a box in the center of the frame (50% of frame size)
        # Format: [y1, y2, x1, x2] - [top, bottom, left, right]
        box = [
            height//4,           # y1 (top)
            height*3//4,         # y2 (bottom)
            width//4,            # x1 (left)
            width*3//4           # x2 (right)
        ]
        
        print(f"Frame size: {width}x{height}")
        print(f"Using manual box: {box}")
        return box
    else:
        # Default box if video can't be read
        return [0, 480, 0, 640]

# Run Wav2Lip with manual box
box = create_manual_box()

# Create command with smaller batch size and manual box
command = [
    "python", 
    "inference.py",
    "--checkpoint_path", checkpoint_path,
    "--face", input_video,
    "--audio", input_audio,
    "--outfile", output_video,
    "--box", str(box[0]), str(box[1]), str(box[2]), str(box[3]),
    "--wav2lip_batch_size", "32",  # Smaller batch size to avoid memory issues
    "--nosmooth"
]

print("Running Wav2Lip with manual face box...")
print(" ".join(command))
subprocess.run(command)


Frame size: 848x480
Using manual box: [120, 360, 212, 636]
Running Wav2Lip with manual face box...
python inference.py --checkpoint_path E:\checkpoints\Wav2Lip-SD-GAN.pt --face E:\MINI_0.1\results\video_only.mp4 --audio E:\MINI_0.1\input_audios\telugu_audio.wav --outfile E:\MINI_0.1\results\final_telugu_video.mp4 --box 120 360 212 636 --wav2lip_batch_size 32 --nosmooth


CompletedProcess(args=['python', 'inference.py', '--checkpoint_path', 'E:\\checkpoints\\Wav2Lip-SD-GAN.pt', '--face', 'E:\\MINI_0.1\\results\\video_only.mp4', '--audio', 'E:\\MINI_0.1\\input_audios\\telugu_audio.wav', '--outfile', 'E:\\MINI_0.1\\results\\final_telugu_video.mp4', '--box', '120', '360', '212', '636', '--wav2lip_batch_size', '32', '--nosmooth'], returncode=2)

In [None]:
# Run Wav2Lip with fixed script
def run_fixed_wav2lip():
    """Run the fixed Wav2Lip implementation"""
    input_video = r"E:\MINI_0.1\results\video_only.mp4"
    input_audio = r"E:\MINI_0.1\input_audios\telugu_audio.wav"
    output_video = r"E:\MINI_0.1\results\final_telugu_video.mp4"
    
    # Run the batch file
    subprocess.call([r"E:\MINI_0.1\run_wav2lip_fixed.bat"])
    
    # Check if output exists
    if os.path.exists(output_video):
        print(f"✅ Wav2Lip successful! Output saved to: {output_video}")
        
        # Get video info
        cap = cv2.VideoCapture(output_video)
        if cap.isOpened():
            width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
            height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
            fps = cap.get(cv2.CAP_PROP_FPS)
            frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            duration = frame_count / fps
            
            print(f"Video resolution: {width}x{height}")
            print(f"FPS: {fps}")
            print(f"Duration: {duration:.2f} seconds")
            print(f"Total frames: {frame_count}")
            
            cap.release()
        return True
    else:
        print("❌ Wav2Lip failed. Check the error messages.")
        return False

# Run the function
run_fixed_wav2lip()


In [1]:
# Create a PowerShell script to run Wav2Lip
with open("E:\\MINI_0.1\\run_wav2lip.ps1", "w") as f:
    f.write("""
# PowerShell script to run Wav2Lip
Write-Host "Running Wav2Lip with fixed script..." -ForegroundColor Green

# Change to Wav2Lip directory
Set-Location -Path "E:\\MINI_0.1\\wav2lip"

# Run the command
python inference_fixed.py `
--checkpoint_path "E:\\checkpoints\\Wav2Lip-SD-GAN.pt" `
--face "E:\\MINI_0.1\\results\\video_only.mp4" `
--audio "E:\\MINI_0.1\\input_audios\\telugu_audio.wav" `
--outfile "E:\\MINI_0.1\\results\\final_telugu_video.mp4" `
--box 120 360 212 636 `
--wav2lip_batch_size 16 `
--nosmooth

# Check if successful
if ($LASTEXITCODE -eq 0) {
    Write-Host "Success! Output saved to: E:\\MINI_0.1\\results\\final_telugu_video.mp4" -ForegroundColor Green
} else {
    Write-Host "Failed to generate output video. Check errors above." -ForegroundColor Red
}

Write-Host "Press any key to continue..."
$null = $Host.UI.RawUI.ReadKey("NoEcho,IncludeKeyDown")
""")

print("PowerShell script created at E:\\MINI_0.1\\run_wav2lip.ps1")
print("Run it by right-clicking and selecting 'Run with PowerShell'")


PowerShell script created at E:\MINI_0.1\run_wav2lip.ps1
Run it by right-clicking and selecting 'Run with PowerShell'


In [1]:
# Create a CMD script to run Wav2Lip
with open("E:\\MINI_0.1\\run_wav2lip.cmd", "w") as f:
    f.write("""@echo off
echo Running Wav2Lip with fixed script...

cd /d E:\\MINI_0.1\\wav2lip

python inference_fixed.py ^
--checkpoint_path "E:\\checkpoints\\Wav2Lip-SD-GAN.pt" ^
--face "E:\\MINI_0.1\\results\\video_only.mp4" ^
--audio "E:\\MINI_0.1\\input_audios\\telugu_audio.wav" ^
--outfile "E:\\MINI_0.1\\results\\final_telugu_video.mp4" ^
--box 120 360 212 636 ^
--wav2lip_batch_size 16 ^
--nosmooth

if %ERRORLEVEL% EQU 0 (
    echo Success! Output saved to: E:\\MINI_0.1\\results\\final_telugu_video.mp4
) else (
    echo Failed to generate output video. Check errors above.
)

pause
""")

print("CMD script created at E:\\MINI_0.1\\run_wav2lip.cmd")
print("Run it by double-clicking the file in Windows Explorer")


CMD script created at E:\MINI_0.1\run_wav2lip.cmd
Run it by double-clicking the file in Windows Explorer
