<a href="https://colab.research.google.com/github/MohamedAziz15/Lip-Sync/blob/main/Lip_sync_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Step 1: Set Up the Environment

# Install necessary libraries
!pip install torch torchvision torchaudio opencv-python ffmpeg-python

# Clone Wav2Lip repository
!git clone https://github.com/Rudrabha/Wav2Lip.git
%cd Wav2Lip

# Download pre-trained model weights
!gdown --id 1rwFhD1lzrUXJYFjT9xKE7KXbz0CSJ8iI -O checkpoints/wav2lip.pth

# Install additional dependencies
!pip install -r requirements.txt

Collecting ffmpeg-python
  Downloading ffmpeg_python-0.2.0-py3-none-any.whl (25 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cac

In [3]:
# Step 2: Preprocess the Data

import cv2
import ffmpeg
import librosa
import numpy as np
import torch
from models import Wav2Lip

# Function to extract frames from video
def extract_frames(video_path):
    vidcap = cv2.VideoCapture(video_path)
    frames = []
    success, image = vidcap.read()
    while success:
        frames.append(image)
        success, image = vidcap.read()
    return frames

# Function to extract audio from video
def extract_audio(video_path):
    audio_path = 'audio.wav'
    (
        ffmpeg
        .input(video_path)
        .output(audio_path)
        .run(overwrite_output=True)
    )
    return audio_path

# Function to prepare audio features
def get_audio_features(audio_path):
    audio, sr = librosa.load(audio_path, sr=16000)
    mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13)
    return mfcc

video_path = '/content/drive/MyDrive/Colab Notebooks/Diverge/13_K.mp4'
frames = extract_frames(video_path)
audio_path = extract_audio(video_path)
audio_features = get_audio_features(audio_path)

In [None]:

# model = Wav2Lip().to('cuda' if torch.cuda.is_available() else 'cpu')
# checkpoint = torch.load('/content/drive/MyDrive/Colab Notebooks/Diverge/wav2lip.pth', map_location='cpu')


# model.load_state_dict(checkpoint['state_dict'])
# model.eval()


# Step 9: Load the Pre-trained Model

model = Wav2Lip().to('cuda' if torch.cuda.is_available() else 'cpu')
checkpoint = torch.load('/content/drive/MyDrive/Colab Notebooks/Diverge/wav2lip.pth', map_location='cpu')

# Modify the keys in the checkpoint state dictionary to remove the 'module.' prefix
new_state_dict = {}
for k, v in checkpoint['state_dict'].items():
    name = k[7:] # remove 'module.'
    new_state_dict[name] = v

model.load_state_dict(new_state_dict) # Load the modified state dictionary
model.eval()

In [14]:
# Step 3: Load and Run the Model

# Function to generate lip-synced frames
def generate_lip_synced_frames(model, frames, audio_features):
    synced_frames = []
    for i in range(len(frames)):
        frame = frames[i]
        audio_feature = audio_features[:, i:i+80]  # Assuming 80 frames per second
        audio_feature = torch.FloatTensor(audio_feature).unsqueeze(0).to('cuda')
        #frame = torch.FloatTensor(frame).unsqueeze(0).to('cuda')
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY) # Convert to grayscale
        frame = torch.FloatTensor(frame).unsqueeze(0).to('cuda')
        # frame = torch.FloatTensor(frame).unsqueeze(0).permute(0, 3, 1, 2).to('cuda')

        synced_frame = model(frame, audio_feature)
        synced_frames.append(synced_frame.cpu().numpy())
    return synced_frames
synced_frames = generate_lip_synced_frames(model, frames, audio_features)


ValueError: expected 4D input (got 3D input)

In [5]:
# Step 3: Load and Run the Model

# model = Wav2Lip().to('cuda' if torch.cuda.is_available() else 'cpu')
# checkpoint = torch.load('/content/drive/MyDrive/Colab Notebooks/Diverge/wav2lip.pth', map_location='cpu')
# model.load_state_dict(checkpoint['state_dict'])

# model.load_state_dict({k.replace("module.", ""): v for k, v in checkpoint.items()})


# model_save_location = "/content/drive/MyDrive/Colab Notebooks/Diverge/wav2lip.pth"

# state_dict = torch.load(model_save_location, map_location='cpu')
# model.load_state_dict({k.replace("module.", ""): v for k, v in state_dict.items()})


# # Function to generate lip-synced frames
# def generate_lip_synced_frames(model, frames, audio_features):
#     synced_frames = []
#     for i in range(len(frames)):
#         frame = frames[i]
#         audio_feature = audio_features[:, i:i+80]  # Assuming 80 frames per second
#         audio_feature = torch.FloatTensor(audio_feature).unsqueeze(0).to('cuda')
#         frame = torch.FloatTensor(frame).unsqueeze(0).to('cuda')
#         synced_frame = model(frame, audio_feature)
#         synced_frames.append(synced_frame.cpu().numpy())
#     return synced_frames
# synced_frames = generate_lip_synced_frames(model, frames, audio_features)
##########################################################################################
# Step 3: Load and Run the Model

# ... (previous code)

# # Function to generate lip-synced frames
# def generate_lip_synced_frames(model, frames, audio_features):
#     synced_frames = []
#     for i in range(len(frames)):
#         frame = frames[i]
#         audio_feature = audio_features[:, i:i+80]  # Assuming 80 frames per second
#         audio_feature = torch.FloatTensor(audio_feature).unsqueeze(0).to('cuda')

#         # Preprocess the frame to have the expected number of channels (1)
#         frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY) # Convert to grayscale
#         frame = torch.FloatTensor(frame).unsqueeze(0).unsqueeze(0).to('cuda') # Add channel dimension

#         synced_frame = model(frame, audio_feature)
#         synced_frames.append(synced_frame.cpu().numpy())
#     return synced_frames

# synced_frames = generate_lip_synced_frames(model, frames, audio_features)

# Step 3: Load and Run the Model

# ... (previous code)

# # Function to generate lip-synced frames
# def generate_lip_synced_frames(model, frames, audio_features):
#     synced_frames = []
#     for i in range(len(frames)):
#         frame = frames[i]
#         audio_feature = audio_features[:, i:i+80]  # Assuming 80 frames per second
#         audio_feature = torch.FloatTensor(audio_feature).unsqueeze(0).to('cuda')

#         # Preprocess the frame to have the expected number of channels (3 for color images)
#         # If the model expects a different number of channels, adjust this accordingly
#         frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY) # Convert to grayscale
#         frame = torch.FloatTensor(frame).permute(0,3 ).to('cuda') # Add channel dimension and permute to (batch, channels, height, width)

#         synced_frame = model(frame, audio_feature)
#         synced_frames.append(synced_frame.cpu().numpy())
#     return synced_frames

# synced_frames = generate_lip_synced_frames(model, frames, audio_features)



In [None]:
# Step 4: Combine Frames and Audio to Generate the Final Video

# Function to save video from frames
def save_video(frames, output_path, fps=25):
    height, width, layers = frames[0].shape
    video = cv2.VideoWriter(output_path, cv2.VideoWriter_fourcc(*'MP4V'), fps, (width, height))
    for frame in frames:
        video.write(frame)
    video.release()

output_video_path = 'output_video.mp4'
save_video(synced_frames, output_video_path)

# Add audio to the video
output_video_with_audio = 'output_video_with_audio.mp4'
(
    ffmpeg
    .input(output_video_path)
    .input(audio_path)
    .output(output_video_with_audio)
    .run(overwrite_output=True)
)

# Step 5: Evaluate and Fine-Tune

print("Lip-syncing completed and saved to", output_video_with_audio)
