<a href="https://colab.research.google.com/github/MohamedAziz15/Lip-Sync/blob/main/Lip_sync_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Step 1: Set Up the Environment

# Install necessary libraries
!pip install torch torchvision torchaudio opencv-python ffmpeg-python

# Clone Wav2Lip repository
!git clone https://github.com/Rudrabha/Wav2Lip.git
%cd Wav2Lip

# Download pre-trained model weights
!gdown --id 1rwFhD1lzrUXJYFjT9xKE7KXbz0CSJ8iI -O checkpoints/wav2lip.pth

# Install additional dependencies
!pip install -r requirements.txt

Collecting ffmpeg-python
  Downloading ffmpeg_python-0.2.0-py3-none-any.whl (25 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cac

In [3]:
pip install ffmpeg-python



In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### handle video as Frames

In [None]:
# The following two methods are taken from this tutorial:
# https://www.tensorflow.org/hub/tutorials/action_recognition_with_tf_hub
IMG_SIZE = 224


def crop_center_square(frame):
    y, x = frame.shape[0:2]
    min_dim = min(y, x)
    start_x = (x // 2) - (min_dim // 2)
    start_y = (y // 2) - (min_dim // 2)
    return frame[start_y : start_y + min_dim, start_x : start_x + min_dim]


def load_video(path, max_frames=0, resize=(IMG_SIZE, IMG_SIZE)):
    cap = cv2.VideoCapture(path)
    frames = []
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frame = crop_center_square(frame)
            frame = cv2.resize(frame, resize)
            frame = frame[:, :, [2, 1, 0]]
            frames.append(frame)

            if len(frames) == max_frames:
                break
    finally:
        cap.release()
    return np.array(frames)

In [None]:
def prepare_all_videos(df, root_dir):
    num_samples = len(df)
    video_paths = df["video_name"].values.tolist()

    ##take all classlabels from train_df column named 'tag' and store in labels
    labels = df["tag"].values

    #convert classlabels to label encoding
    labels = label_processor(labels[..., None]).numpy()

    # `frame_masks` and `frame_features` are what we will feed to our sequence model.
    # `frame_masks` will contain a bunch of booleans denoting if a timestep is
    # masked with padding or not.
    frame_masks = np.zeros(shape=(num_samples, MAX_SEQ_LENGTH), dtype="bool") # 145,20
    frame_features = np.zeros(shape=(num_samples, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32") #145,20,2048

    # For each video.
    for idx, path in enumerate(video_paths):
        # Gather all its frames and add a batch dimension.
        frames = load_video(os.path.join(root_dir, path))
        frames = frames[None, ...]

        # Initialize placeholders to store the masks and features of the current video.
        temp_frame_mask = np.zeros(shape=(1, MAX_SEQ_LENGTH,), dtype="bool")
        temp_frame_features = np.zeros(
            shape=(1, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32"
        )

        # Extract features from the frames of the current video.
        for i, batch in enumerate(frames):
            video_length = batch.shape[0]
            length = min(MAX_SEQ_LENGTH, video_length)
            for j in range(length):
                temp_frame_features[i, j, :] = feature_extractor.predict(
                    batch[None, j, :]
                )
            temp_frame_mask[i, :length] = 1  # 1 = not masked, 0 = masked

        frame_features[idx,] = temp_frame_features.squeeze()
        frame_masks[idx,] = temp_frame_mask.squeeze()

    return (frame_features, frame_masks), labels


train_data, train_labels = prepare_all_videos(train_df, "train")
test_data, test_labels = prepare_all_videos(test_df, "test")

print(f"Frame features in train set: {train_data[0].shape}")
print(f"Frame masks in train set: {train_data[1].shape}")



print(f"train_labels in train set: {train_labels.shape}")

print(f"test_labels in train set: {test_labels.shape}")

# MAX_SEQ_LENGTH = 20, NUM_FEATURES = 2048. We have defined this above under hyper parameters

In [5]:
# Step 2: Preprocess the Data

import cv2
import ffmpeg
import librosa
import numpy as np
import torch
from models import Wav2Lip

# Function to extract frames from video
def extract_frames(video_path):
    vidcap = cv2.VideoCapture(video_path)
    frames = []
    success, image = vidcap.read()
    while success:
        frames.append(image)
        success, image = vidcap.read()
    return frames

# Function to extract audio from video
def extract_audio(video_path):
    audio_path = 'audio.wav'
    (
        ffmpeg
        .input(video_path)
        .output(audio_path)
        .run(overwrite_output=True)
    )
    return audio_path

# Function to prepare audio features
def get_audio_features(audio_path):
    audio, sr = librosa.load(audio_path, sr=16000)
    mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13)
    return mfcc

video_path = '/content/drive/MyDrive/Colab Notebooks/Diverge/Resized_13_K.mp4'
frames = extract_frames(video_path)
audio_path = extract_audio(video_path)
audio_features = get_audio_features(audio_path)

In [8]:
#@title <h1>Step1: Setup Wav2Lip</h1>
#@markdown * Install dependency
#@markdown * Download pretrained model
from IPython.display import HTML, clear_output
!rm -rf /content/sample_data
!mkdir /content/sample_data

!git clone https://github.com/justinjohn0306/Wav2Lip

%cd /content/Wav2Lip

#download the pretrained model
!wget 'https://github.com/justinjohn0306/Wav2Lip/releases/download/models/wav2lip.pth' -O 'checkpoints/wav2lip.pth'
!wget 'https://github.com/justinjohn0306/Wav2Lip/releases/download/models/wav2lip_gan.pth' -O 'checkpoints/wav2lip_gan.pth'
!wget 'https://github.com/justinjohn0306/Wav2Lip/releases/download/models/resnet50.pth' -O 'checkpoints/resnet50.pth'
!wget 'https://github.com/justinjohn0306/Wav2Lip/releases/download/models/mobilenet.pth' -O 'checkpoints/mobilenet.pth'
a = !pip install https://raw.githubusercontent.com/AwaleSajil/ghc/master/ghc-1.0-py3-none-any.whl
!pip install git+https://github.com/elliottzheng/batch-face.git@master

!pip install ffmpeg-python mediapipe==0.8.11

#this code for recording audio
"""
To write this piece of code I took inspiration/code from a lot of places.
It was late night, so I'm not sure how much I created or just copied o.O
Here are some of the possible references:
https://blog.addpipe.com/recording-audio-in-the-browser-using-pure-html5-and-minimal-javascript/
https://stackoverflow.com/a/18650249
https://hacks.mozilla.org/2014/06/easy-audio-capture-with-the-mediarecorder-api/
https://air.ghost.io/recording-to-an-audio-file-using-html5-and-js/
https://stackoverflow.com/a/49019356
"""
from IPython.display import HTML, Audio
from google.colab.output import eval_js
from base64 import b64decode
import numpy as np
from scipy.io.wavfile import read as wav_read
import io
import ffmpeg

AUDIO_HTML = """
<script>
var my_div = document.createElement("DIV");
var my_p = document.createElement("P");
var my_btn = document.createElement("BUTTON");
var t = document.createTextNode("Press to start recording");

my_btn.appendChild(t);
//my_p.appendChild(my_btn);
my_div.appendChild(my_btn);
document.body.appendChild(my_div);

var base64data = 0;
var reader;
var recorder, gumStream;
var recordButton = my_btn;

var handleSuccess = function(stream) {
  gumStream = stream;
  var options = {
    //bitsPerSecond: 8000, //chrome seems to ignore, always 48k
    mimeType : 'audio/webm;codecs=opus'
    //mimeType : 'audio/webm;codecs=pcm'
  };
  //recorder = new MediaRecorder(stream, options);
  recorder = new MediaRecorder(stream);
  recorder.ondataavailable = function(e) {
    var url = URL.createObjectURL(e.data);
    var preview = document.createElement('audio');
    preview.controls = true;
    preview.src = url;
    document.body.appendChild(preview);

    reader = new FileReader();
    reader.readAsDataURL(e.data);
    reader.onloadend = function() {
      base64data = reader.result;
      //console.log("Inside FileReader:" + base64data);
    }
  };
  recorder.start();
  };

recordButton.innerText = "Recording... press to stop";

navigator.mediaDevices.getUserMedia({audio: true}).then(handleSuccess);


function toggleRecording() {
  if (recorder && recorder.state == "recording") {
      recorder.stop();
      gumStream.getAudioTracks()[0].stop();
      recordButton.innerText = "Saving the recording... pls wait!"
  }
}

// https://stackoverflow.com/a/951057
function sleep(ms) {
  return new Promise(resolve => setTimeout(resolve, ms));
}

var data = new Promise(resolve=>{
//recordButton.addEventListener("click", toggleRecording);
recordButton.onclick = ()=>{
toggleRecording()

sleep(2000).then(() => {
  // wait 2000ms for the data to be available...
  // ideally this should use something like await...
  //console.log("Inside data:" + base64data)
  resolve(base64data.toString())

});

}
});

</script>
"""

%cd /
from ghc.l_ghc_cf import l_ghc_cf
%cd content

def get_audio():
  display(HTML(AUDIO_HTML))
  data = eval_js("data")
  binary = b64decode(data.split(',')[1])

  process = (ffmpeg
    .input('pipe:0')
    .output('pipe:1', format='wav')
    .run_async(pipe_stdin=True, pipe_stdout=True, pipe_stderr=True, quiet=True, overwrite_output=True)
  )
  output, err = process.communicate(input=binary)

  riff_chunk_size = len(output) - 8
  # Break up the chunk size into four bytes, held in b.
  q = riff_chunk_size
  b = []
  for i in range(4):
      q, r = divmod(q, 256)
      b.append(r)

  # Replace bytes 4:8 in proc.stdout with the actual size of the RIFF chunk.
  riff = output[:4] + bytes(b) + output[8:]

  sr, audio = wav_read(io.BytesIO(riff))

  return audio, sr


from IPython.display import HTML
from base64 import b64encode
def showVideo(path):
  mp4 = open(str(path),'rb').read()
  data_url = "data:video/mp4;base64," + b64encode(mp4).decode()
  return HTML("""
  <video width=700 controls>
        <source src="%s" type="video/mp4">
  </video>
  """ % data_url)

from IPython.display import clear_output

clear_output()
print("All set and ready!")

All set and ready!


In [11]:

model = Wav2Lip().to('cuda' if torch.cuda.is_available() else 'cpu')
checkpoint = torch.load('/content/Wav2Lip/checkpoints/wav2lip.pth', map_location='cpu')
model.load_state_dict(checkpoint['state_dict'])
model.eval()


# Step 9: Load the Pre-trained Model

# model = Wav2Lip().to('cuda' if torch.cuda.is_available() else 'cpu')
# checkpoint = torch.load('/content/drive/MyDrive/Colab Notebooks/Diverge/wav2lip.pth', map_location='cpu')

# # Modify the keys in the checkpoint state dictionary to remove the 'module.' prefix
# new_state_dict = {}
# for k, v in checkpoint['state_dict'].items():
#     name = k[7:] # remove 'module.'
#     new_state_dict[name] = v

# model.load_state_dict(new_state_dict) # Load the modified state dictionary
# model.eval()

RuntimeError: Error(s) in loading state_dict for Wav2Lip:
	Missing key(s) in state_dict: "face_encoder_blocks.0.0.conv_block.0.weight", "face_encoder_blocks.0.0.conv_block.0.bias", "face_encoder_blocks.0.0.conv_block.1.weight", "face_encoder_blocks.0.0.conv_block.1.bias", "face_encoder_blocks.0.0.conv_block.1.running_mean", "face_encoder_blocks.0.0.conv_block.1.running_var", "face_encoder_blocks.1.0.conv_block.0.weight", "face_encoder_blocks.1.0.conv_block.0.bias", "face_encoder_blocks.1.0.conv_block.1.weight", "face_encoder_blocks.1.0.conv_block.1.bias", "face_encoder_blocks.1.0.conv_block.1.running_mean", "face_encoder_blocks.1.0.conv_block.1.running_var", "face_encoder_blocks.1.1.conv_block.0.weight", "face_encoder_blocks.1.1.conv_block.0.bias", "face_encoder_blocks.1.1.conv_block.1.weight", "face_encoder_blocks.1.1.conv_block.1.bias", "face_encoder_blocks.1.1.conv_block.1.running_mean", "face_encoder_blocks.1.1.conv_block.1.running_var", "face_encoder_blocks.1.2.conv_block.0.weight", "face_encoder_blocks.1.2.conv_block.0.bias", "face_encoder_blocks.1.2.conv_block.1.weight", "face_encoder_blocks.1.2.conv_block.1.bias", "face_encoder_blocks.1.2.conv_block.1.running_mean", "face_encoder_blocks.1.2.conv_block.1.running_var", "face_encoder_blocks.2.0.conv_block.0.weight", "face_encoder_blocks.2.0.conv_block.0.bias", "face_encoder_blocks.2.0.conv_block.1.weight", "face_encoder_blocks.2.0.conv_block.1.bias", "face_encoder_blocks.2.0.conv_block.1.running_mean", "face_encoder_blocks.2.0.conv_block.1.running_var", "face_encoder_blocks.2.1.conv_block.0.weight", "face_encoder_blocks.2.1.conv_block.0.bias", "face_encoder_blocks.2.1.conv_block.1.weight", "face_encoder_blocks.2.1.conv_block.1.bias", "face_encoder_blocks.2.1.conv_block.1.running_mean", "face_encoder_blocks.2.1.conv_block.1.running_var", "face_encoder_blocks.2.2.conv_block.0.weight", "face_encoder_blocks.2.2.conv_block.0.bias", "face_encoder_blocks.2.2.conv_block.1.weight", "face_encoder_blocks.2.2.conv_block.1.bias", "face_encoder_blocks.2.2.conv_block.1.running_mean", "face_encoder_blocks.2.2.conv_block.1.running_var", "face_encoder_blocks.2.3.conv_block.0.weight", "face_encoder_blocks.2.3.conv_block.0.bias", "face_encoder_blocks.2.3.conv_block.1.weight", "face_encoder_blocks.2.3.conv_block.1.bias", "face_encoder_blocks.2.3.conv_block.1.running_mean", "face_encoder_blocks.2.3.conv_block.1.running_var", "face_encoder_blocks.3.0.conv_block.0.weight", "face_encoder_blocks.3.0.conv_block.0.bias", "face_encoder_blocks.3.0.conv_block.1.weight", "face_encoder_blocks.3.0.conv_block.1.bias", "face_encoder_blocks.3.0.conv_block.1.running_mean", "face_encoder_blocks.3.0.conv_block.1.running_var", "face_encoder_blocks.3.1.conv_block.0.weight", "face_encoder_blocks.3.1.conv_block.0.bias", "face_encoder_blocks.3.1.conv_block.1.weight", "face_encoder_blocks.3.1.conv_block.1.bias", "face_encoder_blocks.3.1.conv_block.1.running_mean", "face_encoder_blocks.3.1.conv_block.1.running_var", "face_encoder_blocks.3.2.conv_block.0.weight", "face_encoder_blocks.3.2.conv_block.0.bias", "face_encoder_blocks.3.2.conv_block.1.weight", "face_encoder_blocks.3.2.conv_block.1.bias", "face_encoder_blocks.3.2.conv_block.1.running_mean", "face_encoder_blocks.3.2.conv_block.1.running_var", "face_encoder_blocks.4.0.conv_block.0.weight", "face_encoder_blocks.4.0.conv_block.0.bias", "face_encoder_blocks.4.0.conv_block.1.weight", "face_encoder_blocks.4.0.conv_block.1.bias", "face_encoder_blocks.4.0.conv_block.1.running_mean", "face_encoder_blocks.4.0.conv_block.1.running_var", "face_encoder_blocks.4.1.conv_block.0.weight", "face_encoder_blocks.4.1.conv_block.0.bias", "face_encoder_blocks.4.1.conv_block.1.weight", "face_encoder_blocks.4.1.conv_block.1.bias", "face_encoder_blocks.4.1.conv_block.1.running_mean", "face_encoder_blocks.4.1.conv_block.1.running_var", "face_encoder_blocks.4.2.conv_block.0.weight", "face_encoder_blocks.4.2.conv_block.0.bias", "face_encoder_blocks.4.2.conv_block.1.weight", "face_encoder_blocks.4.2.conv_block.1.bias", "face_encoder_blocks.4.2.conv_block.1.running_mean", "face_encoder_blocks.4.2.conv_block.1.running_var", "face_encoder_blocks.5.0.conv_block.0.weight", "face_encoder_blocks.5.0.conv_block.0.bias", "face_encoder_blocks.5.0.conv_block.1.weight", "face_encoder_blocks.5.0.conv_block.1.bias", "face_encoder_blocks.5.0.conv_block.1.running_mean", "face_encoder_blocks.5.0.conv_block.1.running_var", "face_encoder_blocks.5.1.conv_block.0.weight", "face_encoder_blocks.5.1.conv_block.0.bias", "face_encoder_blocks.5.1.conv_block.1.weight", "face_encoder_blocks.5.1.conv_block.1.bias", "face_encoder_blocks.5.1.conv_block.1.running_mean", "face_encoder_blocks.5.1.conv_block.1.running_var", "face_encoder_blocks.6.0.conv_block.0.weight", "face_encoder_blocks.6.0.conv_block.0.bias", "face_encoder_blocks.6.0.conv_block.1.weight", "face_encoder_blocks.6.0.conv_block.1.bias", "face_encoder_blocks.6.0.conv_block.1.running_mean", "face_encoder_blocks.6.0.conv_block.1.running_var", "face_encoder_blocks.6.1.conv_block.0.weight", "face_encoder_blocks.6.1.conv_block.0.bias", "face_encoder_blocks.6.1.conv_block.1.weight", "face_encoder_blocks.6.1.conv_block.1.bias", "face_encoder_blocks.6.1.conv_block.1.running_mean", "face_encoder_blocks.6.1.conv_block.1.running_var", "audio_encoder.0.conv_block.0.weight", "audio_encoder.0.conv_block.0.bias", "audio_encoder.0.conv_block.1.weight", "audio_encoder.0.conv_block.1.bias", "audio_encoder.0.conv_block.1.running_mean", "audio_encoder.0.conv_block.1.running_var", "audio_encoder.1.conv_block.0.weight", "audio_encoder.1.conv_block.0.bias", "audio_encoder.1.conv_block.1.weight", "audio_encoder.1.conv_block.1.bias", "audio_encoder.1.conv_block.1.running_mean", "audio_encoder.1.conv_block.1.running_var", "audio_encoder.2.conv_block.0.weight", "audio_encoder.2.conv_block.0.bias", "audio_encoder.2.conv_block.1.weight", "audio_encoder.2.conv_block.1.bias", "audio_encoder.2.conv_block.1.running_mean", "audio_encoder.2.conv_block.1.running_var", "audio_encoder.3.conv_block.0.weight", "audio_encoder.3.conv_block.0.bias", "audio_encoder.3.conv_block.1.weight", "audio_encoder.3.conv_block.1.bias", "audio_encoder.3.conv_block.1.running_mean", "audio_encoder.3.conv_block.1.running_var", "audio_encoder.4.conv_block.0.weight", "audio_encoder.4.conv_block.0.bias", "audio_encoder.4.conv_block.1.weight", "audio_encoder.4.conv_block.1.bias", "audio_encoder.4.conv_block.1.running_mean", "audio_encoder.4.conv_block.1.running_var", "audio_encoder.5.conv_block.0.weight", "audio_encoder.5.conv_block.0.bias", "audio_encoder.5.conv_block.1.weight", "audio_encoder.5.conv_block.1.bias", "audio_encoder.5.conv_block.1.running_mean", "audio_encoder.5.conv_block.1.running_var", "audio_encoder.6.conv_block.0.weight", "audio_encoder.6.conv_block.0.bias", "audio_encoder.6.conv_block.1.weight", "audio_encoder.6.conv_block.1.bias", "audio_encoder.6.conv_block.1.running_mean", "audio_encoder.6.conv_block.1.running_var", "audio_encoder.7.conv_block.0.weight", "audio_encoder.7.conv_block.0.bias", "audio_encoder.7.conv_block.1.weight", "audio_encoder.7.conv_block.1.bias", "audio_encoder.7.conv_block.1.running_mean", "audio_encoder.7.conv_block.1.running_var", "audio_encoder.8.conv_block.0.weight", "audio_encoder.8.conv_block.0.bias", "audio_encoder.8.conv_block.1.weight", "audio_encoder.8.conv_block.1.bias", "audio_encoder.8.conv_block.1.running_mean", "audio_encoder.8.conv_block.1.running_var", "audio_encoder.9.conv_block.0.weight", "audio_encoder.9.conv_block.0.bias", "audio_encoder.9.conv_block.1.weight", "audio_encoder.9.conv_block.1.bias", "audio_encoder.9.conv_block.1.running_mean", "audio_encoder.9.conv_block.1.running_var", "audio_encoder.10.conv_block.0.weight", "audio_encoder.10.conv_block.0.bias", "audio_encoder.10.conv_block.1.weight", "audio_encoder.10.conv_block.1.bias", "audio_encoder.10.conv_block.1.running_mean", "audio_encoder.10.conv_block.1.running_var", "audio_encoder.11.conv_block.0.weight", "audio_encoder.11.conv_block.0.bias", "audio_encoder.11.conv_block.1.weight", "audio_encoder.11.conv_block.1.bias", "audio_encoder.11.conv_block.1.running_mean", "audio_encoder.11.conv_block.1.running_var", "audio_encoder.12.conv_block.0.weight", "audio_encoder.12.conv_block.0.bias", "audio_encoder.12.conv_block.1.weight", "audio_encoder.12.conv_block.1.bias", "audio_encoder.12.conv_block.1.running_mean", "audio_encoder.12.conv_block.1.running_var", "face_decoder_blocks.0.0.conv_block.0.weight", "face_decoder_blocks.0.0.conv_block.0.bias", "face_decoder_blocks.0.0.conv_block.1.weight", "face_decoder_blocks.0.0.conv_block.1.bias", "face_decoder_blocks.0.0.conv_block.1.running_mean", "face_decoder_blocks.0.0.conv_block.1.running_var", "face_decoder_blocks.1.0.conv_block.0.weight", "face_decoder_blocks.1.0.conv_block.0.bias", "face_decoder_blocks.1.0.conv_block.1.weight", "face_decoder_blocks.1.0.conv_block.1.bias", "face_decoder_blocks.1.0.conv_block.1.running_mean", "face_decoder_blocks.1.0.conv_block.1.running_var", "face_decoder_blocks.1.1.conv_block.0.weight", "face_decoder_blocks.1.1.conv_block.0.bias", "face_decoder_blocks.1.1.conv_block.1.weight", "face_decoder_blocks.1.1.conv_block.1.bias", "face_decoder_blocks.1.1.conv_block.1.running_mean", "face_decoder_blocks.1.1.conv_block.1.running_var", "face_decoder_blocks.2.0.conv_block.0.weight", "face_decoder_blocks.2.0.conv_block.0.bias", "face_decoder_blocks.2.0.conv_block.1.weight", "face_decoder_blocks.2.0.conv_block.1.bias", "face_decoder_blocks.2.0.conv_block.1.running_mean", "face_decoder_blocks.2.0.conv_block.1.running_var", "face_decoder_blocks.2.1.conv_block.0.weight", "face_decoder_blocks.2.1.conv_block.0.bias", "face_decoder_blocks.2.1.conv_block.1.weight", "face_decoder_blocks.2.1.conv_block.1.bias", "face_decoder_blocks.2.1.conv_block.1.running_mean", "face_decoder_blocks.2.1.conv_block.1.running_var", "face_decoder_blocks.2.2.conv_block.0.weight", "face_decoder_blocks.2.2.conv_block.0.bias", "face_decoder_blocks.2.2.conv_block.1.weight", "face_decoder_blocks.2.2.conv_block.1.bias", "face_decoder_blocks.2.2.conv_block.1.running_mean", "face_decoder_blocks.2.2.conv_block.1.running_var", "face_decoder_blocks.3.0.conv_block.0.weight", "face_decoder_blocks.3.0.conv_block.0.bias", "face_decoder_blocks.3.0.conv_block.1.weight", "face_decoder_blocks.3.0.conv_block.1.bias", "face_decoder_blocks.3.0.conv_block.1.running_mean", "face_decoder_blocks.3.0.conv_block.1.running_var", "face_decoder_blocks.3.1.conv_block.0.weight", "face_decoder_blocks.3.1.conv_block.0.bias", "face_decoder_blocks.3.1.conv_block.1.weight", "face_decoder_blocks.3.1.conv_block.1.bias", "face_decoder_blocks.3.1.conv_block.1.running_mean", "face_decoder_blocks.3.1.conv_block.1.running_var", "face_decoder_blocks.3.2.conv_block.0.weight", "face_decoder_blocks.3.2.conv_block.0.bias", "face_decoder_blocks.3.2.conv_block.1.weight", "face_decoder_blocks.3.2.conv_block.1.bias", "face_decoder_blocks.3.2.conv_block.1.running_mean", "face_decoder_blocks.3.2.conv_block.1.running_var", "face_decoder_blocks.4.0.conv_block.0.weight", "face_decoder_blocks.4.0.conv_block.0.bias", "face_decoder_blocks.4.0.conv_block.1.weight", "face_decoder_blocks.4.0.conv_block.1.bias", "face_decoder_blocks.4.0.conv_block.1.running_mean", "face_decoder_blocks.4.0.conv_block.1.running_var", "face_decoder_blocks.4.1.conv_block.0.weight", "face_decoder_blocks.4.1.conv_block.0.bias", "face_decoder_blocks.4.1.conv_block.1.weight", "face_decoder_blocks.4.1.conv_block.1.bias", "face_decoder_blocks.4.1.conv_block.1.running_mean", "face_decoder_blocks.4.1.conv_block.1.running_var", "face_decoder_blocks.4.2.conv_block.0.weight", "face_decoder_blocks.4.2.conv_block.0.bias", "face_decoder_blocks.4.2.conv_block.1.weight", "face_decoder_blocks.4.2.conv_block.1.bias", "face_decoder_blocks.4.2.conv_block.1.running_mean", "face_decoder_blocks.4.2.conv_block.1.running_var", "face_decoder_blocks.5.0.conv_block.0.weight", "face_decoder_blocks.5.0.conv_block.0.bias", "face_decoder_blocks.5.0.conv_block.1.weight", "face_decoder_blocks.5.0.conv_block.1.bias", "face_decoder_blocks.5.0.conv_block.1.running_mean", "face_decoder_blocks.5.0.conv_block.1.running_var", "face_decoder_blocks.5.1.conv_block.0.weight", "face_decoder_blocks.5.1.conv_block.0.bias", "face_decoder_blocks.5.1.conv_block.1.weight", "face_decoder_blocks.5.1.conv_block.1.bias", "face_decoder_blocks.5.1.conv_block.1.running_mean", "face_decoder_blocks.5.1.conv_block.1.running_var", "face_decoder_blocks.5.2.conv_block.0.weight", "face_decoder_blocks.5.2.conv_block.0.bias", "face_decoder_blocks.5.2.conv_block.1.weight", "face_decoder_blocks.5.2.conv_block.1.bias", "face_decoder_blocks.5.2.conv_block.1.running_mean", "face_decoder_blocks.5.2.conv_block.1.running_var", "face_decoder_blocks.6.0.conv_block.0.weight", "face_decoder_blocks.6.0.conv_block.0.bias", "face_decoder_blocks.6.0.conv_block.1.weight", "face_decoder_blocks.6.0.conv_block.1.bias", "face_decoder_blocks.6.0.conv_block.1.running_mean", "face_decoder_blocks.6.0.conv_block.1.running_var", "face_decoder_blocks.6.1.conv_block.0.weight", "face_decoder_blocks.6.1.conv_block.0.bias", "face_decoder_blocks.6.1.conv_block.1.weight", "face_decoder_blocks.6.1.conv_block.1.bias", "face_decoder_blocks.6.1.conv_block.1.running_mean", "face_decoder_blocks.6.1.conv_block.1.running_var", "face_decoder_blocks.6.2.conv_block.0.weight", "face_decoder_blocks.6.2.conv_block.0.bias", "face_decoder_blocks.6.2.conv_block.1.weight", "face_decoder_blocks.6.2.conv_block.1.bias", "face_decoder_blocks.6.2.conv_block.1.running_mean", "face_decoder_blocks.6.2.conv_block.1.running_var", "output_block.0.conv_block.0.weight", "output_block.0.conv_block.0.bias", "output_block.0.conv_block.1.weight", "output_block.0.conv_block.1.bias", "output_block.0.conv_block.1.running_mean", "output_block.0.conv_block.1.running_var", "output_block.1.weight", "output_block.1.bias". 
	Unexpected key(s) in state_dict: "module.face_encoder_blocks.0.0.conv_block.0.weight", "module.face_encoder_blocks.0.0.conv_block.0.bias", "module.face_encoder_blocks.0.0.conv_block.1.weight", "module.face_encoder_blocks.0.0.conv_block.1.bias", "module.face_encoder_blocks.0.0.conv_block.1.running_mean", "module.face_encoder_blocks.0.0.conv_block.1.running_var", "module.face_encoder_blocks.0.0.conv_block.1.num_batches_tracked", "module.face_encoder_blocks.1.0.conv_block.0.weight", "module.face_encoder_blocks.1.0.conv_block.0.bias", "module.face_encoder_blocks.1.0.conv_block.1.weight", "module.face_encoder_blocks.1.0.conv_block.1.bias", "module.face_encoder_blocks.1.0.conv_block.1.running_mean", "module.face_encoder_blocks.1.0.conv_block.1.running_var", "module.face_encoder_blocks.1.0.conv_block.1.num_batches_tracked", "module.face_encoder_blocks.1.1.conv_block.0.weight", "module.face_encoder_blocks.1.1.conv_block.0.bias", "module.face_encoder_blocks.1.1.conv_block.1.weight", "module.face_encoder_blocks.1.1.conv_block.1.bias", "module.face_encoder_blocks.1.1.conv_block.1.running_mean", "module.face_encoder_blocks.1.1.conv_block.1.running_var", "module.face_encoder_blocks.1.1.conv_block.1.num_batches_tracked", "module.face_encoder_blocks.1.2.conv_block.0.weight", "module.face_encoder_blocks.1.2.conv_block.0.bias", "module.face_encoder_blocks.1.2.conv_block.1.weight", "module.face_encoder_blocks.1.2.conv_block.1.bias", "module.face_encoder_blocks.1.2.conv_block.1.running_mean", "module.face_encoder_blocks.1.2.conv_block.1.running_var", "module.face_encoder_blocks.1.2.conv_block.1.num_batches_tracked", "module.face_encoder_blocks.2.0.conv_block.0.weight", "module.face_encoder_blocks.2.0.conv_block.0.bias", "module.face_encoder_blocks.2.0.conv_block.1.weight", "module.face_encoder_blocks.2.0.conv_block.1.bias", "module.face_encoder_blocks.2.0.conv_block.1.running_mean", "module.face_encoder_blocks.2.0.conv_block.1.running_var", "module.face_encoder_blocks.2.0.conv_block.1.num_batches_tracked", "module.face_encoder_blocks.2.1.conv_block.0.weight", "module.face_encoder_blocks.2.1.conv_block.0.bias", "module.face_encoder_blocks.2.1.conv_block.1.weight", "module.face_encoder_blocks.2.1.conv_block.1.bias", "module.face_encoder_blocks.2.1.conv_block.1.running_mean", "module.face_encoder_blocks.2.1.conv_block.1.running_var", "module.face_encoder_blocks.2.1.conv_block.1.num_batches_tracked", "module.face_encoder_blocks.2.2.conv_block.0.weight", "module.face_encoder_blocks.2.2.conv_block.0.bias", "module.face_encoder_blocks.2.2.conv_block.1.weight", "module.face_encoder_blocks.2.2.conv_block.1.bias", "module.face_encoder_blocks.2.2.conv_block.1.running_mean", "module.face_encoder_blocks.2.2.conv_block.1.running_var", "module.face_encoder_blocks.2.2.conv_block.1.num_batches_tracked", "module.face_encoder_blocks.2.3.conv_block.0.weight", "module.face_encoder_blocks.2.3.conv_block.0.bias", "module.face_encoder_blocks.2.3.conv_block.1.weight", "module.face_encoder_blocks.2.3.conv_block.1.bias", "module.face_encoder_blocks.2.3.conv_block.1.running_mean", "module.face_encoder_blocks.2.3.conv_block.1.running_var", "module.face_encoder_blocks.2.3.conv_block.1.num_batches_tracked", "module.face_encoder_blocks.3.0.conv_block.0.weight", "module.face_encoder_blocks.3.0.conv_block.0.bias", "module.face_encoder_blocks.3.0.conv_block.1.weight", "module.face_encoder_blocks.3.0.conv_block.1.bias", "module.face_encoder_blocks.3.0.conv_block.1.running_mean", "module.face_encoder_blocks.3.0.conv_block.1.running_var", "module.face_encoder_blocks.3.0.conv_block.1.num_batches_tracked", "module.face_encoder_blocks.3.1.conv_block.0.weight", "module.face_encoder_blocks.3.1.conv_block.0.bias", "module.face_encoder_blocks.3.1.conv_block.1.weight", "module.face_encoder_blocks.3.1.conv_block.1.bias", "module.face_encoder_blocks.3.1.conv_block.1.running_mean", "module.face_encoder_blocks.3.1.conv_block.1.running_var", "module.face_encoder_blocks.3.1.conv_block.1.num_batches_tracked", "module.face_encoder_blocks.3.2.conv_block.0.weight", "module.face_encoder_blocks.3.2.conv_block.0.bias", "module.face_encoder_blocks.3.2.conv_block.1.weight", "module.face_encoder_blocks.3.2.conv_block.1.bias", "module.face_encoder_blocks.3.2.conv_block.1.running_mean", "module.face_encoder_blocks.3.2.conv_block.1.running_var", "module.face_encoder_blocks.3.2.conv_block.1.num_batches_tracked", "module.face_encoder_blocks.4.0.conv_block.0.weight", "module.face_encoder_blocks.4.0.conv_block.0.bias", "module.face_encoder_blocks.4.0.conv_block.1.weight", "module.face_encoder_blocks.4.0.conv_block.1.bias", "module.face_encoder_blocks.4.0.conv_block.1.running_mean", "module.face_encoder_blocks.4.0.conv_block.1.running_var", "module.face_encoder_blocks.4.0.conv_block.1.num_batches_tracked", "module.face_encoder_blocks.4.1.conv_block.0.weight", "module.face_encoder_blocks.4.1.conv_block.0.bias", "module.face_encoder_blocks.4.1.conv_block.1.weight", "module.face_encoder_blocks.4.1.conv_block.1.bias", "module.face_encoder_blocks.4.1.conv_block.1.running_mean", "module.face_encoder_blocks.4.1.conv_block.1.running_var", "module.face_encoder_blocks.4.1.conv_block.1.num_batches_tracked", "module.face_encoder_blocks.4.2.conv_block.0.weight", "module.face_encoder_blocks.4.2.conv_block.0.bias", "module.face_encoder_blocks.4.2.conv_block.1.weight", "module.face_encoder_blocks.4.2.conv_block.1.bias", "module.face_encoder_blocks.4.2.conv_block.1.running_mean", "module.face_encoder_blocks.4.2.conv_block.1.running_var", "module.face_encoder_blocks.4.2.conv_block.1.num_batches_tracked", "module.face_encoder_blocks.5.0.conv_block.0.weight", "module.face_encoder_blocks.5.0.conv_block.0.bias", "module.face_encoder_blocks.5.0.conv_block.1.weight", "module.face_encoder_blocks.5.0.conv_block.1.bias", "module.face_encoder_blocks.5.0.conv_block.1.running_mean", "module.face_encoder_blocks.5.0.conv_block.1.running_var", "module.face_encoder_blocks.5.0.conv_block.1.num_batches_tracked", "module.face_encoder_blocks.5.1.conv_block.0.weight", "module.face_encoder_blocks.5.1.conv_block.0.bias", "module.face_encoder_blocks.5.1.conv_block.1.weight", "module.face_encoder_blocks.5.1.conv_block.1.bias", "module.face_encoder_blocks.5.1.conv_block.1.running_mean", "module.face_encoder_blocks.5.1.conv_block.1.running_var", "module.face_encoder_blocks.5.1.conv_block.1.num_batches_tracked", "module.face_encoder_blocks.6.0.conv_block.0.weight", "module.face_encoder_blocks.6.0.conv_block.0.bias", "module.face_encoder_blocks.6.0.conv_block.1.weight", "module.face_encoder_blocks.6.0.conv_block.1.bias", "module.face_encoder_blocks.6.0.conv_block.1.running_mean", "module.face_encoder_blocks.6.0.conv_block.1.running_var", "module.face_encoder_blocks.6.0.conv_block.1.num_batches_tracked", "module.face_encoder_blocks.6.1.conv_block.0.weight", "module.face_encoder_blocks.6.1.conv_block.0.bias", "module.face_encoder_blocks.6.1.conv_block.1.weight", "module.face_encoder_blocks.6.1.conv_block.1.bias", "module.face_encoder_blocks.6.1.conv_block.1.running_mean", "module.face_encoder_blocks.6.1.conv_block.1.running_var", "module.face_encoder_blocks.6.1.conv_block.1.num_batches_tracked", "module.audio_encoder.0.conv_block.0.weight", "module.audio_encoder.0.conv_block.0.bias", "module.audio_encoder.0.conv_block.1.weight", "module.audio_encoder.0.conv_block.1.bias", "module.audio_encoder.0.conv_block.1.running_mean", "module.audio_encoder.0.conv_block.1.running_var", "module.audio_encoder.0.conv_block.1.num_batches_tracked", "module.audio_encoder.1.conv_block.0.weight", "module.audio_encoder.1.conv_block.0.bias", "module.audio_encoder.1.conv_block.1.weight", "module.audio_encoder.1.conv_block.1.bias", "module.audio_encoder.1.conv_block.1.running_mean", "module.audio_encoder.1.conv_block.1.running_var", "module.audio_encoder.1.conv_block.1.num_batches_tracked", "module.audio_encoder.2.conv_block.0.weight", "module.audio_encoder.2.conv_block.0.bias", "module.audio_encoder.2.conv_block.1.weight", "module.audio_encoder.2.conv_block.1.bias", "module.audio_encoder.2.conv_block.1.running_mean", "module.audio_encoder.2.conv_block.1.running_var", "module.audio_encoder.2.conv_block.1.num_batches_tracked", "module.audio_encoder.3.conv_block.0.weight", "module.audio_encoder.3.conv_block.0.bias", "module.audio_encoder.3.conv_block.1.weight", "module.audio_encoder.3.conv_block.1.bias", "module.audio_encoder.3.conv_block.1.running_mean", "module.audio_encoder.3.conv_block.1.running_var", "module.audio_encoder.3.conv_block.1.num_batches_tracked", "module.audio_encoder.4.conv_block.0.weight", "module.audio_encoder.4.conv_block.0.bias", "module.audio_encoder.4.conv_block.1.weight", "module.audio_encoder.4.conv_block.1.bias", "module.audio_encoder.4.conv_block.1.running_mean", "module.audio_encoder.4.conv_block.1.running_var", "module.audio_encoder.4.conv_block.1.num_batches_tracked", "module.audio_encoder.5.conv_block.0.weight", "module.audio_encoder.5.conv_block.0.bias", "module.audio_encoder.5.conv_block.1.weight", "module.audio_encoder.5.conv_block.1.bias", "module.audio_encoder.5.conv_block.1.running_mean", "module.audio_encoder.5.conv_block.1.running_var", "module.audio_encoder.5.conv_block.1.num_batches_tracked", "module.audio_encoder.6.conv_block.0.weight", "module.audio_encoder.6.conv_block.0.bias", "module.audio_encoder.6.conv_block.1.weight", "module.audio_encoder.6.conv_block.1.bias", "module.audio_encoder.6.conv_block.1.running_mean", "module.audio_encoder.6.conv_block.1.running_var", "module.audio_encoder.6.conv_block.1.num_batches_tracked", "module.audio_encoder.7.conv_block.0.weight", "module.audio_encoder.7.conv_block.0.bias", "module.audio_encoder.7.conv_block.1.weight", "module.audio_encoder.7.conv_block.1.bias", "module.audio_encoder.7.conv_block.1.running_mean", "module.audio_encoder.7.conv_block.1.running_var", "module.audio_encoder.7.conv_block.1.num_batches_tracked", "module.audio_encoder.8.conv_block.0.weight", "module.audio_encoder.8.conv_block.0.bias", "module.audio_encoder.8.conv_block.1.weight", "module.audio_encoder.8.conv_block.1.bias", "module.audio_encoder.8.conv_block.1.running_mean", "module.audio_encoder.8.conv_block.1.running_var", "module.audio_encoder.8.conv_block.1.num_batches_tracked", "module.audio_encoder.9.conv_block.0.weight", "module.audio_encoder.9.conv_block.0.bias", "module.audio_encoder.9.conv_block.1.weight", "module.audio_encoder.9.conv_block.1.bias", "module.audio_encoder.9.conv_block.1.running_mean", "module.audio_encoder.9.conv_block.1.running_var", "module.audio_encoder.9.conv_block.1.num_batches_tracked", "module.audio_encoder.10.conv_block.0.weight", "module.audio_encoder.10.conv_block.0.bias", "module.audio_encoder.10.conv_block.1.weight", "module.audio_encoder.10.conv_block.1.bias", "module.audio_encoder.10.conv_block.1.running_mean", "module.audio_encoder.10.conv_block.1.running_var", "module.audio_encoder.10.conv_block.1.num_batches_tracked", "module.audio_encoder.11.conv_block.0.weight", "module.audio_encoder.11.conv_block.0.bias", "module.audio_encoder.11.conv_block.1.weight", "module.audio_encoder.11.conv_block.1.bias", "module.audio_encoder.11.conv_block.1.running_mean", "module.audio_encoder.11.conv_block.1.running_var", "module.audio_encoder.11.conv_block.1.num_batches_tracked", "module.audio_encoder.12.conv_block.0.weight", "module.audio_encoder.12.conv_block.0.bias", "module.audio_encoder.12.conv_block.1.weight", "module.audio_encoder.12.conv_block.1.bias", "module.audio_encoder.12.conv_block.1.running_mean", "module.audio_encoder.12.conv_block.1.running_var", "module.audio_encoder.12.conv_block.1.num_batches_tracked", "module.face_decoder_blocks.0.0.conv_block.0.weight", "module.face_decoder_blocks.0.0.conv_block.0.bias", "module.face_decoder_blocks.0.0.conv_block.1.weight", "module.face_decoder_blocks.0.0.conv_block.1.bias", "module.face_decoder_blocks.0.0.conv_block.1.running_mean", "module.face_decoder_blocks.0.0.conv_block.1.running_var", "module.face_decoder_blocks.0.0.conv_block.1.num_batches_tracked", "module.face_decoder_blocks.1.0.conv_block.0.weight", "module.face_decoder_blocks.1.0.conv_block.0.bias", "module.face_decoder_blocks.1.0.conv_block.1.weight", "module.face_decoder_blocks.1.0.conv_block.1.bias", "module.face_decoder_blocks.1.0.conv_block.1.running_mean", "module.face_decoder_blocks.1.0.conv_block.1.running_var", "module.face_decoder_blocks.1.0.conv_block.1.num_batches_tracked", "module.face_decoder_blocks.1.1.conv_block.0.weight", "module.face_decoder_blocks.1.1.conv_block.0.bias", "module.face_decoder_blocks.1.1.conv_block.1.weight", "module.face_decoder_blocks.1.1.conv_block.1.bias", "module.face_decoder_blocks.1.1.conv_block.1.running_mean", "module.face_decoder_blocks.1.1.conv_block.1.running_var", "module.face_decoder_blocks.1.1.conv_block.1.num_batches_tracked", "module.face_decoder_blocks.2.0.conv_block.0.weight", "module.face_decoder_blocks.2.0.conv_block.0.bias", "module.face_decoder_blocks.2.0.conv_block.1.weight", "module.face_decoder_blocks.2.0.conv_block.1.bias", "module.face_decoder_blocks.2.0.conv_block.1.running_mean", "module.face_decoder_blocks.2.0.conv_block.1.running_var", "module.face_decoder_blocks.2.0.conv_block.1.num_batches_tracked", "module.face_decoder_blocks.2.1.conv_block.0.weight", "module.face_decoder_blocks.2.1.conv_block.0.bias", "module.face_decoder_blocks.2.1.conv_block.1.weight", "module.face_decoder_blocks.2.1.conv_block.1.bias", "module.face_decoder_blocks.2.1.conv_block.1.running_mean", "module.face_decoder_blocks.2.1.conv_block.1.running_var", "module.face_decoder_blocks.2.1.conv_block.1.num_batches_tracked", "module.face_decoder_blocks.2.2.conv_block.0.weight", "module.face_decoder_blocks.2.2.conv_block.0.bias", "module.face_decoder_blocks.2.2.conv_block.1.weight", "module.face_decoder_blocks.2.2.conv_block.1.bias", "module.face_decoder_blocks.2.2.conv_block.1.running_mean", "module.face_decoder_blocks.2.2.conv_block.1.running_var", "module.face_decoder_blocks.2.2.conv_block.1.num_batches_tracked", "module.face_decoder_blocks.3.0.conv_block.0.weight", "module.face_decoder_blocks.3.0.conv_block.0.bias", "module.face_decoder_blocks.3.0.conv_block.1.weight", "module.face_decoder_blocks.3.0.conv_block.1.bias", "module.face_decoder_blocks.3.0.conv_block.1.running_mean", "module.face_decoder_blocks.3.0.conv_block.1.running_var", "module.face_decoder_blocks.3.0.conv_block.1.num_batches_tracked", "module.face_decoder_blocks.3.1.conv_block.0.weight", "module.face_decoder_blocks.3.1.conv_block.0.bias", "module.face_decoder_blocks.3.1.conv_block.1.weight", "module.face_decoder_blocks.3.1.conv_block.1.bias", "module.face_decoder_blocks.3.1.conv_block.1.running_mean", "module.face_decoder_blocks.3.1.conv_block.1.running_var", "module.face_decoder_blocks.3.1.conv_block.1.num_batches_tracked", "module.face_decoder_blocks.3.2.conv_block.0.weight", "module.face_decoder_blocks.3.2.conv_block.0.bias", "module.face_decoder_blocks.3.2.conv_block.1.weight", "module.face_decoder_blocks.3.2.conv_block.1.bias", "module.face_decoder_blocks.3.2.conv_block.1.running_mean", "module.face_decoder_blocks.3.2.conv_block.1.running_var", "module.face_decoder_blocks.3.2.conv_block.1.num_batches_tracked", "module.face_decoder_blocks.4.0.conv_block.0.weight", "module.face_decoder_blocks.4.0.conv_block.0.bias", "module.face_decoder_blocks.4.0.conv_block.1.weight", "module.face_decoder_blocks.4.0.conv_block.1.bias", "module.face_decoder_blocks.4.0.conv_block.1.running_mean", "module.face_decoder_blocks.4.0.conv_block.1.running_var", "module.face_decoder_blocks.4.0.conv_block.1.num_batches_tracked", "module.face_decoder_blocks.4.1.conv_block.0.weight", "module.face_decoder_blocks.4.1.conv_block.0.bias", "module.face_decoder_blocks.4.1.conv_block.1.weight", "module.face_decoder_blocks.4.1.conv_block.1.bias", "module.face_decoder_blocks.4.1.conv_block.1.running_mean", "module.face_decoder_blocks.4.1.conv_block.1.running_var", "module.face_decoder_blocks.4.1.conv_block.1.num_batches_tracked", "module.face_decoder_blocks.4.2.conv_block.0.weight", "module.face_decoder_blocks.4.2.conv_block.0.bias", "module.face_decoder_blocks.4.2.conv_block.1.weight", "module.face_decoder_blocks.4.2.conv_block.1.bias", "module.face_decoder_blocks.4.2.conv_block.1.running_mean", "module.face_decoder_blocks.4.2.conv_block.1.running_var", "module.face_decoder_blocks.4.2.conv_block.1.num_batches_tracked", "module.face_decoder_blocks.5.0.conv_block.0.weight", "module.face_decoder_blocks.5.0.conv_block.0.bias", "module.face_decoder_blocks.5.0.conv_block.1.weight", "module.face_decoder_blocks.5.0.conv_block.1.bias", "module.face_decoder_blocks.5.0.conv_block.1.running_mean", "module.face_decoder_blocks.5.0.conv_block.1.running_var", "module.face_decoder_blocks.5.0.conv_block.1.num_batches_tracked", "module.face_decoder_blocks.5.1.conv_block.0.weight", "module.face_decoder_blocks.5.1.conv_block.0.bias", "module.face_decoder_blocks.5.1.conv_block.1.weight", "module.face_decoder_blocks.5.1.conv_block.1.bias", "module.face_decoder_blocks.5.1.conv_block.1.running_mean", "module.face_decoder_blocks.5.1.conv_block.1.running_var", "module.face_decoder_blocks.5.1.conv_block.1.num_batches_tracked", "module.face_decoder_blocks.5.2.conv_block.0.weight", "module.face_decoder_blocks.5.2.conv_block.0.bias", "module.face_decoder_blocks.5.2.conv_block.1.weight", "module.face_decoder_blocks.5.2.conv_block.1.bias", "module.face_decoder_blocks.5.2.conv_block.1.running_mean", "module.face_decoder_blocks.5.2.conv_block.1.running_var", "module.face_decoder_blocks.5.2.conv_block.1.num_batches_tracked", "module.face_decoder_blocks.6.0.conv_block.0.weight", "module.face_decoder_blocks.6.0.conv_block.0.bias", "module.face_decoder_blocks.6.0.conv_block.1.weight", "module.face_decoder_blocks.6.0.conv_block.1.bias", "module.face_decoder_blocks.6.0.conv_block.1.running_mean", "module.face_decoder_blocks.6.0.conv_block.1.running_var", "module.face_decoder_blocks.6.0.conv_block.1.num_batches_tracked", "module.face_decoder_blocks.6.1.conv_block.0.weight", "module.face_decoder_blocks.6.1.conv_block.0.bias", "module.face_decoder_blocks.6.1.conv_block.1.weight", "module.face_decoder_blocks.6.1.conv_block.1.bias", "module.face_decoder_blocks.6.1.conv_block.1.running_mean", "module.face_decoder_blocks.6.1.conv_block.1.running_var", "module.face_decoder_blocks.6.1.conv_block.1.num_batches_tracked", "module.face_decoder_blocks.6.2.conv_block.0.weight", "module.face_decoder_blocks.6.2.conv_block.0.bias", "module.face_decoder_blocks.6.2.conv_block.1.weight", "module.face_decoder_blocks.6.2.conv_block.1.bias", "module.face_decoder_blocks.6.2.conv_block.1.running_mean", "module.face_decoder_blocks.6.2.conv_block.1.running_var", "module.face_decoder_blocks.6.2.conv_block.1.num_batches_tracked", "module.output_block.0.conv_block.0.weight", "module.output_block.0.conv_block.0.bias", "module.output_block.0.conv_block.1.weight", "module.output_block.0.conv_block.1.bias", "module.output_block.0.conv_block.1.running_mean", "module.output_block.0.conv_block.1.running_var", "module.output_block.0.conv_block.1.num_batches_tracked", "module.output_block.1.weight", "module.output_block.1.bias". 

In [7]:
# Step 3: Load and Run the Model

# Function to generate lip-synced frames
def generate_lip_synced_frames(model, frames, audio_features):
    synced_frames = []
    for i in range(len(frames)):
        frame = frames[i]
        audio_feature = audio_features[:, i:i+80]  # Assuming 80 frames per second
        audio_feature = torch.FloatTensor(audio_feature).unsqueeze(0).to('cuda')
        #frame = torch.FloatTensor(frame).unsqueeze(0).to('cuda')
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY) # Convert to grayscale
        frame = torch.FloatTensor(frame).unsqueeze(0).to('cuda')
        # frame = torch.FloatTensor(frame).unsqueeze(0).permute(0, 3, 1, 2).to('cuda')

        synced_frame = model(frame, audio_feature)
        synced_frames.append(synced_frame.cpu().numpy())
    return synced_frames
synced_frames = generate_lip_synced_frames(model, frames, audio_features)


ValueError: expected 4D input (got 3D input)

In [None]:
# Step 4: Combine Frames and Audio to Generate the Final Video

# Function to save video from frames
def save_video(frames, output_path, fps=25):
    height, width, layers = frames[0].shape
    video = cv2.VideoWriter(output_path, cv2.VideoWriter_fourcc(*'MP4V'), fps, (width, height))
    for frame in frames:
        video.write(frame)
    video.release()

output_video_path = 'output_video.mp4'
save_video(synced_frames, output_video_path)

# Add audio to the video
output_video_with_audio = 'output_video_with_audio.mp4'
(
    ffmpeg
    .input(output_video_path)
    .input(audio_path)
    .output(output_video_with_audio)
    .run(overwrite_output=True)
)

# Step 5: Evaluate and Fine-Tune

print("Lip-syncing completed and saved to", output_video_with_audio)


In [None]:
# Step 3: Load and Run the Model

# model = Wav2Lip().to('cuda' if torch.cuda.is_available() else 'cpu')
# checkpoint = torch.load('/content/drive/MyDrive/Colab Notebooks/Diverge/wav2lip.pth', map_location='cpu')
# model.load_state_dict(checkpoint['state_dict'])

# model.load_state_dict({k.replace("module.", ""): v for k, v in checkpoint.items()})


# model_save_location = "/content/drive/MyDrive/Colab Notebooks/Diverge/wav2lip.pth"

# state_dict = torch.load(model_save_location, map_location='cpu')
# model.load_state_dict({k.replace("module.", ""): v for k, v in state_dict.items()})


# # Function to generate lip-synced frames
# def generate_lip_synced_frames(model, frames, audio_features):
#     synced_frames = []
#     for i in range(len(frames)):
#         frame = frames[i]
#         audio_feature = audio_features[:, i:i+80]  # Assuming 80 frames per second
#         audio_feature = torch.FloatTensor(audio_feature).unsqueeze(0).to('cuda')
#         frame = torch.FloatTensor(frame).unsqueeze(0).to('cuda')
#         synced_frame = model(frame, audio_feature)
#         synced_frames.append(synced_frame.cpu().numpy())
#     return synced_frames
# synced_frames = generate_lip_synced_frames(model, frames, audio_features)
##########################################################################################
# Step 3: Load and Run the Model

# ... (previous code)

# # Function to generate lip-synced frames
# def generate_lip_synced_frames(model, frames, audio_features):
#     synced_frames = []
#     for i in range(len(frames)):
#         frame = frames[i]
#         audio_feature = audio_features[:, i:i+80]  # Assuming 80 frames per second
#         audio_feature = torch.FloatTensor(audio_feature).unsqueeze(0).to('cuda')

#         # Preprocess the frame to have the expected number of channels (1)
#         frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY) # Convert to grayscale
#         frame = torch.FloatTensor(frame).unsqueeze(0).unsqueeze(0).to('cuda') # Add channel dimension

#         synced_frame = model(frame, audio_feature)
#         synced_frames.append(synced_frame.cpu().numpy())
#     return synced_frames

# synced_frames = generate_lip_synced_frames(model, frames, audio_features)

# Step 3: Load and Run the Model

# ... (previous code)

# # Function to generate lip-synced frames
# def generate_lip_synced_frames(model, frames, audio_features):
#     synced_frames = []
#     for i in range(len(frames)):
#         frame = frames[i]
#         audio_feature = audio_features[:, i:i+80]  # Assuming 80 frames per second
#         audio_feature = torch.FloatTensor(audio_feature).unsqueeze(0).to('cuda')

#         # Preprocess the frame to have the expected number of channels (3 for color images)
#         # If the model expects a different number of channels, adjust this accordingly
#         frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY) # Convert to grayscale
#         frame = torch.FloatTensor(frame).permute(0,3 ).to('cuda') # Add channel dimension and permute to (batch, channels, height, width)

#         synced_frame = model(frame, audio_feature)
#         synced_frames.append(synced_frame.cpu().numpy())
#     return synced_frames

# synced_frames = generate_lip_synced_frames(model, frames, audio_features)

