In [None]:
# Install fairseq2 (pinned to 0.1), pydub and yt-dlp, then clone seamless_communication,
# switch to a fixed commit and install the package from that checkout.
!pip install fairseq2==0.1 pydub yt-dlp
!git clone https://github.com/facebookresearch/seamless_communication.git
%cd seamless_communication
!git checkout 01c1042841f9bce66902eb2c7512dbdd71d42112 # We will use a stable version; if you want to use the latest version, comment out this line.
!pip install .

Utility Functions and Libraries

In [None]:
from seamless_communication.models.inference import Translator
from IPython.display import Audio
from pydub import AudioSegment
from pydub.silence import split_on_silence
from pydub import AudioSegment
import torchaudio
import torch
import os

def save_and_play_audio(path_save, audio, sample_rate):
    """Write the first entry of ``audio`` to ``path_save`` and display a player for it.

    Args:
        path_save: Destination path handed to ``torchaudio.save`` and to ``Audio``.
        audio: Indexable container; ``audio[0]`` is moved to the CPU with ``.cpu()``
            before it is saved.
        sample_rate: Sample rate passed to ``torchaudio.save`` and to ``Audio``.
    """
    waveform = audio[0].cpu()
    torchaudio.save(path_save, waveform, sample_rate=sample_rate)

    # Show an inline player for the file that was just written (autoplay and normalize enabled).
    player = Audio(path_save, rate=sample_rate, autoplay=True, normalize=True)
    display(player)

def split_audio_with_max_duration(input_file, output_directory, min_silence_len=2500, silence_thresh=-60, max_chunk_duration=15000):
    """Split a WAV file on silence and export chunks no longer than ``max_chunk_duration``.

    The file is first cut with pydub's ``split_on_silence``. Any resulting chunk
    whose ``len()`` exceeds ``max_chunk_duration`` is then divided into
    near-equal pieces that together cover the whole chunk. The pieces are written
    as ``chunk0.wav``, ``chunk1.wav``, ... inside ``output_directory``, which is
    created when it does not exist yet.

    Args:
        input_file: Path of the WAV file to split.
        output_directory: Directory that receives the exported chunks.
        min_silence_len: Forwarded unchanged to ``split_on_silence``.
        silence_thresh: Forwarded unchanged to ``split_on_silence``.
        max_chunk_duration: Largest allowed ``len(chunk)`` for an exported chunk
            (pydub measures segment length in milliseconds). Must be positive.

    Returns:
        list: Paths of the exported files, in export order.

    Raises:
        ValueError: If ``max_chunk_duration`` is not positive.
    """
    if max_chunk_duration <= 0:
        raise ValueError("max_chunk_duration must be positive")

    sound = AudioSegment.from_wav(input_file)

    # Splitting on silence
    audio_chunks = split_on_silence(sound, min_silence_len=min_silence_len, silence_thresh=silence_thresh)

    # Enforce the maximum duration on every chunk.
    final_audio_chunks = []
    for chunk in audio_chunks:
        chunk_len = len(chunk)
        if chunk_len <= max_chunk_duration:
            final_audio_chunks.append(chunk)
            continue

        num_subchunks = chunk_len // max_chunk_duration + 1
        for i in range(num_subchunks):
            # Proportional boundaries: the last piece always ends exactly at
            # chunk_len, so no trailing audio is lost to integer division, and
            # every piece stays within max_chunk_duration.
            start_idx = i * chunk_len // num_subchunks
            end_idx = (i + 1) * chunk_len // num_subchunks
            final_audio_chunks.append(chunk[start_idx:end_idx])

    # Make sure the export target exists (an empty string means the current directory).
    if output_directory:
        os.makedirs(output_directory, exist_ok=True)

    # Export wav
    exported_files = []
    for i, chunk in enumerate(final_audio_chunks):
        output_file = os.path.join(output_directory, f"chunk{i}.wav")
        print("Exporting file", output_file)
        chunk.export(output_file, format="wav")
        exported_files.append(output_file)

    return exported_files

Load the model

In [None]:
# Build the Translator from the multitask model card and the vocoder card, passing the first CUDA device.
translator = Translator("seamlessM4T_large", "vocoder_36langs", torch.device("cuda:0"))

Downloading the checkpoint of the model 'seamlessM4T_large'...
100%|██████████| 10.7G/10.7G [00:57<00:00, 200MB/s]
Downloading the tokenizer of the model 'seamlessM4T_large'...
100%|██████████| 4.93M/4.93M [00:00<00:00, 104MB/s]
Downloading the checkpoint of the model 'vocoder_36langs'...
100%|██████████| 160M/160M [00:00<00:00, 244MB/s]


In [None]:
# Input sentence; it is handed to translator.predict with src_lang='spa'.
text = 'En el bosque encantado'

In [None]:
# Run the "t2st" task on the Spanish input, targeting English; predict returns
# the translated text, the synthesized audio and its sample rate.
translated_text, wav, sr = translator.predict(text, "t2st", tgt_lang='eng', src_lang='spa')

# Store the generated audio as a WAV file and play it inline.
save_and_play_audio('/content/seamless_communication/text2speech.wav', wav, sr)

In [None]:
# Clone the Wav2Lip-GFPGAN repository and make its folder the working directory;
# basePath is the root that the later path variables are built from.
!git clone https://github.com/ajay-sainy/Wav2Lip-GFPGAN.git
basePath = "/content/Wav2Lip-GFPGAN"
%cd {basePath}

In [None]:
# Folder names and absolute paths of the Wav2Lip and GFPGAN sub-projects inside basePath.
wav2lipFolderName = 'Wav2Lip-master'
gfpganFolderName = 'GFPGAN-master'
wav2lipPath = basePath + '/' + wav2lipFolderName
gfpganPath = basePath + '/' + gfpganFolderName

# Download the s3fd weights and store them as face_detection/detection/sfd/s3fd.pth inside Wav2Lip.
!wget 'https://www.adrianbulat.com/downloads/python-fan/s3fd-619a316812.pth' -O {wav2lipPath}'/face_detection/detection/sfd/s3fd.pth'
# Fetch a Google Drive file into Wav2Lip's checkpoints folder
# (presumably the wav2lip.pth checkpoint used by inference.py -- TODO confirm).
!gdown https://drive.google.com/uc?id=1fQtBSYEyuai9MjBOF8j7zZ4oQ9W2N64q --output {wav2lipPath}'/checkpoints/'

In [None]:
# Install the packages listed in requirements.txt of the current working directory
# (expected to be basePath after the earlier %cd -- verify if cells were run out of order).
!pip install -r requirements.txt

In [None]:
import os
# Inputs and outputs of the lip-sync step.
outputPath = basePath+'/outputs'
inputAudioPath = '/content/seamless_communication/text2speech.wav'
inputVideoPath = basePath + '/inputs/kimk_7s_raw.mp4'
lipSyncedOutputPath = outputPath + '/result.mp4'

# exist_ok makes the cell safe to re-run without a separate existence check.
os.makedirs(outputPath, exist_ok=True)

# Run Wav2Lip from inside its own folder so the relative checkpoint path resolves;
# cd uses the absolute wav2lipPath so the command does not depend on the current directory.
!cd {wav2lipPath} && python inference.py \
--checkpoint_path checkpoints/wav2lip.pth \
--face {inputVideoPath} \
--audio {inputAudioPath} \
--outfile {lipSyncedOutputPath}

In [None]:
# Install GFPGAN in development mode from its folder, then download the GFPGANv1.3
# weights into its experiments/pretrained_models directory (paths are relative to the current directory).
!cd $gfpganFolderName && python setup.py develop
!wget https://github.com/TencentARC/GFPGAN/releases/download/v1.3.0/GFPGANv1.3.pth -P {gfpganFolderName}'/experiments/pretrained_models'

In [None]:
import cv2
from tqdm import tqdm
from os import path

import os

# Dump the frames of the lip-synced video to disk as zero-padded, numbered JPEG files.
inputVideoPath = outputPath+'/result.mp4'
unProcessedFramesFolderPath = outputPath+'/frames'

os.makedirs(unProcessedFramesFolderPath, exist_ok=True)

vidcap = cv2.VideoCapture(inputVideoPath)
if not vidcap.isOpened():
    # Fail here instead of silently producing zero frames and breaking later cells.
    raise IOError("Could not open video: " + inputVideoPath)

try:
    numberOfFrames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
    fps = vidcap.get(cv2.CAP_PROP_FPS)
    print("FPS: ", fps, "Frames: ", numberOfFrames)

    for frameNumber in tqdm(range(numberOfFrames)):
        success, image = vidcap.read()
        if not success:
            # The reported frame count can exceed what is actually decodable;
            # stop instead of passing an empty image to imwrite.
            print("Stopped early: could not read frame", frameNumber)
            break
        cv2.imwrite(path.join(unProcessedFramesFolderPath, str(frameNumber).zfill(4)+'.jpg'), image)
finally:
    # Always free the decoder handle, even when writing a frame fails.
    vidcap.release()


In [None]:
# Run GFPGAN's inference script on the extracted frames and write its results under outputPath.
# Flags as passed: model version 1.3, -s 2 (upscale factor per GFPGAN's CLI), center face only, no background upsampler.
!cd $gfpganFolderName && \
  python inference_gfpgan.py -i $unProcessedFramesFolderPath -o $outputPath -v 1.3 -s 2 --only_center_face --bg_upsampler None

In [None]:
import os
# Re-assemble the restored frames into AVI files of at most batchSize frames each.
restoredFramesPath = outputPath + '/restored_imgs/'
processedVideoOutputPath = outputPath

# Sorted so the zero-padded frame names are processed in numeric order.
dir_list = os.listdir(restoredFramesPath)
dir_list.sort()

import cv2
import numpy as np
from tqdm import tqdm

# Encode at the frame rate of the lip-synced clip the frames were extracted from;
# a fixed rate would change the video duration and desynchronise the audio added later.
fpsProbe = cv2.VideoCapture(outputPath + '/result.mp4')
outputFps = fpsProbe.get(cv2.CAP_PROP_FPS)
fpsProbe.release()
if not outputFps or outputFps <= 0:
    outputFps = 30  # previous hard-coded value, used when the rate cannot be read

batch = 0  # number of batch files written so far; also the index of the next file
batchSize = 300
for start in tqdm(range(0, len(dir_list), batchSize)):
    end = start + batchSize
    print("processing ", start, end)

    out = None
    for filename in tqdm(dir_list[start:end]):
        img = cv2.imread(restoredFramesPath + filename)
        if img is None:
            # Not a readable image (e.g. a stray non-image file): skip it.
            continue
        if out is None:
            # Open the writer lazily so the frame size comes from a real frame
            # and batches without any readable image produce no file.
            height, width = img.shape[:2]
            out = cv2.VideoWriter(
                processedVideoOutputPath+'/batch_'+str(batch).zfill(4)+'.avi',
                cv2.VideoWriter_fourcc(*'DIVX'),
                outputFps,
                (width, height),
            )
        # Frames are streamed straight to the writer instead of being buffered in a list.
        out.write(img)

    if out is not None:
        out.release()
        batch = batch + 1


In [None]:
# Write the list file for ffmpeg's concat demuxer: one "file batch_NNNN.avi" line per batch.
concatTextFilePath = outputPath + "/concat.txt"
# The context manager closes the file even if writing fails.
with open(concatTextFilePath, "w") as concatTextFile:
  for ips in range(batch):
    concatTextFile.write("file batch_" + str(ips).zfill(4) + ".avi\n")

# Join the batch files without re-encoding (-c copy).
concatedVideoOutputPath = outputPath + "/concated_output.avi"
!ffmpeg -y -f concat -i {concatTextFilePath} -c copy {concatedVideoOutputPath}

# Combine the joined video (input 0) with the audio of the generated speech (input 1);
# the video stream is copied as-is and -shortest ends the output with the shorter input.
finalProcessedOuputVideo = processedVideoOutputPath+'/final_with_audio.avi'
!ffmpeg -y -i {concatedVideoOutputPath} -i {inputAudioPath} -map 0 -map 1:a -c:v copy -shortest {finalProcessedOuputVideo}

# Offer the final file as a browser download (Colab only).
from google.colab import files
files.download(finalProcessedOuputVideo)