In [None]:
# https://github.com/OpenBMB/MiniCPM-V

In [None]:
# https://huggingface.co/openbmb/MiniCPM-V-2_6-int4

In [None]:
#@title Install and restart session
!pip install Pillow==10.1.0
# !pip install torch==2.1.2 --index-url https://download.pytorch.org/whl/cu121
# !pip install torchvision==0.16.2 --index-url https://download.pytorch.org/whl/cu121
!pip install transformers==4.40.0
!pip install sentencepiece==0.1.99
!pip install accelerate==0.30.1
!pip install bitsandbytes==0.43.1
!pip install flash_attn
!pip install gradio
!pip install pydub==0.25.1
!pip install edge-tts
!pip install deep_translator==1.11.4
from IPython.display import clear_output
clear_output()
import time
time.sleep(6)
import os
os.kill(os.getpid(), 9)

In [1]:
#@title Check cuda is available or not
import torch

# Ask PyTorch once whether CUDA can be used and keep the flag in a variable.
is_cuda_available = torch.cuda.is_available()
print("CUDA Available:", is_cuda_available)


CUDA Available: True


In [2]:
#@title utils
import torch
import gc
from PIL import Image
from transformers import AutoModel, AutoTokenizer
import subprocess
import re
from decord import VideoReader, cpu    # pip install decord


from google.colab import files

import os
from pydub import AudioSegment
def process_video(input_video_path, input_audio_path, video_duration):
    """Replace a video's audio track with a narration clip fitted to its length.

    The narration is padded with trailing silence or trimmed so that it lasts
    ``video_duration`` seconds, then muxed onto the original video with ffmpeg
    (video stream copied, audio encoded as AAC). A WAV copy of the unadjusted
    narration is stored next to the output video.

    Args:
        input_video_path: Path of the source video.
        input_audio_path: Path of the narration audio file.
        video_duration: Target length in seconds (number).

    Returns:
        Path of the generated ``voice_over_<name>.mp4`` in ``/content/video_save``.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    # Ensure output directory exists
    output_dir = "/content/video_save"
    os.makedirs(output_dir, exist_ok=True)

    # Extract file name and generate output path
    file_name, _ = os.path.splitext(os.path.basename(input_video_path))
    output_video_path = os.path.join(output_dir, f"voice_over_{file_name}.mp4")

    # Load audio
    audio = AudioSegment.from_file(input_audio_path)

    # Keep a copy of the narration as a real WAV file. The previous raw file
    # copy stored the source bytes under a .wav name whatever their format, and
    # relied on `shutil` having been imported by a later cell.
    audio.export(os.path.join(output_dir, f"voice_over_{file_name}.wav"), format='wav')

    # Adjust the audio duration to match the given video duration
    audio_duration = len(audio) / 1000.0  # pydub audio duration in seconds
    if audio_duration < video_duration:
        # Add silence if audio is shorter
        silence_duration = (video_duration - audio_duration) * 1000  # in milliseconds
        audio = audio + AudioSegment.silent(duration=silence_duration)
    elif audio_duration > video_duration:
        # Trim audio if it is longer
        audio = audio[:int(video_duration * 1000)]

    # Save adjusted audio to a temporary file
    temp_audio_path = '/content/temp_audio.wav'
    audio.export(temp_audio_path, format='wav')

    try:
        # Pass arguments as a list (no shell) so paths containing spaces or
        # shell metacharacters are handled correctly; -y overwrites old output.
        subprocess.run(
            [
                'ffmpeg', '-y',
                '-i', input_video_path,
                '-i', temp_audio_path,
                '-c:v', 'copy',
                '-c:a', 'aac',
                '-map', '0:v:0',
                '-map', '1:a:0',
                '-shortest',
                output_video_path,
            ],
            check=True,
        )
    finally:
        # Clean up the temporary file even when ffmpeg fails
        if os.path.exists(temp_audio_path):
            os.remove(temp_audio_path)

    return output_video_path

from IPython.display import HTML
from base64 import b64encode

def show_video(video_path, video_width = 600):
  """Return an HTML <video> element that embeds the file as a base64 data URL.

  Args:
      video_path: Path of the video file to embed.
      video_width: Width attribute of the rendered player.

  Returns:
      An IPython ``HTML`` object containing the player markup.
  """
  # Read-only binary mode is enough (the old "r+b" also required write
  # permission), and the context manager closes the handle after reading.
  with open(video_path, "rb") as video_handle:
    video_file = video_handle.read()

  video_url = f"data:video/mp4;base64,{b64encode(video_file).decode()}"
  return HTML(f"""<video width={video_width} controls><source src="{video_url}"></video>""")





# Function to encode video into frames
# def encode_video(video_path, MAX_NUM_FRAMES=64):
#     def uniform_sample(l, n):
#         gap = len(l) / n
#         idxs = [int(i * gap + gap / 2) for i in range(n)]
#         return [l[i] for i in idxs]

#     vr = VideoReader(video_path, ctx=cpu(0))
#     sample_fps = round(vr.get_avg_fps() / 1)  # FPS
#     frame_idx = [i for i in range(0, len(vr), sample_fps)]
#     if len(frame_idx) > MAX_NUM_FRAMES:
#         frame_idx = uniform_sample(frame_idx, MAX_NUM_FRAMES)
#     frames = vr.get_batch(frame_idx).asnumpy()
#     frames = [Image.fromarray(v.astype('uint8')) for v in frames]
#     print('num frames:', len(frames))
#     return frames

def encode_video(video_path, MAX_NUM_FRAMES=64, target_size=(640, 360)):
    """Decode a video into a list of resized PIL frames.

    Frames are taken at a stride of the video's rounded average FPS (about one
    frame per second). If that yields more than ``MAX_NUM_FRAMES`` indices they
    are thinned out uniformly to exactly ``MAX_NUM_FRAMES``.

    Args:
        video_path: Path of the video to read.
        MAX_NUM_FRAMES: Upper bound on the number of returned frames.
        target_size: ``(width, height)`` every frame is resized to.

    Returns:
        List of ``PIL.Image`` frames.
    """
    def uniform_sample(l, n):
        # Pick n items from l at evenly spaced positions (centre of each bucket).
        gap = len(l) / n
        idxs = [int(i * gap + gap / 2) for i in range(n)]
        return [l[i] for i in idxs]

    vr = VideoReader(video_path, ctx=cpu(0))
    # A reported average FPS below 0.5 rounds to 0, which would make range()
    # raise "arg 3 must not be zero"; never use a stride smaller than 1.
    sample_fps = max(1, round(vr.get_avg_fps() / 1))  # FPS
    frame_idx = list(range(0, len(vr), sample_fps))
    if len(frame_idx) > MAX_NUM_FRAMES:
        frame_idx = uniform_sample(frame_idx, MAX_NUM_FRAMES)
    frames = vr.get_batch(frame_idx).asnumpy()
    frames = [Image.fromarray(v.astype('uint8')).resize(target_size) for v in frames]  # Resize frames
    print('num frames:', len(frames))
    return frames



# Function to get video duration
def get_video_duration(video_path):
    """Return the duration of a video in whole seconds, read from ffmpeg's output.

    ``ffmpeg -i <file>`` prints a ``Duration: HH:MM:SS.ss`` line on stderr; that
    line is parsed and converted to seconds (fraction truncated).

    Args:
        video_path: Path of the video file.

    Returns:
        int: Duration in seconds on success.
        str: ``"Duration not found in video file."`` when no duration line is
        present, or ``"Error: <message>"`` when running/parsing fails. Callers
        must check the type before using the value as a number.
    """
    try:
        # Run the ffmpeg command to get video info
        result = subprocess.run(
            ['ffmpeg', '-i', video_path],
            stderr=subprocess.PIPE,
            stdout=subprocess.PIPE,
            text=True
        )
        # The decimal point is matched literally (it used to be an unescaped
        # "." that accepted any character) and the fields are captured
        # separately so no string splitting is needed afterwards.
        duration_search = re.search(r'Duration: (\d+):(\d+):(\d+(?:\.\d+)?)', result.stderr)
        if duration_search is None:
            return "Duration not found in video file."
        # Convert the captured fields to seconds
        h, m, s = (float(part) for part in duration_search.groups())
        return int(h * 3600 + m * 60 + s)
    except Exception as e:
        return f"Error: {str(e)}"

# Function to analyze video with the model
def video_analyze(model, tokenizer, video_path, question="", MAX_NUM_FRAMES=64):
    """Ask the model a question about a video and return its answer.

    The video is decoded into frames with ``encode_video`` and the frames plus
    the question are sent as a single user message to ``model.chat``.

    Args:
        model: Loaded model object exposing ``chat``.
        tokenizer: Tokenizer passed through to ``model.chat``.
        video_path: Path of the video to analyse.
        question: Text prompt appended after the frames.
        MAX_NUM_FRAMES: Maximum number of frames sampled from the video.

    Returns:
        The value returned by ``model.chat`` (also printed).
    """
    # Forward the frame limit; it used to be accepted but ignored, so the
    # default of encode_video was always used.
    frames = encode_video(video_path, MAX_NUM_FRAMES=MAX_NUM_FRAMES)
    # Ensure frames are converted to a format acceptable by the model
    frame_inputs = [frame.convert("RGB") for frame in frames]
    msgs = [
        {'role': 'user', 'content': frame_inputs + [question]},
    ]
    params = {
        "use_image_id": False,
        "max_slice_nums": 1  # use 1 if cuda OOM and video resolution > 448*448 else 2
    }

    answer = model.chat(
        image=None,
        msgs=msgs,
        tokenizer=tokenizer,
        **params
    )
    print(answer)
    return answer

# Function to initialize model and tokenizer and analyze the video
def MiniCPM_video(video_path, prompt="", MAX_NUM_FRAMES=64):
    """Load MiniCPM-V-2_6-int4, run it on one video, then free the model again.

    Args:
        video_path: Path of the video to analyse.
        prompt: Question/instruction passed to the model.
        MAX_NUM_FRAMES: Maximum number of frames sampled from the video.

    Returns:
        The answer produced by ``video_analyze``.
    """
    model_id = 'openbmb/MiniCPM-V-2_6-int4'
    model = AutoModel.from_pretrained(model_id, trust_remote_code=True)
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    try:
        model.eval()
        return video_analyze(model, tokenizer, video_path, question=prompt, MAX_NUM_FRAMES=MAX_NUM_FRAMES)
    finally:
        # Release the model even when analysis raises (e.g. CUDA OOM), so a
        # retry does not start with the previous model still held in memory.
        del model, tokenizer
        gc.collect()
        torch.cuda.empty_cache()  # noqa

def clean_answer(result):
    """Return ``result`` with every double-quote character removed."""
    unwanted_marks = ('"',)
    cleaned = result
    for mark in unwanted_marks:
        cleaned = cleaned.replace(mark, '') if mark in cleaned else cleaned
    return cleaned

from IPython.display import clear_output

def generate_prompt(video_path, video_topic):
    """Build the narration prompt sent to the video model.

    Args:
        video_path: Path of the video; used to look up its duration.
        video_topic: Optional topic string. When empty, a generic
            "narrate the video" prompt is used.

    Returns:
        The prompt text. A sentence asking for a script that fits the video's
        length is appended when the duration could be determined.
    """
    # Generate base prompt
    if video_topic:
        prompt = (f"Create a short voiceover script in the style of a commentator or narrator on the topic of '{video_topic}' video. "
                  "The script should provide insightful commentary and capture the essence of the topic in a compelling way.")
    else:
        prompt = ("Create a short voiceover script in the style of a commentator or narrator on the video. "
                  "The script should provide insightful commentary and capture the essence of the video in a compelling way.")

    # get_video_duration returns an int on success and an error *string* on
    # failure. Only a real number of seconds belongs in the prompt; otherwise
    # the model would be told to fit the script into "Error: ... seconds".
    duration = get_video_duration(video_path)
    if isinstance(duration, int):
        prompt += (f" The voiceover should be concise, with the entire description of the video designed to fit within "
                   f"{duration} seconds when read aloud.")

    return prompt.strip()

def video_narration(video_path,prompt):
  """Generate a narrated copy of a video.

  Runs the video model with ``prompt``, converts the cleaned answer to speech
  with ``gradio_talk`` (using the global ``Language`` and ``Gender``), muxes the
  speech onto the video and, when Google Drive is mounted, copies the result
  into the Drive ``video`` folder.

  Args:
      video_path: Path of the source video.
      prompt: Prompt passed to the video model.

  Returns:
      Tuple ``(result, output_path)``: the narration text and the path of the
      narrated video.

  Raises:
      RuntimeError: If the video duration cannot be determined.
  """
  global Language,Gender
  # Imported locally: this cell does not import shutil itself and previously
  # only worked once a later cell had imported it.
  import shutil

  # Check the duration before the expensive model run. get_video_duration
  # returns an error string on failure, which would otherwise only blow up
  # inside process_video after the model and TTS had already been run.
  duration = get_video_duration(video_path)
  if not isinstance(duration, int):
    raise RuntimeError(f"Could not determine the duration of {video_path!r}: {duration}")

  result=MiniCPM_video(video_path, prompt)
  result=clean_answer(result)
  print(result)
  audio_path=gradio_talk(result,Language,Gender,translate_text_flag=True)
  clear_output()
  output_path = process_video(video_path, audio_path, duration)
  print(f"Processed video saved at: {output_path}")
  if os.path.exists("/content/gdrive/MyDrive/"):
    drive_dir="/content/gdrive/MyDrive/video"
    # The folder is normally created by the mount cell; create it here too so
    # the copy cannot fail when that cell was skipped.
    os.makedirs(drive_dir, exist_ok=True)
    f_name=os.path.basename(output_path)
    drive_save_path=f"{drive_dir}/{f_name}"
    shutil.copy(output_path,drive_save_path)
  return result,output_path

In [3]:
#@title <-- Just run the cell (config edge TTS)

import nltk
nltk.download('punkt')
# Recent NLTK releases load the sentence tokenizer from the 'punkt_tab'
# resource instead of 'punkt'; without it sent_tokenize raises LookupError.
# On older releases this call just reports that the package is unknown.
nltk.download('punkt_tab')
from nltk.tokenize import sent_tokenize

from deep_translator import GoogleTranslator

languages = {
    "Afrikaans": "af",
    "Amharic": "am",
    "Arabic": "ar",
    "Azerbaijani": "az",
    "Bulgarian": "bg",
    "Bengali": "bn",
    "Bosnian": "bs",
    "Catalan": "ca",
    "Czech": "cs",
    "Welsh": "cy",
    "Danish": "da",
    "German": "de",
    "Greek": "el",
    "English": "en",
    "Spanish": "es",
    "French": "fr",
    "Irish": "ga",
    "Galician": "gl",
    "Gujarati": "gu",
    "Hebrew": "he",
    "Hindi": "hi",
    "Croatian": "hr",
    "Hungarian": "hu",
    "Indonesian": "id",
    "Icelandic": "is",
    "Italian": "it",
    "Japanese": "ja",
    "Javanese": "jv",
    "Georgian": "ka",
    "Kazakh": "kk",
    "Khmer": "km",
    "Kannada": "kn",
    "Korean": "ko",
    "Lao": "lo",
    "Lithuanian": "lt",
    "Latvian": "lv",
    "Macedonian": "mk",
    "Malayalam": "ml",
    "Mongolian": "mn",
    "Marathi": "mr",
    "Malay": "ms",
    "Maltese": "mt",
    "Burmese": "my",
    "Norwegian Bokmål": "nb",
    "Nepali": "ne",
    "Dutch": "nl",
    "Polish": "pl",
    "Pashto": "ps",
    "Portuguese": "pt",
    "Romanian": "ro",
    "Russian": "ru",
    "Sinhala": "si",
    "Slovak": "sk",
    "Slovenian": "sl",
    "Somali": "so",
    "Albanian": "sq",
    "Serbian": "sr",
    "Sundanese": "su",
    "Swedish": "sv",
    "Swahili": "sw",
    "Tamil": "ta",
    "Telugu": "te",
    "Thai": "th",
    "Turkish": "tr",
    "Ukrainian": "uk",
    "Urdu": "ur",
    "Uzbek": "uz",
    "Vietnamese": "vi",
    "Chinese": "zh",
    "Zulu": "zu"
}

def translate_text(text, Language):
    """Translate ``text`` into the language named by ``Language``.

    ``Language`` must be a key of the ``languages`` table; the mapped code is
    used as the GoogleTranslator target, except that "Chinese" is sent as
    'zh-CN'. The stripped text is translated and returned as a ``str``.
    """
    code = languages[Language]
    if Language == "Chinese":
        code = 'zh-CN'
    return str(GoogleTranslator(target=code).translate(text.strip()))


female_voice_list={'Vietnamese': 'vi-VN-HoaiMyNeural',
 'Bengali': 'bn-BD-NabanitaNeural',
 'Thai': 'th-TH-PremwadeeNeural',
 'English': 'en-AU-NatashaNeural',
 'Portuguese': 'pt-BR-FranciscaNeural',
 'Arabic': 'ar-AE-FatimaNeural',
 'Turkish': 'tr-TR-EmelNeural',
 'Spanish': 'es-AR-ElenaNeural',
 'Korean': 'ko-KR-SunHiNeural',
 'French': 'fr-BE-CharlineNeural',
 'Indonesian': 'id-ID-GadisNeural',
 'Russian': 'ru-RU-SvetlanaNeural',
 'Hindi': 'hi-IN-SwaraNeural',
 'Japanese': 'ja-JP-NanamiNeural',
 'Afrikaans': 'af-ZA-AdriNeural',
 'Amharic': 'am-ET-MekdesNeural',
 'Azerbaijani': 'az-AZ-BanuNeural',
 'Bulgarian': 'bg-BG-KalinaNeural',
 'Bosnian': 'bs-BA-VesnaNeural',
 'Catalan': 'ca-ES-JoanaNeural',
 'Czech': 'cs-CZ-VlastaNeural',
 'Welsh': 'cy-GB-NiaNeural',
 'Danish': 'da-DK-ChristelNeural',
 'German': 'de-AT-IngridNeural',
 'Greek': 'el-GR-AthinaNeural',
 'Irish': 'ga-IE-OrlaNeural',
 'Galician': 'gl-ES-SabelaNeural',
 'Gujarati': 'gu-IN-DhwaniNeural',
 'Hebrew': 'he-IL-HilaNeural',
 'Croatian': 'hr-HR-GabrijelaNeural',
 'Hungarian': 'hu-HU-NoemiNeural',
 'Icelandic': 'is-IS-GudrunNeural',
 'Italian': 'it-IT-ElsaNeural',
 'Javanese': 'jv-ID-SitiNeural',
 'Georgian': 'ka-GE-EkaNeural',
 'Kazakh': 'kk-KZ-AigulNeural',
 'Khmer': 'km-KH-SreymomNeural',
 'Kannada': 'kn-IN-SapnaNeural',
 'Lao': 'lo-LA-KeomanyNeural',
 'Lithuanian': 'lt-LT-OnaNeural',
 'Latvian': 'lv-LV-EveritaNeural',
 'Macedonian': 'mk-MK-MarijaNeural',
 'Malayalam': 'ml-IN-SobhanaNeural',
 'Mongolian': 'mn-MN-YesuiNeural',
 'Marathi': 'mr-IN-AarohiNeural',
 'Malay': 'ms-MY-YasminNeural',
 'Maltese': 'mt-MT-GraceNeural',
 'Burmese': 'my-MM-NilarNeural',
 'Norwegian Bokmål': 'nb-NO-PernilleNeural',
 'Nepali': 'ne-NP-HemkalaNeural',
 'Dutch': 'nl-BE-DenaNeural',
 'Polish': 'pl-PL-ZofiaNeural',
 'Pashto': 'ps-AF-LatifaNeural',
 'Romanian': 'ro-RO-AlinaNeural',
 'Sinhala': 'si-LK-ThiliniNeural',
 'Slovak': 'sk-SK-ViktoriaNeural',
 'Slovenian': 'sl-SI-PetraNeural',
 'Somali': 'so-SO-UbaxNeural',
 'Albanian': 'sq-AL-AnilaNeural',
 'Serbian': 'sr-RS-SophieNeural',
 'Sundanese': 'su-ID-TutiNeural',
 'Swedish': 'sv-SE-SofieNeural',
 'Swahili': 'sw-KE-ZuriNeural',
 'Tamil': 'ta-IN-PallaviNeural',
 'Telugu': 'te-IN-ShrutiNeural',
 'Chinese': 'zh-CN-XiaoxiaoNeural',
 'Ukrainian': 'uk-UA-PolinaNeural',
 'Urdu': 'ur-IN-GulNeural',
 'Uzbek': 'uz-UZ-MadinaNeural',
 'Zulu': 'zu-ZA-ThandoNeural'}
male_voice_list= {'Vietnamese': 'vi-VN-NamMinhNeural',
 'Bengali': 'bn-BD-PradeepNeural',
 'Thai': 'th-TH-NiwatNeural',
 'English': 'en-US-BrianMultilingualNeural',
 'Portuguese': 'pt-BR-AntonioNeural',
 'Arabic': 'ar-AE-HamdanNeural',
 'Turkish': 'tr-TR-AhmetNeural',
 'Spanish': 'es-AR-TomasNeural',
 'Korean': 'ko-KR-HyunsuNeural',
 'French': 'fr-BE-GerardNeural',
 'Indonesian': 'id-ID-ArdiNeural',
 'Russian': 'ru-RU-DmitryNeural',
 'Hindi': 'hi-IN-MadhurNeural',
 'Japanese': 'ja-JP-KeitaNeural',
 'Afrikaans': 'af-ZA-WillemNeural',
 'Amharic': 'am-ET-AmehaNeural',
 'Azerbaijani': 'az-AZ-BabekNeural',
 'Bulgarian': 'bg-BG-BorislavNeural',
 'Bosnian': 'bs-BA-GoranNeural',
 'Catalan': 'ca-ES-EnricNeural',
 'Czech': 'cs-CZ-AntoninNeural',
 'Welsh': 'cy-GB-AledNeural',
 'Danish': 'da-DK-JeppeNeural',
 'German': 'de-AT-JonasNeural',
 'Greek': 'el-GR-NestorasNeural',
 'Irish': 'ga-IE-ColmNeural',
 'Galician': 'gl-ES-RoiNeural',
 'Gujarati': 'gu-IN-NiranjanNeural',
 'Hebrew': 'he-IL-AvriNeural',
 'Croatian': 'hr-HR-SreckoNeural',
 'Hungarian': 'hu-HU-TamasNeural',
 'Icelandic': 'is-IS-GunnarNeural',
 'Italian': 'it-IT-DiegoNeural',
 'Javanese': 'jv-ID-DimasNeural',
 'Georgian': 'ka-GE-GiorgiNeural',
 'Kazakh': 'kk-KZ-DauletNeural',
 'Khmer': 'km-KH-PisethNeural',
 'Kannada': 'kn-IN-GaganNeural',
 'Lao': 'lo-LA-ChanthavongNeural',
 'Lithuanian': 'lt-LT-LeonasNeural',
 'Latvian': 'lv-LV-NilsNeural',
 'Macedonian': 'mk-MK-AleksandarNeural',
 'Malayalam': 'ml-IN-MidhunNeural',
 'Mongolian': 'mn-MN-BataaNeural',
 'Marathi': 'mr-IN-ManoharNeural',
 'Malay': 'ms-MY-OsmanNeural',
 'Maltese': 'mt-MT-JosephNeural',
 'Burmese': 'my-MM-ThihaNeural',
 'Norwegian Bokmål': 'nb-NO-FinnNeural',
 'Nepali': 'ne-NP-SagarNeural',
 'Dutch': 'nl-BE-ArnaudNeural',
 'Polish': 'pl-PL-MarekNeural',
 'Pashto': 'ps-AF-GulNawazNeural',
 'Romanian': 'ro-RO-EmilNeural',
 'Sinhala': 'si-LK-SameeraNeural',
 'Slovak': 'sk-SK-LukasNeural',
 'Slovenian': 'sl-SI-RokNeural',
 'Somali': 'so-SO-MuuseNeural',
 'Albanian': 'sq-AL-IlirNeural',
 'Serbian': 'sr-RS-NicholasNeural',
 'Sundanese': 'su-ID-JajangNeural',
 'Swedish': 'sv-SE-MattiasNeural',
 'Swahili': 'sw-KE-RafikiNeural',
 'Tamil': 'ta-IN-ValluvarNeural',
 'Telugu': 'te-IN-MohanNeural',
 'Chinese': 'zh-CN-YunjianNeural',
 'Ukrainian': 'uk-UA-OstapNeural',
 'Urdu': 'ur-IN-SalmanNeural',
 'Uzbek': 'uz-UZ-SardorNeural',
 'Zulu': 'zu-ZA-ThembaNeural'}

def chunks_sentences(paragraph, join_limit=2):
    """Split ``paragraph`` into sentences and regroup them ``join_limit`` at a time.

    Returns a list of strings, each made of up to ``join_limit`` consecutive
    sentences joined by a single space.
    """
    sentences = sent_tokenize(paragraph)
    group_starts = range(0, len(sentences), join_limit)
    return [' '.join(sentences[start:start + join_limit]) for start in group_starts]


def calculate_rate_string(input_value):
    """Convert a speed multiplier into a signed percentage string.

    Args:
        input_value: Speed factor where 1 means unchanged (e.g. 1.2 or 0.5).

    Returns:
        The percentage change with an explicit sign and no "%" suffix,
        e.g. ``"+20"`` for 1.2, ``"-50"`` for 0.5 and ``"+0"`` for 1.
    """
    rate = (input_value - 1) * 100
    sign = '+' if input_value >= 1 else '-'
    # round() rather than int(): (1.2 - 1) * 100 is 19.999999999999996 in
    # floating point, which int() truncated to 19.
    return f"{sign}{abs(round(rate))}"


def make_chunks(input_text, language):
    """Split ``input_text`` into chunks of two sentences each.

    ``language`` is accepted for interface compatibility but has no effect:
    every input goes through the same sentence chunking.
    """
    return chunks_sentences(input_text, join_limit=2)




import re
import uuid
def tts_file_name(text):
    """Build an MP3 path under /content/edge_tts_voice derived from ``text``.

    A single trailing period is dropped, then the text is lower-cased, stripped
    and its spaces replaced by underscores. At most the first 25 characters are
    kept ("empty" if nothing is left), followed by an underscore and 8 random
    upper-case hex characters.
    """
    stem = text[:-1] if text.endswith(".") else text
    stem = stem.lower().strip().replace(" ", "_")
    label = stem[:25] or "empty"
    suffix = uuid.uuid4().hex[:8].upper()
    return f"/content/edge_tts_voice/{label}_{suffix}.mp3"


from pydub import AudioSegment
import shutil
import os
def merge_audio_files(audio_paths, output_path):
    """Concatenate the given audio files in order and export the result as MP3.

    Args:
        audio_paths: Iterable of audio file paths, in playback order.
        output_path: Destination path of the merged MP3.
    """
    # Start from a zero-length segment so the first file is simply appended.
    combined = AudioSegment.silent(duration=0)
    for clip_path in audio_paths:
        combined += AudioSegment.from_file(clip_path)

    combined.export(output_path, format="mp3")

def edge_free_tts(chunks_list,speed,voice_name,save_path,translate_text_flag,Language):
  """Synthesize speech for text chunks with the edge-tts command-line tool.

  Each chunk is optionally translated with ``translate_text`` and rendered to
  MP3. A single chunk is written straight to ``save_path``; several chunks are
  rendered into /content/edge_tts_voice (recreated on every call) and then
  concatenated into ``save_path``.

  Args:
      chunks_list: Non-empty list of text chunks.
      speed: Speed multiplier, converted by ``calculate_rate_string``.
      voice_name: edge-tts voice identifier.
      save_path: Path of the final MP3.
      translate_text_flag: Translate each chunk into ``Language`` when true.
      Language: Target language name for translation.

  Returns:
      ``save_path``.

  Raises:
      ValueError: If ``chunks_list`` is empty.
      RuntimeError: If several chunks were requested and none could be rendered.
  """
  import shlex
  import subprocess

  if not chunks_list:
    raise ValueError("chunks_list is empty: nothing to synthesize")

  rate_option=f"--rate={calculate_rate_string(speed)}%"

  def _synthesize(chunk, media_path):
    # Render one chunk to media_path; return True when edge-tts exits with 0.
    text=translate_text(chunk, Language) if translate_text_flag else chunk
    # Argument list, no shell: quotes, "$" or backticks inside the text can no
    # longer break (or inject into) the command line.
    edge_command=["edge-tts", rate_option, "--voice", voice_name,
                  f"--text={text}", "--write-media", media_path]
    print(" ".join(shlex.quote(part) for part in edge_command))
    try:
      succeeded=subprocess.run(edge_command).returncode==0
    except OSError as error:
      # e.g. the edge-tts executable is not installed
      print(error)
      succeeded=False
    if not succeeded:
      print(f"Failed: {chunk}")
    return succeeded

  print(voice_name)
  print(chunks_list)
  if len(chunks_list)>1:
    chunk_dir="/content/edge_tts_voice"
    if os.path.exists(chunk_dir):
      shutil.rmtree(chunk_dir)
    os.mkdir(chunk_dir)
    chunk_audio_list=[]
    for k, chunk in enumerate(chunks_list, start=1):
      print(chunk)
      chunk_path=f"{chunk_dir}/{k}.mp3"
      # Only successfully rendered chunks are merged; a failed chunk used to be
      # queued anyway and made the merge crash on the missing file.
      if _synthesize(chunk, chunk_path):
        chunk_audio_list.append(chunk_path)
    print(chunk_audio_list)
    if not chunk_audio_list:
      raise RuntimeError("edge-tts failed for every chunk; no audio was produced")
    merge_audio_files(chunk_audio_list, save_path)
  else:
    _synthesize(chunks_list[0], save_path)
  return save_path


# speed = 1  # @param {type: "number"}
# translate_text_flag  = True # @param {type:"boolean"}
# long_sentence = True # @param {type:"boolean"}








from IPython.display import clear_output
from IPython.display import Audio
if not os.path.exists("/content/audio"):
    os.mkdir("/content/audio")
import uuid
def random_audio_name_generate():
  """Return a random MP3 file name: 8 lower-case hex characters plus ".mp3"."""
  return uuid.uuid4().hex[:8] + ".mp3"

def gradio_talk(input_text,Language,Gender,translate_text_flag=True):
  """Convert text to speech and return the path of the generated MP3.

  Args:
      input_text: Text to speak.
      Language: Language name; must be a key of the voice tables.
      Gender: "Male" or "Female"; selects the voice table.
      translate_text_flag: When true, each chunk is translated into
          ``Language`` before synthesis.

  Returns:
      Path of the MP3 written under /content/audio.

  Raises:
      ValueError: If ``Gender`` is neither "Male" nor "Female".
      KeyError: If ``Language`` has no entry in the selected voice table.
  """
  speed=1
  voice_tables={"Male": male_voice_list, "Female": female_voice_list}
  # An unknown gender used to leave the voice name empty, and edge-tts then
  # failed later with an unrelated-looking error; reject it up front.
  if Gender not in voice_tables:
    raise ValueError(f"Gender must be 'Male' or 'Female', got {Gender!r}")
  voice_name=voice_tables[Gender][Language]
  # The text is always split into sentence chunks; the never-taken
  # single-chunk branch was removed.
  chunks_list=make_chunks(input_text, Language if translate_text_flag else "English")
  save_path="/content/audio/"+random_audio_name_generate()
  print(chunks_list,speed,voice_name,save_path,translate_text_flag,Language)
  edge_save_path=edge_free_tts(chunks_list,speed,voice_name,save_path,translate_text_flag,Language)
  return edge_save_path


def talk(input_text):
  """Speak ``input_text`` using the notebook-level ``Language`` and ``Gender``.

  Translation is always enabled. This delegates to ``gradio_talk`` instead of
  repeating its body; the old copy also hit an unbound ``voice_name`` when
  ``Gender`` was neither "Male" nor "Female".

  Returns:
      Path of the generated MP3.
  """
  return gradio_talk(input_text, Language, Gender, translate_text_flag=True)



text = "Config  Edge TTS"  # @param {type: "string"}
Language = "English" # @param ['English','Hindi','Bengali','Afrikaans', 'Amharic', 'Arabic', 'Azerbaijani', 'Bulgarian', 'Bosnian', 'Catalan', 'Czech', 'Welsh', 'Danish', 'German', 'Greek', 'Spanish', 'French', 'Irish', 'Galician', 'Gujarati', 'Hebrew', 'Croatian', 'Hungarian', 'Indonesian', 'Icelandic', 'Italian', 'Japanese', 'Javanese', 'Georgian', 'Kazakh', 'Khmer', 'Kannada', 'Korean', 'Lao', 'Lithuanian', 'Latvian', 'Macedonian', 'Malayalam', 'Mongolian', 'Marathi', 'Malay', 'Maltese', 'Burmese', 'Norwegian Bokmål', 'Nepali', 'Dutch', 'Polish', 'Pashto', 'Portuguese', 'Romanian', 'Russian', 'Sinhala', 'Slovak', 'Slovenian', 'Somali', 'Albanian', 'Serbian', 'Sundanese', 'Swedish', 'Swahili', 'Tamil', 'Telugu', 'Thai', 'Turkish', 'Ukrainian', 'Urdu', 'Uzbek', 'Vietnamese', 'Chinese', 'Zulu']

Gender = "Male"# @param ['Male', 'Female']
edge_save_path=talk(text)
from IPython.display import clear_output
clear_output()
print(f"Audio File Save at: {edge_save_path}")
Audio(edge_save_path, autoplay=False)

Audio File Save at: /content/audio/030fb44c.mp3


In [4]:

#@title Mount drive to upload video the speed is faster
from google.colab import drive
import os
drive.mount('/content/gdrive')
if not os.path.exists("/content/gdrive/MyDrive/video"):
  os.mkdir("/content/gdrive/MyDrive/video")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


# Colab video upload is slow

### "already_uploaded_in_drive"
1.   Open google drive
2.   Find a folder called "```video```" inside google drive
3.   Upload your video inside "```video```" folder in google drive
4.   Run the next cell and select "already_uploaded_in_drive"

### "upload_now"
Upload the video directly through Colab. This works, but it is slow.

In [5]:
# @title ##**Choose option** { display-mode: "form" }
choose= "already_uploaded_in_drive" #@param ['upload_now','already_uploaded_in_drive']
import os
from google.colab import drive
from IPython.display import clear_output

folder_path = "/content/gdrive/MyDrive/video"


from google.colab import files
import shutil

upload_folder = '/content/user_upload'

if not os.path.exists(upload_folder):
    os.mkdir(upload_folder)
upload_video=[]
if choose== "upload_now":
  uploaded = files.upload()
  for filename in uploaded.keys():
    dst_path = os.path.join(upload_folder, filename)
    print(f'move {filename} to {dst_path}')
    shutil.move(filename, dst_path)
    upload_video.append(dst_path)
    clear_output()
    print(upload_video[-1])


if choose== "already_uploaded_in_drive":
  import pandas as pd
  drive.mount('/content/gdrive')
  clear_output()
  video_folder="/content/gdrive/MyDrive/video"
  os.makedirs(video_folder, exist_ok=True)
  # os.listdir returns entries in arbitrary order; sort them so a file keeps
  # the same id from one run to the next.
  videos=sorted(os.listdir(video_folder))
  ids=list(range(1, len(videos)+1))
  # id -> file name lookup used by the next cell
  id_monitor=dict(zip(ids, videos))
  table={"file_name": videos, "file_id": ids}
  # set_index cannot fail here because 'file_id' is always present, so the old
  # bare "except: pass" around it only hid unrelated errors.
  df = pd.DataFrame(table).set_index('file_id')
  print(df)
  print("\n")
  print("Note the File Id")


                         file_name
file_id                           
1        81945-577442929_small.mp4
2                        audio.MP3
3                          cat.mp4


Note the File Id


In [6]:
# @title ##**leave blank if you selected 'upload_now'** { display-mode: "form" }

if choose == "already_uploaded_in_drive":
    video_id = '3'  # @param {type: "string"}
    # Always define the result so the cell's last line cannot show a stale
    # value (or raise NameError) when no valid id was entered.
    target_video_path = ''
    video_id = video_id.strip()
    if len(video_id) == 0:
        print("Enter Video ID")
    else:
        try:
            video_id = int(video_id)
        except ValueError:
            video_id = None
        # Membership test instead of "< len(id_monitor)+1": ids such as 0 or
        # negative numbers passed the old check and raised KeyError.
        if video_id in id_monitor:
            target_video_path = "/content/gdrive/MyDrive/video/" + id_monitor[video_id]
        else:
            print("Invalid Video ID")
else:
    if upload_video:
        target_video_path = upload_video[-1]
    else:
        # Nothing was uploaded in the previous cell
        print("No uploaded video found")
        target_video_path = ''

target_video_path

'/content/gdrive/MyDrive/video/cat.mp4'

In [7]:
#@title Run the model
# Example usage
# Optional topic; when left empty generate_prompt falls back to a generic
# "narrate the video" prompt.
video_topic="" # @param {type: "string"}
# NOTE(review): this path is typed into the form by hand and is not taken from
# `target_video_path` computed in the previous cell; confirm that is intended.
video_path = '/content/gdrive/MyDrive/video/cat.mp4'  # @param {type: "string"}
# prompt="" # @param {type: "string"}
prompt = generate_prompt(video_path, video_topic)
print(f"Prompt: {prompt}")
# Returns the narration text and the path of the narrated video.
result,output_path=video_narration(video_path,prompt)
print("Video Analyze:")
result

Processed video saved at: /content/video_save/voice_over_cat.mp4
Video Analyze:


"Capturing the essence of a nocturnal encounter, this video unfolds as two kittens engage in playful antics on a dimly lit floor. Their youthful energy contrasts with the calm demeanor of an adult cat, which observes from a distance. The scene shifts to reveal a curious kitten investigating a snake, highlighting nature's unexpected interactions and the delicate balance within their environment."

In [None]:
#@title download and display video
print(f"Video Save at: {output_path}")
print(f"One copy save at in your google drive 'video' Folder ")
files.download(output_path)
# show_video(output_path)