In [2]:
# %pip install pptxtopdf PyMuPDF Pillow moviepy python-dotenv
# %pip install -q -U google-generativeai
# %pip install numpy PyMuPDF moviepy pptxtopdf mediapipe opencv-python azure-cognitiveservices-speech
# %pip install pydub

In [1]:
import os
import shutil
import time
import json
import requests
from dotenv import load_dotenv
import numpy as np
import PIL
from PIL import Image
import fitz  # PyMuPDF
from pptxtopdf import convert
import google.generativeai as genai
from moviepy.editor import *
import mediapipe as mp
import cv2
from tqdm import tqdm
import azure.cognitiveservices.speech as speechsdk
from moviepy.editor import concatenate_audioclips, AudioFileClip

  from .autonotebook import tqdm as notebook_tqdm


# Set the environment variables containing the API keys and service region
* Create a `.env` file in the current working directory.
* Add the following variables:
    -   SPEECH_SERVICEE_REGION
    -   SPEECH_SERVICE_API_KEY
    -   GOOGLE_GEMINI_API_KEY

In [2]:
# Credentials are read from a .env file in the working directory; stop right away if it cannot be loaded.
if not load_dotenv('./.env'):
    raise Exception("env file not found")

# The key names below must match the names used in the .env file exactly
# (including the double "E" in SPEECH_SERVICEE_REGION).
SERVICE_REGION = os.environ.get("SPEECH_SERVICEE_REGION")
SUBSCRIPTION_KEY = os.environ.get("SPEECH_SERVICE_API_KEY")
GEMINI_API_KEY = os.environ.get("GOOGLE_GEMINI_API_KEY")

# Base URL built from the configured speech service region
url_base = f"https://{SERVICE_REGION}.customvoice.api.speech.microsoft.com/api"

# "gemini-1.5-pro-latest" can be used instead, but it will be a bit slower.
MODEL = "gemini-1.5-flash-latest"
genai.configure(api_key=GEMINI_API_KEY)
model = genai.GenerativeModel(MODEL)

In [3]:
def text_to_speech(text, output_file):
    """Synthesize ``text`` with the Azure Speech service and write the audio to ``output_file``.

    Uses the module-level ``SUBSCRIPTION_KEY`` and ``SERVICE_REGION`` and the
    'en-US-AndrewNeural' voice.

    :param text: Text to speak.
    :param output_file: Path of the audio file to write.
    :return: True when synthesis completed, False otherwise. On failure the result
             reason and, if available, the cancellation/error details are printed.
    """
    # Create a speech configuration object
    speech_config = speechsdk.SpeechConfig(subscription=SUBSCRIPTION_KEY, region=SERVICE_REGION)
    speech_config.speech_synthesis_voice_name = 'en-US-AndrewNeural'

    # Create a speech synthesizer object with a file output
    audio_config = speechsdk.audio.AudioOutputConfig(filename=output_file)
    synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=audio_config)

    # Synthesize the text and block until the result is available
    result = synthesizer.speak_text_async(text).get()

    if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
        print(f'Text-to-speech conversion successful. Audio saved to {output_file}')
        return True

    print(f'Text-to-speech conversion failed: {result.reason}')
    # A cancelled request carries the actual cause (bad key, wrong region, network, ...);
    # surface it so the failure can be diagnosed.
    if result.reason == speechsdk.ResultReason.Canceled:
        details = result.cancellation_details
        print(f'Cancellation reason: {details.reason}')
        if details.reason == speechsdk.CancellationReason.Error:
            print(f'Error details: {details.error_details}')
    return False

# File and directory names
* These can be changed to any desired names.

In [4]:
input_ppt = "output.pptx" # Make sure it is in the current working directory (Example: file_name.pptx)
# Shared lists filled in by later cells
image_names = []  # page image names (without extension), appended by convert_to_images
slides_content = []  # generated narration text, one entry per slide
# Working directories
images_dir = "images"  # rendered slide images
avtr_video_dir = "avt_videos"  # videos read by combine_webm_videos
# Output file names
frames_video_file = "main.mp4"
avtr_video_file = "avtr_vid.webm"
final_video_file = "final.webm"

# Helper functions

In [5]:
def delete_folder_and_create(path):
    """Leave ``path`` as a newly created, empty directory, removing any existing tree first."""
    already_present = os.path.exists(path)
    if already_present:
        shutil.rmtree(path)
    os.makedirs(path)

def convert_to_images(input_file_path):
    """Convert a .pptx file to PDF and render every PDF page to a PNG in ``images_dir``.

    The images are saved as ``page_<n>.png`` with ``<n>`` zero-padded to the width of
    the page count, and the matching names (without extension) are stored in the
    module-level ``image_names`` list. ``images_dir`` is deleted and recreated.

    :param input_file_path: Path of the presentation to convert.
    """
    # splitext strips only the final extension, so names such as "my.deck.pptx" or
    # "./deck.pptx" still map to the right PDF name (split(".")[0] did not).
    # NOTE(review): assumes the converter writes "<name>.pdf" where this path points — confirm.
    pdf_path = os.path.splitext(input_file_path)[0] + ".pdf"
    convert(input_file_path, "")

    if os.path.exists(images_dir): shutil.rmtree(images_dir)
    os.makedirs(images_dir)
    # Start from an empty list so that re-running does not append duplicate names
    image_names.clear()

    # The context manager closes the PDF even if rendering a page fails
    with fitz.open(pdf_path) as pdf_document:
        num_pages = len(pdf_document)
        num_digits = len(str(num_pages))
        for page_num in range(num_pages):
            page_number_str = str(page_num + 1).zfill(num_digits)
            page = pdf_document.load_page(page_num)
            pix = page.get_pixmap()
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            img_path = f"{images_dir}/page_{page_number_str}.png"
            image_names.append(f"page_{page_number_str}")
            img.save(img_path)
            print(f"Saved {img_path}")

def download_file(url, local_path, index):
    """Stream ``url`` into ``local_path`` as ``<image_names[index]>.webm`` and return the saved path.

    An HTTP error status is raised through ``raise_for_status``.
    """
    with requests.get(url, stream=True) as response:
        response.raise_for_status()
        # The file is named after the slide image at ``index``, not after the URL
        local_filename = os.path.join(local_path, image_names[index]+".webm")
        with open(local_filename, 'wb') as sink:
            for block in response.iter_content(chunk_size=8192):
                if not block:
                    continue
                sink.write(block)
    return local_filename

def preprocess_response(text):
    """Return ``text`` with every asterisk removed."""
    return "".join(text.split('*'))

def combine_webm_videos(output_file):
    """Concatenate every video in ``avtr_video_dir`` into ``output_file``.

    The clips are joined in sorted file-name order and encoded with the libvpx video
    codec and the libvorbis audio codec.

    :param output_file: Path of the combined video to write.
    """
    clips = []
    final_clip = None
    try:
        # os.listdir returns names in arbitrary order; sort so the clips are joined
        # in a deterministic, file-name order.
        for file in sorted(os.listdir(avtr_video_dir)):
            path = os.path.join(avtr_video_dir, file)
            clips.append(VideoFileClip(path))
        final_clip = concatenate_videoclips(clips, method="compose")
        final_clip.write_videofile(output_file, codec="libvpx", audio_codec="libvorbis")
    finally:
        # Release all clips even when loading or writing fails
        if final_clip is not None:
            final_clip.close()
        for clip in clips:
            clip.close()

def get_video_duration(file_path):
    """Open the video at ``file_path``, read its ``duration`` attribute, close it and return the value."""
    clip = VideoFileClip(file_path)
    length = clip.duration
    clip.close()
    return length

def custom_resize(clip, newsize):
    """Return ``clip.fl_image`` applied with a per-frame resize to ``newsize`` (Pillow LANCZOS filter)."""
    def _lanczos(frame):
        # Go through Pillow for the resize, then hand a NumPy array back
        resized = Image.fromarray(frame).resize(newsize, Image.LANCZOS)
        return np.array(resized)

    return clip.fl_image(_lanczos)


# Note: This deletes and recreates all previously generated intermediate folders and files!

In [7]:
# Reset both working directories to empty folders; anything already in them is deleted.
delete_folder_and_create(images_dir)
delete_folder_and_create(avtr_video_dir)

# Convert the .pptx into images in a directory

In [8]:
convert_to_images(input_ppt)

Error: Output file 'd:\IISc-Internship\Week11\info_to_pptx\output.pdf' already exists.
Conversion completed: 0 files converted successfully, 1 files failed.
Saved images/page_1.png
Saved images/page_2.png
Saved images/page_3.png


# Generate content for each slide based on the content of previous slides
* The content already generated for the previous slides is sent along with each slide so that the context is preserved.
* The prompt can be changed; a simple prompt is currently used.

In [9]:
# prompt_template = """
# You are a professional presenter who has created this slides and is explaining to audience. You need to look into the current slide of a PowerPoint presentation given to you as an image and explain it to the audience. Ensure the explanation is clear, detailed, and relevant to the content of the slide. Do not use any greetings like good morning, evening, or night. Do not use astricks in the response. The output should be plain text without any markdown notations. Explain the slides slide by slide. If previous slides have been explained, continue seamlessly from the previous slide's information.
# """

# for idx,slide in enumerate(os.listdir(images_dir)):
#     print("Slide: ", str(idx+1))
#     img = PIL.Image.open(os.path.join("images", slide))
#     prompt = prompt_template
#     if len(slides_content) > 0:
#         prompt += " Continue from the previous slide's information given here: "
#         for j, text in enumerate(slides_content):
#             prompt += f"Slide-{j+1} {text} "
#     prompt += f"Explain the current slide (Slide-{len(slides_content)+1}) in detail:"
#     response = model.generate_content([prompt, img], stream=True)
#     response.resolve()
#     description = preprocess_response(response.text)
#     slides_content.append(description)
#     print(description)

# Reset the narration list so that re-running this cell starts from scratch.
slides_content = []

# Prompt sent with every slide image. The {slide_number} and {previous_slides_content}
# placeholders are filled in per slide with str.format.
prompt_template = """
You are a professional presenter explaining a PowerPoint presentation to an audience. Analyze the current slide image and provide a concise, insightful explanation. Focus on the key points and main ideas rather than reading any text verbatim from the slide. Your explanation should be brief but informative, suitable for a quick presentation.

Guidelines:
1. Do not use greetings or introductions.
2. Avoid reading sentences directly from the slide.
3. Provide context and insights beyond what's visibly written.
4. Keep the explanation concise, aiming for about 2-3 sentences per slide.
5. Use plain text without any special formatting or markdown.
6. If this isn't the first slide, ensure your explanation flows naturally from the previous content.
7. Use examples other than the one given in the ppt.
8. Provide additional details when and where required.
9. Connect the contents from previous slides also.
10. Be as concise as possible.
11. Focus more on explaining images in each slide which may be flowcharts or images provided that they are technical images with information related to the topic.

Current slide number: {slide_number}

Previous slides' content (if any):
{previous_slides_content}

Explain this slide succinctly:
"""

# os.listdir returns entries in arbitrary order; sort so the slides are narrated in
# page order (convert_to_images zero-pads the page numbers, so name order is page order).
for idx, slide in enumerate(sorted(os.listdir(images_dir))):
    print(f"Slide: {idx+1}")

    # Everything narrated so far is passed back to the model as context
    previous_slides_content = ""
    if len(slides_content) > 0:
        previous_slides_content = " ".join([f"Slide-{j+1}: {text}" for j, text in enumerate(slides_content)])

    prompt = prompt_template.format(
        slide_number=idx+1,
        previous_slides_content=previous_slides_content
    )

    # Read from images_dir (the folder that was listed) rather than a hard-coded "images",
    # and close the image file as soon as the response has been fully received.
    with PIL.Image.open(os.path.join(images_dir, slide)) as img:
        response = model.generate_content([prompt, img], stream=True)
        response.resolve()

    description = preprocess_response(response.text)
    slides_content.append(description)
    print(description)

Slide: 1
This presentation will serve as a comprehensive guide to Large Language Model (LLM) agents, explaining their capabilities in solving intricate problems. We will explore their components, practical use cases, and the challenges that come with their implementation.  Think of LLM agents as sophisticated tools equipped with reasoning, data analysis, and planning capabilities to solve complex problems in areas like language processing and natural language understanding. 

Slide: 2
This slide highlights the key distinction between traditional Large Language Models (LLMs) and LLM agents. While basic LLMs with Retrieval Augmented Generation (RAG) systems primarily retrieve information, LLM agents go beyond simple retrieval. They employ sequential reasoning, planning, and memory to analyze data, plan tasks, and connect information to solve complex problems, as illustrated by the flowchart depicting the components of an LLM agent.  For example, consider a medical diagnosis scenario. Whi

In [10]:
slides_content

['This presentation will serve as a comprehensive guide to Large Language Model (LLM) agents, explaining their capabilities in solving intricate problems. We will explore their components, practical use cases, and the challenges that come with their implementation.  Think of LLM agents as sophisticated tools equipped with reasoning, data analysis, and planning capabilities to solve complex problems in areas like language processing and natural language understanding. \n',
 'This slide highlights the key distinction between traditional Large Language Models (LLMs) and LLM agents. While basic LLMs with Retrieval Augmented Generation (RAG) systems primarily retrieve information, LLM agents go beyond simple retrieval. They employ sequential reasoning, planning, and memory to analyze data, plan tasks, and connect information to solve complex problems, as illustrated by the flowchart depicting the components of an LLM agent.  For example, consider a medical diagnosis scenario. While a basic 

In [6]:
# Hard-coded narration text, one string per slide.
# NOTE(review): presumably pasted from the generation cell's output so the following cells
# can run without calling the model again — confirm it is still in sync with the slides.
slides_content=['This presentation will serve as a comprehensive guide to Large Language Model (LLM) agents, explaining their capabilities in solving intricate problems. We will explore their components, practical use cases, and the challenges that come with their implementation.  Think of LLM agents as sophisticated tools equipped with reasoning, data analysis, and planning capabilities to solve complex problems in areas like language processing and natural language understanding. \n',
 'This slide highlights the key distinction between traditional Large Language Models (LLMs) and LLM agents. While basic LLMs with Retrieval Augmented Generation (RAG) systems primarily retrieve information, LLM agents go beyond simple retrieval. They employ sequential reasoning, planning, and memory to analyze data, plan tasks, and connect information to solve complex problems, as illustrated by the flowchart depicting the components of an LLM agent.  For example, consider a medical diagnosis scenario. While a basic LLM might retrieve symptoms and related medical conditions, an LLM agent could analyze patient history, medical records, and current symptoms to arrive at a more accurate diagnosis, potentially even suggesting a treatment plan. \n',
 'This slide breaks down the building blocks of LLM agents, highlighting three key components: the Agent/Brain, Memory, and Planning. The Agent/Brain is the core language model, while Memory enables short-term and long-term retention, allowing for tailored responses and personalized interactions. Planning, the final component, focuses on formulating and refining plans, breaking down complex tasks into smaller steps. This allows LLM agents to adapt to real-world situations and utilize reasoning methods like Chain of Thought (CoT) and Tree of Thought (ToT). \n']

In [7]:
import re

# Sentence boundary: a whitespace character that follows "." or "?", unless the text just
# before it looks like an abbreviation ("e.g." style letter-dot-letter, or "Dr." style
# capital + lowercase + dot).
_SENTENCE_BOUNDARY = re.compile(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s')


def split_into_sentences(text):
    """Split ``text`` into sentences with inner whitespace collapsed to single spaces.

    Empty fragments are dropped, so an empty or whitespace-only ``text`` yields ``[]``
    instead of ``['']`` (which would otherwise be sent to text-to-speech as a blank sentence).
    """
    fragments = _SENTENCE_BOUNDARY.split(text.strip())
    collapsed = (" ".join(fragment.split()) for fragment in fragments)
    return [sentence for sentence in collapsed if sentence]


# One dictionary per slide: its 1-based number and the list of its sentences
sent_slide_content = [
    {"slide_number": number, "sentences": split_into_sentences(slide_text)}
    for number, slide_text in enumerate(slides_content, start=1)
]

print("slides: ",sent_slide_content)
# Output the structured data
for slide in sent_slide_content:
    print(f"Slide {slide['slide_number']}:")
    for sentence in slide['sentences']:
        print(f"  - {sentence}")

# Now you have the structured slides list

slides:  [{'slide_number': 1, 'sentences': ['This presentation will serve as a comprehensive guide to Large Language Model (LLM) agents, explaining their capabilities in solving intricate problems.', 'We will explore their components, practical use cases, and the challenges that come with their implementation.', 'Think of LLM agents as sophisticated tools equipped with reasoning, data analysis, and planning capabilities to solve complex problems in areas like language processing and natural language understanding.']}, {'slide_number': 2, 'sentences': ['This slide highlights the key distinction between traditional Large Language Models (LLMs) and LLM agents.', 'While basic LLMs with Retrieval Augmented Generation (RAG) systems primarily retrieve information, LLM agents go beyond simple retrieval.', 'They employ sequential reasoning, planning, and memory to analyze data, plan tasks, and connect information to solve complex problems, as illustrated by the flowchart depicting the component

In [8]:
AUDIO_FILES = "audios"  # directory that receives the per-sentence audio files

In [9]:
import shutil

# Remove any earlier audio output, then start again with an empty directory.
if os.path.exists(AUDIO_FILES):
    shutil.rmtree(AUDIO_FILES)
os.mkdir(AUDIO_FILES)

In [10]:
# Write one audio file per sentence, named "slideXXX_sentenceYYY.wav" (both numbers 1-based
# and zero-padded to three digits).
for slide in sent_slide_content:
    slide_number = slide['slide_number']
    for position, sentence in enumerate(slide['sentences']):
        sentence_number = position + 1
        file_name = f"slide{slide_number:03}_sentence{sentence_number:03}.wav"
        output_file_path = os.path.join(AUDIO_FILES, file_name)
        text_to_speech(sentence, output_file_path)

Text-to-speech conversion successful. Audio saved to audios\slide001_sentence001.wav
Text-to-speech conversion successful. Audio saved to audios\slide001_sentence002.wav
Text-to-speech conversion successful. Audio saved to audios\slide001_sentence003.wav
Text-to-speech conversion successful. Audio saved to audios\slide002_sentence001.wav
Text-to-speech conversion successful. Audio saved to audios\slide002_sentence002.wav
Text-to-speech conversion successful. Audio saved to audios\slide002_sentence003.wav
Text-to-speech conversion successful. Audio saved to audios\slide002_sentence004.wav
Text-to-speech conversion successful. Audio saved to audios\slide002_sentence005.wav
Text-to-speech conversion successful. Audio saved to audios\slide003_sentence001.wav
Text-to-speech conversion successful. Audio saved to audios\slide003_sentence002.wav
Text-to-speech conversion successful. Audio saved to audios\slide003_sentence003.wav
Text-to-speech conversion successful. Audio saved to audios\slide

___

# From here on: generate the final video

In [8]:
from pydub import AudioSegment
from moviepy.editor import concatenate_audioclips, AudioFileClip
import os

def get_audio_durations(directory):
    """Measure every audio file in ``directory`` and total the durations per slide.

    Only ``.mp3`` and ``.wav`` files are considered, in sorted file-name order. File names
    must start with ``slide<number>_`` (for example ``slide001_sentence002.wav``); the digits
    between ``slide`` and the first underscore are the slide number.

    :param directory: Folder containing the audio files.
    :return: ``(durations, slide_durations)`` where ``durations`` has one entry per file and
             ``slide_durations`` has one summed entry per run of files that share a slide
             number. Both lists are empty when the folder holds no audio files.
    """
    # List all MP3 or WAV files in the directory
    files = sorted([f for f in os.listdir(directory) if f.endswith(('.mp3','.wav'))])
    print("Files: ", files)

    durations = []
    slide_durations = []

    current_slide = None
    current_slide_duration = 0

    for file in files:
        # Open the file with moviepy only long enough to read its duration, then close it
        # so the underlying reader is not kept open for every file.
        audio_clip = AudioFileClip(os.path.join(directory, file))
        try:
            duration = audio_clip.duration
        finally:
            audio_clip.close()
        durations.append(duration)

        # Extract the slide number from the file name ("slide001_..." -> 1)
        slide_number = int(file.split('_')[0][5:])

        if current_slide is None:
            current_slide = slide_number

        if slide_number == current_slide:
            current_slide_duration += duration
        else:
            # A new slide starts: store the finished total and begin a new one
            slide_durations.append(current_slide_duration)
            current_slide_duration = duration
            current_slide = slide_number

    # Append the total of the last slide, but only if at least one file was seen
    if current_slide is not None:
        slide_durations.append(current_slide_duration)

    return durations, slide_durations

# Usage example
directory = "processed_files/audios"  # Replace with your directory path
output_file = 'combined_audio.wav'  # Replace with your desired output file name
durations, slide_durations = get_audio_durations(directory)

print("Durations of each audio file:", durations)
print("Combined durations for each slide:", slide_durations)


Files:  ['slide001_sentence001.wav', 'slide001_sentence002.wav', 'slide001_sentence003.wav', 'slide002_sentence001.wav', 'slide002_sentence002.wav', 'slide002_sentence003.wav', 'slide003_sentence001.wav', 'slide003_sentence002.wav', 'slide003_sentence003.wav', 'slide003_sentence004.wav']
Durations of each audio file: [6.64, 9.52, 8.04, 7.19, 8.42, 11.15, 6.27, 8.4, 12.32, 7.49]
Combined durations for each slide: [24.2, 26.759999999999998, 34.480000000000004]


In [10]:
from moviepy.editor import ImageClip

def create_video_from_image(image_path, output_video_path, duration, fps=30):
    """
    Turn one still image into a video file encoded with libx264.

    :param image_path: Path to the input image file.
    :param output_video_path: Path where the output video will be saved.
    :param duration: Value passed to ``set_duration`` for the clip.
    :param fps: Frames per second for the video. Default is 30.
    """
    # Build the clip in one chain: load the image, fix its duration, then its frame rate
    still_clip = ImageClip(image_path).set_duration(duration).set_fps(fps)
    still_clip.write_videofile(output_video_path, codec="libx264")

In [11]:
slide_durations

[24.2, 26.759999999999998, 34.480000000000004]

In [16]:
# exist_ok: re-running the cell must not fail because the folder already exists
os.makedirs("output_videos", exist_ok=True)
# durations comes from get_audio_durations, which walks the audio files in sorted name order.
# os.listdir gives no ordering guarantee, so sort here as well to keep durations[idx] aligned
# with the video at the same position.
for idx, vid in enumerate(sorted(os.listdir("avt_videos"))):
    # The first 8 characters of the video name (e.g. "slide001") name the slide image
    img_source = vid[:8]+".png"
    print("img source: ", img_source)
    slide_number = int(img_source[5:8])
    print("slide number: ",slide_number)
    # NOTE: despite the name, this is the duration of a single audio file, not the slide total
    slide_duration = durations[idx]
    print("slide_duration: ",slide_duration)
    image_path = os.path.join("images",img_source)
    output_video_path = os.path.join("output_videos",vid)
    create_video_from_image(image_path=image_path,output_video_path=output_video_path,duration=slide_duration, fps = 30)

t:   1%|          | 2/200 [01:33<00:12, 15.62it/s, now=None]

img source:  slide001.png
slide number:  1
slide_duration:  6.64
Moviepy - Building video output_videos\slide001_sentence001.mp4.
Moviepy - Writing video output_videos\slide001_sentence001.mp4





t:   1%|          | 2/200 [01:34<00:12, 15.62it/s, now=None]

Moviepy - Done !
Moviepy - video ready output_videos\slide001_sentence001.mp4
img source:  slide001.png
slide number:  1
slide_duration:  9.52
Moviepy - Building video output_videos\slide001_sentence002.mp4.
Moviepy - Writing video output_videos\slide001_sentence002.mp4



t:   1%|          | 2/200 [01:36<00:12, 15.62it/s, now=None]

Moviepy - Done !
Moviepy - video ready output_videos\slide001_sentence002.mp4
img source:  slide001.png
slide number:  1
slide_duration:  8.04
Moviepy - Building video output_videos\slide001_sentence003.mp4.
Moviepy - Writing video output_videos\slide001_sentence003.mp4



t:   1%|          | 2/200 [01:38<00:12, 15.62it/s, now=None]

Moviepy - Done !
Moviepy - video ready output_videos\slide001_sentence003.mp4
img source:  slide002.png
slide number:  2
slide_duration:  7.19
Moviepy - Building video output_videos\slide002_sentence001.mp4.
Moviepy - Writing video output_videos\slide002_sentence001.mp4



t:   1%|          | 2/200 [01:40<00:12, 15.62it/s, now=None]

Moviepy - Done !
Moviepy - video ready output_videos\slide002_sentence001.mp4
img source:  slide002.png
slide number:  2
slide_duration:  8.42
Moviepy - Building video output_videos\slide002_sentence002.mp4.
Moviepy - Writing video output_videos\slide002_sentence002.mp4



t:   1%|          | 2/200 [01:42<00:12, 15.62it/s, now=None]

Moviepy - Done !
Moviepy - video ready output_videos\slide002_sentence002.mp4
img source:  slide002.png
slide number:  2
slide_duration:  11.15
Moviepy - Building video output_videos\slide002_sentence003.mp4.
Moviepy - Writing video output_videos\slide002_sentence003.mp4



t:   1%|          | 2/200 [01:44<00:12, 15.62it/s, now=None]

Moviepy - Done !
Moviepy - video ready output_videos\slide002_sentence003.mp4
img source:  slide003.png
slide number:  3
slide_duration:  6.27
Moviepy - Building video output_videos\slide003_sentence001.mp4.
Moviepy - Writing video output_videos\slide003_sentence001.mp4



t:   1%|          | 2/200 [01:46<00:12, 15.62it/s, now=None]

Moviepy - Done !
Moviepy - video ready output_videos\slide003_sentence001.mp4
img source:  slide003.png
slide number:  3
slide_duration:  8.4
Moviepy - Building video output_videos\slide003_sentence002.mp4.
Moviepy - Writing video output_videos\slide003_sentence002.mp4



t:   1%|          | 2/200 [01:47<00:12, 15.62it/s, now=None]

Moviepy - Done !
Moviepy - video ready output_videos\slide003_sentence002.mp4
img source:  slide003.png
slide number:  3
slide_duration:  12.32
Moviepy - Building video output_videos\slide003_sentence003.mp4.
Moviepy - Writing video output_videos\slide003_sentence003.mp4



t:   1%|          | 2/200 [01:50<00:12, 15.62it/s, now=None]

Moviepy - Done !
Moviepy - video ready output_videos\slide003_sentence003.mp4
img source:  slide003.png
slide number:  3
slide_duration:  7.49
Moviepy - Building video output_videos\slide003_sentence004.mp4.
Moviepy - Writing video output_videos\slide003_sentence004.mp4



t:   1%|          | 2/200 [01:52<00:12, 15.62it/s, now=None]

Moviepy - Done !
Moviepy - video ready output_videos\slide003_sentence004.mp4


In [27]:
import cv2
import numpy as np
import os
import shutil
from tqdm import tqdm
import moviepy.editor as mpy
from PIL import Image, ImageSequence
import mediapipe as mp
from moviepy.editor import ImageSequenceClip, VideoFileClip

def remove_pixels_based_on_green(image, threshold=5):
    """Make every green-dominant pixel of ``image`` fully transparent.

    The image is converted to RGBA, and each pixel whose green value exceeds both its red
    and its blue value by at least ``threshold`` is replaced with ``(0, 0, 0, 0)``.
    All other pixels are kept unchanged.

    :param image: PIL image to process.
    :param threshold: Minimum amount by which green must exceed red and blue.
    :return: RGBA PIL image with the green-dominant pixels cleared.
    """
    # Work on the whole image at once with NumPy instead of a Python loop over pixels
    rgba = np.array(image.convert("RGBA"))

    # Widen to a signed type so the subtractions below cannot wrap around in uint8
    red = rgba[..., 0].astype(np.int16)
    green = rgba[..., 1].astype(np.int16)
    blue = rgba[..., 2].astype(np.int16)

    green_dominant = (green - red >= threshold) & (green - blue >= threshold)
    rgba[green_dominant] = 0  # all four channels to 0 -> transparent pixel

    return Image.fromarray(rgba)

# Initialize MediaPipe Selfie Segmentation.
# A single module-level instance is created here; save_frames calls segmentation.process on it.
# NOTE(review): model_selection=1 presumably selects a specific model variant — confirm in the MediaPipe docs.
mp_selfie_segmentation = mp.solutions.selfie_segmentation
segmentation = mp_selfie_segmentation.SelfieSegmentation(model_selection=1)

def save_frames(input_video_path, threshold = 5):
    """Cut the foreground out of every frame of a video and save the frames as PNG files.

    Each frame is segmented with the module-level MediaPipe ``segmentation`` object; pixels
    whose mask value is not above 0.6 become transparent black. The result is then passed
    through ``remove_pixels_based_on_green`` and written to ``frames/frame_NNNN.png``.
    The ``frames`` directory is deleted and recreated on every call.

    :param input_video_path: Path of the video to read with OpenCV.
    :param threshold: Forwarded to ``remove_pixels_based_on_green``.
    :return: None. Prints "Failed to read video" and returns early when no frame can be read.
    """
    # Always start from an empty output directory
    output_dir = 'frames'
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    os.makedirs(output_dir)

    # Initialize video capture
    cap = cv2.VideoCapture(input_video_path)
    try:
        # Read the first frame to get the dimensions
        ret, frame = cap.read()
        if not ret:
            print("Failed to read video")
            return

        height, width, _ = frame.shape
        background = np.zeros((height, width, 3), dtype=np.uint8)  # Black background

        # Frame count reported by the container; used only for the progress bar
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

        frame_count = 0
        with tqdm(total=total_frames, desc="Processing frames") as progress:
            # The frame read above is processed as well, so the first frame of the
            # video is no longer dropped.
            while ret:
                # Convert frame to RGB for MediaPipe (OpenCV delivers BGR)
                rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

                # Process the frame with MediaPipe
                results = segmentation.process(rgb_frame)
                mask = results.segmentation_mask

                # Binary foreground mask (mask > 0.6), repeated to 3 channels
                mask = np.expand_dims(mask > 0.6, axis=-1)
                mask = np.repeat(mask, 3, axis=-1)

                # Create the output frame with an alpha channel
                output = np.zeros((height, width, 4), dtype=np.uint8)
                output[:, :, :3] = np.where(mask, frame, background)  # Apply mask to frame
                output[:, :, 3] = np.where(mask[:, :, 0], 255, 0)    # Alpha from the mask

                # The array is in OpenCV channel order, so Pillow sees red and blue swapped.
                # That is harmless here: the green test treats red and blue alike, and the
                # array goes back to cv2.imwrite in the same order.
                pil_image = Image.fromarray(output, 'RGBA')
                pil_image = remove_pixels_based_on_green(pil_image, threshold)
                output = np.array(pil_image)

                # Save each frame as a PNG file
                filename = os.path.join(output_dir, f"frame_{frame_count:04d}.png")
                cv2.imwrite(filename, output)

                frame_count += 1
                progress.update(1)
                ret, frame = cap.read()
    finally:
        # Release the capture on every exit path, including the early return above
        cap.release()
        cv2.destroyAllWindows()

def _blend_overlay_bottom_center(frame, overlay_img):
    """Alpha-blend a 4-channel overlay onto a copy of `frame`, bottom-aligned and centred.

    The overlay's channel 3 is used as the alpha mask (0-255). If the overlay
    is wider or taller than the frame, the part that falls outside the frame
    is cropped instead of producing a shape-mismatch error.
    """
    frame_h, frame_w = frame.shape[:2]
    over_h, over_w = overlay_img.shape[:2]

    left = (frame_w - over_w) // 2  # Center horizontally
    top = frame_h - over_h          # Align bottom

    # Keep only the part of the overlay that lands inside the frame.
    crop_left = max(0, -left)
    crop_top = max(0, -top)
    crop_right = min(over_w, frame_w - left)
    visible = overlay_img[crop_top:, crop_left:crop_right]

    dst_left = max(0, left)
    dst_top = max(0, top)

    blended = frame.copy()
    # `region` is a view into `blended`, so writing to it edits the copy in place.
    region = blended[dst_top:dst_top + visible.shape[0], dst_left:dst_left + visible.shape[1]]
    alpha = visible[:, :, 3:4] / 255.0  # shape (h, w, 1) broadcasts over the colour channels
    region[...] = alpha * visible[:, :, :3] + (1 - alpha) * region
    return blended


def overlay_images_on_video(video_path, frames_dir, output_video_path, audio_path, overlay_height):
    """Overlay a sequence of transparent PNG frames onto a video and write the result.

    Each frame of `video_path` gets one PNG from `frames_dir` (taken in sorted
    filename order, wrapping around when the video has more frames than there
    are PNGs) blended on top of it, bottom-aligned and horizontally centred.
    The audio track of `audio_path` is attached and the result is encoded with
    libx264 to `output_video_path`.

    Args:
        video_path: Background video, read with OpenCV.
        frames_dir: Directory containing the `.png` overlay frames. They are
            read with `cv2.IMREAD_UNCHANGED` and channel 3 is used as alpha.
        output_video_path: Destination file for the composited video.
        audio_path: Media file opened with `VideoFileClip`; its `.audio` is
            used as the soundtrack.
        overlay_height: Height in pixels the overlay is resized to (width
            follows from the overlay's own aspect ratio).

    Returns:
        None. On an unopenable video, an empty `frames_dir`, an unknown frame
        rate or an unreadable video, an error is printed and nothing is written.

    Raises:
        ValueError: If an overlay PNG listed in `frames_dir` cannot be decoded.
    """
    # Initialize video capture
    cap = cv2.VideoCapture(video_path)

    # Check if the video opened successfully
    if not cap.isOpened():
        print("Error: Could not open video.")
        return

    output_frames = []
    try:
        # Keep the frame rate as a float: truncating e.g. 29.97 to 29 makes the
        # rendered video longer than its audio track.
        fps = cap.get(cv2.CAP_PROP_FPS)
        if not fps or fps <= 0:
            print("Error: Could not determine video frame rate.")
            return

        # List all image files in the directory and sort them
        image_files = sorted(f for f in os.listdir(frames_dir) if f.endswith('.png'))
        num_images = len(image_files)
        if num_images == 0:
            print("Error: No images found in the directory.")
            return

        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        cap.set(cv2.CAP_PROP_POS_FRAMES, 0)  # Reset to the start

        for frame_index in tqdm(range(total_frames), desc="Processing Frames"):
            ret, frame = cap.read()
            if not ret:
                break

            # Determine the image to overlay (wraps around when PNGs run out)
            overlay_img_path = os.path.join(frames_dir, image_files[frame_index % num_images])
            overlay_img = cv2.imread(overlay_img_path, cv2.IMREAD_UNCHANGED)
            if overlay_img is None:
                raise ValueError(f"Could not read overlay image: {overlay_img_path}")

            # Resize the overlay image to the desired height while maintaining aspect ratio
            if overlay_img.shape[0] != overlay_height:
                aspect_ratio = overlay_img.shape[1] / overlay_img.shape[0]
                new_width = int(overlay_height * aspect_ratio)
                overlay_img = cv2.resize(overlay_img, (new_width, overlay_height))

            blended = _blend_overlay_bottom_center(frame, overlay_img)
            # Store RGB directly (MoviePy expects RGB) so only one copy of each
            # frame is kept in memory.
            output_frames.append(cv2.cvtColor(blended, cv2.COLOR_BGR2RGB))
    finally:
        # Release the capture on every exit path, including early returns.
        cap.release()

    if not output_frames:
        print("Error: No frames could be read from the video.")
        return

    # Create video from frames
    clip = ImageSequenceClip(output_frames, fps=fps)

    # Add audio to the video, closing the audio source once encoding is done
    audio_source = VideoFileClip(audio_path)
    try:
        clip = clip.set_audio(audio_source.audio)

        # Write the final video file
        clip.write_videofile(output_video_path, codec='libx264')
    finally:
        audio_source.close()



# This is the main loop that produces the individual overlaid videos

In [35]:
# Build one overlaid clip per avatar video: extract the avatar's foreground
# frames, then composite them onto the slide video of the same file name.
fin_videos = "fin_videos"
frames_dir = 'frames'  # presumably where save_frames writes its PNGs — TODO confirm against save_frames
overlay_height = 380

# Start from an empty output directory on every run.
if os.path.exists(fin_videos):
    shutil.rmtree(fin_videos)

os.mkdir(fin_videos)

# Sorted so the processing order does not depend on the filesystem's listing order.
for avt_file in sorted(os.listdir("avt_videos")):
    avtr_file = os.path.join("avt_videos", avt_file)
    save_frames(input_video_path=avtr_file, threshold=20)
    print("done with storing the frames: ", avtr_file)
    video_path = os.path.join("output_videos", avt_file)
    output_video_path = os.path.join(fin_videos, avt_file)
    # The avatar video also supplies the audio track for the final clip.
    overlay_images_on_video(video_path=video_path, frames_dir=frames_dir, output_video_path=output_video_path, audio_path=avtr_file, overlay_height=overlay_height)

Processing frames:  99%|█████████▉| 194/195 [01:48<00:00,  1.79it/s]


done with storing the frames:  avt_videos\slide001_sentence001.mp4


Processing Frames: 100%|██████████| 200/200 [00:07<00:00, 27.92it/s]


Moviepy - Building video fin_videos\slide001_sentence001.mp4.
MoviePy - Writing audio in slide001_sentence001TEMP_MPY_wvf_snd.mp3


                                                                    

MoviePy - Done.
Moviepy - Writing video fin_videos\slide001_sentence001.mp4



                                                               

Moviepy - Done !
Moviepy - video ready fin_videos\slide001_sentence001.mp4


Processing frames: 100%|█████████▉| 281/282 [02:35<00:00,  1.81it/s]


done with storing the frames:  avt_videos\slide001_sentence002.mp4


Processing Frames: 100%|██████████| 286/286 [00:10<00:00, 27.12it/s]


Moviepy - Building video fin_videos\slide001_sentence002.mp4.
MoviePy - Writing audio in slide001_sentence002TEMP_MPY_wvf_snd.mp3


                                                                   

MoviePy - Done.
Moviepy - Writing video fin_videos\slide001_sentence002.mp4



                                                               

Moviepy - Done !
Moviepy - video ready fin_videos\slide001_sentence002.mp4


Processing frames: 100%|█████████▉| 236/237 [02:08<00:00,  1.83it/s]


done with storing the frames:  avt_videos\slide001_sentence003.mp4


Processing Frames: 100%|██████████| 242/242 [00:09<00:00, 25.90it/s]


Moviepy - Building video fin_videos\slide001_sentence003.mp4.
MoviePy - Writing audio in slide001_sentence003TEMP_MPY_wvf_snd.mp3


                                                                    

MoviePy - Done.
Moviepy - Writing video fin_videos\slide001_sentence003.mp4



                                                               

Moviepy - Done !
Moviepy - video ready fin_videos\slide001_sentence003.mp4


Processing frames: 100%|█████████▉| 210/211 [01:55<00:00,  1.82it/s]


done with storing the frames:  avt_videos\slide002_sentence001.mp4


Processing Frames: 100%|██████████| 216/216 [00:07<00:00, 27.90it/s]


Moviepy - Building video fin_videos\slide002_sentence001.mp4.
MoviePy - Writing audio in slide002_sentence001TEMP_MPY_wvf_snd.mp3


                                                                    

MoviePy - Done.
Moviepy - Writing video fin_videos\slide002_sentence001.mp4



                                                               

Moviepy - Done !
Moviepy - video ready fin_videos\slide002_sentence001.mp4


Processing frames: 100%|█████████▉| 248/249 [02:13<00:00,  1.85it/s]


done with storing the frames:  avt_videos\slide002_sentence002.mp4


Processing Frames: 100%|██████████| 253/253 [00:09<00:00, 27.60it/s]


Moviepy - Building video fin_videos\slide002_sentence002.mp4.
MoviePy - Writing audio in slide002_sentence002TEMP_MPY_wvf_snd.mp3


                                                                    

MoviePy - Done.
Moviepy - Writing video fin_videos\slide002_sentence002.mp4



                                                               

Moviepy - Done !
Moviepy - video ready fin_videos\slide002_sentence002.mp4


Processing frames: 100%|█████████▉| 329/330 [02:59<00:00,  1.83it/s]


done with storing the frames:  avt_videos\slide002_sentence003.mp4


Processing Frames: 100%|██████████| 335/335 [00:13<00:00, 25.16it/s]


Moviepy - Building video fin_videos\slide002_sentence003.mp4.
MoviePy - Writing audio in slide002_sentence003TEMP_MPY_wvf_snd.mp3


                                                                  

MoviePy - Done.




Moviepy - Writing video fin_videos\slide002_sentence003.mp4



                                                               

Moviepy - Done !
Moviepy - video ready fin_videos\slide002_sentence003.mp4


Processing frames:  99%|█████████▉| 183/184 [01:39<00:00,  1.83it/s]


done with storing the frames:  avt_videos\slide003_sentence001.mp4


Processing Frames: 100%|██████████| 189/189 [00:07<00:00, 26.64it/s]


Moviepy - Building video fin_videos\slide003_sentence001.mp4.
MoviePy - Writing audio in slide003_sentence001TEMP_MPY_wvf_snd.mp3


                                                                  

MoviePy - Done.
Moviepy - Writing video fin_videos\slide003_sentence001.mp4



                                                               

Moviepy - Done !
Moviepy - video ready fin_videos\slide003_sentence001.mp4


Processing frames: 100%|█████████▉| 247/248 [02:17<00:00,  1.79it/s]


done with storing the frames:  avt_videos\slide003_sentence002.mp4


Processing Frames: 100%|██████████| 252/252 [00:10<00:00, 23.88it/s]


Moviepy - Building video fin_videos\slide003_sentence002.mp4.
MoviePy - Writing audio in slide003_sentence002TEMP_MPY_wvf_snd.mp3


                                                                  

MoviePy - Done.
Moviepy - Writing video fin_videos\slide003_sentence002.mp4



                                                               

Moviepy - Done !
Moviepy - video ready fin_videos\slide003_sentence002.mp4


Processing frames: 100%|█████████▉| 365/366 [03:21<00:00,  1.81it/s]


done with storing the frames:  avt_videos\slide003_sentence003.mp4


Processing Frames: 100%|██████████| 370/370 [00:15<00:00, 24.38it/s]


Moviepy - Building video fin_videos\slide003_sentence003.mp4.
MoviePy - Writing audio in slide003_sentence003TEMP_MPY_wvf_snd.mp3


                                                                   

MoviePy - Done.
Moviepy - Writing video fin_videos\slide003_sentence003.mp4



                                                               

Moviepy - Done !
Moviepy - video ready fin_videos\slide003_sentence003.mp4


Processing frames: 100%|█████████▉| 219/220 [02:02<00:00,  1.79it/s]


done with storing the frames:  avt_videos\slide003_sentence004.mp4


Processing Frames: 100%|██████████| 225/225 [00:07<00:00, 28.18it/s]


Moviepy - Building video fin_videos\slide003_sentence004.mp4.
MoviePy - Writing audio in slide003_sentence004TEMP_MPY_wvf_snd.mp3


                                                                   

MoviePy - Done.
Moviepy - Writing video fin_videos\slide003_sentence004.mp4



                                                               

Moviepy - Done !
Moviepy - video ready fin_videos\slide003_sentence004.mp4


In [19]:
# Print the frame count and frame rate OpenCV reports for "output_new.mp4".
cap = cv2.VideoCapture("output_new.mp4")
try:
    print(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    print(cap.get(cv2.CAP_PROP_FPS))
finally:
    # Release the capture so the file handle is not kept open by the kernel.
    cap.release()

4230.0
29.97


In [21]:
# Print the frame count and frame rate OpenCV reports for "main.mp4".
cap = cv2.VideoCapture("main.mp4")
try:
    print(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    print(cap.get(cv2.CAP_PROP_FPS))
finally:
    # Release the capture so the file handle is not kept open by the kernel.
    cap.release()

4234.0
29.97


In [36]:
import os
from moviepy.editor import VideoFileClip, concatenate_videoclips

def process_and_concat_videos(directory, output_file, seconds_to_remove=0):
    """Concatenate every `.mp4` in `directory` into a single video file.

    Files are joined in sorted filename order. When `seconds_to_remove` is
    greater than 0, that many seconds are trimmed from the end of each clip;
    clips that are not longer than `seconds_to_remove` are skipped with a
    printed message.

    Args:
        directory: Folder containing the `.mp4` files to join.
        output_file: Path of the concatenated video (encoded with libx264).
        seconds_to_remove: Seconds to cut from the end of every clip.

    Raises:
        ValueError: If no clip is left to concatenate (no `.mp4` files, or
            all of them were skipped as too short).
    """
    # List all video files in the directory
    files = sorted(f for f in os.listdir(directory) if f.endswith('.mp4'))

    opened_clips = []  # every source clip we opened, so all of them get closed
    final_clip = None
    try:
        clips = []
        for file in files:
            clip = VideoFileClip(os.path.join(directory, file))
            opened_clips.append(clip)

            # Remove last `seconds_to_remove` seconds if greater than 0
            if seconds_to_remove > 0:
                duration = clip.duration
                if duration > seconds_to_remove:
                    clip = clip.subclip(0, duration - seconds_to_remove)
                else:
                    print(f"Video '{file}' is shorter than the specified removal time. Skipping.")
                    continue

            clips.append(clip)

        # Fail early with a clear message instead of an obscure error from
        # inside concatenate_videoclips.
        if not clips:
            raise ValueError(f"No clips to concatenate in '{directory}'.")

        # Concatenate all video clips
        final_clip = concatenate_videoclips(clips, method='compose')

        # Write the output file
        final_clip.write_videofile(output_file, codec='libx264')
    finally:
        # Release the readers behind each clip, even when a step above failed.
        if final_clip is not None:
            final_clip.close()
        for opened in opened_clips:
            opened.close()

# Join every clip found in `fin_videos` into a single video, trimming the
# last second from each clip before concatenation.
seconds_to_remove = 1  # seconds cut from the end of every clip
output_file = 'final_video.mp4'  # path of the concatenated result
directory = 'fin_videos'  # folder that holds the clips to join

process_and_concat_videos(
    directory=directory,
    output_file=output_file,
    seconds_to_remove=seconds_to_remove,
)


Moviepy - Building video final_video.mp4.
MoviePy - Writing audio in final_videoTEMP_MPY_wvf_snd.mp3


                                                                      

MoviePy - Done.
Moviepy - Writing video final_video.mp4



                                                                

Moviepy - Done !
Moviepy - video ready final_video.mp4
