<a href="https://colab.research.google.com/github/Mediaeater/52-technologies-in-2016/blob/master/Zeroscope_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Notebook setup cell: install the runtime dependencies with pip (IPython "!" shell magics).
# transformers / accelerate / gradio: model loading helpers and the web UI used below.
!pip install transformers accelerate gradio
# NOTE(review): presumably the video-writing backend behind diffusers' export_to_video — confirm.
!pip install opencv-python
!pip install diffusers
# Re-runs the diffusers install with --upgrade and the "torch" extra.
!pip install --upgrade diffusers[torch]

In [None]:
import os
import torch
from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler
from diffusers.utils import export_to_video
from PIL import Image
import gradio as gr
from gradio.components import Textbox, Number, Checkbox, Slider, File

# Build a ready-to-use DiffusionPipeline for a given model
def initialize_diffusion_pipeline(model_name, dtype=torch.float16, chunk_size=1, dim=1):
    """Load ``model_name`` and apply the scheduler and memory settings used by this notebook.

    Args:
        model_name: Identifier handed to ``DiffusionPipeline.from_pretrained``.
        dtype: Torch dtype the weights are loaded with.
        chunk_size: Forwarded to ``unet.enable_forward_chunking``.
        dim: Forwarded to ``unet.enable_forward_chunking``.

    Returns:
        The configured pipeline object.
    """
    print(f"Initializing the pipeline with model: {model_name}")
    pipeline = DiffusionPipeline.from_pretrained(model_name, torch_dtype=dtype)

    # Replace the stock scheduler with a DPM-Solver multistep one built from
    # the stock scheduler's own configuration.
    scheduler_config = pipeline.scheduler.config
    pipeline.scheduler = DPMSolverMultistepScheduler.from_config(scheduler_config)

    # Memory-related switches: model CPU offload, VAE slicing, UNet forward chunking.
    pipeline.enable_model_cpu_offload()
    pipeline.enable_vae_slicing()
    pipeline.unet.enable_forward_chunking(chunk_size=chunk_size, dim=dim)

    return pipeline

# Function to export frames to a video
def export_frames_to_video(frames, output_path):
    """Save ``frames`` as the next unused ``videoN.mp4`` inside ``output_path``.

    Args:
        frames: Video frames, passed unchanged to ``export_to_video``.
        output_path: Destination folder. A trailing separator is optional;
            an empty string means the current working directory.

    Returns:
        The path returned by ``export_to_video`` for the written file.
    """
    # Make sure the destination folder exists before anything is written to it.
    # An empty path means "current directory", which needs no creation.
    if output_path:
        os.makedirs(output_path, exist_ok=True)

    # Pick the first videoN.mp4 that is not taken yet so earlier results are
    # never overwritten. os.path.join supplies the separator when the caller
    # omitted it ("out" and "out/" both resolve to out/videoN.mp4).
    video_number = 1
    target = os.path.join(output_path, f"video{video_number}.mp4")
    while os.path.exists(target):
        video_number += 1
        target = os.path.join(output_path, f"video{video_number}.mp4")

    video_path = export_to_video(frames, output_video_path=target)
    print(f"Video generated: {video_path}")
    return video_path

# Main function to generate videos
def generate_video(prompts=None, num_inference_steps=None, num_upscale_steps=None, height=None, width=None, upscale=None, upscaled_height=None, upscaled_width=None, num_frames=None, strength=None, output_path=None, negative_prompt=None, guidance_scale=None):
    """Generate one video per prompt line and return the saved file paths.

    Every argument may be ``None``; the fallback listed below is used then.

    Args:
        prompts: Newline-separated prompts. Blank lines are ignored; if no
            prompt remains, "Space scenery" is used.
        num_inference_steps: Steps for the base generation pass (default 30).
        num_upscale_steps: Steps for the upscale pass (default 30).
        height: Height requested from the base model (default 576).
        width: Width requested from the base model (default 1024).
        upscale: When truthy, each clip is resized and run through the
            "cerspense/zeroscope_v2_XL" model before it is saved.
        upscaled_height: Height frames are resized to before the upscale
            pass (default 576).
        upscaled_width: Width frames are resized to before the upscale
            pass (default 1024).
        num_frames: Frames per clip (default 30).
        strength: ``strength`` argument of the upscale pass (default 0.6).
        output_path: Folder the videos are written to (default "./output/").
            It is created if it does not exist.
        negative_prompt: Negative prompt shared by all prompts (default "").
        guidance_scale: Guidance scale for both passes (default 1.0).

    Returns:
        The saved video paths joined with ", ".
    """
    base_model = "cerspense/zeroscope_v2_576w"
    upscale_model = "cerspense/zeroscope_v2_XL"

    # An empty Gradio Textbox arrives as "" rather than None, so treat blank
    # input like missing input and drop blank lines instead of sending empty
    # prompts to the model.
    prompt_list = [line.strip() for line in (prompts or "").split("\n") if line.strip()]
    if not prompt_list:
        prompt_list = ["Space scenery"]

    num_inference_steps = int(num_inference_steps) if num_inference_steps is not None else 30
    num_upscale_steps = int(num_upscale_steps) if num_upscale_steps is not None else 30
    height = int(height) if height is not None else 576
    width = int(width) if width is not None else 1024
    upscale = upscale if upscale is not None else False
    upscaled_height = int(upscaled_height) if upscaled_height is not None else 576
    upscaled_width = int(upscaled_width) if upscaled_width is not None else 1024
    num_frames = int(num_frames) if num_frames is not None else 30
    strength = float(strength) if strength is not None else 0.6
    negative_prompt = negative_prompt.strip() if negative_prompt is not None else ""  # Convert to a single string
    guidance_scale = float(guidance_scale) if guidance_scale is not None else 1.0

    # Whitespace-only input counts as "not given". Joining with "" guarantees
    # a trailing separator, so the folder name and the file name can never be
    # glued together, and the folder is created before the first export.
    output_path = (output_path or "").strip() or "./output/"
    output_path = os.path.join(output_path, "")
    os.makedirs(output_path, exist_ok=True)

    video_paths = []

    # Load the base model once up front; it is only swapped out when upscaling.
    pipe = initialize_diffusion_pipeline(base_model)

    last_index = len(prompt_list) - 1
    for i, prompt in enumerate(prompt_list):
        # Generate video frames
        video_frames = pipe(prompt, num_inference_steps=num_inference_steps, height=height, width=width, num_frames=num_frames, negative_prompt=negative_prompt, guidance_scale=guidance_scale).frames

        if upscale:
            # Clear memory before using the pipeline with larger model
            del pipe
            torch.cuda.empty_cache()

            pipe = initialize_diffusion_pipeline(upscale_model)

            upscaled_size = (upscaled_width, upscaled_height)
            video = [Image.fromarray(frame).resize(upscaled_size) for frame in video_frames]

            video_frames = pipe(prompt, num_inference_steps=num_upscale_steps, video=video, strength=strength, negative_prompt=negative_prompt, guidance_scale=guidance_scale).frames

            video_path = export_frames_to_video(video_frames, output_path)
            video_paths.append(video_path)

            # Clear memory after using the pipeline with larger model
            del pipe
            torch.cuda.empty_cache()

            # The base model is only needed again if another prompt follows;
            # reloading it after the final prompt would be wasted work.
            if i < last_index:
                pipe = initialize_diffusion_pipeline(base_model)
        else:
            video_path = export_frames_to_video(video_frames, output_path)
            video_paths.append(video_path)

    return ", ".join(video_paths)

# UI controls, listed in the same order as generate_video's parameters.
# The initial values mirror generate_video's own fallbacks so the form shows
# what will actually be used. This matters most for the Slider: without an
# explicit value it starts at its minimum (0), which is passed through as a
# real strength and bypasses the 0.6 fallback.
inputs = [
    Textbox(lines=5, label="Prompts (one per line)", placeholder="Enter prompts here, one per line. Use Enter to create a new line without submitting. Press Shift+Enter to submit"),
    Number(value=30, label="Number of Inference Steps (minimum recommended is 30)"),
    Number(value=30, label="Number of Upscale Steps (minimum recommended is 2)"),
    Number(value=576, label="Height"),
    Number(value=1024, label="Width"),
    Checkbox(value=False, label="Upscale"),
    Number(value=576, label="Upscaled Height"),
    Number(value=1024, label="Upscaled Width"),
    Number(value=30, label="Number of Frames (minimum recommended: 24, fps is 8)"),
    Slider(minimum=0, maximum=1, step=0.1, value=0.6, label="Strength (recommended: 1)"),
    Textbox(label="Output Folder Path (if empty files are saved into the output folder)"),
    Textbox(label="Negative Prompt"),  # new input for negative_prompt
    Number(value=1.0, label="Guidance Scale (recommended above 10)"),  # new input for guidance_scale
]

# Wire generate_video into a Gradio form: the controls in `inputs` feed the
# function's parameters and its returned string is displayed as plain text.
iface = gr.Interface(
    title="Video Generation",
    description="Generate videos using zeroscope models. All credit goes to Zeroscope team: https://huggingface.co/cerspense",
    fn=generate_video,
    inputs=inputs,
    outputs="text",
)

# Start the app with a public share link and debug mode turned on.
iface.launch(share=True, debug=True)

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Running on public URL: https://c9b543c414f1fedfa0.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


Initializing the pipeline with model: cerspense/zeroscope_v2_576w


Downloading (…)ain/model_index.json:   0%|          | 0.00/384 [00:00<?, ?B/s]

unet/diffusion_pytorch_model.safetensors not found


Fetching 12 files:   0%|          | 0/12 [00:00<?, ?it/s]

Downloading (…)_encoder/config.json:   0%|          | 0.00/609 [00:00<?, ?B/s]

Downloading (…)b28/unet/config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

Downloading (…)cheduler_config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/737 [00:00<?, ?B/s]

Downloading (…)tokenizer/vocab.json:   0%|          | 0.00/1.06M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/460 [00:00<?, ?B/s]

Downloading (…)tokenizer/merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

Downloading (…)cb28/vae/config.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/681M [00:00<?, ?B/s]

Downloading (…)on_pytorch_model.bin:   0%|          | 0.00/2.82G [00:00<?, ?B/s]

Downloading (…)on_pytorch_model.bin:   0%|          | 0.00/167M [00:00<?, ?B/s]

Loading pipeline components...:   0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]