<a href="https://colab.research.google.com/github/NagypalMarton/Burger-Industry/blob/main/UX_laboratory_4_FINAL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Install the required packages:
* [gradio](https://www.gradio.app/docs): UI library with Python interface
* [accelerate](https://huggingface.co/docs/accelerate/index): Huggingface library for distributed training and inference
* [diffusers](https://huggingface.co/docs/diffusers/index): Diffusion model library from Huggingface supporting a range of generative diffusion models, pipelines and schedulers
* [controlnet_aux](https://github.com/huggingface/controlnet_aux): ControlNet-related utilities and auxiliary models

In [1]:
!pip install "gradio<4.0" accelerate diffusers controlnet_aux "numpy<2.0" mediapipe

Collecting gradio<4.0
  Downloading gradio-3.50.2-py3-none-any.whl.metadata (17 kB)
Collecting controlnet_aux
  Downloading controlnet_aux-0.0.9-py3-none-any.whl.metadata (6.5 kB)
Collecting numpy<2.0
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting mediapipe
  Downloading mediapipe-0.10.21-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (9.7 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio<4.0)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi (from gradio<4.0)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio<4.0)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==0.6.1 (from gradio<4.0)
  Downloading gradio_client-0.6.1-py3-none-any.whl.metadata (7.1 kB)
Collecting markupsafe~=2.0 (f

In [1]:
import random
import numpy as np
import cv2
import torch
import gradio as gr
import controlnet_aux
import PIL.Image
from diffusers import (
    ControlNetModel,
    DiffusionPipeline,
    StableDiffusionControlNetPipeline,
    UniPCMultistepScheduler,
)
import mediapipe as mp

In [3]:
MAX_NUM_IMAGES = 5
DEFAULT_NUM_IMAGES = 2
MAX_IMAGE_RESOLUTION = 768
DEFAULT_IMAGE_RESOLUTION = 768

MAX_SEED = np.iinfo(np.int32).max

In [4]:
# Hand Gesture Recognition and Scribble Drawing
mp_drawing = mp.solutions.drawing_utils
mp_hands = mp.solutions.hands

canvas = np.full((768, 768, 3), 255, dtype=np.uint8)  # Initialize canvas
canvas_resolution = (768, 768)

In [5]:
def map_landmark_to_canvas(landmark_x, landmark_y, canvas_resolution):
    canvas_width, canvas_height = canvas_resolution
    x = int(landmark_x * canvas_width)
    y = int(landmark_y * canvas_height)
    return x, y

def draw_scribble_on_canvas(x, y):
    global canvas
    cv2.circle(canvas, (x, y), radius=5, color=(0, 0, 0), thickness=-1)

In [6]:
device = "cuda" if torch.cuda.is_available() else "cpu"
task_name = "scribble"

# using srcibble-based ControlNet with Stable Diffusion 1.5
base_model_id = "runwayml/stable-diffusion-v1-5"
model_id = "lllyasviel/control_v11p_sd15_scribble"

# Instantiate the model and pipeline without specifying torch_dtype
if device == "cuda":
    controlnet = ControlNetModel.from_pretrained(model_id, torch_dtype=torch.float16).to(device)
    pipe = StableDiffusionControlNetPipeline.from_pretrained(
        base_model_id, controlnet=controlnet, torch_dtype=torch.float16
    ).to(device)
else:
    controlnet = ControlNetModel.from_pretrained(model_id).to(device)  # No torch_dtype for CPU
    pipe = StableDiffusionControlNetPipeline.from_pretrained(
        base_model_id, controlnet=controlnet
    ).to(device)  # No torch_dtype for CPU
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
pipe.to(device)

# image generation from scribble input
# using torch.inference_mode to disable gradient tracking
@torch.inference_mode()
def process_scribble_interactive(video_frame, prompt, additional_prompt, negative_prompt, num_images, image_resolution, num_steps, guidance_scale, seed):
    global canvas

    # 1. Hand Gesture Recognition (Implementation using MediaPipe)
    if video_frame is not None:  # Check if video frame is available
        video_frame = np.array(video_frame)  # Convert to NumPy array

        with mp_hands.Hands(min_detection_confidence=0.5, min_tracking_confidence=0.5) as hands:
            image = cv2.cvtColor(video_frame, cv2.COLOR_BGR2RGB)  # Now using NumPy array
            results = hands.process(image)

            if results.multi_hand_landmarks:
              hand_landmarks = results.multi_hand_landmarks[0]
              index_finger_tip = hand_landmarks.landmark[mp_hands.HandLandmark.INDEX_FINGER_TIP]
              x, y = map_landmark_to_canvas(index_finger_tip.x, index_finger_tip.y, canvas_resolution)
              draw_scribble_on_canvas(x, y)

    # 2. Preprocess Scribble (Resize, format, etc.)
    control_image = PIL.Image.fromarray(canvas)
    control_image = controlnet_aux.util.HWC3(control_image)
    control_image = resize_image(control_image, resolution=image_resolution)


    if not prompt:
        prompt = additional_prompt
    else:
        prompt = f"{prompt}, {additional_prompt}"

    generator = torch.Generator().manual_seed(seed)
    results = pipe(
        prompt=prompt,
        negative_prompt=negative_prompt,
        guidance_scale=guidance_scale,
        num_images_per_prompt=num_images,
        num_inference_steps=num_steps,
        generator=generator,
        image=control_image,
    ).images
    return [control_image] + results

config.json:   0%|          | 0.00/999 [00:00<?, ?B/s]

diffusion_pytorch_model.safetensors:   0%|          | 0.00/1.45G [00:00<?, ?B/s]

model_index.json:   0%|          | 0.00/541 [00:00<?, ?B/s]

Fetching 15 files:   0%|          | 0/15 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/492M [00:00<?, ?B/s]

scheduler_config.json:   0%|          | 0.00/308 [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/617 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/472 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.72k [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/342 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/806 [00:00<?, ?B/s]

diffusion_pytorch_model.safetensors:   0%|          | 0.00/3.44G [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.06M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/547 [00:00<?, ?B/s]

diffusion_pytorch_model.safetensors:   0%|          | 0.00/335M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

In [13]:
def process_and_draw(video_frame):
    global canvas
    # Ensure video_frame is a NumPy array
    if video_frame is not None and isinstance(video_frame, np.ndarray):
        with mp_hands.Hands(
            min_detection_confidence=0.5, min_tracking_confidence=0.5
        ) as hands:
            image = cv2.cvtColor(video_frame, cv2.COLOR_BGR2RGB)  # Now using NumPy array
            results = hands.process(image)
            if results.multi_hand_landmarks:
                hand_landmarks = results.multi_hand_landmarks[0]
                index_finger_tip = hand_landmarks.landmark[mp_hands.HandLandmark.INDEX_FINGER_TIP]
                x, y = map_landmark_to_canvas(index_finger_tip.x, index_finger_tip.y, canvas_resolution)
                draw_scribble_on_canvas(x, y)
    return video_frame

In [14]:
# random seed utility
def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
    if randomize_seed:
        seed = random.randint(0, MAX_SEED)
    return seed

def create_demo(process):
    with gr.Blocks() as demo:
        gr.Markdown("## Scribble-Based Image Generation")
        gr.Markdown("This interface allows you to generate images from your scribbles using a diffusion model.")

        with gr.Row():
            # Left column for inputs
            with gr.Column():
               # white_image = np.full((768, 768, 3), 255, dtype=np.uint8)
                #image = gr.Image(tool="sketch", brush_radius=10, value=white_image, label="Scribble Here")
                video = gr.Video(label="Hand Gestures", source="webcam")#Mediapipe
                prompt = gr.Textbox(label="Prompt", placeholder="Enter your main prompt here")
                a_prompt = gr.Textbox(label="Additional Prompt", placeholder="Optional extra details")
                n_prompt = gr.Textbox(label="Negative Prompt", placeholder="Enter negative prompt if any")

                with gr.Accordion("Advanced Settings", open=False):
                    num_samples = gr.Slider(label="Number of images", minimum=1, maximum=5, value=3, step=1)
                    image_resolution = gr.Slider(label="Image resolution", minimum=256, maximum=768, value=768, step=256)
                    num_steps = gr.Slider(label="Number of steps", minimum=1, maximum=100, value=1, step=1)
                    guidance_scale = gr.Slider(label="Guidance scale", minimum=0.1, maximum=30.0, value=1.0, step=0.1)
                    seed = gr.Slider(label="Seed", minimum=0, maximum=2**31-1, step=1, value=0)
                    randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
                run_button = gr.Button("Generate Images")

            with gr.Column():
                scribble_output = gr.Image(label="Scribble Canvas", shape=(768, 768))  # Add scribble output
                result = gr.Gallery(label="Generated Output", show_label=False, columns=2, object_fit="scale-down")

        # Collect inputs in order
        inputs = [
            video,
            prompt,
            a_prompt,
            n_prompt,
            num_samples,
            image_resolution,
            num_steps,
            guidance_scale,
            seed,
        ]
        # Ensure video_frame is a NumPy array and handle the conversion properly
        video.change(
            fn=lambda x: np.array(x) if isinstance(x, np.ndarray) else x,  # Convert to NumPy array if not already
            inputs=video,
            outputs=video,
        ).then(
            fn=lambda video_frame_np: (
                process_and_draw(video_frame_np),
                cv2.cvtColor(canvas, cv2.COLOR_BGR2RGB)
            ),
            inputs=video,
            outputs=[video, scribble_output]
        )

        # Setup event triggers
        prompt.submit(fn=randomize_seed_fn, inputs=[seed, randomize_seed], outputs=seed, queue=False).then(
            fn=process, inputs=inputs, outputs=result
        )
        run_button.click(fn=randomize_seed_fn, inputs=[seed, randomize_seed], outputs=seed, queue=False).then(
            fn=process, inputs=inputs, outputs=result
        )

    return demo

# Create and launch the demo
demo = create_demo(process_scribble_interactive)
demo.queue().launch(debug=True, share=True)

IMPORTANT: You are using gradio version 3.50.2, however version 4.44.1 is available, please upgrade.
--------
Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Running on public URL: https://9836552b819155aa7c.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.


Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/gradio/queueing.py", line 407, in call_prediction
    output = await route_utils.call_process_api(
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/gradio/route_utils.py", line 226, in call_process_api
    output = await app.get_blocks().process_api(
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/gradio/blocks.py", line 1548, in process_api
    inputs = self.preprocess_data(fn_index, inputs, state)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/gradio/blocks.py", line 1329, in preprocess_data
    processed_input.append(block.preprocess(inputs[i]))
                           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/gradio/components/video.py", line 230, in preprocess
    ff.run()
  File "/usr/local/lib/python3.11/di

Killing tunnel 127.0.0.1:7860 <> https://9836552b819155aa7c.gradio.live


