### Semantic Control in Diffusion Inpainting: Merging RePaint Sampling with Text-Driven Generation in GLIDE

In [1]:
from IPython.display import display
from PIL import Image
from typing import Tuple
import numpy as np
import torch as torch
import torch.nn.functional as F
import gradio as gr

from glide_text2im.download import load_checkpoint
from glide_text2im.model_creation import (
    create_model_and_diffusion,
    model_and_diffusion_defaults,
    model_and_diffusion_defaults_upsampler
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Print the installed Gradio version for the record.
print(gr.__version__)

5.20.1


In [3]:
# Print the installed pydantic version for the record.
import pydantic
print(pydantic.__version__)

2.10.6


In [4]:
# This notebook supports both CPU and GPU.
# On CPU, generating one sample may take on the order of 20 minutes.
# On a GPU it is considerably faster (the recorded runs in this notebook took
# roughly four minutes per image).

# Preferred GPU index; edit this to select a different card.
PREFERRED_CUDA_INDEX = 2

has_cuda = torch.cuda.is_available()
if has_cuda:
    # Fall back to GPU 0 when the preferred index does not exist on this machine,
    # rather than failing later when the first tensor is moved to the device.
    cuda_index = PREFERRED_CUDA_INDEX if PREFERRED_CUDA_INDEX < torch.cuda.device_count() else 0
    device = torch.device(f'cuda:{cuda_index}')
else:
    device = torch.device('cpu')
print("Using device:", device)

Using device: cuda:2


## Data

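When running on Google Colab, it is practical to run the notebook from the Google Drive folder where it is located. To do so, uncomment the cell below and set `path_to_notebook` to your own path. When running locally, this step can be skipped.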

In [5]:
# from google.colab import drive

# drive.mount('/content/drive')

# # Replace path_to_notebook with your actual path
# path_to_notebook = "drive/MyDrive/Colab Notebooks/M1 Data AI/Computer Vision/Project"

# %cd {path_to_notebook}

# !pwd
# !ls

### Sampling with the RePaint method

In [11]:
def process_input_with_mask(input_with_mask, prompt, guidance_scale):
    """
    Validate the image-editor payload and run text-guided inpainting on it.

    Args:
        input_with_mask: Mapping with the keys 'background', 'layers' and
            'composite' (the value delivered by the drawing component), or
            None when nothing has been provided.
        prompt: Text prompt for guiding the inpainting
        guidance_scale: Scale factor for classifier-free guidance

    Returns:
        The value returned by inpaint_image on success, otherwise a string
        telling the user what to fix.
    """
    # NOTE(review): the click handler in create_gradio_interface routes this
    # return value to a gr.Image output; confirm that returning a message string
    # is displayed as intended (raising gr.Error may be more appropriate).

    # Validate before touching the payload: the debug messages below index into
    # it and would otherwise raise on a missing upload.
    if input_with_mask is None:
        return "Please upload an image first."

    background = input_with_mask.get('background')
    layers = input_with_mask.get('layers') or []
    composite = input_with_mask.get('composite')

    if not isinstance(background, np.ndarray) or background.ndim != 3:
        return "Please upload a valid image and draw on it to create a mask."

    def _shape_or_invalid(image):
        # Shape for arrays, a placeholder for anything else (e.g. None).
        return image.shape if isinstance(image, np.ndarray) else 'Invalid Image'

    # Debug toasts describing what the component delivered.
    gr.Info(f"Input Image Dict: {input_with_mask.keys()}")
    gr.Info(f"Background: {_shape_or_invalid(background)}")
    gr.Info(f"Layers: {len(layers)}")
    gr.Info(f"Composite: {_shape_or_invalid(composite)}")
    gr.Info(f"Layer 1: {_shape_or_invalid(layers[0]) if layers else 'Invalid Image'}")

    # Exactly one drawn layer is expected; it carries the mask.
    if len(layers) != 1:
        return "The image doesn't have an alpha channel for masking. Please draw on the image."

    # Drop any alpha channel from the uploaded image.
    original_image = background[:, :, :3]
    mask = layers[0]

    # An all-zero (or missing) layer means the user has not drawn anything.
    if not isinstance(mask, np.ndarray) or mask.size == 0 or np.max(mask) == 0:
        return "Please draw on the image to create a mask for inpainting."

    # Process the image and mask for inpainting
    gr.Info(f"Original Image: {original_image.shape}")
    gr.Info(f"Mask: {mask.shape}")
    return inpaint_image(original_image, mask, prompt, guidance_scale, size=64)



In [12]:
import repaint_sampling as RS
import repaint_patcher as RP
import prepare_glide_inpaint as PGI
from image_util import *

In [13]:
def inpaint_image(input_image, mask_image, prompt, guidance_scale=7.0, up_sample_model=False, size=256, batch_size=1):
    """Inpaint the masked region of the image based on the text prompt.

    Args:
        input_image: Image to inpaint; passed to process_image together with `size`.
        mask_image: Array marking the region to inpaint (non-zero = inpaint).
            A 4-channel array contributes its last channel, a 3-channel array
            is averaged over channels, a 2-D array is used as is.
        prompt: Text prompt handed to the sampler.
        guidance_scale: Classifier-free guidance scale handed to the sampler.
        up_sample_model: Must be False; inpainting with the upsampling model
            is not implemented yet.
        size: Target size forwarded to process_image / process_mask.
        batch_size: Number of samples requested from the sampler; only the
            first sample is returned.

    Returns:
        The first generated sample as a uint8 numpy array in channel-last layout.

    Raises:
        NotImplementedError: If up_sample_model is True.
    """
    # Fail fast: previously this path loaded the model, did all preprocessing
    # and then silently returned None.
    if up_sample_model:
        # TODO: Implement inpainting directly from the 256x256 images
        raise NotImplementedError("Inpainting with the upsampling model is not implemented yet.")

    model, diffusion, options = PGI.create_glide_generative(device=device, cuda=has_cuda)
    RP.patch_model_for_repaint(diffusion)
    sampler = RS.CFGSamplerRepaint(model, diffusion, options, guidance_scale, device)

    # Process input image
    image_tensor = process_image(input_image, size=size)

    # Reduce the mask coming from the drawing canvas to a single channel.
    n_dims = len(mask_image.shape)
    if n_dims == 2:
        mask_array = mask_image
    elif n_dims == 3 and mask_image.shape[2] == 4:
        # RGBA layer: the alpha channel records where the user drew.
        mask_array = mask_image[:, :, 3]
    elif n_dims >= 3 and mask_image.shape[2] == 3:
        mask_array = np.mean(mask_image, axis=2)
    else:
        mask_array = mask_image

    mask_tensor = process_mask(mask_array, size=size)

    # In GLIDE's inpainting model, 0 means "inpaint this", 1 means "keep this"
    # But in our UI, white (255) means "inpaint this", so we invert the mask
    # NOTE(review): assumes process_mask scales the mask to [0, 1] - confirm.
    inpaint_mask = 1.0 - mask_tensor

    # For RePaint, we need the ground truth image and keep mask
    gt = image_tensor
    gt_keep_mask = inpaint_mask  # 1 for areas to keep (not inpaint)

    # Sampling from the model
    print("Generating inpainted image...")

    # Schedule parameters forwarded to the RePaint sampler.
    jump_params = {
        "t_T": 250,
        "n_sample": 1,
        "jump_length": 10,
        "jump_n_sample": 10
    }
    samples = sampler.sample(gt, gt_keep_mask, prompt, batch_size, jump_params=jump_params)

    # Map from [-1, 1] to [0, 255] and move channels last before leaving torch.
    samples = ((samples + 1) * 127.5).clamp(0, 255).to(torch.uint8)
    samples = samples.permute(0, 2, 3, 1).contiguous()
    sample_image = samples[0].cpu().numpy()

    return sample_image

### Gradio Interface

In [14]:
def create_gradio_interface():
    """Create the Gradio interface for the inpainting application.

    The layout has an input column (mask-drawing image component, text prompt,
    guidance-scale slider, submit button) and an output column showing the
    result. Clicking the button calls process_input_with_mask.

    Returns:
        The assembled gr.Blocks application (not yet launched).
    """
    with gr.Blocks(title="RePaint with GLIDE Text-Driven Inpainting") as app:
        gr.Markdown("# Text-Driven Image Inpainting with RePaint and GLIDE")
        gr.Markdown(
            "Upload an image, draw directly on it to create a mask (white areas will be inpainted), and enter a text prompt.")

        with gr.Row():
            with gr.Column():
                input_image = gr.ImageMask(label="Upload & Draw Mask (white areas will be inpainted)",
                                       image_mode="RGBA",
                                       type='numpy',
                                       # A real one-element tuple: ('upload') is just the string 'upload'.
                                       sources=("upload",),
                                       interactive=True)

                prompt = gr.Textbox(label="Text Prompt", placeholder="Describe what should be in the masked area...")
                guidance_scale = gr.Slider(minimum=1.0, maximum=15.0, value=7.0, step=0.5,
                                           label="Guidance Scale (higher = more text influence)")
                submit_btn = gr.Button("Generate Inpainting")

            with gr.Column():
                output_image = gr.Image(label="Inpainted Result")

        # Wire the button to the processing function.
        submit_btn.click(
            fn=process_input_with_mask,
            inputs=[input_image, prompt, guidance_scale],
            outputs=output_image
        )

        gr.Markdown("## How to use")
        gr.Markdown("""
        1. Upload an image
        2. Draw directly on the image with white brush to create a mask (white areas will be inpainted)
        3. Enter a text prompt describing what you want in the masked area
        4. Click 'Generate Inpainting' and wait for the result
        5. Adjust the guidance scale if needed (higher values follow the text more closely)

        Tips:
        - Use a larger brush radius for covering larger areas
        - You can clear your drawing and start over using the clear button
        - Be specific in your text prompt for best results
        """)

    return app

In [None]:
import os

# Make sure the local folder for sample images exists before the app starts.
os.makedirs("sample_images", exist_ok=True)

# share=True publishes a temporary public URL for the app (see the recorded
# output of this cell); debug=True keeps the cell running and streams logs here.
launch_options = dict(share=True, debug=True, inline=True, show_error=True)

demo = create_gradio_interface()
demo.launch(**launch_options)

* Running on local URL:  http://127.0.0.1:7860
* Running on public URL: https://14ff3813002eca503b.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


  return th.load(path, map_location=device)


Generating inpainted image...


100%|██████████| 4570/4570 [03:57<00:00, 19.24it/s]
  return th.load(path, map_location=device)


Generating inpainted image...


100%|██████████| 4570/4570 [03:44<00:00, 20.35it/s]
  return th.load(path, map_location=device)


Generating inpainted image...


 52%|█████▏    | 2397/4570 [00:58<00:55, 38.90it/s]