# Semantic Control in Diffusion Inpainting: Merging RePaint Sampling with Text-Driven Generation in GLIDE

This notebook demonstrates the use of the RePaint sampling method applied to OpenAI's GLIDE text-guided image diffusion model. It requires downloading the publicly available GLIDE models (done in the code), and creates a Gradio user interface to interact with variants of our method. A torch compilation step is done to speed up image generation in the UI, but it takes around 10 minutes per model to compile.

In [1]:
# Download the required packages. If the wrong versions of pydantic and gradio
# are used, the UI will fail silently.
%pip install git+https://github.com/openai/glide-text2im
%pip install pydantic==2.10.6
%pip install gradio==5.20.1

Collecting git+https://github.com/openai/glide-text2im
  Cloning https://github.com/openai/glide-text2im to /tmp/pip-req-build-fh5y30gq
  Running command git clone --filter=blob:none --quiet https://github.com/openai/glide-text2im /tmp/pip-req-build-fh5y30gq
  Resolved https://github.com/openai/glide-text2im to commit 69b530740eb6cef69442d6180579ef5ba9ef063e
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting ftfy (from glide-text2im==0.0.0)
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->glide-text2im==0.0.0)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->glide-text2im==0.0.0)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch->glide-text2im==0.0.0)
  Downloading nvidia_cuda_cupti_cu12-12.4.127

In [3]:
# Clone the repository that contains this notebook (and the `glide_patching`
# package plus the `data/` files used by later cells), then make it the working
# directory so that the imports and relative paths below resolve.
# TODO: switch to the default branch once the `refactor` branch has been merged.
!git clone -b refactor https://github.com/pandanautinspace/CSC_52002_EP_Generative_AI_Project
%cd CSC_52002_EP_Generative_AI_Project/

Cloning into 'CSC_52002_EP_Generative_AI_Project'...
remote: Enumerating objects: 438, done.[K
remote: Total 438 (delta 0), reused 0 (delta 0), pack-reused 438 (from 1)[K
Receiving objects: 100% (438/438), 121.63 MiB | 48.16 MiB/s, done.
Resolving deltas: 100% (71/71), done.


In [10]:
import numpy as np
import torch as torch
import gradio as gr

# Both CPU and GPU execution are supported:
#   * GPU: one sample should take under a minute.
#   * CPU: one sample may take on the order of 20 minutes, and skipping the
#     torch compilation step is recommended.

has_cuda = torch.cuda.is_available()
if has_cuda:
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print("Using device:", device)

In [11]:
# Enable torch compilation only when a CUDA device was selected.
# NOTE: this name shadows the Python builtin `compile`; it is kept because the
# following cells read it.
compile = device.type == "cuda"
# uncomment the following line to disable torch compilation
# compile = False
print("Compiling:", compile)

Using device: cuda


### Sampling with the RePaint sampling method

In [17]:
# import our contributed code and PyTorch
import glide_patching.repaint_sampling as RS
import glide_patching.repaint_patcher as RP
import glide_patching.prepare_glide_inpaint as PGI
from glide_patching.image_util import *
import torch

In [18]:
# Build one model/diffusion/options triple per variant so they can be compared
# side by side: "base", "rp" (created with use_inpaint=False) and "rpip".

# --- base-resolution generative models ---
model_base, diffusion_base, options_base = PGI.create_glide_generative(device=device, cuda=has_cuda)
model_rp, diffusion_rp, options_rp = PGI.create_glide_generative(device=device, cuda=has_cuda, use_inpaint=False)
model_rpip, diffusion_rpip, options_rpip = PGI.create_glide_generative(device=device, cuda=has_cuda)

# Only the two RePaint variants get their diffusion object patched.
for _diffusion in (diffusion_rp, diffusion_rpip):
    RP.patch_model_for_repaint(_diffusion)

if compile:
    # Wrap each sampling loop with torch.compile (optional speed-up).
    for _diffusion in (diffusion_base, diffusion_rp, diffusion_rpip):
        _diffusion.p_sample_loop = torch.compile(_diffusion.p_sample_loop, mode="max-autotune")


# --- upsampler models ---
model_up_base, diffusion_up_base, options_up_base = PGI.create_glide_upsampler(device=device, cuda=has_cuda)
model_up_rp, diffusion_up_rp, options_up_rp = PGI.create_glide_upsampler(device=device, cuda=has_cuda, use_inpaint=False)
model_up_rpip, diffusion_up_rpip, options_up_rpip = PGI.create_glide_upsampler(device=device, cuda=has_cuda)

for _diffusion in (diffusion_up_rp, diffusion_up_rpip):
    RP.patch_model_for_repaint(_diffusion)
del _diffusion  # loop helper only; keep the notebook namespace clean

# One reusable upsampling sampler per variant.
sampler_up_base = RS.UpscaleSamplerInpaint(model_up_base, diffusion_up_base, options_up_base, model_fn=None, device=device)
sampler_up_rp = RS.UpscaleSamplerRepaint(model_up_rp, diffusion_up_rp, options_up_rp, model_fn=None, device=device)
sampler_up_rpip = RS.UpscaleSamplerRepaint(model_up_rpip, diffusion_up_rpip, options_up_rpip, model_fn=None, device=device)

if compile:
    for _sampler in (sampler_up_base, sampler_up_rp, sampler_up_rpip):
        _sampler.sample = torch.compile(_sampler.sample, mode="max-autotune")
    del _sampler  # loop helper only

# Temperature value handed to the upsampling samplers.
upsample_temp = 0.997

  0%|          | 0.00/1.54G [00:00<?, ?iB/s]

  0%|          | 0.00/1.54G [00:00<?, ?iB/s]

  0%|          | 0.00/1.59G [00:00<?, ?iB/s]

  0%|          | 0.00/1.59G [00:00<?, ?iB/s]

In [19]:
def inpaint_image(input_image, mask_image, prompt, guidance_scale=7.0, size=256, batch_size=1, variant="base"):
    """Inpaint the masked region of the image based on the text prompt.

    The image is first sampled at 64x64 with the sampler of the selected
    variant, then passed through that variant's upsampler at ``size``.

    Args:
        input_image: Image to inpaint; handed to ``process_image``.
        mask_image: NumPy mask array in which non-zero (white) marks the area
            to inpaint. A 4-channel array contributes its alpha channel, a
            3-channel array is averaged over channels, a 2-D array is used
            as is.
        prompt: Text prompt guiding the generation.
        guidance_scale: Classifier-free guidance scale given to the sampler.
        size: Resolution used for the upsampling stage.
        batch_size: Number of samples requested from the samplers. Only the
            first sample is returned.
        variant: One of ``"base"``, ``"repaint"`` or ``"repaint_inpaint"``.

    Returns:
        The first generated sample as a ``uint8`` NumPy array, channels-last.

    Raises:
        ValueError: If ``variant`` is not one of the supported choices.
    """
    # One row per variant: (base sampler class, model, diffusion, options, upsampler).
    # Keeping both stages in one table guarantees that a variant always gets
    # its own upsampler ("repaint_inpaint" previously fell back to the
    # "repaint" upsampler).
    variants = {
        "base": (RS.CFGSamplerInpaint, model_base, diffusion_base, options_base, sampler_up_base),
        "repaint": (RS.CFGSamplerRepaint, model_rp, diffusion_rp, options_rp, sampler_up_rp),
        "repaint_inpaint": (RS.CFGSamplerRepaintInpaint, model_rpip, diffusion_rpip, options_rpip, sampler_up_rpip),
    }
    if variant not in variants:
        raise ValueError(f"Invalid variant choice: {variant!r}")
    sampler_cls, model, diffusion, options, sampler_up = variants[variant]
    sampler = sampler_cls(model, diffusion, options, guidance_scale, device)

    # Process input image at both resolutions (64 for the base stage).
    image_tensor = process_image(input_image, size=64)
    image_tensor_full = process_image(input_image, size=size)

    # Handling the mask from the drawing canvas
    if len(mask_image.shape) == 3 and mask_image.shape[2] == 4:
        mask_array = mask_image[:, :, 3]  # RGBA: the alpha channel carries the drawing
    elif len(mask_image.shape) == 2:
        mask_array = mask_image
    else:
        mask_array = np.mean(mask_image, axis=2) if mask_image.shape[2] == 3 else mask_image

    mask_tensor = process_mask(mask_array, size=64)
    mask_tensor_full = process_mask(mask_array, size=size)

    # In GLIDE's inpainting model, 0 means "inpaint this", 1 means "keep this"
    # But in our UI, white (255) means "inpaint this", so we invert the mask
    gt_keep_mask = 1.0 - mask_tensor
    gt_keep_mask_full = 1.0 - mask_tensor_full

    # Ground-truth images that accompany the keep masks.
    gt = image_tensor
    gt_full = image_tensor_full

    # Sampling from the model
    print("Generating inpainted image...")

    jump_params = {
        "t_T": 250,
        "n_sample": 1,
        "jump_length": 10,
        "jump_n_sample": 3,
        "start_resampling": 20
    }
    # Only the RePaint-based samplers are given the resampling schedule.
    extra_kwargs = {} if variant == "base" else {"jump_params": jump_params}

    samples = sampler.sample(gt, gt_keep_mask, prompt, batch_size, **extra_kwargs)[:batch_size]
    samples = sampler_up.sample(
        samples, upsample_temp, gt_full, gt_keep_mask_full, prompt, batch_size, **extra_kwargs
    )[:batch_size]

    # Convert the tensor to a numpy array in the range [0, 255]
    samples = ((samples + 1) * 127.5).clamp(0, 255).to(torch.uint8)
    samples = samples.permute(0, 2, 3, 1).contiguous()
    sample_image = samples[0].cpu().numpy()

    return sample_image

In [None]:
# Warm up the models: run every variant once so that the torch.compile
# wrappers are compiled before the UI is started. The generated images are
# discarded. When compilation is disabled there is nothing to warm up, and
# three full generations would only cost time (very long on CPU), so the
# step is skipped.
if compile:
    img = Image.open("data/large_masked_coco/genhalf_masked_2697_A_bus_is_going_down_the_street_and_it_has_an_advertisement_on_the_side_of_it..png")
    img = img.resize((256, 256))
    img = np.array(img)
    mask = Image.open("data/masks/64/genhalf.png")
    # Nearest-neighbour resampling keeps the mask free of interpolated values.
    mask = mask.resize((256, 256), resample=Image.NEAREST)
    mask = np.array(mask)
    for warmup_variant in ("base", "repaint", "repaint_inpaint"):
        inpaint_image(img, mask, "prmpt", variant=warmup_variant)
    print("Done Compiling Models")
else:
    print("Skipping warm-up: torch compilation is disabled")

Generating inpainted image...
CFGSampler


  0%|          | 0/250 [00:00<?, ?it/s]

AUTOTUNE mm(256x512, 512x2048)
  triton_mm_82 0.0143 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8
  triton_mm_85 0.0143 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8
  triton_mm_86 0.0143 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4
  triton_mm_87 0.0143 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4
  triton_mm_83 0.0154 ms 93.3% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=128, BLOCK_M=64, BLOCK_N=64, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4
  triton_mm_89 0.0154 ms 93.3% ACC_

  0%|          | 0/250 [00:00<?, ?it/s]

In [None]:
def process_input_with_mask(input_with_mask, prompt, guidance_scale, variant):
    """
    Process an image with drawing mask and extract both components for inpainting.

    Args:
        input_with_mask: Value of the image-mask component; read here as a
            mapping with a 'background' image array and a 'layers' list
            holding the drawn mask layer. May be None when nothing was uploaded.
        prompt: Text prompt for guiding the inpainting
        guidance_scale: Scale factor for classifier-free guidance
        variant: Name of the sampling variant forwarded to ``inpaint_image``

    Returns:
        Inpainted image as returned by ``inpaint_image``

    Raises:
        gr.Error: If no image was uploaded, the image is not a valid
            multi-channel array, or no mask was drawn. The output component
            is an image, so problems are reported as Gradio errors instead of
            being returned as strings (which an image output cannot display).
    """
    print(variant)
    # Check if there's anything to work with at all
    if input_with_mask is None or input_with_mask['background'] is None:
        raise gr.Error("Please upload an image first.")

    background = input_with_mask['background']
    layers = input_with_mask['layers']

    if len(background.shape) != 3:
        raise gr.Error("Please upload a valid image and draw on it to create a mask.")

    if len(layers) != 1:
        raise gr.Error("The image doesn't have an alpha channel for masking. Please draw on the image.")

    # Drop any 4th (alpha) channel from the uploaded image
    original_image = background[:, :, :3]

    # The single drawing layer is the mask; white (255) marks areas to inpaint
    mask = layers[0]

    # Check if any masking was done
    if np.max(mask) == 0:
        raise gr.Error("Please draw on the image to create a mask for inpainting.")

    # Process the image and mask for inpainting
    gr.Info(f"Original Image: {original_image.shape}")
    gr.Info(f"Mask: {mask.shape}")
    return inpaint_image(original_image, mask, prompt, guidance_scale, variant=variant, size=256)



### Gradio Interface

In [None]:
def create_gradio_interface():
    """Create the Gradio interface for the inpainting application.

    The layout has an input column (image with drawable mask, text prompt,
    guidance-scale slider, variant selector, submit button) next to an output
    column showing the result. Clicking the button calls
    ``process_input_with_mask`` with the four input values.

    Returns:
        The constructed ``gr.Blocks`` application (not yet launched).
    """
    # (label shown in the UI, value passed to the callback)
    variant_choices = [
        ("Just Glide Inpainting", "base"),
        ("Glide model with repaint strategy", "repaint"),
        ("Glide Inpainting with repaint strategy", "repaint_inpaint"),
    ]

    with gr.Blocks(title="RePaint with GLIDE Text-Driven Inpainting") as app:
        gr.Markdown("# Text-Driven Image Inpainting with RePaint and GLIDE")
        gr.Markdown(
            "Upload an image, draw directly on it to create a mask (white areas will be inpainted), and enter a text prompt.")

        with gr.Row():
            with gr.Column():
                input_image = gr.ImageMask(label="Upload & Draw Mask (white areas will be inpainted)",
                                           image_mode="RGBA",
                                           type='numpy',
                                           # One-element tuple: ('upload') without the
                                           # trailing comma is just the string 'upload'.
                                           sources=("upload",),
                                           interactive=True)

                prompt = gr.Textbox(label="Text Prompt", placeholder="Describe what should be in the masked area...")
                guidance_scale = gr.Slider(minimum=1.0, maximum=15.0, value=7.0, step=0.5,
                                           label="Guidance Scale (higher = more text influence)")
                variant = gr.Radio(variant_choices, value="base", label="Inpainting variant")
                submit_btn = gr.Button("Generate Inpainting")

            with gr.Column():
                output_image = gr.Image(label="Inpainted Result")

        submit_btn.click(
            fn=process_input_with_mask,
            inputs=[input_image, prompt, guidance_scale, variant],
            outputs=output_image,
        )

        gr.Markdown("## How to use")
        gr.Markdown("""
        1. Upload an image
        2. Draw directly on the image with white brush to create a mask (white areas will be inpainted)
        3. Enter a text prompt describing what you want in the masked area
        4. Click 'Generate Inpainting' and wait for the result
        5. Adjust the guidance scale if needed (higher values follow the text more closely)

        Tips:
        - Use a larger brush radius for covering larger areas
        - You can clear your drawing and start over using the clear button
        - Be specific in your text prompt for best results
        """)

    return app

In [None]:
import os

# Make sure the sample-image directory exists before the app starts.
os.makedirs("sample_images", exist_ok=True)

# Build the UI and serve it inline in the notebook with a public share link;
# debug/show_error are enabled for the launch.
demo = create_gradio_interface()
demo.launch(
    share=True,
    debug=True,
    inline=True,
    show_error=True,
)