In [1]:
# Run this line in Colab to install the package if it is
# not already installed.
!pip install git+https://github.com/openai/glide-text2im

Collecting git+https://github.com/openai/glide-text2im
  Cloning https://github.com/openai/glide-text2im to /tmp/pip-req-build-40q_pztc
  Running command git clone --filter=blob:none --quiet https://github.com/openai/glide-text2im /tmp/pip-req-build-40q_pztc
  Resolved https://github.com/openai/glide-text2im to commit 69b530740eb6cef69442d6180579ef5ba9ef063e
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting ftfy (from glide-text2im==0.0.0)
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Downloading ftfy-6.3.1-py3-none-any.whl (44 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: glide-text2im
  Building wheel for glide-text2im (setup.py) ... [?25l[?25hdone
  Created wheel for glide-text2im: filename=glide_text2im-0.0.0-py3-none-any.whl size=1953624 sha256=ce76fc5a7d748066fdd76703726ba29ee6706100ec608ad22ae84a5487ddcf58
  Stored in directory: 

In [2]:
from PIL import Image
from IPython.display import display
import torch as th
import torch.nn as nn

from glide_text2im.clip.model_creation import create_clip_model
from glide_text2im.download import load_checkpoint
from glide_text2im.model_creation import (
    create_model_and_diffusion,
    model_and_diffusion_defaults,
    model_and_diffusion_defaults_upsampler,
)
from glide_text2im.tokenizer.simple_tokenizer import SimpleTokenizer

In [3]:
# This notebook supports both CPU and GPU.
# On CPU, generating one sample may take on the order of 20 minutes.
# On a GPU, it should be under a minute.

has_cuda = th.cuda.is_available()
device = th.device('cpu' if not has_cuda else 'cuda')

In [4]:
# Create base model.
options = model_and_diffusion_defaults()
options['use_fp16'] = has_cuda
options['timestep_respacing'] = '100' # use 100 diffusion steps for fast sampling
model, diffusion = create_model_and_diffusion(**options)
model.eval()
if has_cuda:
    model.convert_to_fp16()
model.to(device)
model.load_state_dict(load_checkpoint('base', device))
print('total base parameters', sum(x.numel() for x in model.parameters()))

  0%|          | 0.00/1.54G [00:00<?, ?iB/s]

  return th.load(path, map_location=device)


total base parameters 385030726


In [5]:
# Create upsampler model.
options_up = model_and_diffusion_defaults_upsampler()
options_up['use_fp16'] = has_cuda
options_up['timestep_respacing'] = 'fast27' # use 27 diffusion steps for very fast sampling
model_up, diffusion_up = create_model_and_diffusion(**options_up)
model_up.eval()
if has_cuda:
    model_up.convert_to_fp16()
model_up.to(device)
model_up.load_state_dict(load_checkpoint('upsample', device))
print('total upsampler parameters', sum(x.numel() for x in model_up.parameters()))

  0%|          | 0.00/1.59G [00:00<?, ?iB/s]

total upsampler parameters 398361286


In [6]:
# Create CLIP model.
clip_model = create_clip_model(device=device)
clip_model.image_encoder.load_state_dict(load_checkpoint('clip/image-enc', device))
clip_model.text_encoder.load_state_dict(load_checkpoint('clip/text-enc', device))

  self.global_layout = np.tril(np.ones([self.n_query_block, self.n_key_block], dtype=np.bool))


AttributeError: module 'numpy' has no attribute 'bool'.
`np.bool` was a deprecated alias for the builtin `bool`. To avoid this error in existing code, use `bool` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.bool_` here.
The aliases was originally deprecated in NumPy 1.20; for more details and guidance see the original release note at:
    https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations

In [None]:
def show_images(batch: th.Tensor):
    """ Display a batch of images inline. """
    scaled = ((batch + 1)*127.5).round().clamp(0,255).to(th.uint8).cpu()
    reshaped = scaled.permute(2, 0, 3, 1).reshape([batch.shape[2], -1, 3])
    display(Image.fromarray(reshaped.numpy()))

In [None]:
# Sampling parameters
prompt = "an oil painting of a corgi"
batch_size = 1
guidance_scale = 3.0

# Tune this parameter to control the sharpness of 256x256 images.
# A value of 1.0 is sharper, but sometimes results in grainy artifacts.
upsample_temp = 0.997

In [None]:
##############################
# Sample from the base model #
##############################

# Create the text tokens to feed to the model.
tokens = model.tokenizer.encode(prompt)
tokens, mask = model.tokenizer.padded_tokens_and_mask(
    tokens, options['text_ctx']
)

# Pack the tokens together into model kwargs.
model_kwargs = dict(
    tokens=th.tensor([tokens] * batch_size, device=device),
    mask=th.tensor([mask] * batch_size, dtype=th.bool, device=device),
)

# Setup guidance function for CLIP model.
cond_fn = clip_model.cond_fn([prompt] * batch_size, guidance_scale)

# Sample from the base model.
model.del_cache()
samples = diffusion.p_sample_loop(
    model,
    (batch_size, 3, options["image_size"], options["image_size"]),
    device=device,
    clip_denoised=True,
    progress=True,
    model_kwargs=model_kwargs,
    cond_fn=cond_fn,
)
model.del_cache()

# Show the output
show_images(samples)

In [None]:
##############################
# Upsample the 64x64 samples #
##############################

tokens = model_up.tokenizer.encode(prompt)
tokens, mask = model_up.tokenizer.padded_tokens_and_mask(
    tokens, options_up['text_ctx']
)

# Create the model conditioning dict.
model_kwargs = dict(
    # Low-res image to upsample.
    low_res=((samples+1)*127.5).round()/127.5 - 1,

    # Text tokens
    tokens=th.tensor(
        [tokens] * batch_size, device=device
    ),
    mask=th.tensor(
        [mask] * batch_size,
        dtype=th.bool,
        device=device,
    ),
)

# Sample from the base model.
model_up.del_cache()
up_shape = (batch_size, 3, options_up["image_size"], options_up["image_size"])
up_samples = diffusion_up.ddim_sample_loop(
    model_up,
    up_shape,
    noise=th.randn(up_shape, device=device) * upsample_temp,
    device=device,
    clip_denoised=True,
    progress=True,
    model_kwargs=model_kwargs,
    cond_fn=None,
)[:batch_size]
model_up.del_cache()

# Show the output
show_images(up_samples)

In [7]:
!wget http://images.cocodataset.org/zips/train2017.zip


--2025-01-07 21:09:28--  http://images.cocodataset.org/zips/train2017.zip
Resolving images.cocodataset.org (images.cocodataset.org)... 3.5.30.165, 3.5.29.147, 52.216.36.177, ...
Connecting to images.cocodataset.org (images.cocodataset.org)|3.5.30.165|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 19336861798 (18G) [application/zip]
Saving to: ‘train2017.zip’


2025-01-07 21:28:26 (16.2 MB/s) - ‘train2017.zip’ saved [19336861798/19336861798]

Archive:  train2017.zip
checkdir:  cannot create extraction directory: data/coco
           No such file or directory


In [22]:
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm
from pycocotools.coco import COCO
from glide_text2im.model_creation import create_model_and_diffusion
from glide_text2im.tokenizer.simple_tokenizer import SimpleTokenizer
from torchvision import transforms
from PIL import Image
import os


# COCO Dataset Loader with Padding
class COCODataset(torch.utils.data.Dataset):
    def __init__(self, annotation_file, image_dir, tokenizer, image_size=64, max_caption_length=20):
        self.coco = COCO(annotation_file)  # Load COCO annotations
        self.image_dir = image_dir
        self.tokenizer = tokenizer
        self.image_transform = transforms.Compose([
            transforms.Resize((image_size, image_size)),
            transforms.ToTensor(),
            transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])
        ])
        self.image_ids = list(self.coco.imgs.keys())  # List of image IDs
        self.max_caption_length = max_caption_length

    def __len__(self):
        return len(self.image_ids)

    def __getitem__(self, idx):
        image_id = self.image_ids[idx]
        img_info = self.coco.loadImgs(image_id)[0]
        ann_ids = self.coco.getAnnIds(imgIds=image_id)
        anns = self.coco.loadAnns(ann_ids)

        # Use the first caption if multiple annotations exist
        caption = anns[0]["caption"]
        image_path = os.path.join(self.image_dir, img_info["file_name"])

        # Open and preprocess image
        image = Image.open(image_path).convert("RGB")
        image = self.image_transform(image)

        # Tokenize caption and pad to max_caption_length
        tokens = self.tokenizer.encode(caption)
        tokens = tokens[:self.max_caption_length]  # Truncate if longer
        tokens = torch.tensor(tokens, dtype=torch.long)

        return image, tokens


# Custom collate function for padding
def custom_collate_fn(batch):
    images, tokens = zip(*batch)

    # Pad sequences so they have the same length
    padded_tokens = pad_sequence(tokens, batch_first=True, padding_value=0)

    return torch.stack(images), padded_tokens


# Load Model and Diffusion
def load_glide_model(device):
    model, diffusion = create_model_and_diffusion(
        image_size=64,
        num_channels=192,
        num_res_blocks=3,
        channel_mult="1,2,3,4",
        num_heads=4,
        num_head_channels=64,
        num_heads_upsample=4,
        attention_resolutions="32,16,8",
        dropout=0.1,
        text_ctx=1280,
        xf_width=512,
        xf_layers=24,
        xf_heads=8,
        xf_final_ln=True,
        xf_padding=True,
        use_fp16=torch.cuda.is_available(),
        diffusion_steps=1000,
        timestep_respacing="1000",
        noise_schedule="linear",
        use_scale_shift_norm=True,
        resblock_updown=True,
        cache_text_emb=False,
        inpaint=False,
        super_res=False
    )
    model.to(device)
    return model, diffusion


def train_glide(model, diffusion, dataloader, optimizer, device, num_epochs=5):
    tokenizer = SimpleTokenizer()

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0.0
        print(f"\nEpoch [{epoch + 1}/{num_epochs}]")

        for images, tokens in tqdm(dataloader):
            images = images.to(device)
            tokens = tokens.to(device)

            # Sample time steps for the diffusion process
            t = torch.randint(0, diffusion.num_timesteps, (images.size(0),), device=device)
            noise = torch.randn_like(images)
            noisy_images = diffusion.q_sample(images, t, noise=noise)

            # Compute the loss using training_losses() method
            model_kwargs = {"tokens": tokens}
            losses = diffusion.training_losses(model, noisy_images, t, model_kwargs=model_kwargs)
            loss = losses["loss"].mean()  # Mean loss for the batch

            # Backprop and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        avg_loss = total_loss / len(dataloader)
        print(f"Epoch [{epoch + 1}] completed. Average Loss: {avg_loss:.4f}")

    # Save model
    torch.save(model.state_dict(), "glide_model_coco.pth")
    print("Training completed and model saved!")

# Main function
def main():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # COCO dataset paths
    annotation_file = "/content/annotations/captions_train2017.json"
    image_dir = "/content/train2017"

    # Load model and diffusion
    model, diffusion = load_glide_model(device)
    tokenizer = SimpleTokenizer()

    # Prepare dataset and dataloader
    dataset = COCODataset(annotation_file, image_dir, tokenizer)
    dataloader = DataLoader(dataset, batch_size=8, shuffle=True, collate_fn=custom_collate_fn)

    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
    train_glide(model, diffusion, dataloader, optimizer, device)


if __name__ == "__main__":
    main()

loading annotations into memory...
Done (t=1.03s)
creating index...
index created!

Epoch [1/5]


  0%|          | 0/14786 [00:00<?, ?it/s]


AttributeError: 'SpacedDiffusion' object has no attribute 'training_losses'

In [16]:
!pip install pycocotools tqdm torch torchvision



In [14]:
!unzip train2017.zip
!unzip annotations.zip

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
 extracting: train2017/000000452746.jpg  
 extracting: train2017/000000423782.jpg  
 extracting: train2017/000000546343.jpg  
 extracting: train2017/000000249290.jpg  
 extracting: train2017/000000025529.jpg  
 extracting: train2017/000000316928.jpg  
 extracting: train2017/000000337866.jpg  
 extracting: train2017/000000547768.jpg  
 extracting: train2017/000000423162.jpg  
 extracting: train2017/000000224149.jpg  
 extracting: train2017/000000117841.jpg  
 extracting: train2017/000000251660.jpg  
 extracting: train2017/000000110997.jpg  
 extracting: train2017/000000424728.jpg  
 extracting: train2017/000000384745.jpg  
 extracting: train2017/000000475535.jpg  
 extracting: train2017/000000252604.jpg  
 extracting: train2017/000000002525.jpg  
 extracting: train2017/000000547307.jpg  
 extracting: train2017/000000546568.jpg  
 extracting: train2017/000000002024.jpg  
 extracting: train2017/000000162559.jpg  
 extracting