install requirements

In [None]:
!pip install optimum-quanto
!pip install accelerate
!git clone https://github.com/huggingface/diffusers.git
!pip install -e "diffusers/.[torch]"
!pip install -e "diffusers/.[flax]"
!git -C diffusers/ pull
!pip install transformers --upgrade 

import torch # necessary to check the device
# identify which device is used (cuda = GPU, cpu = CPU only, mps = Mac)
device: str = 'cuda' if torch.cuda.is_available() else ('mps' if torch.backends.mps.is_available() else 'cpu')
if device == 'cpu':
    !pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
elif device == 'cuda':
    !pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124
elif device == 'mps':
    !pip3 install torch torchvision torchaudio
else:
    print("device unknown")
# exception: cu124 necessary for google colab no matter if T4 GPU enabled or CPU only

import libraries and dependencies

In [6]:
from pathlib import Path

import accelerate
import torch
from diffusers.models.transformers.transformer_flux import FluxTransformer2DModel
from diffusers.pipelines.flux.pipeline_flux import FluxPipeline
from optimum.quanto import freeze, qfloat8, quantize
from transformers import T5EncoderModel

define loading and saving path for models

In [None]:
# Saving path: passed as `cache_dir` to the model loaders below.
cache_dir = "../../models/text-to-image/flux.1-schnell"

# Official FLUX.1-schnell model from Black Forest Labs (not quantized).
model = "black-forest-labs/FLUX.1-schnell"

# Quantized transformer checkpoint hosted on Hugging Face.
model_tr = "https://huggingface.co/Kijai/flux-fp8/blob/main/flux1-schnell-fp8.safetensors"


load and quantize transformer

In [None]:
# Load the transformer from the single-file checkpoint in bfloat16,
# storing the download under `cache_dir`.
transformer = FluxTransformer2DModel.from_single_file(
    model_tr,
    cache_dir=cache_dir,
    torch_dtype=torch.bfloat16,
)

# Quantize the weights to qfloat8, then freeze the model.
quantize(transformer, weights=qfloat8)
freeze(transformer)

load and quantize text_encoder_2

In [None]:
# Load the T5 encoder from the "text_encoder_2" subfolder of the base model
# in bfloat16, storing the download under `cache_dir`.
text_encoder_2 = T5EncoderModel.from_pretrained(
    model,
    cache_dir=cache_dir,
    torch_dtype=torch.bfloat16,
    subfolder="text_encoder_2",
)

# Quantize the weights to qfloat8, then freeze the model.
quantize(text_encoder_2, weights=qfloat8)
freeze(text_encoder_2)

Set up the pipeline with the main model and the two quantized models (transformer & text_encoder_2). When running on CUDA (GPU), there are some additional "tricks" to lower the memory usage.

In [None]:
# Build the pipeline without its two heaviest components (transformer and
# text_encoder_2 are passed as None) and attach the quantized versions instead.
# `cache_dir` is passed for consistency with the other loaders, so all model
# files end up under the same saving path.
pipe = FluxPipeline.from_pretrained(
    model,
    transformer=None,
    text_encoder_2=None,
    torch_dtype=torch.bfloat16,
    cache_dir=cache_dir,
)
pipe.transformer = transformer
pipe.text_encoder_2 = text_encoder_2

# On CUDA the pipeline is NOT moved to the GPU here: the CUDA-only cell below
# enables CPU offloading, which manages device placement itself. Moving the
# whole pipeline to the GPU first would need the full VRAM and can run out of
# memory on GPUs with less than 16 GB.
if device != 'cuda':
    pipe.to(torch.device(device))

For CUDA (GPU) ONLY: save some VRAM so that the code runs on GPUs with less than 16 GB of VRAM:

In [7]:
# Only applied when running on CUDA: lower the GPU memory needed by the pipeline.
if device == 'cuda':
    pipe.enable_model_cpu_offload() # model-level offload: per the diffusers docs, whole components are moved to the GPU one at a time, only while they are needed
    # Further options when using non-quantized versions (to make them run with VRAM 4-32 GB):
    # pipe.enable_sequential_cpu_offload() # per the diffusers docs this offloads at submodule level (lower VRAM use, slower)
    # pipe.vae.enable_slicing()
    # pipe.vae.enable_tiling()

define parameters for the image


In [None]:
# Text prompt describing the image to generate.
prompt = "Ancient soldier with a sword and a shield. Behind there are horses. In the background there is a mountain with snow."

# Output size in pixels.
height = 128
width = 128

# Number of iterations: 4 gives decent results and should be considered the minimum;
# people on Hugging Face and GitHub use ~15-50 iterations.
num_inference_steps = 4

# Seeded CPU generator for repeatable results.
generator = torch.Generator(device="cpu")
generator.manual_seed(12345)

image generation

In [None]:
# Run the pipeline and keep the first generated image.
#   guidance_scale: must be 0.0 for FLUX.1-schnell; may be 3.5 for FLUX.1-dev (up to 7.0).
#       A higher guidance scale forces the model to keep closer to the prompt
#       at the expense of image quality.
#   max_sequence_length: maximum sequence length to use with the prompt;
#       256 is the maximum for FLUX.1-schnell.
image = pipe(
    prompt=prompt,
    height=height,
    width=width,
    num_inference_steps=num_inference_steps,
    guidance_scale=0.0,
    max_sequence_length=128,
    generator=generator,
    # output_type="pil",
).images[0]


show image

In [None]:
# Bare last expression: the notebook renders the value of `image` as the cell output.
image

save image

In [None]:
# Make sure the output folder exists; saving into a missing directory would
# otherwise fail with FileNotFoundError on a fresh checkout.
output_dir = Path("figs")
output_dir.mkdir(parents=True, exist_ok=True)

# The file name encodes the number of inference steps and the image size.
image.save(output_dir / f"Kijai_qt-qte2_{num_inference_steps}_{height}_{width}.png")