# Test Stable Diffusion with text-to-image


In [None]:
# Upgrade torch and torchvision; the extra index URL additionally makes the
# PyTorch CUDA 11.8 (cu118) wheel index available to pip.
!pip install --upgrade torch torchvision --extra-index-url https://download.pytorch.org/whl/cu118

In [None]:
# Install the remaining Python dependencies. `transformers` and `numpy` are
# imported directly by later cells; pandas, tqdm and wandb are presumably
# required by the cloned repository's own code — TODO confirm.
!pip install transformers pandas numpy tqdm wandb

In [None]:
import torch
import torchvision
# import torchaudio

# Report the installed version of each imported PyTorch library so the
# environment is visible in the notebook output.
# torchaudio is not imported above, so it is not reported here.
for library in (torch, torchvision):
    print(f'{library.__name__} version: {library.__version__}')

In [None]:
import subprocess
import os 


def git(*args):
    """Run ``git`` with the given command-line arguments.

    Args:
        *args: Arguments placed after ``git`` on the command line,
            e.g. ``git("clone", url)``.

    Returns:
        The ``subprocess.CompletedProcess`` describing the finished command.

    Raises:
        subprocess.CalledProcessError: If git exits with a non-zero status
            (because the command is run with ``check=True``).
    """
    command = ["git", *args]
    return subprocess.run(command, check=True)

# All later cells use paths under /kaggle/working, so clone the project there.
os.chdir('/kaggle/working/')
repo_url = "https://github.com/HoangTran223/SD_from_scratch.git"

# `git clone` refuses to write into an existing non-empty directory, and
# git() runs with check=True, so re-running this cell used to raise
# CalledProcessError. Only clone when the checkout is not there yet.
clone_target = os.path.splitext(os.path.basename(repo_url))[0]  # "SD_from_scratch"
if not os.path.isdir(clone_target):
    git("clone", repo_url)

In [None]:
# NOTE(review): this installs the PyPI distribution named `pipeline`. The
# `pipeline` module used later in this notebook is imported from the cloned
# repository's `sd` package (`from sd import model_loader, pipeline`), so this
# install looks unrelated — confirm whether it is needed before removing it.
!pip install pipeline

In [None]:
os.chdir('/kaggle/working/SD_from_scratch/')
print("After working directory:", os.getcwd())
!ls /kaggle/working/SD_from_scratch/

In [None]:
# Use the GPU when PyTorch reports CUDA as available; otherwise fall back to
# the CPU. DEVICE is read by the model-loading and generation cells.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {DEVICE}")

In [None]:
from huggingface_hub import hf_hub_download
# Download the v1-5-pruned-emaonly checkpoint from the Hugging Face Hub into
# the notebook's data directory. hf_hub_download returns the local path of the
# fetched file, which is printed for reference.
checkpoint_request = {
    "repo_id": "runwayml/stable-diffusion-v1-5",
    "filename": "v1-5-pruned-emaonly.ckpt",
    "cache_dir": "/kaggle/working/data",
}
model_path = hf_hub_download(**checkpoint_request)
print("Model path:", model_path)

In [None]:
from huggingface_hub import snapshot_download
import shutil
import os

# Fetch only the CLIP tokenizer's vocabulary and merges files, then copy them
# to fixed locations directly under the data directory so later cells can
# refer to them by a stable path.
data_dir = "/kaggle/working/data"
tokenizer_files = ("vocab.json", "merges.txt")

tokenizer_dir = snapshot_download(
    repo_id="openai/clip-vit-large-patch14",
    cache_dir=data_dir,
    allow_patterns=list(tokenizer_files),
)

os.makedirs(data_dir, exist_ok=True)
for file_name in tokenizer_files:
    shutil.copy(
        os.path.join(tokenizer_dir, file_name),
        os.path.join(data_dir, file_name),
    )



In [None]:
import torch
import numpy
from sd import model_loader, pipeline 
from transformers import CLIPTokenizer

from huggingface_hub import hf_hub_download

# Resolve the checkpoint through the Hub cache rather than hard-coding
# "/kaggle/working/data/v1-5-pruned-emaonly.ckpt". When `cache_dir` is used,
# hf_hub_download stores the file in a nested
# models--<org>--<repo>/snapshots/<revision>/ directory, and no cell copies the
# checkpoint to the flat path, so the hard-coded location does not exist.
# The call below returns the already-cached file when it has been downloaded
# before, so it does not fetch the checkpoint a second time.
model_path = hf_hub_download(
    repo_id="runwayml/stable-diffusion-v1-5",
    filename="v1-5-pruned-emaonly.ckpt",
    cache_dir="/kaggle/working/data",
)

# The tokenizer files were copied to these fixed paths by the tokenizer
# download cell.
tokenizer = CLIPTokenizer("/kaggle/working/data/vocab.json", merges_file="/kaggle/working/data/merges.txt")

# Load the model weights from the checkpoint onto the selected device.
models = model_loader.preload_models_from_standard_weights(model_path, DEVICE)

In [None]:
# Text the image should match. Written as adjacent string literals, which
# Python joins into one string, to keep each line short.
prompt = (
    "A girl student, best quality, vibrant colors, "
    "detailed anime style, glasses, highly detailed, "
    "ultra sharp, cinematic, school uniform, "
    "8k resolution, 3D illustration."
)

# Traits to steer away from; passed to the pipeline as the unconditional
# prompt.
negative_prompt = (
    "lowres, bad anatomy, extra fingers, watermark, "
    "text, blurry, distorted, sketch."
)

In [None]:
# `Image` was used below without ever being imported in this notebook.
from PIL import Image

# `sampler` was referenced below but never defined, so this cell raised
# NameError. NOTE(review): "ddpm" is assumed to be a sampler name that
# sd.pipeline.generate accepts — TODO confirm against the repository.
sampler = "ddpm"

# Text-to-image run: no input image is supplied, classifier-free guidance is
# enabled, and the seed is fixed.
# NOTE(review): strength is presumably only relevant when an input image is
# given; confirm that sd.pipeline.generate accepts 0.0 here.
output_image = pipeline.generate(
    prompt=prompt,
    uncond_prompt=negative_prompt,
    input_image=None,
    strength=0.0,
    do_cfg=True,
    cfg_scale=8,
    sampler_name=sampler,
    n_inference_steps=50,
    seed=42,
    models=models,
    device=DEVICE,
    idle_device="cpu",
    tokenizer=tokenizer,
)

# Last expression of the cell: the notebook renders the resulting PIL image.
Image.fromarray(output_image)