In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "7"


from diffusers import StableDiffusionXLPipeline
from diffusers import DiffusionPipeline
from diffusers import DDPMPipeline
import torch
import torch.utils
import torch.utils.data
import numpy as np
import os
import torch.nn.functional as F
from utils import utils_blindsr as blindsr
from PIL import Image
from torch.utils.data.dataloader import DataLoader
import torchvision.transforms as transforms
import torch.nn as nn
import math
from peft import LoraConfig, get_peft_model
from peft.tuners.lora import LoraModel
from tqdm import tqdm
from transformers import get_scheduler

  from .autonotebook import tqdm as notebook_tqdm


In [31]:
class MipNeRF360Dataset(torch.utils.data.Dataset):
    def __init__(self, root='/home/hentci/vimeo_triplet/simple/lq'):
        self.images = sorted(os.listdir(root))
        self.label = ''
        self.root = root

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        transform = transforms.Compose([
            transforms.ToTensor(),
            # transforms.Resize((512, 512)),
            transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
        ])
        
        image = transform(Image.open(os.path.join(self.root, self.images[idx])))
        
        return image, self.label, self.images[idx]


train_dataset = MipNeRF360Dataset()
train_dataloader = DataLoader(train_dataset, batch_size=1, shuffle=False)

img, label, filenames = next(iter(train_dataloader))
print(len(train_dataloader))
print(img.shape)
print(label)

300
torch.Size([1, 3, 256, 256])
('',)


In [4]:
pipeline = DiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16, variant="fp16", use_safetensors=True
).to("cuda")

refiner = DiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-refiner-1.0",
    text_encoder_2=pipeline.text_encoder_2,
    vae=pipeline.vae,
    torch_dtype=torch.float16,
    use_safetensors=True,
    variant="fp16",
).to("cuda")

Loading pipeline components...: 100%|██████████| 7/7 [00:00<00:00,  7.22it/s]
Loading pipeline components...: 100%|██████████| 5/5 [00:00<00:00, 16.26it/s]


In [52]:
image = pipeline(prompt='', output_type="latent").images[0]

  8%|▊         | 4/50 [00:00<00:07,  6.07it/s]

100%|██████████| 50/50 [00:09<00:00,  5.55it/s]


In [32]:
result_dir = '/home/hentci/Final_Project/results/refiner/vimeo_lq'
os.makedirs(result_dir, exist_ok=True)
for img, label, filename in train_dataloader:
    image = refiner(prompt='', image=img[0]).images[0]
    image.save(os.path.join(result_dir, filename[0]))

100%|██████████| 15/15 [00:00<00:00, 20.32it/s]
100%|██████████| 15/15 [00:00<00:00, 20.42it/s]
100%|██████████| 15/15 [00:00<00:00, 20.51it/s]
100%|██████████| 15/15 [00:00<00:00, 20.53it/s]
100%|██████████| 15/15 [00:00<00:00, 18.92it/s]
100%|██████████| 15/15 [00:00<00:00, 20.66it/s]
100%|██████████| 15/15 [00:00<00:00, 20.20it/s]
100%|██████████| 15/15 [00:00<00:00, 20.34it/s]
100%|██████████| 15/15 [00:00<00:00, 27.29it/s]
100%|██████████| 15/15 [00:00<00:00, 21.72it/s]
100%|██████████| 15/15 [00:00<00:00, 20.34it/s]
100%|██████████| 15/15 [00:00<00:00, 20.35it/s]
100%|██████████| 15/15 [00:00<00:00, 20.26it/s]
100%|██████████| 15/15 [00:00<00:00, 20.41it/s]
100%|██████████| 15/15 [00:00<00:00, 20.19it/s]
100%|██████████| 15/15 [00:00<00:00, 20.44it/s]
100%|██████████| 15/15 [00:00<00:00, 20.54it/s]
100%|██████████| 15/15 [00:00<00:00, 20.20it/s]
100%|██████████| 15/15 [00:00<00:00, 20.57it/s]
100%|██████████| 15/15 [00:00<00:00, 20.47it/s]
100%|██████████| 15/15 [00:00<00:00, 20.

In [None]:
image

In [3]:
# 配置 LoRA 参数
lora_config = LoraConfig(
    r=4,            # 矩阵降维因子
    lora_alpha=16,  # LoRA scaling因子
    target_modules=["to_q", "to_k", "to_v"],  # 选择目标模块
    lora_dropout=0.1,
    bias="none"
)

# 获取微调模型
lora_model = get_peft_model(pipeline.unet, lora_config)

# 设置为训练模式
lora_model.train()

PeftModel(
  (base_model): LoraModel(
    (model): UNet2DConditionModel(
      (conv_in): Conv2d(4, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (time_proj): Timesteps()
      (time_embedding): TimestepEmbedding(
        (linear_1): Linear(in_features=320, out_features=1280, bias=True)
        (act): SiLU()
        (linear_2): Linear(in_features=1280, out_features=1280, bias=True)
      )
      (add_time_proj): Timesteps()
      (add_embedding): TimestepEmbedding(
        (linear_1): Linear(in_features=2816, out_features=1280, bias=True)
        (act): SiLU()
        (linear_2): Linear(in_features=1280, out_features=1280, bias=True)
      )
      (down_blocks): ModuleList(
        (0): DownBlock2D(
          (resnets): ModuleList(
            (0-1): 2 x ResnetBlock2D(
              (norm1): GroupNorm(32, 320, eps=1e-05, affine=True)
              (conv1): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
              (time_emb_proj): Linear(in_featur

In [4]:
lora_model.print_trainable_parameters()

trainable params: 4,474,880 || all params: 2,571,938,564 || trainable%: 0.1740


torch.Size([8, 3, 512, 512])
('', '', '', '', '', '', '', '')


In [6]:
# 损失函数
loss_function = nn.MSELoss()

# 优化器
optimizer = torch.optim.AdamW(lora_model.parameters(), lr=5e-5)

# 学习率调度器
num_epochs = 3
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=len(train_dataloader) * num_epochs
)

In [9]:
# 将模型移动到 GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
lora_model.to(device)

for epoch in range(num_epochs):
    lora_model.train()
    epoch_loss = 0
    pbar = tqdm(train_dataloader)
    for i, batch in enumerate(pbar):
        images, prompts = batch
        images = images.to(device)
        
        optimizer.zero_grad()
        
        noise = torch.randn_like(images)
        timesteps = torch.randint(0, 999, (images.shape[0],)).long().to('cuda')
        noisy_x = pipeline.scheduler.add_noise(images, noise, timesteps)
        
        # token = pipeline.tokenizer(prompts, padding=True, return_tensors="pt").to('cuda')
        # encoder_hidden_states = pipeline.text_encoder(**token)
        # encoder_hidden_states = pipeline.encode_prompt()
        
        # 前向传播
        pred = lora_model(noisy_x, timesteps, '')
        
        # 计算损失
        loss = loss_function(pred, noise)
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        
        epoch_loss += loss.item()
    
    print(f"Epoch {epoch + 1}/{num_epochs} - Loss: {epoch_loss / len(train_dataloader)}")

# 保存微调后的模型
torch.save(lora_model.state_dict(), "finetuned_lora_model.pth")

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]


TypeError: argument of type 'NoneType' is not iterable

In [10]:
torch.tensor(pipeline.encode_prompt(list(('', '', '', '', '', '', '', ''))))

ValueError: only one element tensors can be converted to Python scalars

In [39]:
token = pipeline.tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt").to('cuda')
pipeline.text_encoder(**token)

BaseModelOutputWithPooling(last_hidden_state=tensor([[[-3.8843e-01,  2.2949e-02, -5.2338e-02,  ..., -4.9023e-01,
          -3.0664e-01,  6.7383e-02],
         [ 2.7878e-02, -1.3242e+00,  3.0859e-01,  ..., -5.2539e-01,
           9.7461e-01,  6.6406e-01],
         [ 1.1572e+00,  1.3306e-01,  7.9004e-01,  ..., -2.1035e+00,
          -1.1514e+00, -3.3228e-01],
         ...,
         [ 5.8447e-01, -1.3806e-01,  2.1562e+00,  ..., -1.0508e+00,
          -1.5222e-01,  9.3140e-02],
         [-7.8186e-02,  9.8242e-01,  6.9189e-01,  ..., -2.8887e+00,
           2.1088e-02, -4.1382e-01],
         [-1.2158e+00, -5.1367e-01,  4.8022e-01,  ..., -1.3782e-01,
           8.1250e-01,  5.5811e-01]],

        [[-3.8843e-01,  2.2949e-02, -5.2338e-02,  ..., -4.9023e-01,
          -3.0664e-01,  6.7383e-02],
         [ 2.7878e-02, -1.3242e+00,  3.0859e-01,  ..., -5.2539e-01,
           9.7461e-01,  6.6406e-01],
         [ 1.1572e+00,  1.3306e-01,  7.9004e-01,  ..., -2.1035e+00,
          -1.1514e+00, -3.3228e

(tensor([[[-3.8926, -2.5137,  4.7148,  ...,  0.1898,  0.4185, -0.2966],
          [-0.3762, -0.6851, -0.4727,  ...,  0.3284, -0.5166,  0.7002],
          [-0.4841, -0.7422, -0.4375,  ..., -0.4170,  0.3286, -0.0295],
          ...,
          [-0.0102, -0.3306, -0.3499,  ...,  0.3237,  0.0553,  0.3213],
          [-0.0137, -0.3247, -0.3423,  ...,  0.3374, -0.0778,  0.2849],
          [-0.0244, -0.2366, -0.2251,  ...,  0.3645, -0.1267,  0.3848]],
 
         [[-3.8926, -2.5137,  4.7148,  ...,  0.1898,  0.4185, -0.2966],
          [-0.3762, -0.6851, -0.4727,  ...,  0.3284, -0.5166,  0.7002],
          [-0.4841, -0.7422, -0.4375,  ..., -0.4170,  0.3286, -0.0295],
          ...,
          [-0.0102, -0.3306, -0.3499,  ...,  0.3237,  0.0553,  0.3213],
          [-0.0137, -0.3247, -0.3423,  ...,  0.3374, -0.0778,  0.2849],
          [-0.0244, -0.2366, -0.2251,  ...,  0.3645, -0.1267,  0.3848]]],
        device='cuda:0', dtype=torch.float16, grad_fn=<ViewBackward0>),
 tensor([[[0., 0., 0.,  ..., 