# 1. 把底模safetensors转化为文件夹权重模式

在这里下载底模：https://civitai.com/models/27259?modelVersionId=221220 ，然后把下载得到的 `tmndMix_tmndMixSPRAINBOW.safetensors` 移动到当前工作目录（下面的转换脚本按这个相对路径读取它）。

In [None]:
# Download the pruned fp16 SafeTensor build of the base model (model version 221220);
# --content-disposition keeps the file name suggested by the server.
!wget --content-disposition "https://civitai.com/api/download/models/221220?type=Model&format=SafeTensor&size=pruned&fp=fp16"

In [None]:
from diffusers import StableDiffusionPipeline

# Input: the single-file checkpoint; output: the folder that will hold the
# diffusers-format weights.
checkpoint_path = r"tmndMix_tmndMixSPRAINBOW.safetensors"
save_path = r"basemodel"
# Load the .safetensors file
pipeline = StableDiffusionPipeline.from_single_file(checkpoint_path)

# Save the model in diffusers (folder) format
pipeline.save_pretrained(save_path)

# 2. 把数据集解压

数据集要的所有东西全部放在共享文件夹了（images）  

In [None]:
!unzip image/image_resized.zip
!cp images/merged_joints2d.txt image_resized/merged_joints2d.txt
!cp images/filtered_camera_params.txt image_resized/camera_params.txt
!cp images/image_features.txt image_resized/image_features.txt

In [None]:
!unzip images/image_mirror_resized.zip
!cp images/mirror_merged_joints2d.txt image_resized/merged_joints2d.txt
!cp images/mirror_filtered_camera_params.txt image_mirror_resized/camera_params.txt
!cp images/image_features.txt image_mirror_resized/image_features.txt

# 3. 训练

In [None]:
!git clone https://github.com/sodabreak/PoseCtrl.git
!cd poseCtrl

单卡

In [None]:
python train_colab_V6_AllData.py --pretrained_model_name_or_path "/basemodel" --base_point_path "/PoseCtrl/dataSet/standardVertex_2.txt" --data_root_path_2 "/image_resized" --data_root_path_3 "/image_mirror_resized" --train_batch_size 4 --save_steps 2000 

多卡

- 先运行配置
- 回答问题：使用混合精度fp16

In [None]:
accelerate config

In [None]:
accelerate launch --num_processes 2 --multi_gpu --mixed_precision "fp16" train_colab_V6_AllData.py --pretrained_model_name_or_path "/basemodel" --base_point_path "/PoseCtrl/dataSet/standardVertex_2.txt" --data_root_path_2 "/image_resized" --data_root_path_3 "/image_mirror_resized" --train_batch_size 4 --save_steps 2000 

# 权重处理

所有训练得到的权重都在/sd-pose_ctrl文件夹内



# inference

In [None]:
""" V6 """
import torch
import torch
from diffusers import StableDiffusionPipeline, DDIMScheduler, AutoencoderKL
from PIL import Image
import sys
import os
# NOTE(review): `current_dir` is not referenced anywhere else in this cell.
current_dir = 'content'

# Put the cloned repository (and its inner package folder) on the import path
# so the `poseCtrl.*` imports below resolve.
sys.path.append('/content/PoseCtrl')
sys.path.append('/content/PoseCtrl/poseCtrl')
from poseCtrl.models.pose_adaptor import VPmatrixPoints, ImageProjModel
from poseCtrl.models.attention_processor import AttnProcessor, PoseAttnProcessor
from poseCtrl.data.dataset import CombinedDataset, load_base_points
from poseCtrl.models.posectrl import PoseCtrl,PoseCtrlV1,PoseCtrlV5
import numpy as np
from poseCtrl.pipelines.Pose_pipelines import PoseControlNet
from poseCtrl.pipelines.Pose_pipelines import PoseControlNet, PoseControlNetV6

# Paths and identifiers used by the inference code below.
# NOTE(review): the base points are read from Google Drive
# (/content/drive/MyDrive/PoseCtrl) while the code is imported from
# /content/PoseCtrl — confirm both locations exist in the runtime.
base_point_path=r'/content/drive/MyDrive/PoseCtrl/dataSet/standardVertex_2.txt'
raw_base_points=load_base_points(base_point_path)  
# Diffusers-format base model folder, loaded with StableDiffusionPipeline.from_pretrained.
base_model_path = r"/content/drive/MyDrive/basemodel"
# Identifier passed to AutoencoderKL.from_pretrained.
vae_model_path = "stabilityai/sd-vae-ft-mse"
# Identifier handed to PoseControlNetV6 as its image-encoder path.
image_encoder_path = "laion/CLIP-ViT-H-14-laion2B-s32B-b79K"
# Trained checkpoint handed to PoseControlNetV6.
ip_ckpt = r"/content/drive/MyDrive/ckpt/points_posectrl2000.bin"
device = "cuda"
def image_grid(imgs_np: np.ndarray, rows: int, cols: int):
    """Tile a batch of images into a single ``rows`` x ``cols`` PIL image.

    Args:
        imgs_np: Array of shape (B, H, W, C) holding float pixel values,
            nominally in [0, 1]. B must equal ``rows * cols``.
        rows: Number of rows in the grid.
        cols: Number of columns in the grid.

    Returns:
        An RGB ``PIL.Image`` of size (cols * W, rows * H) with the images
        placed in row-major order.

    Raises:
        AssertionError: If ``imgs_np`` is not 4-D or B != rows * cols.
    """
    assert imgs_np.ndim == 4 and imgs_np.shape[0] == rows * cols, "Input shape must be (B, H, W, C)"
    _, H, W, _ = imgs_np.shape

    # Clamp before the uint8 cast: values that stray slightly outside [0, 1]
    # (e.g. from half-precision outputs) would otherwise wrap around and show
    # up as speckle artifacts. In-range inputs are converted exactly as before.
    pixels = (np.clip(imgs_np, 0.0, 1.0) * 255).astype(np.uint8)

    # Blank canvas large enough for the whole grid.
    grid_img = Image.new('RGB', size=(cols * W, rows * H))

    for idx, frame in enumerate(pixels):
        row, col = divmod(idx, cols)
        grid_img.paste(Image.fromarray(frame), box=(col * W, row * H))

    return grid_img

# DDIM scheduler handed to the Stable Diffusion pipeline below.
noise_scheduler = DDIMScheduler(
    num_train_timesteps=1000,
    beta_start=0.00085,
    beta_end=0.012,
    beta_schedule="scaled_linear",
    clip_sample=False,
    set_alpha_to_one=False,
    steps_offset=1,
)
# NOTE(review): `vae` is loaded here but is not passed to the pipeline (or used
# anywhere else) in the visible code, so the pipeline keeps the base model's own
# VAE — confirm whether `vae=vae` was intended in from_pretrained below.
vae = AutoencoderKL.from_pretrained(vae_model_path).to(dtype=torch.float16)

# load SD pipeline (fp16 weights, no feature extractor, no safety checker)
pipe = StableDiffusionPipeline.from_pretrained(
    base_model_path,
    torch_dtype=torch.float16,
    scheduler=noise_scheduler,
    feature_extractor=None,
    safety_checker=None
)
# Validation data, tokenized with the pipeline's own tokenizer.
val_dataset = CombinedDataset(
        # path1=args.data_root_path_1,
        path2="/content/drive/MyDrive/images_01/test",
        tokenizer=pipe.tokenizer,
    )


# Use the first validation sample as the inference input.
data = val_dataset[0]

from torchvision import transforms

# NOTE(review): `transform` is defined but not applied anywhere in the visible code.
transform = transforms.Resize((256, 256))

""" denormalize 是因为dataloader的图片加载完之后会被transform，正常的图片只要是PIL就行 """
def denormalize(tensor, mean, std):
    """Undo a ``(x - mean) / std`` normalization by returning ``tensor * std + mean``."""
    rescaled = tensor * std
    return rescaled + mean

# Statistics passed to denormalize below.
# presumably these match the Normalize step of the dataset transform — TODO confirm
mean = 0.5
std = 0.5


# Reference image and joints image of the sample, converted to PIL for display.
image = data['image']
points = data['joints_image']
image = denormalize(image, mean, std)
image_pil = transforms.ToPILImage()(image) 
points_pil = transforms.ToPILImage()(points)
# Model inputs: cast to fp16, add a leading batch dimension, move to the device.
points = data['joints_image'].to(torch.float16).unsqueeze(0).to(device)
vmatrix = data['view_matrix'].to(torch.float16).unsqueeze(0).to(device)
pmatrix = data['projection_matrix'].to(torch.float16).unsqueeze(0).to(device)
text = 'highly detailed, anime, 1girl, blue_eyes, purple long_hair, black dress, smile, white simple background'
# NOTE(review): the meaning of the trailing positional argument `8` is not
# visible here — check the PoseControlNetV6 constructor.
pose_model = PoseControlNetV6(pipe, image_encoder_path, ip_ckpt, raw_base_points, device,8)
images = pose_model.generate( prompt = text, num_samples=4, num_inference_steps=50, seed=42, V_matrix=vmatrix, P_matrix=pmatrix, points=points)
# image_grid requires an array of shape (B, H, W, C) with B == 1 * 4.
grid = image_grid(images, 1, 4)
# Canvas six reference-image widths wide: reference image at x=0, joints image
# one width in, grid of generated samples two widths in.
# NOTE(review): the four-sample grid fits the remaining four widths only if each
# generated image is as wide as image_pil — TODO confirm.
combined_image = Image.new('RGB', (6*image_pil.width, image_pil.height))
combined_image.paste(image_pil, (0, 0))
combined_image.paste(points_pil, (image_pil.width, 0))
combined_image.paste(grid, (2*image_pil.width, 0))
# A bare expression on the last line of the cell makes the notebook display the image.
combined_image