In [1]:
import torch 
from PIL import Image

import cn_clip.clip as clip
from cn_clip.clip import load_from_name, available_models
print("Available models:", available_models())  
# Available models: ['ViT-B-16', 'ViT-L-14', 'ViT-L-14-336', 'ViT-H-14', 'RN50']

device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = load_from_name("ViT-B-16", device=device, download_root='./')
model.eval()
image = preprocess(Image.open("imgs/皮卡丘.png").convert("RGBA")).unsqueeze(0).to(device)  #Label probs: [[0.003006 0.974    0.01017  0.01265 ]]
# image = preprocess(Image.open("imgs/皮卡丘.png")).unsqueeze(0).to(device)
text = clip.tokenize(["杰尼龟", "妙蛙种子", "小火龙", "皮卡丘"]).to(device)

with torch.no_grad():
    image_features = model.encode_image(image)
    text_features = model.encode_text(text)
    # 对特征进行归一化，请使用归一化后的图文特征用于下游任务
    image_features /= image_features.norm(dim=-1, keepdim=True) 
    text_features /= text_features.norm(dim=-1, keepdim=True)    

    logits_per_image, logits_per_text = model.get_similarity(image, text)
    probs = logits_per_image.softmax(dim=-1).cpu().numpy()

print("Label probs:", probs)  # [[1.268734e-03 5.436878e-02 6.795761e-04 9.436829e-01]]

Available models: ['ViT-B-16', 'ViT-L-14', 'ViT-L-14-336', 'ViT-H-14', 'RN50']
Loading vision model config from /root/autodl-tmp/project/cn_clip/clip/model_configs/ViT-B-16.json
Loading text model config from /root/autodl-tmp/project/cn_clip/clip/model_configs/RoBERTa-wwm-ext-base-chinese.json
Model info {'embed_dim': 512, 'image_resolution': 224, 'vision_layers': 12, 'vision_width': 768, 'vision_patch_size': 16, 'vocab_size': 21128, 'text_attention_probs_dropout_prob': 0.1, 'text_hidden_act': 'gelu', 'text_hidden_dropout_prob': 0.1, 'text_hidden_size': 768, 'text_initializer_range': 0.02, 'text_intermediate_size': 3072, 'text_max_position_embeddings': 512, 'text_num_attention_heads': 12, 'text_num_hidden_layers': 12, 'text_type_vocab_size': 2}
Label probs: [[0.002913 0.974    0.01017  0.01266 ]]


In [3]:
import torch

# 指定你保存的模型参数的路径
saved_model_path = 'epoch_latest.pt'

# 加载保存的模型参数
model_state_dict = torch.load(saved_model_path)

# 打印模型结构
for key, value in model_state_dict.items():
    if isinstance(value, torch.Tensor):
        print(f"Layer: {key}, Shape: {value.shape}")
    else:
        print(f"Layer: {key}, Type: {type(value)}")


Layer: epoch, Type: <class 'int'>
Layer: step, Type: <class 'int'>
Layer: name, Type: <class 'str'>
Layer: state_dict, Type: <class 'collections.OrderedDict'>
Layer: optimizer, Type: <class 'dict'>


In [4]:
import torch

# 指定你保存的模型参数的路径
saved_model_path = 'epoch_latest.pt'

# 加载保存的模型参数
checkpoint = torch.load(saved_model_path)

# 获取模型权重和偏差
model_state_dict = checkpoint['state_dict']

# 打印模型结构
for key, value in model_state_dict.items():
    if isinstance(value, torch.Tensor):
        print(f"Layer: {key}, Shape: {value.shape}")
    else:
        print(f"Layer: {key}, Type: {type(value)}")



Layer: module.text_projection, Shape: torch.Size([768, 512])
Layer: module.logit_scale, Shape: torch.Size([])
Layer: module.visual.class_embedding, Shape: torch.Size([768])
Layer: module.visual.positional_embedding, Shape: torch.Size([197, 768])
Layer: module.visual.proj, Shape: torch.Size([768, 512])
Layer: module.visual.conv1.weight, Shape: torch.Size([768, 3, 16, 16])
Layer: module.visual.ln_pre.weight, Shape: torch.Size([768])
Layer: module.visual.ln_pre.bias, Shape: torch.Size([768])
Layer: module.visual.transformer.resblocks.0.attn.in_proj_weight, Shape: torch.Size([2304, 768])
Layer: module.visual.transformer.resblocks.0.attn.in_proj_bias, Shape: torch.Size([2304])
Layer: module.visual.transformer.resblocks.0.attn.out_proj.weight, Shape: torch.Size([768, 768])
Layer: module.visual.transformer.resblocks.0.attn.out_proj.bias, Shape: torch.Size([768])
Layer: module.visual.transformer.resblocks.0.ln_1.weight, Shape: torch.Size([768])
Layer: module.visual.transformer.resblocks.0.ln_1

In [5]:
model_path ='epoch_latest.pt'
model_state_dict = torch.load(model_path, map_location=torch.device('cpu'))

# 打印状态字典的键
print("Keys in the state_dict:")
for key in model_state_dict.keys():
    print(key)

Keys in the state_dict:
epoch
step
name
state_dict
optimizer


In [7]:
import torch
import torch.nn.functional as F
import cn_clip.clip as clip
from PIL import Image

# 加载微调后的模型权重
model_path = 'epoch_latest.pt'
saved_model = torch.load(model_path, map_location=torch.device('cpu'))
model_state_dict = saved_model['state_dict']

# 调整状态字典的键
adjusted_state_dict = {}
for key in model_state_dict.keys():
    new_key = key[7:] if key.startswith('module.') else key
    adjusted_state_dict[new_key] = model_state_dict[key]

# 创建模型实例
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load_from_name("ViT-B-16", device=device)

# 加载调整后的状态字典
try:
    model.load_state_dict(adjusted_state_dict)
except RuntimeError as e:
    print("Error:", e)
    model.load_state_dict(adjusted_state_dict, strict=False)

# 设置模型为评估模式
model.eval()

# 图像预处理
image_path = "imgs/皮卡丘.png"
image = preprocess(Image.open(image_path).convert("RGBA")).unsqueeze(0).to(device)

# 文本处理
text = clip.tokenize(["杰尼龟", "妙蛙种子", "小火龙", "皮卡丘"]).to(device)

# 推理
with torch.no_grad():
    image_features, text_features, logit_scale = model(image, text)

    # 归一化特征
    image_features /= image_features.norm(dim=-1, keepdim=True)
    text_features /= text_features.norm(dim=-1, keepdim=True)

    # 计算相似度分数
    logits_per_image = logit_scale * image_features @ text_features.t()

    # 转换为概率
    probs_per_image = F.softmax(logits_per_image, dim=-1).cpu().numpy()

print("Label probabilities:", probs_per_image)


Loading vision model config from /root/autodl-tmp/project/cn_clip/clip/model_configs/ViT-B-16.json
Loading text model config from /root/autodl-tmp/project/cn_clip/clip/model_configs/RoBERTa-wwm-ext-base-chinese.json
Model info {'embed_dim': 512, 'image_resolution': 224, 'vision_layers': 12, 'vision_width': 768, 'vision_patch_size': 16, 'vocab_size': 21128, 'text_attention_probs_dropout_prob': 0.1, 'text_hidden_act': 'gelu', 'text_hidden_dropout_prob': 0.1, 'text_hidden_size': 768, 'text_initializer_range': 0.02, 'text_intermediate_size': 3072, 'text_max_position_embeddings': 512, 'text_num_attention_heads': 12, 'text_num_hidden_layers': 12, 'text_type_vocab_size': 2}
Label probabilities: [[0.e+00 0.e+00 6.e-08 1.e+00]]


In [8]:
probs_per_image[0].max()

1.0