# 计算测试集图像语义特征

抽取Pytorch训练得到的图像分类模型中间层的输出特征，作为输入图像的语义特征。

计算测试集所有图像的语义特征，使用t-SNE和UMAP两种降维方法降维至二维和三维，可视化。

分析不同类别的语义距离、异常数据、细粒度分类、高维数据结构。

## 导入工具包

In [3]:
from tqdm import tqdm

import pandas as pd
import numpy as np

import torch

import cv2
from PIL import Image

# 忽略烦人的红色提示
import warnings
warnings.filterwarnings("ignore")

# 有 GPU 就用 GPU，没有就用 CPU
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print('device', device)
dataset_name = 'test_easy_classes'
model_path = 'models/2023-09-24-15:52_max_epoch_50/'

# 类别名称 和 ID索引号 的映射字典
class_names_dic = {0: 'covering', 1: 'device', 2: 'domestic_animal', 3: 'mater', 4: 'person', 5: 'plant',
                       6: 'structure', 7: 'vertebrate'}
# 获得类别名称
classes = list(class_names_dic.values())
print(classes)

device cuda:0
['covering', 'device', 'domestic_animal', 'mater', 'person', 'plant', 'structure', 'vertebrate']


## 图像预处理

In [4]:
from torchvision import transforms

# # 训练集图像预处理：缩放裁剪、图像增强、转 Tensor、归一化
# train_transform = transforms.Compose([transforms.RandomResizedCrop(224),
#                                       transforms.RandomHorizontalFlip(),
#                                       transforms.ToTensor(),
#                                       transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
#                                      ])

# 测试集图像预处理-RCTN：缩放、裁剪、转 Tensor、归一化
test_transform = transforms.Compose([transforms.Resize(256),
                                     transforms.CenterCrop(224),
                                     transforms.ToTensor(),
                                     transforms.Normalize(
                                         mean=[0.485, 0.456, 0.406], 
                                         std=[0.229, 0.224, 0.225])
                                    ])

## 导入训练好的模型

In [11]:
from network import ShuffleNetV2_Plus
# 数据集文件夹路径
dataset_name = 'test_easy_classes'
model_path = 'models/2023-09-24-15:52_max_epoch_50/'
model_name = 'retrain_COME15K_checkpoint-best-avg-0.735-Medium.pth.tar'
dataset_dir = '../data_class_txt/'+ dataset_name + '.txt'
# init model
architecture = [0, 0, 3, 1, 1, 1, 0, 0, 2, 0, 2, 1, 1, 0, 2, 0, 2, 1, 3, 2]
model = ShuffleNetV2_Plus(architecture=architecture, n_class=class_names_dic.__len__(), model_size="Medium")
weight_path = model_path + model_name
trained_weight = torch.load(weight_path)
model.load_state_dict(trained_weight['state_dict'], strict=True)
model = model.eval().to(device)
model

model size is  Medium


ShuffleNetV2_Plus(
  (first_conv): Sequential(
    (0): Conv2d(3, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
    (1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): HS()
  )
  (features): Sequential(
    (0): Shufflenet(
      (branch_main): Sequential(
        (0): Conv2d(16, 24, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (1): BatchNorm2d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU(inplace=True)
        (3): Conv2d(24, 24, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=24, bias=False)
        (4): BatchNorm2d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (5): Conv2d(24, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (6): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (7): ReLU(inplace=True)
      )
      (branch_proj): Sequential(
        (0): Conv2d(16, 16, kernel_size=(3, 

globalpool## 导入训练好的模型

In [10]:
model_and_weight_path = model_path + model_name
model = torch.load(model_and_weight_path)
model = model.eval().to(device)
model

AttributeError: 'dict' object has no attribute 'eval'

## 抽取模型中间层输出结果作为语义特征

In [8]:
from torchvision.models.feature_extraction import create_feature_extractor

In [12]:
model_trunc = create_feature_extractor(model, return_nodes={'globalpool': 'semantic_feature'})

## 计算单张图像的语义特征

In [13]:
img_path = 'data/SOD-SemanticDataset/test/COME15K-Hard/COME_Hard_1.jpg'
img_pil = Image.open(img_path)
input_img = test_transform(img_pil) # 预处理
input_img = input_img.unsqueeze(0).to(device)
# 执行前向预测，得到指定中间层的输出
pred_logits = model_trunc(input_img) 

In [14]:
pred_logits['semantic_feature'].squeeze().detach().cpu().numpy().shape

(1280,)

In [20]:
pred_logits['semantic_feature'].squeeze().detach().cpu().numpy()

array([-0.03510536,  0.35254237,  0.16077131, ..., -0.03608838,
        0.3300721 ,  0.34675378], dtype=float32)

## 载入测试集图像分类结果

In [21]:
df = pd.read_csv(model_path + dataset_name + '-测试集预测结果.csv')

In [22]:
df.head()

Unnamed: 0,图像路径,标注类别ID,标注类别名称,top-1-预测ID,top-1-预测名称,top-2-预测ID,top-2-预测名称,top-3-预测ID,top-3-预测名称,top-n预测正确,covering-预测置信度,device-预测置信度,domestic_animal-预测置信度,mater-预测置信度,person-预测置信度,plant-预测置信度,structure-预测置信度,vertebrate-预测置信度
0,data/SOD-SemanticDataset/test/COME15K-Easy/COM...,0,covering,3,mater,1,device,6,structure,False,0.053483,0.221736,0.001853,0.478643,0.015743,0.007416,0.206461,0.014667
1,data/SOD-SemanticDataset/test/COME15K-Easy/COM...,0,covering,2,domestic_animal,4,person,7,vertebrate,False,0.06581,0.0012,0.730829,0.007129,0.097066,0.001619,0.002674,0.093673
2,data/SOD-SemanticDataset/test/COME15K-Easy/COM...,0,covering,4,person,5,plant,6,structure,False,0.099537,0.015642,0.002969,0.060892,0.297557,0.256265,0.151516,0.115622
3,data/SOD-SemanticDataset/test/COME15K-Easy/COM...,0,covering,0,covering,4,person,3,mater,True,0.514543,0.002269,0.005743,0.012137,0.446524,0.005227,0.008116,0.005442
4,data/SOD-SemanticDataset/test/COME15K-Easy/COM...,0,covering,4,person,0,covering,7,vertebrate,True,0.300385,0.001374,0.02511,0.004683,0.617911,0.006019,0.003014,0.041503


## 计算测试集每张图像的语义特征

In [23]:
encoding_array = []
img_path_list = []

for img_path in tqdm(df['图像路径']):
    img_path_list.append(img_path)
    img_pil = Image.open(img_path).convert('RGB')
    input_img = test_transform(img_pil).unsqueeze(0).to(device) # 预处理
    feature = model_trunc(input_img)['semantic_feature'].squeeze().detach().cpu().numpy() # 执行前向预测，得到 avgpool 层输出的语义特征
    encoding_array.append(feature)
encoding_array = np.array(encoding_array)

100%|███████████████████████████████████████| 4600/4600 [00:59<00:00, 77.79it/s]


In [24]:
encoding_array.shape

(4600, 1280)

## 保存为本地的.npy文件

In [25]:
# 保存为本地的 npy 文件
np.save(model_path + dataset_name + '-测试集语义特征.npy', encoding_array)