# 计算测试集图像语义特征

抽取 PyTorch 训练得到的图像分类模型中间层的输出特征，作为输入图像的语义特征。

计算测试集所有图像的语义特征，使用t-SNE和UMAP两种降维方法降维至二维和三维，可视化。

分析不同类别的语义距离、异常数据、细粒度分类、高维数据结构。

## 导入工具包

In [8]:
from tqdm import tqdm

import pandas as pd
import numpy as np

import torch

import cv2
from PIL import Image

# 忽略烦人的红色提示
import warnings
warnings.filterwarnings("ignore")

# Prefer the first CUDA GPU when one is available; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print('device', device)

# Mapping from class ID index to class name (IDs follow list order, from 0).
class_names_dic = dict(enumerate([
    'covering',
    'device',
    'domestic_animal',
    'mater',
    'person',
    'plant',
    'structure',
    'vertebrate',
]))
# Class names listed in the same order as their IDs.
classes = [class_names_dic[class_id] for class_id in class_names_dic]
print(classes)

device cuda:0
['covering', 'device', 'domestic_animal', 'mater', 'person', 'plant', 'structure', 'vertebrate']


## 图像预处理

In [9]:
from torchvision import transforms

# Training-set preprocessing, kept commented out for reference only:
# random resized crop, random horizontal flip, to-tensor, normalization.
# train_transform = transforms.Compose([transforms.RandomResizedCrop(224),
#                                       transforms.RandomHorizontalFlip(),
#                                       transforms.ToTensor(),
#                                       transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
#                                      ])

# Test-set preprocessing, applied in this order:
# resize -> center crop -> convert to tensor -> per-channel normalization.
preprocess_steps = [
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
]
test_transform = transforms.Compose(preprocess_steps)

## 导入训练好的模型

In [23]:
from network import ShuffleNetV2_Plus
# Name of the test split to analyse; switch by (un)commenting.
# dataset_name = 'test_easy_classes'
dataset_name = 'test_hard_classes'
# Path of the text file that belongs to the chosen split
# (presumably its sample list -- confirm against the data-preparation code).
dataset_dir = '../data_class_txt/' + dataset_name + '.txt'
# Directory and file name of the trained checkpoint.
model_path = './models/2023-10-09-01-47_max_epoch_100/'
model_name = 'retrain_COME15K_checkpoint-best-avg-0.743-Medium.pth.tar'
# Build the network from its block-choice list.
# NOTE(review): this freshly built instance is replaced by the object returned
# from torch.load below, so the construction only matters for its printed log;
# consider removing it or switching to the load_state_dict path.
architecture = [0, 0, 3, 1, 1, 1, 0, 0, 2, 0, 2, 1, 1, 0, 2, 0, 2, 1, 3, 2]
model = ShuffleNetV2_Plus(architecture=architecture, n_class=len(class_names_dic), model_size="Medium")
weight_path = model_path + model_name
# map_location remaps tensors saved on a GPU onto the selected device, so the
# checkpoint also loads on CPU-only machines.
# NOTE: torch.load unpickles arbitrary objects -- only load trusted checkpoints.
model = torch.load(weight_path, map_location=device)
# Alternative when the checkpoint holds a state dict instead of a whole model:
# model.load_state_dict(trained_weight['state_dict'], strict=True)
# eval() switches the model to inference mode before it is moved to the device.
model = model.eval().to(device)
model

model size is  Medium
Shuffle3x3
Shuffle3x3
Xception
Shuffle5x5
Shuffle5x5
Shuffle5x5
Shuffle3x3
Shuffle3x3
Shuffle7x7
Shuffle3x3
Shuffle7x7
Shuffle5x5
Shuffle5x5
Shuffle3x3
Shuffle7x7
Shuffle3x3
Shuffle7x7
Shuffle5x5
Xception
Shuffle7x7


ShuffleNetV2_Plus(
  (first_conv): Sequential(
    (0): Conv2d(3, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
    (1): BatchNorm2d(16, eps=1e-05, momentum=0.0012468827930174563, affine=True, track_running_stats=True)
    (2): HS()
  )
  (features): Sequential(
    (0): Shufflenet(
      (branch_main): Sequential(
        (0): Conv2d(16, 24, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (1): BatchNorm2d(24, eps=1e-05, momentum=0.0012468827930174563, affine=True, track_running_stats=True)
        (2): ReLU(inplace=True)
        (3): Conv2d(24, 24, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=24, bias=False)
        (4): BatchNorm2d(24, eps=1e-05, momentum=0.0012468827930174563, affine=True, track_running_stats=True)
        (5): Conv2d(24, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (6): BatchNorm2d(32, eps=1e-05, momentum=0.0012468827930174563, affine=True, track_running_stats=True)
        (7): ReLU(inplace=True)
      )
     

## 导入训练好的模型

In [24]:
# Load the serialized model object from the checkpoint file.
model_and_weight_path = model_path + model_name
# map_location remaps tensors saved on a GPU onto the selected device, so the
# checkpoint also loads on CPU-only machines.
# NOTE: torch.load unpickles arbitrary objects -- only load trusted checkpoints.
model = torch.load(model_and_weight_path, map_location=device)
# Inference mode, then move to the selected device.
model = model.eval().to(device)
model

ShuffleNetV2_Plus(
  (first_conv): Sequential(
    (0): Conv2d(3, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
    (1): BatchNorm2d(16, eps=1e-05, momentum=0.0012468827930174563, affine=True, track_running_stats=True)
    (2): HS()
  )
  (features): Sequential(
    (0): Shufflenet(
      (branch_main): Sequential(
        (0): Conv2d(16, 24, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (1): BatchNorm2d(24, eps=1e-05, momentum=0.0012468827930174563, affine=True, track_running_stats=True)
        (2): ReLU(inplace=True)
        (3): Conv2d(24, 24, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=24, bias=False)
        (4): BatchNorm2d(24, eps=1e-05, momentum=0.0012468827930174563, affine=True, track_running_stats=True)
        (5): Conv2d(24, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (6): BatchNorm2d(32, eps=1e-05, momentum=0.0012468827930174563, affine=True, track_running_stats=True)
        (7): ReLU(inplace=True)
      )
     

## 抽取模型中间层输出结果作为语义特征

In [25]:
from torchvision.models.feature_extraction import create_feature_extractor

In [26]:
# Wrap the model so that a forward pass returns a dict whose
# 'semantic_feature' entry is the output of the 'globalpool' node.
model_trunc = create_feature_extractor(
    model,
    return_nodes={'globalpool': 'semantic_feature'},
)

## 计算单张图像的语义特征

In [27]:
img_path = 'data/SOD-SemanticDataset/test/COME15K-Hard/COME_Hard_1.jpg'
# Convert to RGB so grayscale / RGBA / palette files still match the
# 3-channel normalization (same handling as the batch loop below), and close
# the file handle once the pixel data has been copied.
with Image.open(img_path) as img_file:
    img_pil = img_file.convert('RGB')
input_img = test_transform(img_pil)  # preprocessing
input_img = input_img.unsqueeze(0).to(device)  # add the batch dimension
# Forward pass through the truncated model; gradients are not needed for
# feature extraction, so autograd bookkeeping is skipped.
with torch.no_grad():
    pred_logits = model_trunc(input_img)

In [28]:
# Shape of the extracted feature after dropping size-1 dimensions
# and converting it to a NumPy array.
(
    pred_logits['semantic_feature']
    .squeeze()
    .detach()
    .cpu()
    .numpy()
    .shape
)

(1280,)

In [29]:
# Values of the extracted feature as a NumPy array
# (size-1 dimensions dropped, detached from autograd, copied to host memory).
(
    pred_logits['semantic_feature']
    .squeeze()
    .detach()
    .cpu()
    .numpy()
)

array([ 0.39754692,  0.4811616 ,  0.7771426 , ..., -0.02078315,
        1.0940305 ,  0.4439114 ], dtype=float32)

## 载入测试集图像分类结果

In [30]:
# Load the per-image prediction table saved next to the checkpoint.
df = pd.read_csv(f'{model_path}{dataset_name}-测试集预测结果.csv')

In [31]:
# Preview the first rows of the prediction table.
df.head()

Unnamed: 0,图像路径,标注类别ID,标注类别名称,top-1-预测ID,top-1-预测名称,top-2-预测ID,top-2-预测名称,top-3-预测ID,top-3-预测名称,top-n预测正确,covering-预测置信度,device-预测置信度,domestic_animal-预测置信度,mater-预测置信度,person-预测置信度,plant-预测置信度,structure-预测置信度,vertebrate-预测置信度
0,data/SOD-SemanticDataset/test/COME15K-Hard/COM...,0,covering,0,covering,4,person,3,mater,True,0.801112,0.006859,0.00255,0.0156,0.158353,0.006221,0.005665,0.003641
1,data/SOD-SemanticDataset/test/COME15K-Hard/COM...,0,covering,6,structure,3,mater,0,covering,True,0.113383,0.02764,0.006023,0.259255,0.044438,0.006826,0.53832,0.004116
2,data/SOD-SemanticDataset/test/COME15K-Hard/COM...,0,covering,0,covering,4,person,2,domestic_animal,True,0.600943,0.00828,0.050587,0.032752,0.277345,0.005105,0.013126,0.011862
3,data/SOD-SemanticDataset/test/COME15K-Hard/COM...,0,covering,4,person,3,mater,1,device,False,0.017682,0.046715,0.005153,0.087193,0.813342,0.003787,0.019615,0.006513
4,data/SOD-SemanticDataset/test/COME15K-Hard/COM...,0,covering,0,covering,3,mater,4,person,True,0.681542,0.039885,0.012647,0.156968,0.054006,0.019469,0.012044,0.023439


## 计算测试集每张图像的语义特征

In [32]:
# Per-image feature vectors and the image paths they were computed from,
# kept in the same order as the rows of df.
encoding_array = []
img_path_list = []

# Inference only: disabling autograd for the whole pass avoids building a
# computation graph for every image.
with torch.no_grad():
    for img_path in tqdm(df['图像路径']):
        img_path_list.append(img_path)
        # Convert to RGB for the 3-channel normalization and close the file
        # handle as soon as the pixel data has been copied.
        with Image.open(img_path) as img_file:
            img_pil = img_file.convert('RGB')
        input_img = test_transform(img_pil).unsqueeze(0).to(device)  # preprocessing + batch dimension
        # Forward pass; take the 'globalpool' output exposed as 'semantic_feature'.
        feature = model_trunc(input_img)['semantic_feature'].squeeze().detach().cpu().numpy()
        encoding_array.append(feature)
# Stack the per-image vectors into one array with one row per image.
encoding_array = np.array(encoding_array)

100%|██████████████████████████████████████| 3000/3000 [00:25<00:00, 117.70it/s]


In [33]:
# Shape of the stacked feature array (one row per processed image).
encoding_array.shape

(3000, 1280)

## 保存为本地的.npy文件

In [34]:
# 保存为本地的 npy 文件
np.save(model_path + dataset_name + '-测试集语义特征.npy', encoding_array)