# 数据导入

In [1]:
from datasets import load_dataset

dataset = load_dataset(path="imagefolder", data_dir="E:\workdata\hfdataset") #路径不能有中文

Resolving data files:   0%|          | 0/5380 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/312 [00:00<?, ?it/s]

In [2]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['image', 'label'],
        num_rows: 5380
    })
    test: Dataset({
        features: ['image', 'label'],
        num_rows: 312
    })
})


# 利用VIT训练模型

### 将图像转换为张量

In [3]:

from transformers import AutoImageProcessor

# 加载图像处理器
image_processor = AutoImageProcessor.from_pretrained("google/vit-hybrid-base-bit-384")

def transforms(examples):
    images = [img.convert("RGB").resize((384, 384)) for img in examples["image"]]  # 图片被转换为RGB通道，同时缩放至384*384
    
    examples["pixel_values"] = image_processor(images, return_tensors="pt")["pixel_values"] # 使用image_processor处理图像，生成pixel_values（张量图像）
    return examples

dataset.set_transform(transforms)
dataset['train'][0].keys()

dict_keys(['image', 'label', 'pixel_values'])

### 定义数据整理器，用于对图像分批

In [4]:
import torch

def collate_fn(batch): # batch应包含pixel_values和labels
    return {
        'pixel_values': torch.stack([x['pixel_values'] for x in batch]), # 返回堆叠的图像张量
        'labels': torch.tensor([x['labels'] for x in batch]) # 返回堆叠的标签张量
    }

### 定义评价器

In [5]:
import numpy as np
import evaluate

accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")
precision_metric = evaluate.load("precision")
recall_metric = evaluate.load("recall")

def compute_metrics(p):
    # 计算预测结果
    predictions = np.argmax(p.predictions, axis=1)

    # 计算各个指标
    accuracy = accuracy_metric.compute(predictions=predictions, references=p.label_ids)
    f1 = f1_metric.compute(predictions=predictions, references=p.label_ids, average='weighted')
    precision = precision_metric.compute(predictions=predictions, references=p.label_ids, average='weighted')
    recall = recall_metric.compute(predictions=predictions, references=p.label_ids, average='weighted')

    # 输出结果
    return {
        'accuracy': accuracy['accuracy'],
        'f1': f1['f1'],
        'precision': precision['precision'],
        'recall': recall['recall']
    }


  accuracy_metric = load_metric("accuracy")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


### 修改label为labels以符合VIT要求

In [6]:
for split in dataset:
    dataset[split] = dataset[split].rename_column('label', 'labels') # 将 'label' 特征名改为 'labels'

### 加载初始模型

In [7]:
from transformers import ViTForImageClassification

model_name = 'google/vit-hybrid-base-bit-384' # 指定VIT模型

labels = dataset['train'].features['labels'].names # 提取标签信息

model = ViTForImageClassification.from_pretrained(
    model_name,
    num_labels=len(labels), # 指定模型输出层的大小(类别数)
    id2label={str(i): c for i, c in enumerate(labels)}, #建立标签和类别索引的映射关系
    label2id={c: str(i) for i, c in enumerate(labels)},
    ignore_mismatched_sizes=True
)

You are using a model of type vit-hybrid to instantiate a model of type vit. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at google/vit-hybrid-base-bit-384 were not used when initializing ViTForImageClassification: ['vit.embeddings.patch_embeddings.backbone.bit.encoder.stages.2.layers.3.conv1.weight', 'vit.embeddings.patch_embeddings.backbone.bit.encoder.stages.1.layers.0.conv3.weight', 'vit.embeddings.patch_embeddings.backbone.bit.encoder.stages.2.layers.8.norm1.bias', 'vit.embeddings.patch_embeddings.backbone.bit.encoder.stages.1.layers.2.norm1.weight', 'vit.embeddings.patch_embeddings.backbone.bit.encoder.stages.0.layers.1.conv3.weight', 'vit.embeddings.patch_embeddings.backbone.bit.encoder.stages.2.layers.1.norm1.bias', 'vit.embeddings.patch_embeddings.backbone.bit.encoder.stages.2.layers.3.norm1.weight', 'vit.embeddings.patch_embeddings.backbone.bit.encoder.stages.1.layers.2.norm3.weight', 'vit.embeddings.patch_e

### 设定训练参数

In [8]:
from transformers import TrainingArguments

training_args = TrainingArguments(
  output_dir="./vit-Hybird-covid", # 模型保存的目录
  per_device_train_batch_size=512, # 越大模型越稳定但需要消耗更大显存
  evaluation_strategy="steps", #模型评估策略，steps表示每过一定次数评估一次模型
  num_train_epochs=100, #训练轮数
  fp16=True, #半精度浮点，可减少内存占用，需要设备支持
  save_steps=100, #多少步保存一次模型
  eval_steps=100, #多少步评估一次模型
  logging_steps=3, #多少步评估一次日志
  learning_rate=5e-5, #学习率 越低越容易过拟合，越高收敛越慢
  save_total_limit=3, #保存的模型总数
  remove_unused_columns=False, #删除未使用列，减少内存消耗
  push_to_hub=False, #是否将模型发布到hf社区
  report_to='tensorboard', #日志的报告地
  load_best_model_at_end=True, #是否自动加载最优模型
  ignore_data_skip=True #从断点继续训练模型
)


##### 更多参数解释 https://zhuanlan.zhihu.com/p/363670628

### 将参数传入训练器

In [9]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=image_processor,
    
)


### 训练开始

In [10]:
train_results = trainer.train()
trainer.save_model()
trainer.log_metrics("train", train_results.metrics)
trainer.save_metrics("train", train_results.metrics)
trainer.save_state()




OutOfMemoryError: CUDA out of memory. Tried to allocate 108.00 GiB. GPU 0 has a total capacity of 8.00 GiB of which 4.87 GiB is free. Of the allocated memory 2.00 GiB is allocated by PyTorch, and 39.50 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

# 性能验证

## 基准测试

In [28]:
metrics = trainer.evaluate(dataset['test'])
trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)


***** eval metrics *****
  epoch                   =      100.0
  eval_accuracy           =     0.8397
  eval_f1                 =     0.8355
  eval_loss               =     0.3775
  eval_precision          =     0.8557
  eval_recall             =     0.8278
  eval_runtime            = 0:00:15.93
  eval_samples_per_second =     19.584
  eval_steps_per_second   =      2.448


## 外部验证

In [None]:
from datasets import load_dataset

Outerdataset = load_dataset(path="imagefolder", data_dir="E:/jupyter/VIT_example/dataset/Outerdataset") #路径不能有中文
Outerdataset.set_transform(transforms)
Outerdataset['test'][0].keys()
for split in Outerdataset:
    Outerdataset[split] = Outerdataset[split].rename_column('label', 'labels') # 将 'label' 特征名改为 'labels'


In [None]:
metrics = trainer.evaluate(Outerdataset["test"])
trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)

## 对特定图片进行预测

In [29]:
num=1
image=dataset["test"][num]["image"]
dataset["test"][num]

{'image': <PIL.PngImagePlugin.PngImageFile image mode=RGBA size=1458x1303>,
 'labels': 0,
 'pixel_values': tensor([[[-0.6392, -0.6314, -0.6235,  ..., -0.9843, -0.9843, -0.9843],
          [-0.6078, -0.6000, -0.6000,  ..., -0.9843, -0.9843, -0.9843],
          [-0.5922, -0.5843, -0.5765,  ..., -0.9843, -0.9843, -0.9843],
          ...,
          [-0.1216, -0.0980, -0.0824,  ..., -0.3647, -0.3804, -0.3882],
          [-0.1137, -0.0980, -0.0902,  ..., -0.3725, -0.3804, -0.3961],
          [-0.1137, -0.0980, -0.0824,  ..., -0.3647, -0.3804, -0.3961]],
 
         [[-0.6392, -0.6314, -0.6235,  ..., -0.9843, -0.9843, -0.9843],
          [-0.6078, -0.6000, -0.6000,  ..., -0.9843, -0.9843, -0.9843],
          [-0.5922, -0.5843, -0.5765,  ..., -0.9843, -0.9843, -0.9843],
          ...,
          [-0.1216, -0.0980, -0.0824,  ..., -0.3647, -0.3804, -0.3882],
          [-0.1137, -0.0980, -0.0902,  ..., -0.3725, -0.3804, -0.3961],
          [-0.1137, -0.0980, -0.0824,  ..., -0.3647, -0.3804, -0.3961

In [30]:
from transformers import pipeline

classifier = pipeline("image-classification", model='./vit-Hybird-covid/' )
classifier(image)

[{'score': 0.9786100387573242, 'label': 'Covid'},
 {'score': 0.018465569242835045, 'label': 'Normal'},
 {'score': 0.0024604233913123608, 'label': 'Lung Opacity'},
 {'score': 0.0004639296093955636, 'label': 'Viral Pneumonia'}]

## 进行Covid与非Covid的二分类预测

In [31]:
def compute_metrics(p):
    # 将预测结果中的所有非0（非Covid）标签转换为1
    predictions = np.argmax(p.predictions, axis=1)
    binary_predictions = np.where(predictions == 0, 0, 1)

    # 将真实标签中的所有非0（非Covid）标签转换为1
    binary_references = np.where(p.label_ids == 0, 0, 1)

    # 计算二分类的各个指标
    accuracy = accuracy_metric.compute(predictions=binary_predictions, references=binary_references)['accuracy']
    f1 = f1_metric.compute(predictions=binary_predictions, references=binary_references, average='binary')['f1']
    precision = precision_metric.compute(predictions=binary_predictions, references=binary_references, average='binary')['precision']
    recall = recall_metric.compute(predictions=binary_predictions, references=binary_references, average='binary')['recall']

    # 返回计算的指标
    return {
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }


In [32]:
from transformers import TrainingArguments

training_args = TrainingArguments(
  output_dir="./vit-Hybird-covid", # 模型保存的目录
  per_device_train_batch_size=16, # 越大模型越稳定但需要消耗更大显存
  evaluation_strategy="steps", #模型评估策略，steps表示每过一定次数评估一次模型
  num_train_epochs=100, #训练轮数
  fp16=True, #半精度浮点，可减少内存占用，需要设备支持
  save_steps=100, #多少步保存一次模型
  eval_steps=100, #多少步评估一次模型
  logging_steps=3, #多少步评估一次日志
  learning_rate=2e-4, #学习率 越低越容易过拟合，越高收敛越慢
  save_total_limit=10, #保存的模型总数
  remove_unused_columns=False, #删除未使用列，减少内存消耗
  push_to_hub=False, #是否将模型发布到hf社区
  report_to='tensorboard', #日志的报告地
  load_best_model_at_end=True, #是否自动加载最优模型
  ignore_data_skip=True #从断点继续训练模型
)
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=image_processor,
    
)


In [33]:
metrics = trainer.evaluate(dataset['test'])
trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)


***** eval metrics *****
  eval_accuracy           =     0.8526
  eval_f1                 =     0.8631
  eval_loss               =     0.3775
  eval_precision          =     0.9295
  eval_recall             =     0.8056
  eval_runtime            = 0:00:14.24
  eval_samples_per_second =     21.895
  eval_steps_per_second   =      2.737
