# 数据导入

In [1]:
from datasets import load_dataset

dataset = load_dataset(path="imagefolder", data_dir="E:/jupyter/VIT_example/dataset/hfdataset_Mini") #路径不能有中文

Resolving data files:   0%|          | 0/3768 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/1612 [00:00<?, ?it/s]

In [2]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['image', 'label'],
        num_rows: 3768
    })
    test: Dataset({
        features: ['image', 'label'],
        num_rows: 1612
    })
})


# 利用VIT训练模型

### 将图像转换为张量

In [3]:

from transformers import AutoImageProcessor

# 加载图像处理器
image_processor = AutoImageProcessor.from_pretrained("google/vit-large-patch32-384")

def transforms(examples):
    images = [img.convert("RGB").resize((384, 384)) for img in examples["image"]]  # 图片被转换为RGB通道，同时缩放至384*384
    
    examples["pixel_values"] = image_processor(images, return_tensors="pt")["pixel_values"] # 使用image_processor处理图像，生成pixel_values（张量图像）
    return examples

dataset.set_transform(transforms)
dataset['train'][0].keys()

dict_keys(['image', 'label', 'pixel_values'])

### 定义数据整理器，用于对图像分批

In [4]:
import torch

def collate_fn(batch): # batch应包含pixel_values和labels
    return {
        'pixel_values': torch.stack([x['pixel_values'] for x in batch]), # 返回堆叠的图像张量
        'labels': torch.tensor([x['labels'] for x in batch]) # 返回堆叠的标签张量
    }

### 定义评价器

In [5]:
import numpy as np
import evaluate

accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")
precision_metric = evaluate.load("precision")
recall_metric = evaluate.load("recall")

def compute_metrics(p):
    # 计算预测结果
    predictions = np.argmax(p.predictions, axis=1)

    # 计算各个指标
    accuracy = accuracy_metric.compute(predictions=predictions, references=p.label_ids)
    f1 = f1_metric.compute(predictions=predictions, references=p.label_ids, average='weighted')
    precision = precision_metric.compute(predictions=predictions, references=p.label_ids, average='weighted')
    recall = recall_metric.compute(predictions=predictions, references=p.label_ids, average='weighted')

    # 输出结果
    return {
        'accuracy': accuracy['accuracy'],
        'f1': f1['f1'],
        'precision': precision['precision'],
        'recall': recall['recall']
    }


### 修改label为labels以符合VIT要求

In [6]:
for split in dataset:
    dataset[split] = dataset[split].rename_column('label', 'labels') # 将 'label' 特征名改为 'labels'

### 加载初始模型

In [7]:
from transformers import ViTForImageClassification

model_name = 'google/vit-large-patch32-384' # 指定VIT模型

labels = dataset['train'].features['labels'].names # 提取标签信息

model = ViTForImageClassification.from_pretrained(
    model_name,
    num_labels=len(labels), # 指定模型输出层的大小(类别数)
    id2label={str(i): c for i, c in enumerate(labels)}, #建立标签和类别索引的映射关系
    label2id={c: str(i) for i, c in enumerate(labels)},
    ignore_mismatched_sizes=True
)

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-large-patch32-384 and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([1000, 1024]) in the checkpoint and torch.Size([4, 1024]) in the model instantiated
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([4]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### 设定训练参数

In [8]:
from transformers import TrainingArguments

training_args = TrainingArguments(
  output_dir="./vit-large-covid", # 模型保存的目录
  per_device_train_batch_size=16, # 越大模型越稳定但需要消耗更大显存
  evaluation_strategy="steps", #模型评估策略，steps表示每过一定次数评估一次模型
  num_train_epochs=100, #训练轮数
  fp16=True, #半精度浮点，可减少内存占用，需要设备支持
  save_steps=100, #多少步保存一次模型
  eval_steps=100, #多少步评估一次模型
  logging_steps=3, #多少步评估一次日志
  learning_rate=2e-4, #学习率 越低越容易过拟合，越高收敛越慢
  save_total_limit=10, #保存的模型总数
  remove_unused_columns=False, #删除未使用列，减少内存消耗
  push_to_hub=False, #是否将模型发布到hf社区
  report_to='tensorboard', #日志的报告地
  load_best_model_at_end=True, #是否自动加载最优模型
  ignore_data_skip=True #从断点继续训练模型
)


##### 更多参数解释 https://zhuanlan.zhihu.com/p/363670628

### 将参数传入训练器

In [9]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=image_processor,
    
)


### 训练开始

In [10]:
# # 启动 TensorBoard
# import os
# os.system("tensorboard --logdir 'E:\jupyter\VIT_example\vit-large-covid\runs'")

In [11]:
train_results = trainer.train()
trainer.save_model()
trainer.log_metrics("train", train_results.metrics)
trainer.save_metrics("train", train_results.metrics)
trainer.save_state()




Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
100,0.7985,0.633632,0.790943,0.769791,0.816753,0.790943
200,0.3215,0.474745,0.844293,0.840186,0.856883,0.844293
300,0.4277,0.485817,0.826923,0.831554,0.859316,0.826923
400,0.3462,0.335386,0.885236,0.884636,0.889117,0.885236
500,0.0916,0.321727,0.891439,0.889616,0.89304,0.891439
600,0.0546,0.276444,0.918114,0.918364,0.919551,0.918114
700,0.1116,0.452279,0.860422,0.860419,0.882077,0.860422
800,0.0914,0.280849,0.920596,0.921606,0.924271,0.920596
900,0.0351,0.290784,0.923697,0.923957,0.925599,0.923697
1000,0.0941,0.322099,0.922457,0.922795,0.923265,0.922457


***** train metrics *****
  epoch                    =          100.0
  total_flos               = 284653198700GF
  train_loss               =         0.0246
  train_runtime            =     5:32:27.48
  train_samples_per_second =          18.89
  train_steps_per_second   =          1.183


# 性能验证

## 基准测试

In [12]:
metrics = trainer.evaluate(dataset['test'])
trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)


***** eval metrics *****
  epoch                   =      100.0
  eval_accuracy           =     0.9181
  eval_f1                 =     0.9184
  eval_loss               =     0.2764
  eval_precision          =     0.9196
  eval_recall             =     0.9181
  eval_runtime            = 0:00:23.05
  eval_samples_per_second =     69.924
  eval_steps_per_second   =      8.762


## 外部验证

In [13]:
from datasets import load_dataset

Outerdataset = load_dataset(path="imagefolder", data_dir="E:/jupyter/VIT_example/dataset/Outerdataset") #路径不能有中文
Outerdataset.set_transform(transforms)
Outerdataset['test'][0].keys()
for split in Outerdataset:
    Outerdataset[split] = Outerdataset[split].rename_column('label', 'labels') # 将 'label' 特征名改为 'labels'

Resolving data files:   0%|          | 0/312 [00:00<?, ?it/s]

In [20]:
metrics = trainer.evaluate(Outerdataset["test"])
trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)

***** eval metrics *****
  eval_accuracy           =     0.8814
  eval_f1                 =     0.9003
  eval_loss               =     4.4802
  eval_precision          =     0.8743
  eval_recall             =     0.9278
  eval_runtime            = 0:00:14.54
  eval_samples_per_second =     21.447
  eval_steps_per_second   =      2.681


## 对特定图片进行预测

In [15]:
num=1
image=Outerdataset["test"][num]["image"]
Outerdataset["test"][num]

{'image': <PIL.PngImagePlugin.PngImageFile image mode=RGBA size=1458x1303>,
 'labels': 0,
 'pixel_values': tensor([[[-0.6392, -0.6314, -0.6235,  ..., -0.9843, -0.9843, -0.9843],
          [-0.6078, -0.6000, -0.6000,  ..., -0.9843, -0.9843, -0.9843],
          [-0.5922, -0.5843, -0.5765,  ..., -0.9843, -0.9843, -0.9843],
          ...,
          [-0.1216, -0.0980, -0.0824,  ..., -0.3647, -0.3804, -0.3882],
          [-0.1137, -0.0980, -0.0902,  ..., -0.3725, -0.3804, -0.3961],
          [-0.1137, -0.0980, -0.0824,  ..., -0.3647, -0.3804, -0.3961]],
 
         [[-0.6392, -0.6314, -0.6235,  ..., -0.9843, -0.9843, -0.9843],
          [-0.6078, -0.6000, -0.6000,  ..., -0.9843, -0.9843, -0.9843],
          [-0.5922, -0.5843, -0.5765,  ..., -0.9843, -0.9843, -0.9843],
          ...,
          [-0.1216, -0.0980, -0.0824,  ..., -0.3647, -0.3804, -0.3882],
          [-0.1137, -0.0980, -0.0902,  ..., -0.3725, -0.3804, -0.3961],
          [-0.1137, -0.0980, -0.0824,  ..., -0.3647, -0.3804, -0.3961

In [16]:
from transformers import pipeline

classifier = pipeline("image-classification", model='./vit-large-covid/' )
classifier(image)

[{'score': 0.9980438947677612, 'label': 'Covid'},
 {'score': 0.0011776406317949295, 'label': 'Normal'},
 {'score': 0.0005966455792076886, 'label': 'Viral Pneumonia'},
 {'score': 0.00018177053425461054, 'label': 'Lung Opacity'}]

### 进行Covid与非Covid的二分类预测

In [17]:
def compute_metrics(p):
    # 将预测结果中的所有非0（非Covid）标签转换为1
    predictions = np.argmax(p.predictions, axis=1)
    binary_predictions = np.where(predictions == 0, 0, 1)

    # 将真实标签中的所有非0（非Covid）标签转换为1
    binary_references = np.where(p.label_ids == 0, 0, 1)

    # 计算二分类的各个指标
    accuracy = accuracy_metric.compute(predictions=binary_predictions, references=binary_references)['accuracy']
    f1 = f1_metric.compute(predictions=binary_predictions, references=binary_references, average='binary')['f1']
    precision = precision_metric.compute(predictions=binary_predictions, references=binary_references, average='binary')['precision']
    recall = recall_metric.compute(predictions=binary_predictions, references=binary_references, average='binary')['recall']

    # 返回计算的指标
    return {
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }


In [18]:
from transformers import TrainingArguments

training_args = TrainingArguments(
  output_dir="./vit-Large-covid", # 模型保存的目录
  per_device_train_batch_size=16, # 越大模型越稳定但需要消耗更大显存
  evaluation_strategy="steps", #模型评估策略，steps表示每过一定次数评估一次模型
  num_train_epochs=100, #训练轮数
  fp16=True, #半精度浮点，可减少内存占用，需要设备支持
  save_steps=100, #多少步保存一次模型
  eval_steps=100, #多少步评估一次模型
  logging_steps=3, #多少步评估一次日志
  learning_rate=1e-5, #学习率 越低越容易过拟合，越高收敛越慢
  save_total_limit=3, #保存的模型总数
  remove_unused_columns=False, #删除未使用列，减少内存消耗
  push_to_hub=False, #是否将模型发布到hf社区
  report_to='tensorboard', #日志的报告地
  load_best_model_at_end=True, #是否自动加载最优模型
  ignore_data_skip=True #从断点继续训练模型
)
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
    train_dataset=dataset["train"],
    eval_dataset=Outerdataset["test"],
    tokenizer=image_processor,
    
)


In [19]:
metrics = trainer.evaluate(Outerdataset['test'])
trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)


***** eval metrics *****
  eval_accuracy           =     0.8814
  eval_f1                 =     0.9003
  eval_loss               =     4.4802
  eval_precision          =     0.8743
  eval_recall             =     0.9278
  eval_runtime            = 0:00:13.47
  eval_samples_per_second =     23.155
  eval_steps_per_second   =      2.894
