In [1]:

import os
#设置线程库以避免冲突
os.environ['KMP_DUPLICATE_LIB_OK']='True'
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
from torchvision import datasets, models, transforms
from PIL import Image,ImageFile
import shutil
import random
from collections import defaultdict
from PIL import ImageFile

# 允许加载截断的图像
ImageFile.LOAD_TRUNCATED_IMAGES = True
Image.MAX_IMAGE_PIXELS = None
# 设置随机种子以确保结果可重现
torch.manual_seed(42)
random.seed(42)

In [2]:
#定义预处理函数进行数据增强
data_transforms = {
    'train': transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
    'val': transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
}

In [3]:
#根据标签文件重新组织数据文件夹结构以适应ImageFolder
import pandas as pd

def reorganize_dataset_by_label(src_dir, dst_dir, labels_file, val_split=0.2):
    """
    根据标签文件重新组织数据集结构
    
    参数:
    src_dir: 原始图像文件所在目录
    dst_dir: 重新组织后的目录
    labels_file: 包含文件名和类别信息的CSV文件路径
    val_split: 验证集占比
    """
    # 创建目标文件夹
    train_dir = os.path.join(dst_dir, 'train')
    val_dir = os.path.join(dst_dir, 'val')
    os.makedirs(train_dir, exist_ok=True)
    os.makedirs(val_dir, exist_ok=True)
    
    # 读取标签文件
    labels_df = pd.read_csv(labels_file)
    print(f"总共找到 {len(labels_df)} 张图片")
    
    # 按类别分组
    class_groups = defaultdict(list)
    for _, row in labels_df.iterrows():
        filename = row['filename']
        # 使用category_id作为类别名称
        class_name = str(row['category_id'])  # 转换为字符串以用作文件夹名称
        class_groups[class_name].append(filename)
    
    print(f"总共 {len(class_groups)} 个类别")
    
    # 创建类别文件夹并分配文件
    for class_name, files in class_groups.items():
        os.makedirs(os.path.join(train_dir, class_name), exist_ok=True)
        os.makedirs(os.path.join(val_dir, class_name), exist_ok=True)
        
        # 随机分割训练集和验证集
        random.shuffle(files)
        split_idx = int(len(files) * (1 - val_split))
        train_files = files[:split_idx]
        val_files = files[split_idx:]
        
        # 复制训练文件
        for file in train_files:
            src_path = os.path.join(src_dir, file)
            dst_path = os.path.join(train_dir, class_name, file)
            try:
                shutil.copy2(src_path, dst_path)
            except Exception as e:
                print(f"复制文件 {file} 时出错: {e}")
        
        # 复制验证文件
        for file in val_files:
            src_path = os.path.join(src_dir, file)
            dst_path = os.path.join(val_dir, class_name, file)
            try:
                shutil.copy2(src_path, dst_path)
            except Exception as e:
                print(f"复制文件 {file} 时出错: {e}")
    
    print(f"数据集重新组织完成")
    return len(class_groups)

In [4]:
# 检查是否已经重新组织过数据
data_dir = '../train'
organized_data_dir = '../organized_train'
labels_file = '../train_labels.csv'

# 如果还没有组织数据，则进行组织
if not os.path.exists(organized_data_dir):
    print("正在根据标签文件重新组织数据集...")
    num_classes = reorganize_dataset_by_label(data_dir, organized_data_dir, labels_file)
else:
    # 计算类别数
    train_classes = os.listdir(os.path.join(organized_data_dir, 'train'))
    num_classes = len(train_classes)
    print(f"使用已有的组织好的数据集，共 {num_classes} 个类别")

使用已有的组织好的数据集，共 100 个类别


In [5]:
try:
    image_datasets = {}
    for x in ['train', 'val']:
        image_datasets[x] = datasets.ImageFolder(
            os.path.join(organized_data_dir, x),
            transform=data_transforms[x]
        )

    dataloaders = {x: torch.utils.data.DataLoader(
                        image_datasets[x], 
                        batch_size=64,           # 适中的批处理大小
                        shuffle=True, 
                        num_workers=0,           # 适中的并行数
                     ) for x in ['train', 'val']}

    dataset_sizes = {x: len(image_datasets[x]) for x in ['train', 'val']}
    class_names = image_datasets['train'].classes

    print(f"训练集大小: {dataset_sizes['train']}")
    print(f"验证集大小: {dataset_sizes['val']}")
    print(f"类别数: {len(class_names)}")
except Exception as e:
    print(f"加载数据集时出错: {e}")
    print("请检查数据目录结构是否正确")

训练集大小: 15930
验证集大小: 3998
类别数: 100


In [6]:
# 模型定义和设置

try:
    weights = models.ConvNeXt_Weights.DEFAULT
    model_ft = models.convnext_base(weights=weights)
    print("使用最新的ConvNeXt预训练权重")
except AttributeError:
    # 对于旧版本的torchvision
    model_ft = models.convnext_base(pretrained=True)
    print("使用旧版pretrained参数")

# 冻结所有参数
for param in model_ft.parameters():
    param.requires_grad = False

# 替换最后的全连接层以适应我们的分类任务
num_ftrs=model_ft.classifier[2].in_features
model_ft.classifier[2] = nn.Linear(num_ftrs, num_classes)

# 设置设备
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model_ft = model_ft.to(device)



使用旧版pretrained参数


In [7]:
# 定义损失函数和优化器
criterion = nn.CrossEntropyLoss(label_smoothing=0.1)

# 只有分类层的参数需要优化
optimizer_ft = optim.Adam(model_ft.classifier[2].parameters(), lr=0.001,weight_decay=1e-4)

# 学习率调度器
exp_lr_scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer_ft, T_max=10, eta_min=0.0001)

In [8]:
def train_model(model, criterion, optimizer, scheduler, num_epochs=25):
    best_model_wts = model.state_dict()
    best_acc = 0.0

    for epoch in range(num_epochs):
        print(f'Epoch {epoch}/{num_epochs - 1}')
        print('-' * 10)

        # 每个epoch都有训练和验证阶段
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # 训练模式
            else:
                model.eval()   # 验证模式

            running_loss = 0.0
            running_corrects = 0
            
            # 获取当前阶段的数据集大小
            phase_size = dataset_sizes[phase]
            batch_count = len(dataloaders[phase])
            
            # 遍历数据（添加异常处理）
            for i, (inputs, labels) in enumerate(dataloaders[phase]):
                try:  # 添加异常处理
                    inputs = inputs.to(device)
                    labels = labels.to(device)

                    # zero the parameter gradients
                    optimizer.zero_grad()

                    # 前向传播
                    with torch.set_grad_enabled(phase == 'train'):
                        outputs = model(inputs)
                        _, preds = torch.max(outputs, 1)
                        loss = criterion(outputs, labels)

                        # 反向传播 + 优化（仅在训练阶段）
                        if phase == 'train':
                            loss.backward()
                            optimizer.step()

                    # 统计
                    running_loss += loss.item() * inputs.size(0)
                    running_corrects += torch.sum(preds == labels.data)
                    
                    # 打印批次进度
                    if (i + 1) % 10 == 0 or (i + 1) == batch_count:
                        print(f'{phase} Epoch {epoch} - Batch {i+1}/{batch_count} completed')
                
                except Exception as e: 
                    print(f"跳过批次 {i} (错误: {str(e)[:50]}...)")
                    continue  

            # 对于CosineAnnealingWarmRestarts，每个batch后更新学习率
            if phase == 'train':
                if isinstance(scheduler, optim.lr_scheduler.CosineAnnealingWarmRestarts):
                    scheduler.step()
                else:
                    scheduler.step()

            epoch_loss = running_loss / max(phase_size, 1)
            epoch_acc = running_corrects.double() / max(phase_size, 1)

            print(f'{phase} Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}')

            # 深拷贝模型
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = model.state_dict()
                print(f'最佳验证准确率: {best_acc:.4f}')

        print()

    print(f'Best val Acc: {best_acc:4f}')

    # 加载最佳模型权重
    model.load_state_dict(best_model_wts)
    return model

In [9]:
# 训练模型
print("开始训练模型...")
try:
    model_ft = train_model(model_ft, criterion, optimizer_ft, exp_lr_scheduler,
                           num_epochs=50
                           )
    
    # 保存模型
    torch.save(model_ft.state_dict(), 'convnext_large_flower_plus_recognition.pth')
    print("模型已保存为 convnext_large_plus_flower_recognition.pth")
except Exception as e:
    print(f"训练过程中出现错误: {e}")

开始训练模型...
Epoch 0/49
----------
train Epoch 0 - Batch 10/249 completed
train Epoch 0 - Batch 20/249 completed
train Epoch 0 - Batch 30/249 completed
train Epoch 0 - Batch 40/249 completed
train Epoch 0 - Batch 50/249 completed
train Epoch 0 - Batch 60/249 completed
train Epoch 0 - Batch 70/249 completed
train Epoch 0 - Batch 80/249 completed
train Epoch 0 - Batch 90/249 completed
train Epoch 0 - Batch 100/249 completed
train Epoch 0 - Batch 110/249 completed
train Epoch 0 - Batch 120/249 completed
train Epoch 0 - Batch 130/249 completed
train Epoch 0 - Batch 140/249 completed
train Epoch 0 - Batch 150/249 completed
train Epoch 0 - Batch 160/249 completed
train Epoch 0 - Batch 170/249 completed
train Epoch 0 - Batch 180/249 completed
train Epoch 0 - Batch 190/249 completed
train Epoch 0 - Batch 200/249 completed
train Epoch 0 - Batch 210/249 completed
train Epoch 0 - Batch 220/249 completed
train Epoch 0 - Batch 230/249 completed
train Epoch 0 - Batch 240/249 completed
train Epoch 0 - B