# 树叶分类

In [1]:
import torch
import collections
import math
import os
import shutil  # shutil库用于文件和文件夹的高级操作，例如复制、移动、删除等
import pandas as pd
import torch
import torchvision
from torch import nn

In [None]:
from torch.nn import functional as F
import time
import numpy as np
import matplotlib.pyplot as plt
from IPython import display
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import hashlib
import requests
import zipfile

In [None]:
# Load the train/test CSV manifests and take a first look at the labels.
data_dir = './classify-leaves'
train_df, test_df = (
    pd.read_csv(os.path.join(data_dir, csv_name))
    for csv_name in ('train.csv', 'test.csv')
)

print(f'训练集大小: {len(train_df)}')
print(f'测试集大小: {len(test_df)}')
print(f'\n训练集前几行:')
print(train_df.head())

# Distinct class names found in the 'label' column of the training manifest.
labels = train_df['label'].unique()
num_classes = len(labels)
print(f'\n类别数量: {num_classes}')
print(f'部分类别: {labels[:10]}')

In [None]:
# Build the two lookup tables between class names and integer indices.
# Indices follow the sorted order of the class names.
sorted_labels = sorted(labels)
idx_to_label = dict(enumerate(sorted_labels))
label_to_idx = {name: position for position, name in idx_to_label.items()}

print(f'标签映射示例:')
# Show the first five entries of the mapping as a sanity check.
for name in sorted_labels[:5]:
    print(f'{name} -> {label_to_idx[name]}')

In [None]:
# 自定义数据集类
from torch.utils.data import Dataset
from PIL import Image

class LeavesDataset(Dataset):
    """Leaf-classification dataset backed by a CSV manifest.

    Column 0 of the CSV holds the image path relative to ``root_dir``.
    For labelled data, column 1 holds the class name, which is translated
    to an integer index through ``label_to_idx``.
    """

    def __init__(self, csv_file, root_dir, label_to_idx=None, transform=None, is_test=False):
        """Read the manifest and store how samples should be produced.

        Args:
            csv_file (str): Path to the CSV manifest.
            root_dir (str): Directory the image paths in the CSV are relative to.
            label_to_idx (dict): Mapping from class name to integer index.
                Required unless ``is_test`` is true.
            transform (callable, optional): Applied to each loaded RGB image.
            is_test (bool): If true, samples are images only (no label).

        Raises:
            ValueError: If ``is_test`` is false and ``label_to_idx`` is None.
        """
        # Fail at construction time instead of deep inside a DataLoader
        # worker on the first label lookup.
        if not is_test and label_to_idx is None:
            raise ValueError('label_to_idx is required when is_test is False')

        self.data_frame = pd.read_csv(csv_file)
        self.root_dir = root_dir
        self.transform = transform
        self.is_test = is_test
        self.label_to_idx = label_to_idx

    def __len__(self):
        """Return the number of rows in the manifest."""
        return len(self.data_frame)

    def __getitem__(self, idx):
        """Load one sample.

        Returns:
            The (transformed) image when ``is_test`` is true, otherwise a
            ``(image, label_index)`` tuple.

        Raises:
            KeyError: If the row's class name is missing from ``label_to_idx``.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        # Column 0 is the image path relative to root_dir.
        img_name = os.path.join(self.root_dir, self.data_frame.iloc[idx, 0])
        # convert() returns a new image, so the file can be closed right away.
        with Image.open(img_name) as raw_image:
            image = raw_image.convert('RGB')

        if self.transform is not None:
            image = self.transform(image)

        if self.is_test:
            # Unlabelled data: image only.
            return image

        # Labelled data: column 1 is the class name.
        label = self.data_frame.iloc[idx, 1]
        return image, self.label_to_idx[label]


print('数据集类定义完成')

In [None]:
# 数据增强和预处理（增强版）
from torchvision import transforms

# Final two steps shared by both pipelines: tensor conversion followed by
# normalisation with the ImageNet channel statistics.
_to_normalized_tensor = [
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
]

# Training pipeline: random augmentations, then the shared tail.
train_transform = transforms.Compose([
    transforms.RandomResizedCrop(224, scale=(0.8, 1.0)),  # random crop, resized to 224
    transforms.RandomHorizontalFlip(),  # random horizontal flip
    transforms.RandomVerticalFlip(p=0.3),  # random vertical flip
    transforms.RandomRotation(30),  # random rotation up to 30 degrees
    transforms.ColorJitter(brightness=0.3, contrast=0.3, saturation=0.3),  # colour jitter
    *_to_normalized_tensor,
])

# Validation/test pipeline: deterministic resize + centre crop, no augmentation.
val_transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    *_to_normalized_tensor,
])

print('数据增强设置完成')

In [None]:
# 划分训练集和验证集
from sklearn.model_selection import train_test_split

# Stratified 80/20 split of the labelled data (fixed seed for repeatability).
train_data, val_data = train_test_split(
    train_df,
    test_size=0.2,
    random_state=42,
    stratify=train_df['label'],
)

# Write both subsets to CSV files inside the data directory so the dataset
# class can read them back by path.
train_csv_path = os.path.join(data_dir, 'train_split.csv')
val_csv_path = os.path.join(data_dir, 'val_split.csv')
for subset, csv_path in ((train_data, train_csv_path), (val_data, val_csv_path)):
    subset.to_csv(csv_path, index=False)

print(f'训练集大小: {len(train_data)}')
print(f'验证集大小: {len(val_data)}')

In [None]:
# Build the three datasets and their loaders.
batch_size = 32

# Arguments every dataset shares.
_dataset_common = dict(root_dir=data_dir, label_to_idx=label_to_idx)

# Labelled training split, with augmentation.
train_dataset = LeavesDataset(
    csv_file=train_csv_path,
    transform=train_transform,
    is_test=False,
    **_dataset_common,
)

# Labelled validation split, deterministic preprocessing only.
val_dataset = LeavesDataset(
    csv_file=val_csv_path,
    transform=val_transform,
    is_test=False,
    **_dataset_common,
)

# Unlabelled test set, same preprocessing as validation.
test_dataset = LeavesDataset(
    csv_file=os.path.join(data_dir, 'test.csv'),
    transform=val_transform,
    is_test=True,
    **_dataset_common,
)

# Only the training loader shuffles; evaluation order stays fixed so test
# predictions line up with the rows of test.csv.
_loader_common = dict(batch_size=batch_size, num_workers=4)
train_loader = DataLoader(train_dataset, shuffle=True, **_loader_common)
val_loader = DataLoader(val_dataset, shuffle=False, **_loader_common)
test_loader = DataLoader(test_dataset, shuffle=False, **_loader_common)

print(f'训练集批次数: {len(train_loader)}')
print(f'验证集批次数: {len(val_loader)}')
print(f'测试集批次数: {len(test_loader)}')

In [None]:
# GPU设备函数
def try_gpu(i=0):
    """Return the device for GPU ``i``, falling back to the CPU.

    The GPU is chosen only when at least ``i + 1`` CUDA devices are visible.
    """
    if torch.cuda.device_count() < i + 1:
        return torch.device('cpu')
    return torch.device(f'cuda:{i}')

print(f'可用GPU数量: {torch.cuda.device_count()}')

In [None]:
# 定义模型 - 使用ResNet34从头训练
import torchvision.models as models

def get_model(num_classes, model_name='resnet34', pretrained=False):
    """Build a torchvision ResNet whose final layer outputs ``num_classes`` logits.

    Args:
        num_classes: Number of output classes.
        model_name: One of 'resnet18', 'resnet34', 'resnet50'.
        pretrained: If true, start from pretrained weights; if false, the
            network is randomly initialised (training from scratch).

    Returns:
        The model, with its ``fc`` layer replaced by a new ``nn.Linear``.

    Raises:
        ValueError: If ``model_name`` is not one of the supported names.
    """
    builders = {
        'resnet18': models.resnet18,
        'resnet34': models.resnet34,
        'resnet50': models.resnet50,
    }
    if model_name not in builders:
        raise ValueError(f'不支持的模型: {model_name}')
    build = builders[model_name]

    try:
        # Newer torchvision deprecates `pretrained` in favour of `weights`.
        # 'IMAGENET1K_V1' names the checkpoint that `pretrained=True` selected.
        model = build(weights='IMAGENET1K_V1' if pretrained else None)
    except TypeError:
        # Older torchvision has no `weights` keyword; use the legacy flag.
        model = build(pretrained=pretrained)

    # Swap the classification head for one sized to our label set.
    num_features = model.fc.in_features
    model.fc = nn.Linear(num_features, num_classes)

    return model

# Instantiate ResNet-34 without pretrained weights and move it to the device.
device = try_gpu()
net = get_model(num_classes=num_classes, model_name='resnet34', pretrained=False).to(device)

print(f'使用设备: {device}')
print(f'模型: ResNet-34 (从头训练)')
print(f'模型类别数: {num_classes}')
print(f'模型最后一层: {net.fc}')

# Count all parameters, and separately those that receive gradients.
total_params = 0
trainable_params = 0
for param in net.parameters():
    param_count = param.numel()
    total_params += param_count
    if param.requires_grad:
        trainable_params += param_count
print(f'\n总参数量: {total_params:,}')
print(f'可训练参数: {trainable_params:,}')

In [None]:
# 定义训练函数
def train_epoch(net, train_loader, loss_fn, optimizer, device):
    """Run one optimisation pass over ``train_loader``.

    Progress (loss and accuracy of the current batch) is printed every
    50 batches.

    Returns:
        tuple: (size-weighted average loss, fraction of correct predictions)
        over all samples seen in this epoch.
    """
    net.train()
    loss_sum = 0.0
    hits = 0.0
    seen = 0
    n_batches = len(train_loader)

    for step, (images, labels) in enumerate(train_loader, start=1):
        images = images.to(device)
        labels = labels.to(device)

        # Forward pass.
        logits = net(images)
        loss = loss_fn(logits, labels)

        # Backward pass and parameter update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Running statistics; the batch loss is weighted by the batch size
        # (assumes loss_fn returns a per-batch mean).
        batch_loss = loss.item()
        batch_size_now = labels.size(0)
        predicted = logits.max(1)[1]
        batch_hits = predicted.eq(labels).sum().item()

        loss_sum += batch_loss * images.size(0)
        hits += batch_hits
        seen += batch_size_now

        # Periodic progress line.
        if step % 50 == 0:
            print(f'  Batch [{step}/{n_batches}], '
                  f'Loss: {batch_loss:.4f}, '
                  f'Acc: {100. * batch_hits / batch_size_now:.2f}%')

    return loss_sum / seen, hits / seen


def validate(net, val_loader, loss_fn, device):
    """Evaluate ``net`` on ``val_loader`` with gradients disabled.

    Returns:
        tuple: (size-weighted average loss, fraction of correct predictions)
        over all samples in the loader.
    """
    net.eval()
    loss_sum, hits, seen = 0.0, 0.0, 0

    with torch.no_grad():
        for batch_images, batch_labels in val_loader:
            batch_images = batch_images.to(device)
            batch_labels = batch_labels.to(device)

            logits = net(batch_images)
            batch_loss = loss_fn(logits, batch_labels).item()
            predicted = logits.max(1)[1]

            # Weight the batch loss by batch size (assumes loss_fn returns a
            # per-batch mean) so the result is a per-sample average.
            loss_sum += batch_loss * batch_images.size(0)
            hits += predicted.eq(batch_labels).sum().item()
            seen += batch_labels.size(0)

    return loss_sum / seen, hits / seen

print('训练和验证函数定义完成')

In [None]:
# Training configuration for ResNet-34 trained from scratch.
num_epochs = 50  # more epochs than fine-tuning would need
learning_rate = 0.01  # initial learning rate
weight_decay = 5e-4  # L2 regularisation strength
momentum = 0.9  # SGD momentum
lr_step_size = 15  # epochs between learning-rate decays
lr_gamma = 0.5  # multiplicative decay applied at each step

# Loss function.
loss_fn = nn.CrossEntropyLoss()

# Optimiser: SGD with momentum.
optimizer = torch.optim.SGD(
    net.parameters(),
    lr=learning_rate,
    momentum=momentum,
    weight_decay=weight_decay
)

# Scheduler: multiply the learning rate by lr_gamma every lr_step_size epochs.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=lr_step_size, gamma=lr_gamma)

# The summary is built from the variables above so it cannot drift from the
# values actually in use (it previously reported gamma=0.1 while 0.5 was used).
print(f'训练配置:')
print(f'  模型: ResNet-34 (从头训练)')
print(f'  Epochs: {num_epochs}')
print(f'  Learning Rate: {learning_rate}')
print(f'  Optimizer: SGD with Momentum ({momentum})')
print(f'  Weight Decay: {weight_decay}')
print(f'  Scheduler: StepLR (step_size={lr_step_size}, gamma={lr_gamma})')

In [None]:
# Main training loop: train, validate, step the scheduler, log, checkpoint.
best_val_acc = 0.0
history = {key: [] for key in ('train_loss', 'train_acc', 'val_loss', 'val_acc')}

print('开始训练...\n')
for epoch in range(num_epochs):
    print(f'Epoch {epoch + 1}/{num_epochs}')
    print('-' * 60)

    # One pass over the training data, then an evaluation pass.
    train_loss, train_acc = train_epoch(net, train_loader, loss_fn, optimizer, device)
    val_loss, val_acc = validate(net, val_loader, loss_fn, device)

    # Advance the learning-rate schedule once per epoch.
    scheduler.step()

    # Append this epoch's metrics to the history.
    epoch_metrics = (
        ('train_loss', train_loss),
        ('train_acc', train_acc),
        ('val_loss', val_loss),
        ('val_acc', val_acc),
    )
    for metric_name, metric_value in epoch_metrics:
        history[metric_name].append(metric_value)

    # Epoch summary; the learning rate is read after the scheduler step.
    print(f'\n训练集 - Loss: {train_loss:.4f}, Acc: {train_acc:.4f}')
    print(f'验证集 - Loss: {val_loss:.4f}, Acc: {val_acc:.4f}')
    print(f'当前学习率: {scheduler.get_last_lr()[0]:.6f}')

    # Checkpoint whenever validation accuracy strictly improves.
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save(net.state_dict(), 'best_model.pth')
        print(f'✓ 保存最佳模型 (验证准确率: {val_acc:.4f})')

    print()

print(f'\n训练完成！')
print(f'最佳验证准确率: {best_val_acc:.4f}')

In [None]:
# Plot the recorded training history: loss on the left, accuracy on the right.
plt.figure(figsize=(12, 4))

# (history key suffix, y-axis label, title, train legend, validation legend)
panels = (
    ('loss', 'Loss', 'Training and Validation Loss', 'Train Loss', 'Val Loss'),
    ('acc', 'Accuracy', 'Training and Validation Accuracy', 'Train Acc', 'Val Acc'),
)
for position, (suffix, y_label, title, train_legend, val_legend) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.plot(history[f'train_{suffix}'], label=train_legend)
    plt.plot(history[f'val_{suffix}'], label=val_legend)
    plt.xlabel('Epoch')
    plt.ylabel(y_label)
    plt.title(title)
    plt.legend()
    plt.grid(True)

plt.tight_layout()
plt.savefig('training_history.png', dpi=300, bbox_inches='tight')
plt.show()

print('训练历史已保存为 training_history.png')

In [None]:
# Restore the best checkpoint and predict the test set.
# map_location remaps the saved tensors onto the current device, so a
# checkpoint written on a GPU can still be loaded on a CPU-only machine.
best_state = torch.load('best_model.pth', map_location=device)
net.load_state_dict(best_state)
net.eval()

print('使用最佳模型进行预测...\n')

# Collect the predicted class index of every test image, in loader order.
predictions = []
with torch.no_grad():
    for images in test_loader:
        outputs = net(images.to(device))
        _, predicted = outputs.max(1)
        # tolist() yields plain Python ints, which are simple dictionary keys
        # for the later index -> label lookup.
        predictions.extend(predicted.tolist())

print(f'预测完成，共 {len(predictions)} 个样本')

In [None]:
# Build and save the submission file.
# Translate each predicted class index back to its class name.
predicted_labels = [idx_to_label[class_idx] for class_idx in predictions]

# Pair every test image path with its predicted label.
submission = pd.DataFrame({'image': test_df['image'], 'label': predicted_labels})

# Write the CSV without the DataFrame index column.
submission_path = 'submission.csv'
submission.to_csv(submission_path, index=False)

print(f'提交文件已保存为: {submission_path}')
print(f'\n提交文件前几行:')
print(submission.head(10))
print(f'\n提交文件统计:')
print(f'总样本数: {len(submission)}')
print(f'预测的唯一类别数: {submission["label"].nunique()}')

In [None]:
# (可选) 可视化一些验证集预测结果
def visualize_predictions(net, val_loader, device, num_images=8):
    """Plot up to ``num_images`` validation images with true and predicted labels.

    Images are laid out four per row; the number of rows grows with
    ``num_images`` (the default of 8 gives a 2x4 grid). Titles are green for
    correct predictions and red for wrong ones. The figure is saved to
    ``predictions_visualization.png`` and then shown.

    Args:
        net: Model to evaluate; it is switched to eval mode.
        val_loader: Iterable yielding ``(images, labels)`` batches, where the
            images were normalised with the ImageNet mean/std used below.
        device: Device the model lives on.
        num_images: Maximum number of images to display.
    """
    net.eval()

    # Size the grid from num_images instead of a fixed 2x4 layout, which
    # overflowed for num_images > 8.
    n_cols = 4
    n_rows = max(1, (num_images + n_cols - 1) // n_cols)
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(4 * n_cols, 4 * n_rows), squeeze=False)
    axes = axes.flatten()
    # Hide every axis up front so unused cells stay blank.
    for ax in axes:
        ax.axis('off')

    # Constants for undoing the input normalisation before display.
    mean = np.array([0.485, 0.456, 0.406])
    std = np.array([0.229, 0.224, 0.225])

    images_shown = 0
    with torch.no_grad():
        for images, labels in val_loader:
            if images_shown >= num_images:
                break

            outputs = net(images.to(device))
            _, predicted = outputs.max(1)

            for i in range(images.size(0)):
                if images_shown >= num_images:
                    break

                # CHW tensor -> HWC array, de-normalised and clipped to [0, 1].
                img = np.transpose(images[i].cpu().numpy(), (1, 2, 0))
                img = np.clip(std * img + mean, 0, 1)

                ax = axes[images_shown]
                ax.imshow(img)

                # Title with (truncated) true and predicted class names.
                true_label = idx_to_label[labels[i].item()]
                pred_label = idx_to_label[predicted[i].item()]
                color = 'green' if true_label == pred_label else 'red'
                ax.set_title(
                    f'True: {true_label[:15]}\nPred: {pred_label[:15]}',
                    fontsize=10,
                    color=color
                )

                images_shown += 1

    plt.tight_layout()
    plt.savefig('predictions_visualization.png', dpi=300, bbox_inches='tight')
    plt.show()
    print(f'预测结果可视化已保存为 predictions_visualization.png')

# Visualise predictions on the validation set.
visualize_predictions(net, val_loader, device, num_images=8)

## 训练总结

本notebook实现了树叶分类的完整训练流程（从头训练ResNet-34）：

### 1. 数据处理
- 训练集：18,353张图片
- 测试集：8,800张图片  
- 数据划分：80%训练，20%验证
- **数据增强（增强版）**：
  - RandomResizedCrop
  - RandomHorizontalFlip + RandomVerticalFlip
  - RandomRotation (30°)
  - ColorJitter（增强版）

### 2. 模型架构
- **ResNet-34**（从头训练，不使用预训练权重）
- 参数量：约21.8M
- 输出层：根据实际类别数量调整

### 3. 训练配置
- **优化器**: SGD with Momentum (0.9)
- **学习率**: 0.01（从头训练使用更大的初始学习率）
- **Weight Decay**: 5e-4
- **学习率调度**: StepLR (step_size=15, gamma=0.5)
- **损失函数**: CrossEntropyLoss
- **Batch size**: 32
- **Epochs**: 50（从头训练需要更多epochs）

### 4. 输出文件
- `best_model.pth`: 最佳模型权重
- `submission.csv`: Kaggle提交文件
- `training_history.png`: 训练历史曲线
- `predictions_visualization.png`: 预测结果可视化

### 5. 优化亮点
- 使用 ResNet-34 替代 ResNet-18，更强的特征提取能力
- 从头训练而非微调，学习更适合树叶数据的特征
- 更强的数据增强策略，缓解过拟合
- SGD + Momentum 优化器对从头训练更友好
- 更多的训练epochs以充分学习