In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
from torchvision import transforms
from torch.utils.data import DataLoader, random_split
import clip
from timm.loss import LabelSmoothingCrossEntropy
from torch.cuda.amp import autocast, GradScaler
from tqdm.notebook import tqdm
from PIL import Image
import pandas as pd
import os
import datetime

# 设备配置
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU'}")

# 数据预处理配置
clip_mean = (0.48145466, 0.4578275, 0.40821073)
clip_std = (0.26862954, 0.26130258, 0.27577711)

train_transform = transforms.Compose([
    transforms.RandomResizedCrop(224, scale=(0.6, 1.0)),
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(0.3, 0.3, 0.2),
    transforms.RandomRotation(15),
    transforms.RandomAffine(degrees=0, shear=0.15),
    transforms.GaussianBlur(kernel_size=3, sigma=(0.1, 3.0)),
    transforms.ToTensor(),
    transforms.RandomErasing(p=0.6, scale=(0.02, 0.2)),
    transforms.Normalize(clip_mean, clip_std)
])

test_transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(clip_mean, clip_std)
])

# 数据集加载
full_dataset = torchvision.datasets.ImageFolder(
    root='data/train',
    transform=train_transform
)
train_size = int(0.8 * len(full_dataset))
val_size = len(full_dataset) - train_size
train_dataset, val_dataset = random_split(
    full_dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(42)
)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, num_workers=4, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False, num_workers=4, pin_memory=True)

# 改进的模型架构
class EnhancedCLIPClassifier(nn.Module):
    def __init__(self, num_classes=11, unfreeze_layers=4):
        super().__init__()
        self.clip_model, _ = clip.load("ViT-B/32", device=device, jit=False)
        
        # 解冻最后N层Transformer blocks
        total_layers = 12
        for name, param in self.clip_model.visual.named_parameters():
            if "transformer.resblocks" in name:
                layer_num = int(name.split(".resblocks.")[1].split(".")[0])
                param.requires_grad = (layer_num >= total_layers - unfreeze_layers)
        
        # 注意力机制
        self.channel_attention = nn.Sequential(
            nn.Linear(512, 256),
            nn.Tanh(),
            nn.Linear(256, 512),
            nn.Sigmoid()
        )
        
        # 增强的分类器
        self.classifier = nn.Sequential(
            nn.LayerNorm(512),
            nn.Linear(512, 1024),
            nn.GELU(),
            nn.Dropout(0.5),
            nn.Linear(1024, 512),
            nn.BatchNorm1d(512),
            nn.GELU(),
            nn.Dropout(0.3),
            nn.Linear(512, num_classes)
        )
    
    def forward(self, x):
        with torch.no_grad():  # CLIP编码保持冻结
            features = self.clip_model.encode_image(x)
        features = features.float()
        
        # 通道注意力
        attention_weights = self.channel_attention(features)
        features = features * attention_weights
        
        return self.classifier(features)

# 训练函数（含混合精度训练）
def train_enhanced_model(model, criterion, optimizer, scheduler, dataloaders, num_epochs=30):
    scaler = GradScaler()
    history = {'train_loss': [], 'val_acc': [], 'val_loss': []}
    best_acc = 0.0
    
    for epoch in range(num_epochs):
        print(f"\nEpoch {epoch+1}/{num_epochs}")
        
        # 训练阶段
        model.train()
        running_loss = 0.0
        for inputs, labels in tqdm(dataloaders['train'], desc="Training"):
            inputs, labels = inputs.to(device), labels.to(device)
            
            optimizer.zero_grad()
            
            with autocast():
                outputs = model(inputs)
                loss = criterion(outputs, labels)
            
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            
            running_loss += loss.item() * inputs.size(0)
        
        # 验证阶段
        model.eval()
        val_loss = 0.0
        correct = 0
        total = 0
        
        with torch.no_grad():
            for inputs, labels in tqdm(dataloaders['val'], desc="Validating"):
                inputs, labels = inputs.to(device), labels.to(device)
                
                with autocast():
                    outputs = model(inputs)
                    loss = criterion(outputs, labels)
                
                val_loss += loss.item() * inputs.size(0)
                _, predicted = torch.max(outputs, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
        
        # 更新学习率
        if scheduler:
            scheduler.step()
        
        # 统计指标
        epoch_loss = running_loss / len(dataloaders['train'].dataset)
        val_loss = val_loss / len(dataloaders['val'].dataset)
        val_acc = correct / total
        
        history['train_loss'].append(epoch_loss)
        history['val_loss'].append(val_loss)
        history['val_acc'].append(val_acc)
        
        print(f"Train Loss: {epoch_loss:.4f} | Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.2%}")
        
        # 保存最佳模型
        if val_acc > best_acc:
            best_acc = val_acc
            torch.save(model.state_dict(), f'best_model_epoch{epoch+1}.pth')
            print(f"New best model saved at epoch {epoch+1} with acc {best_acc:.2%}")
    
    return history

# 初始化模型和训练参数
model = EnhancedCLIPClassifier(num_classes=11, unfreeze_layers=4).to(device)
criterion = LabelSmoothingCrossEntropy(smoothing=0.1)
optimizer = optim.AdamW([
    {'params': model.clip_model.visual.parameters(), 'lr': 1e-5},
    {'params': model.channel_attention.parameters(), 'lr': 1e-4},
    {'params': model.classifier.parameters(), 'lr': 1e-3}
], weight_decay=1e-4)

scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=30)

# 执行训练
history = train_enhanced_model(
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    scheduler=scheduler,
    dataloaders={'train': train_loader, 'val': val_loader},
    num_epochs=10
)

# 测试时增强（TTA）预测
def tta_predict(model, image_path, num_crops=10):
    model.eval()
    img = Image.open(image_path).convert('RGB')
    
    tta_transform = transforms.Compose([
        transforms.Resize(256),
        transforms.TenCrop(224),
        transforms.Lambda(lambda crops: [test_transform(crop) for crop in crops])
    ])
    
    crops = tta_transform(img)
    batch = torch.stack(crops[:num_crops]).to(device)
    
    with torch.no_grad(), autocast():
        outputs = model(batch)
        probs = torch.softmax(outputs, dim=1)
        avg_probs = torch.mean(probs, dim=0)
    
    return torch.argmax(avg_probs).item()

# 生成提交文件
test_folder = 'data/testA'
model.load_state_dict(torch.load('best_model_epoch9.pth'))  # 加载最佳模型
model.eval()

results = []
for img_name in tqdm(os.listdir(test_folder)):
    if not img_name.endswith('.png'):
        continue
    img_path = os.path.join(test_folder, img_name)
    pred = tta_predict(model, img_path)
    results.append({'id': img_name[:-4], 'label': pred})

df = pd.DataFrame(results)
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
df.to_csv(f'submission_{timestamp}.csv', index=False)
print("Submission file generated successfully!")

Using device: NVIDIA GeForce RTX 4060 Laptop GPU

Epoch 1/10


  scaler = GradScaler()


Training:   0%|          | 0/62 [00:00<?, ?it/s]

  with autocast():


Validating:   0%|          | 0/16 [00:00<?, ?it/s]

  with autocast():


Train Loss: 1.6684 | Val Loss: 1.2982 | Val Acc: 67.04%
New best model saved at epoch 1 with acc 67.04%

Epoch 2/10


Training:   0%|          | 0/62 [00:00<?, ?it/s]

Validating:   0%|          | 0/16 [00:00<?, ?it/s]

Train Loss: 1.4160 | Val Loss: 1.2769 | Val Acc: 67.24%
New best model saved at epoch 2 with acc 67.24%

Epoch 3/10


Training:   0%|          | 0/62 [00:00<?, ?it/s]

Validating:   0%|          | 0/16 [00:00<?, ?it/s]

Train Loss: 1.3446 | Val Loss: 1.2591 | Val Acc: 66.83%

Epoch 4/10


Training:   0%|          | 0/62 [00:00<?, ?it/s]

Validating:   0%|          | 0/16 [00:00<?, ?it/s]

Train Loss: 1.3054 | Val Loss: 1.2325 | Val Acc: 69.66%
New best model saved at epoch 4 with acc 69.66%

Epoch 5/10


Training:   0%|          | 0/62 [00:00<?, ?it/s]

Validating:   0%|          | 0/16 [00:00<?, ?it/s]

Train Loss: 1.2943 | Val Loss: 1.2037 | Val Acc: 69.66%

Epoch 6/10


Training:   0%|          | 0/62 [00:00<?, ?it/s]

Validating:   0%|          | 0/16 [00:00<?, ?it/s]

Train Loss: 1.2532 | Val Loss: 1.2136 | Val Acc: 69.56%

Epoch 7/10


Training:   0%|          | 0/62 [00:00<?, ?it/s]

Validating:   0%|          | 0/16 [00:00<?, ?it/s]

Train Loss: 1.2474 | Val Loss: 1.2322 | Val Acc: 68.95%

Epoch 8/10


Training:   0%|          | 0/62 [00:00<?, ?it/s]

Validating:   0%|          | 0/16 [00:00<?, ?it/s]

Train Loss: 1.2392 | Val Loss: 1.2118 | Val Acc: 69.66%

Epoch 9/10


Training:   0%|          | 0/62 [00:00<?, ?it/s]

Validating:   0%|          | 0/16 [00:00<?, ?it/s]

Train Loss: 1.2236 | Val Loss: 1.2097 | Val Acc: 69.76%
New best model saved at epoch 9 with acc 69.76%

Epoch 10/10


Training:   0%|          | 0/62 [00:00<?, ?it/s]

Validating:   0%|          | 0/16 [00:00<?, ?it/s]

Train Loss: 1.2079 | Val Loss: 1.2044 | Val Acc: 68.65%


OSError: [Errno 22] Invalid argument: 'best_model_epoch*.pth'

In [4]:
# 生成提交文件
test_folder = 'data/testA'
model.load_state_dict(torch.load('best_model_epoch9.pth'))  # 加载最佳模型
model.eval()

results = []
for img_name in tqdm(os.listdir(test_folder)):
    if not img_name.endswith('.png'):
        continue
    img_path = os.path.join(test_folder, img_name)
    pred = tta_predict(model, img_path)
    results.append({'id': img_name[:-4], 'label': pred})

df = pd.DataFrame(results)
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
df.to_csv(f'submission_{timestamp}.csv', index=False)
print("Submission file generated successfully!")

  0%|          | 0/1795 [00:00<?, ?it/s]

  with torch.no_grad(), autocast():


Submission file generated successfully!
