In [1]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Subset, ConcatDataset
from transformers import ViTForImageClassification, ViTFeatureExtractor
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from torchvision import datasets, transforms
import numpy as np
import copy

# 1. 모델, 디바이스 및 데이터 전처리 설정
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_name = "google/vit-base-patch16-224-in21k"

# ViT 모델 및 feature extractor 설정
feature_extractor = ViTFeatureExtractor.from_pretrained(model_name)
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std)
])

# 2. 데이터셋 로드 (여기서는 ImageFolder로 라벨이 있는 데이터를 사용)
trainset_path = 'C:/Users/jongcheol/OneDrive/바탕 화면/Semester2/train_data'
trainset = datasets.ImageFolder(root=trainset_path, transform=transform)
labels = np.array([label for _, label in trainset.imgs])

# 3. 5-Fold Cross Validation 설정
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
fold_accuracies_vit = []             # 기본 ViT 성능
fold_accuracies_noisy_student = []   # Noisy Student 적용 ViT 성능

# 4. Cross Validation 학습 및 평가
for fold, (train_idx, val_idx) in enumerate(kf.split(np.zeros(len(labels)), labels)):
    print(f"\n=== Fold {fold + 1} 시작 ===")

    # Fold별 데이터셋 분리
    train_subset = Subset(trainset, train_idx)
    val_subset = Subset(trainset, val_idx)
    train_loader = DataLoader(train_subset, batch_size=16, shuffle=True, num_workers=0)
    val_loader = DataLoader(val_subset, batch_size=16, shuffle=False, num_workers=0)

    # 5. 기본 ViT 모델 학습 및 평가
    vit_model = ViTForImageClassification.from_pretrained(model_name, num_labels=len(trainset.classes)).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(vit_model.parameters(), lr=2e-5)

    # 기본 ViT 학습
    vit_model.train()
    for epoch in range(3):  # 예시로 3 에포크만 수행
        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = vit_model(images).logits
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

    # 기본 ViT 성능 평가
    vit_model.eval()
    vit_preds, vit_labels = [], []
    with torch.no_grad():
        for images, labels in val_loader:
            images = images.to(device)
            outputs = vit_model(images).logits
            _, preds = torch.max(outputs, 1)
            vit_preds.extend(preds.cpu().numpy())
            vit_labels.extend(labels.numpy())
    fold_accuracy_vit = accuracy_score(vit_labels, vit_preds)
    fold_accuracies_vit.append(fold_accuracy_vit)
    print(f"Fold {fold + 1} Basic ViT Accuracy: {fold_accuracy_vit * 100:.2f}%")

    # 6. Noisy Student 기법 - Teacher 모델 학습
    teacher_model = ViTForImageClassification.from_pretrained(model_name, num_labels=len(trainset.classes)).to(device)
    optimizer = torch.optim.Adam(teacher_model.parameters(), lr=2e-5)

    # Teacher 모델 학습
    teacher_model.train()
    for epoch in range(3):
        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = teacher_model(images).logits
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

    # 7. Pseudo-labeling: Teacher 모델로 가짜 라벨 생성
    pseudo_labels = []
    pseudo_dataset = []
    teacher_model.eval()
    with torch.no_grad():
        for images, _ in val_loader:
            images = images.to(device)
            outputs = teacher_model(images).logits
            _, preds = torch.max(outputs, 1)
            pseudo_labels.extend(preds.cpu().numpy())
            pseudo_dataset.extend(images.cpu())

    # Noisy Student 데이터셋 생성
    pseudo_labels = torch.tensor(pseudo_labels)
    noisy_student_dataset = [(img, label) for img, label in zip(pseudo_dataset, pseudo_labels)]
    noisy_student_loader = DataLoader(noisy_student_dataset, batch_size=16, shuffle=True)

    # 8. Student 모델 학습 (Noisy Student 기법 적용)
    student_model = ViTForImageClassification.from_pretrained(model_name, num_labels=len(trainset.classes)).to(device)
    optimizer = torch.optim.Adam(student_model.parameters(), lr=2e-5)
    student_model.train()
    for epoch in range(3):
        for images, labels in noisy_student_loader:
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = student_model(images).logits
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

    # 9. Noisy Student 모델 성능 평가
    student_model.eval()
    student_preds, student_labels = [], []
    with torch.no_grad():
        for images, labels in val_loader:
            images = images.to(device)
            outputs = student_model(images).logits
            _, preds = torch.max(outputs, 1)
            student_preds.extend(preds.cpu().numpy())
            student_labels.extend(labels.numpy())
    fold_accuracy_noisy_student = accuracy_score(student_labels, student_preds)
    fold_accuracies_noisy_student.append(fold_accuracy_noisy_student)
    print(f"Fold {fold + 1} Noisy Student ViT Accuracy: {fold_accuracy_noisy_student * 100:.2f}%")

# 10. 5-Fold 평균 정확도 비교 출력
print("\n=== 최종 5-Fold 평균 정확도 ===")
print(f"Basic ViT Model: {np.mean(fold_accuracies_vit) * 100:.2f}%")
print(f"Noisy Student ViT Model: {np.mean(fold_accuracies_noisy_student) * 100:.2f}%")


  from .autonotebook import tqdm as notebook_tqdm



=== Fold 1 시작 ===


Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fold 1 Basic ViT Accuracy: 95.96%


Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fold 1 Noisy Student ViT Accuracy: 92.96%

=== Fold 2 시작 ===


Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


KeyboardInterrupt: 