In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchtext.datasets import IMDB
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

# 하이퍼파라미터 설정
BATCH_SIZE = 64
EMBEDDING_DIM = 128
HIDDEN_DIM = 256
NUM_CLASSES = 2
NUM_EPOCHS = 5
LEARNING_RATE = 0.001
MAX_VOCAB_SIZE = 10000
MAX_SEQ_LEN = 500

# 데이터 준비
tokenizer = get_tokenizer('basic_english')

def yield_tokens(data_iter):
    for label, text in data_iter:
        yield tokenizer(text)

# IMDB 데이터셋 로드
train_iter = IMDB(split='train')
test_iter = IMDB(split='test')

# 어휘 사전 구축
vocab = build_vocab_from_iterator(yield_tokens(train_iter), max_tokens=MAX_VOCAB_SIZE, specials=["<pad>", "<unk>"])
vocab.set_default_index(vocab["<unk>"])

# 데이터셋을 다시 로드하여 사용 (중요)
train_iter = IMDB(split='train')
test_iter = IMDB(split='test')

def text_pipeline(x):
    tokens = tokenizer(x)
    token_ids = [vocab[token] for token in tokens]
    # 시퀀스 길이 조정
    if len(token_ids) > MAX_SEQ_LEN:
        token_ids = token_ids[:MAX_SEQ_LEN]
    else:
        token_ids += [vocab["<pad>"]] * (MAX_SEQ_LEN - len(token_ids))
    return token_ids

def label_pipeline(x):
    return 1 if x == 'pos' else 0

# 데이터셋 생성
class IMDBDataset(torch.utils.data.Dataset):
    def __init__(self, data_iter):
        self.data = []
        for label, text in data_iter:
            self.data.append((torch.tensor(text_pipeline(text), dtype=torch.long),
                              label_pipeline(label)))
    def __len__(self):
        return len(self.data)
    def __getitem__(self, idx):
        return self.data[idx]

train_dataset = IMDBDataset(IMDB(split='train'))
test_dataset = IMDBDataset(IMDB(split='test'))

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

# 모델 정의
class CNNTransformerModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_classes):
        super(CNNTransformerModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # CNN 인코더
        self.cnn_encoder = nn.Conv1d(in_channels=embedding_dim, out_channels=embedding_dim, kernel_size=5, padding=2, stride=2)

        # 트랜스포머 인코더 레이어
        encoder_layer = nn.TransformerEncoderLayer(d_model=embedding_dim, nhead=8)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=2)

        # CNN 디코더
        self.cnn_decoder = nn.ConvTranspose1d(in_channels=embedding_dim, out_channels=embedding_dim, kernel_size=5, padding=2, stride=2, output_padding=1)

        # 출력 레이어
        self.fc = nn.Linear(embedding_dim * MAX_SEQ_LEN, num_classes)

    def forward(self, x):
        # 임베딩
        x = self.embedding(x)  # [batch_size, seq_len, embedding_dim]
        x = x.permute(0, 2, 1)  # [batch_size, embedding_dim, seq_len]

        # CNN 인코더로 압축
        x = self.cnn_encoder(x)  # [batch_size, embedding_dim, seq_len/2]
        x = nn.ReLU()(x)
        x = x.permute(2, 0, 1)  # [seq_len/2, batch_size, embedding_dim]

        # 트랜스포머에 전달
        x = self.transformer_encoder(x)  # [seq_len/2, batch_size, embedding_dim]

        # CNN 디코더로 복원
        x = x.permute(1, 2, 0)  # [batch_size, embedding_dim, seq_len/2]
        x = self.cnn_decoder(x)  # [batch_size, embedding_dim, seq_len]
        x = nn.ReLU()(x)
        x = x.reshape(x.size(0), -1)  # [batch_size, embedding_dim * seq_len]

        # 출력 레이어
        logits = self.fc(x)  # [batch_size, num_classes]
        return logits

# 모델 초기화
model = CNNTransformerModel(len(vocab), EMBEDDING_DIM, HIDDEN_DIM, NUM_CLASSES)

# 손실 함수와 옵티마이저 설정
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

# 모델 학습
for epoch in range(NUM_EPOCHS):
    model.train()
    total_loss = 0
    for texts, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(texts)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch [{epoch+1}/{NUM_EPOCHS}], Loss: {avg_loss:.4f}")

# 모델 평가
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for texts, labels in test_loader:
        outputs = model(texts)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
accuracy = correct / total
print(f"Test Accuracy: {accuracy * 100:.2f}%")




Epoch [1/5], Loss: 0.0022
Epoch [2/5], Loss: 0.0000


KeyboardInterrupt: 