In [1]:
!wget https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xzf aclImdb_v1.tar.gz

--2025-06-16 08:34:32--  https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: ‘aclImdb_v1.tar.gz.1’


2025-06-16 08:34:34 (36.6 MB/s) - ‘aclImdb_v1.tar.gz.1’ saved [84125825/84125825]



# **실습 코드를 기반으로 똑같이 LSTM 기반 학습 코드를 구현하여 10 epoch training에 대한 train / test 결과를 확인하세요. (20)**

In [7]:
import numpy as np
import torch
seed = 1234
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True

import os
import pandas as pd
def load_imdb_split(split_dir):
    texts, labels = [], []
    for label in ['pos', 'neg']:
      folder = os.path.join(split_dir, label)
      for filename in os.listdir(folder):
        with open(os.path.join(folder, filename), 'r', encoding='utf-8') as f:
          texts.append(f.read())
          labels.append(1 if label == 'pos' else 0)
    return pd.DataFrame({'text': texts, 'label': labels})
train_df = load_imdb_split('aclImdb/train')
test_df = load_imdb_split('aclImdb/test')

from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

tokenizer = get_tokenizer("basic_english")
def yield_tokens(text_series):
    for text in text_series:
       yield tokenizer(text)

# Build vocab using training data only
vocab = build_vocab_from_iterator(yield_tokens(train_df['text']), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])

MAX_LEN=256
def encode_text(text):
    tokens = tokenizer(text)
    return vocab(tokens[:MAX_LEN])

train_df['input_ids'] = train_df['text'].apply(encode_text)
test_df['input_ids'] = test_df['text'].apply(encode_text)

import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import TensorDataset, DataLoader
def create_tensor_dataset(inputs, labels):
    tensor_seqs = [torch.tensor(x) for x in inputs]
    padded_seqs = pad_sequence(tensor_seqs, batch_first=True)
    labels_tensor = torch.tensor(list(labels), dtype=torch.long)

    return TensorDataset(padded_seqs, labels_tensor)

train_dataset = create_tensor_dataset(train_df['input_ids'], train_df['label'])
test_dataset = create_tensor_dataset(test_df['input_ids'], test_df['label'])

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64)

import torch.nn as nn
class SimpleLSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim=100, hidden_dim=128, output_dim=1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim,
        padding_idx=vocab["<unk>"])
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        embedded = self.embedding(x) # (batch, seq_len, embedding_dim)
        output, (hidden, _) = self.lstm(embedded) # hidden: (1, batch, hidden_dim)
        hidden = hidden.squeeze(0) # (batch, hidden_dim)
        return self.sigmoid(self.fc(hidden))

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SimpleLSTMClassifier(vocab_size=len(vocab), embedding_dim = 300, hidden_dim =
128).to(device)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)


from tqdm import tqdm
def train_loop(model, train_loader, test_loader, optimizer, criterion, device, epochs=5):
    model.to(device)
    for epoch in range(1, epochs + 1):
        model.train()
        total_loss = 0
        total_correct = 0
        print(f"\n Epoch {epoch}/{epochs}")
        progress_bar = tqdm(enumerate(train_loader), total=len(train_loader), desc="Training")
        for batch_idx, (x_batch, y_batch) in progress_bar:
            x_batch, y_batch = x_batch.to(device), y_batch.to(device).float()
            optimizer.zero_grad()
            outputs = model(x_batch).squeeze(1)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            predictions = (outputs >= 0.5).long()
            total_correct += (predictions == y_batch).sum().item()
            progress_bar.set_postfix(loss=loss.item())
        train_acc = total_correct / len(train_loader.dataset)
        avg_loss = total_loss / len(train_loader)
        model.eval()
        with torch.no_grad():
            correct = 0
            for x_batch, y_batch in test_loader:
                x_batch, y_batch = x_batch.to(device), y_batch.to(device).float()
                outputs = model(x_batch).squeeze(1)
                preds = (outputs >= 0.5).long()
                correct += (preds == y_batch).sum().item()
            test_acc = correct / len(test_loader.dataset)

        print(f"Epoch {epoch} Summary - Avg Loss: {avg_loss:.4f}, Train Acc: {train_acc:.4f}, Test Acc: {test_acc:.4f}")
    return model

model = train_loop(
model=model,
train_loader=train_loader,
test_loader=test_loader,
optimizer=optimizer,
criterion=criterion,
device=device,
epochs=10
)


 Epoch 1/10


Training: 100%|██████████| 391/391 [00:09<00:00, 43.23it/s, loss=0.706]


Epoch 1 Summary - Avg Loss: 0.6932, Train Acc: 0.5064, Test Acc: 0.5038

 Epoch 2/10


Training: 100%|██████████| 391/391 [00:08<00:00, 47.82it/s, loss=0.676]


Epoch 2 Summary - Avg Loss: 0.6579, Train Acc: 0.6098, Test Acc: 0.5276

 Epoch 3/10


Training: 100%|██████████| 391/391 [00:08<00:00, 47.91it/s, loss=0.515]


Epoch 3 Summary - Avg Loss: 0.6119, Train Acc: 0.6470, Test Acc: 0.6026

 Epoch 4/10


Training: 100%|██████████| 391/391 [00:08<00:00, 46.84it/s, loss=0.574]


Epoch 4 Summary - Avg Loss: 0.5527, Train Acc: 0.7104, Test Acc: 0.5299

 Epoch 5/10


Training: 100%|██████████| 391/391 [00:08<00:00, 47.07it/s, loss=0.49]


Epoch 5 Summary - Avg Loss: 0.5075, Train Acc: 0.7400, Test Acc: 0.6811

 Epoch 6/10


Training: 100%|██████████| 391/391 [00:08<00:00, 46.67it/s, loss=0.562]


Epoch 6 Summary - Avg Loss: 0.4660, Train Acc: 0.7675, Test Acc: 0.7085

 Epoch 7/10


Training: 100%|██████████| 391/391 [00:08<00:00, 47.01it/s, loss=0.291]


Epoch 7 Summary - Avg Loss: 0.4576, Train Acc: 0.7524, Test Acc: 0.7157

 Epoch 8/10


Training: 100%|██████████| 391/391 [00:08<00:00, 47.02it/s, loss=0.277]


Epoch 8 Summary - Avg Loss: 0.3553, Train Acc: 0.8464, Test Acc: 0.7652

 Epoch 9/10


Training: 100%|██████████| 391/391 [00:08<00:00, 46.64it/s, loss=0.445]


Epoch 9 Summary - Avg Loss: 0.3093, Train Acc: 0.8748, Test Acc: 0.7005

 Epoch 10/10


Training: 100%|██████████| 391/391 [00:08<00:00, 46.36it/s, loss=0.346]


Epoch 10 Summary - Avg Loss: 0.2594, Train Acc: 0.9032, Test Acc: 0.7810


In [22]:
import torchtext
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SimpleLSTMClassifier(vocab_size=len(vocab), embedding_dim = 300, hidden_dim =
128).to(device)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
vectors = torchtext.vocab.GloVe(name='6B', dim=300,cache='~/.vector_cache')
pretrained_embedding = vectors.get_vecs_by_tokens(vocab.get_itos())
model.embedding.weight.data = pretrained_embedding


In [15]:
model.embedding.weight.requires_grad = False

In [21]:
import torch.nn as nn

def init_weights(m):
    if isinstance(m, nn.Linear) or isinstance(m, nn.Embedding):
        nn.init.xavier_uniform_(m.weight)  # 예: Xavier 초기화
        if hasattr(m, 'bias') and m.bias is not None:
            nn.init.zeros_(m.bias)
    elif isinstance(m, nn.LSTM):
        for name, param in m.named_parameters():
            if 'weight_ih' in name:
                nn.init.xavier_uniform_(param.data)
            elif 'weight_hh' in name:
                nn.init.orthogonal_(param.data)
            elif 'bias' in name:
                nn.init.zeros_(param.data)

# 모델에 초기화 적용
naive_transformer_model.apply(init_weights)


TransformerClassifier(
  (embedding): Embedding(100683, 300, padding_idx=0)
  (pos_encoder): PositionalEncoding()
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-3): 4 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=300, out_features=300, bias=True)
        )
        (linear1): Linear(in_features=300, out_features=128, bias=True)
        (dropout): Dropout(p=0.3, inplace=False)
        (linear2): Linear(in_features=128, out_features=300, bias=True)
        (norm1): LayerNorm((300,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((300,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.3, inplace=False)
        (dropout2): Dropout(p=0.3, inplace=False)
      )
    )
  )
  (classifier): Linear(in_features=300, out_features=1, bias=True)
)

# **epoch, learning_rate, batch_size를 고정한채 LSTM 기반 모델의 test set 성능을 87% 이상으로 올려보고 어떻게 성능을 향상시켰는지 작성하세요. (20) +Glove (6B, 300dim) pretrained word embedding 모델을 적용하여 성능을 확인하세요. (20)**

In [16]:
model = train_loop(
model=model,
train_loader=train_loader,
test_loader=test_loader,
optimizer=optimizer,
criterion=criterion,
device=device,
epochs=10
)


 Epoch 1/10


Training: 100%|██████████| 391/391 [00:04<00:00, 83.49it/s, loss=0.685]


Epoch 1 Summary - Avg Loss: 0.6868, Train Acc: 0.5282, Test Acc: 0.5090

 Epoch 2/10


Training: 100%|██████████| 391/391 [00:04<00:00, 83.67it/s, loss=0.64]


Epoch 2 Summary - Avg Loss: 0.6722, Train Acc: 0.5817, Test Acc: 0.6434

 Epoch 3/10


Training: 100%|██████████| 391/391 [00:04<00:00, 82.54it/s, loss=0.586]


Epoch 3 Summary - Avg Loss: 0.6148, Train Acc: 0.6869, Test Acc: 0.7507

 Epoch 4/10


Training: 100%|██████████| 391/391 [00:04<00:00, 80.66it/s, loss=0.608]


Epoch 4 Summary - Avg Loss: 0.6000, Train Acc: 0.6981, Test Acc: 0.7208

 Epoch 5/10


Training: 100%|██████████| 391/391 [00:04<00:00, 82.58it/s, loss=0.313]


Epoch 5 Summary - Avg Loss: 0.4310, Train Acc: 0.8100, Test Acc: 0.8476

 Epoch 6/10


Training: 100%|██████████| 391/391 [00:04<00:00, 81.96it/s, loss=0.442]


Epoch 6 Summary - Avg Loss: 0.3340, Train Acc: 0.8591, Test Acc: 0.8600

 Epoch 7/10


Training: 100%|██████████| 391/391 [00:04<00:00, 83.56it/s, loss=0.358]


Epoch 7 Summary - Avg Loss: 0.3058, Train Acc: 0.8713, Test Acc: 0.8686

 Epoch 8/10


Training: 100%|██████████| 391/391 [00:04<00:00, 83.20it/s, loss=0.292]


Epoch 8 Summary - Avg Loss: 0.2835, Train Acc: 0.8819, Test Acc: 0.8684

 Epoch 9/10


Training: 100%|██████████| 391/391 [00:04<00:00, 84.81it/s, loss=0.0967]


Epoch 9 Summary - Avg Loss: 0.2609, Train Acc: 0.8922, Test Acc: 0.8731

 Epoch 10/10


Training: 100%|██████████| 391/391 [00:04<00:00, 83.83it/s, loss=0.129]


Epoch 10 Summary - Avg Loss: 0.2378, Train Acc: 0.9045, Test Acc: 0.8726


In [27]:
class TransformerClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim=300, num_heads=4, hidden_dim=128,
                 num_layers=2, num_classes=1, max_len=256, dropout=0.1, pad_idx=0):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.pos_encoder = PositionalEncoding(embedding_dim, max_len)

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embedding_dim,
            nhead=num_heads,
            dim_feedforward=hidden_dim,
            dropout=dropout,
            batch_first=True
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        self.classifier = nn.Linear(embedding_dim, 1)

    def forward(self, x):
        embedded = self.embedding(x)  # (batch, seq_len, emb_dim)
        encoded = self.pos_encoder(embedded)
        transformer_out = self.transformer_encoder(encoded)  # (batch, seq_len, emb_dim)
        pooled = transformer_out.mean(dim=1)  # mean pooling
        # return self.classifier(pooled).squeeze(1)  # (batch,)
        return torch.sigmoid(self.classifier(pooled))  # .squeeze(1) 제거, sigmoid 적용

naive_transformer_model = TransformerClassifier(
    vocab_size=len(vocab),
    embedding_dim=300,
    # num_heads=8,
    num_heads=4,
    hidden_dim=256,
    # num_layers=6,
    num_layers=2,
    pad_idx=vocab["<unk>"]
).to(device)

criterion = nn.BCELoss()
optimizer = torch.optim.Adam(naive_transformer_model.parameters(), lr=1e-3)

naive_transformer_model = train_loop(
    model=naive_transformer_model,
    train_loader=train_loader,
    test_loader=test_loader,
    optimizer=optimizer,
    criterion=criterion,
    device=device,
    epochs=10
)


 Epoch 1/10


Training: 100%|██████████| 391/391 [01:01<00:00,  6.35it/s, loss=0.685]


Epoch 1 Summary - Avg Loss: 0.6953, Train Acc: 0.5003, Test Acc: 0.5013

 Epoch 2/10


Training:  29%|██▉       | 113/391 [00:17<00:43,  6.37it/s, loss=0.711]


KeyboardInterrupt: 