In [1]:
# 라이브러리 임포트
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms

In [5]:
import requests

# Source page for the corpus. Note that the response is an HTML document even
# though it is stored below under a ".txt" file name.
dataset_file_origin = 'https://www.gutenberg.org/cache/epub/1513/pg1513-images.html#sceneIII_30.1'

# Send the request. A timeout is mandatory in practice: without one,
# requests waits indefinitely on an unresponsive server and the cell hangs.
response = requests.get(dataset_file_origin, timeout=30)

# Persist the body only when the server answered with HTTP 200
if response.status_code == 200:
    # Write the decoded response text to disk as UTF-8
    with open("romeo_and_juliet.txt", "w", encoding="utf-8") as file:
        file.write(response.text)
    print("파일이 성공적으로 다운로드되었습니다.")
else:
    print(f"파일 다운로드 실패: 상태 코드 {response.status_code}")

파일이 성공적으로 다운로드되었습니다.


In [8]:
# Load the saved page back into memory and preview its first 100 characters.
text = ""
with open("romeo_and_juliet.txt", mode="r", encoding="utf-8") as source_file:
    text = source_file.read()
print(text[:100])

<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8"><style>
#pg-header div, #pg-footer div


In [25]:
# Preprocessing: strip the HTML markup and keep only the paragraph text
import re
from bs4 import BeautifulSoup

# Parse the raw page with the 'html.parser' backend
soup = BeautifulSoup(text, 'html.parser')

# Select every <p> element whose class is "drama"
drama_paragraphs = soup.find_all('p', class_='drama')

# Join the paragraph texts, terminating each paragraph with a newline
cleaned_text = "".join(
    paragraph.get_text(separator="\n") + "\n" for paragraph in drama_paragraphs
)

# Trim the ends and collapse any whitespace-only gap between two newlines
# into exactly one blank line
cleaned_text = re.sub(r'\n\s*\n', '\n\n', cleaned_text.strip())

# Preview only the first 1000 characters of the result
print(cleaned_text[:1000])

ESCALUS, Prince of Verona.

MERCUTIO, kinsman to the Prince, and friend to Romeo.

PARIS, a young Nobleman, kinsman to the Prince.

Page to Paris.

MONTAGUE, head of a Veronese family at feud with the Capulets.

LADY MONTAGUE, wife to Montague.

ROMEO, son to Montague.

BENVOLIO, nephew to Montague, and friend to Romeo.

ABRAM, servant to Montague.

BALTHASAR, servant to Romeo.

CAPULET, head of a Veronese family at feud with the Montagues.

LADY CAPULET, wife to Capulet.

JULIET, daughter to Capulet.

TYBALT, nephew to Lady Capulet.

CAPULET’S COUSIN, an old man.

NURSE to Juliet.

PETER, servant to Juliet’s Nurse.

SAMPSON, servant to Capulet.

GREGORY, servant to Capulet.

Servants.

FRIAR LAWRENCE, a Franciscan.

FRIAR JOHN, of the same Order.

An Apothecary.

CHORUS.

Three Musicians.

An Officer.

Citizens of Verona; several Men and Women, relations to both
houses; Maskers, Guards, Watchmen and Attendants.

CHORUS.

Two households, both alike in dignity,

In fair Verona, where we

In [34]:
# Preprocessing: build the sorted character vocabulary and both lookup tables
chars = sorted(set(cleaned_text))
char_to_idx = dict(zip(chars, range(len(chars))))  # character -> index
idx_to_char = dict(enumerate(chars))               # index -> character

In [35]:
# Helper that builds the (input window, next character) training pairs
def create_sequences(text, seq_length, mapping=None):
    """Slide a fixed-size window over ``text`` and encode it as index lists.

    For every start position ``i`` the window ``text[i:i + seq_length]`` becomes
    one input sequence and the character right after it becomes its target.

    Args:
        text: String to slice into training samples.
        seq_length: Number of characters per input window (must be >= 0).
        mapping: Optional character -> index dictionary. When omitted, the
            module-level ``char_to_idx`` table is used.

    Returns:
        A ``(sequences, targets)`` tuple: ``sequences`` is a list of index
        lists of length ``seq_length`` and ``targets`` is a list with one index
        per sequence. Both are empty when ``text`` is not longer than
        ``seq_length``.

    Raises:
        ValueError: If ``seq_length`` is negative.
        KeyError: If a character of ``text`` is missing from the mapping.
    """
    if seq_length < 0:
        raise ValueError("seq_length must be non-negative")

    num_samples = len(text) - seq_length
    if num_samples <= 0:
        return [], []

    if mapping is None:
        mapping = char_to_idx

    # Encode every character exactly once; windows are then plain list slices
    # instead of re-encoding each character seq_length times.
    encoded = [mapping[char] for char in text]
    sequences = [encoded[start:start + seq_length] for start in range(num_samples)]
    targets = encoded[seq_length:]
    return sequences, targets

In [36]:
# Window size: number of characters in each input sequence
seq_length = 100

# Build the encoded input windows and their next-character targets
sequences, targets = create_sequences(text=cleaned_text, seq_length=seq_length)

In [37]:
# Inspect the first ten encoded input windows (lists of vocabulary indices).
print(sequences[:10])

[[13, 27, 11, 9, 20, 29, 27, 3, 1, 24, 53, 44, 49, 38, 40, 1, 50, 41, 1, 30, 40, 53, 50, 49, 36, 5, 0, 0, 21, 13, 26, 11, 29, 28, 17, 23, 3, 1, 46, 44, 49, 54, 48, 36, 49, 1, 55, 50, 1, 55, 43, 40, 1, 24, 53, 44, 49, 38, 40, 3, 1, 36, 49, 39, 1, 41, 53, 44, 40, 49, 39, 1, 55, 50, 1, 26, 50, 48, 40, 50, 5, 0, 0, 24, 9, 26, 17, 27, 3, 1, 36, 1, 60, 50, 56, 49, 42, 1, 22, 50], [27, 11, 9, 20, 29, 27, 3, 1, 24, 53, 44, 49, 38, 40, 1, 50, 41, 1, 30, 40, 53, 50, 49, 36, 5, 0, 0, 21, 13, 26, 11, 29, 28, 17, 23, 3, 1, 46, 44, 49, 54, 48, 36, 49, 1, 55, 50, 1, 55, 43, 40, 1, 24, 53, 44, 49, 38, 40, 3, 1, 36, 49, 39, 1, 41, 53, 44, 40, 49, 39, 1, 55, 50, 1, 26, 50, 48, 40, 50, 5, 0, 0, 24, 9, 26, 17, 27, 3, 1, 36, 1, 60, 50, 56, 49, 42, 1, 22, 50, 37], [11, 9, 20, 29, 27, 3, 1, 24, 53, 44, 49, 38, 40, 1, 50, 41, 1, 30, 40, 53, 50, 49, 36, 5, 0, 0, 21, 13, 26, 11, 29, 28, 17, 23, 3, 1, 46, 44, 49, 54, 48, 36, 49, 1, 55, 50, 1, 55, 43, 40, 1, 24, 53, 44, 49, 38, 40, 3, 1, 36, 49, 39, 1, 41, 53, 44

In [42]:
# PyTorch Dataset 및 데이터로더 생성
from torch.utils.data import Dataset

class TextDataset(Dataset):
    """Dataset pairing encoded character windows with their next-character target.

    Args:
        sequences: Sized collection of index lists (one per sample).
        targets: Sized collection of target indices, aligned with ``sequences``.

    Raises:
        ValueError: If ``sequences`` and ``targets`` differ in length.
    """

    def __init__(self, sequences, targets):
        # A mismatch would otherwise surface late as an IndexError mid-epoch,
        # or go unnoticed when targets is the longer of the two.
        if len(sequences) != len(targets):
            raise ValueError(
                f"sequences and targets must have the same length "
                f"(got {len(sequences)} and {len(targets)})"
            )
        self.sequences = sequences
        self.targets = targets

    def __len__(self):
        """Return the number of samples."""
        return len(self.sequences)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a ``(sequence, target)`` pair of int64 tensors."""
        # Explicit integer dtype: the values are class indices, and an empty
        # sequence would otherwise be inferred as a float tensor.
        sequence = torch.tensor(self.sequences[idx], dtype=torch.long)
        target = torch.tensor(self.targets[idx], dtype=torch.long)
        return sequence, target

In [43]:
# Wrap the encoded pairs in a Dataset and serve them as shuffled mini-batches
dataset = TextDataset(sequences=sequences, targets=targets)
dataloader = torch.utils.data.DataLoader(
    dataset=dataset,
    batch_size=64,
    shuffle=True,
)

In [44]:
# Hyperparameters: input and output widths both equal the vocabulary size
num_layers = 2
hidden_size = 256
vocab_size = output_size = len(chars)

#### LSTM 모델 구조 & 클래스 정의

In [61]:
class LSTMModel(nn.Module):
    """Stacked LSTM followed by a linear layer applied to the last time step.

    Args:
        vocab_size: Feature size of each input step (LSTM ``input_size``).
        hidden_size: Feature size of the LSTM hidden and cell states.
        output_size: Number of values produced by the final linear layer.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, vocab_size, hidden_size, output_size, num_layers=1):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        # batch_first=True: inputs are laid out as (batch, seq_len, vocab_size)
        self.lstm = nn.LSTM(vocab_size, hidden_size, num_layers, batch_first=True)
        # Fully connected layer mapping the hidden features to the outputs
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, hidden=None):
        """Run the LSTM over ``x`` and project its final time step.

        Args:
            x: Input tensor laid out batch-first.
            hidden: Optional ``(h, c)`` state tuple. When ``None`` the LSTM
                starts from its default zero state.

        Returns:
            ``(out, hidden)`` where ``out`` is the linear projection of the
            last time step and ``hidden`` is the updated ``(h, c)`` tuple.
        """
        out, hidden = self.lstm(x, hidden)
        # Only the output at the last sequence position feeds the classifier
        out = self.fc(out[:, -1, :])
        return out, hidden

    def init_hidden(self, batch_size):
        """Return zeroed ``(h0, c0)`` states for a batch of ``batch_size``.

        The states are created on the same device and with the same dtype as
        the model's own parameters, so the method works no matter where the
        model lives and does not rely on a module-level ``device`` variable.
        """
        reference = next(self.parameters())
        shape = (self.num_layers, batch_size, self.hidden_size)
        h0 = torch.zeros(shape, dtype=reference.dtype, device=reference.device)
        c0 = torch.zeros(shape, dtype=reference.dtype, device=reference.device)
        return (h0, c0)

In [63]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

# Instantiate the network and move it to the selected device
model = LSTMModel(
    vocab_size=vocab_size,
    hidden_size=hidden_size,
    output_size=output_size,
    num_layers=num_layers,
)
model = model.to(device)

In [64]:
# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

In [69]:
# Training loop
def train_model(model, dataloader, criterion, optimizer, num_epochs=20):
    """Train ``model`` on one-hot encoded index batches and print the epoch loss.

    Args:
        model: Module exposing ``init_hidden(batch_size)`` and a
            ``forward(inputs, hidden)`` that returns ``(outputs, hidden)``.
        dataloader: Iterable of ``(inputs, labels)`` batches, where ``inputs``
            holds integer class indices.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.
        optimizer: Optimizer that updates the parameters of ``model``.
        num_epochs: Number of passes over ``dataloader``.

    Raises:
        ValueError: If ``dataloader`` yields no batches in an epoch.
    """
    # Take placement and one-hot width from the model itself rather than from
    # notebook globals; fall back to the global vocab_size for models that do
    # not expose an ``lstm`` attribute.
    model_device = next(model.parameters()).device
    lstm = getattr(model, "lstm", None)
    num_classes = lstm.input_size if lstm is not None else vocab_size

    model.train()  # switch to training mode
    for epoch in range(num_epochs):
        running_loss = 0.0
        num_batches = 0
        for inputs, labels in dataloader:
            # Move the compact index tensors first, then expand to one-hot on
            # the target device; this transfers far less data.
            inputs = inputs.to(model_device)
            labels = labels.to(model_device)
            inputs = nn.functional.one_hot(inputs, num_classes=num_classes).float()

            # A fresh hidden state per batch: batches are independent samples,
            # so no state (and no graph) is carried between them.
            hidden = model.init_hidden(inputs.size(0))

            optimizer.zero_grad()

            # Forward pass, backward pass, parameter update
            outputs, _ = model(inputs, hidden)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            num_batches += 1

        if num_batches == 0:
            raise ValueError("dataloader yielded no batches; nothing to train on")

        print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {running_loss / num_batches:.4f}')

    print('Finished Training')

In [70]:
def generate_text(model, start_str, length, temperature=1.0, char_map=None, index_map=None):
    """Sample ``length`` characters from ``model``, seeded with ``start_str``.

    The whole seed is fed in the first step; afterwards each sampled character
    is fed back as a single-step input while the hidden state is carried over.

    Args:
        model: Module exposing ``init_hidden(batch_size)`` and a
            ``forward(inputs, hidden)`` that returns ``(logits, hidden)``.
        start_str: Non-empty seed text; every character must be in the vocabulary.
        length: Number of characters to generate.
        temperature: Positive divisor applied to the logits before sampling;
            values below 1 sharpen the distribution, values above 1 flatten it.
        char_map: Optional character -> index dictionary. Defaults to the
            module-level ``char_to_idx``.
        index_map: Optional index -> character dictionary. Defaults to the
            module-level ``idx_to_char``.

    Returns:
        ``start_str`` followed by the ``length`` sampled characters.

    Raises:
        ValueError: If ``start_str`` is empty or ``temperature`` is not positive.
        KeyError: If ``start_str`` contains a character missing from the mapping.
    """
    if not start_str:
        raise ValueError("start_str must contain at least one character")
    if temperature <= 0:
        raise ValueError("temperature must be positive")
    if char_map is None:
        char_map = char_to_idx
    if index_map is None:
        index_map = idx_to_char

    # Take placement and one-hot width from the model itself rather than from
    # notebook globals; fall back to the global vocab_size for models that do
    # not expose an ``lstm`` attribute.
    model_device = next(model.parameters()).device
    lstm = getattr(model, "lstm", None)
    num_classes = lstm.input_size if lstm is not None else vocab_size

    model.eval()  # switch to evaluation mode
    hidden = model.init_hidden(1)  # batch of one

    input_seq = torch.tensor([[char_map[char] for char in start_str]], device=model_device)
    pieces = [start_str]

    with torch.no_grad():
        for _ in range(length):
            one_hot = nn.functional.one_hot(input_seq, num_classes=num_classes).float()
            logits, hidden = model(one_hot, hidden)

            # softmax(logits / T) is the same distribution as normalised
            # exp(logits / T) but cannot overflow to inf for large logits.
            probs = torch.softmax(logits.squeeze(0) / temperature, dim=-1).cpu()
            next_idx = torch.multinomial(probs, 1).item()

            pieces.append(index_map[next_idx])

            # The sampled character becomes the next single-step input
            input_seq = torch.tensor([[next_idx]], device=model_device)

    return "".join(pieces)

In [71]:
# Show the layer structure of the instantiated network.
print(model)

LSTMModel(
  (lstm): LSTM(68, 256, num_layers=2, batch_first=True)
  (fc): Linear(in_features=256, out_features=68, bias=True)
)


In [72]:
# Train the model for five epochs
train_model(
    model=model,
    dataloader=dataloader,
    criterion=criterion,
    optimizer=optimizer,
    num_epochs=5,
)

Epoch 1/5, Loss: 2.3542
Epoch 2/5, Loss: 1.7890
Epoch 3/5, Loss: 1.6006
Epoch 4/5, Loss: 1.4865
Epoch 5/5, Loss: 1.3980
Finished Training


In [73]:
# Seed string for the test generation and the number of characters to generate
start_str = "To be, or not to be, that is the question:"
length = 200

In [74]:
# Generate text from the seed string and display it
generated_text = generate_text(
    model=model,
    start_str=start_str,
    length=length,
    temperature=0.8,
)
print(generated_text)

To be, or not to be, that is the question:

Thou hast, ring in the child!

ROMEO.

Then did is lade?—You denespity how a tears,

The fair more marriage of the fiery

In that do pirdor the poot sounds you, [
fiely, behome,

With tears no nothe
