<a href="https://colab.research.google.com/github/JSJeong-me/AI-Innovation-2024/blob/main/NLP/4-6-LSTM-predict-word.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Character-level text generation with an LSTM in PyTorch

Based on the tutorial: https://machinelearningmastery.com/text-generation-with-lstm-in-pytorch/

In [1]:
# Download the training text from Google Drive and save it as wonderland.txt.
# NOTE(review): --no-check-certificate disables TLS certificate verification
# for this download; consider dropping the flag.
!wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1FzTezch20eBDFLFIkMTbkOCtKUnVqMet' -O wonderland.txt

--2024-10-07 23:27:46--  https://docs.google.com/uc?export=download&id=1FzTezch20eBDFLFIkMTbkOCtKUnVqMet
Resolving docs.google.com (docs.google.com)... 142.251.175.100, 142.251.175.113, 142.251.175.101, ...
Connecting to docs.google.com (docs.google.com)|142.251.175.100|:443... connected.
HTTP request sent, awaiting response... 303 See Other
Location: https://drive.usercontent.google.com/download?id=1FzTezch20eBDFLFIkMTbkOCtKUnVqMet&export=download [following]
--2024-10-07 23:27:47--  https://drive.usercontent.google.com/download?id=1FzTezch20eBDFLFIkMTbkOCtKUnVqMet&export=download
Resolving drive.usercontent.google.com (drive.usercontent.google.com)... 142.251.12.132, 2404:6800:4003:c11::84
Connecting to drive.usercontent.google.com (drive.usercontent.google.com)|142.251.12.132|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 13903 (14K) [application/octet-stream]
Saving to: ‘wonderland.txt’


2024-10-07 23:27:53 (83.1 MB/s) - ‘wonderland.txt’ saved [13903/139

In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data

In [None]:


# Load the text and convert it to lowercase, so that upper- and lower-case
# letters share a single entry in the character vocabulary built from it.
filename = "wonderland.txt"
# A context manager closes the file handle as soon as the text has been read
# instead of leaving it open for the lifetime of the kernel.
with open(filename, 'r', encoding='utf-8') as text_file:
    raw_text = text_file.read()
raw_text = raw_text.lower()

In [None]:


# Vocabulary: every distinct character of the text, sorted, mapped to its rank.
chars = sorted(set(raw_text))
char_to_int = {symbol: position for position, symbol in enumerate(chars)}

# Report the corpus size and the vocabulary size.
n_chars, n_vocab = len(raw_text), len(chars)
print("Total Characters: ", n_chars)
print("Total Vocab: ", n_vocab)

# Slide a window of seq_length characters over the text one step at a time:
# each window (encoded as integers) is an input, and the character that
# immediately follows it is the corresponding target.
seq_length = 100
window_starts = range(n_chars - seq_length)
dataX = [
    [char_to_int[symbol] for symbol in raw_text[start:start + seq_length]]
    for start in window_starts
]
dataY = [char_to_int[raw_text[start + seq_length]] for start in window_starts]
n_patterns = len(dataX)
print("Total Patterns: ", n_patterns)

In [None]:


# Shape the encoded windows as [samples, time steps, features] and scale the
# integer codes by the vocabulary size; the targets stay as integer indices.
X = torch.tensor(dataX, dtype=torch.float32)
X = torch.div(X.reshape(n_patterns, seq_length, 1), float(n_vocab))
y = torch.tensor(dataY)

In [None]:


class CharModel(nn.Module):
    """Stacked LSTM that predicts the next character from a window of characters.

    The input is a sequence with one scalar feature per time step; the output
    is one logit per vocabulary entry, computed from the LSTM output at the
    last time step.

    Args:
        vocab_size: Number of output classes. When ``None`` (the default) the
            notebook-level ``n_vocab`` is used, so ``CharModel()`` behaves
            exactly as before.
        hidden_size: Width of each LSTM layer.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout probability applied between LSTM layers and before
            the final linear layer.

    The attribute names ``lstm``, ``dropout`` and ``linear`` are the keys of
    the model's state dict and must not be renamed, or saved checkpoints will
    no longer load.
    """

    def __init__(self, vocab_size=None, hidden_size=256, num_layers=2, dropout=0.2):
        super().__init__()
        if vocab_size is None:
            # Backward-compatible fallback to the global defined by the data cell.
            vocab_size = n_vocab
        self.lstm = nn.LSTM(
            input_size=1,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            # Inter-layer dropout is meaningless (and warned about) for a single layer.
            dropout=dropout if num_layers > 1 else 0.0,
        )
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        """Return next-character logits.

        Args:
            x: Float tensor of shape (batch, time steps, 1).

        Returns:
            Tensor of shape (batch, vocab_size) holding unnormalised logits.
        """
        x, _ = self.lstm(x)
        # take only the last output
        x = x[:, -1, :]
        # produce output
        x = self.linear(self.dropout(x))
        return x


In [None]:

# Train the character model and keep the weights of the best-scoring epoch.
n_epochs = 40
batch_size = 128
model = CharModel()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = optim.Adam(model.parameters())
# Summed (not averaged) loss, so per-batch values add up to a whole-epoch total.
loss_fn = nn.CrossEntropyLoss(reduction="sum")
loader = data.DataLoader(data.TensorDataset(X, y), shuffle=True, batch_size=batch_size)

best_model = None
best_loss = np.inf
for epoch in range(n_epochs):
    model.train()
    for X_batch, y_batch in loader:
        y_pred = model(X_batch.to(device))
        loss = loss_fn(y_pred, y_batch.to(device))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    # Validation
    # NOTE: this pass iterates the same loader as training; there is no
    # held-out split, so the reported value is the training-set loss in eval mode.
    model.eval()
    loss = 0
    with torch.no_grad():
        for X_batch, y_batch in loader:
            y_pred = model(X_batch.to(device))
            loss += loss_fn(y_pred, y_batch.to(device))
        if loss < best_loss:
            best_loss = loss
            # state_dict() hands back references to the live parameter tensors,
            # which the optimizer keeps updating in place. Clone them so the
            # snapshot really holds this epoch's weights rather than whatever
            # the model looks like after the final epoch.
            best_model = {
                name: value.detach().clone()
                for name, value in model.state_dict().items()
            }
        print("Epoch %d: Cross-entropy: %.4f" % (epoch, loss))

# Persist the best weights together with the character mapping needed to decode them.
torch.save([best_model, char_to_int], "single-char.pth")

In [None]:


# Generation using the trained model
# map_location remaps the stored tensors onto the current device, so a
# checkpoint written on a GPU can still be loaded in a CPU-only session.
best_model, char_to_int = torch.load("single-char.pth", map_location=device)
n_vocab = len(char_to_int)
# Inverse mapping, used to turn predicted indices back into characters.
int_to_char = dict((i, c) for c, i in char_to_int.items())
model.load_state_dict(best_model)


In [5]:

# Randomly pick a seq_length-character slice of the text to use as the prompt.
filename = "wonderland.txt"
seq_length = 100
# A context manager closes the file handle as soon as the text has been read.
with open(filename, 'r', encoding='utf-8') as text_file:
    raw_text = text_file.read()
raw_text = raw_text.lower()
start = np.random.randint(0, len(raw_text)-seq_length)
prompt = raw_text[start:start+seq_length]
# Integer-encode the prompt with the mapping that was saved alongside the model.
pattern = [char_to_int[c] for c in prompt]


In [6]:
# Display the sampled prompt text.
prompt

'y sat down, and nobody spoke for some minutes. alice thought to herself, “i don’t see how he can eve'

In [3]:
# prompt = "nother rush at the stick, and tumbled head over heels in its hurry to get hold of it; then alice, th"

In [7]:

# Greedy generation: repeatedly predict the most likely next character, print
# it, and slide the input window forward by one position.
model.eval()
print('Prompt: "%s"' % prompt)
with torch.no_grad():
    for step in range(1000):
        # scale the current window the same way as the training inputs and
        # shape it as (1, time steps, 1)
        window = np.asarray(pattern).reshape(1, len(pattern), 1) / float(n_vocab)
        x = torch.tensor(window, dtype=torch.float32)
        # logits over the vocabulary for the next character
        logits = model(x.to(device))
        # pick the highest-scoring character and emit it
        index = logits.argmax().item()
        print(int_to_char[index], end="")
        # drop the oldest character and add the new one for the next iteration
        pattern = pattern[1:] + [index]
print()
print("Done.")

Prompt: "y sat down, and nobody spoke for some minutes. alice thought to herself, “i don’t see how he can eve"
r finishe an ifr fear  when all the bourd ne the hoope oe the pame of the had never begore the thie at the white rabbit hurried tprs oi the dour oo aagk the wan oefe the tas oooee of the hoosess ponee of the sert of the had not the san off as her fear  and  shth the was serereene of the hing and the har and gloves, and  as the was sery oook the were all surning and tooce in the dound ne hear the rabbit sere all sorning an ier fear  he hand  whth a large dan in the distance, and she whrh little sas so tee sooee of her head to her fead mede the thiee was so tee ion he dan her feed here thenk the tas off a head uire to tee ion he dan her feed here thenk ier siater and blnvereny dresmed and she sert of the had nedd her feed me her head to her fead hed he i’ve been changed abfore the thie at the white rabbit hurried tprs oi the dour oo aagk the wan oefe the tas oooee of the hoosess

In [None]:
# Prompt: "nother rush at the stick, and tumbled head over heels in its hurry to get hold of it; then alice, th"