## Basic LSTM rewrite for rease of further experiments

still loosely based on:  
https://machinelearningmastery.com/text-generation-with-lstm-in-pytorch/

but also:  
https://github.com/mrdbourke/tensorflow-deep-learning/blob/main/10_time_series_forecasting_in_tensorflow.ipynb

and some:  
https://github.com/mrdbourke/tensorflow-deep-learning/blob/main/08_introduction_to_nlp_in_tensorflow.ipynb

### Imports

In [8]:
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data

### Function to load the data

In [10]:
def load_text_data(filename):
    raw_text = open(filename, 'r', encoding='utf-8').read()
    raw_text = raw_text.lower()
    return raw_text

### Function to Map Chars to Integers

In [11]:
def map_chars_to_int(raw_text):
    # create mapping of unique chars to integers
    chars = sorted(list(set(raw_text)))
    char_to_int = dict((c, i) for i, c in enumerate(chars))
    int_to_char = dict((i, c) for c, i in char_to_int.items())

    n_chars = len(raw_text)
    vocab_length = len(chars)
    print("Total Characters: ", n_chars)
    print("Vocab len: ", vocab_length)
    
    return chars, char_to_int, int_to_char, vocab_length

### Function to window over data (currently unused)
to create training data and labels  
with specified sequence (input) window length and horizon (output) len

only works with numbers...

In [25]:
# Create function to label windowed data
def get_labelled_windows(x, horizon=1):
    """
    Creates labels for windowed dataset.
    
    E.g. if horizon=1 (default)
    Input: [1, 2, 3, 4, 5, 6] -> Output: ([1, 2, 3, 4, 5], [6])
    """
    return x[:, :-horizon], x[:, -horizon:]


def make_windows(x, window_size, horizon = 1):
    """
    Turns a 1D array into a 2D array of sequential windows of window_size.
    """
    # 1. Create a window of specific window_size (add the horizon on the end for later labelling)
    window_step = np.expand_dims(np.arange(window_size+horizon), axis=0)
    # print(f"Window step:\n {window_step}")
    
    # 2. Create a 2D array of multiple window steps (minus 1 to account for 0 indexing)
    window_indexes = window_step + np.expand_dims(np.arange(len(x)-(window_size+horizon-1)), axis=0).T # create 2D array of windows of size window_size
    # print(f"Window indexes:\n {window_indexes[:3], window_indexes[-3:], window_indexes.shape}")
    
    # 3. Index on the target array (time series) with 2D array of multiple window steps
    windowed_array = x[window_indexes]
    
    # 4. Get the labelled windows
    windows, labels = get_labelled_windows(windowed_array, horizon=horizon)
    
    return windows, labels

### Function to window over chars and convert chars to numbers

In [12]:
# def chars_to_numbers_X_Y(raw_data, chars, char_to_int, window_size, horizon):
def chars_to_numbers_X_Y_1(raw_data, chars, char_to_int, window_size):
    n_chars = len(raw_data)
    dataX = []
    dataY = []
    for i in range(0, n_chars - window_size, 1):
        seq_in = raw_text[i:i + window_size]
        # seq_out = raw_text[i + window_size: i + window_size + horizon]
        seq_out = raw_text[i+window_size]
        dataX.append([char_to_int[char] for char in seq_in])
        # dataY.append([char_to_int[char] for char in seq_out])
        dataY.append(char_to_int[seq_out])
    return dataX, dataY

In [6]:
#chars, char_to_int, int_to_char, vocab_length = map_chars_to_int(raw_text)

In [7]:
#dataX, dataY = chars_to_numbers_X_Y(raw_text, chars, char_to_int, seq_length, horizon)

In [8]:
# len_X = len(dataX)
# len_X

### Function to reshape data for training and turn into tensors
reshape X to be [number_of_sequences, length_of_sentence, num_of_features] where:  
number_of_seq.. = amount of samples generated  
length_of_sentence also known as time steps = seq_length  
num_of_features = output len, 1 character or more  

plus normalization of data by vocab_length - total number of distinct characters - **necesarry or not?**

In [13]:
def reshape_train_data(x, y, window_size, horizon, device, vocab_length):
    n_seq = len(x)
    X = torch.tensor(x, dtype=torch.float32).reshape(n_seq, window_size, horizon).to(device)
    X = X / float(vocab_length)
    y = torch.tensor(y).to(device)
    y = y.squeeze()
    print(X.shape, y.shape)
    return X, y

In [10]:
# X, y = reshape_train_data(dataX, dataY, seq_length, horizon, device)

### Function to define LSTM-only model

In [14]:
def create_LSTM_model(vocab_length, input_size=1, hidden_size=256, num_layers=2, dropout_perc=0.2, batch_first=True):
    """
    we pass in vocab_length, to define linear after LSTM output range, because that's the number of possible characters
    the network output could be
    """
    class CharModel(nn.Module):
        def __init__(self):
            super().__init__()
            self.lstm = nn.LSTM(input_size=input_size,
                                hidden_size=hidden_size,
                                num_layers=num_layers,
                                batch_first=batch_first,
                                dropout=dropout_perc)
            self.dropout = nn.Dropout(dropout_perc)
            self.linear = nn.Linear(hidden_size, vocab_length)
        def forward(self, x):
            x, _ = self.lstm(x)
            # take only the last output
            x = x[:, -1, :]
            # produce output
            x = self.linear(self.dropout(x))
            return x
    model = CharModel()
    model = model.to(device)
    return model

### Define traning optimalizer, loss function, data loader

In [15]:
def create_adam_loss_loader(X, y, model, batch_size, lr):
    optimizer = optim.Adam(model.parameters(), lr=lr)
    loss_fn = nn.CrossEntropyLoss(reduction="sum")
    loader = torch.utils.data.DataLoader(torch.utils.data.TensorDataset(X, y), shuffle=True, batch_size=batch_size)
    return optimizer, loss_fn, loader

### Function to train model
and save the best model to file

In [16]:
def train_model(model, optimizer, loss_fn, loader, epochs, device):
    best_model = None
    best_loss = np.inf
    for epoch in range(n_epochs):
        model.train()
        for X_batch, y_batch in loader:
            y_pred = model(X_batch.to(device))
            loss = loss_fn(y_pred, y_batch)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        # Validation
        model.eval()
        loss = 0
        with torch.inference_mode():
            for X_batch, y_batch in loader:
                y_pred = model(X_batch.to(device))
                loss += loss_fn(y_pred, y_batch)
            if loss < best_loss:
                best_loss = loss
                best_model = model.state_dict()
            print("Epoch %d: Cross-entropy: %.4f" % (epoch, loss))
    return best_model

### Functions to save models

In [17]:
def save_char_model_with_mappings(best_model, file_name, char_to_int, int_to_char):
    """
    char_to_int / char mappign saved along with model
    """
    torch.save([best_model, char_to_int, int_to_char], f"{file_name}.pth")

### Function to evaluate the model / make example predictions

In [18]:
def get_random_raw_line(raw_text, seq_length, char_to_int):
    start = np.random.randint(0, len(raw_text)-seq_length)
    prompt = raw_text[start:start+seq_length]
    print(f"Chosen prompt: {prompt}")
    return prompt

In [19]:
def eval_char_model(prompt, model, char_to_int, int_to_char, vocab_length, output_length = 250):
    """
    vocab_length for normalization
    """
    pattern = [char_to_int[c] for c in prompt]
    print('Prompt: "%s"' % prompt)
    with torch.inference_mode():
        for i in range(output_length):
            # format input array of int into PyTorch tensor
            x = np.reshape(pattern, (1, len(pattern), 1)) / float(vocab_length)
            x = torch.tensor(x, dtype=torch.float32).to(device)
            # generate logits as output from the model
            prediction = model(x)
            # convert logits into one character
            index = int(prediction.argmax())
            result = int_to_char[index]
            print(result, end="")
            # append the new character into the prompt for the next iteration
            pattern.append(index)
            pattern = pattern[1:]
    print()
    print("Done.")

### function to load model from state_dict

In [16]:
# def load_model(filename):
#     best_model, char_to_int, int_to_char = torch.load(filename)
# # reload the model
# class CharModel(nn.Module):
#     def __init__(self):
#         super().__init__()
#         self.lstm = nn.LSTM(input_size=1, hidden_size=256, num_layers=1, batch_first=True)
#         self.dropout = nn.Dropout(0.2)
#         self.linear = nn.Linear(256, n_vocab)
#     def forward(self, x):
#         x, _ = self.lstm(x)
#         # take only the last output
#         x = x[:, -1, :]
#         # produce output
#         x = self.linear(self.dropout(x))
#         return x
# model = CharModel()
# model.load_state_dict(best_model)

## Basic function usage (for char-by-char LSTM and raw data start point):
1. raw_text = load_text_data(filename)
2. chars, char_to_int, int_to_char, vocab_length = map_chars_to_int(raw_text)
3. dataX, dataY = chars_to_numbers_X_Y(raw_data, chars, char_to_int, window_size, horizon) ... or chars_to_number_X_Y_1 for horizon of 1 (default for now)
4. X, y = reshape_train_data(x, y, window_size, horizon, device, vocab_length)
5. model = create_LSTM_model(vocab_length, input_size=1, hidden_size=256, num_layers=2, dropout_perc=0.2, batch_first=True)
6. create_adam_loss_loader(X, y, model, batch_size, lr) -> optimizer, loss_fn, loader
7. best_model = train_model(model, optimizer, loss_fn, loader, epochs)
8. save_char_model_with_mappings(best_model, file_name, char_to_int, int_to_char)
9. prompt = get_random_raw_line(raw_text, seq_length)
10. eval_char_model(prompt, model, char_to_int, int_to_char, vocab_length, output_length = 250)

where:
* filename - path to file with raw text data
* window_size = seq_length - length of input the model will be trained to predict based on
* horizon = input_size - the length of its output (one character, many characters, words etc) 
* device - "cuda" or "cpu"
* optimizer = Adam etc
* lossfn = crossentrophy etc
* char_to_int, int_to_char - mapping functions
* output_length - predicted eval text length
* epochs - tranign epochs

### Example usage:

In [35]:
# data path
# filename = "mfdoom_100.txt"
filename = "mfdoom_10.txt"
filename_no_ext = "mfdoom_10"

# input window
window_size = 100 #seq_length
# output window / predicted
horizon = 1

# --------- training -----------
# torch device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#
n_epochs = 50
batch_size = 256

###
hidden_size = 500
num_layers = 3

lr = 0.001

lr_str = str(lr).replace('.', '_')

model_type = "char_LSTM"

experiment_name = f'{model_type}_{filename_no_ext}_window_{window_size}_horizon_{horizon}_epochs_{n_epochs}_hidden_{hidden_size}_layers_{num_layers}_lr_{lr_str}'

In [58]:
raw_text = load_text_data(filename)

In [59]:
chars, char_to_int, int_to_char, vocab_length = map_chars_to_int(raw_text)

Total Characters:  95040
Vocab len:  64


In [60]:
dataX, dataY = chars_to_numbers_X_Y_1(raw_text, chars, char_to_int, window_size)

In [61]:
# dataX[:5]
len(dataX)
dataY[:5]

[34, 31, 1, 49, 34]

In [62]:
X, y = reshape_train_data(dataX, dataY, window_size, horizon, device, vocab_length)

torch.Size([94940, 100, 1]) torch.Size([94940])


In [63]:
y[:2]

tensor([34, 31], device='cuda:0')

In [64]:
model_doom_big = create_LSTM_model(vocab_length, hidden_size=256, num_layers=2)

In [21]:
optimizer, loss_fn, loader = create_adam_loss_loader(X, y, model_doom_big, batch_size, lr)

NameError: name 'X' is not defined

In [66]:
# y.squeeze().shape

In [67]:
best_model_doom_big = train_model(model_doom_big, optimizer, loss_fn, loader, experiment_name, n_epochs, char_to_int, int_to_char)

Epoch 0: Cross-entropy: 289788.6250
Epoch 1: Cross-entropy: 270306.1562
Epoch 2: Cross-entropy: 264447.8125
Epoch 3: Cross-entropy: 259005.6562
Epoch 4: Cross-entropy: 253187.5781
Epoch 5: Cross-entropy: 248495.3594
Epoch 6: Cross-entropy: 244034.0625
Epoch 7: Cross-entropy: 240350.9062
Epoch 8: Cross-entropy: 236372.0938
Epoch 9: Cross-entropy: 233145.3906
Epoch 10: Cross-entropy: 230923.9375
Epoch 11: Cross-entropy: 231813.6875
Epoch 12: Cross-entropy: 224254.0156
Epoch 13: Cross-entropy: 220783.9375
Epoch 14: Cross-entropy: 220087.6406
Epoch 15: Cross-entropy: 216040.7656
Epoch 16: Cross-entropy: 213813.8906
Epoch 17: Cross-entropy: 213026.5156
Epoch 18: Cross-entropy: 208641.2500
Epoch 19: Cross-entropy: 206389.4531
Epoch 20: Cross-entropy: 203122.1875
Epoch 21: Cross-entropy: 200168.4688
Epoch 22: Cross-entropy: 202001.5312
Epoch 23: Cross-entropy: 195821.3125
Epoch 24: Cross-entropy: 193132.1875
Epoch 25: Cross-entropy: 190988.9375
Epoch 26: Cross-entropy: 188532.8594
Epoch 27: C

In [68]:
prompt = get_random_raw_line(raw_text, window_size, char_to_int)

Chosen prompt: effects
before you press charges use your noodle
so what when he grab the mic he crush your cute cut


In [69]:
eval_model(prompt = prompt,
           model = model_doom_big,
           char_to_int = char_to_int,
           int_to_char = int_to_char,
           vocab_length = vocab_length,
           output_length = 250)

Prompt: "effects
before you press charges use your noodle
so what when he grab the mic he crush your cute cut"
 the word ‘ou might also like
iow mand the same saidns bead beat

ye can sepping all thet
in the forn like she word ‘ou might also like
io so she coon with a bount of the sooe whth the broker then i meed to het she said the wante hear to the case to 
Done.


### TODOs:
1. function chars_to_numbers_X_Y for now only works with horizon = 1, fix that - generally - generating multiple characters / words with RNNs
2. slowa - jesli tak, to inne kroki zamiast char itd, wiec wciaz troche chuj a nie uniwersalne)
3. TorchText?
4. inni artysci (na koniec?)
5. inne architektury (powinno zamieniac wtedy tylko krok z modelem)
6. 

In [70]:
# torch.save([best_model, char_to_int, int_to_char], f"{experiment_name}.pth")

NameError: name 'best_model' is not defined

## Word tokenizing and embeddings with spacy

In [1]:
import spacy
import collections

In [2]:
nlp = spacy.load("en_core_web_trf")
import en_core_web_trf
nlp = en_core_web_trf.load()

In [41]:
# doc = nlp("This is a sentence.")
# print([(w.text, w.pos_) for w in doc])

### Function to load the data from file into a worldlist with possible extra preprocessing

In [3]:
def create_wordlist(filepath):
    with open(filepath, 'r', encoding='utf-8') as file:
        data = file.read()
        # create wordlist
        doc = nlp(data)
        wordlist = []
        for word in doc:
            # + extra preprocessing, like excluding certain characters
            if word.text not in ("\n","\n\n",'\u2009','\xa0'):
                wordlist.append(word.text.lower())
    return wordlist

#### Function to create word counts, a vocabulary and word mapping to indexes

In [4]:
def create_word_counts(wordlist):
    # count the number of words
    word_counts = collections.Counter(wordlist)
    
    # Mapping from index to word : that's the vocabulary
    vocabulary_inv = [x[0] for x in word_counts.most_common()] # if most_common parameter omitted, returns all words
    vocabulary_inv = list(sorted(vocabulary_inv))

    # Mapping from word to index
    vocab = {x: i for i, x in enumerate(vocabulary_inv)}
    words = [x[0] for x in word_counts.most_common()]

    #size of the vocabulary
    vocab_size = len(words)
    print("vocab size: ", vocab_size)

    return vocab, words, vocab_size
    #save the words and vocabulary
    # with open(os.path.join(vocab_file), 'wb') as f:
    #     cPickle.dump((words, vocab, vocabulary_inv), f)

### Split word data into X (window) and Y (horizon)

In [5]:
def create_word_windows(wordlist, window_size):
    #create sequences
    sequences = []
    next_words = []
    for i in range(0, len(wordlist) - window_size, 1):
        sequences.append(wordlist[i: i + window_size])
        next_words.append(wordlist[i + window_size])
    
    print('len X:', len(sequences))
    print('len y:', len(next_words))
    return sequences, next_words

### Transform windowed data to index form

In [6]:
def create_word_X_Y(sequences, next_words, window_size, vocab, vocab_size):  
    X = np.zeros((len(sequences), window_size, vocab_size), dtype=np.float32)
    y = np.zeros((len(sequences), vocab_size), dtype=np.bool_)
    for i, sentence in enumerate(sequences):
        for t, word in enumerate(sentence):
            X[i, t, vocab[word]] = 1
        y[i, vocab[next_words[i]]] = 1
    return X, y

### Steps
1. create_wordlist(filepath) -> wordlist
2. create_word_counts(wordlist) -> vocab, words, vocab_size
3. create_word_windows(wordlist, window_size) -> sequences, next_words
4. create_word_X_Y(sequences, next_words, window_size, vocab, vocab_size) -> X, y
5. device setup
6. model = create_LSTM_model(vocab_length, input_size=1, hidden_size=256, num_layers=2, dropout_perc=0.2, batch_first=True)
7. create_adam_loss_loader(X, y, model, batch_size, lr) -> optimizer, loss_fn, loader8. 
best_model = train_model(model, optimizer, loss_fn, loader, epochs
9. save_word_model
10. prompt = get_random_raw_line(raw_text, seq_length)
11. eval_word_model)

#### Example usage:

In [23]:
# data path
# filename = "mfdoom_100.txt"
filename = "mfdoom_50.txt"
filename_no_ext = "mfdoom_50"

# input window
window_size = 10 #seq_length
# output window / predicted
horizon = 1

# --------- training -----------
# torch device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#
n_epochs = 100
batch_size = 256

###
hidden_size = 256
num_layers = 2

lr = 0.01

lr_str = str(lr).replace('.', '_')

model_type = "word_LSTM"

experiment_name = f'{model_type}_{filename_no_ext}_window_{window_size}_horizon_{horizon}_epochs_{n_epochs}_hidden_{hidden_size}_layers_{num_layers}_lr_{lr_str}'

In [26]:
wordlist = create_wordlist(filename)

In [27]:
vocab, words, vocab_size = create_word_counts(wordlist)

vocab size:  4262


In [28]:
sequences, next_words = create_word_windows(wordlist, window_size)

len X: 21467
len y: 21467


In [169]:
# X, y = create_word_X_Y(sequences, next_words, window_size, vocab, vocab_size)

In [29]:
def X_Y_to_numbers_reshape(sequences, next_words, window_size, vocab, vocab_size, device):
    dataX = []
    dataY = []
    for i, seq in enumerate(sequences):
        dataX.append([vocab[char] for char in seq])
        dataY.append(vocab[next_words[i]])
    n_seq = len(sequences)
    X = torch.tensor(dataX, dtype=torch.float32).reshape(n_seq, window_size, 1).to(device)
    X = X / float(vocab_size)
    y = torch.tensor(dataY).to(device)
    y = y.squeeze()
    print(X.shape, y.shape)
    return X, y

In [30]:
X, y = X_Y_to_numbers_reshape(sequences, next_words, window_size, vocab, vocab_size, device)

torch.Size([21467, 10, 1]) torch.Size([21467])


In [171]:
# X = torch.tensor(X).to(device)
# y = torch.tensor(y).to(device)

In [184]:
word_doom_small_model = create_LSTM_model(vocab_length = vocab_size,
                                          input_size = 1,
                                          hidden_size = hidden_size,
                                          num_layers=num_layers)

In [188]:
optimizer, loss_fn, loader = create_adam_loss_loader(X, y, word_doom_small_model, batch_size, lr)

In [189]:
best_model = train_model(word_doom_small_model, optimizer, loss_fn, loader, n_epochs, device)

Epoch 0: Cross-entropy: 134334.3281
Epoch 1: Cross-entropy: 133306.8906
Epoch 2: Cross-entropy: 132716.8125
Epoch 3: Cross-entropy: 132247.0312
Epoch 4: Cross-entropy: 131984.1719
Epoch 5: Cross-entropy: 131511.8906
Epoch 6: Cross-entropy: 131423.7188
Epoch 7: Cross-entropy: 130830.5000
Epoch 8: Cross-entropy: 130729.2500
Epoch 9: Cross-entropy: 130268.3125
Epoch 10: Cross-entropy: 130054.8672
Epoch 11: Cross-entropy: 130292.2891
Epoch 12: Cross-entropy: 129344.4062
Epoch 13: Cross-entropy: 129343.7812
Epoch 14: Cross-entropy: 129209.1016
Epoch 15: Cross-entropy: 128723.7812
Epoch 16: Cross-entropy: 128777.2344
Epoch 17: Cross-entropy: 128699.2344
Epoch 18: Cross-entropy: 128166.9375
Epoch 19: Cross-entropy: 127921.9141
Epoch 20: Cross-entropy: 127977.9219
Epoch 21: Cross-entropy: 127375.3125
Epoch 22: Cross-entropy: 127022.9922
Epoch 23: Cross-entropy: 127292.4688
Epoch 24: Cross-entropy: 126703.6641
Epoch 25: Cross-entropy: 126816.2656
Epoch 26: Cross-entropy: 126589.6328
Epoch 27: C

In [163]:
#sequences[:5]

In [164]:
#next_words[:5]

In [165]:
# X[:5]

In [166]:
# y[:5]

In [31]:
def get_random_raw_words(wordlist, seq_length):
    start = np.random.randint(0, len(wordlist)-seq_length)
    prompt = wordlist[start:start+seq_length]
    print(f"Chosen prompt: {prompt}")
    return prompt

In [32]:
word_counts = collections.Counter(wordlist)    
# Mapping from index to word : that's the vocabulary
vocabulary_inv = [x[0] for x in word_counts.most_common()] # if most_common parameter omitted, returns all words
vocabulary_inv = list(sorted(vocabulary_inv))

In [33]:
def eval_word_model(prompt, model, vocab, vocab_length, output_length = 100):
    """
    vocab_length for normalization
    """
    result_total = []
    pattern = [vocab[c] for c in prompt]
    print('Prompt: "%s"' % prompt)
    with torch.inference_mode():
        for i in range(output_length):
            # format input array of int into PyTorch tensor
            x = np.reshape(pattern, (1, len(pattern), 1)) / float(vocab_length)
            x = torch.tensor(x, dtype=torch.float32).to(device)
            # generate logits as output from the model
            prediction = model(x)
            # convert logits into one character
            index = int(prediction.argmax())
            result = vocabulary_inv[index]
            result_total.append(result)
            # print(result, end="")
            # append the new character into the prompt for the next iteration
            pattern.append(index)
            pattern = pattern[1:]
    print(' '.join(result_total))
    print("Done.")
    return result_total

In [192]:
prompt = get_random_raw_words(wordlist, window_size)

Chosen prompt: [',', 'peeps', ',', 'brothers', ',', 'sisters', ',', 'duns', ',', 'dunnies']


In [193]:
result = eval_word_model(prompt, word_doom_small_model, vocab, vocab_size, 100)

Prompt: "[',', 'peeps', ',', 'brothers', ',', 'sisters', ',', 'duns', ',', 'dunnies']"
, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ,
Done.


In [178]:
torch.save([best_model, vocab, vocabulary_inv], f"{experiment_name}.pth")
# torch.save([best_model, vocab, vocabulary_inv, hidden_size, num_layers, dropout_perc, window_size], f"{experiment_name}.pth")

### Load the model

In [9]:
model_path = "word_LSTM_mfdoom_10_window_10_horizon_1_epochs_50_hidden_256_layers_2_lr_0_01.pth"

In [24]:
best_model, vocab, vocabulary_inv = torch.load(model_path)
# best_model, vocab, vocabulary_inv, hidden_size, num_layers, dropout_perc, window_size = torch.load(model_path)
vocab_length = len(vocab)

In [25]:
# model = CharModel()
# model.load_state_dict(best_model)
loaded_model = create_LSTM_model(vocab_length, input_size=1,
                  hidden_size=hidden_size,
                  num_layers=num_layers,
                  dropout_perc=0.2,
                  batch_first=True)
loaded_model.load_state_dict(best_model)

<All keys matched successfully>

In [51]:
prompt = get_random_raw_words(wordlist, window_size)

Chosen prompt: ['(', 'misc', '.', 'voice', '):', 'food', ',', 'we', 'need', 'food']


In [36]:
result = eval_word_model(prompt, loaded_model, vocab, vocab_size, 100)

Prompt: "[',', 'lock', '-', 'down', ',', 'wet', 'dreams', 'of', 'fox', "'"]"
brown on doomsday ! , ever since the womb ‘ til i 'm back where my brother went , that 's what my tomb will say right above my government ; dumile with know drew in her only " that , that got and now 's loco 'm well open these they shut ( voice last time i split the wishbone a man first throwing mcs pox you it it her face , lay beer , do n't , " rock lives 's poor , un nigga ? we shall now vote doom , the or . i down n't
Done.


In [52]:
prompt[:5]

['(', 'misc', '.', 'voice', '):']

In [53]:
pattern = [vocab[c] for c in prompt]

In [54]:
pattern[:5]

[15, 2337, 21, 4035, 17]

In [40]:
output_len = 10

In [83]:
def sample(preds, temperature=1.0):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float32')
    preds = np.log(np.abs(preds)) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [84]:
with torch.inference_mode():
    for i in range(output_len):
        x = np.reshape(pattern, (1, len(pattern), 1)) / float(vocab_length)
        x = torch.tensor(x, dtype=torch.float32).to(device)
        # print(x)
        prediction = loaded_model(x)
        # argm = prediction.argmax()
        sampled = sample(prediction.cpu().squeeze())
        # index = int(sampled)
        # argm = argm.cpu()
        # print(prediction)
        print(sampled)

2816
2698
1314
3624
1536
3044
623
2344
2181
2388


In [88]:
def eval_word_model_with_sampling(prompt, model, vocab, vocab_length, temperature = 0.8, output_length = 100):
    """
    vocab_length for normalization
    """
    result_total = []
    pattern = [vocab[c] for c in prompt]
    print('Prompt: "%s"' % prompt)
    with torch.inference_mode():
        for i in range(output_length):
            # format input array of int into PyTorch tensor
            x = np.reshape(pattern, (1, len(pattern), 1)) / float(vocab_length)
            x = torch.tensor(x, dtype=torch.float32).to(device)
            # generate logits as output from the model
            prediction = model(x)
            index = sample(prediction.cpu().squeeze(), temperature=temperature)
            # convert logits into one character
            # index = int(prediction.argmax())
            result = vocabulary_inv[index]
            result_total.append(result)
            # print(result, end="")
            # append the new character into the prompt for the next iteration
            pattern.append(index)
            pattern = pattern[1:]
    print(' '.join(result_total))
    print("Done.")
    return result_total

In [92]:
eval_word_model_with_sampling(prompt, loaded_model, vocab, vocab_length, 0.8, 100)

Prompt: "['(', 'misc', '.', 'voice', '):', 'food', ',', 'we', 'need', 'food']"
always surprise bounce feeling bugged powerful queen extorter jar dot crabs awfully seductive blast autograph release vic eagle stink phases chalk mention whities rookies jason licker jam sneak beer arteries south % gloves triple smoking trading skill hella permission conducting treat planets lunch blazing rudely retarded trickles icicles rhyming address berries fingernail frontin poultry chance flesh suggest bike rapping whiner machine cheez states intercoms doze ayo pain incredible games briz joking actions gravy thigh tears : insulted noose receive dictionary bootlegger speed throws mescalines benjamin forget quit blooper jiminy thuggers haha slicker thief temazepam sweet kissed duck negro toke trust
Done.


['always',
 'surprise',
 'bounce',
 'feeling',
 'bugged',
 'powerful',
 'queen',
 'extorter',
 'jar',
 'dot',
 'crabs',
 'awfully',
 'seductive',
 'blast',
 'autograph',
 'release',
 'vic',
 'eagle',
 'stink',
 'phases',
 'chalk',
 'mention',
 'whities',
 'rookies',
 'jason',
 'licker',
 'jam',
 'sneak',
 'beer',
 'arteries',
 'south',
 '%',
 'gloves',
 'triple',
 'smoking',
 'trading',
 'skill',
 'hella',
 'permission',
 'conducting',
 'treat',
 'planets',
 'lunch',
 'blazing',
 'rudely',
 'retarded',
 'trickles',
 'icicles',
 'rhyming',
 'address',
 'berries',
 'fingernail',
 'frontin',
 'poultry',
 'chance',
 'flesh',
 'suggest',
 'bike',
 'rapping',
 'whiner',
 'machine',
 'cheez',
 'states',
 'intercoms',
 'doze',
 'ayo',
 'pain',
 'incredible',
 'games',
 'briz',
 'joking',
 'actions',
 'gravy',
 'thigh',
 'tears',
 ':',
 'insulted',
 'noose',
 'receive',
 'dictionary',
 'bootlegger',
 'speed',
 'throws',
 'mescalines',
 'benjamin',
 'forget',
 'quit',
 'blooper',
 'jiminy',
 'th