In [1]:
import numpy as np
filename = "../data/full_tab_string.txt"
raw_text = open(filename, 'r', encoding='utf-8').read()

 
# create mapping of unique chars to integers
chars = ['G19', 'A04', 'D12', 'E06', 'e15', 'e17', 'e16', 'G10', 'e18', 
          'D00', 'G11', 'A15', 'G06', 'e00', 'D10', 'A01', 'D01', 'B08', 'B07', 
          'e12', 'E17', 'A02', 'B04', 'D06', 'G08', 'B13', 'D15', 'E09', 'B03', 
          'B09', 'B02', 'A14', 'A12', 'e08', 'B01', 'e14', 'E07', 'G15', 'E03', 
          'A11', 'E19', 'D17', 'D04', 'D14', 'D05', 'B12', 'B17', 'D09', 'B06', 
          'G16', 'e01', 'G09', 'e07', 'e02', 'A10', 'D11', 'B20', 'B10', 'B15', 
          'A05', 'A06', 'e11', 'B14', 'B18', 'G18', 'E14', 'G21', 'D07', 'B11', 
          'B00', 'e04', 'G04', 'E10', 'e05', 'e09', 'e21', 'A03', 'G17', 'G03', 'E11', 
          'D16', 'E08', 'B16', 'B19', 'B21', 'D02', 'E00', 'e06', 'G12', 'E04', 'e19', 
          'A07', 'e13', 'G02', 'A08', 'A09', 'D03', 'A00', 'G00', 'A13', 'e03', 'G01', 'G07', 
          'E12', 'D08', 'e20', 'D13', 'G05', 'G14', 'D19', 'B05', 'e10', 'E02', 'e22', 
          'E05', 'E01', 'E13']
char_to_int = dict((c, i) for i, c in enumerate(chars))
 
# summarize the loaded data
n_chars = len(raw_text) // 4
n_vocab = len(chars)
print("Total Characters: ", n_chars)
print("Total Vocab: ", n_vocab)
print(char_to_int)

Total Characters:  37043
Total Vocab:  117
{'G19': 0, 'A04': 1, 'D12': 2, 'E06': 3, 'e15': 4, 'e17': 5, 'e16': 6, 'G10': 7, 'e18': 8, 'D00': 9, 'G11': 10, 'A15': 11, 'G06': 12, 'e00': 13, 'D10': 14, 'A01': 15, 'D01': 16, 'B08': 17, 'B07': 18, 'e12': 19, 'E17': 20, 'A02': 21, 'B04': 22, 'D06': 23, 'G08': 24, 'B13': 25, 'D15': 26, 'E09': 27, 'B03': 28, 'B09': 29, 'B02': 30, 'A14': 31, 'A12': 32, 'e08': 33, 'B01': 34, 'e14': 35, 'E07': 36, 'G15': 37, 'E03': 38, 'A11': 39, 'E19': 40, 'D17': 41, 'D04': 42, 'D14': 43, 'D05': 44, 'B12': 45, 'B17': 46, 'D09': 47, 'B06': 48, 'G16': 49, 'e01': 50, 'G09': 51, 'e07': 52, 'e02': 53, 'A10': 54, 'D11': 55, 'B20': 56, 'B10': 57, 'B15': 58, 'A05': 59, 'A06': 60, 'e11': 61, 'B14': 62, 'B18': 63, 'G18': 64, 'E14': 65, 'G21': 66, 'D07': 67, 'B11': 68, 'B00': 69, 'e04': 70, 'G04': 71, 'E10': 72, 'e05': 73, 'e09': 74, 'e21': 75, 'A03': 76, 'G17': 77, 'G03': 78, 'E11': 79, 'D16': 80, 'E08': 81, 'B16': 82, 'B19': 83, 'B21': 84, 'D02': 85, 'E00': 86, 'e06': 87

In [2]:
seq_length = 40
dataX = []
dataY = []
for i in range(0, n_chars - seq_length, 4):
    seq_in = raw_text[i:i + seq_length]
    seq_in = seq_in.split()

    seq_out = raw_text[i + seq_length:i + seq_length+3]

    dataX.append([char_to_int[char] for char in seq_in])
    dataY.append(char_to_int[seq_out])
n_patterns = len(dataX)
print("Total Patterns: ", n_patterns)

Total Patterns:  9251


In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
 
 
# reshape X to be [samples, time steps, features]
mod_seq_length = seq_length // 4
X = torch.tensor(dataX, dtype=torch.float32).reshape(n_patterns, mod_seq_length, 1)
X = X / float(n_vocab)
y = torch.tensor(dataY)
print(X.shape, y.shape)

torch.Size([9251, 10, 1]) torch.Size([9251])


In [4]:
import torch.optim as optim
import torch.utils.data as data
 
class CharModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.lstm = nn.LSTM(input_size=1, hidden_size=256, num_layers=1, batch_first=True)
        self.dropout = nn.Dropout(0.2)
        self.linear = nn.Linear(256, n_vocab)
        
    def forward(self, x):
        x, _ = self.lstm(x)
        # take only the last output
        x = x[:, -1, :]
        # produce output
        x = self.linear(self.dropout(x))
        return x

In [5]:

n_epochs = 100
batch_size = 128
model = CharModel()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

optimizer = optim.Adam(model.parameters())
loss_fn = nn.CrossEntropyLoss(reduction="sum")
loader = data.DataLoader(data.TensorDataset(X, y), shuffle=True, batch_size=batch_size)
best_model = None
best_loss = np.inf
for epoch in range(n_epochs):
    model.train()
    for X_batch, y_batch in loader:
        y_pred = model(X_batch)
        loss = loss_fn(y_pred, y_batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    # Validation
    model.eval()
    loss = 0
    with torch.no_grad():
        for X_batch, y_batch in loader:
            y_pred = model(X_batch)
            loss += loss_fn(y_pred, y_batch)
        if loss < best_loss:
            best_loss = loss
            best_model = model.state_dict()
        print("Epoch %d: Cross-entropy: %.4f" % (epoch, loss))
 
torch.save([best_model, char_to_int], "single-char.pth")

Epoch 0: Cross-entropy: 35123.4414
Epoch 1: Cross-entropy: 35115.0547
Epoch 2: Cross-entropy: 35048.6758
Epoch 3: Cross-entropy: 35047.5781
Epoch 4: Cross-entropy: 35021.4922
Epoch 5: Cross-entropy: 34916.0508
Epoch 6: Cross-entropy: 34814.4609
Epoch 7: Cross-entropy: 34657.5469
Epoch 8: Cross-entropy: 33527.3242
Epoch 9: Cross-entropy: 33051.7109
Epoch 10: Cross-entropy: 32259.2852
Epoch 11: Cross-entropy: 31774.3086
Epoch 12: Cross-entropy: 31850.6992
Epoch 13: Cross-entropy: 30459.0195
Epoch 14: Cross-entropy: 28976.2773
Epoch 15: Cross-entropy: 27589.5938
Epoch 16: Cross-entropy: 25601.4805
Epoch 17: Cross-entropy: 24047.9961
Epoch 18: Cross-entropy: 23041.7227
Epoch 19: Cross-entropy: 22404.2988


In [6]:
tab_to_follow = ['A06', 'A08', 'A09', 'D06', 'D08', 'D09', 'G06', 'G08']
tab_int = [char_to_int[char] for char in tab_to_follow]
print(tab_int)
input_tab = 60 

[60, 94, 95, 23, 104, 47, 12, 24]


In [32]:
Eb = ['A06', 'E11', 'D01']
F = ['A08', 'E13', 'D03']
Gb = ['A09', 'E14', 'D04']
Ab = ['D06', 'A11', 'G01']
Bb = ['D08', 'A13', 'G03']
Cb = ['D09', 'A14', 'G04']
Db = ['G06', 'D11', 'B02']
Eb2 = ['G08', 'D13', 'B04']

tab_int_set = []
tab_int_set.append([char_to_int[char] for char in Eb])
tab_int_set.append([char_to_int[char] for char in F])
tab_int_set.append([char_to_int[char] for char in Gb])
tab_int_set.append([char_to_int[char] for char in Ab])
tab_int_set.append([char_to_int[char] for char in Bb])
tab_int_set.append([char_to_int[char] for char in Cb])
tab_int_set.append([char_to_int[char] for char in Db])
tab_int_set.append([char_to_int[char] for char in Eb2])

int_to_char = dict((i, c) for c, i in char_to_int.items())

model.eval()
print('Prompt: "%s"' % input_tab)
with torch.no_grad():
    for i in range(8):
        # format input array of int into PyTorch tensor
        x = np.reshape(tab_int_set[i], (1, len(tab_int_set[i]), 1)) / float(n_vocab)
        x = torch.tensor(x, dtype=torch.float32)

        # generate logits as output from the model
        prediction = model(x)

        indices = torch.tensor(tab_int_set[i])
        torch.index_select(prediction, 1, indices)

        pred_set = []
        pred_set.append(prediction[0][tab_int_set[i][0]])
        pred_set.append(prediction[0][tab_int_set[i][1]])
        pred_set.append(prediction[0][tab_int_set[i][2]])
        pred_set = torch.Tensor(pred_set)
        print(pred_set)
        print(pred_set.argmax())
        # convert logits into one character
        index = int(tab_int_set[i][pred_set.argmax()])
        result = int_to_char[index]
        print(result, end=" ")
        # append the new character into the prompt for the next iteration
        tab_int_set[i].append(index)
        tab_int_set[i] = tab_int_set[i][1:]
print("Done.")

Prompt: "60"
tensor([-0.3441, -1.2526, -0.8380])
tensor(0)
A06 tensor([-1.2390, -1.6690, -0.8857])
tensor(2)
D03 tensor([-0.8438, -0.7367, -0.2894])
tensor(2)
D04 tensor([-0.3261, -0.2557, -1.6228])
tensor(1)
A11 tensor([-0.0401, -1.0427, -0.4801])
tensor(0)
D08 tensor([ 0.2313, -1.0570, -0.2214])
tensor(0)
D09 tensor([-0.3017, -0.2318,  0.1552])
tensor(2)
B02 tensor([ 0.5206, -1.3698, -0.5225])
tensor(0)
G08 Done.
