In [24]:
import re
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data.dataset import Dataset 
from torch.autograd import Variable
import numpy as np
import matplotlib.pyplot as plt
import pynvml
import psutil
from pathlib import Path
import datetime
# import nltk

In [25]:
# Global run configuration: mini-batch size and the torch device to train on.
batch_size = 128
device = 'cuda' if torch.cuda.is_available() else 'cpu'
cpu = 'cpu'
print(device)

# NVML needs an NVIDIA driver. On a CPU-only machine nvmlInit() raises, which
# would abort the notebook even though `device` already fell back to 'cpu'.
# Treat that case as "zero GPUs" so the memory report still works.
try:
    pynvml.nvmlInit()
    deviceCount = pynvml.nvmlDeviceGetCount()
except pynvml.NVMLError:
    deviceCount = 0

# List every GPU that NVML can see.
for i in range(deviceCount):
    handle = pynvml.nvmlDeviceGetHandleByIndex(i)
    print("Device", i, ":", pynvml.nvmlDeviceGetName(handle))

torch.cuda.empty_cache()

def get_mem_msg():
    """Build a one-line report of host RAM and per-GPU memory usage.

    Values are reported as ``used/total`` in GiB rounded to two decimals.
    The GPU section has one entry per device counted in the notebook-level
    ``deviceCount`` and is queried through NVML.

    Returns:
        str: e.g. ``'Mem: 8.69/15.49 GB, GPUs: 1.28/3.82 GB, '``.
    """
    gib = 1024 ** 3

    host = psutil.virtual_memory()
    parts = [f'Mem: {round(host.used / gib, 2)}/{round(host.total / gib, 2)} GB, GPUs: ']

    for gpu_index in range(deviceCount):
        gpu = pynvml.nvmlDeviceGetMemoryInfo(pynvml.nvmlDeviceGetHandleByIndex(gpu_index))
        parts.append(f'{round(gpu.used / gib, 2)}/{round(gpu.total / gib, 2)} GB, ')

    return ''.join(parts)

print(get_mem_msg())

cuda
Device 0 : b'GeForce GTX 1650'
Mem: 8.69/15.49 GB, GPUs: 1.28/3.82 GB, 


In [26]:
# Plain-text books that make up the training corpus (paths are relative to
# the notebook's working directory).
books = [
    'gatsby.txt',
    'alice.txt',
    'frankenstein.txt',
    'grimm.txt',
    'sherlock.txt',
]

# Concatenate all books into one string. The texts contain non-ASCII
# characters (curly quotes, dashes, accented letters), so the encoding is
# pinned to UTF-8 instead of relying on the platform's locale default.
txt = ""
for b in books:
    with open(b, 'r', encoding='utf-8') as f:
        txt += f.read()

In [27]:
# Normalise the typographic apostrophe so contractions such as "don’t" stay
# a single token. Other typographic characters (curly double quotes, dashes)
# are left as-is: the token regex below does not match them, so they are
# simply skipped.
txt = txt.replace('’', "'")

# Show the character inventory of the raw (still mixed-case) corpus.
chars = sorted(set(txt))
print(chars)

# Tokenise the lower-cased text: "mr." / "mrs." are kept whole, then runs of
# letters/digits/apostrophes, then single punctuation marks.
txt = txt.lower()
words = re.findall(r"mr\.|mrs\.|[a-z0-9']+|[.,!?;:]", txt)

# Vocabulary in sorted order; a token's id is its position in this list.
words_set = sorted(set(words))

# Map tokens to ids through a dict. list.index() would rescan the whole
# vocabulary for every token (O(tokens * vocabulary)); the dict gives the
# same ids in O(tokens).
word_to_num = {w: idx for idx, w in enumerate(words_set)}
words_nums = [word_to_num[w] for w in words]
print(words_nums[:100])

['\t', '\n', ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '£', '½', 'à', 'â', 'ä', 'æ', 'ç', 'è', 'é', 'ê', 'ô', 'ù', 'œ', '\u200a', '—', '‘', '“', '”', '…', '\ufeff']
[13731, 10615, 6339, 4431, 9416, 13731, 6188, 5907, 18, 1940, 5069, 19, 11923, 5405, 13801, 4431, 7561, 5571, 13731, 14652, 9416, 684, 687, 7101, 13731, 14504, 13014, 613, 8925, 9555, 9806, 9416, 13731, 15404, 942, 9215, 3102, 613, 15329, 528, 9215, 11404, 15135, 19, 15512, 8554, 3055, 7578, 18, 6000, 7578, 1056, 9507, 10974, 14652, 7578, 14404, 13731, 13692, 9416, 13731, 10615, 6339, 8096, 7137, 15329, 13801, 4431, 9507, 9468, 942, 15471, 19, 6339, 19, 9525

In [28]:
# Number of consecutive tokens fed to the model for each prediction.
seq_length = 10

# Slide a window of `seq_length` tokens over the corpus one step at a time:
# the window is the input, the token right after it is the target.
window_starts = range(len(words_nums) - seq_length)
dataX = [words_nums[s:s + seq_length] for s in window_starts]
dataY = [words_nums[s + seq_length] for s in window_starts]

n_patterns = len(dataX)
print("Total Patterns: ", n_patterns)

Total Patterns:  429250


In [29]:
# Vocabulary size; also used to scale the inputs into [0, 1).
n_words_set = len(words_set)

# Inputs: [samples, time steps, features] with one feature per step, namely
# the token id divided by the vocabulary size.
X = np.asarray(dataX).reshape(n_patterns, seq_length, 1) / float(n_words_set)

# Targets stay as integer token ids (class indices), not one-hot vectors.
y = np.array(dataY)

print(X)
print(y)

[[[0.88381823]
  [0.6832518 ]
  [0.40802008]
  ...
  [0.3802137 ]
  [0.0011586 ]
  [0.12487127]]

 [[0.6832518 ]
  [0.40802008]
  [0.28520855]
  ...
  [0.0011586 ]
  [0.12487127]
  [0.32627446]]

 [[0.40802008]
  [0.28520855]
  [0.60607621]
  ...
  [0.12487127]
  [0.32627446]
  [0.00122297]]

 ...

 [[0.44007467]
  [0.89727085]
  [0.85298661]
  ...
  [0.89727085]
  [0.42127961]
  [0.01107106]]

 [[0.89727085]
  [0.85298661]
  [0.89727085]
  ...
  [0.42127961]
  [0.01107106]
  [0.59037075]]

 [[0.85298661]
  [0.89727085]
  [0.61553811]
  ...
  [0.01107106]
  [0.59037075]
  [0.28527291]]]
[ 5069    19 11923 ...  9172  4432    19]


In [30]:
class TextGenDataset(Dataset):
    """Dataset of (input sequence, next-token label) pairs.

    Args:
        input_seqs: indexable collection of input sequences; item ``i`` is
            converted to a float tensor.
        target_labels: indexable collection of integer class labels, aligned
            with ``input_seqs``.
        n_patterns: number of samples the dataset reports via ``len()``.
    """

    def __init__(self, input_seqs, target_labels, n_patterns):
        self.input_seqs = input_seqs
        self.target_labels = target_labels
        self.n_patterns = n_patterns

    def __len__(self):
        # Use the value given to this instance, not a notebook-level global
        # of the same name (which may be missing or stale).
        return self.n_patterns

    def __getitem__(self, index):
        """Return ``(input_seq, target_label)`` tensors for sample ``index``."""
        input_seq = torch.tensor(self.input_seqs[index], dtype=torch.float)
        # Class-index targets must be int64 for NLLLoss; pin the dtype so it
        # does not depend on the platform's default integer width.
        target_label = torch.tensor(self.target_labels[index], dtype=torch.long)
        return (input_seq, target_label)

In [31]:
# Wrap the prepared arrays in a dataset and serve them in shuffled
# mini-batches of `batch_size` samples.
text_gen_train = TextGenDataset(input_seqs=X, target_labels=y, n_patterns=n_patterns)
text_gen_train_loader = torch.utils.data.DataLoader(
    text_gen_train,
    batch_size=batch_size,
    shuffle=True,
)
print(n_words_set)

15536


In [32]:
class LSTM(nn.Module):
    """Two stacked LSTM layers followed by a two-layer classifier head.

    The model consumes sequences with one feature per time step and returns
    log-probabilities over the vocabulary for the token that follows the
    sequence.

    Args:
        n_words: number of output classes (vocabulary size). When ``None``
            the notebook-level ``n_words_set`` is used, so ``LSTM()`` keeps
            working as before once that global is defined.
        hidden1: hidden size of the first LSTM layer.
        hidden2: hidden size of the second LSTM layer.
        fc_dim: width of the hidden fully connected layer.
    """

    def __init__(self, n_words=None, hidden1=512, hidden2=2048, fc_dim=8192):
        """Create the recurrent layers and the classifier head."""
        super(LSTM, self).__init__()

        if n_words is None:
            # Backward-compatible default: fall back to the notebook global.
            n_words = n_words_set

        self.lstm1 = nn.LSTM(1, hidden1, batch_first=True)
        self.lstm2 = nn.LSTM(hidden1, hidden2, batch_first=True)

        self.fc1 = nn.Linear(hidden2, fc_dim)
        self.fc2 = nn.Linear(fc_dim, n_words)
        self.log_softmax = nn.LogSoftmax(dim=1)

    def forward(self, x):
        """Map ``x`` of shape (batch, seq, 1) to (batch, n_words) log-probs."""
        out, _ = self.lstm1(x)
        out, _ = self.lstm2(out)
        # Only the output at the final time step feeds the classifier.
        out_last = out[:, -1, :]
        res = F.relu(self.fc1(out_last))
        res = self.fc2(res)
        res = self.log_softmax(res)
        return res

In [23]:
# Report the running loss every `log_interval` batches.
log_interval = 10

# Each run writes its checkpoints to a fresh, timestamped directory.
run_stamp = datetime.datetime.now().strftime("%Y-%m-%dT%H-%M-%S")
save_dir = Path("training_data", run_stamp)
save_dir.mkdir(parents=True)

# The model emits log-probabilities, so it is paired with NLLLoss.
loss_func = nn.NLLLoss()
model = LSTM().to(device)
optimizer = optim.Adam(model.parameters(), lr=0.00025)


def train(model, n_epochs=100):
    """Train ``model`` on the text-generation loader and checkpoint each epoch.

    Relies on notebook-level objects: ``text_gen_train_loader``, ``device``,
    ``optimizer``, ``loss_func``, ``log_interval``, ``save_dir`` and
    ``get_mem_msg``.

    Args:
        model: the network to optimise; it must already be on ``device`` and
            be the model whose parameters ``optimizer`` was built from.
        n_epochs: number of passes over the training data (default 100).
    """
    for epoch in range(n_epochs):
        total_loss = 0
        # sets training mode if we are doing dropout when training
        model.train()
        for batch_idx, (input_seqs, target_labels) in enumerate(text_gen_train_loader):
            input_seqs = input_seqs.to(device)
            target_labels = target_labels.to(device)

            optimizer.zero_grad()
            res = model(input_seqs)
            loss = loss_func(res, target_labels)
            loss.backward()
            optimizer.step()

            # print statistics: mean loss over the last `log_interval` batches
            total_loss += loss.item()
            if batch_idx % log_interval == log_interval - 1:
                torch.cuda.empty_cache()
                avg_loss = total_loss / log_interval
                mem_msg = get_mem_msg()
                print(f'epoch: {epoch}, loss: {avg_loss}, {mem_msg}')
                total_loss = 0

        # Checkpoint after the epoch has trained, so net_{epoch}.p holds the
        # weights produced by that epoch and the last epoch is not lost.
        # `save_dir` is a notebook-level path; a plain function has no `self`.
        save_path = save_dir / f"net_{epoch}.p"
        torch.save(model.state_dict(), save_path)

train(model)
print('Training Complete')

NameError: name 'n_words_set' is not defined