In [22]:
from datasets import load_dataset
from collections import Counter
import torch
import random
# Vocabulary size. Only the (vocab - 1) most frequent words receive their own
# index; index vocab - 1 is what the lookup helper falls back to for any
# other token.
vocab = 30000
# Number of tokens kept from each training line.
text_len = 30

# Load the training split of WikiText-2 (raw variant).
dataset = load_dataset(path="wikitext", name='wikitext-2-raw-v1', split="train")


def tokenizer(x):
    """Split ``x`` on runs of whitespace and return the resulting tokens."""
    return x.split()


# Count token frequencies over every line. Lines that contain at least
# ``text_len`` tokens additionally contribute their first ``text_len`` tokens
# as one training example; shorter lines only feed the frequency counter.
counter = Counter()
train_data = []
for example in dataset:
    tokens = tokenizer(example["text"])
    if len(tokens) >= text_len:
        train_data.append(tokens[:text_len])
    counter.update(tokens)

# Keep the (vocab - 1) most frequent words and number them 0..vocab-2 in
# descending order of frequency.
most_common_words = counter.most_common(vocab - 1)
word_to_index = {word: i for i, (word, _) in enumerate(most_common_words)}

# values = list(word_to_index.values())
# random.shuffle(values)
# for i,k in enumerate(word_to_index.keys()):
#   word_to_index[k] = values[i]


# Map a single token to its integer vocabulary index. Despite the historical
# name, no one-hot vector is built here.
def text_to_one_hot(text, mapping=None, unk_index=None):
    """Return the integer vocabulary index for the token ``text``.

    Args:
        text: A single token used as the lookup key (not a whole sentence).
        mapping: Token -> index dictionary. When omitted, the module-level
            ``word_to_index`` is used.
        unk_index: Index returned for tokens missing from ``mapping``. When
            omitted, the module-level ``vocab - 1`` is used.

    Returns:
        The index stored for ``text``, or ``unk_index`` if it is unknown.
    """
    if mapping is None:
        mapping = word_to_index
    if unk_index is None:
        unk_index = vocab - 1
    return mapping.get(text, unk_index)

# Show the first three token windows, then replace every token by its integer
# vocabulary index. The result is a list of lists of ints; no tensors are
# created at this point.
print(train_data[:3])
train_w_data = [
    [text_to_one_hot(word) for word in token_window]
    for token_window in train_data
]
train_data = train_w_data


[['Senjō', 'no', 'Valkyria', '3', ':', '<unk>', 'Chronicles', '(', 'Japanese', ':', '戦場のヴァルキュリア3', ',', 'lit', '.', 'Valkyria', 'of', 'the', 'Battlefield', '3', ')', ',', 'commonly', 'referred', 'to', 'as', 'Valkyria', 'Chronicles', 'III', 'outside', 'Japan'], ['The', 'game', 'began', 'development', 'in', '2010', ',', 'carrying', 'over', 'a', 'large', 'portion', 'of', 'the', 'work', 'done', 'on', 'Valkyria', 'Chronicles', 'II', '.', 'While', 'it', 'retained', 'the', 'standard', 'features', 'of', 'the', 'series'], ['It', 'met', 'with', 'positive', 'sales', 'in', 'Japan', ',', 'and', 'was', 'praised', 'by', 'both', 'Japanese', 'and', 'western', 'critics', '.', 'After', 'release', ',', 'it', 'received', 'downloadable', 'content', ',', 'along', 'with', 'an', 'expanded']]


In [32]:
from torch.utils.data import Dataset

class Text_dataset(Dataset):
    """Dataset that serves (input, target) pairs cut from stored sequences."""

    def __init__(self, text_list):
        # Kept by reference; each entry is indexed and sliced in __getitem__.
        self.data = text_list

    def __len__(self):
        """Number of stored sequences."""
        return len(self.data)

    def __getitem__(self, i):
        """Return sequence ``i`` as two tensors shifted by one position.

        The first tensor holds every element except the last, the second
        every element except the first, so ``target[t]`` is the element that
        follows ``input[t]`` in the stored sequence.
        """
        sequence = self.data[i]
        inputs = torch.tensor(sequence[:-1])
        targets = torch.tensor(sequence[1:])
        return inputs, targets

In [45]:
import torch.nn as nn

class LSTM_Gen(nn.Module):
    """Embedding -> dropout -> LSTM -> dropout -> linear next-token model.

    Args:
        input_size: Number of distinct token indices; also the width of the
            output layer.
        hidden_size: Width of the LSTM hidden state.
        hidden_layer: Number of stacked LSTM layers.
        embedding_size: Width of the token embeddings fed to the LSTM.
        batch_size: Stored on the instance; not used by the layers.

    The embedding table and the output projection share one weight matrix
    when ``embedding_size == hidden_size``. For any other combination the two
    matrices have different shapes, so they are kept separate: sharing them
    would replace the embedding table with a ``(input_size, hidden_size)``
    matrix and the LSTM would reject its input width in ``forward``.
    """

    def __init__(self, input_size=1000, hidden_size=1024, hidden_layer=1, embedding_size=512, batch_size=20):
        super(LSTM_Gen, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.hidden_layer = hidden_layer
        self.embedding_size = embedding_size
        self.dropout = nn.Dropout(p=0.5)
        self.embedding = torch.nn.Embedding(self.input_size, self.embedding_size)
        self.lstm = torch.nn.LSTM(self.embedding_size, self.hidden_size, self.hidden_layer, batch_first=True)
        self.fc1 = torch.nn.Linear(self.hidden_size, self.input_size)
        self.batch_size = batch_size
        # Weight tying is only well-defined when both matrices have the same
        # shape, i.e. (input_size, hidden_size) == (input_size, embedding_size).
        if self.embedding_size == self.hidden_size:
            self.embedding.weight = self.fc1.weight

    def forward(self, x, hidden=None):
        """Return one score per vocabulary entry for every input position.

        Args:
            x: Integer token indices, batch dimension first.
            hidden: Optional ``(h_0, c_0)`` initial LSTM state. ``None`` lets
                the LSTM start from an all-zero state.

        Returns:
            The output of the final linear layer; its last dimension has
            ``input_size`` entries. The final LSTM state is discarded.
        """
        embedded = self.dropout(self.embedding(x))
        features, _ = self.lstm(embedded, hidden)
        return self.fc1(self.dropout(features))


In [47]:
from torch.utils.data import DataLoader
import torch.optim as optim

# Hyperparameters for the model and the optimiser.
batch_size = 20
hidden_size = 1024
hidden_layer = 2
embedding_size = 1024
epoch = 5
learning_rate = 0.001
weight_decay = 0.00005

# Pick the device once; every tensor created below is placed on it.
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("CUDA is available! Using GPU for training.")
else:
    device = torch.device("cpu")
    print("CUDA is not available. Using CPU for training.")

dset = Text_dataset(train_data)
model = LSTM_Gen(vocab, hidden_size, hidden_layer, embedding_size, batch_size)
model = model.to(device)

optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
criterion = torch.nn.CrossEntropyLoss()
# With shuffle=True a new permutation is drawn every time the loader is
# iterated, so a single loader can serve all epochs.
train_loader = DataLoader(dset, batch_size=batch_size, shuffle=True)

model.train()
for e in range(epoch):
    for i, data in enumerate(train_loader):
        inp = data[0].to(device)
        label = data[1].to(device)

        # Size the initial state from the batch actually received: the last
        # batch of an epoch can hold fewer than `batch_size` sequences, and a
        # state built for the configured size would then be rejected by the LSTM.
        n_sequences = inp.size(0)
        hidden = (
            torch.zeros(hidden_layer, n_sequences, hidden_size, device=device),
            torch.zeros(hidden_layer, n_sequences, hidden_size, device=device),
        )

        out = model(inp, hidden)  # indexed below as (sequence, position, vocabulary score)

        # Sum over positions of the batch-averaged cross-entropy. Averaging
        # over every (sequence, position) pair and multiplying by the number
        # of positions gives the same value as adding one averaged loss per
        # position, without a Python loop over positions.
        n_positions = label.size(1)
        loss = criterion(out.reshape(-1, out.size(-1)), label.reshape(-1)) * n_positions

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if i % 10 == 0:
            print(f'loss at epoch {e} and batch {i} is ' + "Loss: {:.3f}".format(loss.item()))
            # Predicted vs. expected indices for the first sequence of the batch.
            print(out[0].argmax(dim=-1).tolist(), label[0])





CUDA is not available. Using CPU for training.
loss at epoch 0 and batch 0 is Loss: 299.083
[8784, 8784, 5643, 29111, 14544, 271, 23793, 20766, 271, 5643, 5666, 5643, 11786, 18742, 5666, 18742, 11786, 5643, 20606, 2565, 5643, 271, 5643, 7194, 26243, 9456, 18537, 5643, 17752] tensor([ 1068,    11,  2202,     1,     0,  3395, 24732,     1,    24,   638,
            1, 11175, 29999,     1,     4,    40,   440,     1,  2451,  1068,
           11,  2202,     4, 29999,  1068,    11,  2202,     1,  2844])
loss at epoch 0 and batch 10 is Loss: 232.792
[0, 1, 0, 1, 2, 3, 3, 1, 1, 3, 1, 1, 3, 4, 2, 1, 4, 1, 1, 6, 5, 1, 3, 0, 3, 1, 4, 1, 4] tensor([ 1750,     3,    24,   415,    13, 16591,   281,     6,  5609,     1,
        17418,   382,   783,     6,     0, 26662,     4,    29,  3491,    25,
            0,  1320,     6,   207,  5487,     2,   359,     7,    61])
loss at epoch 0 and batch 20 is Loss: 222.252
[29999, 0, 29999, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 29999, 29999, 0, 0, 1,