In [1]:
# Install the Hugging Face `datasets` package. `%pip` (rather than `!pip`)
# installs into the environment of the running kernel.
%pip install -q datasets



In [2]:
from datasets import load_dataset
from collections import Counter
import torch

# Fetch the training split of WikiText-103 from the Hugging Face hub.
dataset = load_dataset(path="wikitext", name="wikitext-103-v1", split="train")


def tokenizer(x):
    """Split a string on runs of whitespace and return the list of pieces."""
    return x.split()


# Single pass over the corpus: tally token frequencies for every line and
# keep the raw text of each line that is not the empty string.
counter = Counter()
train_data = []
for example in dataset:
    text = example["text"]
    tokens = tokenizer(text)
    counter.update(tokens)
    if text != '':
        train_data.append(text)

# Vocabulary: the 5000 most frequent tokens, numbered by frequency rank
# (index 0 is the most frequent token).
most_common_words = counter.most_common(5000)
word_to_index = dict(
    (word, rank) for rank, (word, _freq) in enumerate(most_common_words)
)

# Map a single token to its vocabulary index.
def text_to_one_hot(text):
    """Return the vocabulary index of the token ``text``.

    Despite the name, no one-hot vector is built: the result is the integer
    stored for ``text`` in the module-level ``word_to_index`` mapping, or
    ``-1`` when the token is not a key of that mapping.
    """
    return word_to_index[text] if text in word_to_index else -1

# Encode the whole corpus as one flat stream of vocabulary indices.
# The same `tokenizer` that built the vocabulary is used here so that both
# steps agree on token boundaries (a plain split(' ') yields '' and
# newline-bearing pieces that can never match a vocabulary entry).
# Tokens outside the vocabulary are encoded as -1 by text_to_one_hot.
train_w_data = [text_to_one_hot(word) for sentence in train_data for word in tokenizer(sentence)]

# Drop the out-of-vocabulary markers.
# NOTE: this rebinds `train_data` from a list of raw text lines to a flat
# list of int indices; later cells consume the int version.
# NOTE(review): removing OOV tokens makes words adjacent that were not
# adjacent in the original text -- confirm this is acceptable for training.
train_data = [num for num in train_w_data if num != -1]

# Round-trip sanity check: decode the first index back to its word and make
# sure encoding that word yields the same index again.
example_text = most_common_words[train_data[0]][0]

one_hot_encoding = text_to_one_hot(example_text)
assert one_hot_encoding == train_data[0]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [3]:
from torch.utils.data import Dataset

class Text_dataset(Dataset):
  """Fixed-length, non-overlapping windows over a flat sequence of token ids.

  Window ``i`` covers ``data[i * text_len : (i + 1) * text_len]``. The first
  ``text_len - 1`` entries of a window are returned as the input tensor and
  the last entry is returned as the target. Trailing entries that do not
  fill a complete window are never served.
  """

  def __init__(self, text_list, text_len=30):
    """Store the sequence and the window length.

    Args:
      text_list: sequence of integer token ids supporting ``len()`` and
        slicing (e.g. a list).
      text_len: number of entries per window (inputs plus one target).

    Raises:
      ValueError: if ``text_len`` is smaller than 2, since a window needs
        at least one input entry and one target entry.
    """
    if text_len < 2:
      raise ValueError(f"text_len must be at least 2, got {text_len}")
    self.data = text_list
    self.text_len = text_len

  def __len__(self):
    """Return the number of complete windows available."""
    return len(self.data) // self.text_len

  def __getitem__(self, i):
    """Return ``(inputs, target)`` for window ``i``.

    Args:
      i: window index; negative values count back from the last complete
        window.

    Returns:
      A tuple of a 1-D tensor holding the first ``text_len - 1`` ids of the
      window, and the final id of the window (as stored in ``data``).

    Raises:
      IndexError: if ``i`` does not address a complete window.
    """
    n_windows = len(self)
    if i < 0:
      i += n_windows
    if not 0 <= i < n_windows:
      raise IndexError(f"window index out of range for {n_windows} windows")
    start = i * self.text_len
    window = self.data[start:start + self.text_len]
    return torch.tensor(window[:-1]), window[-1]

In [13]:
import torch.nn as nn

class LSTM_Gen(nn.Module):
  """Next-token predictor: embedding -> LSTM -> linear layer over the vocabulary.

  ``forward`` takes a batch of token-id sequences and returns one row of
  unnormalised scores (logits) per sequence, computed from the LSTM output
  at the last time step.
  """

  def __init__(self,input_size=5000, hidden_size=1024, hidden_layer=1,embedding_size=512, batch_size=20):
    """Build the layers.

    Args:
      input_size: vocabulary size; used both as the number of embeddings
        and as the width of the output layer.
      hidden_size: LSTM hidden state width.
      hidden_layer: number of stacked LSTM layers.
      embedding_size: width of the token embeddings.
      batch_size: stored on the instance but not used by ``forward``; the
        model accepts any batch size.
    """
    super(LSTM_Gen, self).__init__()
    self.input_size = input_size
    self.hidden_size = hidden_size
    self.hidden_layer = hidden_layer
    self.embedding_size = embedding_size
    self.embedding = torch.nn.Embedding(self.input_size, self.embedding_size)
    self.lstm = torch.nn.LSTM(self.embedding_size, self.hidden_size, self.hidden_layer, batch_first=True)
    self.fc1 = torch.nn.Linear(self.hidden_size,self.input_size)
    self.batch_size = batch_size
    # No pre-built initial state is kept here: a plain tensor attribute is
    # not moved by .to(device) and pins the model to one exact batch size.
    # The LSTM starts from an all-zero state on its own when none is given.

  def forward(self,x):
    """Return logits for the token following each input sequence.

    Args:
      x: integer tensor of token ids laid out as (batch, sequence); the
        LSTM is configured with ``batch_first=True``.

    Returns:
      Float tensor of shape (batch, input_size) holding raw logits. No
      activation is applied: a loss such as cross-entropy expects
      unnormalised scores, and clamping them at zero would block the
      gradient for every negative score.
    """
    x = self.embedding(x)
    # Omitting the state argument makes the LSTM start from zeros that
    # match the input's batch size and device.
    x, _ = self.lstm(x)
    # Only the last time step is needed, so project just that slice
    # instead of every step of the sequence.
    return self.fc1(x[:, -1, :])


In [None]:
from torch.utils.data import DataLoader
import torch.optim as optim

# ---- Hyper-parameters -------------------------------------------------
batch_size = 20
text_len = 30          # tokens per window: text_len - 1 inputs + 1 target
hidden_size = 1024
hidden_layer = 1
embedding_size = 512
epoch = 5
learning_rate = 0.0001
weight_decay = 0.0005
seed = 0

# Fix the RNG so weight initialisation and batch shuffling are repeatable.
torch.manual_seed(seed)

# ---- Device selection -------------------------------------------------
if torch.cuda.is_available():
    # Set device to CUDA
    device = torch.device("cuda")
    print("CUDA is available! Using GPU for training.")
else:
    # Set device to CPU
    device = torch.device("cpu")
    print("CUDA is not available. Using CPU for training.")

# ---- Data, model, optimiser -------------------------------------------
dset = Text_dataset(train_data, text_len)
# Size the model from the vocabulary that was actually built instead of
# repeating the literal 5000.
vocab_size = len(word_to_index)
model = LSTM_Gen(vocab_size,hidden_size,hidden_layer,embedding_size,batch_size)
model = model.to(device)

optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
criterion = torch.nn.CrossEntropyLoss()

# Build the loader once; with shuffle=True it draws a fresh order every time
# it is iterated, so there is no need to recreate it per epoch.
# drop_last=True skips the short final batch: the dataset size is generally
# not a multiple of batch_size, and the model is constructed for batch_size
# rows per step.
train_loader = DataLoader(dset, batch_size=batch_size, shuffle=True, drop_last=True)

# ---- Training ---------------------------------------------------------
model.train()
for e in range(epoch):
  for i, (inp, label) in enumerate(train_loader):
    inp = inp.to(device)
    label = label.to(device)

    out = model(inp)
    loss = criterion(out, label)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Periodic progress report: current loss plus predicted vs. true token
    # id for the first sample of the batch.
    if i%1000==0:
      print(f'loss at epoch {e} and batch {i} is '+"Loss: {:.3f}".format(loss.item()))
      print(torch.argmax(out[0]),label[0])





CUDA is not available. Using CPU for training.
loss at epoch 0 and batch 0 is Loss: 8.503
tensor(4786) tensor(4408)
