# RNN Demo with PyTorch Lightning: Date Generation (One-to-Many)

In this demo, we will show you how to create a date generator using PyTorch Lightning. This demo is inspired by Andrew Ng's deeplearning.ai course on sequence models. We create a one-to-many RNN model that generates dates in the following format: e.g. "2002-03-11".

In [None]:
import csv
import numpy as np
import random
import math
import sys

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
!pip install lightning
import lightning as L
from lightning import Trainer

# Generate Dataset
We generate a toy dataset using the `datetime` library. The target output comes in only one format (ISO format, e.g. "2002-03-11").

In [None]:
# Build a toy dataset: the 1500 most recent calendar days (today included),
# newest first, each rendered as an ISO "YYYY-MM-DD" string.
import datetime
base = datetime.date.today()
date_list = [base - datetime.timedelta(days=offset) for offset in range(1500)]
data = [day.isoformat() for day in date_list]
print(data[:5])
maxlen = 10  # all the sequences have 10 characters

In [None]:
# Collect every distinct character that occurs anywhere in the dataset.
chars = list({ch for line in data for ch in line})
data_size = len(data)
vocab_size = len(chars)
print('There are %d lines and %d unique characters in your data.' % (data_size, vocab_size))
print("max length =", maxlen)
# Sorting gives the vocabulary a stable, reproducible order.
sorted_chars = sorted(chars)
print(sorted_chars)

In [None]:
# In this demo, we will use "<S>" as a seed character to initiate the sequence.
# The insert is guarded so that re-running this cell does not add a second
# "<S>" entry, which would shift every character index by one.
if "<S>" not in sorted_chars:
    sorted_chars.insert(0, "<S>")
vocab_size = len(sorted_chars)

print(f"All Characters: {sorted_chars}")
print(f"Vocab Size: {vocab_size}")

In [None]:
# Minimal character-level tokenizer.
# Two lookup tables built from the sorted vocabulary:
#   stoi: character -> integer index
#   itos: integer index -> character
stoi = dict(zip(sorted_chars, range(len(sorted_chars))))
itos = dict(enumerate(sorted_chars))


def encode(s):
    """Encoder: take a string (or sequence of characters), return a list of integers."""
    return [stoi[c] for c in s]


def decode(l):
    """Decoder: take a list of integers, return the corresponding string."""
    return ''.join([itos[i] for i in l])


print(encode("2024-10-26"))
print(decode(encode("2024-10-26")))

In [None]:
# Notebook display: show the index -> character lookup table.
itos

In [None]:
# Notebook display: show the character -> index lookup table.
stoi

# Preprocessing data

In [None]:
# Encode every date string into its list of vocabulary indices.
encoded = [encode(date_str) for date_str in data]

In [None]:
class DateDataset(Dataset):
    """Dataset of encoded sequences, each prefixed with the seed index 0.

    Args:
        data: List of equal-length lists of integer token indices.
    """

    def __init__(self, data):
        # Index 0 stands for <s>; prepend it to every data point.
        seed = [0]
        with_seed = [seed + seq for seq in data]
        self.encoded = torch.LongTensor(with_seed)

    def __len__(self):
        return len(self.encoded)

    def __getitem__(self, idx):
        return self.encoded[idx]

In [None]:
class DateDataModule(L.LightningDataModule):
    """Lightning data module that serves the encoded dates for training.

    Args:
        train_data: Encoded sequences handed to ``DateDataset``.
        batch_size: Number of sequences per mini-batch.
        num_workers: Worker count forwarded to the ``DataLoader``.
    """

    def __init__(self, train_data, batch_size, num_workers=0):
        super().__init__()
        self.train_data = train_data
        self.batch_size = batch_size
        self.num_workers = num_workers

    def setup(self, stage: str):
        # Nothing to prepare: the dataset is built in train_dataloader().
        return None

    def collate_fn(self, batch):
        """Stack index sequences into a batch and derive their one-hot form.

        Returns a dict with ``"x"`` (float one-hot encoding, using the
        module-level ``vocab_size`` as the number of classes) and ``"y"``
        (the stacked integer indices).
        """
        targets = torch.stack(batch)
        inputs = F.one_hot(targets, num_classes=vocab_size).float()
        return {"x": inputs, "y": targets}

    def train_dataloader(self):
        """Build a shuffling ``DataLoader`` over a fresh ``DateDataset``."""
        return DataLoader(
            DateDataset(self.train_data),
            batch_size=self.batch_size,
            shuffle=True,
            collate_fn=self.collate_fn,
            num_workers=self.num_workers,
        )

In [None]:
# Mini-batch size used for training.
batch_size = 16
# Wrap the encoded dates in the Lightning data module.
data_module = DateDataModule(train_data=encoded, batch_size=batch_size, num_workers=0)

# Create & train model


In [None]:
class SimpleRNN(L.LightningModule):
    """Character-level RNN built from a single ``nn.RNNCell`` and a linear head.

    The cell is stepped manually, one character at a time, so the same
    ``forward`` can be used both for training and for sampling.

    Args:
        vocab_size: Number of distinct tokens; used as both the one-hot input
            width and the number of output classes.
        learning_rate: Learning rate passed to the Adam optimizer.
        criterion: Loss callable taking ``(logits, target_indices)``.
        hidden_dim: Size of the recurrent hidden state. Defaults to 16.
    """

    def __init__(self, vocab_size, learning_rate, criterion, hidden_dim=16):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.rnn = nn.RNNCell(self.vocab_size, self.hidden_dim)
        self.fc = nn.Linear(self.hidden_dim, self.vocab_size)
        self.learning_rate = learning_rate
        self.criterion = criterion

    def forward(self, src, hx):
        """Advance the RNN by one time step.

        Args:
            src: One-hot input for the current step, ``(batch, vocab_size)``.
            hx: Previous hidden state, ``(batch, hidden_dim)``.

        Returns:
            Tuple ``(prediction_logit, hx)``: the logits over the vocabulary
            for the next character and the updated hidden state.
        """
        hx = self.rnn(src, hx)
        prediction_logit = self.fc(hx)
        return prediction_logit, hx

    def training_step(self, batch, batch_idx):
        """Compute the next-character loss for one mini-batch.

        ``batch["x"]`` holds the one-hot sequences and ``batch["y"]`` the
        matching integer indices. Training uses teacher forcing: the
        ground-truth character at step ``t`` is the input and the character
        at step ``t + 1`` is the target.
        """
        src = batch['x'][:, :-1]
        target = batch['y'][:, 1:]

        # The initial hidden state is sampled from a standard normal and
        # created directly on the module's device.
        hx = torch.randn(src.shape[0], self.hidden_dim, device=self.device)

        step_logits = []
        for t in range(src.shape[1]):
            prediction_logit, hx = self(src[:, t], hx)
            step_logits.append(prediction_logit)
        # (batch, seq, vocab_size)
        prediction = torch.stack(step_logits, dim=1)

        # Flatten batch and time so the criterion sees one row per character.
        # Use the module's own vocab_size rather than a notebook-level global.
        prediction = prediction.reshape(-1, self.vocab_size)
        target = target.reshape(-1)
        loss = self.criterion(prediction, target)
        self.log("train_loss", loss)
        return loss

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters."""
        return optim.Adam(self.parameters(), lr=self.learning_rate)

In [None]:
# Hyper-parameters and loss for the character-level model.
lr = 0.005
criterion = nn.CrossEntropyLoss()
# vocab_size comes from the vocabulary cell above; the original no-op
# self-assignment of it is omitted.
model = SimpleRNN(vocab_size=vocab_size, learning_rate=lr, criterion=criterion)

In [None]:
def generate(model, length=10):
    """Sample a character sequence from ``model``, seeded with the ``<S>`` token.

    The model is switched to eval mode while sampling and its previous
    train/eval mode is restored afterwards, so calling this in the middle of
    training does not leave the model in eval mode.

    Args:
        model: Network called as ``model(one_hot_input, hidden)`` and
            returning ``(logits, new_hidden)``. It must expose ``hidden_dim``,
            ``vocab_size`` and ``device`` attributes.
        length: Number of characters to sample. Defaults to 10, the length of
            an ISO date such as "2002-03-11".

    Returns:
        The sampled characters decoded into a single string.
    """
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            device = model.device
            # Index 0 is the "<S>" seed token that starts every sequence.
            token = 0
            # Random initial hidden state, as in training.
            hx = torch.randn(1, model.hidden_dim, device=device)
            output_list = []
            for _ in range(length):
                step_input = F.one_hot(
                    torch.tensor([token], dtype=torch.long, device=device),
                    num_classes=model.vocab_size,
                ).float()
                logit, hx = model(step_input, hx)
                prob = F.softmax(logit, dim=-1)
                # Sample from the predicted distribution (not argmax); the
                # sampled character becomes the next step's input.
                token = torch.multinomial(prob, 1).item()
                output_list.append(token)
    finally:
        model.train(was_training)
    return decode(output_list)

In [None]:
class PrintCallback(L.pytorch.callbacks.Callback):
    """Print a few generated samples after every second training epoch.

    Args:
        what: Unit to count. Only ``"epochs"`` is acted upon; with any other
            value the epoch counter is not advanced and nothing is printed.
        verbose: When false, epochs are still counted but no samples are
            printed.
    """

    def __init__(self, what="epochs", verbose=True):
        super().__init__()
        self.what = what
        self.verbose = verbose
        self.state = {"epochs": 0, "batches": 0}

    def on_train_epoch_end(self, trainer, pl_module, *args, **kwargs):
        """Count the finished epoch and print samples on even epoch counts.

        Samples are drawn from ``pl_module`` (the module being trained, as
        passed in by the Trainer) rather than from a notebook-level global.
        """
        if self.what != "epochs":
            # Not counting epochs: avoid reporting a stale "Epoch: 0".
            return
        self.state["epochs"] += 1
        epochs_done = self.state["epochs"]
        if self.verbose and epochs_done % 2 == 0:
            print('----- Generating text after Epoch: %d' % epochs_done)
            for _ in range(3):
                print(generate(pl_module))


In [None]:
# Train for 10 epochs; the callback prints generated samples along the way.
trainer = Trainer(callbacks=[PrintCallback()], max_epochs=10)

# Let's train the model and generate some text

In [None]:
# Baseline: three samples from the untrained model (before training).
for _ in range(3):
    sample = generate(model)
    print(sample)

In [None]:
# Run the training loop, drawing batches from the data module.
trainer.fit(model, datamodule=data_module)

In [None]:
# Ten samples from the trained model.
for _ in range(10):
    sample = generate(model)
    print(sample)