# This notebook will serve as a way to implement character generation LSTM and other implementation

In [1]:
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.nn as nn
import re
import numpy as np
import pickle

In [2]:
with open("/kaggle/input/rnn-input/encoding_map.pkl", "rb") as f:
    mapping = pickle.load(f)

# Data preparation

In [3]:
# Decode
int2char = {i: ch for ch, i in mapping.items()}
print(int2char)

nb_char = len(int2char)

{0: '\n', 1: ' ', 2: '!', 3: '$', 4: '%', 5: '&', 6: "'", 7: '+', 8: ',', 9: '-', 10: '.', 11: '/', 12: '0', 13: '1', 14: '2', 15: '3', 16: '4', 17: '5', 18: '6', 19: '7', 20: '8', 21: '9', 22: ':', 23: ';', 24: '?', 25: 'A', 26: 'B', 27: 'C', 28: 'D', 29: 'E', 30: 'F', 31: 'G', 32: 'H', 33: 'I', 34: 'J', 35: 'K', 36: 'L', 37: 'M', 38: 'N', 39: 'O', 40: 'P', 41: 'Q', 42: 'R', 43: 'S', 44: 'T', 45: 'U', 46: 'V', 47: 'W', 48: 'X', 49: 'Y', 50: 'Z', 51: 'a', 52: 'b', 53: 'c', 54: 'd', 55: 'e', 56: 'f', 57: 'g', 58: 'h', 59: 'i', 60: 'j', 61: 'k', 62: 'l', 63: 'm', 64: 'n', 65: 'o', 66: 'p', 67: 'q', 68: 'r', 69: 's', 70: 't', 71: 'u', 72: 'v', 73: 'w', 74: 'x', 75: 'y', 76: 'z', 77: 'À', 78: 'Ç', 79: 'É', 80: 'Ê', 81: 'à', 82: 'â', 83: 'ç', 84: 'è', 85: 'é', 86: 'ê', 87: 'ë', 88: 'î', 89: 'ï', 90: 'ô', 91: 'ù', 92: 'û', 93: 'Ā', 94: 'ū', 95: 'α', 96: 'β', 97: 'γ', 98: 'ε', 99: 'ζ', 100: 'η', 101: 'θ', 102: 'τ'}


## Creation of the dataset

To increase the randomness during the training :

For each epoch the entire corpus will have a random specific offset value in order that the model during training doesn't see the exact same text during X epochs.

In [4]:
class CharDataset(Dataset) :
    def __init__(self,text,length_seq) :
        self.text = text
        self.length_seq = length_seq
        self.max_start = len(self.text) - length_seq - 1
        self.offset = 0
        
    def set_offset(self, offset) :
        self.offset = offset
        
    def __len__(self) :
        return self.max_start

    def __getitem__(self, i) :
        s = (i + self.offset) % self.max_start
        x = torch.from_numpy(self.text[s:s+self.length_seq])
        y = torch.from_numpy(self.text[s+1:s+self.length_seq+1])
        return x, y

In [5]:
seq_length = 250
dataset_ = np.load("/kaggle/input/rnn-input/corpora_encoded.npy","r")
len_train = int(len(dataset_)*0.85)
train = dataset_[:len_train]
test = dataset_[len_train:]

In [6]:
train_ds = CharDataset(train,length_seq=seq_length)
test_ds = CharDataset(test,length_seq=seq_length)

In [7]:
print(train_ds[0][0])
print("".join([int2char[i] for i in train_ds[0][0][:seq_length].numpy()]))

tensor([95,  0,  0, 96,  0, 29, 58,  8,  1, 55, 58,  0, 29, 58,  0, 11, 96,  0,
        97,  0, 37, 65, 59,  8,  1, 60,  6, 51, 59,  1, 66, 51, 69,  1, 54,  6,
        70, 65, 71, 68,  1, 52, 71, 69,  1, 64, 59,  1, 62, 51,  1, 53, 51, 68,
        68, 59, 84, 68, 55,  1, 54,  6, 43, 65, 66, 68, 51, 64, 65,  0, 34,  6,
        69, 71, 59, 69,  1, 51, 72, 55, 53,  1, 62,  6, 51, 63, 59,  1, 51, 71,
         1, 27, 65, 71, 68, 69,  1, 34, 71,  1, 65, 71,  1, 51, 71,  1, 40, 68,
        51, 54, 65,  0, 29, 64, 70, 68, 55,  1, 66, 65, 70, 55, 69,  8,  1, 65,
        64,  1, 51,  1, 68, 59, 55, 64,  1, 81,  1, 66, 51, 68, 70,  1, 62, 51,
         1, 52, 59, 53, 68, 51, 72, 55,  1, 55, 70,  1, 62,  6, 55, 64, 70, 68,
        55, 66, 68, 55, 64, 55, 71, 68, 59, 51, 70,  0, 49,  1, 51,  1, 68, 59,
        55, 64,  1, 67, 71, 59,  1, 69,  6, 65, 56, 56, 68, 55,  1, 81,  1, 64,
        65, 71, 69,  0, 37, 86, 63, 55,  1, 63, 51,  1, 66, 68, 65, 66, 68, 55,
         1, 64, 51, 64, 51,  1, 69, 55, 

  x = torch.from_numpy(self.text[s:s+self.length_seq])


## Creation of Dataloader and co

In [8]:
batch_size = 2048

train_dl = DataLoader(train_ds, batch_size=batch_size, pin_memory=True, pin_memory_device="cuda:0", 
                        num_workers=4, prefetch_factor=4, shuffle=False, drop_last=True) #Shuffle False because we need the RNN to use previous sequences data to predict next one
test_dl = DataLoader(test_ds, batch_size=batch_size, pin_memory=True, pin_memory_device="cuda:0", 
                       num_workers=2, prefetch_factor=2, shuffle=False, drop_last=True)

### Check the good dataloading and offset validity

In [9]:
for i in train_dl :
    print(i[0])
    break

tensor([[95,  0,  0,  ..., 71, 59,  1],
        [ 0,  0, 96,  ..., 59,  1, 51],
        [ 0, 96,  0,  ...,  1, 51, 59],
        ...,
        [70, 69,  1,  ..., 69,  1, 66],
        [69,  1, 70,  ...,  1, 66, 65],
        [ 1, 70, 55,  ..., 66, 65, 71]])


In [10]:
train_ds.set_offset(1)
next(iter(train_dl))

[tensor([[ 0,  0, 96,  ..., 59,  1, 51],
         [ 0, 96,  0,  ...,  1, 51, 59],
         [96,  0, 29,  ..., 51, 59,  1],
         ...,
         [69,  1, 70,  ...,  1, 66, 65],
         [ 1, 70, 55,  ..., 66, 65, 71],
         [70, 55, 69,  ..., 65, 71, 68]]),
 tensor([[ 0, 96,  0,  ...,  1, 51, 59],
         [96,  0, 29,  ..., 51, 59,  1],
         [ 0, 29, 58,  ..., 59,  1, 54],
         ...,
         [ 1, 70, 55,  ..., 66, 65, 71],
         [70, 55, 69,  ..., 65, 71, 68],
         [55, 69,  1,  ..., 71, 68,  1]])]

In [11]:
len(train_dl)

641

## Models

### Training part

In [12]:
class CharLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, num_layers=1):
        super(CharLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(input_size = embedding_dim, 
                            hidden_size = hidden_size, 
                            num_layers = num_layers, 
                            batch_first=True, dropout = 0.15)
        self.drop = nn.Dropout(p=0.15)
        self.ln = nn.LayerNorm(hidden_size)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, hidden):
        x = self.drop(self.embedding(x))              # (batch, seq, hidden_size)
        out, hidden = self.lstm(x, hidden)
#        out = self.ln(out)
        out = self.drop(out)
        logits = self.fc(out)                  
        return logits, hidden

    def init_hidden(self, batch_size, device):
        h0 = torch.zeros(self.num_layers, batch_size, self.hidden_size, device = device)
        c0 = torch.zeros(self.num_layers, batch_size, self.hidden_size, device = device)
        return (h0,c0)

In [13]:
device1 = torch.device("cuda:0")
device2 = torch.device("cuda:1")

In [14]:
embedding_dim = 128
hidden_size = 384
vocab_size = len(mapping)
num_epoch = 50

nb_step_train = len(train_dl)
nb_step_test = len(test_dl)

model = CharLSTM(vocab_size, embedding_dim, hidden_size, num_layers=3).to(device1)
model = torch.compile(model)

loss_fn = nn.CrossEntropyLoss(ignore_index = 102)

opti = torch.optim.AdamW(model.parameters(), lr=0.004, weight_decay=1e-4)
sched_warm = torch.optim.lr_scheduler.LinearLR(opti, start_factor=0.2, end_factor=1.0, total_iters=nb_step_train*4)
sched_post = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(opti, T_0=nb_step_train*5, T_mult=2, eta_min=2e-4) #1 epoch => 2 => 4 => 8

In [15]:
list_offset = []
l_tot = []

#Early stopping
early_stopping_count = 0
patience = 5

best_val = float("inf")

scaler = torch.amp.GradScaler()
hid = model.init_hidden(batch_size, device1)
hid_ = model.init_hidden(batch_size, device1)

for epoch in range(num_epoch) :
    #Offset the datas
    offset = np.random.randint(0, seq_length-1) #Set the offset
    train_ds.set_offset(offset); test_ds.set_offset(offset)

    model.train();
    hid = tuple(h.zero_() for h in hid);

    #Create loss per epoch
    l_train = 0.0
    l_test = 0.0
    
    for X,Y in iter(train_dl) :
        X = X.to(device1); Y= Y.to(device1, dtype=torch.long)
        opti.zero_grad(set_to_none=True)
        
        #Computation of model
        hid = tuple(h.detach() for h in hid)
        
        with torch.amp.autocast(device_type="cuda:0"):
            pred, hid = model(X, hid)
            loss = loss_fn(pred.view(-1, vocab_size), Y.view(-1))

        scaler.scale(loss).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
        scaler.step(opti) 
        scaler.update()
        l_train += loss.detach()

        #Scheduler part
        
        if sched_post.T_cur == 0 and epoch > 4:  #After warm restart decrease the max learning rate
            sched_post.base_lrs[0] = sched_post.base_lrs[0] * 0.8
            sched_post.eta_min = sched_post.eta_min * 1.5
            print(f"Decrease {sched_post.base_lrs[0]}, {sched_post.eta_min}")

        step_scheduler = sched_warm if epoch < 4 else sched_post
        step_scheduler.step()

    #Test data part
    
    model.eval()
    
    
    with torch.inference_mode():
        hid_ = tuple(h.zero_() for h in hid_);
        with torch.amp.autocast("cuda"):
            for X,Y in iter(test_dl) : 
                X = X.to(device1); Y= Y.to(device1, dtype=torch.long)
                hid_ = tuple(h.detach() for h in hid_)
                            
                pred, hid_ = model(X, hid_)
                loss = loss_fn(pred.view(-1, vocab_size), Y.view(-1))
                l_test += loss.detach()

        print(epoch,np.exp(l_train.item()/nb_step_train), np.exp(l_test.item()/nb_step_test),"\n")
     
        #Record the loss of the epoch
        l_tot.append(l_test.item()); 

        if l_test < best_val :
            best_val = l_test.item()
            early_stopping_count = 0
            torch.save({
                "epoch": epoch,
                "model_state_dict": model._orig_mod.state_dict(),
                "optimizer_state_dict": opti.state_dict(),
                "scheduler_state_dict": sched_post.state_dict(),
                "val_loss": l_test,
            }, "model.pt")
        
        elif l_test >= best_val :
            early_stopping_count += 1

#        if early_stopping_count == patience :
#            print("Early Stopping")
#            break 

print(f"Liste of offset used : {list_offset}")

0 8.996990051119065 5.589982534819884 

1 5.120284186339614 4.630063708498374 

2 4.461122284941358 4.333786918485035 

3 3.995945358317471 4.086505760633653 

4 3.5734178911286083 3.856873539672058 

5 3.4350564368284466 3.8145723745753175 

6 3.337989358036094 3.78325865404298 

7 3.277555258316564 3.770164860664598 

8 3.252615220890893 3.7598808642026893 

Decrease 0.0032, 0.00030000000000000003
9 3.2787619986844994 3.7705039356292014 

10 3.2401127950644364 3.769858904775695 

11 3.2208087915452794 3.7680572723581043 



KeyboardInterrupt: 

In [None]:
14

3.9