# This notebook will serve as a way to train character generation RNN

This code is used on Kaggle

In [None]:
import numpy as np
import pickle

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

In [2]:
with open("/kaggle/input/rnn-input/Global_mapping.pkl", "rb") as f:
    mapping = pickle.load(f)

# Data preparation

In [3]:
# Decode
int2char = {i: ch for ch, i in mapping.items()}
print(int2char)

nb_char = len(int2char)

{0: '\n', 1: ' ', 2: '!', 3: '$', 4: '%', 5: '&', 6: "'", 7: '+', 8: ',', 9: '-', 10: '.', 11: '/', 12: '0', 13: '1', 14: '2', 15: '3', 16: '4', 17: '5', 18: '6', 19: '7', 20: '8', 21: '9', 22: ':', 23: ';', 24: '<', 25: '>', 26: '?', 27: 'A', 28: 'B', 29: 'C', 30: 'D', 31: 'E', 32: 'F', 33: 'G', 34: 'H', 35: 'I', 36: 'J', 37: 'K', 38: 'L', 39: 'M', 40: 'N', 41: 'O', 42: 'P', 43: 'Q', 44: 'R', 45: 'S', 46: 'T', 47: 'U', 48: 'V', 49: 'W', 50: 'X', 51: 'Y', 52: 'Z', 53: 'a', 54: 'b', 55: 'c', 56: 'd', 57: 'e', 58: 'f', 59: 'g', 60: 'h', 61: 'i', 62: 'j', 63: 'k', 64: 'l', 65: 'm', 66: 'n', 67: 'o', 68: 'p', 69: 'q', 70: 'r', 71: 's', 72: 't', 73: 'u', 74: 'v', 75: 'w', 76: 'x', 77: 'y', 78: 'z', 79: '§', 80: 'À', 81: 'Ç', 82: 'É', 83: 'Ê', 84: 'Î', 85: 'Ô', 86: 'Ö', 87: 'Ü', 88: 'à', 89: 'á', 90: 'â', 91: 'ã', 92: 'ä', 93: 'æ', 94: 'ç', 95: 'è', 96: 'é', 97: 'ê', 98: 'ë', 99: 'í', 100: 'î', 101: 'ï', 102: 'ñ', 103: 'ô', 104: 'ö', 105: 'ù', 106: 'û', 107: 'ü', 108: 'Ā', 109: 'ā', 110: 'ğ'

## Creation of the dataset

To increase the randomness during the training :

For each epoch the entire corpus will have a random specific offset value in order that the model during training doesn't see the exact same text during X epochs.

In [4]:
class CharDataset(Dataset) :
    def __init__(self,text,length_seq) :
        self.text = text
        self.length_seq = length_seq
        self.max_start = len(self.text) - length_seq - 1
        self.offset = 0
        
    def set_offset(self, offset) :
        self.offset = offset
        
    def __len__(self) :
        return self.max_start

    def __getitem__(self, i) :
        s = (i + self.offset) % self.max_start
        x = torch.from_numpy(self.text[s:s+self.length_seq])
        y = torch.from_numpy(self.text[s+1:s+self.length_seq+1])
        return x, y

In [5]:
seq_length = 250
dataset_ = np.load("/kaggle/input/rnn-input/corpus_encoded.npy","r")
len_train = int(len(dataset_)*0.9)
train = dataset_[:len_train]
test = dataset_[len_train:]

In [6]:
train_ds = CharDataset(train,length_seq=250)
test_ds = CharDataset(test,length_seq=250)

In [7]:
print(train_ds[0][0])
print("".join([int2char[i] for i in train_ds[0][0][:seq_length].numpy()]))

tensor([24, 28, 31, 33, 35, 40, 40, 35, 40, 33, 25,  0, 24, 29, 41, 47, 42, 38,
        31, 46, 25,  0, 36,  6, 53, 61, 65, 57,  1, 68, 53, 71,  1, 64, 57, 71,
         1, 54, 57, 73, 70, 57, 72, 72, 57, 71,  1, 56, 67, 66, 55,  1, 58, 53,
        73, 72,  1, 69, 73,  6, 55, 57, 71,  1, 68, 67, 73, 58, 58, 61, 53, 71,
        71, 57, 71,  1, 65,  6, 57, 66, 72, 57, 66, 56, 57, 66, 72,  0, 51,  1,
        53,  1, 69, 73,  6, 56, 53, 66, 71,  1, 27, 64, 53, 56, 56, 61, 66,  1,
        69, 73, 57,  1, 72,  6, 57, 66,  1, 74, 57, 70, 70, 53, 71,  1, 73, 66,
        57,  1, 63, 61, 58, 58, 57, 70,  1, 71, 73, 70,  1, 73, 66,  1, 65, 57,
        66, 56, 61, 53, 66, 72,  0, 51,  1, 53,  1, 68, 64, 73, 71,  1, 56,  6,
        70, 57, 71, 68, 57, 55, 72,  1, 57, 66, 72, 70, 57,  1, 64, 57, 71,  1,
        56, 53, 70, 67, 66, 66, 57, 71,  8,  1, 64, 57, 71,  1, 68, 57, 72, 61,
        72, 57, 71,  1, 56,  6, 13, 18,  1, 68, 61, 59, 57, 71,  0, 40, 67, 70,
        65, 53, 64,  1, 69, 73, 57,  1, 

  x = torch.from_numpy(self.text[s:s+self.length_seq])


## Creation of Dataloader and co

200 context for previous model

In [8]:
batch_size = 512

train_dl = DataLoader(train_ds, batch_size=batch_size, pin_memory=True, pin_memory_device="cuda:0", 
                        num_workers=4, prefetch_factor=4, shuffle=False, drop_last=True) #Shuffle False because we need the RNN to use previous sequences data to predict next one
test_dl = DataLoader(test_ds, batch_size=batch_size, pin_memory=True, pin_memory_device="cuda:0", 
                       num_workers=2, prefetch_factor=2, shuffle=False, drop_last=True)

### Check the good dataloading and offset validity

In [9]:
for i in train_dl :
    print(i[0])
    break

tensor([[24, 28, 31,  ..., 66,  0, 45],
        [28, 31, 33,  ...,  0, 45, 57],
        [31, 33, 35,  ..., 45, 57,  1],
        ...,
        [57,  1, 68,  ...,  1, 73, 66],
        [ 1, 68, 70,  ..., 73, 66,  1],
        [68, 70, 61,  ..., 66,  1, 71]])


In [10]:
train_ds.set_offset(1)
next(iter(train_dl))

[tensor([[28, 31, 33,  ...,  0, 45, 57],
         [31, 33, 35,  ..., 45, 57,  1],
         [33, 35, 40,  ..., 57,  1, 68],
         ...,
         [ 1, 68, 70,  ..., 73, 66,  1],
         [68, 70, 61,  ..., 66,  1, 71],
         [70, 61, 71,  ...,  1, 71, 55]]),
 tensor([[31, 33, 35,  ..., 45, 57,  1],
         [33, 35, 40,  ..., 57,  1, 68],
         [35, 40, 40,  ...,  1, 68, 70],
         ...,
         [68, 70, 61,  ..., 66,  1, 71],
         [70, 61, 71,  ...,  1, 71, 55],
         [61, 71, 67,  ..., 71, 55, 60]])]

Seems to work

In [11]:
len(train_dl)

1742

## Models

### Training part

In [12]:
class CharRNN(nn.Module):
    def __init__(self, vocab_size, hidden_size, num_layers=1):
        super(CharRNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        self.embedding = nn.Embedding(vocab_size, hidden_size)
        self.rnn = nn.RNN(hidden_size, hidden_size, num_layers, batch_first=True, dropout = 0.15, nonlinearity ="relu")
        self.drop = nn.Dropout(p=0.15)
        self.ln = nn.LayerNorm(hidden_size)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, hidden):
        x = self.embedding(x)              # (batch, seq, hidden_size)
        x = self.drop(x)
        out, hidden = self.rnn(x, hidden)
        out = self.ln(out)
        out = self.drop(out)
        out = self.fc(out)                  
        return out, hidden

    def init_hidden(self, batch_size):
#        return torch.zeros(self.num_layers, batch_size, self.hidden_size)
        return torch.randn(self.num_layers, batch_size, self.hidden_size)

In [13]:
device1 = torch.device("cuda:0")
device2 = torch.device("cuda:1")

In [14]:
vocab_size = len(int2char)
hidden_size = 250
num_epoch = 100

nb_step_train = len(train_dl)
nb_step_test = len(test_dl)

model = CharRNN(vocab_size, hidden_size, num_layers=3)   

model.to(device1)

loss_fn = nn.CrossEntropyLoss()

opti = torch.optim.AdamW(model.parameters(), lr=0.0015, weight_decay=1e-3)
sched_warm = torch.optim.lr_scheduler.LinearLR(opti, start_factor=0.2, end_factor=1.0, total_iters=nb_step_train)
sched_post = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(opti, T_0=len(train_dl)*10, T_mult=2, eta_min=0.0001) #1 epoch => 2 => 4 => 8

In [15]:
list_offset = []
l_tot = []

#Early stopping
early_stopping_count = 0
patience = 5

best_val = float("inf")

scaler = torch.amp.GradScaler()

hid = model.init_hidden(batch_size).to(device1)

hid_ = model.init_hidden(batch_size).to(device1)

for epoch in range(num_epoch) :
    
    offset = np.random.randint(0, seq_length-1) #Set the offset
    list_offset.append(offset) #Keep in memory

    #Offset the datas
    train_ds.set_offset(offset); test_ds.set_offset(offset)

    model.train();
    hid.zero_();

    #Create loss per epoch
    l_train = 0.0
    l_test = 0.0
    
    for X,Y in iter(train_dl) :
        X = X.to(device1); Y= Y.to(device1)
        opti.zero_grad(set_to_none=True)
        
        #Computation of model
        hid = hid.detach();
        
        with torch.amp.autocast(device_type="cuda:0"):
            pred, hid = model(X, hid)
            loss = loss_fn(pred.view(-1, vocab_size), Y.view(-1))

        scaler.scale(loss).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.25)
        scaler.step(opti); scaler.update()
        l_train += loss.detach()

        #Scheduler part
        #Warm start
        
        if sched_post.T_cur == 0 and epoch > 1:  #After warm restart decrease the max learning rate
            sched_post.base_lrs[0] = sched_post.base_lrs[0] * 0.6
            sched_post.eta_min = sched_post.eta_min * 1.25
            print(f"Decrease {sched_post.base_lrs[0]}, {sched_post.eta_min}")

        step_scheduler = sched_warm if epoch == 0 else sched_post
        step_scheduler.step()

    #Test data part
    
    model.eval()
    
    
    with torch.inference_mode():
        hid_.zero_()  
        with torch.amp.autocast("cuda"):
            for X,Y in iter(test_dl) : 
                X = X.to(device1); Y= Y.to(device1)
                hid_ = hid_.detach()
                            
                pred, hid_ = model(X, hid_)
                loss = loss_fn(pred.view(-1, vocab_size), Y.view(-1))
                l_test += loss.detach()

        print(epoch,np.exp(l_test.item()/nb_step_test),"\n")
     
        #Record the loss of the epoch
        l_tot.append(l_test.item()); 

        if l_test < best_val :
            best_val = l_test.item()
            early_stopping_count = 0
            torch.save({
                "epoch": epoch,
                "model_state_dict": model.state_dict(),
                "optimizer_state_dict": opti.state_dict(),
                "scheduler_state_dict": sched_post.state_dict(),
                "val_loss": l_test,
            }, "model")
        
        elif l_test >= best_val :
            early_stopping_count += 1

        if early_stopping_count == patience :
            print("Early Stopping")
            break 

print(f"Liste of offset used : {list_offset}")

0 6.213998203017805 

1 5.102510106735906 

2 4.956317091198305 

3 4.770973074054662 

4 4.708132925428319 

5 4.657053951892793 

6 4.619699820159317 

7 4.5934330493870545 

8 4.579322688367851 

9 4.573801917823297 

10 4.571850364942143 

Decrease 0.0009, 0.000125
11 4.571408687727923 

12 4.5476968784192815 

13 4.52594190559593 

14 4.504352967062929 

15 4.4835319505337985 

16 4.465478933123772 

17 4.451887632472756 

18 4.437950322360753 

19 4.424950749387633 

20 4.414524484453296 

21 4.406728053109271 

22 4.396226549763397 

23 4.3889988618282985 

24 4.381038298683579 

25 4.3769592717581505 

26 4.372171215927915 

27 4.368648199114715 

28 4.367300004398819 

29 4.365718202070801 

30 4.363397283750461 

Decrease 0.00054, 0.00015625
31 4.369227802078988 

32 4.362731532957022 

33 4.357066072132486 

34 4.349843614841055 

35 4.345079037963747 

36 4.340306640244396 

37 4.3328070781560895 

38 4.328477958927817 

39 4.321076042924992 

40 4.3163429764467 

41 4.3137

KeyboardInterrupt: 