In [1]:
import torch
import torch.nn as nn
from torch.utils.data import random_split, Dataset, DataLoader

import math
import numpy as np
import pickle

In [2]:
with open("../../../Corpus/Encoding_RNN_LSTM/Char_level/encoding_map.pkl", "rb") as f:
    mapping = pickle.load(f)

mapping["PAD"] = len(mapping)

In [3]:
# Decode
int2char = {i: ch for ch, i in mapping.items()}
print(int2char)
nb_char = len(int2char)

{0: '\n', 1: ' ', 2: '!', 3: '$', 4: '%', 5: '&', 6: "'", 7: ')', 8: '+', 9: ',', 10: '-', 11: '.', 12: '/', 13: '0', 14: '1', 15: '2', 16: '3', 17: '4', 18: '5', 19: '6', 20: '7', 21: '8', 22: '9', 23: ':', 24: ';', 25: '?', 26: 'a', 27: 'b', 28: 'c', 29: 'd', 30: 'e', 31: 'f', 32: 'g', 33: 'h', 34: 'i', 35: 'j', 36: 'k', 37: 'l', 38: 'm', 39: 'n', 40: 'o', 41: 'p', 42: 'q', 43: 'r', 44: 's', 45: 't', 46: 'u', 47: 'v', 48: 'w', 49: 'x', 50: 'y', 51: 'z', 52: 'à', 53: 'â', 54: 'ç', 55: 'è', 56: 'é', 57: 'ê', 58: 'ë', 59: 'î', 60: 'ï', 61: 'ô', 62: 'ù', 63: 'û', 64: 'α', 65: 'β', 66: 'γ', 67: 'ε', 68: 'ζ', 69: 'η', 70: 'θ', 71: '€', 72: 'PAD'}


## Creation of the dataset

The corpus is a huge chunk of text but we need to split it in songs because we don't want that the model learn that there is a chance of having a text after the end of a song. Thiw will prevent learning non existing relationship in the dataset.

In [None]:
dataset_ = np.load("../../../Corpus/Encoding_RNN_LSTM/Char_level/corpora_encoded.npy","r")

result = []
for t in dataset_:
    if t == 64 : # α or beginning of a song
        current = []
        current.append(t)
    elif t == 70: # θ or end of a song
        current.append(t)
        result.append(torch.tensor(current))
    else :
        current.append(t)
if current:  
    result.append(torch.tensor(current))

In [None]:
class SongDataset(Dataset):
    def __init__(self, texts, length_seq, stride, pad_id=mapping["PAD"], use_offset=True):
        self.samples = []
        self.length_seq = length_seq
        self.stride = stride
        self.pad_id = pad_id

        for text in texts:

            L = len(text)
            offset = torch.randint(0, stride, (1,)).item() if use_offset else 0

            # --- Boucle principale ---
            for start in range(offset, max(1, L - self.length_seq - 1), self.stride):
                x_start, x_end = start, start + self.length_seq
                y_start, y_end = start + 1, start + 1 + self.length_seq

                x = text[x_start:x_end]
                y = text[y_start:y_end]

                # --- Padding uniforme ---
                def pad_to_len(seq, pad_id, target_len):
                    pad_len = target_len - len(seq)
                    if pad_len > 0:
                        seq = torch.cat([seq, torch.full((pad_len,), pad_id, dtype=seq.dtype)])
                    return seq

                x = pad_to_len(x, self.pad_id, self.length_seq)
                y = pad_to_len(y, self.pad_id, self.length_seq)

                self.samples.append((x, y))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        return self.samples[idx]

In [None]:
batch_size = 1024
seq_length = 256
stride = 16

In [None]:
#Then we can split between songs and not part of a song
len_train = int(len(result) * 0.85)
len_test = len(result) - len_train

train, test = random_split(result, [len_train, len_test])

#Use of stride 16 to limit the overlap between each sequence, that reduce training time and reduce overfit 
# and also because one character doesn't contain a lot of information in itself
train_ds = SongDataset(train,length_seq=seq_length, stride = stride, use_offset = True)
test_ds = SongDataset(test,length_seq=seq_length, stride = stride, use_offset = False)

#Shuffle False because a song is like a time series and cannot be shuffle randomly inside it. The position of each part has a meaning.
train_dl = DataLoader(train_ds, batch_size=batch_size, pin_memory=True, pin_memory_device="cuda:0", shuffle=False, drop_last=True)
test_dl = DataLoader(test_ds, batch_size=batch_size, pin_memory=True, pin_memory_device="cuda:0", shuffle=False, drop_last=True)

## Models

### Training part

In [None]:
class CharRNN(nn.Module):
    def __init__(self, vocab_size, emb_size, hidden_size, num_layers=1, dropout = 0):
        super(CharRNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        self.embedding = nn.Embedding(vocab_size, emb_size, padding_idx=72)
        self.rnn = nn.RNN(emb_size, hidden_size, num_layers, batch_first=True, dropout = dropout, nonlinearity ="relu")
        self.drop = nn.Dropout(p=dropout)
        self.ln = nn.LayerNorm(hidden_size)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, hidden):
        x = self.drop(self.embedding(x))
        out, hidden = self.rnn(x, hidden)
        out = self.drop(out)
        out = self.fc(out)                  
        return out, hidden

    def init_hidden(self, batch_size):
        return torch.zeros(self.num_layers, batch_size, self.hidden_size)

In [None]:
device1 = torch.device("cuda:0")

In [None]:
embedding_dim = 64
vocab_size = len(int2char)
hidden_size = 512
num_epoch = 50

nb_step_train = len(train_dl)
nb_step_test = len(test_dl)

model = CharRNN(vocab_size, embedding_dim, hidden_size, num_layers=2, dropout = 0.2).to(device1)
model = torch.compile(model)

loss_fn = nn.CrossEntropyLoss(ignore_index=72)

opti = torch.optim.AdamW(model.parameters(), lr=0.002, weight_decay=1e-4)
sched_warm = torch.optim.lr_scheduler.LinearLR(opti,start_factor=0.2,end_factor=1.0,total_iters=nb_step_train * 3)
sched_post = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(opti, T_0=nb_step_train*10, T_mult=2, eta_min=0.0002) 

### Functions for training

In [None]:
@torch.no_grad()
def evaluate_tf1(model, dl, loss_fn, device, vocab_size, bs = batch_size, sl = seq_length):
    """Validation with teacher forcing = 1 (parallel, fast). Returns (ppl, acc)."""
    model.eval()
    total_loss = 0.0
    total_tokens = 0
    correct = 0
    total = 0

    for X, Y in dl:
        X = X.to(device)
        Y = Y.to(device, dtype=torch.long)
        hid = model.init_hidden(bs, device1)

        with torch.amp.autocast(device_type="cuda"):
            pred, _ = model(X, hid) 
            loss = loss_fn(pred.view(-1, vocab_size), Y.view(-1))

        total_loss += loss.item() * bs * sl
        total_tokens += bs * sl

        pred_ids = pred.argmax(dim=-1)
        correct += (pred_ids == Y).sum().item()
        total += bs * sl

    ppl = math.exp(total_loss / max(1, total_tokens))
    acc = correct / max(1, total)
    return ppl, acc


@torch.no_grad()
def evaluate_free(model, dl, loss_fn, device, bs=batch_size, sl=seq_length):
    """
    Autoregressive validation (teacher forcing = 0).
    Steps one token at a time and feeds predictions back in.
    Returns ppl.
    """
    model.eval()
    total_loss = 0.0
    total_tokens = 0

    for X, Y in dl:
        X = X.to(device)
        Y = Y.to(device, dtype=torch.long)
        hid = model.init_hidden(bs, device1)

        # Start with the first input token
        inp = X[:, :1]  # (bs, 1)
        for t in range(sl):
            with torch.amp.autocast(device_type="cuda"):
                pred, hid = model(inp, hid)          # (bs, 1, vocab)
                logits = pred[:, -1, :]              # (bs, vocab)
                loss = loss_fn(logits, Y[:, t])      # CE over current step

            total_loss += loss.item() * bs
            total_tokens += bs

            # Greedy next-token to feed back in
            next_token = logits.argmax(dim=-1).unsqueeze(1)  # (bs, 1)
            inp = next_token

    ppl = math.exp(total_loss / max(1, total_tokens))
    return ppl

def sample_with_temp(logits, temp=1.0):
    probs = (logits / temp).softmax(dim=-1)
    next_token = torch.multinomial(probs, num_samples=1)
    return next_token

def distinct_n_chars(text, n=3):
    ngrams = [text[i:i+n] for i in range(len(text)-n+1)]
    return len(set(ngrams)) / max(1, len(ngrams))

To train the model, first I put :
- Every 2/3 epoch : teacher_forcing_ratio - 0.02

Then when teacher_forcing_ratio < 0.5 :
- Every epoch or 2 epoch : teacher_forcing_ratio - 0.01

I stop the teacher_forcing_ratio around 0.25/0.3

In [None]:
l_tot = []
bs = batch_size
sl = seq_length
teacher_forcing_ratio = 1

scaler = torch.amp.GradScaler()

for epoch in range(num_epoch):

    # --- Teacher Forcing ratio decay ---
    if epoch % 3 == 0 :
        teacher_forcing_ratio = max(0.0, min(1.0, teacher_forcing_ratio - 0.02))
        print(f"\nEpoch {epoch} | Teacher forcing ratio = {teacher_forcing_ratio:.2f}")

    model.train()

    # -------------- TRAIN LOOP --------------
    train_loss_sum = 0.0
    train_tokens = 0
    nb_step_train = 0

    for X, Y in iter(train_dl):
        hid = model.init_hidden(batch_size).to(device1)
        X = X.to(device1)
        Y = Y.to(device1, dtype=torch.long)
        opti.zero_grad(set_to_none=True)

        if teacher_forcing_ratio == 1.0:
            with torch.amp.autocast(device_type="cuda"):
                pred, hid = model(X, hid)
                loss = loss_fn(pred.view(-1, vocab_size), Y.view(-1))
        else:
            # ---- Pass 1: forward with TF=1  ----
            with torch.no_grad(), torch.amp.autocast(device_type="cuda"):
                pred_tf, _ = model(X, hid)   
            pred_tokens = pred_tf.argmax(dim=-1)  

            # ---- Random mask for TF < 1 ----
            mask = (torch.rand_like(X.float()) < teacher_forcing_ratio)
            X_mixed = torch.where(mask, X, pred_tokens)

            # ---- Pass 2: forward with partial TF ----
            with torch.amp.autocast(device_type="cuda"):
                pred, hid = model(X_mixed, hid)
                loss = loss_fn(pred.view(-1, vocab_size), Y.view(-1))

        scaler.scale(loss).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        scaler.step(opti)
        scaler.update()

        train_loss_sum += loss.detach().item() * bs * sl
        train_tokens += bs * sl
        nb_step_train += 1

        if epoch < 3:
            sched_warm.step()
        else:
            sched_post.step()

    train_ppl = math.exp(train_loss_sum / max(1, train_tokens))

    # -------------- VALIDATION --------------
    val_ppl_tf1, val_acc = evaluate_tf1(model, test_dl, loss_fn, device1, vocab_size)
    val_ppl_free = evaluate_free(model, test_dl, loss_fn, device1)

    print(
        f"Epoch {epoch} | "
        f"Train PPL: {train_ppl:.3f} | "
        f"Val PPL (TF=1): {val_ppl_tf1:.3f} | "
        f"Val PPL (free): {val_ppl_free:.3f} | "
        f"Val Acc: {val_acc:.3f}"
    )

    # --------- Sample generation + diversity metrics ---------
    model.eval()
    with torch.no_grad():
        # Warm up for generation
        start = X[0:1, :20]  
        hid_gen = model.init_hidden(1).to(device1)
        inp = start

        gen_chars = []
        for t in range(200):  
            pred, hid_gen = model(inp, hid_gen)
            logits = pred[:, -1, :]  
            next_char = sample_with_temp(logits, temp=0.6)
            gen_chars.append(int2char[next_char.item()])
            inp = next_char 

        gen_text = "".join(gen_chars)

    d2 = distinct_n_chars(gen_text, n=2)
    d3 = distinct_n_chars(gen_text, n=3)

    
    if epoch % 4 == 0 :
        print("\n=== Initial text ===")
        print("".join([int2char[i] for i in X[0:1,:].squeeze(0).tolist()]))
        print("\n=== Sample Generation ===")
        print(gen_text[:200])  
        print(f"Distinct-2: {d2:.3f} | Distinct-3: {d3:.3f}", end="\n")

    l_tot.append(val_acc)
    torch.save(
            {
                "epoch": epoch,
                "model_state_dict": model.state_dict(),
                "optimizer_state_dict": opti.state_dict(),
                "scheduler_state_dict": sched_post.state_dict(),
                "val_acc": val_acc,
            },
            "model",
    )

Epoch 100 | Train PPL: 7.216 | Val PPL (TF=1): 5.303 | Val PPL (free): 35.853 | Val Acc: 0.492

=== Initial text ===
, mama désolé
l'été, la vie est si belle, augmente les décibels
on fly-ye-ye
l'été la vie est si belle, augmente les décibels
faut qu'on s'taille-aille-aille
/ε
γ
après que le corps d'cette se frotte sur ma  aouh
j'sors d'ma douche, j'aime voir le sun qui 

=== Sample Generation ===
 le  porhhes se coouveent   temps
on s'aiit qu'j'vous fais  d'aassooiree, j'ai d'la horritt
    mes miss  maa cherché, je coois qu'aa  fait les pétages  qu'écrire le  vediss, j'ai l'aille 
je   temps 
Distinct-2: 0.497 | Distinct-3: 0.768
Epoch 101 | Train PPL: 7.221 | Val PPL (TF=1): 5.310 | Val PPL (free): 31.878 | Val Acc: 0.492

Epoch 102 | Teacher forcing ratio = 0.25
Epoch 102 | Train PPL: 7.220 | Val PPL (TF=1): 5.344 | Val PPL (free): 31.376 | Val Acc: 0.491
Epoch 103 | Train PPL: 7.221 | Val PPL (TF=1): 5.335 | Val PPL (free): 32.391 | Val Acc: 0.491
Epoch 104 | Train PPL: 7.220 | Va

KeyboardInterrupt: 

Epoch 15 | Teacher forcing ratio = 0.88
Epoch 15 | Train PPL: 5.163 | Val PPL (TF=1): 3.891 | Val PPL (free): 570.861 | Val Acc: 0.579

=== Sample Generation ===
omis j'ai pas d'amour est parti de la maison d'un peu d'couplet d'la maison d'un peu d'couplet d'la maison d'un peu d'couplet d'la maison d'un peu d'couplet d'la maison d'un peu d'couplet d'la maison 
Distinct-1: 0.085 | Distinct-2: 0.241 | Distinct-3: 0.313

Epoch 16 | Teacher forcing ratio = 0.86
Epoch 16 | Train PPL: 5.367 | Val PPL (TF=1): 3.916 | Val PPL (free): 546.126 | Val Acc: 0.577

=== Sample Generation ===
 s'en fout d'me faire des coups d'coups d'coups d'coups d'coups d'coups d'coups d'coups d'coups d'coups d'coups d'coups d'coups d'coups d'coups d'coups d'coups d'coups d'coups d'coups d'coups d'coups 
Distinct-1: 0.080 | Distinct-2: 0.136 | Distinct-3: 0.167