In [1]:
pip install datasets

Collecting pyarrow>=21.0.0 (from datasets)
  Downloading pyarrow-22.0.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (3.2 kB)
Downloading pyarrow-22.0.0-cp311-cp311-manylinux_2_28_x86_64.whl (47.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m39.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: pyarrow
  Attempting uninstall: pyarrow
    Found existing installation: pyarrow 19.0.1
    Uninstalling pyarrow-19.0.1:
      Successfully uninstalled pyarrow-19.0.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigframes 2.12.0 requires google-cloud-bigquery-storage<3.0.0,>=2.30.0, which is not installed.
pylibcudf-cu12 25.2.2 requires pyarrow<20.0.0a0,>=14.0.0; platform_machine == "x86_64", but you have pyarrow 22.0.0 which is incompatible.
cudf-cu12 25.2.2 requires pyarrow<20.0.0

# Chargement et exploration des données

In [2]:
from datasets import load_dataset
import json, os

# Fetch the dataset; it exposes two splits: train / validation.
ds = load_dataset("sander-wood/irishman")
os.makedirs("irishman", exist_ok=True)

# Dump each split as a plain JSON list of records, one file per split.
split_files = {
    "train": "irishman/train.json",
    "validation": "irishman/validation.json",
}
for split_name, json_path in split_files.items():
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(list(ds[split_name]), f, ensure_ascii=False)

print("OK -> irishman/train.json & irishman/validation.json")

README.md: 0.00B [00:00, ?B/s]

train.json:   0%|          | 0.00/80.0M [00:00<?, ?B/s]

validation.json: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/214122 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2162 [00:00<?, ? examples/s]

OK -> irishman/train.json & irishman/validation.json


# Étape 1 : Exploration des données

a) Caractères uniques (train) : 
On a extrait l’ensemble des caractères présents dans toutes les partitions ABC du jeu d’entraînement (lettres, chiffres, symboles, espaces, retours à la ligne, etc.).

b) Nombre de caractères uniques :
Il y a 95 caractères uniques dans le dataset d’entraînement.

c) Pourquoi utiliser des indices au lieu des caractères ? : 
Parce qu’un modèle (PyTorch) ne traite que des valeurs numériques : on convertit chaque caractère en index (id) pour pouvoir l’encoder (via embedding/one-hot) et apprendre à prédire le caractère suivant.

In [3]:
import json

# Read both JSON dumps back as plain Python lists of dicts.
with open("irishman/train.json", "r", encoding="utf-8") as train_file:
    train_data = json.load(train_file)

with open("irishman/validation.json", "r", encoding="utf-8") as val_file:
    val_data = json.load(val_file)

print("Nb chansons train :", len(train_data))
print("Nb chansons val   :", len(val_data))

# Field names are read from the first training record.
example_keys = list(train_data[0].keys())
print("\nClés disponibles dans un exemple :", example_keys)

Nb chansons train : 214122
Nb chansons val   : 2162

Clés disponibles dans un exemple : ['abc notation', 'control code']


In [5]:
def find_abc_key(example: dict):
    """Return the key of ``example`` under which the ABC notation is stored.

    Matching is case-insensitive. The first key containing both "abc" and
    "notation" wins; if there is none, the first key containing "abc" is used.

    Raises:
        ValueError: if no key contains "abc".
    """
    keys = list(example.keys())

    # Preferred: a key mentioning both "abc" and "notation".
    preferred = next(
        (k for k in keys if "abc" in k.lower() and "notation" in k.lower()),
        None,
    )
    if preferred is not None:
        return preferred

    # Fallback: any key mentioning "abc".
    fallback = next((k for k in keys if "abc" in k.lower()), None)
    if fallback is not None:
        return fallback

    raise ValueError(f"Aucune clé ABC trouvée. Clés: {keys}")

# Resolve the text field once from the first record, then look at that tune.
abc_key = find_abc_key(train_data[0])
print("abc_key =", abc_key)

first_song = train_data[0][abc_key]
print("\n--- Première chanson (texte brut) ---\n")
print(first_song)

# Same tune again, line by line, capped at the first 15 lines.
print("\n--- Aperçu lignes (structure) ---\n")
preview_lines = first_song.splitlines()[:15]
for line in preview_lines:
    print(line)

abc_key = abc notation

--- Première chanson (texte brut) ---

X:1
L:1/8
M:4/4
K:Emin
|: E2 EF E2 EF | DEFG AFDF | E2 EF E2 B2 |1 efe^d e2 e2 :|2 efe^d e3 B |: e2 ef g2 fe | 
 defg afdf |1 e2 ef g2 fe | efe^d e3 B :|2 g2 bg f2 af | efe^d e2 e2 ||

--- Aperçu lignes (structure) ---

X:1
L:1/8
M:4/4
K:Emin
|: E2 EF E2 EF | DEFG AFDF | E2 EF E2 B2 |1 efe^d e2 e2 :|2 efe^d e3 B |: e2 ef g2 fe | 
 defg afdf |1 e2 ef g2 fe | efe^d e3 B :|2 g2 bg f2 af | efe^d e2 e2 ||


In [6]:
# Collect every distinct character that occurs in the training tunes.
unique_chars = set()
for record in train_data:
    # set.update accepts any iterable, so the string can be passed directly.
    unique_chars.update(record[abc_key])

print("Nb caractères uniques (train) :", len(unique_chars))
print("Exemple de caractères :", sorted(unique_chars)[:80])

Nb caractères uniques (train) : 95
Exemple de caractères : ['\n', ' ', '!', '"', '#', '$', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', '\\', ']', '^', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o']


# Étape 2 : Mapping

In [7]:
# Step 2 — character <-> index mapping

# Built from unique_chars (step 1). Sorting fixes a stable order so the
# indices are reproducible from one run to the next.
idx2char = sorted(unique_chars)                        # index -> char
char2idx = dict(zip(idx2char, range(len(idx2char))))   # char -> index

print("Taille vocabulaire :", len(idx2char))
print("Exemple mapping char2idx :", {ch: char2idx[ch] for ch in idx2char[:10]})
print("Exemple mapping idx2char :", [idx2char[pos] for pos in range(10)])

Taille vocabulaire : 95
Exemple mapping char2idx : {'\n': 0, ' ': 1, '!': 2, '"': 3, '#': 4, '$': 5, '&': 6, "'": 7, '(': 8, ')': 9}
Exemple mapping idx2char : ['\n', ' ', '!', '"', '#', '$', '&', "'", '(', ')']


	•	a) char2idx : dictionnaire qui associe chaque caractère à un index unique.
	•	b) idx2char : liste qui permet de retrouver le caractère à partir de son index (idx2char[i]).

# Étape 3 : Vectorisation des chaînes

In [8]:
# Étape 3 — Vectorisation des chaînes

def vectorize_string(s: str, char2idx: dict):
    """Map every character of ``s`` to its index in ``char2idx``.

    Returns a list with one index per character, in order. A character
    missing from ``char2idx`` raises ``KeyError``.
    """
    indices = []
    for ch in s:
        indices.append(char2idx[ch])
    return indices

# Sanity check on the first training tune: encode it, then decode a prefix back.
first_song = train_data[0][abc_key]
vec = vectorize_string(first_song, char2idx)

print("Longueur texte :", len(first_song))
print("Longueur vect  :", len(vec))
print("Début (indices):", vec[:50])

# Decoding through idx2char should give back the original text.
decoded_prefix = "".join(idx2char[i] for i in vec[:200])
print("Début (reconstruit):", decoded_prefix)

Longueur texte : 183
Longueur vect  : 183
Début (indices): [56, 26, 17, 0, 44, 26, 17, 15, 24, 0, 45, 26, 20, 15, 20, 0, 43, 26, 37, 77, 73, 78, 0, 92, 26, 1, 37, 18, 1, 37, 38, 1, 37, 18, 1, 37, 38, 1, 92, 1, 36, 37, 38, 39, 1, 33, 38, 36, 38, 1]
Début (reconstruit): X:1
L:1/8
M:4/4
K:Emin
|: E2 EF E2 EF | DEFG AFDF | E2 EF E2 B2 |1 efe^d e2 e2 :|2 efe^d e3 B |: e2 ef g2 fe | 
 defg afdf |1 e2 ef g2 fe | efe^d e3 B :|2 g2 bg f2 af | efe^d e2 e2 ||


La vectorisation transforme chaque caractère de la partition ABC en un indice numérique selon char2idx. On obtient ainsi une séquence d’entiers exploitable par le modèle (Embedding + LSTM).

# Étape 4 : Padding des séquences

In [9]:
# Step 4a — length (in characters) of the longest training tune
song_lengths = (len(record[abc_key]) for record in train_data)
max_len = max(song_lengths)
print("Longueur maximale (train) :", max_len)

Longueur maximale (train) : 2968


In [10]:
# Étape 4b — Padding / Troncature

def pad_or_truncate(s: str, max_len: int, pad_char: str = " "):
    """Bring ``s`` to the target length ``max_len``.

    Strings shorter than ``max_len`` get ``pad_char`` appended once per
    missing character; all other strings are cut to ``s[:max_len]``.
    """
    missing = max_len - len(s)
    if missing <= 0:
        return s[:max_len]
    return s + pad_char * missing

# Quick check on the first tune: pad it to max_len and look at the tail.
s = train_data[0][abc_key]
s2 = pad_or_truncate(s, max_len)

print("Avant :", len(s))
print("Après :", len(s2))

# repr() makes the trailing padding characters visible.
tail = s2[-50:]
print("Derniers caractères (après) :", repr(tail))

Avant : 183
Après : 2968
Derniers caractères (après) : '                                                  '


	•	a) On calcule la longueur maximale pour connaître la taille cible commune.
	•	b) Le padding ajoute des espaces aux séquences courtes, et la troncature coupe les séquences trop longues, afin de pouvoir créer des batches (tensors) de même dimension.

# Création du dataset PyTorch

**Étape 1 — Tout regrouper dans une fonction “prepare_data”**

In [11]:
import torch
from torch.utils.data import Dataset, DataLoader

def prepare_data(train_data, val_data, abc_key, char2idx, max_len, pad_char=" "):
    """Encode both splits as fixed-length index tensors.

    For every record, the text under ``abc_key`` is padded with ``pad_char``
    (when shorter than ``max_len``) or cut to ``max_len``, then each character
    is looked up in ``char2idx``.

    Returns:
        ``(train_seqs, val_seqs)``: two lists of 1-D ``torch.long`` tensors,
        one tensor per record, in input order.

    Raises:
        KeyError: if a character (including padding) is missing from ``char2idx``.
    """
    def encode(text: str):
        # Pad short texts, truncate the others, then map characters to indices.
        missing = max_len - len(text)
        fixed = text + pad_char * missing if missing > 0 else text[:max_len]
        return torch.tensor([char2idx[ch] for ch in fixed], dtype=torch.long)

    def encode_split(records):
        return [encode(record[abc_key]) for record in records]

    train_seqs = encode_split(train_data)
    val_seqs = encode_split(val_data)
    return train_seqs, val_seqs

**Étape 2 — Classe MusicDataset + DataLoader (batch=8 pour vérifier)**

In [12]:
class MusicDataset(Dataset):
    """Next-character dataset over already-encoded sequences.

    Item ``i`` is the pair ``(x, y)`` where ``x`` is sequence ``i`` without its
    last element and ``y`` is the same sequence without its first element,
    so ``y[t]`` is the element that follows ``x[t]``.
    """

    def __init__(self, sequences):
        # Indexable collection of encoded sequences (one per tune).
        self.sequences = sequences

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        tokens = self.sequences[idx]
        # Inputs drop the last position, targets drop the first (shift by one).
        return tokens[:-1], tokens[1:]

# Encode both splits, then wrap each one as a dataset.
train_seqs, val_seqs = prepare_data(train_data, val_data, abc_key, char2idx, max_len)
train_ds, val_ds = MusicDataset(train_seqs), MusicDataset(val_seqs)

# Small loaders (batches of 8) used only to eyeball shapes and the shift.
train_loader_check = DataLoader(train_ds, batch_size=8, shuffle=True)
val_loader_check   = DataLoader(val_ds, batch_size=8, shuffle=False)

xb, yb = next(iter(train_loader_check))
print("x batch shape:", xb.shape)  # [8, L-1]
print("y batch shape:", yb.shape)  # [8, L-1]

# y should be x shifted left by one position.
print("Exemple x[0][:20]:", xb[0, :20])
print("Exemple y[0][:20]:", yb[0, :20])

x batch shape: torch.Size([8, 2967])
y batch shape: torch.Size([8, 2967])
Exemple x[0][:20]: tensor([56, 26, 18, 17, 16, 18, 25, 16,  0, 44, 26, 17, 15, 24,  0, 45, 26, 22,
        15, 24])
Exemple y[0][:20]: tensor([26, 18, 17, 16, 18, 25, 16,  0, 44, 26, 17, 15, 24,  0, 45, 26, 22, 15,
        24,  0])


Le dataset renvoie (x, y), où y est la même séquence que x mais décalée d’un caractère, afin d’apprendre à prédire le caractère suivant.


**2) Implémentation du modèle LSTM**

In [13]:
!pip -q uninstall -y tensorboard tensorboard-data-server protobuf
!pip -q install tensorboard==2.15.2 protobuf==3.20.3

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.5/5.5 MB[0m [31m53.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m162.1/162.1 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.6/6.6 MB[0m [31m86.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigframes 2.12.0 requires google-cloud-bigquery-storage<3.0.0,>=2.30.0, which is not installed.
cudf-cu12 25.2.2 requires pyarrow<20.0.0a0,>=14.0.0; platform_machine == "x86_64", but you have pyarrow 22.0.0 which is incompatible.
opentelemetry-proto 1.37.0 requires protobuf<7.0,>=5.0, but you have protobuf 3.20.3 which is incompatible.
onnx 1.18.0 requires protobuf>=4.25.1, but you have protobuf 3.20.3 which is incompa

In [14]:
import torch.nn as nn

class MusicRNN(nn.Module):
    """Character-level model: Embedding -> LSTM (batch_first) -> Linear.

    ``forward`` maps index batches ``x`` of shape [B, T] to logits of shape
    [B, T, vocab_size] and also returns the LSTM state, which can be passed
    back in as ``hidden`` to continue a sequence.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_size):
        super().__init__()
        self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_size, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_size, out_features=vocab_size)

    def forward(self, x, hidden=None):
        embedded = self.embed(x)                                # [B, T, E]
        lstm_out, next_hidden = self.lstm(embedded, hidden)     # [B, T, H]
        return self.fc(lstm_out), next_hidden                   # [B, T, V], state

**Boucle d’entraînement (TensorBoard + Early stopping + save best)**

In [15]:
from torch.utils.tensorboard import SummaryWriter
import torch.optim as optim
import os

def accuracy_from_logits(logits, y):
    """Return the fraction of positions whose arg-max class equals the target.

    ``logits`` is indexed [B, T, V] and ``y`` [B, T]; the arg-max is taken over
    the last axis and the mean over all positions is returned as a Python float.
    """
    hits = logits.argmax(dim=-1).eq(y)
    return hits.float().mean().item()

def train_model(model, train_ds, val_ds, num_training_iterations=3000,
                batch_size=256, learning_rate=5e-3,
                patience=3, log_dir="runs/music_rnn", save_path="best_music_rnn.pt"):
    """Train ``model`` on next-token prediction with validation and early stopping.

    Each epoch runs optimizer steps over ``train_ds`` (stopping once
    ``num_training_iterations`` total steps are reached), then evaluates on
    ``val_ds``, logs loss/accuracy to TensorBoard and saves the state_dict
    whenever the validation loss reaches a new minimum.

    Args:
        model: module whose forward returns ``(logits, hidden)``, logits [B, T, V].
        train_ds: dataset of ``(x, y)`` index tensors used for optimisation.
        val_ds: dataset of ``(x, y)`` index tensors used for validation.
        num_training_iterations: upper bound on optimizer steps over all epochs.
        batch_size: batch size of both data loaders.
        learning_rate: learning rate passed to Adam.
        patience: number of consecutive epochs without a new best validation
            loss after which training stops.
        log_dir: TensorBoard log directory.
        save_path: destination of the best state_dict.

    Returns:
        ``save_path``. It is returned even when no checkpoint was written
        (validation loss never improved); a warning is printed in that case.

    NOTE(review): loss and accuracy average over every target position. If the
    datasets hold padded sequences (as ``prepare_data`` in this notebook
    builds them), padding positions are scored too — confirm this is intended.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
    val_loader   = DataLoader(val_ds, batch_size=batch_size, shuffle=False)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Number of epochs needed to reach the step budget (ceil division).
    steps_per_epoch = len(train_loader)
    max_epochs = (num_training_iterations + steps_per_epoch - 1) // steps_per_epoch

    best_val_loss = float("inf")
    bad_epochs = 0
    global_step = 0
    checkpoint_written = False

    writer = SummaryWriter(log_dir=log_dir)
    try:
        for epoch in range(1, max_epochs + 1):
            # ---- Train ----
            model.train()
            train_loss_sum, train_acc_sum, n_train = 0.0, 0.0, 0

            for xb, yb in train_loader:
                # Stop mid-epoch once the step budget is used up.
                if global_step >= num_training_iterations:
                    break

                xb, yb = xb.to(device), yb.to(device)

                optimizer.zero_grad()
                logits, _ = model(xb)

                # CrossEntropy expects [B*T, V] logits against [B*T] targets.
                loss = criterion(logits.reshape(-1, logits.size(-1)), yb.reshape(-1))
                loss.backward()
                optimizer.step()

                acc = accuracy_from_logits(logits, yb)

                # Weight by batch size so a smaller last batch is averaged correctly.
                bs = xb.size(0)
                train_loss_sum += loss.item() * bs
                train_acc_sum  += acc * bs
                n_train += bs

                global_step += 1

            train_loss = train_loss_sum / max(1, n_train)
            train_acc  = train_acc_sum  / max(1, n_train)

            # ---- Validation ----
            model.eval()
            val_loss_sum, val_acc_sum, n_val = 0.0, 0.0, 0

            with torch.no_grad():
                for xb, yb in val_loader:
                    xb, yb = xb.to(device), yb.to(device)
                    logits, _ = model(xb)
                    loss = criterion(logits.reshape(-1, logits.size(-1)), yb.reshape(-1))
                    acc = accuracy_from_logits(logits, yb)

                    bs = xb.size(0)
                    val_loss_sum += loss.item() * bs
                    val_acc_sum  += acc * bs
                    n_val += bs

            val_loss = val_loss_sum / max(1, n_val)
            val_acc  = val_acc_sum  / max(1, n_val)

            # TensorBoard logs (one point per epoch).
            writer.add_scalar("loss/train", train_loss, epoch)
            writer.add_scalar("loss/val",   val_loss,   epoch)
            writer.add_scalar("acc/train",  train_acc,  epoch)
            writer.add_scalar("acc/val",    val_acc,    epoch)

            print(f"Epoch {epoch}/{max_epochs} | train_loss={train_loss:.4f} val_loss={val_loss:.4f} | train_acc={train_acc:.4f} val_acc={val_acc:.4f}")

            # Early stopping + best-checkpoint save.
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                bad_epochs = 0
                torch.save(model.state_dict(), save_path)
                checkpoint_written = True
            else:
                bad_epochs += 1
                if bad_epochs >= patience:
                    print("Early stopping déclenché.")
                    break
    finally:
        # Flush and release the event file even if training raised.
        writer.close()

    if checkpoint_written:
        print("Best model saved ->", save_path)
    else:
        # Happens when the validation loss never compares below +inf
        # (e.g. it is NaN): nothing was saved by this call.
        print("WARNING: validation loss never improved; no checkpoint written to", save_path)
    return save_path

2025-12-24 11:54:39.900721: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1766577280.135348      47 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1766577280.195317      47 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [16]:
# Model hyperparameters; the vocabulary size comes from the mapping built earlier.
vocab_size = len(idx2char)
embedding_dim, hidden_size = 128, 512

model = MusicRNN(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    hidden_size=hidden_size,
)

# Train and keep the path of the best checkpoint.
training_options = dict(
    num_training_iterations=3000,
    batch_size=32,
    learning_rate=5e-3,
    patience=3,
    save_path="best_music_rnn.pt",
)
best_path = train_model(model, train_ds, val_ds, **training_options)

Epoch 1/1 | train_loss=0.1122 val_loss=0.0950 | train_acc=0.9645 val_acc=0.9692
Best model saved -> best_music_rnn.pt


In [17]:
import torch

# Rebuild the architecture on the available device, then restore the saved weights.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = MusicRNN(vocab_size=len(idx2char), embedding_dim=embedding_dim, hidden_size=hidden_size)
model = model.to(device)

best_state = torch.load("best_music_rnn.pt", map_location=device)
model.load_state_dict(best_state)
model.eval()

MusicRNN(
  (embed): Embedding(95, 128)
  (lstm): LSTM(128, 512, batch_first=True)
  (fc): Linear(in_features=512, out_features=95, bias=True)
)

In [18]:
import torch.nn.functional as F

def generate_music(model, start_seq, char2idx, idx2char, length=200, temperature=1.0):
    """Sample a continuation of ``start_seq`` from ``model``, one character at a time.

    The seed is encoded with ``char2idx``. The model is primed on every seed
    character except the last; the last one is the first input of the sampling
    loop. Each step divides the logits by ``temperature``, applies a softmax
    and draws the next index with ``torch.multinomial``.

    Args:
        model: module called as ``model(x, hidden) -> (logits, hidden)`` with
            ``x`` of shape [1, T] and logits of shape [1, T, V].
        start_seq: non-empty seed text; every character must be in ``char2idx``.
        char2idx: character -> index mapping.
        idx2char: index -> character sequence.
        length: number of characters to generate after the seed.
        temperature: softmax temperature, clamped below at 1e-6.

    Returns:
        The seed followed by ``length`` generated characters.

    Raises:
        ValueError: if ``start_seq`` is empty.
        KeyError: if ``start_seq`` contains a character missing from ``char2idx``.
    """
    if not start_seq:
        raise ValueError("start_seq must contain at least one character")

    device = next(model.parameters()).device
    model.eval()

    # Encode the seed.
    start_ids = [char2idx[c] for c in start_seq]
    generated = list(start_ids)

    hidden = None

    with torch.no_grad():
        # Prime the hidden state on the seed WITHOUT its last character. That
        # character is fed by the first loop iteration below; running the full
        # seed here as well would advance the LSTM twice on it.
        if len(start_ids) > 1:
            prefix = torch.tensor([start_ids[:-1]], dtype=torch.long, device=device)  # [1, T-1]
            _, hidden = model(prefix, hidden)

        last_id = start_ids[-1]

        for _ in range(length):
            inp = torch.tensor([[last_id]], dtype=torch.long, device=device)  # [1, 1]
            logits, hidden = model(inp, hidden)  # logits: [1, 1, V]
            logits = logits[0, 0]  # [V]

            # Temperature scaling, then probabilities.
            logits = logits / max(temperature, 1e-6)
            probs = F.softmax(logits, dim=-1)

            # Sample the next character index.
            next_id = torch.multinomial(probs, num_samples=1).item()

            generated.append(next_id)
            last_id = next_id

    return "".join(idx2char[i] for i in generated)

In [19]:
# Sampling draws from torch's global RNG; fix it so this cell prints the
# same tune on every run with the same weights.
GENERATION_SEED = 0
torch.manual_seed(GENERATION_SEED)

# ABC header used as the prompt: tune index, unit note length, meter, key.
seed = "X:1\nL:1/8\nM:4/4\nK:Em\n"
generated_abc = generate_music(model, seed, char2idx, idx2char, length=200, temperature=1.0)

print("----- Generated ABC -----")
print(generated_abc)

----- Generated ABC -----
X:1
L:1/8
M:4/4
K:Em
 E2 BE G/A/B B2 | A2 A2 G2 E>G | E2 E>D E2 (E/F/E/D/) | E2 E G3 G3 G | 
 B2 B>c B>A G2 | E4 E4 | F4 G3 F | E/E/E/E/ G>E E4 ||                                                                           
