In [1]:
!pip install mido numpy torch
!pip install miditok

Collecting mido
  Downloading mido-1.3.3-py3-none-any.whl.metadata (6.4 kB)
Downloading mido-1.3.3-py3-none-any.whl (54 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.6/54.6 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: mido
Successfully installed mido-1.3.3
Collecting miditok
  Downloading miditok-3.0.6-py3-none-any.whl.metadata (10 kB)
Collecting symusic>=0.5.0 (from miditok)
  Downloading symusic-0.5.8-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (9.0 kB)
Collecting pySmartDL (from symusic>=0.5.0->miditok)
  Downloading pySmartDL-1.3.4-py3-none-any.whl.metadata (2.8 kB)
Downloading miditok-3.0.6-py3-none-any.whl (158 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m158.9/158.9 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading symusic-0.5.8-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (2.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# 2. Imports
import os
import math
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.nn import TransformerDecoder, TransformerDecoderLayer
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW  # <--- ADD THIS LINE
from miditok import REMI, TokenizerConfig
from miditok.pytorch_data import DataCollator
from tqdm.notebook import tqdm
from pathlib import Path

# 3. System and Device Information
print("--- System Info ---")
print(f"PyTorch version: {torch.__version__}")
IS_CUDA_AVAILABLE = torch.cuda.is_available()
print(f"CUDA is available: {IS_CUDA_AVAILABLE}")
if IS_CUDA_AVAILABLE:
    print(f"Device name: {torch.cuda.get_device_name(0)}")
DEVICE = torch.device("cuda" if IS_CUDA_AVAILABLE else "cpu")
print(f"Using device: {DEVICE}")
print("---------------------\n")

# 4. Tokenizer Configuration
NUM_BINS = 10
V_TOKENS = [f"<v_{i}>" for i in range(NUM_BINS)]
A_TOKENS = [f"<a_{i}>" for i in range(NUM_BINS)]
BASE_SPECIAL_TOKENS = ["PAD", "BOS", "EOS", "MASK"]
ALL_SPECIAL_TOKENS = BASE_SPECIAL_TOKENS + V_TOKENS + A_TOKENS

config = TokenizerConfig(
    pitch_range=(21, 109),
    beat_res={(0, 4): 8, (4, 12): 4},
    num_velocities=32,
    special_tokens=ALL_SPECIAL_TOKENS,
    use_chords=True,
    use_rests=True,
    use_tempos=True,
    use_time_signatures=True,
    use_programs=False
)
tokenizer = REMI(config)
print(f"Tokenizer initialized. Vocabulary size: {len(tokenizer)}\n")

# 5. Model Hyperparameter Configurations
class SmallModelConfig:
    def __init__(self, vocab_size, max_seq_len):
        self.vocab_size = vocab_size
        self.max_seq_len = max_seq_len
        self.d_model = 512    # Tăng từ 256
        self.nhead = 8        # Tăng từ 4
        self.d_hid = 2048     # Tăng từ 1024
        self.nlayers = 6      # Tăng từ 4
        self.dropout = 0.2

# 6. Global Utility Functions
def quantize_va(v, a, num_bins=10):
    """Quantizes Valence/Arousal values into token strings."""
    v_bin = int((v + 1.0) / 2.0 * (num_bins - 1))
    a_bin = int((a + 1.0) / 2.0 * (num_bins - 1))
    v_bin = max(0, min(num_bins - 1, v_bin))
    a_bin = max(0, min(num_bins - 1, a_bin))
    return f"<v_{v_bin}>", f"<a_{a_bin}>"

# 7. Dataset Classes
class VAMIDI_Dataset(Dataset): # For our labelled, fine-tuning data
    def __init__(self, root_dir, tokenizer, va_data_df, max_seq_len, num_va_bins=10, stride=512):
        self.tokenizer = tokenizer
        self.root_dir = root_dir
        self.max_seq_len = max_seq_len
        self.num_va_bins = num_va_bins
        self.stride = stride
        self.samples = []
        va_data_df['filename'] = va_data_df['midi'].apply(lambda path: os.path.basename(path))
        self.va_lookup = va_data_df.set_index('filename').to_dict('index')
        self._prepare_data()
    def _prepare_data(self):
        print(f"Starting fine-tuning data preparation with chunk size {self.max_seq_len}...")
        available_files = {f for f in os.listdir(self.root_dir) if f.endswith(('.mid', '.midi'))}
        files_to_process = [f for f in available_files if f in self.va_lookup]
        for filename in tqdm(files_to_process, desc="Processing labelled MIDI files"):
            midi_path = os.path.join(self.root_dir, filename)
            try: tokens = self.tokenizer(midi_path)
            except Exception as e: continue
            if len(tokens) == 0 or len(tokens[0].ids) == 0: continue
            token_ids = tokens[0].ids
            v, a = self.va_lookup[filename]['valence'], self.va_lookup[filename]['arousal']
            v_token_str, a_token_str = quantize_va(v, a, num_bins=self.num_va_bins)
            v_token_id, a_token_id = self.tokenizer[v_token_str], self.tokenizer[a_token_str]
            bos_id, eos_id = self.tokenizer['BOS_None'], self.tokenizer['EOS_None']
            full_ids = [v_token_id, a_token_id] + token_ids
            start = 0
            while True:
                end = start + self.max_seq_len - 2
                chunk = full_ids[start:end]
                final_sequence = [bos_id] + chunk + [eos_id]
                self.samples.append(torch.tensor(final_sequence, dtype=torch.long))
                if end >= len(full_ids): break
                start += self.stride
        print(f"Fine-tuning data preparation complete. Total chunks: {len(self.samples)}")
    def __len__(self): return len(self.samples)
    def __getitem__(self, idx): return {'input_ids': self.samples[idx]}

class MIDIDataset_Pretrain(Dataset): # For the large, unlabelled pre-training data
    def __init__(self, root_dir, tokenizer, max_seq_len, stride=512):
        self.tokenizer = tokenizer
        self.max_seq_len = max_seq_len
        self.stride = stride
        self.samples = []
        self._prepare_data(root_dir)
    def _prepare_data(self, root_dir):
        print(f"Starting pre-training data preparation with chunk size {self.max_seq_len}...")
        paths = list(Path(root_dir).glob('**/*.mid')) + list(Path(root_dir).glob('**/*.midi'))
        print(f"Found {len(paths)} MIDI files for pre-training in {root_dir}.")
        for path in tqdm(paths, desc="Processing pre-training MIDI files"):
            try: tokens = self.tokenizer(str(path))
            except Exception as e: continue
            if len(tokens) == 0 or len(tokens[0].ids) == 0: continue
            token_ids = tokens[0].ids
            bos_id, eos_id = self.tokenizer['BOS_None'], self.tokenizer['EOS_None']
            start = 0
            while True:
                end = start + self.max_seq_len - 2
                chunk = token_ids[start:end]
                final_sequence = [bos_id] + chunk + [eos_id]
                self.samples.append(torch.tensor(final_sequence, dtype=torch.long))
                if end >= len(token_ids): break
                start += self.stride
        print(f"Pre-training data preparation complete. Total chunks: {len(self.samples)}")
    def __len__(self): return len(self.samples)
    def __getitem__(self, idx): return {'input_ids': self.samples[idx]}

# 8. Model Architecture
class PositionalEncoding(nn.Module):
    # Give the positional encoding buffer a little extra room to avoid off-by-one errors during generation.
    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000): 
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # This line is now safe because self.pe is much longer than x
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

class MusicTransformer(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.token_encoder = nn.Embedding(config.vocab_size, config.d_model)

        self.pos_encoder = PositionalEncoding(config.d_model, config.dropout, max_len=config.max_seq_len + 5)
        
        decoder_layer = TransformerDecoderLayer(d_model=config.d_model, nhead=config.nhead, dim_feedforward=config.d_hid, dropout=config.dropout, batch_first=True, norm_first=True)
        self.transformer_decoder = TransformerDecoder(decoder_layer, num_layers=config.nlayers)
        self.output_head = nn.Linear(config.d_model, config.vocab_size)
        self.init_weights()
    def init_weights(self) -> None:
        initrange = 0.1
        self.token_encoder.weight.data.uniform_(-initrange, initrange)
        self.output_head.bias.data.zero_()
        self.output_head.weight.data.uniform_(-initrange, initrange)
    def forward(self, src: torch.Tensor, src_padding_mask: torch.Tensor = None) -> torch.Tensor:
        seq_len = src.size(1)
        causal_mask = nn.Transformer.generate_square_subsequent_mask(seq_len, device=src.device)
        
        src_emb = self.token_encoder(src) * math.sqrt(self.config.d_model)
        src_emb = src_emb.permute(1, 0, 2)
        src_emb = self.pos_encoder(src_emb)
        src_emb = src_emb.permute(1, 0, 2)
        
        output = self.transformer_decoder(tgt=src_emb, memory=src_emb, tgt_mask=causal_mask, tgt_key_padding_mask=src_padding_mask)
        output = self.output_head(output)
        return output

--- System Info ---
PyTorch version: 2.5.1+cu121
CUDA is available: True
Device name: Tesla T4
Using device: cuda
---------------------

Tokenizer initialized. Vocabulary size: 455



In [None]:
# 1. Define Parameters and Paths
CHUNK_SIZE = 1024
PRETRAIN_BATCH_SIZE = 8
MAESTRO_ROOT_PATH = "/kaggle/input/themaestrodatasetv2/maestro-v2.0.0"
PRETRAIN_SAVE_PATH = "/kaggle/working/models/music_transformer_pretrained.pth"
NUM_PRETRAIN_EPOCHS = 3

# 2. Create Dataset and DataLoader
data_collator = DataCollator(pad_token_id=tokenizer['PAD_None'])
pretrain_dataset = MIDIDataset_Pretrain(MAESTRO_ROOT_PATH, tokenizer, max_seq_len=CHUNK_SIZE)
pretrain_dataloader = DataLoader(
    pretrain_dataset,
    batch_size=PRETRAIN_BATCH_SIZE,
    shuffle=True,
    collate_fn=data_collator,
    num_workers=2
)

# 3. Initialize Model and Optimizer
model_config = SmallModelConfig(vocab_size=len(tokenizer), max_seq_len=CHUNK_SIZE)
model = MusicTransformer(model_config).to(DEVICE)
# Trong Cell 2, dòng optimizer
optimizer = AdamW(model.parameters(), lr=3e-5) # Giảm learning rate
loss_fn = nn.CrossEntropyLoss(ignore_index=tokenizer['PAD_None'])

# 4. Pre-training Loop
print(f"--- Starting Pre-training on MAESTRO ({len(pretrain_dataset)} samples) ---")
os.makedirs("/kaggle/working/models", exist_ok=True)
model.train()
for epoch in range(1, NUM_PRETRAIN_EPOCHS + 1):
    total_loss = 0
    for batch in tqdm(pretrain_dataloader, desc=f"Pre-training Epoch {epoch}"):
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        padding_mask = (attention_mask == 0)

        logits = model(src=input_ids, src_padding_mask=padding_mask)

        shifted_logits = logits[:, :-1, :].contiguous()
        shifted_labels = input_ids[:, 1:].contiguous()

        # Flatten the tokens
        loss = loss_fn(
            shifted_logits.view(-1, model_config.vocab_size),
            shifted_labels.view(-1)
        )

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(pretrain_dataloader)
    print(f"Pre-training Epoch {epoch} | Avg Loss: {avg_loss:.4f}")

# 5. Save Pre-trained Model
torch.save(model.state_dict(), PRETRAIN_SAVE_PATH)
print(f"Pre-trained model saved to {PRETRAIN_SAVE_PATH}")

Starting pre-training data preparation with chunk size 1024...
Found 1282 MIDI files for pre-training in /kaggle/input/themaestrodatasetv2/maestro-v2.0.0.


Processing pre-training MIDI files:   0%|          | 0/1282 [00:00<?, ?it/s]

Pre-training data preparation complete. Total chunks: 52096
--- Starting Pre-training on MAESTRO (52096 samples) ---


Pre-training Epoch 1:   0%|          | 0/6512 [00:00<?, ?it/s]

Pre-training Epoch 1 | Avg Loss: 2.5461


Pre-training Epoch 2:   0%|          | 0/6512 [00:00<?, ?it/s]

Pre-training Epoch 2 | Avg Loss: 0.2228


Pre-training Epoch 3:   0%|          | 0/6512 [00:00<?, ?it/s]

In [None]:
from miditok import TokSequence

# 1. Generation Parameters
# We might need to adjust these if the model repeats itself.
MAX_GEN_LEN = 1024
TEMPERATURE = 1.0
TOP_P = 0.95
REPETITION_PENALTY = 1.2
PRETRAINED_MODEL_PATH = "/kaggle/working/models/music_transformer_pretrained.pth"

# 2. Load the Pre-trained Model
print("--- Loading Pre-trained Model ---")
# Ensure model_config is defined from a previous cell
model_gen_pretrain = MusicTransformer(model_config).to(DEVICE)
model_gen_pretrain.load_state_dict(torch.load(PRETRAINED_MODEL_PATH, map_location=DEVICE, weights_only=True))
model_gen_pretrain.eval()
print("Pre-trained model loaded successfully.")

# 3. Generation Function (Simplified for pre-trained model)
def generate_from_pretrained(model, tokenizer, max_len, temperature, top_p, repetition_penalty):
    print(f"\nStarting generation from pre-trained model...")
    
    # --- The prompt is very simple: just the beginning of a sequence ---
    bos_id = tokenizer['BOS_None']
    input_ids = [bos_id]
    # ------------------------------------------------------------------
    
    with torch.no_grad():
        for _ in tqdm(range(max_len), desc="Generating tokens"):
            input_tensor = torch.tensor([input_ids], dtype=torch.long, device=DEVICE)
            logits = model(input_tensor)
            last_token_logits = logits[0, -1, :]

            # Apply Repetition Penalty
            # Penalize the last 20 tokens to encourage variety
            if len(input_ids) > 1:
                for token_id in set(input_ids[-20:]):
                    last_token_logits[token_id] /= repetition_penalty

            # Apply Temperature
            scaled_logits = last_token_logits / temperature
            
            # Apply Top-p (Nucleus) Sampling
            sorted_logits, sorted_indices = torch.sort(scaled_logits, descending=True)
            cumulative_probs = torch.cumsum(torch.nn.functional.softmax(sorted_logits, dim=-1), dim=-1)
            
            sorted_indices_to_remove = cumulative_probs > top_p
            sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
            sorted_indices_to_remove[..., 0] = 0
            
            indices_to_remove = sorted_indices[sorted_indices_to_remove]
            scaled_logits[indices_to_remove] = -float('inf')

            probs = torch.nn.functional.softmax(scaled_logits, dim=-1)
            next_token_id = torch.multinomial(probs, num_samples=1)
            
            if next_token_id.item() == tokenizer['EOS_None']:
                print("\nEnd-of-Sequence token generated. Stopping.")
                break
            
            input_ids.append(next_token_id.item())
            
    print(f"Generated {len(input_ids)} tokens.")
    print("\n--- 40 Token đầu tiên được sinh ra ---")
    generated_token_strings = [tokenizer[id_] for id_ in input_ids[:40]]
    print(generated_token_strings)
    return input_ids

# 4. Generate, Convert to MIDI, and Save
pretrained_tokens = generate_from_pretrained(
    model_gen_pretrain, tokenizer, 
    max_len=MAX_GEN_LEN, temperature=TEMPERATURE, top_p=TOP_P, repetition_penalty=REPETITION_PENALTY
)

if pretrained_tokens:
    tok_seq = TokSequence(ids=pretrained_tokens)
    generated_midi_pretrained = tokenizer.decode([tok_seq])
    output_path_pretrained = "/kaggle/working/pretrained_generated_music.mid"
    generated_midi_pretrained.dump_midi(output_path_pretrained)
    print(f"\nMusic generated from pre-trained model saved to: {output_path_pretrained}")

In [None]:
# 1. Define Parameters and Paths
FINETUNE_BATCH_SIZE = 8
LABEL_FILE = "/kaggle/input/emodata/vgmidi-master/vgmidi_labelled.csv"
MIDI_DIR_CORRECTED = "/kaggle/input/emodata/vgmidi-master/labelled/phrases/phrases"
FINETUNE_SAVE_PATH = "/kaggle/working/models/music_transformer_finetuned.pth"

NUM_FINETUNE_EPOCHS = 4
# -------------------------------------------------------------

FINETUNE_LR = 5e-5 # Use a smaller learning rate

# 2. Create Fine-tuning Dataset and DataLoader
labels_df = pd.read_csv(LABEL_FILE)
finetune_dataset = VAMIDI_Dataset(
    root_dir=MIDI_DIR_CORRECTED,
    tokenizer=tokenizer,
    va_data_df=labels_df.copy(),
    max_seq_len=CHUNK_SIZE
)
finetune_dataloader = DataLoader(
    finetune_dataset,
    batch_size=FINETUNE_BATCH_SIZE,
    shuffle=True,
    collate_fn=data_collator
)

model.load_state_dict(torch.load(PRETRAIN_SAVE_PATH, weights_only=True))
print("Successfully loaded pre-trained weights for fine-tuning.")
optimizer = AdamW(model.parameters(), lr=FINETUNE_LR, weight_decay=0.01)

# 4. Fine-tuning Loop
print(f"--- Starting Fine-tuning ({len(finetune_dataset)} samples) for {NUM_FINETUNE_EPOCHS} epochs ---")
best_finetune_loss = float('inf')
model.train()
for epoch in range(1, NUM_FINETUNE_EPOCHS + 1):
    total_loss = 0
    for batch in tqdm(finetune_dataloader, desc=f"Fine-tuning Epoch {epoch}"):
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        padding_mask = (attention_mask == 0)

        # Corrected loss calculation
        logits = model(src=input_ids, src_padding_mask=padding_mask)
        shifted_logits = logits[:, :-1, :].contiguous()
        shifted_labels = input_ids[:, 1:].contiguous()

        loss = loss_fn(
            shifted_logits.view(-1, model_config.vocab_size),
            shifted_labels.view(-1)
        )

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(finetune_dataloader)
    print(f"Fine-tuning Epoch {epoch} | Avg Loss: {avg_loss:.4f}")

    # We will always save the latest model in this shortened training schedule.
    # The 'best_loss' logic is less critical here since we are manually stopping early.
    torch.save(model.state_dict(), FINETUNE_SAVE_PATH)
    print(f"Model saved after epoch {epoch} to {FINETUNE_SAVE_PATH}")

print("--- Fine-tuning Complete ---")

Starting fine-tuning data preparation with chunk size 1024...


Processing labelled MIDI files:   0%|          | 0/204 [00:00<?, ?it/s]

Fine-tuning data preparation complete. Total chunks: 1012
Successfully loaded pre-trained weights for fine-tuning.
--- Starting Fine-tuning (1012 samples) for 4 epochs ---


Fine-tuning Epoch 1:   0%|          | 0/127 [00:00<?, ?it/s]



Fine-tuning Epoch 1 | Avg Loss: 0.0261
Model saved after epoch 1 to /kaggle/working/models/music_transformer_finetuned.pth


Fine-tuning Epoch 2:   0%|          | 0/127 [00:00<?, ?it/s]

Fine-tuning Epoch 2 | Avg Loss: 0.0150
Model saved after epoch 2 to /kaggle/working/models/music_transformer_finetuned.pth


Fine-tuning Epoch 3:   0%|          | 0/127 [00:00<?, ?it/s]

Fine-tuning Epoch 3 | Avg Loss: 0.0121
Model saved after epoch 3 to /kaggle/working/models/music_transformer_finetuned.pth


Fine-tuning Epoch 4:   0%|          | 0/127 [00:00<?, ?it/s]

Fine-tuning Epoch 4 | Avg Loss: 0.0110
Model saved after epoch 4 to /kaggle/working/models/music_transformer_finetuned.pth
--- Fine-tuning Complete ---


In [28]:
# Cell 4: Sinh nhạc (v3 - Lấy mẫu Nâng cao)
# =========================================

from miditok import TokSequence

# 1. Các tham số Sinh nhạc
TEMPERATURE = 1.5  # Tăng mạnh để tăng tính ngẫu nhiên
TOP_P = 0.9        # Giảm nhẹ
REPETITION_PENALTY # Hình phạt cho việc lặp lại token. > 1.0 sẽ hạn chế lặp lại.

FINAL_MODEL_PATH = "/kaggle/working/models/music_transformer_finetuned.pth"

# 2. Tải mô hình đã Fine-tune
print("--- Tải mô hình đã Fine-tune ---")
model_gen = MusicTransformer(model_config).to(DEVICE)
model_gen.load_state_dict(torch.load(FINAL_MODEL_PATH, map_location=DEVICE))
model_gen.eval()
print("Tải mô hình thành công.")

# 3. Hàm Sinh nhạc NÂNG CAO
def generate_music(model, tokenizer, prompt_v, prompt_a, max_len, temperature, top_p, repetition_penalty):
    print(f"\nBắt đầu sinh nhạc cho Valence={prompt_v}, Arousal={prompt_a}...")
    v_token_str, a_token_str = quantize_va(prompt_v, prompt_a, num_bins=NUM_BINS)
    v_token_id, a_token_id = tokenizer[v_token_str], tokenizer[a_token_str]
    bos_id = tokenizer['BOS_None']
    input_ids = [bos_id, v_token_id, a_token_id]
    
    with torch.no_grad():
        for _ in tqdm(range(max_len), desc="Đang sinh token"):
            input_tensor = torch.tensor([input_ids], dtype=torch.long, device=DEVICE)
            logits = model(input_tensor)
            last_token_logits = logits[0, -1, :]

            # --- Áp dụng Hình phạt Lặp lại ---
            # Trừng phạt các token trong 20 bước gần nhất
            for token_id in set(input_ids[-20:]):
                last_token_logits[token_id] /= repetition_penalty

            # --- Áp dụng Nhiệt độ (Temperature) ---
            scaled_logits = last_token_logits / temperature
            
            # --- Áp dụng Lấy mẫu Top-p (Nucleus) ---
            sorted_logits, sorted_indices = torch.sort(scaled_logits, descending=True)
            cumulative_probs = torch.cumsum(torch.nn.functional.softmax(sorted_logits, dim=-1), dim=-1)
            
            # Loại bỏ các token có xác suất tích lũy trên ngưỡng
            sorted_indices_to_remove = cumulative_probs > top_p
            # Dịch chỉ số sang phải để giữ lại token đầu tiên vượt ngưỡng
            sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
            sorted_indices_to_remove[..., 0] = 0
            
            indices_to_remove = sorted_indices[sorted_indices_to_remove]
            scaled_logits[indices_to_remove] = -float('inf')

            # Lấy mẫu từ phân phối đã được lọc
            probs = torch.nn.functional.softmax(scaled_logits, dim=-1)
            next_token_id = torch.multinomial(probs, num_samples=1)
            
            if next_token_id.item() == tokenizer['EOS_None']:
                print("\nToken kết thúc chuỗi được sinh ra. Dừng lại.")
                break
            
            input_ids.append(next_token_id.item())
            
    print(f"Đã sinh ra {len(input_ids)} token.")
    print("\n--- 40 Token đầu tiên được sinh ra ---")
    generated_token_strings = [tokenizer[id_] for id_ in input_ids[:40]]
    print(generated_token_strings)
    
    return input_ids

# 4. Sinh nhạc, chuyển đổi sang MIDI và Lưu
# --- Sinh một bản nhạc Vui/Sôi động ---
happy_tokens_ids = generate_music(
    model_gen, tokenizer, prompt_v=0.8, prompt_a=0.7, 
    max_len=MAX_GEN_LEN, temperature=TEMPERATURE, top_p=TOP_P, repetition_penalty=REPETITION_PENALTY
)
if happy_tokens_ids:
    tok_seq = TokSequence(ids=happy_tokens_ids)
    generated_midi_happy = tokenizer.decode([tok_seq])
    output_path_happy = "/kaggle/working/happy_generated_music.mid"
    generated_midi_happy.dump_midi(output_path_happy)
    print(f"Nhạc vui đã được lưu tại: {output_path_happy}")

# --- Sinh một bản nhạc Buồn/Lặng ---
sad_tokens_ids = generate_music(
    model_gen, tokenizer, prompt_v=-0.8, prompt_a=-0.6, 
    max_len=MAX_GEN_LEN, temperature=TEMPERATURE, top_p=TOP_P, repetition_penalty=REPETITION_PENALTY
)
if sad_tokens_ids:
    tok_seq = TokSequence(ids=sad_tokens_ids)
    generated_midi_sad = tokenizer.decode([tok_seq])
    output_path_sad = "/kaggle/working/sad_generated_music.mid"
    generated_midi_sad.dump_midi(output_path_sad)
    print(f"Nhạc buồn đã được lưu tại: {output_path_sad}")

--- Tải mô hình đã Fine-tune ---
Tải mô hình thành công.

Bắt đầu sinh nhạc cho Valence=0.8, Arousal=0.7...


  model_gen.load_state_dict(torch.load(FINAL_MODEL_PATH, map_location=DEVICE))


Đang sinh token:   0%|          | 0/1024 [00:00<?, ?it/s]

Đã sinh ra 1027 token.

--- 40 Token đầu tiên được sinh ra ---
['BOS_None', '<v_8>', '<a_7>', 'Pitch_70', 'Pitch_70', 'Pitch_70', 'Pitch_70', 'Pitch_70', 'Pitch_70', 'Pitch_70', 'Pitch_70', 'Pitch_70', 'Pitch_70', 'Pitch_70', 'Pitch_70', 'Pitch_70', 'Pitch_70', 'Pitch_70', 'Pitch_70', 'Pitch_70', 'Pitch_70', 'Pitch_70', 'Pitch_70', 'Pitch_70', 'Pitch_70', 'Pitch_70', 'Pitch_70', 'Pitch_70', 'Pitch_70', 'Pitch_70', 'Pitch_70', 'Pitch_70', 'Pitch_70', 'Pitch_70', 'Pitch_70', 'Pitch_70', 'Pitch_70', 'Pitch_70', 'Pitch_70', 'Pitch_70']
Nhạc vui đã được lưu tại: /kaggle/working/happy_generated_music.mid

Bắt đầu sinh nhạc cho Valence=-0.8, Arousal=-0.6...


Đang sinh token:   0%|          | 0/1024 [00:00<?, ?it/s]

Đã sinh ra 1027 token.

--- 40 Token đầu tiên được sinh ra ---
['BOS_None', '<v_0>', '<a_1>', 'Pitch_72', 'Pitch_72', 'Pitch_72', 'Pitch_72', 'Pitch_72', 'Pitch_72', 'Pitch_72', 'Pitch_72', 'Pitch_72', 'Pitch_72', 'Pitch_72', 'Pitch_72', 'Pitch_72', 'Pitch_72', 'Pitch_72', 'Pitch_72', 'Pitch_72', 'Pitch_72', 'Pitch_72', 'Pitch_72', 'Pitch_72', 'Pitch_72', 'Pitch_72', 'Pitch_72', 'Pitch_72', 'Pitch_72', 'Pitch_72', 'Pitch_72', 'Pitch_72', 'Pitch_72', 'Pitch_72', 'Pitch_72', 'Pitch_72', 'Pitch_72', 'Pitch_72', 'Pitch_72', 'Pitch_72']
Nhạc buồn đã được lưu tại: /kaggle/working/sad_generated_music.mid


In [11]:
!pip install midi-player

Collecting midi-player
  Downloading midi_player-0.5.1-py3-none-any.whl.metadata (2.2 kB)
Downloading midi_player-0.5.1-py3-none-any.whl (6.4 kB)
Installing collected packages: midi-player
Successfully installed midi-player-0.5.1


In [18]:
from midi_player import MIDIPlayer

# Replace with your file path
midi_path = "/kaggle/working/happy_generated_music.mid"

# This embeds a clickable MIDI player widget with default dimensions
MIDIPlayer(midi_path, 400)

In [13]:
"""
from mido import MidiFile, MidiTrack, Message, MetaMessage

mid = MidiFile("/kaggle/working/happy_generated_music.mid")

if not any(len(t) for t in mid.tracks):
    track = MidiTrack()
    track.append(MetaMessage('end_of_track', time=0))
    mid.tracks.append(track)

for t in mid.tracks:
    if t and t[-1].type != 'end_of_track':
        t.append(MetaMessage('end_of_track', time=0))

mid.save("fixed.mid")


"""

In [15]:
!pip install midi2audio pydub
!sudo apt update
!sudo apt install -y fluidsynth libfluidsynth3 fluid-soundfont-gm ffmpeg

Collecting midi2audio
  Downloading midi2audio-0.1.1-py2.py3-none-any.whl.metadata (5.7 kB)
Downloading midi2audio-0.1.1-py2.py3-none-any.whl (8.7 kB)
Installing collected packages: midi2audio
Successfully installed midi2audio-0.1.1
Hit:1 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:2 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]        [0m
Get:3 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Get:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:5 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]           [0m[33m[33m
Get:6 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]      [0m[33m
Get:7 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]      [0m
Get:8 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ Packages [79.8 kB][33m[33m[33m
Get:9 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InReleas

In [16]:
from midi2audio import FluidSynth
from pydub import AudioSegment
import os

def midi_to_mp3(midi_path, mp3_path, soundfont=None):
    wav_path = midi_path.replace(".mid", ".wav")
    fs = FluidSynth(soundfont) if soundfont else FluidSynth()
    fs.midi_to_audio(midi_path, wav_path)

    audio = AudioSegment.from_wav(wav_path)
    audio.export(mp3_path, format="mp3")

    os.remove(wav_path)  # clean up
    print(f"Converted to MP3 → {mp3_path}")


In [20]:
midi_to_mp3(
  "/kaggle/working/happy_generated_music.mid",
  "/kaggle/working/generated_music.mp3"
)

Converted to MP3 → /kaggle/working/generated_music.mp3
