In [1]:
# Install a potentially more compatible version of transformers and datasets
# Install a potentially more compatible version of transformers, datasets, and accelerate
!pip install datasets==2.16.1 transformers==4.38.0 peft==0.8.2 accelerate==0.27.2
!pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu129
!pip install hf_xet


Looking in indexes: https://download.pytorch.org/whl/nightly/cu129


In [4]:
from datasets import load_dataset, concatenate_datasets

# Load your dataset
dataset = load_dataset("sander-wood/melodyhub")

# Combine train and validation splits into a single dataset
full_dataset = concatenate_datasets([dataset['train'], dataset['validation']])

# Select 50% of the full dataset
subset_dataset = full_dataset.train_test_split(test_size=0.5, seed=42)['train']  # Keep 50% of the data

# Split the 50% subset into train (25%), validation (12.5%), and test (12.5%)
train_val_split = subset_dataset.train_test_split(test_size=0.25, seed=42)
train_dataset = train_val_split['train']             # 75% of 50% = 37.5% of full
val_test_split = train_val_split['test'].train_test_split(test_size=0.5, seed=42)
new_validation_dataset = val_test_split['train']     # 12.5% of full
test_dataset = val_test_split['test']                # 12.5% of full

# 🔻 Reduce training set size for faster experimentation
train_dataset = train_dataset.select(range(5000))  # Use only first 5,000 samples
new_validation_dataset= train_dataset.select(range(5000))
test_dataset = train_dataset.select(range(5000))
# Print dataset sizes to verify
print("Original Full Dataset Size:", len(full_dataset))
print("Subset Used (50%):", len(subset_dataset))
print("Reduced Train Set Size:", len(train_dataset))
print("Validation Set Size:", len(new_validation_dataset))
print("Test Set Size:", len(test_dataset))


Original Full Dataset Size: 1067747
Subset Used (50%): 533873
Reduced Train Set Size: 5000
Validation Set Size: 5000
Test Set Size: 5000


In [5]:
from transformers import RobertaTokenizerFast

# Use the fast version for better performance and compatibility
tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")

# Centralized tokenization function
def tokenize_function(examples):
    return tokenizer(
        examples["input"],              # Make sure your dataset uses "input" as the key
        padding="max_length",           # Pad all sequences to the same max_length
        truncation=True,                # Truncate longer sequences
        max_length=256,                 # Explicitly set max length (adjust as needed)
        return_tensors="pt"             # Ensures PyTorch compatibility
    )

# Tokenize all datasets efficiently
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_valid_dataset = new_validation_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)


# Preview tokenization
print("\n✅ Tokenized Datasets:")
print(f"Train: {len(tokenized_train_dataset)} samples")
print(f"Validation: {len(tokenized_valid_dataset)} samples")
print(f"Test: {len(tokenized_test_dataset)} samples")


Map: 100%|██████████| 5000/5000 [00:00<00:00, 5958.70 examples/s]
Map: 100%|██████████| 5000/5000 [00:00<00:00, 6147.71 examples/s]


✅ Tokenized Datasets:
Train: 5000 samples
Validation: 5000 samples
Test: 5000 samples





In [7]:
import os
import gc
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)

# ========== CONFIG ==========
MODEL_NAME = "distilroberta-base"
MAX_LENGTH = 256
TRAIN_BATCH_SIZE = 4
EVAL_BATCH_SIZE = 2
GRAD_ACCUM_STEPS = 2
NUM_EPOCHS = 3
LEARNING_RATE = 5e-5
USE_FP16 = True  # Set to False if not using a modern GPU

# Improve CUDA memory handling
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# ========== LABEL ENCODER ==========
class RobustLabelEncoder:
    def __init__(self, unknown_token: str = "<UNK>"):
        self.label_to_int = {unknown_token: 0}
        self.int_to_label = {0: unknown_token}
        self.unknown_token = unknown_token
        self.unknown_id = 0
        self.next_id = 1

    def fit(self, labels):
        for label in sorted(set(labels)):
            if label not in self.label_to_int:
                self.label_to_int[label] = self.next_id
                self.int_to_label[self.next_id] = label
                self.next_id += 1
        return self

    def transform(self, labels):
        return [self.label_to_int.get(label, self.unknown_id) for label in labels]

    def inverse_transform(self, ids):
        return [self.int_to_label.get(i, self.unknown_token) for i in ids]

# ========== DATA PREPARATION ==========

# Fit label encoder
all_labels = list(set(train_dataset["output"]) | set(new_validation_dataset["output"]) | set(test_dataset["output"]))
label_encoder = RobustLabelEncoder().fit(all_labels)

# Add numeric label field
def encode_label(example):
    example["labels"] = label_encoder.transform([example["output"]])[0]
    return example

train_dataset = train_dataset.map(encode_label)
new_validation_dataset = new_validation_dataset.map(encode_label)
test_dataset = test_dataset.map(encode_label)

# Tokenization
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize(example):
    return tokenizer(example["input"], truncation=True, padding="max_length", max_length=MAX_LENGTH)

tokenized_train = train_dataset.map(tokenize, batched=True)
tokenized_valid = new_validation_dataset.map(tokenize, batched=True)
tokenized_test = test_dataset.map(tokenize, batched=True)

# Remove unnecessary columns
columns_to_remove = ['dataset', 'task']
for dataset in [tokenized_train, tokenized_valid, tokenized_test]:
    for col in columns_to_remove:
        if col in dataset.column_names:
            dataset = dataset.remove_columns(col)

# ========== MEMORY CLEANUP ==========
gc.collect()
torch.cuda.empty_cache()

# ========== MODEL ==========
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(label_encoder.label_to_int),
)
model.gradient_checkpointing_enable()

# ========== TRAINING ARGUMENTS ==========
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",
    save_strategy="steps",
    eval_steps=1000,
    save_steps=1000,
    logging_steps=200,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=EVAL_BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACCUM_STEPS,
    num_train_epochs=NUM_EPOCHS,
    weight_decay=0.01,
    learning_rate=LEARNING_RATE,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    dataloader_num_workers=0,
    logging_dir="./logs",
    fp16=USE_FP16,
)

# ========== TRAINER ==========
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
)

# ========== TRAINING ==========
print("Training...")
trainer.train()

# ========== EVALUATION ==========
print("Evaluating...")
results = trainer.evaluate(tokenized_test)
print("Evaluation Results:", results)


Map: 100%|██████████| 5000/5000 [00:00<00:00, 18135.48 examples/s]
Map: 100%|██████████| 5000/5000 [00:00<00:00, 18444.50 examples/s]
Map: 100%|██████████| 5000/5000 [00:00<00:00, 8508.30 examples/s]
Map: 100%|██████████| 5000/5000 [00:00<00:00, 8479.44 examples/s]
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Training...


Step,Training Loss,Validation Loss
1000,8.5211,8.508165


Checkpoint destination directory ./results\checkpoint-1000 already exists and is non-empty. Saving will proceed but saved results may be invalid.


Evaluating...


Evaluation Results: {'eval_loss': 8.50816535949707, 'eval_runtime': 209.8017, 'eval_samples_per_second': 23.832, 'eval_steps_per_second': 11.916, 'epoch': 3.0}


In [8]:
import torch
print("CUDA available:", torch.cuda.is_available())
print("Device name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU")


CUDA available: True
Device name: NVIDIA GeForce GTX 1660 Ti
