In [1]:
# Install a potentially more compatible version of transformers and datasets
# Install a potentially more compatible version of transformers, datasets, and accelerate
!pip install datasets==2.16.1 transformers==4.38.0 peft==0.8.2 accelerate==0.27.2

Collecting datasets==2.16.1
  Downloading datasets-2.16.1-py3-none-any.whl.metadata (20 kB)
Collecting transformers==4.38.0
  Downloading transformers-4.38.0-py3-none-any.whl.metadata (131 kB)
Collecting peft==0.8.2
  Downloading peft-0.8.2-py3-none-any.whl.metadata (25 kB)
Collecting accelerate==0.27.2
  Downloading accelerate-0.27.2-py3-none-any.whl.metadata (18 kB)
Collecting pyarrow-hotfix (from datasets==2.16.1)
  Downloading pyarrow_hotfix-0.7-py3-none-any.whl.metadata (3.6 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets==2.16.1)
  Downloading dill-0.3.7-py3-none-any.whl.metadata (9.9 kB)
Collecting fsspec<=2023.10.0,>=2023.1.0 (from fsspec[http]<=2023.10.0,>=2023.1.0->datasets==2.16.1)
  Downloading fsspec-2023.10.0-py3-none-any.whl.metadata (6.8 kB)
Collecting tokenizers<0.19,>=0.14 (from transformers==4.38.0)
  Downloading tokenizers-0.15.2.tar.gz (320 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requireme

  error: subprocess-exited-with-error
  
  × Preparing metadata (pyproject.toml) did not run successfully.
  │ exit code: 1
  ╰─> [48 lines of output]
      Checking for Rust toolchain....
      Rust not found, installing into a temporary directory
      Python reports SOABI: cp313-win_amd64
      Computed rustc target triple: x86_64-pc-windows-msvc
      Installation directory: C:\Users\smith\AppData\Local\puccinialin\puccinialin\Cache
      Downloading rustup-init from https://static.rust-lang.org/rustup/dist/x86_64-pc-windows-msvc/rustup-init.exe
      
      Downloading rustup-init:   0%|          | 0.00/13.6M [00:00<?, ?B/s]
      Downloading rustup-init:   3%|▎         | 459k/13.6M [00:00<00:02, 4.53MB/s]
      Downloading rustup-init:   9%|▊         | 1.17M/13.6M [00:00<00:02, 5.98MB/s]
      Downloading rustup-init:  14%|█▍        | 1.87M/13.6M [00:00<00:01, 6.42MB/s]
      Downloading rustup-init:  19%|█▉        | 2.57M/13.6M [00:00<00:01, 6.61MB/s]
      Downlo

In [None]:
from datasets import load_dataset

# Load the "sander-wood/melodyhub" dataset together with its predefined splits.
dataset = load_dataset("sander-wood/melodyhub")

# The code relies on the dataset exposing both a 'train' and a 'validation' split.
train_dataset, validation_dataset = dataset['train'], dataset['validation']

# Carve a held-out test set out of the validation split: half of it stays as the
# (new) validation set, the other half becomes the test set. The training split
# is left untouched, and the fixed seed keeps the partition reproducible.
validation_test_split = validation_dataset.train_test_split(test_size=0.5, seed=42)
new_validation_dataset = validation_test_split['train']  # kept for validation
test_dataset = validation_test_split['test']             # held out for testing

# Summarise the three resulting splits.
for caption, split in (
    ("Original Train Set:", train_dataset),
    ("New Validation Set:", new_validation_dataset),
    ("Test Set:", test_dataset),
):
    print(caption, split)

In [None]:
from transformers import RobertaTokenizer

# Tokenizer matching the "roberta-base" checkpoint; swap in another pre-trained
# tokenizer here if it suits the data better.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")


def tokenize_function(examples):
    """Tokenize the 'input' column of a batch of examples.

    Args:
        examples: A batch from the dataset; its "input" entry is passed to the
            tokenizer.

    Returns:
        The tokenizer output for the batch, produced with
        ``padding="max_length"`` and ``truncation=True``.
    """
    batch_inputs = examples["input"]
    return tokenizer(batch_inputs, padding="max_length", truncation=True)


# Tokenize the three splits in batched mode (train, validation, test, in that order).
tokenized_train_dataset, tokenized_new_validation_dataset, tokenized_test_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, new_validation_dataset, test_dataset)
)

# Summarise the tokenized splits.
print("\nTokenized Datasets:")
for caption, tokenized_split in (
    ("Tokenized Train Set:", tokenized_train_dataset),
    ("Tokenized New Validation Set:", tokenized_new_validation_dataset),
    ("Tokenized Test Set:", tokenized_test_dataset),
):
    print(caption, tokenized_split)

In [None]:
from transformers import RobertaForSequenceClassification, TrainingArguments, Trainer, training_args
from datasets import Dataset
from sklearn.preprocessing import LabelEncoder

# Collect the label vocabulary from the 'output' column of EVERY split, not only
# the training set: a label that occurs only in validation/test would otherwise
# be unknown to the encoder and abort the label-encoding step below.
unique_labels = set(train_dataset['output'])
unique_labels.update(new_validation_dataset['output'])
unique_labels.update(test_dataset['output'])
num_labels = len(unique_labels)
print(f"Number of unique labels found: {num_labels}")


# Load the pre-trained model with a classification head sized to the label set.
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=num_labels)

# Fit the encoder on the full label vocabulary; it is kept around so predictions
# can be mapped back to the original labels with `label_encoder.inverse_transform`.
label_encoder = LabelEncoder()
label_encoder.fit(list(unique_labels))

# Plain dict mirroring the encoder's mapping (index in `classes_` == encoded id).
# A dict lookup per example is far cheaper than calling `transform` on a
# one-element list for every row.
label_to_id = {label: index for index, label in enumerate(label_encoder.classes_.tolist())}

# Raw columns dropped once the integer label has been derived.
RAW_COLUMNS = ['output', 'input', 'dataset', 'task']


def prepare_dataset(example):
    """Attach the integer class id of ``example['output']`` as ``example['labels']``.

    Args:
        example: A single dataset row containing an 'output' entry.

    Returns:
        The same row with an added 'labels' entry holding the encoded label.
    """
    example['labels'] = label_to_id[example['output']]
    return example


def _encode_split(split):
    """Encode labels and drop the raw columns of one tokenized split.

    A split that already carries a 'labels' column is returned unchanged, so
    re-running this cell does not fail on the already-removed raw columns.
    """
    if 'labels' in split.column_names:
        return split
    return split.map(prepare_dataset, remove_columns=RAW_COLUMNS)


# Encode labels and strip the raw columns from every split.
tokenized_train_dataset = _encode_split(tokenized_train_dataset)
tokenized_new_validation_dataset = _encode_split(tokenized_new_validation_dataset)
tokenized_test_dataset = _encode_split(tokenized_test_dataset)


# Build the training configuration. The name `training_args` imported from
# `transformers` above is the library's submodule, not a TrainingArguments
# instance, so it must be rebound to a real configuration object before being
# handed to the Trainer.
training_args = TrainingArguments(
    output_dir="./results",        # where checkpoints and logs are written
    evaluation_strategy="epoch",   # run evaluation on eval_dataset after each epoch
    seed=42,                       # same seed as the validation/test split
)

# Create a Trainer
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=tokenized_train_dataset, # training dataset
    eval_dataset=tokenized_new_validation_dataset,  # evaluation dataset
)

# Start training
trainer.train()

# Evaluate the trained model on the held-out test split
results = trainer.evaluate(tokenized_test_dataset)
print("\nEvaluation Results:")
print(results)