In [1]:
import glob
import json
import time
import evaluate
import transformers
import numpy as np
import pandas as pd

from pathlib import Path
from tqdm.auto import tqdm
from datasets import Dataset
from tokenizers import Tokenizer
from tokenizers.normalizers import (Sequence, Lowercase, NFD, StripAccents)
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.decoders import BPEDecoder
from transformers import AutoConfig, \
    DataCollatorWithPadding, AutoModelForSequenceClassification, \
    Trainer, TrainingArguments, AutoTokenizer, RobertaConfig, RobertaModel, RobertaForSequenceClassification, RobertaTokenizer
from matplotlib import pyplot as plt
import torch
from transformers import get_scheduler, AdamW

### Helper Function

In [2]:
def top_x_acc(y_true, y_pred, x):
    y_true = torch.tensor(y_true)
    y_pred = torch.tensor(y_pred)
    ranked = torch.argsort(y_pred, axis=-1)
    top_x = ranked[..., -x:]
    return (top_x == torch.repeat_interleave(y_true.unsqueeze(-1), x, axis=-1)).float().sum(-1).mean().item()

def mean_recip_rank(y_true, y_pred):
    y_true = torch.tensor(y_true)
    y_pred = torch.tensor(y_pred)
    ranked = torch.argsort(y_pred, axis=-1)
    true_ranks = y_pred.shape[-1] - (ranked == torch.repeat_interleave(y_true.unsqueeze(-1), y_pred.shape[-1], axis=-1)).float().argmax(-1)

    return (1/true_ranks).mean().item()

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    # predictions = np.argmax(predictions, axis=1)

    top_one = top_x_acc(labels, predictions, 1)
    top_five = top_x_acc(labels, predictions, 5)
    top_ten = 0
    if predictions.shape[-1] >= 10:
        top_ten = top_x_acc(labels, predictions, 10)
    mrr = mean_recip_rank(labels, predictions)

    metrics = {"top_one" : top_one, "top_five" : top_five, "mrr": mrr, "top_ten": top_ten}
    return metrics

In [3]:
def merge_staff_overlaps(bscores):
    """
    Takes in either one binary score or a batch of them and merges the left and right hands
    """
    
    # Lower middle c is index 23
    # Upper middle c is index 33
    lower = 23
    upper = 33
    middle = (lower + upper) // 2
    
    # Total notes is 52
    total = 52
    
    # Pad out upper hand and lower hand and combine them
    padded_lower = np.concatenate([bscores[..., :middle], np.zeros((*bscores.shape[:-1], total-middle))], axis=-1)
    padded_upper = np.concatenate([np.zeros((*bscores.shape[:-1], middle-bscores.shape[-1]+total)), bscores[..., middle:]], axis=-1)
    # Logical or
    merged = padded_lower + padded_upper - padded_lower * padded_upper
    return merged


# Continuous line of 256 unicode characters
start = 10060# 931
dense_characters = [chr(i).encode("utf-8").decode("utf-8") for i in range(start, start+512)]


# This code divides the fragment into blocks (and discards any remaining info at the very edges)
# Then it uses einsum with a filter of powers of 2 to convert from binary to an integer.  Then converts integers into
# unicode characters

def dense_encoder(fragment, block_size=[1, 1]):
    fragment = merge_staff_overlaps(fragment)
    # Rewrote this to be much faster but looks complicated
    # This filter has powers of 2 which is how the binary is turned to ints
    filter_ = np.power(2, np.arange(np.prod(block_size))).reshape(block_size)
    
    # The fragment is split into blocks here
    xblocks = np.stack(np.split(fragment[:, :(fragment.shape[1]//block_size[1])*block_size[1]], fragment.shape[1]//block_size[1], axis=1))
    xyblocks = np.stack(np.split(xblocks[:, :(xblocks.shape[1]//block_size[0])*block_size[0]], xblocks.shape[1]//block_size[0], axis=1))
    
    # The blocks are multiplied so they are ints
    numbers = np.einsum("ijkl,kl->ij", xyblocks, filter_)
    
    # The ints are turned into corresponding characters
    characters = (numbers+start).astype(np.int32).view('U1')
    return " ".join(["".join(t) for t in characters])

def data_preparation(labeled_data):
    train_X, train_y, val_X, val_y, test_X, test_y, train_m, valid_m, test_m = pd.read_pickle(labeled_data)

    train_df = pd.DataFrame({"text": [dense_encoder(piece, block_size=[1,8]) for piece in train_X], "label": train_y})
    val_df = pd.DataFrame({"text": [dense_encoder(piece, block_size=[1,8]) for piece in val_X], "label": val_y})
    test_df = pd.DataFrame({"text": [dense_encoder(piece, block_size=[1,8]) for piece in test_X], "label": test_y})
    
    return train_df, val_df, test_df

def label2id_function(examples, label2id):
    return {"label": [label2id[label] for label in examples["label"]]}

def tokenizer_function(examples, tokenizer):
    return tokenizer(examples["text"], padding='max_length', truncation=True)

### Data Prep

In [4]:
# gpt2_dir = Path("./")
roberta_dir = Path("/home/jliu/ttmp/PBSCSR/roberta")
seed = 42

# tokenizer = AutoTokenizer.from_pretrained(gpt2_dir/"roberta_pretrained")
tokenizer = AutoTokenizer.from_pretrained(roberta_dir/"tokenizer")
tokenizer.pad_token = '<pad>'

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
# labeled_data = "/home/jliu/ttmp/PBSCSR/baselines/9_way_dataset.pkl"
labeled_data = "/home/jliu/ttmp/PBSCSR/baselines/100_way_dataset.pkl"

train_df, val_df, test_df = data_preparation(labeled_data)
train_ds = Dataset.from_dict(train_df)
val_ds = Dataset.from_dict(val_df)
test_ds = Dataset.from_dict(test_df)

label2id = {label: i for i, label in enumerate(set(train_df['label']))}
id2label = {i: label for label, i in label2id.items()}

train_ds = train_ds.map(tokenizer_function, batched=True, fn_kwargs={"tokenizer": tokenizer})
train_ds = train_ds.map(label2id_function, batched=True, fn_kwargs={"label2id": label2id})
val_ds = val_ds.map(label2id_function, batched=True, fn_kwargs={"label2id": label2id})
val_ds = val_ds.map(tokenizer_function, batched=True, fn_kwargs={"tokenizer": tokenizer})
test_ds = test_ds.map(tokenizer_function, batched=True, fn_kwargs={"tokenizer": tokenizer})
test_ds = test_ds.map(label2id_function, batched=True, fn_kwargs={"label2id": label2id})

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="longest", max_length=1024)

Map:   0%|          | 0/70000 [00:00<?, ? examples/s]

Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/70000 [00:00<?, ? examples/s]

Map:   0%|          | 0/15000 [00:00<?, ? examples/s]

Map:   0%|          | 0/15000 [00:00<?, ? examples/s]

Map:   0%|          | 0/15000 [00:00<?, ? examples/s]

Map:   0%|          | 0/15000 [00:00<?, ? examples/s]

### No Pretraining

In [6]:
config = AutoConfig.from_pretrained(roberta_dir/"pretrained_model")
config.num_labels = len(label2id)
config.pad_token_id = tokenizer.pad_token_id

model = AutoModelForSequenceClassification.from_config(config=config)


# Freeze all layers except the classifier
for name, param in model.named_parameters():
    param.requires_grad = False
model.classifier.out_proj.weight.requires_grad = True

In [7]:
training_args = TrainingArguments(
    # output_dir=roberta_dir/"classifier_9_no_pretrained",
    output_dir=roberta_dir/"classifier_100_no_pretrained",
    learning_rate=5e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=12,
    weight_decay=0.01,
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Train model
trainer.train()

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Top One,Top Five,Mrr,Top Ten
1,4.625,4.611366,0.011333,0.054133,0.054186,0.1062
2,4.6148,4.607971,0.0124,0.0562,0.056202,0.107133
3,4.6073,4.604663,0.012,0.058,0.056829,0.113667
4,4.6002,4.602517,0.0136,0.061467,0.059111,0.117467
5,4.5953,4.600651,0.0136,0.0624,0.05982,0.120667
6,4.5914,4.599602,0.0146,0.063533,0.061235,0.122
7,4.587,4.598385,0.013267,0.063133,0.060565,0.1228
8,4.5836,4.597538,0.0144,0.066933,0.061871,0.123067
9,4.5808,4.596832,0.014267,0.066267,0.061932,0.1236
10,4.5788,4.596418,0.014733,0.0664,0.062137,0.123267




TrainOutput(global_step=13128, training_loss=4.5931636012958945, metrics={'train_runtime': 1139.1225, 'train_samples_per_second': 737.41, 'train_steps_per_second': 11.525, 'total_flos': 1.847751883061376e+16, 'train_loss': 4.5931636012958945, 'epoch': 12.0})

In [8]:
trainer.evaluate(test_ds)



{'eval_loss': 2.178577423095703,
 'eval_top_one': 0.14875894784927368,
 'eval_top_five': 0.6271864175796509,
 'eval_mrr': 0.3593359887599945,
 'eval_top_ten': 0,
 'eval_runtime': 10.0796,
 'eval_samples_per_second': 595.561,
 'eval_steps_per_second': 9.326,
 'epoch': 12.0}

### Linear Probe

In [6]:
config = AutoConfig.from_pretrained(roberta_dir/"pretrained_model")
config.num_labels = len(label2id)
config.pad_token_id = tokenizer.pad_token_id

model = AutoModelForSequenceClassification.from_pretrained(roberta_dir/"pretrained_model", config=config)


# Freeze all layers except the classifier
for name, param in model.named_parameters():
    param.requires_grad = False
model.classifier.out_proj.weight.requires_grad = True

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at /home/jliu/ttmp/PBSCSR/roberta/pretrained_model and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
training_args = TrainingArguments(
    output_dir=roberta_dir/"classifier_9_pretrained",
    # output_dir=roberta_dir/"classifier_100_pretrained",
    learning_rate=5e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=12,
    weight_decay=0.01,
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Train model
trainer.train()

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Top One,Top Five,Mrr,Top Ten
1,2.1813,2.135515,0.190571,0.676495,0.397929,0
2,2.1092,2.073681,0.229718,0.742962,0.440324,0
3,2.0608,2.031308,0.253373,0.769948,0.463606,0
4,2.0248,2.000467,0.263701,0.787273,0.476693,0
5,2.0004,1.977895,0.276862,0.799267,0.487733,0
6,1.9816,1.960823,0.286024,0.808263,0.495658,0
7,1.9651,1.947851,0.290188,0.812594,0.499755,0
8,1.9542,1.938177,0.293853,0.819757,0.503565,0
9,1.9477,1.931075,0.296518,0.820257,0.505901,0
10,1.9417,1.926272,0.298684,0.822589,0.507991,0




TrainOutput(global_step=5256, training_loss=2.0033310145547945, metrics={'train_runtime': 1448.6131, 'train_samples_per_second': 231.938, 'train_steps_per_second': 3.628, 'total_flos': 7514471469594078.0, 'train_loss': 2.0033310145547945, 'epoch': 12.0})

### Full Fine-Tuning

In [9]:
import os

In [11]:
for lr in [5e-5/100, 5e-5/300, 5e-5/1000]:
    os.makedirs(roberta_dir/f"LPFT_100_{lr}", exist_ok=True)
    # os.makedirs(roberta_dir/f"LPFT_9_{lr}", exist_ok=True)
    
    # config = AutoConfig.from_pretrained(roberta_dir/"classifier_9_pretrained/checkpoint-5256")
    config = AutoConfig.from_pretrained(roberta_dir/"classifier_100_pretrained/checkpoint-13128")
    
    config.num_labels = len(label2id)
    config.pad_token_id = tokenizer.pad_token_id
    model = AutoModelForSequenceClassification.from_pretrained(roberta_dir/"classifier_100_pretrained/checkpoint-13128", config=config)
    # model = AutoModelForSequenceClassification.from_pretrained(roberta_dir/"classifier_9_pretrained/checkpoint-5256", config=config)

    optimizer = AdamW(
        model.parameters(),
        lr=lr,
    )

    num_training_steps = len(train_ds) // training_args.per_device_train_batch_size * training_args.num_train_epochs
    scheduler = get_scheduler(
        "cosine",
        optimizer,
        num_warmup_steps=0,  # You can adjust the warmup steps if needed
        num_training_steps=num_training_steps
    )

    training_args = TrainingArguments(
        output_dir=roberta_dir/f"LPFT_100_{lr}",
        # output_dir=roberta_dir/f"LPFT_9_{lr}",
        learning_rate=lr,
        per_device_train_batch_size=32,
        per_device_eval_batch_size=32,
        num_train_epochs=5,
        weight_decay=0.01,
        logging_strategy="epoch",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        push_to_hub=False,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        optimizers=(optimizer, scheduler),
    )

    # Train model
    trainer.train()
    metrics = trainer.evaluate(test_ds)
    print(metrics)



Epoch,Training Loss,Validation Loss,Top One,Top Five,Mrr,Top Ten
1,4.2228,4.205184,0.078933,0.229333,0.168945,0.350267
2,4.0787,4.115465,0.086333,0.2494,0.182187,0.379467
3,3.9968,4.06002,0.090133,0.263333,0.190049,0.394867
4,3.9425,4.024141,0.094933,0.270933,0.195992,0.4028
5,3.9003,3.99725,0.098933,0.277733,0.201352,0.413533




{'eval_loss': 4.011227607727051, 'eval_top_one': 0.09286666661500931, 'eval_top_five': 0.2701333463191986, 'eval_mrr': 0.19485780596733093, 'eval_top_ten': 0.4010666608810425, 'eval_runtime': 10.8309, 'eval_samples_per_second': 1384.923, 'eval_steps_per_second': 21.697, 'epoch': 5.0}




Epoch,Training Loss,Validation Loss,Top One,Top Five,Mrr,Top Ten
1,4.2956,4.320817,0.062467,0.187933,0.14352,0.296333
2,4.2151,4.255294,0.073333,0.213467,0.159878,0.327467
3,4.1586,4.212419,0.0782,0.227733,0.167998,0.347733
4,4.1232,4.184475,0.080733,0.2348,0.172202,0.357467
5,4.0976,4.165159,0.081933,0.238067,0.174734,0.364867




{'eval_loss': 4.175278186798096, 'eval_top_one': 0.07813332974910736, 'eval_top_five': 0.2321999967098236, 'eval_mrr': 0.17059360444545746, 'eval_top_ten': 0.3547999858856201, 'eval_runtime': 11.4251, 'eval_samples_per_second': 1312.894, 'eval_steps_per_second': 20.569, 'epoch': 5.0}




Epoch,Training Loss,Validation Loss,Top One,Top Five,Mrr,Top Ten
1,4.3296,4.382401,0.054933,0.1662,0.129046,0.268667
2,4.2999,4.355662,0.0578,0.176467,0.135358,0.2806
3,4.2732,4.332899,0.061333,0.1842,0.141355,0.290267
4,4.2537,4.315032,0.0642,0.189667,0.145546,0.299267
5,4.2369,4.30161,0.066067,0.194533,0.14854,0.305133




{'eval_loss': 4.307337284088135, 'eval_top_one': 0.060733333230018616, 'eval_top_five': 0.18913333117961884, 'eval_mrr': 0.1434733271598816, 'eval_top_ten': 0.30033332109451294, 'eval_runtime': 11.0733, 'eval_samples_per_second': 1354.611, 'eval_steps_per_second': 21.222, 'epoch': 5.0}
