In [29]:
import glob
import json
import time
import evaluate
import transformers
import numpy as np
import pandas as pd

from pathlib import Path
from tqdm.auto import tqdm
from datasets import Dataset
from tokenizers import Tokenizer
from tokenizers.normalizers import (Sequence, Lowercase, NFD, StripAccents)
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.decoders import BPEDecoder
from transformers import AutoConfig, \
    DataCollatorWithPadding, AutoModelForSequenceClassification, \
    Trainer, TrainingArguments, AutoTokenizer, GPT2Config
from matplotlib import pyplot as plt
import torch
from transformers import get_scheduler, AdamW

In [8]:
def top_x_acc(y_true, y_pred, x):
    y_true = torch.tensor(y_true)
    y_pred = torch.tensor(y_pred)
    ranked = torch.argsort(y_pred, axis=-1)
    top_x = ranked[..., -x:]
    return (top_x == torch.repeat_interleave(y_true.unsqueeze(-1), x, axis=-1)).float().sum(-1).mean().item()

def mean_recip_rank(y_true, y_pred):
    y_true = torch.tensor(y_true)
    y_pred = torch.tensor(y_pred)
    ranked = torch.argsort(y_pred, axis=-1)
    true_ranks = y_pred.shape[-1] - (ranked == torch.repeat_interleave(y_true.unsqueeze(-1), y_pred.shape[-1], axis=-1)).float().argmax(-1)

    return (1/true_ranks).mean().item()

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    # predictions = np.argmax(predictions, axis=1)

    top_one = top_x_acc(labels, predictions, 1)
    top_five = top_x_acc(labels, predictions, 5)
    top_ten = 0
    if predictions.shape[-1] >= 10:
        top_ten = top_x_acc(labels, predictions, 10)
    mrr = mean_recip_rank(labels, predictions)

    metrics = {"top_one" : top_one, "top_five" : top_five, "mrr": mrr, "top_ten": top_ten}
    return metrics

In [9]:
def merge_staff_overlaps(bscores):
    """
    Takes in either one binary score or a batch of them and merges the left and right hands
    """
    
    # Lower middle c is index 23
    # Upper middle c is index 33
    lower = 23
    upper = 33
    middle = (lower + upper) // 2
    
    # Total notes is 52
    total = 52
    
    # Pad out upper hand and lower hand and combine them
    padded_lower = np.concatenate([bscores[..., :middle], np.zeros((*bscores.shape[:-1], total-middle))], axis=-1)
    padded_upper = np.concatenate([np.zeros((*bscores.shape[:-1], middle-bscores.shape[-1]+total)), bscores[..., middle:]], axis=-1)
    # Logical or
    merged = padded_lower + padded_upper - padded_lower * padded_upper
    return merged


# Continuous line of 256 unicode characters
start = 10060# 931
dense_characters = [chr(i).encode("utf-8").decode("utf-8") for i in range(start, start+512)]


# This code divides the fragment into blocks (and discards any remaining info at the very edges)
# Then it uses einsum with a filter of powers of 2 to convert from binary to an integer.  Then converts integers into
# unicode characters

def dense_encoder(fragment, block_size=[1, 1]):
    fragment = merge_staff_overlaps(fragment)
    # Rewrote this to be much faster but looks complicated
    # This filter has powers of 2 which is how the binary is turned to ints
    filter_ = np.power(2, np.arange(np.prod(block_size))).reshape(block_size)
    
    # The fragment is split into blocks here
    xblocks = np.stack(np.split(fragment[:, :(fragment.shape[1]//block_size[1])*block_size[1]], fragment.shape[1]//block_size[1], axis=1))
    xyblocks = np.stack(np.split(xblocks[:, :(xblocks.shape[1]//block_size[0])*block_size[0]], xblocks.shape[1]//block_size[0], axis=1))
    
    # The blocks are multiplied so they are ints
    numbers = np.einsum("ijkl,kl->ij", xyblocks, filter_)
    
    # The ints are turned into corresponding characters
    characters = (numbers+start).astype(np.int32).view('U1')
    return " ".join(["".join(t) for t in characters])

def data_preparation(labeled_data):
    train_X, train_y, val_X, val_y, test_X, test_y, train_m, valid_m, test_m = pd.read_pickle(labeled_data)

    train_df = pd.DataFrame({"text": [dense_encoder(piece, block_size=[1,8]) for piece in train_X], "label": train_y})
    val_df = pd.DataFrame({"text": [dense_encoder(piece, block_size=[1,8]) for piece in val_X], "label": val_y})
    test_df = pd.DataFrame({"text": [dense_encoder(piece, block_size=[1,8]) for piece in test_X], "label": test_y})
    
    return train_df, val_df, test_df

def label2id_function(examples, label2id):
    return {"label": [label2id[label] for label in examples["label"]]}

def tokenizer_function(examples, tokenizer):
    return tokenizer(examples["text"], padding='max_length', truncation=True)

In [20]:
gpt2_dir = Path("/home/abunn/ttmp/PBSCSR_data/gpt2")
seed = 42

tokenizer = AutoTokenizer.from_pretrained(gpt2_dir/"tokenizer")
tokenizer.pad_token = '<pad>'

In [24]:
labeled_data = "../../9_way_dataset.pkl"
train_df, val_df, test_df = data_preparation(labeled_data)
train_ds = Dataset.from_dict(train_df)
val_ds = Dataset.from_dict(val_df)
test_ds = Dataset.from_dict(test_df)

label2id = {label: i for i, label in enumerate(set(train_df['label']))}
id2label = {i: label for label, i in label2id.items()}

train_ds = train_ds.map(tokenizer_function, batched=True, fn_kwargs={"tokenizer": tokenizer})
train_ds = train_ds.map(label2id_function, batched=True, fn_kwargs={"label2id": label2id})
val_ds = val_ds.map(label2id_function, batched=True, fn_kwargs={"label2id": label2id})
val_ds = val_ds.map(tokenizer_function, batched=True, fn_kwargs={"tokenizer": tokenizer})
test_ds = test_ds.map(tokenizer_function, batched=True, fn_kwargs={"tokenizer": tokenizer})
test_ds = test_ds.map(label2id_function, batched=True, fn_kwargs={"label2id": label2id})

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="longest", max_length=1024)

Map:   0%|          | 0/27999 [00:00<?, ? examples/s]

Map:   0%|          | 0/27999 [00:00<?, ? examples/s]

Map:   0%|          | 0/6003 [00:00<?, ? examples/s]

Map:   0%|          | 0/6003 [00:00<?, ? examples/s]

Map:   0%|          | 0/6003 [00:00<?, ? examples/s]

Map:   0%|          | 0/6003 [00:00<?, ? examples/s]

In [22]:
config = AutoConfig.from_pretrained(gpt2_dir/"pretrained_model")
config.num_labels = len(label2id)
config.pad_token_id = tokenizer.pad_token_id
model = AutoModelForSequenceClassification.from_pretrained(gpt2_dir/"pretrained_model", config=config)

# Freeze all layers except the classifier
for name, param in model.named_parameters():
    param.requires_grad = False
model.score.weight.requires_grad = True

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at /home/abunn/ttmp/PBSCSR_data/gpt2/pretrained_model and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
training_args = TrainingArguments(
    output_dir=gpt2_dir/"classifier_9_pretrained",
    learning_rate=5e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=12,
    weight_decay=0.01,
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Train model
trainer.train()

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Top One,Top Five,Mrr,Top Ten
1,2.157,1.957262,0.281859,0.788772,0.489672,0
2,1.8064,1.787791,0.347493,0.851574,0.55047,0
3,1.6618,1.711439,0.379144,0.874396,0.577461,0
4,1.5848,1.668778,0.398301,0.887556,0.592576,0
5,1.5385,1.642106,0.406963,0.894386,0.600014,0
6,1.508,1.62313,0.415626,0.903215,0.606705,0
7,1.4857,1.610648,0.422622,0.905547,0.612029,0
8,1.4685,1.602243,0.423122,0.909878,0.613499,0
9,1.4565,1.597484,0.423455,0.909878,0.61412,0
10,1.4535,1.593756,0.424954,0.911378,0.61568,0




TrainOutput(global_step=5256, training_loss=1.583735055212198, metrics={'train_runtime': 598.748, 'train_samples_per_second': 561.151, 'train_steps_per_second': 8.778, 'total_flos': 7411563120867840.0, 'train_loss': 1.583735055212198, 'epoch': 12.0})

In [25]:
trainer.evaluate(test_ds)



{'eval_loss': 1.5673112869262695,
 'eval_top_one': 0.43328335881233215,
 'eval_top_five': 0.9055472016334534,
 'eval_mrr': 0.621686577796936,
 'eval_top_ten': 0,
 'eval_runtime': 6.6463,
 'eval_samples_per_second': 903.203,
 'eval_steps_per_second': 14.143,
 'epoch': 12.0}

### Full Fine-Tuning

In [34]:
import os

In [41]:
for lr in [5e-5/100, 5e-5/300, 5e-5/1000]:
    os.makedirs(gpt2_dir/f"LPFT_9_{lr}", exist_ok=True)
    
    config = AutoConfig.from_pretrained(gpt2_dir/"classifier_9_pretrained/checkpoint-5256")
    config.num_labels = len(label2id)
    config.pad_token_id = tokenizer.pad_token_id
    model = AutoModelForSequenceClassification.from_pretrained(gpt2_dir/"classifier_9_pretrained/checkpoint-5256", config=config)

    optimizer = AdamW(
        model.parameters(),
        lr=lr,
    )

    num_training_steps = len(train_ds) // training_args.per_device_train_batch_size * training_args.num_train_epochs
    scheduler = get_scheduler(
        "cosine",
        optimizer,
        num_warmup_steps=0,  # You can adjust the warmup steps if needed
        num_training_steps=num_training_steps
    )

    training_args = TrainingArguments(
        output_dir=gpt2_dir/f"LPFT_9_{lr}",
        learning_rate=lr,
        per_device_train_batch_size=32,
        per_device_eval_batch_size=32,
        num_train_epochs=5,
        weight_decay=0.01,
        logging_strategy="epoch",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        push_to_hub=False,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        optimizers=(optimizer, scheduler),
    )

    # Train model
    trainer.train()
    metrics = trainer.evaluate(test_ds)
    print(metrics)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss,Top One,Top Five,Mrr,Top Ten
1,1.3804,1.51131,0.45244,0.931701,0.640215,0
2,1.3111,1.496953,0.455939,0.933866,0.642834,0




In [None]:
# Omniglot??????
# Good fewshot alphabet model