In [1]:
import glob
import json
import time
import evaluate
import transformers
import numpy as np
import pandas as pd

from pathlib import Path
from tqdm.auto import tqdm
from datasets import Dataset
from tokenizers import Tokenizer
from tokenizers.normalizers import (Sequence, Lowercase, NFD, StripAccents)
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.decoders import BPEDecoder
from transformers import AutoConfig, \
    DataCollatorWithPadding, AutoModelForSequenceClassification, \
    Trainer, TrainingArguments, AutoTokenizer, GPT2Config
from matplotlib import pyplot as plt
import torch
from transformers import get_scheduler, AdamW



In [2]:
def top_x_acc(y_true, y_pred, x):
    y_true = torch.tensor(y_true)
    y_pred = torch.tensor(y_pred)
    ranked = torch.argsort(y_pred, axis=-1)
    top_x = ranked[..., -x:]
    return (top_x == torch.repeat_interleave(y_true.unsqueeze(-1), x, axis=-1)).float().sum(-1).mean().item()

def mean_recip_rank(y_true, y_pred):
    y_true = torch.tensor(y_true)
    y_pred = torch.tensor(y_pred)
    ranked = torch.argsort(y_pred, axis=-1)
    true_ranks = y_pred.shape[-1] - (ranked == torch.repeat_interleave(y_true.unsqueeze(-1), y_pred.shape[-1], axis=-1)).float().argmax(-1)

    return (1/true_ranks).mean().item()

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    # predictions = np.argmax(predictions, axis=1)

    top_one = top_x_acc(labels, predictions, 1)
    top_five = top_x_acc(labels, predictions, 5)
    top_ten = 0
    if predictions.shape[-1] >= 10:
        top_ten = top_x_acc(labels, predictions, 10)
    mrr = mean_recip_rank(labels, predictions)

    metrics = {"top_one" : top_one, "top_five" : top_five, "mrr": mrr, "top_ten": top_ten}
    return metrics

In [3]:
def merge_staff_overlaps(bscores):
    """
    Takes in either one binary score or a batch of them and merges the left and right hands
    """
    
    # Lower middle c is index 23
    # Upper middle c is index 33
    lower = 23
    upper = 33
    middle = (lower + upper) // 2
    
    # Total notes is 52
    total = 52
    
    # Pad out upper hand and lower hand and combine them
    padded_lower = np.concatenate([bscores[..., :middle], np.zeros((*bscores.shape[:-1], total-middle))], axis=-1)
    padded_upper = np.concatenate([np.zeros((*bscores.shape[:-1], middle-bscores.shape[-1]+total)), bscores[..., middle:]], axis=-1)
    # Logical or
    merged = padded_lower + padded_upper - padded_lower * padded_upper
    return merged


# Continuous line of 256 unicode characters
start = 10060# 931
dense_characters = [chr(i).encode("utf-8").decode("utf-8") for i in range(start, start+512)]


# This code divides the fragment into blocks (and discards any remaining info at the very edges)
# Then it uses einsum with a filter of powers of 2 to convert from binary to an integer.  Then converts integers into
# unicode characters

def dense_encoder(fragment, block_size=[1, 1]):
    fragment = merge_staff_overlaps(fragment)
    # Rewrote this to be much faster but looks complicated
    # This filter has powers of 2 which is how the binary is turned to ints
    filter_ = np.power(2, np.arange(np.prod(block_size))).reshape(block_size)
    
    # The fragment is split into blocks here
    xblocks = np.stack(np.split(fragment[:, :(fragment.shape[1]//block_size[1])*block_size[1]], fragment.shape[1]//block_size[1], axis=1))
    xyblocks = np.stack(np.split(xblocks[:, :(xblocks.shape[1]//block_size[0])*block_size[0]], xblocks.shape[1]//block_size[0], axis=1))
    
    # The blocks are multiplied so they are ints
    numbers = np.einsum("ijkl,kl->ij", xyblocks, filter_)
    
    # The ints are turned into corresponding characters
    characters = (numbers+start).astype(np.int32).view('U1')
    return " ".join(["".join(t) for t in characters])

def data_preparation(labeled_data):
    train_X, train_y, val_X, val_y, test_X, test_y, train_m, valid_m, test_m = pd.read_pickle(labeled_data)

    train_df = pd.DataFrame({"text": [dense_encoder(piece, block_size=[1,8]) for piece in train_X], "label": train_y})
    val_df = pd.DataFrame({"text": [dense_encoder(piece, block_size=[1,8]) for piece in val_X], "label": val_y})
    test_df = pd.DataFrame({"text": [dense_encoder(piece, block_size=[1,8]) for piece in test_X], "label": test_y})
    
    return train_df, val_df, test_df

def label2id_function(examples, label2id):
    return {"label": [label2id[label] for label in examples["label"]]}

def tokenizer_function(examples, tokenizer):
    return tokenizer(examples["text"], padding='max_length', truncation=True)

In [4]:
gpt2_dir = Path("./")
seed = 42

tokenizer = AutoTokenizer.from_pretrained(gpt2_dir/"roberta_pretrained")
tokenizer.pad_token = '<pad>'

In [14]:
labeled_data = "100_way_dataset.pkl"
train_df, val_df, test_df = data_preparation(labeled_data)
train_ds = Dataset.from_dict(train_df)
val_ds = Dataset.from_dict(val_df)
test_ds = Dataset.from_dict(test_df)

label2id = {label: i for i, label in enumerate(set(train_df['label']))}
id2label = {i: label for label, i in label2id.items()}

train_ds = train_ds.map(tokenizer_function, batched=True, fn_kwargs={"tokenizer": tokenizer})
train_ds = train_ds.map(label2id_function, batched=True, fn_kwargs={"label2id": label2id})
val_ds = val_ds.map(label2id_function, batched=True, fn_kwargs={"label2id": label2id})
val_ds = val_ds.map(tokenizer_function, batched=True, fn_kwargs={"tokenizer": tokenizer})
test_ds = test_ds.map(tokenizer_function, batched=True, fn_kwargs={"tokenizer": tokenizer})
test_ds = test_ds.map(label2id_function, batched=True, fn_kwargs={"label2id": label2id})

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="longest", max_length=1024)

Map: 100%|██████████| 70000/70000 [00:07<00:00, 8957.01 examples/s]
Map: 100%|██████████| 70000/70000 [00:00<00:00, 273004.20 examples/s]
Map: 100%|██████████| 15000/15000 [00:00<00:00, 301598.05 examples/s]
Map: 100%|██████████| 15000/15000 [00:01<00:00, 8939.61 examples/s]
Map: 100%|██████████| 15000/15000 [00:01<00:00, 9071.70 examples/s]
Map: 100%|██████████| 15000/15000 [00:00<00:00, 279793.12 examples/s]


In [15]:
config = AutoConfig.from_pretrained(gpt2_dir/"roberta_pretrained")
config.num_labels = len(label2id)
config.pad_token_id = tokenizer.pad_token_id
model = AutoModelForSequenceClassification.from_pretrained(gpt2_dir/"roberta_pretrained", config=config)

# Freeze all layers except the classifier
for name, param in model.named_parameters():
    param.requires_grad = False
model.classifier.out_proj.weight.requires_grad = True

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta_pretrained and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
training_args = TrainingArguments(
    output_dir="C:\\Users\\bunna\\Desktop\\classifier_100",
    learning_rate=5e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=12,
    weight_decay=0.01,
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Train model
trainer.train()

  8%|▊         | 2188/26256 [01:07<12:09, 32.97it/s]

{'loss': 4.5918, 'learning_rate': 4.5833333333333334e-05, 'epoch': 1.0}


                                                    
  8%|▊         | 2188/26256 [01:21<12:09, 32.97it/s]

{'eval_loss': 4.548223972320557, 'eval_top_one': 0.029600000008940697, 'eval_top_five': 0.10406666994094849, 'eval_mrr': 0.08791352063417435, 'eval_top_ten': 0.17746666073799133, 'eval_runtime': 13.2253, 'eval_samples_per_second': 1134.194, 'eval_steps_per_second': 35.462, 'epoch': 1.0}


 17%|█▋        | 4376/26256 [02:28<11:17, 32.30it/s]  

{'loss': 4.5086, 'learning_rate': 4.166666666666667e-05, 'epoch': 2.0}


                                                    
 17%|█▋        | 4376/26256 [02:42<11:17, 32.30it/s]

{'eval_loss': 4.497393608093262, 'eval_top_one': 0.03880000114440918, 'eval_top_five': 0.1313333362340927, 'eval_mrr': 0.10399021208286285, 'eval_top_ten': 0.21326667070388794, 'eval_runtime': 13.2395, 'eval_samples_per_second': 1132.97, 'eval_steps_per_second': 35.424, 'epoch': 2.0}


 25%|██▌       | 6564/26256 [03:50<09:29, 34.57it/s]  

{'loss': 4.4526, 'learning_rate': 3.7500000000000003e-05, 'epoch': 3.0}


                                                    
 25%|██▌       | 6564/26256 [04:03<09:29, 34.57it/s]

{'eval_loss': 4.463146686553955, 'eval_top_one': 0.04213333502411842, 'eval_top_five': 0.14113333821296692, 'eval_mrr': 0.11182039231061935, 'eval_top_ten': 0.23206666111946106, 'eval_runtime': 13.2685, 'eval_samples_per_second': 1130.495, 'eval_steps_per_second': 35.347, 'epoch': 3.0}


 33%|███▎      | 8752/26256 [05:11<09:20, 31.23it/s]  

{'loss': 4.4103, 'learning_rate': 3.3333333333333335e-05, 'epoch': 4.0}


                                                    
 33%|███▎      | 8752/26256 [05:24<09:20, 31.23it/s]

{'eval_loss': 4.438538074493408, 'eval_top_one': 0.04613333195447922, 'eval_top_five': 0.1504666656255722, 'eval_mrr': 0.11762411147356033, 'eval_top_ten': 0.24373333156108856, 'eval_runtime': 13.269, 'eval_samples_per_second': 1130.458, 'eval_steps_per_second': 35.346, 'epoch': 4.0}


 42%|████▏     | 10940/26256 [06:33<07:40, 33.29it/s] 

{'loss': 4.3815, 'learning_rate': 2.916666666666667e-05, 'epoch': 5.0}


                                                     
 42%|████▏     | 10940/26256 [06:46<07:40, 33.29it/s]

{'eval_loss': 4.420085906982422, 'eval_top_one': 0.04859999939799309, 'eval_top_five': 0.15546666085720062, 'eval_mrr': 0.12147132307291031, 'eval_top_ten': 0.2515333294868469, 'eval_runtime': 13.2896, 'eval_samples_per_second': 1128.705, 'eval_steps_per_second': 35.291, 'epoch': 5.0}


 50%|█████     | 13128/26256 [07:55<06:26, 33.95it/s]  

{'loss': 4.3571, 'learning_rate': 2.5e-05, 'epoch': 6.0}


                                                     
 50%|█████     | 13128/26256 [08:08<06:26, 33.95it/s]

{'eval_loss': 4.406322002410889, 'eval_top_one': 0.050333332270383835, 'eval_top_five': 0.1587333381175995, 'eval_mrr': 0.12384721636772156, 'eval_top_ten': 0.25573334097862244, 'eval_runtime': 13.2468, 'eval_samples_per_second': 1132.345, 'eval_steps_per_second': 35.405, 'epoch': 6.0}


 58%|█████▊    | 15316/26256 [09:16<05:35, 32.57it/s]  

{'loss': 4.3392, 'learning_rate': 2.0833333333333336e-05, 'epoch': 7.0}


                                                     
 58%|█████▊    | 15316/26256 [09:30<05:35, 32.57it/s]

{'eval_loss': 4.3957672119140625, 'eval_top_one': 0.05146666616201401, 'eval_top_five': 0.16233333945274353, 'eval_mrr': 0.12587445974349976, 'eval_top_ten': 0.2606666684150696, 'eval_runtime': 13.2367, 'eval_samples_per_second': 1133.217, 'eval_steps_per_second': 35.432, 'epoch': 7.0}


 67%|██████▋   | 17504/26256 [10:38<04:27, 32.77it/s]  

{'loss': 4.3239, 'learning_rate': 1.6666666666666667e-05, 'epoch': 8.0}


                                                     
 67%|██████▋   | 17504/26256 [10:51<04:27, 32.77it/s]

{'eval_loss': 4.388261318206787, 'eval_top_one': 0.052400000393390656, 'eval_top_five': 0.16446666419506073, 'eval_mrr': 0.12719734013080597, 'eval_top_ten': 0.26446667313575745, 'eval_runtime': 13.2294, 'eval_samples_per_second': 1133.838, 'eval_steps_per_second': 35.451, 'epoch': 8.0}


 75%|███████▌  | 19692/26256 [11:59<03:09, 34.59it/s]  

{'loss': 4.3141, 'learning_rate': 1.25e-05, 'epoch': 9.0}


                                                     
 75%|███████▌  | 19692/26256 [12:13<03:09, 34.59it/s]

{'eval_loss': 4.382510185241699, 'eval_top_one': 0.053199999034404755, 'eval_top_five': 0.1658666729927063, 'eval_mrr': 0.1283940225839615, 'eval_top_ten': 0.2657333314418793, 'eval_runtime': 13.1758, 'eval_samples_per_second': 1138.451, 'eval_steps_per_second': 35.596, 'epoch': 9.0}


 83%|████████▎ | 21880/26256 [13:20<02:14, 32.51it/s]  

{'loss': 4.3068, 'learning_rate': 8.333333333333334e-06, 'epoch': 10.0}


                                                     
 83%|████████▎ | 21880/26256 [13:33<02:14, 32.51it/s]

{'eval_loss': 4.378729343414307, 'eval_top_one': 0.05353333428502083, 'eval_top_five': 0.16813333332538605, 'eval_mrr': 0.1289985179901123, 'eval_top_ten': 0.268533319234848, 'eval_runtime': 13.1419, 'eval_samples_per_second': 1141.392, 'eval_steps_per_second': 35.688, 'epoch': 10.0}


 92%|█████████▏| 24068/26256 [14:41<01:08, 32.01it/s]  

{'loss': 4.3016, 'learning_rate': 4.166666666666667e-06, 'epoch': 11.0}


                                                     
 92%|█████████▏| 24068/26256 [14:54<01:08, 32.01it/s]

{'eval_loss': 4.376612663269043, 'eval_top_one': 0.05420000106096268, 'eval_top_five': 0.16813333332538605, 'eval_mrr': 0.1296611875295639, 'eval_top_ten': 0.26866665482521057, 'eval_runtime': 13.1096, 'eval_samples_per_second': 1144.197, 'eval_steps_per_second': 35.775, 'epoch': 11.0}


100%|██████████| 26256/26256 [16:02<00:00, 32.17it/s]

{'loss': 4.2968, 'learning_rate': 0.0, 'epoch': 12.0}


                                                     
100%|██████████| 26256/26256 [16:16<00:00, 32.17it/s]

{'eval_loss': 4.375930309295654, 'eval_top_one': 0.05426666513085365, 'eval_top_five': 0.16899999976158142, 'eval_mrr': 0.1298314779996872, 'eval_top_ten': 0.26899999380111694, 'eval_runtime': 13.7363, 'eval_samples_per_second': 1091.995, 'eval_steps_per_second': 34.143, 'epoch': 12.0}


100%|██████████| 26256/26256 [16:17<00:00, 26.87it/s]

{'train_runtime': 977.0725, 'train_samples_per_second': 859.711, 'train_steps_per_second': 26.872, 'train_loss': 4.382027002458962, 'epoch': 12.0}





TrainOutput(global_step=26256, training_loss=4.382027002458962, metrics={'train_runtime': 977.0725, 'train_samples_per_second': 859.711, 'train_steps_per_second': 26.872, 'train_loss': 4.382027002458962, 'epoch': 12.0})

In [17]:
trainer.evaluate(test_ds)

100%|██████████| 469/469 [00:13<00:00, 35.07it/s]


{'eval_loss': 4.383317470550537,
 'eval_top_one': 0.04919999837875366,
 'eval_top_five': 0.16333332657814026,
 'eval_mrr': 0.1257762461900711,
 'eval_top_ten': 0.2640666663646698,
 'eval_runtime': 13.5898,
 'eval_samples_per_second': 1103.771,
 'eval_steps_per_second': 34.511,
 'epoch': 12.0}

### Full Fine-Tuning

In [18]:
import os

In [20]:
for lr in [5e-5/100, 5e-5/300, 5e-5/1000]:
    os.makedirs(gpt2_dir/f"LPFT_100_{lr}", exist_ok=True)
    
    config = AutoConfig.from_pretrained("C:\\Users\\bunna\\Desktop\\classifier_100\\checkpoint-26256")
    config.num_labels = len(label2id)
    config.pad_token_id = tokenizer.pad_token_id
    model = AutoModelForSequenceClassification.from_pretrained("C:\\Users\\bunna\\Desktop\\classifier_100\\checkpoint-26256", config=config)

    optimizer = AdamW(
        model.parameters(),
        lr=lr,
    )

    num_training_steps = len(train_ds) // training_args.per_device_train_batch_size * training_args.num_train_epochs
    scheduler = get_scheduler(
        "cosine",
        optimizer,
        num_warmup_steps=0,  # You can adjust the warmup steps if needed
        num_training_steps=num_training_steps
    )

    training_args = TrainingArguments(
        output_dir=gpt2_dir/f"LPFT_100_{lr}",
        learning_rate=lr,
        per_device_train_batch_size=32,
        per_device_eval_batch_size=32,
        num_train_epochs=5,
        weight_decay=0.01,
        logging_strategy="epoch",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        push_to_hub=False,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        optimizers=(optimizer, scheduler),
    )

    # Train model
    trainer.train()
    metrics = trainer.evaluate(test_ds)
    print(metrics)

 20%|██        | 2188/10940 [02:57<11:44, 12.41it/s]

{'loss': 4.107, 'learning_rate': 4.914737092239488e-07, 'epoch': 1.0}


                                                    
 20%|██        | 2188/10940 [03:10<11:44, 12.41it/s]

{'eval_loss': 4.096588134765625, 'eval_top_one': 0.0860000029206276, 'eval_top_five': 0.24826666712760925, 'eval_mrr': 0.18203990161418915, 'eval_top_ten': 0.37786665558815, 'eval_runtime': 13.2238, 'eval_samples_per_second': 1134.315, 'eval_steps_per_second': 35.466, 'epoch': 1.0}


 40%|████      | 4376/10940 [06:09<08:18, 13.15it/s]  

{'loss': 3.9443, 'learning_rate': 4.664764179709776e-07, 'epoch': 2.0}


                                                    
 40%|████      | 4376/10940 [06:23<08:18, 13.15it/s]

{'eval_loss': 4.016777992248535, 'eval_top_one': 0.09353332966566086, 'eval_top_five': 0.27219998836517334, 'eval_mrr': 0.19478052854537964, 'eval_top_ten': 0.40540000796318054, 'eval_runtime': 13.3067, 'eval_samples_per_second': 1127.254, 'eval_steps_per_second': 35.245, 'epoch': 2.0}


 60%|██████    | 6564/10940 [09:22<05:20, 13.66it/s]  

{'loss': 3.8669, 'learning_rate': 4.267131996317781e-07, 'epoch': 3.0}


                                                    
 60%|██████    | 6564/10940 [09:35<05:20, 13.66it/s]

{'eval_loss': 3.9726312160491943, 'eval_top_one': 0.1003333330154419, 'eval_top_five': 0.28033334016799927, 'eval_mrr': 0.20310531556606293, 'eval_top_ten': 0.41973334550857544, 'eval_runtime': 13.2752, 'eval_samples_per_second': 1129.926, 'eval_steps_per_second': 35.329, 'epoch': 3.0}


 80%|████████  | 8752/10940 [12:35<02:53, 12.61it/s]  

{'loss': 3.8116, 'learning_rate': 3.748963163003632e-07, 'epoch': 4.0}


                                                    
 80%|████████  | 8752/10940 [12:48<02:53, 12.61it/s]

{'eval_loss': 3.9451792240142822, 'eval_top_one': 0.10533333569765091, 'eval_top_five': 0.28646665811538696, 'eval_mrr': 0.20821581780910492, 'eval_top_ten': 0.42820000648498535, 'eval_runtime': 13.2602, 'eval_samples_per_second': 1131.203, 'eval_steps_per_second': 35.369, 'epoch': 4.0}


100%|██████████| 10940/10940 [15:47<00:00, 13.33it/s] 

{'loss': 3.7725, 'learning_rate': 3.1456021449187194e-07, 'epoch': 5.0}


                                                     
100%|██████████| 10940/10940 [16:00<00:00, 13.33it/s]

{'eval_loss': 3.9244325160980225, 'eval_top_one': 0.10980000346899033, 'eval_top_five': 0.2926666736602783, 'eval_mrr': 0.21301965415477753, 'eval_top_ten': 0.43459999561309814, 'eval_runtime': 13.3085, 'eval_samples_per_second': 1127.096, 'eval_steps_per_second': 35.241, 'epoch': 5.0}


100%|██████████| 10940/10940 [16:03<00:00, 11.36it/s]


{'train_runtime': 963.0331, 'train_samples_per_second': 363.435, 'train_steps_per_second': 11.36, 'train_loss': 3.900432758226691, 'epoch': 5.0}


100%|██████████| 469/469 [00:13<00:00, 35.21it/s]


{'eval_loss': 3.9390363693237305, 'eval_top_one': 0.10566666722297668, 'eval_top_five': 0.2895333468914032, 'eval_mrr': 0.2098255157470703, 'eval_top_ten': 0.419866681098938, 'eval_runtime': 13.3809, 'eval_samples_per_second': 1121.004, 'eval_steps_per_second': 35.05, 'epoch': 5.0}


 20%|██        | 2188/10940 [02:59<11:39, 12.52it/s]

{'loss': 4.2042, 'learning_rate': 1.5073734100432617e-07, 'epoch': 1.0}


                                                    
 20%|██        | 2188/10940 [03:12<11:39, 12.52it/s]

{'eval_loss': 4.2315521240234375, 'eval_top_one': 0.07460000365972519, 'eval_top_five': 0.2134000062942505, 'eval_mrr': 0.16111303865909576, 'eval_top_ten': 0.328000009059906, 'eval_runtime': 13.3265, 'eval_samples_per_second': 1125.58, 'eval_steps_per_second': 35.193, 'epoch': 1.0}


 40%|████      | 4376/10940 [06:12<08:21, 13.08it/s]  

{'loss': 4.0984, 'learning_rate': 1.090392060026702e-07, 'epoch': 2.0}


                                                    
 40%|████      | 4376/10940 [06:25<08:21, 13.08it/s]

{'eval_loss': 4.160536289215088, 'eval_top_one': 0.08153333514928818, 'eval_top_five': 0.23393332958221436, 'eval_mrr': 0.17389370501041412, 'eval_top_ten': 0.35519999265670776, 'eval_runtime': 13.2976, 'eval_samples_per_second': 1128.026, 'eval_steps_per_second': 35.27, 'epoch': 2.0}


 60%|██████    | 6564/10940 [09:24<05:19, 13.71it/s]  

{'loss': 4.0495, 'learning_rate': 5.7513617788585624e-08, 'epoch': 3.0}


                                                    
 60%|██████    | 6564/10940 [09:38<05:19, 13.71it/s]

{'eval_loss': 4.129701614379883, 'eval_top_one': 0.08286666870117188, 'eval_top_five': 0.24173332750797272, 'eval_mrr': 0.17767737805843353, 'eval_top_ten': 0.36666667461395264, 'eval_runtime': 13.2345, 'eval_samples_per_second': 1133.403, 'eval_steps_per_second': 35.438, 'epoch': 3.0}


 80%|████████  | 8752/10940 [12:37<02:50, 12.80it/s]  

{'loss': 4.0258, 'learning_rate': 1.5859005352611847e-08, 'epoch': 4.0}


                                                    
 80%|████████  | 8752/10940 [12:51<02:50, 12.80it/s]

{'eval_loss': 4.118958473205566, 'eval_top_one': 0.08326666802167892, 'eval_top_five': 0.24379999935626984, 'eval_mrr': 0.17884159088134766, 'eval_top_ten': 0.3694666624069214, 'eval_runtime': 13.7349, 'eval_samples_per_second': 1092.109, 'eval_steps_per_second': 34.147, 'epoch': 4.0}


100%|██████████| 10940/10940 [15:51<00:00, 13.49it/s] 

{'loss': 4.0196, 'learning_rate': 8.59786977539893e-14, 'epoch': 5.0}


                                                     
100%|██████████| 10940/10940 [16:04<00:00, 13.49it/s]

{'eval_loss': 4.117405414581299, 'eval_top_one': 0.08340000361204147, 'eval_top_five': 0.24459999799728394, 'eval_mrr': 0.17907018959522247, 'eval_top_ten': 0.3700000047683716, 'eval_runtime': 13.5238, 'eval_samples_per_second': 1109.159, 'eval_steps_per_second': 34.68, 'epoch': 5.0}


100%|██████████| 10940/10940 [16:06<00:00, 11.32it/s]


{'train_runtime': 966.741, 'train_samples_per_second': 362.041, 'train_steps_per_second': 11.316, 'train_loss': 4.079518252970749, 'epoch': 5.0}


100%|██████████| 469/469 [00:13<00:00, 34.77it/s]


{'eval_loss': 4.137241840362549, 'eval_top_one': 0.08073333650827408, 'eval_top_five': 0.2425999939441681, 'eval_mrr': 0.17649109661579132, 'eval_top_ten': 0.3662000000476837, 'eval_runtime': 13.4726, 'eval_samples_per_second': 1113.372, 'eval_steps_per_second': 34.811, 'epoch': 5.0}


 20%|██        | 2188/10940 [02:56<11:41, 12.48it/s]

{'loss': 4.2573, 'learning_rate': 4.522120230129785e-08, 'epoch': 1.0}


                                                    
 20%|██        | 2188/10940 [03:11<11:41, 12.48it/s]

{'eval_loss': 4.3245110511779785, 'eval_top_one': 0.05999999865889549, 'eval_top_five': 0.18386666476726532, 'eval_mrr': 0.1397770792245865, 'eval_top_ten': 0.29179999232292175, 'eval_runtime': 14.146, 'eval_samples_per_second': 1060.369, 'eval_steps_per_second': 33.154, 'epoch': 1.0}


 40%|████      | 4376/10940 [06:12<09:10, 11.93it/s]  

{'loss': 4.2124, 'learning_rate': 3.2711761800801064e-08, 'epoch': 2.0}


                                                    
 40%|████      | 4376/10940 [06:26<09:10, 11.93it/s]

{'eval_loss': 4.288690567016602, 'eval_top_one': 0.06646666675806046, 'eval_top_five': 0.19553333520889282, 'eval_mrr': 0.14882200956344604, 'eval_top_ten': 0.30693334341049194, 'eval_runtime': 13.9979, 'eval_samples_per_second': 1071.591, 'eval_steps_per_second': 33.505, 'epoch': 2.0}


 60%|██████    | 6564/10940 [09:30<05:13, 13.97it/s]  

{'loss': 4.185, 'learning_rate': 1.725408533657569e-08, 'epoch': 3.0}


                                                    
 60%|██████    | 6564/10940 [09:43<05:13, 13.97it/s]

{'eval_loss': 4.268123626708984, 'eval_top_one': 0.06960000097751617, 'eval_top_five': 0.2011999934911728, 'eval_mrr': 0.15329992771148682, 'eval_top_ten': 0.31726667284965515, 'eval_runtime': 13.6049, 'eval_samples_per_second': 1102.54, 'eval_steps_per_second': 34.473, 'epoch': 3.0}


 80%|████████  | 8752/10940 [12:43<03:01, 12.02it/s]  

{'loss': 4.1693, 'learning_rate': 4.7577016057835545e-09, 'epoch': 4.0}


                                                    
 80%|████████  | 8752/10940 [12:56<03:01, 12.02it/s]

{'eval_loss': 4.259817123413086, 'eval_top_one': 0.070333331823349, 'eval_top_five': 0.2027333378791809, 'eval_mrr': 0.15482746064662933, 'eval_top_ten': 0.31940001249313354, 'eval_runtime': 13.6014, 'eval_samples_per_second': 1102.826, 'eval_steps_per_second': 34.482, 'epoch': 4.0}


100%|██████████| 10940/10940 [15:55<00:00, 13.46it/s] 

{'loss': 4.1662, 'learning_rate': 2.579360932619679e-14, 'epoch': 5.0}


                                                     
100%|██████████| 10940/10940 [16:09<00:00, 13.46it/s]

{'eval_loss': 4.25885009765625, 'eval_top_one': 0.07046666741371155, 'eval_top_five': 0.2029999941587448, 'eval_mrr': 0.15506814420223236, 'eval_top_ten': 0.31966665387153625, 'eval_runtime': 13.5864, 'eval_samples_per_second': 1104.042, 'eval_steps_per_second': 34.52, 'epoch': 5.0}


100%|██████████| 10940/10940 [16:11<00:00, 11.26it/s]


{'train_runtime': 971.6554, 'train_samples_per_second': 360.21, 'train_steps_per_second': 11.259, 'train_loss': 4.198042768795704, 'epoch': 5.0}


100%|██████████| 469/469 [00:13<00:00, 34.83it/s]

{'eval_loss': 4.272396087646484, 'eval_top_one': 0.06599999964237213, 'eval_top_five': 0.20000000298023224, 'eval_mrr': 0.15055006742477417, 'eval_top_ten': 0.3124000132083893, 'eval_runtime': 13.5258, 'eval_samples_per_second': 1108.989, 'eval_steps_per_second': 34.674, 'epoch': 5.0}



