In [28]:
# Setup
! pip install seqeval evaluate
! pip install kaleido
! pip install --upgrade nevergrad # upgrade to ensure latest version



In [29]:
# Library imports
from transformers import AutoTokenizer, AutoModel, pipeline, AutoConfig, DistilBertForSequenceClassification, DistilBertModel, DistilBertConfig, DistilBertPreTrainedModel, AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding, TrainingArguments, Trainer
from transformers.modeling_outputs import SequenceClassifierOutput
from transformers.tokenization_utils_base import BatchEncoding
from datasets import Dataset, DatasetDict
import torch
import torch.nn as nn
from google.colab import drive, userdata
import pickle
import random
import re
import time
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import plotly.express as px
import evaluate
import pprint
import kaleido
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
import re
from tqdm import tqdm
import torch.nn.functional as F
from torch.utils.data import DataLoader
import nevergrad as ng
import wandb
import shutil
import tempfile
import os

In [30]:
# Mount drive
drive.mount("/content/drive")
%cd '/content/drive/MyDrive/Colab Notebooks/Math_Graph/pickle_files'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/Colab Notebooks/Math_Graph/pickle_files


In [31]:
# View all pandas columns, rows
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [32]:
# Define file read function
def read_pickle(dict_file):
  with open(dict_file, 'rb') as file:
    return pickle.load(file)

In [33]:
# Load custom trained model

checkpoint = "Heather-Driver/distilbert-NER-LinearAlg-finetuned"
tokenizer = AutoTokenizer.from_pretrained(checkpoint, do_lower_case=False)
distilbert_model = DistilBertModel.from_pretrained(checkpoint)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [34]:
def extract_window(sentence, predicate, window_size):
  """This function creates a window around the matching predicate in order to tokenize and later get the span vectors for the window.
  The function adjusts according to the window size wanted"""
  tokens = sentence.split()

  # Find the starting index of the predicate in the sentence (find all word positions for the predicate)
  pattern = re.escape(predicate)  # Escape the predicate string to handle special characters if any
  match = re.search(pattern, sentence)

  if not match:
      return "Predicate not found in the sentence."

  # Get the index of where the predicate starts in the list of tokens
  start_index = len(sentence[:match.start()].split())  # Token index of the start of predicate

  # Define the sample window
  start_window = max(0, start_index - window_size)
  end_window = min(len(tokens), start_index + len(predicate.split()) + window_size)

  # Create the window of words around the predicate
  window = tokens[start_window:end_window]

  # If the window is too short at the beginning or the end, adjust to take as many as possible
  if start_window == 0:
      # If the window is at the start, extend the end if possible
      end_window = min(len(tokens), start_index + len(predicate.split()) + window_size)
  if end_window == len(tokens):
      # If the window is at the end, extend the start if possible
      start_window = max(0, start_index - window_size)

  # Create the window of words around the predicate again after adjustments
  window = tokens[start_window:end_window]
  return ' '.join(window)

def adds_context_window(window_size, df):
  for i in range(len(df)):
    text = extract_window(df.at[i, 'sentence'], df.at[i, 'predicate'], window_size=window_size)
    df.at[i, 'context_window'] = text
  return df

In [35]:
# Read in dictionary
predicate_data = read_pickle('predicate_data.pkl')
predicate_data = predicate_data.rename(columns={'Window_1': 'context_window', 'Label': 'string_label'})
predicate_data.columns = predicate_data.columns.str.lower()

In [36]:
# Need mapping of classification tags to their indices for model to use

index2tag = {idx:tag for idx, tag in enumerate(predicate_data['string_label'].unique())} # This is just a nonsignificant arbitrary mapping of the label to a number for training the model
tag2index = {tag:idx for idx, tag in enumerate(predicate_data['string_label'].unique())} # To lookup indices from tags

In [37]:
predicate_data['label'] = predicate_data['string_label'].map(tag2index)

In [38]:
predicate_data = adds_context_window(window_size=1, df=predicate_data)

In [39]:
predicate_data.head(2)

Unnamed: 0,sentence,subject,predicate,object,string_label,context_window,label
0,The Wishart distribution is used in multivaria...,wishart distribution,is used in,multivariate statistics,used in,distribution is used in multivariate,0
1,The Square Root Method is transformed by the a...,Square Root Method,transformed by,the application of inverse operations to deriv...,computation,is transformed by the,1


In [40]:
X_train_indices, X_test_indices, y_train_indices, y_test_indices = train_test_split(predicate_data.index.to_numpy(), predicate_data['label'].to_numpy(),
                                                                                    test_size=0.05, random_state=42, stratify=predicate_data['label'].to_numpy())

# Repeat to get validation sub-sample of Train
X_train_indices, X_valid_indices, y_train_indices, y_valid_indices = train_test_split(X_train_indices, y_train_indices, test_size=0.3, random_state=42, stratify=y_train_indices)

## Preprocessing

In [41]:
dataset = Dataset.from_pandas(predicate_data[['sentence', 'label', 'context_window', 'predicate']])

In [42]:
# Select subsets of the dataset for train, test and validation
train_split = dataset.select(X_train_indices)
test_split = dataset.select(X_test_indices)
valid_split = dataset.select(X_valid_indices)

dataset = DatasetDict({
    'train': train_split,
    'test': test_split,
    'validation': valid_split
})

In [43]:
def preprocess_function_predicate(examples):
  context_inputs = tokenizer(examples["predicate"], return_tensors="pt", add_special_tokens=False, truncation=True, padding="max_length", max_length=256)
  return context_inputs

dataset = dataset.map(preprocess_function_predicate, batched=True)
# Rename the 'attention_mask' column to 'context_attention_mask'
dataset = dataset.rename_columns({"attention_mask": "predicate_attention_mask", "input_ids": "predicate_input_ids"})

Map:   0%|          | 0/292 [00:00<?, ? examples/s]

Map:   0%|          | 0/22 [00:00<?, ? examples/s]

Map:   0%|          | 0/126 [00:00<?, ? examples/s]

In [44]:
def preprocess_function_context(examples):
  context_inputs = tokenizer(examples["context_window"], return_tensors="pt", add_special_tokens=True, truncation=True, padding="max_length", max_length=256)
  return context_inputs

dataset = dataset.map(preprocess_function_context, batched=True)
# Rename the 'attention_mask' column to 'context_attention_mask'
dataset = dataset.rename_columns({"attention_mask": "context_attention_mask", "input_ids": "context_input_ids"})

Map:   0%|          | 0/292 [00:00<?, ? examples/s]

Map:   0%|          | 0/22 [00:00<?, ? examples/s]

Map:   0%|          | 0/126 [00:00<?, ? examples/s]

In [45]:
def preprocess_function(examples):
  inputs = tokenizer(examples["sentence"], return_tensors="pt", add_special_tokens=True, truncation=True, padding="max_length", max_length=256)
  return inputs

dataset = dataset.map(preprocess_function, batched=True)

dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label', 'context_attention_mask', 'context_input_ids', 'predicate_attention_mask', 'predicate_input_ids'])

Map:   0%|          | 0/292 [00:00<?, ? examples/s]

Map:   0%|          | 0/22 [00:00<?, ? examples/s]

Map:   0%|          | 0/126 [00:00<?, ? examples/s]

## Developing the Model Parameters

In [46]:
class DistilBertForSentenceClassificationSpan(DistilBertPreTrainedModel):
  config_class = DistilBertConfig

  def __init__(self, config):
    super().__init__(config)
    self.num_labels = config.num_labels
    # Model body
    self.distilbert = distilbert_model
    # Attention mechanism for context
    self.attention_w = nn.Parameter(torch.randn(config.hidden_size))  # Trainable attention vector
    self.attention_bias = nn.Parameter(torch.zeros(1))  # Bias term for attention
    # Lookup table for context width embeddings
    self.width_embedding = nn.Embedding(config.max_position_embeddings, config.hidden_size)
    # Classification head
    self.classifier = nn.Linear(768 * 4, self.num_labels)  # Span * 4 (CLS, span, width, predicate)

  def _predicate_weight_embedding(self, predicate_input_ids, predicate_attention_mask):
    outputs = self.distilbert(input_ids=predicate_input_ids, attention_mask=predicate_attention_mask, output_attentions=False)
    predicate_embeddings = outputs.last_hidden_state  # shape: [batch_size, seq_len, hidden_size] torch.Size([batch_size, 512, 768]) --> 512 to 256
    # print('context_embeddings', context_embeddings.shape)
    # attention scores
    attention_scores = torch.matmul(predicate_embeddings, self.attention_w) + self.attention_bias  # shape: [batch_size, seq_len, 1] torch.Size([1, 512, 1])
    # print('attention_scores', attention_scores.shape)

    # get attention weights from softmax
    attention_weights = torch.nn.functional.softmax(attention_scores, dim=-1)  # shape: [batch_size, num_heads, seq_len, seq_len] torch.Size([1, 12, 512, 512])
    # print('attention_weights', attention_weights.shape)
    attention_weights = attention_weights.unsqueeze(-1)
    # print('attention_weights', attention_weights.shape)

    # # Weighted sum of span embeddings to get the final attention span representation
    # weighted_context_embeddings = attention_weights * context_embeddings.unsqueeze(2)  # shape: [1, 12, 512, 512, 768]

    weighted_span_embeddings = torch.sum(attention_weights * predicate_embeddings, dim=1)  # shape: [1, 768]
    # print('weighted_span_embeddings', weighted_span_embeddings.shape)
    return weighted_span_embeddings #span representation [1, 768]

  def _attention_weight_embedding(self, context_input_ids, context_attention_mask):
    outputs = self.distilbert(input_ids=context_input_ids, attention_mask=context_attention_mask, output_attentions=False)
    context_embeddings = outputs.last_hidden_state  # shape: [batch_size, seq_len, hidden_size] torch.Size([batch_size, 512, 768]) --> 512 to 256
    # print('context_embeddings', context_embeddings.shape)
    # attention scores
    attention_scores = torch.matmul(context_embeddings, self.attention_w) + self.attention_bias  # shape: [batch_size, seq_len, 1] torch.Size([1, 512, 1])
    # print('attention_scores', attention_scores.shape)

    # get attention weights from softmax
    attention_weights = torch.nn.functional.softmax(attention_scores, dim=-1)  # shape: [batch_size, num_heads, seq_len, seq_len] torch.Size([1, 12, 512, 512])
    # print('attention_weights', attention_weights.shape)
    attention_weights = attention_weights.unsqueeze(-1)
    # print('attention_weights', attention_weights.shape)

    # # Weighted sum of span embeddings to get the final attention span representation
    # weighted_context_embeddings = attention_weights * context_embeddings.unsqueeze(2)  # shape: [1, 12, 512, 512, 768]

    weighted_window_embeddings = torch.sum(attention_weights * context_embeddings, dim=1)  # shape: [1, 768]
    # print('weighted_span_embeddings', weighted_span_embeddings.shape)

    return weighted_window_embeddings #span representation [1, 768]

  def _cls_embeddings(self, input_ids, attention_mask):
    outputs = self.distilbert(input_ids=input_ids, attention_mask=attention_mask)
    embeddings = outputs.last_hidden_state # shape [1, 512, 768]
    cls_embedding = embeddings[:, 0:1, :].squeeze(1)  # squeeze converts [1, 1, 768] to [1, 768]
    return cls_embedding

  def _width_embeddings(self, context_input_ids):
    span_length = context_input_ids.ne(0).sum(dim=1) -2
    # Ensure no negative or zero indices (minimum span length should be 1)
    span_length = torch.clamp(span_length, min=1)
    width_embedding = self.width_embedding(span_length)
    return width_embedding

  def forward(self, context_input_ids=None, context_attention_mask=None, input_ids=None, attention_mask=None, token_type_ids=None, labels=None, predicate_input_ids=None, predicate_attention_mask=None, **kwargs):
    # Get embeddings from attention weights
    attention_weight_embedding = self._attention_weight_embedding(context_input_ids=context_input_ids, context_attention_mask=context_attention_mask)
    # Get embeddings from predicate weights
    attention_weight_predicate = self._predicate_weight_embedding(predicate_input_ids=predicate_input_ids, predicate_attention_mask=predicate_attention_mask)
    # Get width embedding
    width_embedding = self._width_embeddings(context_input_ids=context_input_ids)
    # Get CLS token embedding
    cls_embedding = self._cls_embeddings(input_ids=input_ids, attention_mask=attention_mask)
    # Concatenate span representation, [CLS] embedding, and width embedding
    final_representation = torch.cat((attention_weight_embedding, cls_embedding, width_embedding, attention_weight_predicate), dim=-1)  # shape: [batch_size, 768*4]
    # Classifier on concat
    logits = self.classifier(final_representation)
    # Loss calc
    loss = None
    if labels is not None:
      loss_fct = nn.CrossEntropyLoss()
      loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
    return SequenceClassifierOutput(loss=loss, logits=logits)

In [47]:
config = DistilBertConfig.from_pretrained("Heather-Driver/distilbert-NER-LinearAlg-finetuned")
config.label2id = tag2index
config.id2label = index2tag
config.num_labels = len(index2tag)

model = DistilBertForSentenceClassificationSpan(config)
model.to(device)
model.gradient_checkpointing_enable()

In [48]:
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    return_tensors="pt",
)

In [49]:
def compute_metrics(pred):
  labels = pred.label_ids
  preds = pred.predictions.argmax(-1)
  f1 = f1_score(labels, preds, average="weighted") #y_true, y_pred
  acc = accuracy_score(labels, preds)
  precision = precision_score(labels, preds, average="weighted")
  recall = recall_score(labels, preds, average="weighted")
  return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

## Nevergrad optimisation

In [50]:
# Load the best hyperparameters from the pickle file
with open("best_hyperparameters.pkl", "rb") as f:
    best_params = pickle.load(f)

print("Loaded best hyperparameters:", best_params)

# You can now use these hyperparameters, for example:
# train_and_evaluate(**best_hyperparameters, name="final_model_run")

Loaded best hyperparameters: {'learning_rate': 0.0005661960144606241, 'batch_size': 32, 'weight_decay': 0.0003358596215703767, 'warmup_steps': 500, 'gradient_accumulation_steps': 2}


In [51]:
%cd '/content/drive/MyDrive/Colab Notebooks/Math_Graph/Model/transformers_cache'

[Errno 2] No such file or directory: '/content/drive/MyDrive/Colab Notebooks/Math_Graph/Model/transformers_cache'
/content/drive/MyDrive/Colab Notebooks/Math_Graph/pickle_files


In [52]:
  # Set output_dir in TrainingArguments to this temporary folder.
  # After exiting the with-block, the directory is deleted.
os.environ["TRANSFORMERS_CACHE"] = '/content/drive/MyDrive/Colab Notebooks/Math_Graph/Model/transformers_cache'

In [53]:
def train_and_evaluate(learning_rate, batch_size, weight_decay, warmup_steps, gradient_accumulation_steps, name):
  """Function to train the model with different hyperparameters"""
  output_dir = '/content/drive/MyDrive/Colab Notebooks/Math_Graph/Model/' + name
  training_arguments = TrainingArguments(
      output_dir=output_dir,
      run_name=name,
      log_level="error",
      num_train_epochs=25,  # Keeping epochs fixed for now
      per_device_train_batch_size=int(batch_size),
      per_device_eval_batch_size=int(batch_size),
      learning_rate=learning_rate,
      weight_decay=weight_decay,
      warmup_steps=int(warmup_steps),
      gradient_accumulation_steps=int(gradient_accumulation_steps),
      eval_strategy="no",
      save_strategy="no",
      disable_tqdm=True,  # Avoid flooding the output
      report_to=["wandb"],
      load_best_model_at_end=True,
      push_to_hub=False,
      greater_is_better=True,
      save_safetensors=True,
      save_total_limit=1,
  )

  trainer = Trainer(
      model=model,
      args=training_arguments,
      train_dataset=dataset["train"],
      eval_dataset=dataset["validation"],
      tokenizer=tokenizer,
      data_collator=data_collator,
      compute_metrics=compute_metrics,
  )

  # Train and evaluate
  trainer.train()
  metrics = trainer.evaluate()

  # Finish the current W&B run so that a new run is created next time
  wandb.finish()

  # Delete the output directory to free disk space
  shutil.rmtree(output_dir, ignore_errors=True)

  # Return negative F1-score (Nevergrad minimizes)
  return -metrics["eval_f1"]

# Define the hyperparameter search space using Instrumentation
instrum = ng.p.Instrumentation(
    learning_rate=ng.p.Log(lower=1e-6, upper=1e-3),        # Logarithmic scale
    batch_size=ng.p.TransitionChoice([4, 8, 16, 32]),
    weight_decay=ng.p.Log(lower=1e-5, upper=1e-1),
    warmup_steps=ng.p.TransitionChoice([100, 500, 1000, 2000]),
    gradient_accumulation_steps=ng.p.TransitionChoice([1, 2, 4])
)

# Initialize the optimizer with the instrumentation and a budget
optimizer = ng.optimizers.OnePlusOne(instrum, budget=20)  # 20 iterations

# Define a wrapper function that takes a dictionary of parameters and unpacks them for train_and_evaluate
# Wrapper function with dynamic naming
def objective_function_wrapper(**parameters):
    # Construct a unique name from the hyperparameters
    combo_name = (
        f"lr_{parameters['learning_rate']:.1e}_"
        f"bs_{parameters['batch_size']}_"
        f"wd_{parameters['weight_decay']:.1e}_"
        f"ws_{parameters['warmup_steps']}_"
        f"gas_{parameters['gradient_accumulation_steps']}"
    )
    combo_name = "optimized_model_window_1" + combo_name
    return train_and_evaluate(**parameters, name=combo_name)

# Update the optimizer.minimize call to use the wrapper function
recommendation = optimizer.minimize(objective_function_wrapper)

# Print the best hyperparameters found
print("Best hyperparameters:", recommendation.kwargs)

# Save best hyperparameters to a pickle file
with open("best_hyperparameters.pkl", "wb") as f:
    pickle.dump(recommendation.kwargs, f)

print("Best hyperparameters:", recommendation.kwargs)
best_params = recommendation.kwargs

  trainer = Trainer(


{'train_runtime': 125.9785, 'train_samples_per_second': 57.946, 'train_steps_per_second': 1.786, 'train_loss': 4.029936794704861, 'epoch': 22.526315789473685}
{'eval_loss': 1.3474440574645996, 'eval_accuracy': 0.7063492063492064, 'eval_f1': 0.6967525630569109, 'eval_precision': 0.6946215160500874, 'eval_recall': 0.7063492063492064, 'eval_runtime': 0.63, 'eval_samples_per_second': 199.998, 'eval_steps_per_second': 12.698, 'epoch': 22.526315789473685}


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/precision,▁
eval/recall,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁
train/global_step,▁▁

0,1
eval/accuracy,0.70635
eval/f1,0.69675
eval/loss,1.34744
eval/precision,0.69462
eval/recall,0.70635
eval/runtime,0.63
eval/samples_per_second,199.998
eval/steps_per_second,12.698
total_flos,430444078940160.0
train/epoch,22.52632


  trainer = Trainer(


{'train_runtime': 132.1748, 'train_samples_per_second': 55.23, 'train_steps_per_second': 0.946, 'train_loss': 0.6545215454101563, 'epoch': 25.0}
{'eval_loss': 0.9218429923057556, 'eval_accuracy': 0.8412698412698413, 'eval_f1': 0.8392888180500327, 'eval_precision': 0.8669197865626438, 'eval_recall': 0.8412698412698413, 'eval_runtime': 0.5926, 'eval_samples_per_second': 212.63, 'eval_steps_per_second': 6.75, 'epoch': 25.0}


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/precision,▁
eval/recall,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁
train/global_step,▁▁

0,1
eval/accuracy,0.84127
eval/f1,0.83929
eval/loss,0.92184
eval/precision,0.86692
eval/recall,0.84127
eval/runtime,0.5926
eval/samples_per_second,212.63
eval/steps_per_second,6.75
total_flos,477254218752000.0
train/epoch,25.0


  trainer = Trainer(


{'train_runtime': 140.8568, 'train_samples_per_second': 51.826, 'train_steps_per_second': 3.372, 'train_loss': 0.029348230863872327, 'epoch': 25.0}
{'eval_loss': 1.2410203218460083, 'eval_accuracy': 0.8492063492063492, 'eval_f1': 0.8448882328851273, 'eval_precision': 0.8624900232043089, 'eval_recall': 0.8492063492063492, 'eval_runtime': 0.6348, 'eval_samples_per_second': 198.483, 'eval_steps_per_second': 12.602, 'epoch': 25.0}


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/precision,▁
eval/recall,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁
train/global_step,▁▁

0,1
eval/accuracy,0.84921
eval/f1,0.84489
eval/loss,1.24102
eval/precision,0.86249
eval/recall,0.84921
eval/runtime,0.6348
eval/samples_per_second,198.483
eval/steps_per_second,12.602
total_flos,477254218752000.0
train/epoch,25.0


  trainer = Trainer(


{'train_runtime': 133.0663, 'train_samples_per_second': 54.86, 'train_steps_per_second': 1.879, 'train_loss': 2.1301586180925368e-05, 'epoch': 25.0}
{'eval_loss': 1.2562686204910278, 'eval_accuracy': 0.8492063492063492, 'eval_f1': 0.844615542612437, 'eval_precision': 0.8624028088313802, 'eval_recall': 0.8492063492063492, 'eval_runtime': 0.5924, 'eval_samples_per_second': 212.707, 'eval_steps_per_second': 6.753, 'epoch': 25.0}


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/precision,▁
eval/recall,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁
train/global_step,▁▁

0,1
eval/accuracy,0.84921
eval/f1,0.84462
eval/loss,1.25627
eval/precision,0.8624
eval/recall,0.84921
eval/runtime,0.5924
eval/samples_per_second,212.707
eval/steps_per_second,6.753
total_flos,477254218752000.0
train/epoch,25.0


  trainer = Trainer(


{'train_runtime': 156.1164, 'train_samples_per_second': 46.76, 'train_steps_per_second': 2.882, 'train_loss': 5.983140112625228e-06, 'epoch': 23.71232876712329}
{'eval_loss': 1.462624430656433, 'eval_accuracy': 0.8412698412698413, 'eval_f1': 0.8382640019961907, 'eval_precision': 0.8533809664762044, 'eval_recall': 0.8412698412698413, 'eval_runtime': 0.7759, 'eval_samples_per_second': 162.382, 'eval_steps_per_second': 41.24, 'epoch': 23.71232876712329}


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/precision,▁
eval/recall,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁
train/global_step,▁▁

0,1
eval/accuracy,0.84127
eval/f1,0.83826
eval/loss,1.46262
eval/precision,0.85338
eval/recall,0.84127
eval/runtime,0.7759
eval/samples_per_second,162.382
eval/steps_per_second,41.24
total_flos,452672357621760.0
train/epoch,23.71233


  trainer = Trainer(


{'loss': 0.0, 'grad_norm': 4.185895647879079e-07, 'learning_rate': 8.68271013972848e-06, 'epoch': 6.8493150684931505}
{'loss': 0.0, 'grad_norm': 1.8917465922640986e-06, 'learning_rate': 5.40621574737811e-06, 'epoch': 13.698630136986301}
{'loss': 0.0, 'grad_norm': 3.6808423828915693e-06, 'learning_rate': 2.12972135502774e-06, 'epoch': 20.54794520547945}
{'train_runtime': 172.7118, 'train_samples_per_second': 42.267, 'train_steps_per_second': 10.567, 'train_loss': 1.9563385524646674e-08, 'epoch': 25.0}
{'eval_loss': 1.5293554067611694, 'eval_accuracy': 0.8412698412698413, 'eval_f1': 0.8383519494754424, 'eval_precision': 0.8537225561035083, 'eval_recall': 0.8412698412698413, 'eval_runtime': 0.7759, 'eval_samples_per_second': 162.396, 'eval_steps_per_second': 41.243, 'epoch': 25.0}


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/precision,▁
eval/recall,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▄▆██
train/global_step,▁▄▆██

0,1
eval/accuracy,0.84127
eval/f1,0.83835
eval/loss,1.52936
eval/precision,0.85372
eval/recall,0.84127
eval/runtime,0.7759
eval/samples_per_second,162.396
eval/steps_per_second,41.243
total_flos,477254218752000.0
train/epoch,25.0


  trainer = Trainer(


{'train_runtime': 133.2982, 'train_samples_per_second': 54.764, 'train_steps_per_second': 1.875, 'train_loss': 1.1578201338124927e-08, 'epoch': 25.0}
{'eval_loss': 1.5401413440704346, 'eval_accuracy': 0.8412698412698413, 'eval_f1': 0.8383519494754424, 'eval_precision': 0.8537225561035083, 'eval_recall': 0.8412698412698413, 'eval_runtime': 0.594, 'eval_samples_per_second': 212.124, 'eval_steps_per_second': 6.734, 'epoch': 25.0}


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/precision,▁
eval/recall,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁
train/global_step,▁▁

0,1
eval/accuracy,0.84127
eval/f1,0.83835
eval/loss,1.54014
eval/precision,0.85372
eval/recall,0.84127
eval/runtime,0.594
eval/samples_per_second,212.124
eval/steps_per_second,6.734
total_flos,477254218752000.0
train/epoch,25.0


  trainer = Trainer(


{'train_runtime': 133.1567, 'train_samples_per_second': 54.823, 'train_steps_per_second': 1.877, 'train_loss': 6.854533694422571e-09, 'epoch': 25.0}
{'eval_loss': 1.56757390499115, 'eval_accuracy': 0.8412698412698413, 'eval_f1': 0.8383519494754424, 'eval_precision': 0.8537225561035083, 'eval_recall': 0.8412698412698413, 'eval_runtime': 0.5934, 'eval_samples_per_second': 212.347, 'eval_steps_per_second': 6.741, 'epoch': 25.0}


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/precision,▁
eval/recall,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁
train/global_step,▁▁

0,1
eval/accuracy,0.84127
eval/f1,0.83835
eval/loss,1.56757
eval/precision,0.85372
eval/recall,0.84127
eval/runtime,0.5934
eval/samples_per_second,212.347
eval/steps_per_second,6.741
total_flos,477254218752000.0
train/epoch,25.0


  trainer = Trainer(


{'loss': 0.0, 'grad_norm': 2.5128494485215924e-07, 'learning_rate': 7.681159420289861e-07, 'epoch': 6.8493150684931505}
{'loss': 0.0, 'grad_norm': 6.923531259417359e-07, 'learning_rate': 4.782608695652178e-07, 'epoch': 13.698630136986301}
{'loss': 0.0, 'grad_norm': 2.095130639645504e-06, 'learning_rate': 1.8840579710144943e-07, 'epoch': 20.54794520547945}
{'train_runtime': 172.8687, 'train_samples_per_second': 42.229, 'train_steps_per_second': 10.557, 'train_loss': 3.3966480010374107e-09, 'epoch': 25.0}
{'eval_loss': 1.570379614830017, 'eval_accuracy': 0.8412698412698413, 'eval_f1': 0.8383519494754424, 'eval_precision': 0.8537225561035083, 'eval_recall': 0.8412698412698413, 'eval_runtime': 0.7781, 'eval_samples_per_second': 161.935, 'eval_steps_per_second': 41.126, 'epoch': 25.0}


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/precision,▁
eval/recall,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▄▆██
train/global_step,▁▄▆██

0,1
eval/accuracy,0.84127
eval/f1,0.83835
eval/loss,1.57038
eval/precision,0.85372
eval/recall,0.84127
eval/runtime,0.7781
eval/samples_per_second,161.935
eval/steps_per_second,41.126
total_flos,477254218752000.0
train/epoch,25.0


  trainer = Trainer(


{'train_runtime': 133.0057, 'train_samples_per_second': 54.885, 'train_steps_per_second': 1.88, 'train_loss': 3.9339061004284305e-09, 'epoch': 25.0}
{'eval_loss': 1.5708937644958496, 'eval_accuracy': 0.8412698412698413, 'eval_f1': 0.8383519494754424, 'eval_precision': 0.8537225561035083, 'eval_recall': 0.8412698412698413, 'eval_runtime': 0.5951, 'eval_samples_per_second': 211.726, 'eval_steps_per_second': 6.721, 'epoch': 25.0}


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/precision,▁
eval/recall,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁
train/global_step,▁▁

0,1
eval/accuracy,0.84127
eval/f1,0.83835
eval/loss,1.57089
eval/precision,0.85372
eval/recall,0.84127
eval/runtime,0.5951
eval/samples_per_second,211.726
eval/steps_per_second,6.721
total_flos,477254218752000.0
train/epoch,25.0


  trainer = Trainer(


{'train_runtime': 133.0808, 'train_samples_per_second': 54.854, 'train_steps_per_second': 1.879, 'train_loss': 2.756714593488141e-09, 'epoch': 25.0}
{'eval_loss': 1.612754225730896, 'eval_accuracy': 0.8492063492063492, 'eval_f1': 0.8459463012258043, 'eval_precision': 0.8566006304101542, 'eval_recall': 0.8492063492063492, 'eval_runtime': 0.5923, 'eval_samples_per_second': 212.738, 'eval_steps_per_second': 6.754, 'epoch': 25.0}


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/precision,▁
eval/recall,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁
train/global_step,▁▁

0,1
eval/accuracy,0.84921
eval/f1,0.84595
eval/loss,1.61275
eval/precision,0.8566
eval/recall,0.84921
eval/runtime,0.5923
eval/samples_per_second,212.738
eval/steps_per_second,6.754
total_flos,477254218752000.0
train/epoch,25.0


  trainer = Trainer(


{'train_runtime': 133.0848, 'train_samples_per_second': 54.852, 'train_steps_per_second': 1.879, 'train_loss': 6.407498744920304e-10, 'epoch': 25.0}
{'eval_loss': 1.6536321640014648, 'eval_accuracy': 0.8412698412698413, 'eval_f1': 0.8352739984882841, 'eval_precision': 0.8532264032264032, 'eval_recall': 0.8412698412698413, 'eval_runtime': 0.5928, 'eval_samples_per_second': 212.539, 'eval_steps_per_second': 6.747, 'epoch': 25.0}


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/precision,▁
eval/recall,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁
train/global_step,▁▁

0,1
eval/accuracy,0.84127
eval/f1,0.83527
eval/loss,1.65363
eval/precision,0.85323
eval/recall,0.84127
eval/runtime,0.5928
eval/samples_per_second,212.539
eval/steps_per_second,6.747
total_flos,477254218752000.0
train/epoch,25.0


  trainer = Trainer(


{'train_runtime': 132.2222, 'train_samples_per_second': 55.21, 'train_steps_per_second': 0.945, 'train_loss': 3.874301626183296e-10, 'epoch': 25.0}
{'eval_loss': 1.660221815109253, 'eval_accuracy': 0.8412698412698413, 'eval_f1': 0.8355564448001421, 'eval_precision': 0.8453916453916454, 'eval_recall': 0.8412698412698413, 'eval_runtime': 0.5965, 'eval_samples_per_second': 211.229, 'eval_steps_per_second': 6.706, 'epoch': 25.0}


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/precision,▁
eval/recall,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁
train/global_step,▁▁

0,1
eval/accuracy,0.84127
eval/f1,0.83556
eval/loss,1.66022
eval/precision,0.84539
eval/recall,0.84127
eval/runtime,0.5965
eval/samples_per_second,211.229
eval/steps_per_second,6.706
total_flos,477254218752000.0
train/epoch,25.0


  trainer = Trainer(


{'train_runtime': 133.0756, 'train_samples_per_second': 54.856, 'train_steps_per_second': 1.879, 'train_loss': 2.2351740369686012e-10, 'epoch': 25.0}
{'eval_loss': 1.66061532497406, 'eval_accuracy': 0.8412698412698413, 'eval_f1': 0.8355564448001421, 'eval_precision': 0.8453916453916454, 'eval_recall': 0.8412698412698413, 'eval_runtime': 0.5928, 'eval_samples_per_second': 212.556, 'eval_steps_per_second': 6.748, 'epoch': 25.0}


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/precision,▁
eval/recall,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁
train/global_step,▁▁

0,1
eval/accuracy,0.84127
eval/f1,0.83556
eval/loss,1.66062
eval/precision,0.84539
eval/recall,0.84127
eval/runtime,0.5928
eval/samples_per_second,212.556
eval/steps_per_second,6.748
total_flos,477254218752000.0
train/epoch,25.0


  trainer = Trainer(


{'loss': 0.0, 'grad_norm': 1.3364076778543676e-07, 'learning_rate': 8.693815827910651e-05, 'epoch': 6.8493150684931505}
{'loss': 0.0, 'grad_norm': 5.232504918240011e-07, 'learning_rate': 0.00017387631655821302, 'epoch': 13.698630136986301}
{'loss': 2.0646, 'grad_norm': 16.348241806030273, 'learning_rate': 0.00026081447483731953, 'epoch': 20.54794520547945}
{'train_runtime': 172.7076, 'train_samples_per_second': 42.268, 'train_steps_per_second': 10.567, 'train_loss': 0.9748696693328962, 'epoch': 25.0}
{'eval_loss': 2.5531527996063232, 'eval_accuracy': 0.15873015873015872, 'eval_f1': 0.1009434617405761, 'eval_precision': 0.11486005813736908, 'eval_recall': 0.15873015873015872, 'eval_runtime': 0.7759, 'eval_samples_per_second': 162.389, 'eval_steps_per_second': 41.242, 'epoch': 25.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/precision,▁
eval/recall,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▄▆██
train/global_step,▁▄▆██

0,1
eval/accuracy,0.15873
eval/f1,0.10094
eval/loss,2.55315
eval/precision,0.11486
eval/recall,0.15873
eval/runtime,0.7759
eval/samples_per_second,162.389
eval/steps_per_second,41.242
total_flos,477254218752000.0
train/epoch,25.0


  trainer = Trainer(


{'loss': 2.1964, 'grad_norm': 10.469857215881348, 'learning_rate': 0.00022809738369828684, 'epoch': 13.513513513513514}
{'train_runtime': 152.8748, 'train_samples_per_second': 47.751, 'train_steps_per_second': 6.051, 'train_loss': 2.2199958430109796, 'epoch': 25.0}
{'eval_loss': 2.5614712238311768, 'eval_accuracy': 0.1984126984126984, 'eval_f1': 0.1277847585309391, 'eval_precision': 0.13132980393042315, 'eval_recall': 0.1984126984126984, 'eval_runtime': 0.6987, 'eval_samples_per_second': 180.34, 'eval_steps_per_second': 22.9, 'epoch': 25.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/precision,▁
eval/recall,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁██
train/global_step,▁██

0,1
eval/accuracy,0.19841
eval/f1,0.12778
eval/loss,2.56147
eval/precision,0.13133
eval/recall,0.19841
eval/runtime,0.6987
eval/samples_per_second,180.34
eval/steps_per_second,22.9
total_flos,477254218752000.0
train/epoch,25.0


  trainer = Trainer(


{'train_runtime': 133.0363, 'train_samples_per_second': 54.872, 'train_steps_per_second': 1.879, 'train_loss': 2.152450927734375, 'epoch': 25.0}
{'eval_loss': 2.535567283630371, 'eval_accuracy': 0.1746031746031746, 'eval_f1': 0.11536701978439964, 'eval_precision': 0.12016017091205061, 'eval_recall': 0.1746031746031746, 'eval_runtime': 0.5938, 'eval_samples_per_second': 212.191, 'eval_steps_per_second': 6.736, 'epoch': 25.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/precision,▁
eval/recall,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁
train/global_step,▁▁

0,1
eval/accuracy,0.1746
eval/f1,0.11537
eval/loss,2.53557
eval/precision,0.12016
eval/recall,0.1746
eval/runtime,0.5938
eval/samples_per_second,212.191
eval/steps_per_second,6.736
total_flos,477254218752000.0
train/epoch,25.0


  trainer = Trainer(


{'train_runtime': 133.0753, 'train_samples_per_second': 54.856, 'train_steps_per_second': 1.879, 'train_loss': 2.137359619140625, 'epoch': 25.0}
{'eval_loss': 2.5379834175109863, 'eval_accuracy': 0.1746031746031746, 'eval_f1': 0.11536701978439964, 'eval_precision': 0.12016017091205061, 'eval_recall': 0.1746031746031746, 'eval_runtime': 0.5951, 'eval_samples_per_second': 211.731, 'eval_steps_per_second': 6.722, 'epoch': 25.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/precision,▁
eval/recall,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁
train/global_step,▁▁

0,1
eval/accuracy,0.1746
eval/f1,0.11537
eval/loss,2.53798
eval/precision,0.12016
eval/recall,0.1746
eval/runtime,0.5951
eval/samples_per_second,211.731
eval/steps_per_second,6.722
total_flos,477254218752000.0
train/epoch,25.0


  trainer = Trainer(


{'loss': 2.1622, 'grad_norm': 10.341872215270996, 'learning_rate': 0.0001452654154923556, 'epoch': 13.513513513513514}
{'train_runtime': 152.8613, 'train_samples_per_second': 47.756, 'train_steps_per_second': 6.051, 'train_loss': 2.176418193095439, 'epoch': 25.0}
{'eval_loss': 2.5616891384124756, 'eval_accuracy': 0.1984126984126984, 'eval_f1': 0.1277847585309391, 'eval_precision': 0.13132980393042315, 'eval_recall': 0.1984126984126984, 'eval_runtime': 0.7024, 'eval_samples_per_second': 179.373, 'eval_steps_per_second': 22.778, 'epoch': 25.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/precision,▁
eval/recall,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁██
train/global_step,▁██

0,1
eval/accuracy,0.19841
eval/f1,0.12778
eval/loss,2.56169
eval/precision,0.13133
eval/recall,0.19841
eval/runtime,0.7024
eval/samples_per_second,179.373
eval/steps_per_second,22.778
total_flos,477254218752000.0
train/epoch,25.0


  trainer = Trainer(


{'train_runtime': 132.4271, 'train_samples_per_second': 55.125, 'train_steps_per_second': 0.944, 'train_loss': 4.295259765625, 'epoch': 25.0}
{'eval_loss': 2.56192684173584, 'eval_accuracy': 0.16666666666666666, 'eval_f1': 0.11512764446437916, 'eval_precision': 0.12561030418173275, 'eval_recall': 0.16666666666666666, 'eval_runtime': 0.5958, 'eval_samples_per_second': 211.485, 'eval_steps_per_second': 6.714, 'epoch': 25.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/precision,▁
eval/recall,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁
train/global_step,▁▁

0,1
eval/accuracy,0.16667
eval/f1,0.11513
eval/loss,2.56193
eval/precision,0.12561
eval/recall,0.16667
eval/runtime,0.5958
eval/samples_per_second,211.485
eval/steps_per_second,6.714
total_flos,477254218752000.0
train/epoch,25.0


Best hyperparameters: {'learning_rate': 0.0008737483861931933, 'batch_size': 32, 'weight_decay': 0.0005765898551871099, 'warmup_steps': 2000, 'gradient_accumulation_steps': 1}
Best hyperparameters: {'learning_rate': 0.0008737483861931933, 'batch_size': 32, 'weight_decay': 0.0005765898551871099, 'warmup_steps': 2000, 'gradient_accumulation_steps': 1}
