In [1]:
# Setup
! pip install seqeval evaluate
! pip install --upgrade nevergrad # upgrade to ensure latest version

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.9-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from evaluate)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.17-py311-none-any.whl.metadata (7.2 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting multiprocess 

In [2]:
# Library imports
from transformers import AutoTokenizer, AutoModel, pipeline, AutoConfig, DistilBertForSequenceClassification, DistilBertModel, DistilBertConfig, DistilBertPreTrainedModel, AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding, TrainingArguments, Trainer
from transformers.modeling_outputs import SequenceClassifierOutput
from transformers.tokenization_utils_base import BatchEncoding
from datasets import Dataset, DatasetDict
import torch
import torch.nn as nn
from google.colab import drive, userdata
import pickle
import random
import re
import time
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import plotly.express as px
import evaluate
import pprint
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
import re
from tqdm import tqdm
import torch.nn.functional as F
from torch.utils.data import DataLoader
import nevergrad as ng
import wandb
import shutil
import tempfile
import os

In [3]:
# Mount drive
drive.mount("/content/drive")
%cd '/content/drive/MyDrive/Colab Notebooks/Math_Graph/pickle_files'

Mounted at /content/drive
/content/drive/MyDrive/Colab Notebooks/Math_Graph/pickle_files


In [4]:
# View all pandas columns, rows
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [5]:
# Define file read function
def read_pickle(dict_file):
  with open(dict_file, 'rb') as file:
    return pickle.load(file)

In [6]:
# Load custom trained model

checkpoint = "Heather-Driver/distilbert-NER-LinearAlg-finetuned"
tokenizer = AutoTokenizer.from_pretrained(checkpoint, do_lower_case=False)
distilbert_model = DistilBertModel.from_pretrained(checkpoint)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer_config.json:   0%|          | 0.00/1.33k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/669k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.54k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/261M [00:00<?, ?B/s]

In [7]:
def extract_window(sentence, predicate, window_size):
  """This function creates a window around the matching predicate in order to tokenize and later get the span vectors for the window.
  The function adjusts according to the window size wanted"""
  tokens = sentence.split()

  # Find the starting index of the predicate in the sentence (find all word positions for the predicate)
  pattern = re.escape(predicate)  # Escape the predicate string to handle special characters if any
  match = re.search(pattern, sentence)

  if not match:
      return "Predicate not found in the sentence."

  # Get the index of where the predicate starts in the list of tokens
  start_index = len(sentence[:match.start()].split())  # Token index of the start of predicate

  # Define the sample window
  start_window = max(0, start_index - window_size)
  end_window = min(len(tokens), start_index + len(predicate.split()) + window_size)

  # Create the window of words around the predicate
  window = tokens[start_window:end_window]

  # If the window is too short at the beginning or the end, adjust to take as many as possible
  if start_window == 0:
      # If the window is at the start, extend the end if possible
      end_window = min(len(tokens), start_index + len(predicate.split()) + window_size)
  if end_window == len(tokens):
      # If the window is at the end, extend the start if possible
      start_window = max(0, start_index - window_size)

  # Create the window of words around the predicate again after adjustments
  window = tokens[start_window:end_window]
  return ' '.join(window)

def adds_context_window(window_size, df):
  for i in range(len(df)):
    text = extract_window(df.at[i, 'sentence'], df.at[i, 'predicate'], window_size=window_size)
    df.at[i, 'context_window'] = text
  return df

In [8]:
# Read in dictionary
predicate_data = read_pickle('predicate_data.pkl')
predicate_data = predicate_data.rename(columns={'Window_1': 'context_window', 'Label': 'string_label'})
predicate_data.columns = predicate_data.columns.str.lower()

In [9]:
# Need mapping of classification tags to their indices for model to use

index2tag = {idx:tag for idx, tag in enumerate(predicate_data['string_label'].unique())} # This is just a nonsignificant arbitrary mapping of the label to a number for training the model
tag2index = {tag:idx for idx, tag in enumerate(predicate_data['string_label'].unique())} # To lookup indices from tags

In [10]:
predicate_data['label'] = predicate_data['string_label'].map(tag2index)

In [11]:
predicate_data = adds_context_window(window_size=2, df=predicate_data)

In [12]:
predicate_data.head(2)

Unnamed: 0,sentence,subject,predicate,object,string_label,context_window,label
0,The Wishart distribution is used in multivaria...,wishart distribution,is used in,multivariate statistics,used in,Wishart distribution is used in multivariate s...,0
1,The Square Root Method is transformed by the a...,Square Root Method,transformed by,the application of inverse operations to deriv...,computation,Method is transformed by the application,1


In [13]:
X_train_indices, X_test_indices, y_train_indices, y_test_indices = train_test_split(predicate_data.index.to_numpy(), predicate_data['label'].to_numpy(),
                                                                                    test_size=0.05, random_state=42, stratify=predicate_data['label'].to_numpy())

# Repeat to get validation sub-sample of Train
X_train_indices, X_valid_indices, y_train_indices, y_valid_indices = train_test_split(X_train_indices, y_train_indices, test_size=0.3, random_state=42, stratify=y_train_indices)

## Preprocessing

In [14]:
dataset = Dataset.from_pandas(predicate_data[['sentence', 'label', 'context_window', 'predicate']])

In [15]:
# Select subsets of the dataset for train, test and validation
train_split = dataset.select(X_train_indices)
test_split = dataset.select(X_test_indices)
valid_split = dataset.select(X_valid_indices)

dataset = DatasetDict({
    'train': train_split,
    'test': test_split,
    'validation': valid_split
})

In [16]:
def preprocess_function_predicate(examples):
  context_inputs = tokenizer(examples["predicate"], return_tensors="pt", add_special_tokens=False, truncation=True, padding="max_length", max_length=256)
  return context_inputs

dataset = dataset.map(preprocess_function_predicate, batched=True)
# Rename the 'attention_mask' column to 'context_attention_mask'
dataset = dataset.rename_columns({"attention_mask": "predicate_attention_mask", "input_ids": "predicate_input_ids"})

Map:   0%|          | 0/292 [00:00<?, ? examples/s]

Map:   0%|          | 0/22 [00:00<?, ? examples/s]

Map:   0%|          | 0/126 [00:00<?, ? examples/s]

In [17]:
def preprocess_function_context(examples):
  context_inputs = tokenizer(examples["context_window"], return_tensors="pt", add_special_tokens=True, truncation=True, padding="max_length", max_length=256)
  return context_inputs

dataset = dataset.map(preprocess_function_context, batched=True)
# Rename the 'attention_mask' column to 'context_attention_mask'
dataset = dataset.rename_columns({"attention_mask": "context_attention_mask", "input_ids": "context_input_ids"})

Map:   0%|          | 0/292 [00:00<?, ? examples/s]

Map:   0%|          | 0/22 [00:00<?, ? examples/s]

Map:   0%|          | 0/126 [00:00<?, ? examples/s]

In [18]:
def preprocess_function(examples):
  inputs = tokenizer(examples["sentence"], return_tensors="pt", add_special_tokens=True, truncation=True, padding="max_length", max_length=256)
  return inputs

dataset = dataset.map(preprocess_function, batched=True)

dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label', 'context_attention_mask', 'context_input_ids', 'predicate_attention_mask', 'predicate_input_ids'])

Map:   0%|          | 0/292 [00:00<?, ? examples/s]

Map:   0%|          | 0/22 [00:00<?, ? examples/s]

Map:   0%|          | 0/126 [00:00<?, ? examples/s]

## Developing the Model Parameters

In [19]:
class StandardDistilBertClassifier(DistilBertPreTrainedModel):
  config_class = DistilBertConfig

  def __init__(self, config):
    super().__init__(config)
    self.num_labels = config.num_labels
    # Model body
    self.distilbert = distilbert_model
    # Classification head
    self.classifier = nn.Linear(config.hidden_size, self.num_labels)  # Span * 3 (CLS, span, width)

  def _cls_embeddings(self, input_ids, attention_mask):
    outputs = self.distilbert(input_ids=input_ids, attention_mask=attention_mask)
    embeddings = outputs.last_hidden_state # shape [1, 512, 768]
    cls_embedding = embeddings[:, 0:1, :].squeeze(1)  # squeeze converts [1, 1, 768] to [1, 768]
    return cls_embedding

  def forward(self, input_ids=None, attention_mask=None, labels=None, **kwargs):
    # Get CLS token embedding
    cls_embedding = self._cls_embeddings(input_ids=input_ids, attention_mask=attention_mask)
    # Classifier on cls_embedding
    logits = self.classifier(cls_embedding)
    # Loss calc
    loss = None
    if labels is not None:
      loss_fct = nn.CrossEntropyLoss()
      loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
    return SequenceClassifierOutput(loss=loss, logits=logits)

In [20]:
config = DistilBertConfig.from_pretrained("Heather-Driver/distilbert-NER-LinearAlg-finetuned")
config.label2id = tag2index
config.id2label = index2tag
config.num_labels = len(index2tag)

model = StandardDistilBertClassifier(config)
model.to(device)
model.gradient_checkpointing_enable()

In [21]:
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    return_tensors="pt",
)

In [22]:
def compute_metrics(pred):
  labels = pred.label_ids
  preds = pred.predictions.argmax(-1)
  f1 = f1_score(labels, preds, average="weighted") #y_true, y_pred
  acc = accuracy_score(labels, preds)
  precision = precision_score(labels, preds, average="weighted")
  recall = recall_score(labels, preds, average="weighted")
  return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

## Nevergrad optimisation

In [23]:
# Load the best hyperparameters from the pickle file
with open("best_hyperparameters.pkl", "rb") as f:
    best_params = pickle.load(f)

print("Loaded best hyperparameters:", best_params)

# You can now use these hyperparameters, for example:
# train_and_evaluate(**best_hyperparameters, name="final_model_run")

Loaded best hyperparameters: {'learning_rate': 5.314885705504048e-06, 'batch_size': 32, 'weight_decay': 7.395196046678932e-05, 'warmup_steps': 100, 'gradient_accumulation_steps': 1}


In [24]:
%cd '/content/drive/MyDrive/Colab Notebooks/Math_Graph/Model/transformers_cache'

[Errno 2] No such file or directory: '/content/drive/MyDrive/Colab Notebooks/Math_Graph/Model/transformers_cache'
/content/drive/MyDrive/Colab Notebooks/Math_Graph/pickle_files


In [25]:
  # Set output_dir in TrainingArguments to this temporary folder.
  # After exiting the with-block, the directory is deleted.
os.environ["TRANSFORMERS_CACHE"] = '/content/drive/MyDrive/Colab Notebooks/Math_Graph/Model/transformers_cache'

In [27]:
def train_and_evaluate(learning_rate, batch_size, weight_decay, warmup_steps, gradient_accumulation_steps, name):
  """Function to train the model with different hyperparameters"""
  output_dir = '/content/drive/MyDrive/Colab Notebooks/Math_Graph/Model/' + name
  training_arguments = TrainingArguments(
      output_dir=output_dir,
      run_name=name,
      log_level="error",
      num_train_epochs=25,  # Keeping epochs fixed for now
      per_device_train_batch_size=int(batch_size),
      per_device_eval_batch_size=int(batch_size),
      learning_rate=learning_rate,
      weight_decay=weight_decay,
      warmup_steps=int(warmup_steps),
      gradient_accumulation_steps=int(gradient_accumulation_steps),
      eval_strategy="no",
      save_strategy="no",
      disable_tqdm=True,  # Avoid flooding the output
      report_to=["wandb"],
      load_best_model_at_end=True,
      push_to_hub=False,
      greater_is_better=True,
      save_safetensors=True,
      save_total_limit=1,
  )

  trainer = Trainer(
      model=model,
      args=training_arguments,
      train_dataset=dataset["train"],
      eval_dataset=dataset["validation"],
      tokenizer=tokenizer,
      data_collator=data_collator,
      compute_metrics=compute_metrics,
  )

  # Train and evaluate
  trainer.train()
  metrics = trainer.evaluate()

  # Finish the current W&B run so that a new run is created next time
  wandb.finish()

  # Delete the output directory to free disk space
  shutil.rmtree(output_dir, ignore_errors=True)

  # Return negative F1-score (Nevergrad minimizes)
  return -metrics["eval_f1"]

# Define the hyperparameter search space
instrum = ng.p.Instrumentation(
    learning_rate=ng.p.Log(lower=1e-6, upper=1e-3),        # Logarithmic scale
    batch_size=ng.p.TransitionChoice([4, 8, 16, 32]),
    weight_decay=ng.p.Log(lower=1e-5, upper=1e-1),
    warmup_steps=ng.p.TransitionChoice([100, 500, 1000, 2000]),
    gradient_accumulation_steps=ng.p.TransitionChoice([1, 2, 4])
)

# Initialize the optimizer with the instrumentation and a budget
optimizer = ng.optimizers.OnePlusOne(instrum, budget=20)  # 20 iterations

# Define a wrapper function that takes a dictionary of parameters and unpacks them for train_and_evaluate
# Wrapper function with dynamic naming
def objective_function_wrapper(**parameters):
    # Construct a unique name from the hyperparameters
    combo_name = (
        f"lr_{parameters['learning_rate']:.1e}_"
        f"bs_{parameters['batch_size']}_"
        f"wd_{parameters['weight_decay']:.1e}_"
        f"ws_{parameters['warmup_steps']}_"
        f"gas_{parameters['gradient_accumulation_steps']}"
    )
    combo_name = "optimized_model_baseline_" + combo_name
    return train_and_evaluate(**parameters, name=combo_name)

# Update the optimizer.minimize call to use the wrapper function
recommendation = optimizer.minimize(objective_function_wrapper)

# Print the best hyperparameters found
print("Best hyperparameters:", recommendation.kwargs)

# Save best hyperparameters to a pickle file
with open("best_hyperparameters.pkl", "wb") as f:
    pickle.dump(recommendation.kwargs, f)

print("Best hyperparameters:", recommendation.kwargs)
best_params = recommendation.kwargs

  trainer = Trainer(


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mheather-rink[0m ([33mh-driver[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


{'train_runtime': 58.191, 'train_samples_per_second': 125.449, 'train_steps_per_second': 3.867, 'train_loss': 4.0465728081597225, 'epoch': 22.526315789473685}
{'eval_loss': 1.4591604471206665, 'eval_accuracy': 0.5952380952380952, 'eval_f1': 0.5949968087921668, 'eval_precision': 0.6188371813371814, 'eval_recall': 0.5952380952380952, 'eval_runtime': 0.2533, 'eval_samples_per_second': 497.36, 'eval_steps_per_second': 31.578, 'epoch': 22.526315789473685}


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/precision,▁
eval/recall,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁
train/global_step,▁▁

0,1
eval/accuracy,0.59524
eval/f1,0.595
eval/loss,1.45916
eval/precision,0.61884
eval/recall,0.59524
eval/runtime,0.2533
eval/samples_per_second,497.36
eval/steps_per_second,31.578
total_flos,430179997544448.0
train/epoch,22.52632


  trainer = Trainer(


{'train_runtime': 46.156, 'train_samples_per_second': 158.159, 'train_steps_per_second': 2.708, 'train_loss': 0.4259173583984375, 'epoch': 25.0}
{'eval_loss': 1.4064356088638306, 'eval_accuracy': 0.7777777777777778, 'eval_f1': 0.7863160010503006, 'eval_precision': 0.8103835978835979, 'eval_recall': 0.7777777777777778, 'eval_runtime': 0.2081, 'eval_samples_per_second': 605.534, 'eval_steps_per_second': 19.223, 'epoch': 25.0}


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/precision,▁
eval/recall,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁
train/global_step,▁▁

0,1
eval/accuracy,0.77778
eval/f1,0.78632
eval/loss,1.40644
eval/precision,0.81038
eval/recall,0.77778
eval/runtime,0.2081
eval/samples_per_second,605.534
eval/steps_per_second,19.223
total_flos,476961418905600.0
train/epoch,25.0


  trainer = Trainer(


{'train_runtime': 50.962, 'train_samples_per_second': 143.244, 'train_steps_per_second': 9.321, 'train_loss': 0.08327663220857319, 'epoch': 25.0}
{'eval_loss': 1.7330249547958374, 'eval_accuracy': 0.753968253968254, 'eval_f1': 0.7446054803408039, 'eval_precision': 0.7634085862156037, 'eval_recall': 0.753968253968254, 'eval_runtime': 0.2285, 'eval_samples_per_second': 551.36, 'eval_steps_per_second': 35.007, 'epoch': 25.0}


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/precision,▁
eval/recall,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁
train/global_step,▁▁

0,1
eval/accuracy,0.75397
eval/f1,0.74461
eval/loss,1.73302
eval/precision,0.76341
eval/recall,0.75397
eval/runtime,0.2285
eval/samples_per_second,551.36
eval/steps_per_second,35.007
total_flos,476961418905600.0
train/epoch,25.0


  trainer = Trainer(


{'train_runtime': 46.8381, 'train_samples_per_second': 155.856, 'train_steps_per_second': 5.338, 'train_loss': 0.018270990371704102, 'epoch': 25.0}
{'eval_loss': 1.9195717573165894, 'eval_accuracy': 0.7777777777777778, 'eval_f1': 0.7732976372482748, 'eval_precision': 0.7770132645132645, 'eval_recall': 0.7777777777777778, 'eval_runtime': 0.2086, 'eval_samples_per_second': 603.912, 'eval_steps_per_second': 19.172, 'epoch': 25.0}


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/precision,▁
eval/recall,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁
train/global_step,▁▁

0,1
eval/accuracy,0.77778
eval/f1,0.7733
eval/loss,1.91957
eval/precision,0.77701
eval/recall,0.77778
eval/runtime,0.2086
eval/samples_per_second,603.912
eval/steps_per_second,19.172
total_flos,476961418905600.0
train/epoch,25.0


  trainer = Trainer(


{'train_runtime': 57.9339, 'train_samples_per_second': 126.006, 'train_steps_per_second': 7.767, 'train_loss': 0.49644219292534725, 'epoch': 23.71232876712329}
{'eval_loss': 2.4148213863372803, 'eval_accuracy': 0.7936507936507936, 'eval_f1': 0.791163729058466, 'eval_precision': 0.8042970037017655, 'eval_recall': 0.7936507936507936, 'eval_runtime': 0.294, 'eval_samples_per_second': 428.626, 'eval_steps_per_second': 108.857, 'epoch': 23.71232876712329}


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/precision,▁
eval/recall,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁
train/global_step,▁▁

0,1
eval/accuracy,0.79365
eval/f1,0.79116
eval/loss,2.41482
eval/precision,0.8043
eval/recall,0.79365
eval/runtime,0.294
eval/samples_per_second,428.626
eval/steps_per_second,108.857
total_flos,452394638972928.0
train/epoch,23.71233


  trainer = Trainer(


{'loss': 0.0012, 'grad_norm': 0.00011865991109516472, 'learning_rate': 1.564402183177944e-05, 'epoch': 13.520547945205479}
{'train_runtime': 61.4441, 'train_samples_per_second': 118.807, 'train_steps_per_second': 14.647, 'train_loss': 0.0006908457131228513, 'epoch': 24.328767123287673}
{'eval_loss': 2.918731212615967, 'eval_accuracy': 0.7936507936507936, 'eval_f1': 0.7910443225971177, 'eval_precision': 0.8004343980534456, 'eval_recall': 0.7936507936507936, 'eval_runtime': 0.2925, 'eval_samples_per_second': 430.742, 'eval_steps_per_second': 109.395, 'epoch': 24.328767123287673}


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/precision,▁
eval/recall,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁██
train/global_step,▁██

0,1
eval/accuracy,0.79365
eval/f1,0.79104
eval/loss,2.91873
eval/precision,0.80043
eval/recall,0.79365
eval/runtime,0.2925
eval/samples_per_second,430.742
eval/steps_per_second,109.395
total_flos,464155331493888.0
train/epoch,24.32877


  trainer = Trainer(


{'train_runtime': 50.6632, 'train_samples_per_second': 144.089, 'train_steps_per_second': 9.376, 'train_loss': 2.488712290007817e-06, 'epoch': 25.0}
{'eval_loss': 3.06833815574646, 'eval_accuracy': 0.7936507936507936, 'eval_f1': 0.7916717145288575, 'eval_precision': 0.8024546000736476, 'eval_recall': 0.7936507936507936, 'eval_runtime': 0.2272, 'eval_samples_per_second': 554.484, 'eval_steps_per_second': 35.205, 'epoch': 25.0}


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/precision,▁
eval/recall,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁
train/global_step,▁▁

0,1
eval/accuracy,0.79365
eval/f1,0.79167
eval/loss,3.06834
eval/precision,0.80245
eval/recall,0.79365
eval/runtime,0.2272
eval/samples_per_second,554.484
eval/steps_per_second,35.205
total_flos,476961418905600.0
train/epoch,25.0


  trainer = Trainer(


{'train_runtime': 46.8112, 'train_samples_per_second': 155.946, 'train_steps_per_second': 5.341, 'train_loss': 5.821733502671123e-07, 'epoch': 25.0}
{'eval_loss': 3.3542051315307617, 'eval_accuracy': 0.8095238095238095, 'eval_f1': 0.810124011327019, 'eval_precision': 0.8245421245421245, 'eval_recall': 0.8095238095238095, 'eval_runtime': 0.2092, 'eval_samples_per_second': 602.405, 'eval_steps_per_second': 19.124, 'epoch': 25.0}


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/precision,▁
eval/recall,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁
train/global_step,▁▁

0,1
eval/accuracy,0.80952
eval/f1,0.81012
eval/loss,3.35421
eval/precision,0.82454
eval/recall,0.80952
eval/runtime,0.2092
eval/samples_per_second,602.405
eval/steps_per_second,19.124
total_flos,476961418905600.0
train/epoch,25.0


  trainer = Trainer(


{'loss': 0.0, 'grad_norm': 1.6651281384838512e-06, 'learning_rate': 7.681159420289861e-07, 'epoch': 6.8493150684931505}
{'loss': 0.0, 'grad_norm': 1.6601231891399948e-06, 'learning_rate': 4.782608695652178e-07, 'epoch': 13.698630136986301}
{'loss': 0.0, 'grad_norm': 9.996391554523143e-07, 'learning_rate': 1.8840579710144943e-07, 'epoch': 20.54794520547945}
{'train_runtime': 68.754, 'train_samples_per_second': 106.176, 'train_steps_per_second': 26.544, 'train_loss': 8.65981933638479e-08, 'epoch': 25.0}
{'eval_loss': 3.3770008087158203, 'eval_accuracy': 0.8095238095238095, 'eval_f1': 0.810124011327019, 'eval_precision': 0.8245421245421245, 'eval_recall': 0.8095238095238095, 'eval_runtime': 0.2945, 'eval_samples_per_second': 427.851, 'eval_steps_per_second': 108.66, 'epoch': 25.0}


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/precision,▁
eval/recall,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▄▆██
train/global_step,▁▄▆██

0,1
eval/accuracy,0.80952
eval/f1,0.81012
eval/loss,3.377
eval/precision,0.82454
eval/recall,0.80952
eval/runtime,0.2945
eval/samples_per_second,427.851
eval/steps_per_second,108.66
total_flos,476961418905600.0
train/epoch,25.0


  trainer = Trainer(


{'train_runtime': 46.8854, 'train_samples_per_second': 155.699, 'train_steps_per_second': 5.332, 'train_loss': 7.744132744846865e-08, 'epoch': 25.0}
{'eval_loss': 3.3810884952545166, 'eval_accuracy': 0.8095238095238095, 'eval_f1': 0.810124011327019, 'eval_precision': 0.8245421245421245, 'eval_recall': 0.8095238095238095, 'eval_runtime': 0.2079, 'eval_samples_per_second': 606.151, 'eval_steps_per_second': 19.243, 'epoch': 25.0}


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/precision,▁
eval/recall,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁
train/global_step,▁▁

0,1
eval/accuracy,0.80952
eval/f1,0.81012
eval/loss,3.38109
eval/precision,0.82454
eval/recall,0.80952
eval/runtime,0.2079
eval/samples_per_second,606.151
eval/steps_per_second,19.243
total_flos,476961418905600.0
train/epoch,25.0


  trainer = Trainer(


{'train_runtime': 46.8551, 'train_samples_per_second': 155.8, 'train_steps_per_second': 5.336, 'train_loss': 7.548927533207461e-08, 'epoch': 25.0}
{'eval_loss': 3.382880926132202, 'eval_accuracy': 0.8095238095238095, 'eval_f1': 0.810124011327019, 'eval_precision': 0.8245421245421245, 'eval_recall': 0.8095238095238095, 'eval_runtime': 0.2087, 'eval_samples_per_second': 603.662, 'eval_steps_per_second': 19.164, 'epoch': 25.0}


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/precision,▁
eval/recall,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁
train/global_step,▁▁

0,1
eval/accuracy,0.80952
eval/f1,0.81012
eval/loss,3.38288
eval/precision,0.82454
eval/recall,0.80952
eval/runtime,0.2087
eval/samples_per_second,603.662
eval/steps_per_second,19.164
total_flos,476961418905600.0
train/epoch,25.0


  trainer = Trainer(


{'train_runtime': 46.7993, 'train_samples_per_second': 155.985, 'train_steps_per_second': 5.342, 'train_loss': 1.1891125723195728e-08, 'epoch': 25.0}
{'eval_loss': 3.529660701751709, 'eval_accuracy': 0.8095238095238095, 'eval_f1': 0.810124011327019, 'eval_precision': 0.8245421245421245, 'eval_recall': 0.8095238095238095, 'eval_runtime': 0.2084, 'eval_samples_per_second': 604.642, 'eval_steps_per_second': 19.195, 'epoch': 25.0}


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/precision,▁
eval/recall,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁
train/global_step,▁▁

0,1
eval/accuracy,0.80952
eval/f1,0.81012
eval/loss,3.52966
eval/precision,0.82454
eval/recall,0.80952
eval/runtime,0.2084
eval/samples_per_second,604.642
eval/steps_per_second,19.195
total_flos,476961418905600.0
train/epoch,25.0


  trainer = Trainer(


{'train_runtime': 31.3467, 'train_samples_per_second': 232.879, 'train_steps_per_second': 1.595, 'train_loss': 0.0, 'epoch': 16.8}
{'eval_loss': 3.5340843200683594, 'eval_accuracy': 0.8095238095238095, 'eval_f1': 0.810124011327019, 'eval_precision': 0.8245421245421245, 'eval_recall': 0.8095238095238095, 'eval_runtime': 0.2091, 'eval_samples_per_second': 602.506, 'eval_steps_per_second': 19.127, 'epoch': 16.8}


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/precision,▁
eval/recall,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁
train/global_step,▁▁

0,1
eval/accuracy,0.80952
eval/f1,0.81012
eval/loss,3.53408
eval/precision,0.82454
eval/recall,0.80952
eval/runtime,0.2091
eval/samples_per_second,602.506
eval/steps_per_second,19.127
total_flos,321981626351616.0
train/epoch,16.8


  trainer = Trainer(


{'train_runtime': 46.89, 'train_samples_per_second': 155.684, 'train_steps_per_second': 5.332, 'train_loss': 0.0, 'epoch': 25.0}
{'eval_loss': 3.5344038009643555, 'eval_accuracy': 0.8095238095238095, 'eval_f1': 0.810124011327019, 'eval_precision': 0.8245421245421245, 'eval_recall': 0.8095238095238095, 'eval_runtime': 0.2095, 'eval_samples_per_second': 601.333, 'eval_steps_per_second': 19.09, 'epoch': 25.0}


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/precision,▁
eval/recall,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁
train/global_step,▁▁

0,1
eval/accuracy,0.80952
eval/f1,0.81012
eval/loss,3.5344
eval/precision,0.82454
eval/recall,0.80952
eval/runtime,0.2095
eval/samples_per_second,601.333
eval/steps_per_second,19.09
total_flos,476961418905600.0
train/epoch,25.0


  trainer = Trainer(


{'loss': 0.0, 'grad_norm': 1.3768281803550053e-07, 'learning_rate': 0.0001287538016662383, 'epoch': 6.8493150684931505}
{'loss': 0.0, 'grad_norm': 7.414900693447635e-08, 'learning_rate': 0.0002575076033324766, 'epoch': 13.698630136986301}
{'loss': 0.0, 'grad_norm': 4.881207971152435e-08, 'learning_rate': 0.00038626140499871495, 'epoch': 20.54794520547945}
{'train_runtime': 68.8682, 'train_samples_per_second': 106.0, 'train_steps_per_second': 26.5, 'train_loss': 0.0, 'epoch': 25.0}
{'eval_loss': 3.775606155395508, 'eval_accuracy': 0.8095238095238095, 'eval_f1': 0.8092623333224839, 'eval_precision': 0.8222832722832722, 'eval_recall': 0.8095238095238095, 'eval_runtime': 0.2968, 'eval_samples_per_second': 424.482, 'eval_steps_per_second': 107.805, 'epoch': 25.0}


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/precision,▁
eval/recall,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▄▆██
train/global_step,▁▄▆██

0,1
eval/accuracy,0.80952
eval/f1,0.80926
eval/loss,3.77561
eval/precision,0.82228
eval/recall,0.80952
eval/runtime,0.2968
eval/samples_per_second,424.482
eval/steps_per_second,107.805
total_flos,476961418905600.0
train/epoch,25.0


  trainer = Trainer(


{'loss': 0.0, 'grad_norm': 3.4348758504165744e-08, 'learning_rate': 3.3521027916934476e-07, 'epoch': 6.8493150684931505}
{'loss': 0.0, 'grad_norm': 3.409577331581204e-08, 'learning_rate': 6.704205583386895e-07, 'epoch': 13.698630136986301}
{'loss': 0.0, 'grad_norm': 4.1664897310056404e-08, 'learning_rate': 1.0056308375080344e-06, 'epoch': 20.54794520547945}
{'train_runtime': 68.7823, 'train_samples_per_second': 106.132, 'train_steps_per_second': 26.533, 'train_loss': 0.0, 'epoch': 25.0}
{'eval_loss': 3.7757229804992676, 'eval_accuracy': 0.8095238095238095, 'eval_f1': 0.8092623333224839, 'eval_precision': 0.8222832722832722, 'eval_recall': 0.8095238095238095, 'eval_runtime': 0.2934, 'eval_samples_per_second': 429.476, 'eval_steps_per_second': 109.073, 'epoch': 25.0}


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/precision,▁
eval/recall,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▄▆██
train/global_step,▁▄▆██

0,1
eval/accuracy,0.80952
eval/f1,0.80926
eval/loss,3.77572
eval/precision,0.82228
eval/recall,0.80952
eval/runtime,0.2934
eval/samples_per_second,429.476
eval/steps_per_second,109.073
total_flos,476961418905600.0
train/epoch,25.0


  trainer = Trainer(


{'train_runtime': 46.845, 'train_samples_per_second': 155.833, 'train_steps_per_second': 5.337, 'train_loss': 0.0, 'epoch': 25.0}
{'eval_loss': 3.765038251876831, 'eval_accuracy': 0.8174603174603174, 'eval_f1': 0.8176934352318466, 'eval_precision': 0.8303869411012267, 'eval_recall': 0.8174603174603174, 'eval_runtime': 0.2076, 'eval_samples_per_second': 606.798, 'eval_steps_per_second': 19.263, 'epoch': 25.0}


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/precision,▁
eval/recall,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁
train/global_step,▁▁

0,1
eval/accuracy,0.81746
eval/f1,0.81769
eval/loss,3.76504
eval/precision,0.83039
eval/recall,0.81746
eval/runtime,0.2076
eval/samples_per_second,606.798
eval/steps_per_second,19.263
total_flos,476961418905600.0
train/epoch,25.0


  trainer = Trainer(


{'train_runtime': 46.9744, 'train_samples_per_second': 155.404, 'train_steps_per_second': 5.322, 'train_loss': 0.0, 'epoch': 25.0}
{'eval_loss': 3.7650365829467773, 'eval_accuracy': 0.8174603174603174, 'eval_f1': 0.8176934352318466, 'eval_precision': 0.8303869411012267, 'eval_recall': 0.8174603174603174, 'eval_runtime': 0.2101, 'eval_samples_per_second': 599.706, 'eval_steps_per_second': 19.038, 'epoch': 25.0}


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/precision,▁
eval/recall,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁
train/global_step,▁▁

0,1
eval/accuracy,0.81746
eval/f1,0.81769
eval/loss,3.76504
eval/precision,0.83039
eval/recall,0.81746
eval/runtime,0.2101
eval/samples_per_second,599.706
eval/steps_per_second,19.038
total_flos,476961418905600.0
train/epoch,25.0


  trainer = Trainer(


{'loss': 1.0884, 'grad_norm': 8.058756828308105, 'learning_rate': 0.0007681159420289858, 'epoch': 6.8493150684931505}
{'loss': 2.5621, 'grad_norm': 9.597574234008789, 'learning_rate': 0.0004782608695652176, 'epoch': 13.698630136986301}
{'loss': 2.4763, 'grad_norm': 8.999785423278809, 'learning_rate': 0.00018840579710144938, 'epoch': 20.54794520547945}
{'train_runtime': 68.9009, 'train_samples_per_second': 105.949, 'train_steps_per_second': 26.487, 'train_loss': 2.112245391427654, 'epoch': 25.0}
{'eval_loss': 2.400418758392334, 'eval_accuracy': 0.0873015873015873, 'eval_f1': 0.014019232997335188, 'eval_precision': 0.007621567145376668, 'eval_recall': 0.0873015873015873, 'eval_runtime': 0.2918, 'eval_samples_per_second': 431.807, 'eval_steps_per_second': 109.665, 'epoch': 25.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/precision,▁
eval/recall,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▄▆██
train/global_step,▁▄▆██

0,1
eval/accuracy,0.0873
eval/f1,0.01402
eval/loss,2.40042
eval/precision,0.00762
eval/recall,0.0873
eval/runtime,0.2918
eval/samples_per_second,431.807
eval/steps_per_second,109.665
total_flos,476961418905600.0
train/epoch,25.0


  trainer = Trainer(


{'train_runtime': 31.4092, 'train_samples_per_second': 232.416, 'train_steps_per_second': 1.592, 'train_loss': 8.091834716796875, 'epoch': 16.8}
{'eval_loss': 2.399357795715332, 'eval_accuracy': 0.0873015873015873, 'eval_f1': 0.014019232997335188, 'eval_precision': 0.007621567145376668, 'eval_recall': 0.0873015873015873, 'eval_runtime': 0.2072, 'eval_samples_per_second': 608.241, 'eval_steps_per_second': 19.309, 'epoch': 16.8}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/precision,▁
eval/recall,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁
train/global_step,▁▁

0,1
eval/accuracy,0.0873
eval/f1,0.01402
eval/loss,2.39936
eval/precision,0.00762
eval/recall,0.0873
eval/runtime,0.2072
eval/samples_per_second,608.241
eval/steps_per_second,19.309
total_flos,321981626351616.0
train/epoch,16.8


Best hyperparameters: {'learning_rate': 1.0000000000000008e-06, 'batch_size': 32, 'weight_decay': 1.0000000000000008e-05, 'warmup_steps': 2000, 'gradient_accumulation_steps': 1}
Best hyperparameters: {'learning_rate': 1.0000000000000008e-06, 'batch_size': 32, 'weight_decay': 1.0000000000000008e-05, 'warmup_steps': 2000, 'gradient_accumulation_steps': 1}
