In [53]:
# Setup
! pip install seqeval evaluate
! pip install kaleido
! pip install --upgrade nevergrad # upgrade to ensure latest version



In [54]:
# Library imports
from transformers import AutoTokenizer, AutoModel, pipeline, AutoConfig, DistilBertForSequenceClassification, DistilBertModel, DistilBertConfig, DistilBertPreTrainedModel, AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding, TrainingArguments, Trainer
from transformers.modeling_outputs import SequenceClassifierOutput
from transformers.tokenization_utils_base import BatchEncoding
from datasets import Dataset, DatasetDict
import torch
import torch.nn as nn
from google.colab import drive, userdata
import pickle
import random
import re
import time
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import plotly.express as px
import evaluate
import pprint
import kaleido
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
import re
from tqdm import tqdm
import torch.nn.functional as F
from torch.utils.data import DataLoader
import nevergrad as ng
import wandb
import shutil
import tempfile
import os

In [55]:
# Mount drive
drive.mount("/content/drive")
%cd '/content/drive/MyDrive/Colab Notebooks/Math_Graph/pickle_files'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/Colab Notebooks/Math_Graph/pickle_files


In [56]:
# View all pandas columns, rows
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [57]:
# Define file read function
def read_pickle(dict_file):
  with open(dict_file, 'rb') as file:
    return pickle.load(file)

In [58]:
# Load custom trained model

checkpoint = "Heather-Driver/distilbert-NER-LinearAlg-finetuned"
tokenizer = AutoTokenizer.from_pretrained(checkpoint, do_lower_case=False)
distilbert_model = DistilBertModel.from_pretrained(checkpoint)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [59]:
def extract_window(sentence, predicate, window_size):
  """This function creates a window around the matching predicate in order to tokenize and later get the span vectors for the window.
  The function adjusts according to the window size wanted"""
  tokens = sentence.split()

  # Find the starting index of the predicate in the sentence (find all word positions for the predicate)
  pattern = re.escape(predicate)  # Escape the predicate string to handle special characters if any
  match = re.search(pattern, sentence)

  if not match:
      return "Predicate not found in the sentence."

  # Get the index of where the predicate starts in the list of tokens
  start_index = len(sentence[:match.start()].split())  # Token index of the start of predicate

  # Define the sample window
  start_window = max(0, start_index - window_size)
  end_window = min(len(tokens), start_index + len(predicate.split()) + window_size)

  # Create the window of words around the predicate
  window = tokens[start_window:end_window]

  # If the window is too short at the beginning or the end, adjust to take as many as possible
  if start_window == 0:
      # If the window is at the start, extend the end if possible
      end_window = min(len(tokens), start_index + len(predicate.split()) + window_size)
  if end_window == len(tokens):
      # If the window is at the end, extend the start if possible
      start_window = max(0, start_index - window_size)

  # Create the window of words around the predicate again after adjustments
  window = tokens[start_window:end_window]
  return ' '.join(window)

def adds_context_window(window_size, df):
  for i in range(len(df)):
    text = extract_window(df.at[i, 'sentence'], df.at[i, 'predicate'], window_size=window_size)
    df.at[i, 'context_window'] = text
  return df

In [60]:
# Read in dictionary
predicate_data = read_pickle('predicate_data.pkl')
predicate_data = predicate_data.rename(columns={'Window_1': 'context_window', 'Label': 'string_label'})
predicate_data.columns = predicate_data.columns.str.lower()

In [61]:
# Need mapping of classification tags to their indices for model to use

index2tag = {idx:tag for idx, tag in enumerate(predicate_data['string_label'].unique())} # This is just a nonsignificant arbitrary mapping of the label to a number for training the model
tag2index = {tag:idx for idx, tag in enumerate(predicate_data['string_label'].unique())} # To lookup indices from tags

In [62]:
predicate_data['label'] = predicate_data['string_label'].map(tag2index)

In [63]:
predicate_data = adds_context_window(window_size=1, df=predicate_data)

In [64]:
predicate_data.head(2)

Unnamed: 0,sentence,subject,predicate,object,string_label,context_window,label
0,The Wishart distribution is used in multivaria...,wishart distribution,is used in,multivariate statistics,used in,distribution is used in multivariate,0
1,The Square Root Method is transformed by the a...,Square Root Method,transformed by,the application of inverse operations to deriv...,computation,is transformed by the,1


In [65]:
X_train_indices, X_test_indices, y_train_indices, y_test_indices = train_test_split(predicate_data.index.to_numpy(), predicate_data['label'].to_numpy(),
                                                                                    test_size=0.05, random_state=42, stratify=predicate_data['label'].to_numpy())

# Repeat to get validation sub-sample of Train
X_train_indices, X_valid_indices, y_train_indices, y_valid_indices = train_test_split(X_train_indices, y_train_indices, test_size=0.3, random_state=42, stratify=y_train_indices)

## Preprocessing

In [66]:
dataset = Dataset.from_pandas(predicate_data[['sentence', 'label', 'context_window', 'predicate']])

In [67]:
# Select subsets of the dataset for train, test and validation
train_split = dataset.select(X_train_indices)
test_split = dataset.select(X_test_indices)
valid_split = dataset.select(X_valid_indices)

dataset = DatasetDict({
    'train': train_split,
    'test': test_split,
    'validation': valid_split
})

In [68]:
def preprocess_function_predicate(examples):
  context_inputs = tokenizer(examples["predicate"], return_tensors="pt", add_special_tokens=False, truncation=True, padding="max_length", max_length=256)
  return context_inputs

dataset = dataset.map(preprocess_function_predicate, batched=True)
# Rename the 'attention_mask' column to 'context_attention_mask'
dataset = dataset.rename_columns({"attention_mask": "predicate_attention_mask", "input_ids": "predicate_input_ids"})

Map:   0%|          | 0/292 [00:00<?, ? examples/s]

Map:   0%|          | 0/22 [00:00<?, ? examples/s]

Map:   0%|          | 0/126 [00:00<?, ? examples/s]

In [69]:
def preprocess_function_context(examples):
  context_inputs = tokenizer(examples["context_window"], return_tensors="pt", add_special_tokens=True, truncation=True, padding="max_length", max_length=256)
  return context_inputs

dataset = dataset.map(preprocess_function_context, batched=True)
# Rename the 'attention_mask' column to 'context_attention_mask'
dataset = dataset.rename_columns({"attention_mask": "context_attention_mask", "input_ids": "context_input_ids"})

Map:   0%|          | 0/292 [00:00<?, ? examples/s]

Map:   0%|          | 0/22 [00:00<?, ? examples/s]

Map:   0%|          | 0/126 [00:00<?, ? examples/s]

In [70]:
def preprocess_function(examples):
  inputs = tokenizer(examples["sentence"], return_tensors="pt", add_special_tokens=True, truncation=True, padding="max_length", max_length=256)
  return inputs

dataset = dataset.map(preprocess_function, batched=True)

dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label', 'context_attention_mask', 'context_input_ids', 'predicate_attention_mask', 'predicate_input_ids'])

Map:   0%|          | 0/292 [00:00<?, ? examples/s]

Map:   0%|          | 0/22 [00:00<?, ? examples/s]

Map:   0%|          | 0/126 [00:00<?, ? examples/s]

## Developing the Model Parameters

In [71]:
class DistilBertForSentenceClassificationSpan(DistilBertPreTrainedModel):
  config_class = DistilBertConfig

  def __init__(self, config):
    super().__init__(config)
    self.num_labels = config.num_labels
    # Model body
    self.distilbert = distilbert_model
    # Attention mechanism for context
    self.attention_w = nn.Parameter(torch.randn(config.hidden_size))  # Trainable attention vector
    self.attention_bias = nn.Parameter(torch.zeros(1))  # Bias term for attention
    # Lookup table for context width embeddings
    self.width_embedding = nn.Embedding(config.max_position_embeddings, config.hidden_size)
    # Classification head
    self.classifier = nn.Linear(768 * 3, self.num_labels)  # Span * 3 (CLS, span, width)

  def _attention_weight_embedding(self, context_input_ids, context_attention_mask):
    outputs = self.distilbert(input_ids=context_input_ids, attention_mask=context_attention_mask, output_attentions=False)
    context_embeddings = outputs.last_hidden_state  # shape: [batch_size, seq_len, hidden_size] torch.Size([batch_size, 512, 768]) --> 512 to 256
    # print('context_embeddings', context_embeddings.shape)
    # attention scores
    attention_scores = torch.matmul(context_embeddings, self.attention_w) + self.attention_bias  # shape: [batch_size, seq_len, 1] torch.Size([1, 512, 1])
    # print('attention_scores', attention_scores.shape)

    # get attention weights from softmax
    attention_weights = torch.nn.functional.softmax(attention_scores, dim=-1)  # shape: [batch_size, num_heads, seq_len, seq_len] torch.Size([1, 12, 512, 512])
    # print('attention_weights', attention_weights.shape)
    attention_weights = attention_weights.unsqueeze(-1)
    # print('attention_weights', attention_weights.shape)

    # # Weighted sum of span embeddings to get the final attention span representation
    # weighted_context_embeddings = attention_weights * context_embeddings.unsqueeze(2)  # shape: [1, 12, 512, 512, 768]

    weighted_span_embeddings = torch.sum(attention_weights * context_embeddings, dim=1)  # shape: [1, 768]
    # print('weighted_span_embeddings', weighted_span_embeddings.shape)

    return weighted_span_embeddings #span representation [1, 768]

  def _cls_embeddings(self, input_ids, attention_mask):
    outputs = self.distilbert(input_ids=input_ids, attention_mask=attention_mask)
    embeddings = outputs.last_hidden_state # shape [1, 512, 768]
    cls_embedding = embeddings[:, 0:1, :].squeeze(1)  # squeeze converts [1, 1, 768] to [1, 768]
    return cls_embedding

  def _width_embeddings(self, context_input_ids):
    span_length = context_input_ids.ne(0).sum(dim=1) -2
    # Ensure no negative or zero indices (minimum span length should be 1)
    span_length = torch.clamp(span_length, min=1)
    width_embedding = self.width_embedding(span_length)
    return width_embedding

  def forward(self, context_input_ids=None, context_attention_mask=None, input_ids=None, attention_mask=None, token_type_ids=None, labels=None, **kwargs):
    # Get embeddings from attention weights
    attention_weight_embedding = self._attention_weight_embedding(context_input_ids=context_input_ids, context_attention_mask=context_attention_mask)
    # Get width embedding
    width_embedding = self._width_embeddings(context_input_ids=context_input_ids)
    # Get CLS token embedding
    cls_embedding = self._cls_embeddings(input_ids=input_ids, attention_mask=attention_mask)
    # Concatenate span representation, [CLS] embedding, and width embedding
    final_representation = torch.cat((attention_weight_embedding, cls_embedding, width_embedding), dim=-1)  # shape: [batch_size, 768*3]
    # Classifier on concat
    logits = self.classifier(final_representation)
    # Loss calc
    loss = None
    if labels is not None:
      loss_fct = nn.CrossEntropyLoss()
      loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
    return SequenceClassifierOutput(loss=loss, logits=logits)

In [72]:
config = DistilBertConfig.from_pretrained("Heather-Driver/distilbert-NER-LinearAlg-finetuned")
config.label2id = tag2index
config.id2label = index2tag
config.num_labels = len(index2tag)

model = DistilBertForSentenceClassificationSpan(config)
model.to(device)
model.gradient_checkpointing_enable()

In [73]:
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    return_tensors="pt",
)

In [74]:
def compute_metrics(pred):
  labels = pred.label_ids
  preds = pred.predictions.argmax(-1)
  f1 = f1_score(labels, preds, average="weighted") #y_true, y_pred
  acc = accuracy_score(labels, preds)
  precision = precision_score(labels, preds, average="weighted")
  recall = recall_score(labels, preds, average="weighted")
  return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

## Nevergrad optimisation

In [75]:
# Load the best hyperparameters from the pickle file
with open("best_hyperparameters.pkl", "rb") as f:
    best_params = pickle.load(f)

print("Loaded best hyperparameters:", best_params)

# You can now use these hyperparameters, for example:
# train_and_evaluate(**best_hyperparameters, name="final_model_run")

Loaded best hyperparameters: {'learning_rate': 0.000400459958213354, 'batch_size': 32, 'weight_decay': 0.0007264477434506173, 'warmup_steps': 500, 'gradient_accumulation_steps': 2}


In [76]:
%cd '/content/drive/MyDrive/Colab Notebooks/Math_Graph/Model/transformers_cache'

[Errno 2] No such file or directory: '/content/drive/MyDrive/Colab Notebooks/Math_Graph/Model/transformers_cache'
/content/drive/MyDrive/Colab Notebooks/Math_Graph/pickle_files


In [77]:
  # Set output_dir in TrainingArguments to this temporary folder.
  # After exiting the with-block, the directory is deleted.
os.environ["TRANSFORMERS_CACHE"] = '/content/drive/MyDrive/Colab Notebooks/Math_Graph/Model/transformers_cache'

In [78]:
def train_and_evaluate(learning_rate, batch_size, weight_decay, warmup_steps, gradient_accumulation_steps, name):
  """Function to train the model with different hyperparameters"""
  output_dir = '/content/drive/MyDrive/Colab Notebooks/Math_Graph/Model/' + name
  training_arguments = TrainingArguments(
      output_dir=output_dir,
      run_name=name,
      log_level="error",
      num_train_epochs=25,  # Keeping epochs fixed for now
      per_device_train_batch_size=int(batch_size),
      per_device_eval_batch_size=int(batch_size),
      learning_rate=learning_rate,
      weight_decay=weight_decay,
      warmup_steps=int(warmup_steps),
      gradient_accumulation_steps=int(gradient_accumulation_steps),
      eval_strategy="no",
      save_strategy="no",
      disable_tqdm=True,  # Avoid flooding the output
      report_to=["wandb"],
      load_best_model_at_end=True,
      push_to_hub=False,
      greater_is_better=True,
      save_safetensors=True,
      save_total_limit=1,
  )

  trainer = Trainer(
      model=model,
      args=training_arguments,
      train_dataset=dataset["train"],
      eval_dataset=dataset["validation"],
      tokenizer=tokenizer,
      data_collator=data_collator,
      compute_metrics=compute_metrics,
  )

  # Train and evaluate
  trainer.train()
  metrics = trainer.evaluate()

  # Finish the current W&B run so that a new run is created next time
  wandb.finish()

  # Delete the output directory to free disk space
  shutil.rmtree(output_dir, ignore_errors=True)

  # Return negative F1-score (Nevergrad minimizes)
  return -metrics["eval_f1"]

# Define the hyperparameter search space using Instrumentation
instrum = ng.p.Instrumentation(
    learning_rate=ng.p.Log(lower=1e-6, upper=1e-3),        # Logarithmic scale
    batch_size=ng.p.TransitionChoice([4, 8, 16, 32]),
    weight_decay=ng.p.Log(lower=1e-5, upper=1e-1),
    warmup_steps=ng.p.TransitionChoice([100, 500, 1000, 2000]),
    gradient_accumulation_steps=ng.p.TransitionChoice([1, 2, 4])
)

# Initialize the optimizer with the instrumentation and a budget
optimizer = ng.optimizers.OnePlusOne(instrum, budget=20)  # 20 iterations

# Define a wrapper function that takes a dictionary of parameters and unpacks them for train_and_evaluate
# Wrapper function with dynamic naming
def objective_function_wrapper(**parameters):
    # Construct a unique name from the hyperparameters
    combo_name = (
        f"lr_{parameters['learning_rate']:.1e}_"
        f"bs_{parameters['batch_size']}_"
        f"wd_{parameters['weight_decay']:.1e}_"
        f"ws_{parameters['warmup_steps']}_"
        f"gas_{parameters['gradient_accumulation_steps']}"
    )
    combo_name = "ns_model_window_1" + combo_name
    return train_and_evaluate(**parameters, name=combo_name)

# Update the optimizer.minimize call to use the wrapper function
recommendation = optimizer.minimize(objective_function_wrapper)

# Print the best hyperparameters found
print("Best hyperparameters:", recommendation.kwargs)

# Save best hyperparameters to a pickle file
with open("best_hyperparameters.pkl", "wb") as f:
    pickle.dump(recommendation.kwargs, f)

print("Best hyperparameters:", recommendation.kwargs)
best_params = recommendation.kwargs

  trainer = Trainer(


{'train_runtime': 85.7383, 'train_samples_per_second': 85.143, 'train_steps_per_second': 2.624, 'train_loss': 4.28796142578125, 'epoch': 22.526315789473685}
{'eval_loss': 1.9302881956100464, 'eval_accuracy': 0.3968253968253968, 'eval_f1': 0.39446532569382914, 'eval_precision': 0.4404383975812547, 'eval_recall': 0.3968253968253968, 'eval_runtime': 0.4309, 'eval_samples_per_second': 292.416, 'eval_steps_per_second': 18.566, 'epoch': 22.526315789473685}


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/precision,▁
eval/recall,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁
train/global_step,▁▁

0,1
eval/accuracy,0.39683
eval/f1,0.39447
eval/loss,1.93029
eval/precision,0.44044
eval/recall,0.39683
eval/runtime,0.4309
eval/samples_per_second,292.416
eval/steps_per_second,18.566
total_flos,430358644113408.0
train/epoch,22.52632


  trainer = Trainer(


{'train_runtime': 89.4466, 'train_samples_per_second': 81.613, 'train_steps_per_second': 1.397, 'train_loss': 0.980978759765625, 'epoch': 25.0}
{'eval_loss': 1.0861784219741821, 'eval_accuracy': 0.7777777777777778, 'eval_f1': 0.7771684789587602, 'eval_precision': 0.8012864120006978, 'eval_recall': 0.7777777777777778, 'eval_runtime': 0.4007, 'eval_samples_per_second': 314.429, 'eval_steps_per_second': 9.982, 'epoch': 25.0}


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/precision,▁
eval/recall,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁
train/global_step,▁▁

0,1
eval/accuracy,0.77778
eval/f1,0.77717
eval/loss,1.08618
eval/precision,0.80129
eval/recall,0.77778
eval/runtime,0.4007
eval/samples_per_second,314.429
eval/steps_per_second,9.982
total_flos,477159493017600.0
train/epoch,25.0


  trainer = Trainer(


{'train_runtime': 96.3539, 'train_samples_per_second': 75.762, 'train_steps_per_second': 4.93, 'train_loss': 0.04337641665810033, 'epoch': 25.0}
{'eval_loss': 1.6422792673110962, 'eval_accuracy': 0.7857142857142857, 'eval_f1': 0.7857774431687474, 'eval_precision': 0.7995032745032744, 'eval_recall': 0.7857142857142857, 'eval_runtime': 0.4289, 'eval_samples_per_second': 293.803, 'eval_steps_per_second': 18.654, 'epoch': 25.0}


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/precision,▁
eval/recall,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁
train/global_step,▁▁

0,1
eval/accuracy,0.78571
eval/f1,0.78578
eval/loss,1.64228
eval/precision,0.7995
eval/recall,0.78571
eval/runtime,0.4289
eval/samples_per_second,293.803
eval/steps_per_second,18.654
total_flos,477159493017600.0
train/epoch,25.0


  trainer = Trainer(


{'train_runtime': 90.2911, 'train_samples_per_second': 80.85, 'train_steps_per_second': 2.769, 'train_loss': 0.001808133602142334, 'epoch': 25.0}
{'eval_loss': 1.6304999589920044, 'eval_accuracy': 0.7857142857142857, 'eval_f1': 0.7871522197609153, 'eval_precision': 0.8035035951702618, 'eval_recall': 0.7857142857142857, 'eval_runtime': 0.4008, 'eval_samples_per_second': 314.349, 'eval_steps_per_second': 9.979, 'epoch': 25.0}


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/precision,▁
eval/recall,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁
train/global_step,▁▁

0,1
eval/accuracy,0.78571
eval/f1,0.78715
eval/loss,1.6305
eval/precision,0.8035
eval/recall,0.78571
eval/runtime,0.4008
eval/samples_per_second,314.349
eval/steps_per_second,9.979
total_flos,477159493017600.0
train/epoch,25.0


  trainer = Trainer(


{'train_runtime': 106.4935, 'train_samples_per_second': 68.549, 'train_steps_per_second': 4.226, 'train_loss': 0.0006156275007459853, 'epoch': 23.71232876712329}
{'eval_loss': 1.6110053062438965, 'eval_accuracy': 0.7936507936507936, 'eval_f1': 0.7967584865100392, 'eval_precision': 0.8143501560168227, 'eval_recall': 0.7936507936507936, 'eval_runtime': 0.536, 'eval_samples_per_second': 235.061, 'eval_steps_per_second': 59.698, 'epoch': 23.71232876712329}


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/precision,▁
eval/recall,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁
train/global_step,▁▁

0,1
eval/accuracy,0.79365
eval/f1,0.79676
eval/loss,1.61101
eval/precision,0.81435
eval/recall,0.79365
eval/runtime,0.536
eval/samples_per_second,235.061
eval/steps_per_second,59.698
total_flos,452582510911488.0
train/epoch,23.71233


  trainer = Trainer(


{'loss': 0.0, 'grad_norm': 0.0009152163984254003, 'learning_rate': 7.681159420289861e-07, 'epoch': 6.8493150684931505}
{'loss': 0.0, 'grad_norm': 0.003036330919712782, 'learning_rate': 4.782608695652178e-07, 'epoch': 13.698630136986301}
{'loss': 0.0, 'grad_norm': 0.002367990091443062, 'learning_rate': 1.8840579710144943e-07, 'epoch': 20.54794520547945}
{'train_runtime': 120.8921, 'train_samples_per_second': 60.384, 'train_steps_per_second': 15.096, 'train_loss': 3.112591752042509e-05, 'epoch': 25.0}
{'eval_loss': 1.6477603912353516, 'eval_accuracy': 0.7936507936507936, 'eval_f1': 0.7954346481675674, 'eval_precision': 0.8083617000283667, 'eval_recall': 0.7936507936507936, 'eval_runtime': 0.5392, 'eval_samples_per_second': 233.678, 'eval_steps_per_second': 59.347, 'epoch': 25.0}


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/precision,▁
eval/recall,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▄▆██
train/global_step,▁▄▆██

0,1
eval/accuracy,0.79365
eval/f1,0.79543
eval/loss,1.64776
eval/precision,0.80836
eval/recall,0.79365
eval/runtime,0.5392
eval/samples_per_second,233.678
eval/steps_per_second,59.347
total_flos,477159493017600.0
train/epoch,25.0


  trainer = Trainer(


{'train_runtime': 90.2967, 'train_samples_per_second': 80.845, 'train_steps_per_second': 2.769, 'train_loss': 3.3689167350530625e-05, 'epoch': 25.0}
{'eval_loss': 2.0288898944854736, 'eval_accuracy': 0.7777777777777778, 'eval_f1': 0.7783167006736801, 'eval_precision': 0.7939785743357172, 'eval_recall': 0.7777777777777778, 'eval_runtime': 0.4006, 'eval_samples_per_second': 314.504, 'eval_steps_per_second': 9.984, 'epoch': 25.0}


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/precision,▁
eval/recall,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁
train/global_step,▁▁

0,1
eval/accuracy,0.77778
eval/f1,0.77832
eval/loss,2.02889
eval/precision,0.79398
eval/recall,0.77778
eval/runtime,0.4006
eval/samples_per_second,314.504
eval/steps_per_second,9.984
total_flos,477159493017600.0
train/epoch,25.0


  trainer = Trainer(


{'train_runtime': 89.4512, 'train_samples_per_second': 81.609, 'train_steps_per_second': 1.397, 'train_loss': 4.520660266280174e-06, 'epoch': 25.0}
{'eval_loss': 2.0141732692718506, 'eval_accuracy': 0.7777777777777778, 'eval_f1': 0.7783167006736801, 'eval_precision': 0.7939785743357172, 'eval_recall': 0.7777777777777778, 'eval_runtime': 0.4042, 'eval_samples_per_second': 311.763, 'eval_steps_per_second': 9.897, 'epoch': 25.0}


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/precision,▁
eval/recall,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁
train/global_step,▁▁

0,1
eval/accuracy,0.77778
eval/f1,0.77832
eval/loss,2.01417
eval/precision,0.79398
eval/recall,0.77778
eval/runtime,0.4042
eval/samples_per_second,311.763
eval/steps_per_second,9.897
total_flos,477159493017600.0
train/epoch,25.0


  trainer = Trainer(


{'loss': 0.0, 'grad_norm': 3.034533619938884e-05, 'learning_rate': 7.681159420289861e-07, 'epoch': 6.8493150684931505}
{'loss': 0.0, 'grad_norm': 0.0002570437209215015, 'learning_rate': 4.782608695652178e-07, 'epoch': 13.698630136986301}
{'loss': 0.0, 'grad_norm': 0.0001974689366761595, 'learning_rate': 1.8840579710144943e-07, 'epoch': 20.54794520547945}
{'train_runtime': 120.7929, 'train_samples_per_second': 60.434, 'train_steps_per_second': 15.109, 'train_loss': 1.8168810423310488e-06, 'epoch': 25.0}
{'eval_loss': 2.0580968856811523, 'eval_accuracy': 0.7777777777777778, 'eval_f1': 0.7783167006736801, 'eval_precision': 0.7939785743357172, 'eval_recall': 0.7777777777777778, 'eval_runtime': 0.5347, 'eval_samples_per_second': 235.635, 'eval_steps_per_second': 59.844, 'epoch': 25.0}


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/precision,▁
eval/recall,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▄▆██
train/global_step,▁▄▆██

0,1
eval/accuracy,0.77778
eval/f1,0.77832
eval/loss,2.0581
eval/precision,0.79398
eval/recall,0.77778
eval/runtime,0.5347
eval/samples_per_second,235.635
eval/steps_per_second,59.844
total_flos,477159493017600.0
train/epoch,25.0


  trainer = Trainer(


{'train_runtime': 90.3395, 'train_samples_per_second': 80.806, 'train_steps_per_second': 2.767, 'train_loss': 1.5506205381825566e-06, 'epoch': 25.0}
{'eval_loss': 2.0665907859802246, 'eval_accuracy': 0.7777777777777778, 'eval_f1': 0.7783167006736801, 'eval_precision': 0.7939785743357172, 'eval_recall': 0.7777777777777778, 'eval_runtime': 0.4012, 'eval_samples_per_second': 314.045, 'eval_steps_per_second': 9.97, 'epoch': 25.0}


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/precision,▁
eval/recall,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁
train/global_step,▁▁

0,1
eval/accuracy,0.77778
eval/f1,0.77832
eval/loss,2.06659
eval/precision,0.79398
eval/recall,0.77778
eval/runtime,0.4012
eval/samples_per_second,314.045
eval/steps_per_second,9.97
total_flos,477159493017600.0
train/epoch,25.0


  trainer = Trainer(


{'train_runtime': 90.2619, 'train_samples_per_second': 80.876, 'train_steps_per_second': 2.77, 'train_loss': 1.4956367667764426e-06, 'epoch': 25.0}
{'eval_loss': 2.0675604343414307, 'eval_accuracy': 0.7777777777777778, 'eval_f1': 0.7783167006736801, 'eval_precision': 0.7939785743357172, 'eval_recall': 0.7777777777777778, 'eval_runtime': 0.4056, 'eval_samples_per_second': 310.659, 'eval_steps_per_second': 9.862, 'epoch': 25.0}


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/precision,▁
eval/recall,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁
train/global_step,▁▁

0,1
eval/accuracy,0.77778
eval/f1,0.77832
eval/loss,2.06756
eval/precision,0.79398
eval/recall,0.77778
eval/runtime,0.4056
eval/samples_per_second,310.659
eval/steps_per_second,9.862
total_flos,477159493017600.0
train/epoch,25.0


  trainer = Trainer(


{'train_runtime': 90.3525, 'train_samples_per_second': 80.795, 'train_steps_per_second': 2.767, 'train_loss': 8.443942060694099e-07, 'epoch': 25.0}
{'eval_loss': 2.1987922191619873, 'eval_accuracy': 0.7857142857142857, 'eval_f1': 0.7877340328239315, 'eval_precision': 0.8013481888481888, 'eval_recall': 0.7857142857142857, 'eval_runtime': 0.4017, 'eval_samples_per_second': 313.697, 'eval_steps_per_second': 9.959, 'epoch': 25.0}


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/precision,▁
eval/recall,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁
train/global_step,▁▁

0,1
eval/accuracy,0.78571
eval/f1,0.78773
eval/loss,2.19879
eval/precision,0.80135
eval/recall,0.78571
eval/runtime,0.4017
eval/samples_per_second,313.697
eval/steps_per_second,9.959
total_flos,477159493017600.0
train/epoch,25.0


  trainer = Trainer(


{'train_runtime': 60.5537, 'train_samples_per_second': 120.554, 'train_steps_per_second': 0.826, 'train_loss': 6.603424844797701e-07, 'epoch': 16.8}
{'eval_loss': 2.521421194076538, 'eval_accuracy': 0.7777777777777778, 'eval_f1': 0.7749349118669159, 'eval_precision': 0.7816250548393404, 'eval_recall': 0.7777777777777778, 'eval_runtime': 0.4011, 'eval_samples_per_second': 314.129, 'eval_steps_per_second': 9.972, 'epoch': 16.8}


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/precision,▁
eval/recall,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁
train/global_step,▁▁

0,1
eval/accuracy,0.77778
eval/f1,0.77493
eval/loss,2.52142
eval/precision,0.78163
eval/recall,0.77778
eval/runtime,0.4011
eval/samples_per_second,314.129
eval/steps_per_second,9.972
total_flos,322115339943936.0
train/epoch,16.8


  trainer = Trainer(


{'train_runtime': 90.3147, 'train_samples_per_second': 80.828, 'train_steps_per_second': 2.768, 'train_loss': 3.1664960260968654e-08, 'epoch': 25.0}
{'eval_loss': 2.522679328918457, 'eval_accuracy': 0.7777777777777778, 'eval_f1': 0.7749349118669159, 'eval_precision': 0.7816250548393404, 'eval_recall': 0.7777777777777778, 'eval_runtime': 0.4005, 'eval_samples_per_second': 314.618, 'eval_steps_per_second': 9.988, 'epoch': 25.0}


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/precision,▁
eval/recall,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁
train/global_step,▁▁

0,1
eval/accuracy,0.77778
eval/f1,0.77493
eval/loss,2.52268
eval/precision,0.78163
eval/recall,0.77778
eval/runtime,0.4005
eval/samples_per_second,314.618
eval/steps_per_second,9.988
total_flos,477159493017600.0
train/epoch,25.0


  trainer = Trainer(


{'loss': 0.0, 'grad_norm': 3.7164551258683787e-07, 'learning_rate': 4.636295109090229e-05, 'epoch': 6.8493150684931505}
{'loss': 0.0, 'grad_norm': 1.430725092177454e-06, 'learning_rate': 9.272590218180458e-05, 'epoch': 13.698630136986301}
{'loss': 0.0, 'grad_norm': 3.19359372724648e-07, 'learning_rate': 0.00013908885327270688, 'epoch': 20.54794520547945}
{'train_runtime': 120.776, 'train_samples_per_second': 60.442, 'train_steps_per_second': 15.111, 'train_loss': 1.188826655529177e-08, 'epoch': 25.0}
{'eval_loss': 2.4852077960968018, 'eval_accuracy': 0.7857142857142857, 'eval_f1': 0.78432172358292, 'eval_precision': 0.7961201717154098, 'eval_recall': 0.7857142857142857, 'eval_runtime': 0.5325, 'eval_samples_per_second': 236.617, 'eval_steps_per_second': 60.093, 'epoch': 25.0}


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/precision,▁
eval/recall,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▄▆██
train/global_step,▁▄▆██

0,1
eval/accuracy,0.78571
eval/f1,0.78432
eval/loss,2.48521
eval/precision,0.79612
eval/recall,0.78571
eval/runtime,0.5325
eval/samples_per_second,236.617
eval/steps_per_second,60.093
total_flos,477159493017600.0
train/epoch,25.0


  trainer = Trainer(


{'loss': 0.0, 'grad_norm': 1.3387672481712798e-07, 'learning_rate': 3.1950778223811783e-07, 'epoch': 6.8493150684931505}
{'loss': 0.0, 'grad_norm': 3.455367050264613e-07, 'learning_rate': 6.390155644762357e-07, 'epoch': 13.698630136986301}
{'loss': 0.0, 'grad_norm': 3.095315435075463e-07, 'learning_rate': 9.585233467143535e-07, 'epoch': 20.54794520547945}
{'train_runtime': 120.6803, 'train_samples_per_second': 60.49, 'train_steps_per_second': 15.123, 'train_loss': 9.961323183059194e-10, 'epoch': 25.0}
{'eval_loss': 2.4893460273742676, 'eval_accuracy': 0.7857142857142857, 'eval_f1': 0.78432172358292, 'eval_precision': 0.7961201717154098, 'eval_recall': 0.7857142857142857, 'eval_runtime': 0.5321, 'eval_samples_per_second': 236.795, 'eval_steps_per_second': 60.138, 'epoch': 25.0}


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/precision,▁
eval/recall,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▄▆██
train/global_step,▁▄▆██

0,1
eval/accuracy,0.78571
eval/f1,0.78432
eval/loss,2.48935
eval/precision,0.79612
eval/recall,0.78571
eval/runtime,0.5321
eval/samples_per_second,236.795
eval/steps_per_second,60.138
total_flos,477159493017600.0
train/epoch,25.0


  trainer = Trainer(


{'train_runtime': 90.3471, 'train_samples_per_second': 80.8, 'train_steps_per_second': 2.767, 'train_loss': 5.364417461350968e-10, 'epoch': 25.0}
{'eval_loss': 2.5939552783966064, 'eval_accuracy': 0.7936507936507936, 'eval_f1': 0.7934487029613903, 'eval_precision': 0.8076257711061633, 'eval_recall': 0.7936507936507936, 'eval_runtime': 0.4011, 'eval_samples_per_second': 314.149, 'eval_steps_per_second': 9.973, 'epoch': 25.0}


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/precision,▁
eval/recall,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁
train/global_step,▁▁

0,1
eval/accuracy,0.79365
eval/f1,0.79345
eval/loss,2.59396
eval/precision,0.80763
eval/recall,0.79365
eval/runtime,0.4011
eval/samples_per_second,314.149
eval/steps_per_second,9.973
total_flos,477159493017600.0
train/epoch,25.0


  trainer = Trainer(


{'train_runtime': 90.2811, 'train_samples_per_second': 80.859, 'train_steps_per_second': 2.769, 'train_loss': 4.470348002882929e-11, 'epoch': 25.0}
{'eval_loss': 2.5939531326293945, 'eval_accuracy': 0.7936507936507936, 'eval_f1': 0.7934487029613903, 'eval_precision': 0.8076257711061633, 'eval_recall': 0.7936507936507936, 'eval_runtime': 0.3997, 'eval_samples_per_second': 315.253, 'eval_steps_per_second': 10.008, 'epoch': 25.0}


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/precision,▁
eval/recall,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁
train/global_step,▁▁

0,1
eval/accuracy,0.79365
eval/f1,0.79345
eval/loss,2.59395
eval/precision,0.80763
eval/recall,0.79365
eval/runtime,0.3997
eval/samples_per_second,315.253
eval/steps_per_second,10.008
total_flos,477159493017600.0
train/epoch,25.0


  trainer = Trainer(


{'loss': 2.3447, 'grad_norm': 17.134586334228516, 'learning_rate': 0.0007681159420289858, 'epoch': 6.8493150684931505}
{'loss': 2.5455, 'grad_norm': 16.35771369934082, 'learning_rate': 0.0004782608695652176, 'epoch': 13.698630136986301}
{'loss': 2.3386, 'grad_norm': 15.789960861206055, 'learning_rate': 0.00018840579710144938, 'epoch': 20.54794520547945}
{'train_runtime': 120.6463, 'train_samples_per_second': 60.507, 'train_steps_per_second': 15.127, 'train_loss': 2.3745051637414383, 'epoch': 25.0}
{'eval_loss': 2.524622917175293, 'eval_accuracy': 0.14285714285714285, 'eval_f1': 0.09879105606557878, 'eval_precision': 0.12620703410177092, 'eval_recall': 0.14285714285714285, 'eval_runtime': 0.5306, 'eval_samples_per_second': 237.463, 'eval_steps_per_second': 60.308, 'epoch': 25.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/precision,▁
eval/recall,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▄▆██
train/global_step,▁▄▆██

0,1
eval/accuracy,0.14286
eval/f1,0.09879
eval/loss,2.52462
eval/precision,0.12621
eval/recall,0.14286
eval/runtime,0.5306
eval/samples_per_second,237.463
eval/steps_per_second,60.308
total_flos,477159493017600.0
train/epoch,25.0


  trainer = Trainer(


{'train_runtime': 60.7057, 'train_samples_per_second': 120.252, 'train_steps_per_second': 0.824, 'train_loss': 7.222557373046875, 'epoch': 16.8}
{'eval_loss': 2.5244972705841064, 'eval_accuracy': 0.14285714285714285, 'eval_f1': 0.09879105606557878, 'eval_precision': 0.12620703410177092, 'eval_recall': 0.14285714285714285, 'eval_runtime': 0.4028, 'eval_samples_per_second': 312.813, 'eval_steps_per_second': 9.931, 'epoch': 16.8}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/precision,▁
eval/recall,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁
train/global_step,▁▁

0,1
eval/accuracy,0.14286
eval/f1,0.09879
eval/loss,2.5245
eval/precision,0.12621
eval/recall,0.14286
eval/runtime,0.4028
eval/samples_per_second,312.813
eval/steps_per_second,9.931
total_flos,322115339943936.0
train/epoch,16.8


Best hyperparameters: {'learning_rate': 8.356222913708296e-06, 'batch_size': 4, 'weight_decay': 1.0000000000000008e-05, 'warmup_steps': 100, 'gradient_accumulation_steps': 4}
Best hyperparameters: {'learning_rate': 8.356222913708296e-06, 'batch_size': 4, 'weight_decay': 1.0000000000000008e-05, 'warmup_steps': 100, 'gradient_accumulation_steps': 4}
