In [2]:
# Core
import numpy as np
import pandas as pd
import os
import json

# Dataset prepration
#from transformers import TFAutoModel, AutoModel
from transformers import AutoTokenizer
from datasets import Dataset
#from torch.utils.data import DataLoader

# Model, hyperparameter search, evaluation
import torch
import optuna
from transformers import BertPreTrainedModel, TrainingArguments
from transformers.models.roberta.modeling_roberta import (
    RobertaClassificationHead,
    RobertaConfig,
    RobertaModel,
)
from torch.nn import CrossEntropyLoss
import evaluate

from transformers import get_scheduler, Trainer
from torch.optim import AdamW
from tqdm.auto import tqdm

# Define functions

These code blocks contain functions that are used multiple times, by each section.

In [3]:
# Prepare variables for indexing
classes = ["Explicit", "Implicit", "Non-abusive"]
labels = ["E1", "E2", "E3", "I1", "I2", "I3", "I4", "I5", "I6", "I7"]

### `load_csv`
Loads a dataset split from a constant directory.
To change the directory, modify the loaded CSV at line $10$.

- `dataset: str (train, test, split)` — Split to load.
- `use_stopwords: bool` — Whether to use a dataset with stopwords or not.

Returns: `DataFrame` containing the loaded CSV.

In [4]:
def load_csv(dataset, use_stopwords):
    if dataset not in ["train", "validate", "test"]:
        raise Exception("Invalid split.")
    if type(use_stopwords) != bool:
        raise Exception("Stop words must be specified in boolean.")
    
    stopwords = "With Stopwords"
    if not use_stopwords:
        stopwords = "Without Stopwords"
    df = pd.read_csv(f"./Data/{stopwords}/{dataset}.csv")
    
    return df

### `format_dataset`
Formats a DataFrame for HuggingFace use.

- `df: DataFrame` — DataFrame to convert.
- `cv_indices: list` — Indices to create stratified CV datasets.

Returns: `list`, `list` containing HuggingFace `Datasets` for training and testing.

In [5]:
def format_dataset(df):
    df.rename({"Text": "text", "Class": "labels"}, axis=1, inplace=True)
    df.drop(labels, axis=1, inplace=True)
    
    dataset = Dataset.from_pandas(df)
    
    # Convert pd labels to huggingface ClassLabels for stratifying
    dataset = dataset.class_encode_column("labels")
    
    dataset = dataset.map(tokenize_function, batched=True)
    
    # Convert datasets to pytorch format
    dataset = dataset.remove_columns(["text"])
    dataset.set_format("torch")
        
    return dataset

### `save_preds`
Saves the predictions of a model.

- `filename: str` — Filename to use. Must include the `.json` extension.
- `data: list` — Contains the data to save.

In [6]:
def save_preds(filename, data):
    # Serializing json
    json_object = json.dumps({"predictions": data}, indent=4)

    # Writing to sample.json
    with open(filename, "w") as outfile:
        outfile.write(json_object)

# Standard multiclass classifier RoBERTa

## Preparation for models

Prepares the RoBERTa model's classification head based on a repository by [Chanda](https://pchanda.github.io/Roberta-FineTuning-for-Classification/).

In addition prepares the `compute_metrics` function to be used by the HuggingFace `Trainer` class.

In [7]:
# Prepare classification head for pretrained RoBERTa
class RobertaAbusiveClassification(BertPreTrainedModel):
    
    def __init__(self, config):
        super(RobertaAbusiveClassification, self).__init__(config)
        self.num_labels = config.num_labels
        self.roberta = RobertaModel(config)
        self.classifier = RobertaClassificationHead(config)
        
    def forward(self, input_ids, attention_mask, labels):
        outputs = self.roberta(input_ids,attention_mask=attention_mask)
        sequence_output = outputs[0]
        logits = self.classifier(sequence_output)

        outputs = (logits,) + outputs[2:]
        
        loss_fct = CrossEntropyLoss()
        loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
        outputs = (loss,) + outputs

        return outputs  # (loss), logits, (hidden_states), (attentions)

In [7]:
def compute_metrics(eval_pred):
    global predictions
    acc_metric = evaluate.load("accuracy")
    pre_metric = evaluate.load("precision")
    rec_metric = evaluate.load("recall")
    f1_metric = evaluate.load("f1")
    
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    
    accuracy = acc_metric.compute(predictions=predictions, references=labels)["accuracy"]
    precision = pre_metric.compute(predictions=predictions, references=labels, average="weighted")["precision"]
    recall = rec_metric.compute(predictions=predictions, references=labels, average="weighted")["recall"]
    f1 = f1_metric.compute(predictions=predictions, references=labels, average="weighted")["f1"]
    
    # Calculate per-class metrics
    results = []
    for i in range(0, 3):
        x = labels[predictions == i]
        y = predictions[predictions == i]
        
        class_accuracy = acc_metric.compute(predictions=y, references=x)["accuracy"]
        class_precision = pre_metric.compute(predictions=y, references=x, average="weighted")["precision"]
        class_recall = rec_metric.compute(predictions=y, references=x, average="weighted")["recall"]
        class_f1 = f1_metric.compute(predictions=y, references=x, average="weighted")["f1"]
        results += [class_accuracy, class_precision, class_recall, class_f1]
    
    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1":f1,
           "NA_accuracy": results[0], "NA_precision": results[1], "NA_recall": results[2], "NA_f1": results[3],
           "E_accuracy": results[4], "E_precision": results[5], "E_recall": results[6], "E_f1": results[7],
           "I_accuracy": results[8], "I_precision": results[9], "I_recall": results[10], "I_f1": results[11]}

In [8]:
# Variables
num_labels = 3
load_dir = "jcblaise/roberta-tagalog-base"

# Get tokenizer from repository
tokenizer = AutoTokenizer.from_pretrained(load_dir, model_max_length=256)

def tokenize_function(data):
    return tokenizer(data["text"], padding="max_length", truncation=True)

In [20]:
# Load and format CSVs
train_df = load_csv("train", True)
train_dataset = format_dataset(train_df)

test_df = load_csv("test", True)
test_dataset = format_dataset(test_df)

val_df = load_csv("validate", True)
val_dataset = format_dataset(val_df)

                                                                                                                       

## Train initial model

In [10]:
save_dir = "./Models/standard-initial"

# Create directories
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

In [11]:
# Attach classification head and prepare model for trainer
config = RobertaConfig.from_pretrained(load_dir, num_labels=num_labels)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
training_args = TrainingArguments(
    output_dir = save_dir,
    report_to = "none")

# Initialize model
model = RobertaAbusiveClassification.from_pretrained(load_dir, config=config)
model.to(device)
model.train()

# Initialize Trainer class and train
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = train_dataset,
    eval_dataset = val_dataset,
    compute_metrics = compute_metrics)
result = trainer.train()

# Evaluate model and save metrics and predictions
model.eval()
predictions = []
metrics = trainer.evaluate()
trainer.save_metrics('validation', metrics)
save_preds(f'{save_dir}/validation_predictions.json', predictions.tolist())

# Save trained model
model.save_pretrained(f'{save_dir}/model')
tokenizer.save_pretrained(f'{save_dir}/model')

Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaAbusiveClassification were not initialized from the model checkpoint at jcblaise/roberta-tagalog-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.out_proj.weig

Step,Training Loss


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


('./Models/standard-initial/model\\tokenizer_config.json',
 './Models/standard-initial/model\\special_tokens_map.json',
 './Models/standard-initial/model\\vocab.json',
 './Models/standard-initial/model\\merges.txt',
 './Models/standard-initial/model\\added_tokens.json',
 './Models/standard-initial/model\\tokenizer.json')

In [12]:
colnames = ["Accuracy", "Precision", "Recall", "F1"]
values = np.array(list(metrics.values())[1:-4])
val_df = pd.DataFrame(values.reshape(4, 4), columns=colnames, index=["Average"] + classes)

print("Validation set metrics:")
val_df

Validation set metrics:


Unnamed: 0,Accuracy,Precision,Recall,F1
Average,0.712389,0.717259,0.712389,0.713154
Explicit,0.757576,0.573921,0.757576,0.653083
Implicit,0.76,0.5776,0.76,0.656364
Non-abusive,0.635294,0.403599,0.635294,0.49361


In [13]:
predictions = []
metrics = trainer.evaluate(test_dataset)
trainer.save_metrics('test', metrics)
save_preds(f'{save_dir}/test_predictions.json', predictions.tolist())

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [14]:
colnames = ["Accuracy", "Precision", "Recall", "F1"]
values = np.array(list(metrics.values())[1:-4])
test_df = pd.DataFrame(values.reshape(4, 4), columns=colnames, index=["Average"] + classes)

print("Test set metrics:")
test_df

Test set metrics:


Unnamed: 0,Accuracy,Precision,Recall,F1
Average,0.682819,0.694285,0.682819,0.68696
Explicit,0.788732,0.622099,0.788732,0.695575
Implicit,0.757143,0.573265,0.757143,0.652497
Non-abusive,0.534884,0.286101,0.534884,0.372798


## Fine-tune pre-trained model

Fine tuning using [Optuna with transformers](https://python-bloggers.com/2022/08/hyperparameter-tuning-a-transformer-with-optuna/)

In [15]:
# Variables
save_dir = "./Models/standard-tuning"

In [19]:
def objective(trial: optuna.Trial):
    config = RobertaConfig.from_pretrained(load_dir, num_labels=num_labels)
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    model = RobertaAbusiveClassification.from_pretrained(load_dir, config=config)
    model.to(device)
    model.train()
    
    training_args = TrainingArguments(         
        output_dir= save_dir, 
        # Test for different optimizer
        # optimizer = trial.suggest_categorical("optimizer", ["MomentumSGD", "Adam"])
        optim = "adamw_torch",
        learning_rate = trial.suggest_float("learning_rate", 4e-5, 0.01, log=True),         
        weight_decay = trial.suggest_float("weight_decay", 4e-5, 0.01, log=True),         
        num_train_epochs = trial.suggest_int("num_train_epochs", low=2, high=5),         
        per_device_train_batch_size = 8,         
        per_device_eval_batch_size = 8,
        report_to = "none")

    trainer = Trainer(
        model = model,
        args = training_args,
        train_dataset = train_dataset,
        eval_dataset = val_dataset)  
    
    result = trainer.train()     
    return result.training_loss

In [20]:
study = optuna.create_study(study_name="hp-search-roberta", direction="minimize") 
study.optimize(func=objective, n_trials=128)

[I 2023-07-18 13:21:23,456] A new study created in memory with name: hp-search-roberta
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaAbusiveClassification were not initialized from the model checkpoint at jcblaise/roberta-tagalog-base and are newly initialized: ['classifier.dense.

Step,Training Loss


[I 2023-07-18 13:22:47,105] Trial 0 finished with value: 1.3856196582765508 and parameters: {'learning_rate': 0.003649141609739691, 'weight_decay': 0.00040036096382604523, 'num_train_epochs': 3}. Best is trial 0 with value: 1.3856196582765508.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights o

Step,Training Loss


[I 2023-07-18 13:24:13,876] Trial 1 finished with value: 1.1972683784656954 and parameters: {'learning_rate': 0.0009099928570682495, 'weight_decay': 0.003792662235843005, 'num_train_epochs': 3}. Best is trial 1 with value: 1.1972683784656954.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of

Step,Training Loss


[I 2023-07-18 13:25:39,990] Trial 2 finished with value: 1.1561002420602287 and parameters: {'learning_rate': 0.0005079731034528873, 'weight_decay': 0.0010374593528754187, 'num_train_epochs': 3}. Best is trial 2 with value: 1.1561002420602287.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights o

Step,Training Loss


[I 2023-07-18 13:26:43,487] Trial 3 finished with value: 1.3358615502378994 and parameters: {'learning_rate': 0.002893805387999785, 'weight_decay': 0.00033256730567339786, 'num_train_epochs': 2}. Best is trial 2 with value: 1.1561002420602287.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights o

Step,Training Loss


[I 2023-07-18 13:27:49,713] Trial 4 finished with value: 1.152994485726034 and parameters: {'learning_rate': 0.0002948351264899888, 'weight_decay': 0.00019031398465809516, 'num_train_epochs': 2}. Best is trial 4 with value: 1.152994485726034.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of

Step,Training Loss
500,0.4146


[I 2023-07-18 13:29:59,764] Trial 5 finished with value: 0.3926650843674079 and parameters: {'learning_rate': 4.779007457301192e-05, 'weight_decay': 6.997390720735163e-05, 'num_train_epochs': 4}. Best is trial 5 with value: 0.3926650843674079.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights o

Step,Training Loss
500,0.5387


[I 2023-07-18 13:32:01,160] Trial 6 finished with value: 0.5155450912346518 and parameters: {'learning_rate': 0.00016306318523167125, 'weight_decay': 0.008861089148222236, 'num_train_epochs': 4}. Best is trial 5 with value: 0.3926650843674079.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights o

Step,Training Loss


[I 2023-07-18 13:33:34,664] Trial 7 finished with value: 0.5034562878142622 and parameters: {'learning_rate': 8.336132913034417e-05, 'weight_decay': 0.000406519099843087, 'num_train_epochs': 3}. Best is trial 5 with value: 0.3926650843674079.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of

Step,Training Loss


[I 2023-07-18 13:34:32,030] Trial 8 finished with value: 1.180972565385632 and parameters: {'learning_rate': 0.00038424226178832, 'weight_decay': 0.00026940123151447746, 'num_train_epochs': 2}. Best is trial 5 with value: 0.3926650843674079.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of 

Step,Training Loss
500,1.4349


[I 2023-07-18 13:37:02,767] Trial 9 finished with value: 1.3760431504787358 and parameters: {'learning_rate': 0.00575868978972807, 'weight_decay': 0.00013717722499117355, 'num_train_epochs': 5}. Best is trial 5 with value: 0.3926650843674079.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of

Step,Training Loss
500,0.443


[I 2023-07-18 13:39:50,503] Trial 10 finished with value: 0.33924921580723355 and parameters: {'learning_rate': 4.229266632733644e-05, 'weight_decay': 4.619589286736094e-05, 'num_train_epochs': 5}. Best is trial 10 with value: 0.33924921580723355.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss
500,0.445


[I 2023-07-18 13:42:39,031] Trial 11 finished with value: 0.3399905147409081 and parameters: {'learning_rate': 4.609660420312733e-05, 'weight_decay': 4.5262542565300266e-05, 'num_train_epochs': 5}. Best is trial 10 with value: 0.33924921580723355.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss
500,0.4462


[I 2023-07-18 13:45:26,774] Trial 12 finished with value: 0.34049107902928405 and parameters: {'learning_rate': 4.539572731967106e-05, 'weight_decay': 4.2994015857789415e-05, 'num_train_epochs': 5}. Best is trial 10 with value: 0.33924921580723355.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weig

Step,Training Loss
500,0.5074


[I 2023-07-18 13:48:13,751] Trial 13 finished with value: 0.3925767260386532 and parameters: {'learning_rate': 0.00011566692013338137, 'weight_decay': 4.3993975316290984e-05, 'num_train_epochs': 5}. Best is trial 10 with value: 0.33924921580723355.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weig

Step,Training Loss
500,0.4389


[I 2023-07-18 13:50:28,699] Trial 14 finished with value: 0.4148058535012984 and parameters: {'learning_rate': 4.4415567271890195e-05, 'weight_decay': 0.00010042033496137224, 'num_train_epochs': 4}. Best is trial 10 with value: 0.33924921580723355.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weig

Step,Training Loss
500,0.7666


[I 2023-07-18 13:53:09,605] Trial 15 finished with value: 0.6486485990366542 and parameters: {'learning_rate': 0.0001702241263646181, 'weight_decay': 8.191606206184445e-05, 'num_train_epochs': 5}. Best is trial 10 with value: 0.33924921580723355.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss
500,0.4209


[I 2023-07-18 13:55:20,110] Trial 16 finished with value: 0.3971115864981386 and parameters: {'learning_rate': 7.612506777662525e-05, 'weight_decay': 4.400096804285904e-05, 'num_train_epochs': 4}. Best is trial 10 with value: 0.33924921580723355.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss
500,0.444


[I 2023-07-18 13:58:00,747] Trial 17 finished with value: 0.33794209401410324 and parameters: {'learning_rate': 8.265618101225071e-05, 'weight_decay': 0.0001286294081908788, 'num_train_epochs': 5}. Best is trial 17 with value: 0.33794209401410324.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss
500,0.8299


[I 2023-07-18 14:00:40,159] Trial 18 finished with value: 0.7742763203785832 and parameters: {'learning_rate': 0.00021145133742905302, 'weight_decay': 0.00015153591015322143, 'num_train_epochs': 5}. Best is trial 17 with value: 0.33794209401410324.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weig

Step,Training Loss
500,0.463


[I 2023-07-18 14:02:34,313] Trial 19 finished with value: 0.43595989362189647 and parameters: {'learning_rate': 9.675178946394477e-05, 'weight_decay': 0.00010474695055006557, 'num_train_epochs': 4}. Best is trial 17 with value: 0.33794209401410324.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weig

Step,Training Loss
500,1.1814


[I 2023-07-18 14:04:55,422] Trial 20 finished with value: 1.1644438091077303 and parameters: {'learning_rate': 0.0009659699596891921, 'weight_decay': 0.0001894834996319691, 'num_train_epochs': 5}. Best is trial 17 with value: 0.33794209401410324.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss
500,0.4454


[I 2023-07-18 14:07:17,725] Trial 21 finished with value: 0.3409537308198169 and parameters: {'learning_rate': 4.004971043741128e-05, 'weight_decay': 5.9294636130637266e-05, 'num_train_epochs': 5}. Best is trial 17 with value: 0.33794209401410324.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss
500,0.4408


[I 2023-07-18 14:09:37,299] Trial 22 finished with value: 0.33695004649628374 and parameters: {'learning_rate': 7.105020235130945e-05, 'weight_decay': 7.43731901107666e-05, 'num_train_epochs': 5}. Best is trial 22 with value: 0.33695004649628374.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss
500,0.4727


[I 2023-07-18 14:11:33,294] Trial 23 finished with value: 0.4484091974738845 and parameters: {'learning_rate': 0.00011650527477343434, 'weight_decay': 9.182373694806714e-05, 'num_train_epochs': 4}. Best is trial 22 with value: 0.33695004649628374.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss
500,0.438


[I 2023-07-18 14:13:52,367] Trial 24 finished with value: 0.33248415172548224 and parameters: {'learning_rate': 5.8902922198008515e-05, 'weight_decay': 7.056610608872833e-05, 'num_train_epochs': 5}. Best is trial 24 with value: 0.33248415172548224.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weig

Step,Training Loss
500,0.4357


[I 2023-07-18 14:16:12,631] Trial 25 finished with value: 0.33233975575382546 and parameters: {'learning_rate': 7.114487371584815e-05, 'weight_decay': 0.00012489685319895232, 'num_train_epochs': 5}. Best is trial 25 with value: 0.33233975575382546.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weig

Step,Training Loss
500,0.4278


[I 2023-07-18 14:18:05,039] Trial 26 finished with value: 0.40231084430072095 and parameters: {'learning_rate': 6.952720811061387e-05, 'weight_decay': 7.858157053037643e-05, 'num_train_epochs': 4}. Best is trial 25 with value: 0.33233975575382546.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss
500,0.5334


[I 2023-07-18 14:20:26,769] Trial 27 finished with value: 0.4197871631249449 and parameters: {'learning_rate': 0.00015691047264790737, 'weight_decay': 6.502492054230678e-05, 'num_train_epochs': 5}. Best is trial 25 with value: 0.33233975575382546.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss
500,0.4299


[I 2023-07-18 14:22:22,978] Trial 28 finished with value: 0.40655024943495155 and parameters: {'learning_rate': 6.309010137589949e-05, 'weight_decay': 0.00022592552384278548, 'num_train_epochs': 4}. Best is trial 25 with value: 0.33233975575382546.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weig

Step,Training Loss
500,0.4648


[I 2023-07-18 14:24:41,732] Trial 29 finished with value: 0.354671415529753 and parameters: {'learning_rate': 0.00012083492429079664, 'weight_decay': 0.00012202860220124977, 'num_train_epochs': 5}. Best is trial 25 with value: 0.33233975575382546.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss
500,1.1456


[I 2023-07-18 14:27:00,983] Trial 30 finished with value: 1.1376786081414474 and parameters: {'learning_rate': 0.0002293887567963007, 'weight_decay': 0.0006495642388117961, 'num_train_epochs': 5}. Best is trial 25 with value: 0.33233975575382546.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss
500,0.44


[I 2023-07-18 14:29:20,247] Trial 31 finished with value: 0.33558869684549203 and parameters: {'learning_rate': 6.95734269630133e-05, 'weight_decay': 0.00013144600935338226, 'num_train_epochs': 5}. Best is trial 25 with value: 0.33233975575382546.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss
500,0.4476


[I 2023-07-18 14:31:39,704] Trial 32 finished with value: 0.34156725926506787 and parameters: {'learning_rate': 6.31031219281477e-05, 'weight_decay': 6.801949028213791e-05, 'num_train_epochs': 5}. Best is trial 25 with value: 0.33233975575382546.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss
500,0.5108


[I 2023-07-18 14:33:59,168] Trial 33 finished with value: 0.39738221132665646 and parameters: {'learning_rate': 0.00011231291893607204, 'weight_decay': 0.00015642342268445763, 'num_train_epochs': 5}. Best is trial 25 with value: 0.33233975575382546.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some wei

Step,Training Loss


[I 2023-07-18 14:35:21,600] Trial 34 finished with value: 0.5005576915310738 and parameters: {'learning_rate': 6.224893667183466e-05, 'weight_decay': 9.919450700729464e-05, 'num_train_epochs': 3}. Best is trial 25 with value: 0.33233975575382546.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss
500,0.4372


[I 2023-07-18 14:37:41,161] Trial 35 finished with value: 0.33439527741052155 and parameters: {'learning_rate': 8.859161248396699e-05, 'weight_decay': 0.0002852011258310014, 'num_train_epochs': 5}. Best is trial 25 with value: 0.33233975575382546.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss
500,0.4975


[I 2023-07-18 14:39:33,154] Trial 36 finished with value: 0.47450162548767894 and parameters: {'learning_rate': 0.00012901624890006977, 'weight_decay': 0.000290143890221493, 'num_train_epochs': 4}. Best is trial 25 with value: 0.33233975575382546.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss
500,0.436


[I 2023-07-18 14:41:52,431] Trial 37 finished with value: 0.33094638415745325 and parameters: {'learning_rate': 9.327951779023881e-05, 'weight_decay': 0.00019772672950472105, 'num_train_epochs': 5}. Best is trial 37 with value: 0.33094638415745325.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weig

Step,Training Loss
500,0.5194


[I 2023-07-18 14:43:44,721] Trial 38 finished with value: 0.4957074491601241 and parameters: {'learning_rate': 0.0001580797417677054, 'weight_decay': 0.0004222867512606949, 'num_train_epochs': 4}. Best is trial 37 with value: 0.33094638415745325.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss
500,0.434


[I 2023-07-18 14:46:05,353] Trial 39 finished with value: 0.33019532117628514 and parameters: {'learning_rate': 9.923519083419231e-05, 'weight_decay': 0.00022485051188249442, 'num_train_epochs': 5}. Best is trial 39 with value: 0.33019532117628514.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weig

Step,Training Loss


[I 2023-07-18 14:47:27,788] Trial 40 finished with value: 1.1565504337014412 and parameters: {'learning_rate': 0.00033506132316021506, 'weight_decay': 0.000180273948200682, 'num_train_epochs': 3}. Best is trial 39 with value: 0.33019532117628514.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss
500,0.449


[I 2023-07-18 14:49:46,938] Trial 41 finished with value: 0.34052347007550693 and parameters: {'learning_rate': 9.896466197597806e-05, 'weight_decay': 0.00023338375941878904, 'num_train_epochs': 5}. Best is trial 39 with value: 0.33019532117628514.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weig

Step,Training Loss
500,0.4498


[I 2023-07-18 14:52:06,187] Trial 42 finished with value: 0.34539425211741515 and parameters: {'learning_rate': 9.09388791234381e-05, 'weight_decay': 0.00022822302755415362, 'num_train_epochs': 5}. Best is trial 39 with value: 0.33019532117628514.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss


[I 2023-07-18 14:53:01,593] Trial 43 finished with value: 0.6509638621394795 and parameters: {'learning_rate': 5.6433877206659646e-05, 'weight_decay': 0.0003692008069278307, 'num_train_epochs': 2}. Best is trial 39 with value: 0.33019532117628514.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss
500,0.4437


[I 2023-07-18 14:55:21,354] Trial 44 finished with value: 0.33615264516127735 and parameters: {'learning_rate': 5.618673554442662e-05, 'weight_decay': 0.00031107959384731976, 'num_train_epochs': 5}. Best is trial 39 with value: 0.33019532117628514.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weig

Step,Training Loss
500,0.437


[I 2023-07-18 14:57:40,622] Trial 45 finished with value: 0.3318957368234046 and parameters: {'learning_rate': 8.606798338050391e-05, 'weight_decay': 0.00017202995663933839, 'num_train_epochs': 5}. Best is trial 39 with value: 0.33019532117628514.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss
500,0.4378


[I 2023-07-18 15:00:02,294] Trial 46 finished with value: 0.3351887491412629 and parameters: {'learning_rate': 5.568583867912e-05, 'weight_decay': 0.0001658645169943687, 'num_train_epochs': 5}. Best is trial 39 with value: 0.33019532117628514.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights o

Step,Training Loss
500,0.5299


[I 2023-07-18 15:02:57,568] Trial 47 finished with value: 0.41006055057496954 and parameters: {'learning_rate': 0.00013976224974825057, 'weight_decay': 0.00011458997058971678, 'num_train_epochs': 5}. Best is trial 39 with value: 0.33019532117628514.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some wei

Step,Training Loss
500,1.0774


[I 2023-07-18 15:05:17,966] Trial 48 finished with value: 1.0795483194795765 and parameters: {'learning_rate': 0.00020117969642017167, 'weight_decay': 0.00019473360985584475, 'num_train_epochs': 4}. Best is trial 39 with value: 0.33019532117628514.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weig

Step,Training Loss
500,1.1455


[I 2023-07-18 15:08:12,631] Trial 49 finished with value: 1.1377171566611841 and parameters: {'learning_rate': 0.00026082636389867375, 'weight_decay': 5.652253305655739e-05, 'num_train_epochs': 5}. Best is trial 39 with value: 0.33019532117628514.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss
500,0.439


[I 2023-07-18 15:10:39,966] Trial 50 finished with value: 0.41471329533067863 and parameters: {'learning_rate': 5.1121844877453725e-05, 'weight_decay': 0.0004769944826838404, 'num_train_epochs': 4}. Best is trial 39 with value: 0.33019532117628514.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weig

Step,Training Loss
500,0.4347


[I 2023-07-18 15:13:14,014] Trial 51 finished with value: 0.32895712726994564 and parameters: {'learning_rate': 9.181193307368631e-05, 'weight_decay': 0.0002633360187004873, 'num_train_epochs': 5}. Best is trial 51 with value: 0.32895712726994564.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss
500,0.4328


[I 2023-07-18 15:15:43,489] Trial 52 finished with value: 0.3290459478708138 and parameters: {'learning_rate': 8.473089708753071e-05, 'weight_decay': 0.0001476272771089068, 'num_train_epochs': 5}. Best is trial 51 with value: 0.32895712726994564.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss
500,0.4651


[I 2023-07-18 15:18:11,306] Trial 53 finished with value: 0.35499845805921054 and parameters: {'learning_rate': 9.532775348753044e-05, 'weight_decay': 0.0001393131182390161, 'num_train_epochs': 5}. Best is trial 51 with value: 0.32895712726994564.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss
500,0.6497


[I 2023-07-18 15:20:36,749] Trial 54 finished with value: 0.5436663347975652 and parameters: {'learning_rate': 0.0001785928232826186, 'weight_decay': 0.00024872023620815716, 'num_train_epochs': 5}. Best is trial 51 with value: 0.32895712726994564.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss
500,0.4393


[I 2023-07-18 15:23:02,351] Trial 55 finished with value: 0.33401988717846404 and parameters: {'learning_rate': 7.911910993088412e-05, 'weight_decay': 0.0001731222145508128, 'num_train_epochs': 5}. Best is trial 51 with value: 0.32895712726994564.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss
500,0.4935


[I 2023-07-18 15:25:26,751] Trial 56 finished with value: 0.3800337131758382 and parameters: {'learning_rate': 0.0001171849470125511, 'weight_decay': 0.0003495620623204159, 'num_train_epochs': 5}. Best is trial 51 with value: 0.32895712726994564.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss
500,0.4467


[I 2023-07-18 15:27:50,726] Trial 57 finished with value: 0.34293652297858906 and parameters: {'learning_rate': 4.744201507344327e-05, 'weight_decay': 0.00020326725306978976, 'num_train_epochs': 5}. Best is trial 51 with value: 0.32895712726994564.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weig

Step,Training Loss
500,0.4405


[I 2023-07-18 15:30:18,569] Trial 58 finished with value: 0.33479745441809633 and parameters: {'learning_rate': 8.073796750905761e-05, 'weight_decay': 0.00011507071259233676, 'num_train_epochs': 5}. Best is trial 51 with value: 0.32895712726994564.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weig

Step,Training Loss
500,0.5422


[I 2023-07-18 15:32:43,971] Trial 59 finished with value: 0.4287021866418365 and parameters: {'learning_rate': 0.00014000315200321948, 'weight_decay': 0.00014886711361377378, 'num_train_epochs': 5}. Best is trial 51 with value: 0.32895712726994564.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weig

Step,Training Loss
500,0.4368


[I 2023-07-18 15:35:09,263] Trial 60 finished with value: 0.3305352037114308 and parameters: {'learning_rate': 7.697511192639459e-05, 'weight_decay': 9.098324531879766e-05, 'num_train_epochs': 5}. Best is trial 51 with value: 0.32895712726994564.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss
500,0.499


[I 2023-07-18 15:37:33,205] Trial 61 finished with value: 0.3863446837977359 and parameters: {'learning_rate': 0.00010212813508791154, 'weight_decay': 9.295436887646707e-05, 'num_train_epochs': 5}. Best is trial 51 with value: 0.32895712726994564.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss
500,0.4347


[I 2023-07-18 15:39:57,229] Trial 62 finished with value: 0.3297771538110604 and parameters: {'learning_rate': 7.674619139328379e-05, 'weight_decay': 0.00026215178269317114, 'num_train_epochs': 5}. Best is trial 51 with value: 0.32895712726994564.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss
500,0.4487


[I 2023-07-18 15:42:23,148] Trial 63 finished with value: 0.3416765507002522 and parameters: {'learning_rate': 8.296662288994785e-05, 'weight_decay': 0.00026510110680694546, 'num_train_epochs': 5}. Best is trial 51 with value: 0.32895712726994564.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss
500,0.4966


[I 2023-07-18 15:44:47,580] Trial 64 finished with value: 0.3860107077691788 and parameters: {'learning_rate': 0.000103896775172904, 'weight_decay': 0.00020726916184518235, 'num_train_epochs': 5}. Best is trial 51 with value: 0.32895712726994564.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss
500,0.444


[I 2023-07-18 15:47:12,703] Trial 65 finished with value: 0.34029362685698317 and parameters: {'learning_rate': 4.1042420953365926e-05, 'weight_decay': 0.00016630156072690116, 'num_train_epochs': 5}. Best is trial 51 with value: 0.32895712726994564.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some wei

Step,Training Loss
500,0.4339


[I 2023-07-18 15:49:41,219] Trial 66 finished with value: 0.3312757606793167 and parameters: {'learning_rate': 7.318639702774949e-05, 'weight_decay': 0.00031494710071490104, 'num_train_epochs': 5}. Best is trial 51 with value: 0.32895712726994564.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss
500,0.4432


[I 2023-07-18 15:52:12,055] Trial 67 finished with value: 0.3398911339896066 and parameters: {'learning_rate': 4.906911504995532e-05, 'weight_decay': 0.0003125685197044989, 'num_train_epochs': 5}. Best is trial 51 with value: 0.32895712726994564.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss
500,0.4343


[I 2023-07-18 15:54:13,557] Trial 68 finished with value: 0.40908750146627426 and parameters: {'learning_rate': 6.805887155208752e-05, 'weight_decay': 0.00024952574290668393, 'num_train_epochs': 4}. Best is trial 51 with value: 0.32895712726994564.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weig

Step,Training Loss
500,0.5386


[I 2023-07-18 15:56:39,457] Trial 69 finished with value: 0.4224299610109257 and parameters: {'learning_rate': 0.0001363505614924703, 'weight_decay': 0.000570079806061513, 'num_train_epochs': 5}. Best is trial 51 with value: 0.32895712726994564.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights

Step,Training Loss
500,0.4309


[I 2023-07-18 15:59:03,637] Trial 70 finished with value: 0.328465609801443 and parameters: {'learning_rate': 7.316232345445066e-05, 'weight_decay': 0.00034648485064419645, 'num_train_epochs': 5}. Best is trial 70 with value: 0.328465609801443.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights 

Step,Training Loss
500,0.4321


[I 2023-07-18 16:01:27,523] Trial 71 finished with value: 0.3292370007450419 and parameters: {'learning_rate': 7.435961149847185e-05, 'weight_decay': 0.00040927715658274746, 'num_train_epochs': 5}. Best is trial 70 with value: 0.328465609801443.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights

Step,Training Loss
500,0.5333


[I 2023-07-18 16:03:52,710] Trial 72 finished with value: 0.41054707577354027 and parameters: {'learning_rate': 0.00011161496087476279, 'weight_decay': 0.00039948775722146486, 'num_train_epochs': 5}. Best is trial 70 with value: 0.328465609801443.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss
500,0.4506


[I 2023-07-18 16:06:17,299] Trial 73 finished with value: 0.34409721023158024 and parameters: {'learning_rate': 6.163998052612816e-05, 'weight_decay': 0.00026300552283780915, 'num_train_epochs': 5}. Best is trial 70 with value: 0.328465609801443.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss
500,0.4359


[I 2023-07-18 16:08:43,138] Trial 74 finished with value: 0.3298419966733545 and parameters: {'learning_rate': 7.48746348968623e-05, 'weight_decay': 0.0002079280297699517, 'num_train_epochs': 5}. Best is trial 70 with value: 0.328465609801443.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights o

Step,Training Loss
500,0.4413


[I 2023-07-18 16:11:08,693] Trial 75 finished with value: 0.3360131077300337 and parameters: {'learning_rate': 5.109498118525437e-05, 'weight_decay': 0.0003683945182305744, 'num_train_epochs': 5}. Best is trial 70 with value: 0.328465609801443.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights 

Step,Training Loss
500,0.4411


[I 2023-07-18 16:13:33,551] Trial 76 finished with value: 0.3360018769601234 and parameters: {'learning_rate': 7.329327150922952e-05, 'weight_decay': 0.00047660432797409677, 'num_train_epochs': 5}. Best is trial 70 with value: 0.328465609801443.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights

Step,Training Loss
500,0.445


[I 2023-07-18 16:15:59,564] Trial 77 finished with value: 0.34082539422171454 and parameters: {'learning_rate': 4.079070350656673e-05, 'weight_decay': 0.00029208338715991323, 'num_train_epochs': 5}. Best is trial 70 with value: 0.328465609801443.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss
500,0.5292


[I 2023-07-18 16:18:24,541] Trial 78 finished with value: 0.41676854728756096 and parameters: {'learning_rate': 0.00012636851540870911, 'weight_decay': 0.00014662698820925422, 'num_train_epochs': 5}. Best is trial 70 with value: 0.328465609801443.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss


[I 2023-07-18 16:19:22,122] Trial 79 finished with value: 0.6413527754016388 and parameters: {'learning_rate': 6.473988516250906e-05, 'weight_decay': 0.0002206443675230254, 'num_train_epochs': 2}. Best is trial 70 with value: 0.328465609801443.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights 

Step,Training Loss


[I 2023-07-18 16:20:47,483] Trial 80 finished with value: 0.5204749238819706 and parameters: {'learning_rate': 8.950230602808781e-05, 'weight_decay': 0.00022089135112266233, 'num_train_epochs': 3}. Best is trial 70 with value: 0.328465609801443.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights

Step,Training Loss
500,0.4445


[I 2023-07-18 16:23:14,111] Trial 81 finished with value: 0.3396207185616171 and parameters: {'learning_rate': 9.809293593834379e-05, 'weight_decay': 0.00018944353832128593, 'num_train_epochs': 5}. Best is trial 70 with value: 0.328465609801443.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights

Step,Training Loss
500,0.4465


[I 2023-07-18 16:25:40,752] Trial 82 finished with value: 0.341669099492238 and parameters: {'learning_rate': 8.236118131513216e-05, 'weight_decay': 0.00034831760621502794, 'num_train_epochs': 5}. Best is trial 70 with value: 0.328465609801443.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights 

Step,Training Loss
500,0.4371


[I 2023-07-18 16:28:04,526] Trial 83 finished with value: 0.33170816952124577 and parameters: {'learning_rate': 5.7389327611340856e-05, 'weight_decay': 0.00026828643697476156, 'num_train_epochs': 5}. Best is trial 70 with value: 0.328465609801443.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss
500,0.5216


[I 2023-07-18 16:30:29,107] Trial 84 finished with value: 0.4066020262868781 and parameters: {'learning_rate': 0.00011030761734668788, 'weight_decay': 0.00020097592712676828, 'num_train_epochs': 5}. Best is trial 70 with value: 0.328465609801443.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss
500,0.5912


[I 2023-07-18 16:32:53,676] Trial 85 finished with value: 0.47786622872029927 and parameters: {'learning_rate': 0.00015065054745239188, 'weight_decay': 0.00012947032977605585, 'num_train_epochs': 5}. Best is trial 70 with value: 0.328465609801443.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss
500,0.4375


[I 2023-07-18 16:35:18,683] Trial 86 finished with value: 0.3341560266968003 and parameters: {'learning_rate': 7.230028229724695e-05, 'weight_decay': 8.511330420650915e-05, 'num_train_epochs': 5}. Best is trial 70 with value: 0.328465609801443.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights 

Step,Training Loss
500,0.4501


[I 2023-07-18 16:37:44,155] Trial 87 finished with value: 0.3477379239591441 and parameters: {'learning_rate': 9.416187452576579e-05, 'weight_decay': 0.00010678994108790712, 'num_train_epochs': 5}. Best is trial 70 with value: 0.328465609801443.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights

Step,Training Loss
500,0.4929


[I 2023-07-18 16:39:42,987] Trial 88 finished with value: 0.46438204906040564 and parameters: {'learning_rate': 0.00012354873789547925, 'weight_decay': 0.00014625828264335563, 'num_train_epochs': 4}. Best is trial 70 with value: 0.328465609801443.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss
500,0.4265


[I 2023-07-18 16:42:08,149] Trial 89 finished with value: 0.3239022950480755 and parameters: {'learning_rate': 5.33350707976753e-05, 'weight_decay': 0.000244025553447447, 'num_train_epochs': 5}. Best is trial 89 with value: 0.3239022950480755.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights o

Step,Training Loss
500,0.4451


[I 2023-07-18 16:44:33,223] Trial 90 finished with value: 0.3421543085485473 and parameters: {'learning_rate': 4.7169084309596375e-05, 'weight_decay': 0.0003217566869257828, 'num_train_epochs': 5}. Best is trial 89 with value: 0.3239022950480755.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss
500,0.4481


[I 2023-07-18 16:46:57,442] Trial 91 finished with value: 0.3428603591775536 and parameters: {'learning_rate': 6.251690153464062e-05, 'weight_decay': 0.00023956778017501277, 'num_train_epochs': 5}. Best is trial 89 with value: 0.3239022950480755.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss
500,0.4342


[I 2023-07-18 16:49:23,243] Trial 92 finished with value: 0.3296056912357646 and parameters: {'learning_rate': 7.685496289639441e-05, 'weight_decay': 0.00027683572824533357, 'num_train_epochs': 5}. Best is trial 89 with value: 0.3239022950480755.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss
500,0.442


[I 2023-07-18 16:51:49,057] Trial 93 finished with value: 0.33620402149688033 and parameters: {'learning_rate': 5.398738214772251e-05, 'weight_decay': 0.0002889867068617554, 'num_train_epochs': 5}. Best is trial 89 with value: 0.3239022950480755.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss
500,0.4385


[I 2023-07-18 16:54:12,927] Trial 94 finished with value: 0.333145655008187 and parameters: {'learning_rate': 7.713036022802588e-05, 'weight_decay': 0.0004155116944665894, 'num_train_epochs': 5}. Best is trial 89 with value: 0.3239022950480755.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights 

Step,Training Loss
500,0.4409


[I 2023-07-18 16:56:39,877] Trial 95 finished with value: 0.3395717950691854 and parameters: {'learning_rate': 6.785319065859917e-05, 'weight_decay': 0.00024286863517837748, 'num_train_epochs': 5}. Best is trial 89 with value: 0.3239022950480755.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss
500,0.4383


[I 2023-07-18 16:59:05,152] Trial 96 finished with value: 0.33495831955644423 and parameters: {'learning_rate': 5.606906706257458e-05, 'weight_decay': 0.00017065479678179745, 'num_train_epochs': 5}. Best is trial 89 with value: 0.3239022950480755.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss
500,0.4347


[I 2023-07-18 17:01:30,476] Trial 97 finished with value: 0.3318729701795076 and parameters: {'learning_rate': 8.185740626647524e-05, 'weight_decay': 0.00032618812196055183, 'num_train_epochs': 5}. Best is trial 89 with value: 0.3239022950480755.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss
500,0.4444


[I 2023-07-18 17:03:55,284] Trial 98 finished with value: 0.34144688799865264 and parameters: {'learning_rate': 4.8552492181132545e-05, 'weight_decay': 0.0001888466050447699, 'num_train_epochs': 5}. Best is trial 89 with value: 0.3239022950480755.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss
500,0.5042


[I 2023-07-18 17:06:20,258] Trial 99 finished with value: 0.391081734707481 and parameters: {'learning_rate': 0.00010773956753567482, 'weight_decay': 0.0002693363877569236, 'num_train_epochs': 5}. Best is trial 89 with value: 0.3239022950480755.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights

Step,Training Loss
500,0.4423


[I 2023-07-18 17:08:44,660] Trial 100 finished with value: 0.3382279894405738 and parameters: {'learning_rate': 4.416245815476067e-05, 'weight_decay': 0.00021001387174255384, 'num_train_epochs': 5}. Best is trial 89 with value: 0.3239022950480755.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss
500,0.4476


[I 2023-07-18 17:11:11,903] Trial 101 finished with value: 0.3428186373603075 and parameters: {'learning_rate': 8.763036089822344e-05, 'weight_decay': 0.00015479839267246506, 'num_train_epochs': 5}. Best is trial 89 with value: 0.3239022950480755.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss
500,0.465


[I 2023-07-18 17:13:37,311] Trial 102 finished with value: 0.35398137802468205 and parameters: {'learning_rate': 9.523323430175622e-05, 'weight_decay': 0.0002248864510736907, 'num_train_epochs': 5}. Best is trial 89 with value: 0.3239022950480755.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss
500,0.4308


[I 2023-07-18 17:16:02,496] Trial 103 finished with value: 0.3269113788031098 and parameters: {'learning_rate': 7.566990390789883e-05, 'weight_decay': 0.0003630998783977515, 'num_train_epochs': 5}. Best is trial 89 with value: 0.3239022950480755.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss
500,0.4498


[I 2023-07-18 17:18:27,420] Trial 104 finished with value: 0.3456360680716378 and parameters: {'learning_rate': 6.515341273577374e-05, 'weight_decay': 0.0003737574728831456, 'num_train_epochs': 5}. Best is trial 89 with value: 0.3239022950480755.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss
500,0.4379


[I 2023-07-18 17:20:53,854] Trial 105 finished with value: 0.33561658070499734 and parameters: {'learning_rate': 7.762700699887871e-05, 'weight_decay': 0.0002875294296165263, 'num_train_epochs': 5}. Best is trial 89 with value: 0.3239022950480755.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss
500,0.4495


[I 2023-07-18 17:23:19,685] Trial 106 finished with value: 0.34350849345214385 and parameters: {'learning_rate': 5.9274292520444094e-05, 'weight_decay': 0.000321295549285643, 'num_train_epochs': 5}. Best is trial 89 with value: 0.3239022950480755.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss
500,0.449


[I 2023-07-18 17:25:45,771] Trial 107 finished with value: 0.3432826088783436 and parameters: {'learning_rate': 7.078202640925216e-05, 'weight_decay': 0.00045772165992912737, 'num_train_epochs': 5}. Best is trial 89 with value: 0.3239022950480755.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss
500,0.441


[I 2023-07-18 17:28:09,850] Trial 108 finished with value: 0.33654301471279974 and parameters: {'learning_rate': 5.418594756218671e-05, 'weight_decay': 0.0005474919567026117, 'num_train_epochs': 5}. Best is trial 89 with value: 0.3239022950480755.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss
500,0.5065


[I 2023-07-18 17:30:34,974] Trial 109 finished with value: 0.39623466721154693 and parameters: {'learning_rate': 0.0001106426340163977, 'weight_decay': 0.00037832926625291485, 'num_train_epochs': 5}. Best is trial 89 with value: 0.3239022950480755.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weig

Step,Training Loss
500,0.4327


[I 2023-07-18 17:33:00,963] Trial 110 finished with value: 0.32900874847756295 and parameters: {'learning_rate': 8.470444393002282e-05, 'weight_decay': 0.00024740629421056524, 'num_train_epochs': 5}. Best is trial 89 with value: 0.3239022950480755.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weig

Step,Training Loss
500,0.4434


[I 2023-07-18 17:35:27,384] Trial 111 finished with value: 0.3384883848348058 and parameters: {'learning_rate': 8.771736825590717e-05, 'weight_decay': 0.00024513418359871326, 'num_train_epochs': 5}. Best is trial 89 with value: 0.3239022950480755.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss
500,0.4294


[I 2023-07-18 17:37:50,553] Trial 112 finished with value: 0.32896753468907863 and parameters: {'learning_rate': 7.600582692623886e-05, 'weight_decay': 0.00034752300322505865, 'num_train_epochs': 5}. Best is trial 89 with value: 0.3239022950480755.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weig

Step,Training Loss
500,0.4484


[I 2023-07-18 17:40:15,739] Trial 113 finished with value: 0.3427361119062381 and parameters: {'learning_rate': 6.400998917957116e-05, 'weight_decay': 0.00034598654398947326, 'num_train_epochs': 5}. Best is trial 89 with value: 0.3239022950480755.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss
500,0.5368


[I 2023-07-18 17:42:39,487] Trial 114 finished with value: 0.4210437401793057 and parameters: {'learning_rate': 0.000105344761329584, 'weight_decay': 0.00041850843094562917, 'num_train_epochs': 5}. Best is trial 89 with value: 0.3239022950480755.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss
500,0.4197


[I 2023-07-18 17:45:06,533] Trial 115 finished with value: 0.3184867629431244 and parameters: {'learning_rate': 7.456002813629831e-05, 'weight_decay': 0.0002953648171116107, 'num_train_epochs': 5}. Best is trial 115 with value: 0.3184867629431244.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss
500,0.4343


[I 2023-07-18 17:47:29,785] Trial 116 finished with value: 0.3326426857396176 and parameters: {'learning_rate': 7.129032670837882e-05, 'weight_decay': 0.0002966944506598524, 'num_train_epochs': 5}. Best is trial 115 with value: 0.3184867629431244.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss
500,0.4399


[I 2023-07-18 17:49:57,156] Trial 117 finished with value: 0.33481596645556 and parameters: {'learning_rate': 5.2462396925072326e-05, 'weight_decay': 0.0002635215001616187, 'num_train_epochs': 5}. Best is trial 115 with value: 0.3184867629431244.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss
500,0.4335


[I 2023-07-18 17:52:21,532] Trial 118 finished with value: 0.32946283440840873 and parameters: {'learning_rate': 8.448113321085435e-05, 'weight_decay': 0.0003555479609862259, 'num_train_epochs': 5}. Best is trial 115 with value: 0.3184867629431244.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weig

Step,Training Loss
500,0.4379


[I 2023-07-18 17:54:45,396] Trial 119 finished with value: 0.3333432946886335 and parameters: {'learning_rate': 8.671230607140931e-05, 'weight_decay': 0.00043338639449488225, 'num_train_epochs': 5}. Best is trial 115 with value: 0.3184867629431244.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weig

Step,Training Loss
500,0.4435


[I 2023-07-18 17:57:11,332] Trial 120 finished with value: 0.33911620082711813 and parameters: {'learning_rate': 4.4009994293200296e-05, 'weight_decay': 0.00034453444992359303, 'num_train_epochs': 5}. Best is trial 115 with value: 0.3184867629431244.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some we

Step,Training Loss
500,0.4438


[I 2023-07-18 17:59:35,146] Trial 121 finished with value: 0.33919904572623116 and parameters: {'learning_rate': 7.740160303029775e-05, 'weight_decay': 0.0003872873024246259, 'num_train_epochs': 5}. Best is trial 115 with value: 0.3184867629431244.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weig

Step,Training Loss
500,0.4461


[I 2023-07-18 18:02:02,351] Trial 122 finished with value: 0.34027424073757084 and parameters: {'learning_rate': 6.213049448753813e-05, 'weight_decay': 0.00029174549046994677, 'num_train_epochs': 5}. Best is trial 115 with value: 0.3184867629431244.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some wei

Step,Training Loss
500,0.5164


[I 2023-07-18 18:04:26,382] Trial 123 finished with value: 0.40073871899368174 and parameters: {'learning_rate': 0.00012328080196254422, 'weight_decay': 0.00047633784906175414, 'num_train_epochs': 5}. Best is trial 115 with value: 0.3184867629431244.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some we

Step,Training Loss
500,0.4643


[I 2023-07-18 18:06:50,734] Trial 124 finished with value: 0.3539062137890579 and parameters: {'learning_rate': 9.5207430346081e-05, 'weight_decay': 0.0003259232151266855, 'num_train_epochs': 5}. Best is trial 115 with value: 0.3184867629431244.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights

Step,Training Loss
500,0.436


[I 2023-07-18 18:09:15,479] Trial 125 finished with value: 0.3342960759213096 and parameters: {'learning_rate': 7.349246105317668e-05, 'weight_decay': 0.0002515244149617616, 'num_train_epochs': 5}. Best is trial 115 with value: 0.3184867629431244.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss
500,0.4491


[I 2023-07-18 18:11:39,039] Trial 126 finished with value: 0.34231528339529393 and parameters: {'learning_rate': 5.905293791513107e-05, 'weight_decay': 0.00020805955756182423, 'num_train_epochs': 5}. Best is trial 115 with value: 0.3184867629431244.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some wei

Step,Training Loss
500,0.4357


[I 2023-07-18 18:14:04,845] Trial 127 finished with value: 0.33041852631963287 and parameters: {'learning_rate': 8.518707989810698e-05, 'weight_decay': 0.0003751827805596012, 'num_train_epochs': 5}. Best is trial 115 with value: 0.3184867629431244.


In [21]:
best_lr = float(study.best_params['learning_rate'])
best_weight_decay = float(study.best_params['weight_decay'])
best_epoch = int(study.best_params['num_train_epochs'])

print("Optimal hyperparamers:")
print(f'Learning rate\t: {best_lr}')
print(f'Weight decay\t: {best_weight_decay}')
print(f'Epochs\t\t: {best_epoch}')

Optimal hyperparamers:
Learning rate	: 7.456002813629831e-05
Weight decay	: 0.0002953648171116107
Epochs		: 5


In [22]:
# Save best hyperparameters
data = {
    'learning_rate': best_lr,
    'weight_decay': best_weight_decay,
    'epoch': best_epoch
}

# Serializing json
json_object = json.dumps(data, indent=4)

# Writing to sample.json
with open(f'{save_dir}/best_hyperparameters.json', "w") as outfile:
    outfile.write(json_object)

## Training the best model

In [23]:
# Variables
load_dir = "jcblaise/roberta-tagalog-base"
save_dir = "./Models/standard-final"
hyperparameter_dir = "./Models/standard-tuning"

In [24]:
# Load saved hyperparameters
with open(f'{hyperparameter_dir}/best_hyperparameters.json') as f:
    hyperparameters = json.load(f)

hyperparameters

{'learning_rate': 7.456002813629831e-05,
 'weight_decay': 0.0002953648171116107,
 'epoch': 5}

In [25]:
config = RobertaConfig.from_pretrained(load_dir, num_labels=num_labels)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = RobertaAbusiveClassification.from_pretrained(load_dir, config=config)
model.to(device)
model.train()

training_args = TrainingArguments(
    output_dir= save_dir,
    learning_rate=hyperparameters['learning_rate'],
    weight_decay=hyperparameters['weight_decay'],
    num_train_epochs=hyperparameters['epoch'],
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    report_to = "none")

# Initialize Trainer class and train
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = train_dataset,
    eval_dataset = val_dataset,
    compute_metrics = compute_metrics)
result = trainer.train()

# Evaluate model and save metrics and predictions
model.eval()
predictions = []
metrics = trainer.evaluate()
trainer.save_metrics('validation', metrics)
save_preds(f'{save_dir}/validation_predictions.json', predictions.tolist())

# Save trained model
model.save_pretrained(f'{save_dir}/model')
tokenizer.save_pretrained(f'{save_dir}/model')

Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaAbusiveClassification were not initialized from the model checkpoint at jcblaise/roberta-tagalog-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.out_proj.weig

Step,Training Loss
500,0.4283


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


('./Models/standard-final/model\\tokenizer_config.json',
 './Models/standard-final/model\\special_tokens_map.json',
 './Models/standard-final/model\\vocab.json',
 './Models/standard-final/model\\merges.txt',
 './Models/standard-final/model\\added_tokens.json',
 './Models/standard-final/model\\tokenizer.json')

In [26]:
colnames = ["Accuracy", "Precision", "Recall", "F1"]
values = np.array(list(metrics.values())[1:-4])
val_df = pd.DataFrame(values.reshape(4, 4), columns=colnames, index=["Average"] + classes)

print("Validation set metrics:")
val_df

Validation set metrics:


Unnamed: 0,Accuracy,Precision,Recall,F1
Average,0.676991,0.688843,0.676991,0.678448
Explicit,0.75,0.5625,0.75,0.642857
Implicit,0.746667,0.557511,0.746667,0.638372
Non-abusive,0.571429,0.326531,0.571429,0.415584


In [27]:
predictions = []
metrics = trainer.evaluate(test_dataset)
trainer.save_metrics('test', metrics)
save_preds(f'{save_dir}/test_predictions.json', predictions.tolist())

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [28]:
colnames = ["Accuracy", "Precision", "Recall", "F1"]
values = np.array(list(metrics.values())[1:-4])
test_df = pd.DataFrame(values.reshape(4, 4), columns=colnames, index=["Average"] + classes)

print("Test set metrics:")
test_df

Test set metrics:


Unnamed: 0,Accuracy,Precision,Recall,F1
Average,0.718062,0.730146,0.718062,0.721571
Explicit,0.830769,0.690178,0.830769,0.753975
Implicit,0.776316,0.602666,0.776316,0.678558
Non-abusive,0.581395,0.338021,0.581395,0.427497


# Hierarchical multiclass classifier RoBERTa

## Preparation for models

In [8]:
def compute_metrics_main(eval_pred):
    global predictions
    acc_metric = evaluate.load("accuracy")
    pre_metric = evaluate.load("precision")
    rec_metric = evaluate.load("recall")
    f1_metric = evaluate.load("f1")
    
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    
    accuracy = acc_metric.compute(predictions=predictions, references=labels)["accuracy"]
    precision = pre_metric.compute(predictions=predictions, references=labels, average="weighted")["precision"]
    recall = rec_metric.compute(predictions=predictions, references=labels, average="weighted")["recall"]
    f1 = f1_metric.compute(predictions=predictions, references=labels, average="weighted")["f1"]
    
    # Calculate per-class metrics
    results = []
    for i in range(0, 2):
        x = labels[predictions == i]
        y = predictions[predictions == i]
        
        class_accuracy = acc_metric.compute(predictions=y, references=x)["accuracy"]
        class_precision = pre_metric.compute(predictions=y, references=x, average="weighted")["precision"]
        class_recall = rec_metric.compute(predictions=y, references=x, average="weighted")["recall"]
        class_f1 = f1_metric.compute(predictions=y, references=x, average="weighted")["f1"]
        results += [class_accuracy, class_precision, class_recall, class_f1]
    
    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1":f1,
           "NA_accuracy": results[0], "NA_precision": results[1], "NA_recall": results[2], "NA_f1": results[3],
           "A_accuracy": results[4], "A_precision": results[5], "A_recall": results[6], "A_f1": results[7]}

In [9]:
def compute_metrics_sub(eval_pred):
    global predictions
    acc_metric = evaluate.load("accuracy")
    pre_metric = evaluate.load("precision")
    rec_metric = evaluate.load("recall")
    f1_metric = evaluate.load("f1")
    
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    
    accuracy = acc_metric.compute(predictions=predictions, references=labels)["accuracy"]
    precision = pre_metric.compute(predictions=predictions, references=labels, average="weighted")["precision"]
    recall = rec_metric.compute(predictions=predictions, references=labels, average="weighted")["recall"]
    f1 = f1_metric.compute(predictions=predictions, references=labels, average="weighted")["f1"]
    
    # Calculate per-class metrics
    results = []
    for i in range(0, 2):
        x = labels[predictions == i]
        y = predictions[predictions == i]
        
        class_accuracy = acc_metric.compute(predictions=y, references=x)["accuracy"]
        class_precision = pre_metric.compute(predictions=y, references=x, average="weighted")["precision"]
        class_recall = rec_metric.compute(predictions=y, references=x, average="weighted")["recall"]
        class_f1 = f1_metric.compute(predictions=y, references=x, average="weighted")["f1"]
        results += [class_accuracy, class_precision, class_recall, class_f1]
    
    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1":f1,
           "E_accuracy": results[0], "E_precision": results[1], "E_recall": results[2], "E_f1": results[3],
           "I_accuracy": results[4], "I_precision": results[5], "I_recall": results[6], "I_f1": results[7]}

In [10]:
def load_preds(directory):
    # Load files
    f = open(f'{directory}/test_predictions.json')
    preds = [json.load(f)]
    f.close()

    return np.array(preds[0]['predictions'])

In [11]:
# Variables
num_labels = 2
load_dir = "jcblaise/roberta-tagalog-base"

# Get tokenizer from repository
tokenizer = AutoTokenizer.from_pretrained(load_dir, model_max_length=256)

def tokenize_function(data):
    return tokenizer(data["text"], padding="max_length", truncation=True)

In [13]:
def convert_abusive_encoding(x):
    if x != 0:
        return 1
    return 0

# Load and format CSVs
df = load_csv("train", True)
df["Class"] = df["Class"].apply(convert_abusive_encoding)
main_train_dataset = format_dataset(df)

df = load_csv("test", True)
df["Class"] = df["Class"].apply(convert_abusive_encoding)
main_test_dataset = format_dataset(df)

df = load_csv("validate", True)
df["Class"] = df["Class"].apply(convert_abusive_encoding)
main_val_dataset = format_dataset(df)

                                                                                                                       

In [14]:
def convert_sub_encoding(x):
    if x == 2:
        return 1
    return 0

# Prepare datasets composed of abusive text
df = load_csv("train", True)
df = df[df["Class"] != 0]
sub_train_dataset = format_dataset(df)

# Prepare datasets composed of abusive text
df = load_csv("test", True)
df = df[df["Class"] != 0]
sub_test_dataset = format_dataset(df)

df = load_csv("validate", True)
df = df[df["Class"] != 0]
sub_val_dataset = format_dataset(df)

                                                                                                                       

## Train initial models

### Main model (NA v A)

In [34]:
save_dir = "./Models/hierarchical-initial/main"

# Create directories
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

In [35]:
# Attach classification head and prepare model for trainer
config = RobertaConfig.from_pretrained(load_dir, num_labels=num_labels)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
training_args = TrainingArguments(
    output_dir = save_dir,
    report_to = "none")

# Initialize model
model = RobertaAbusiveClassification.from_pretrained(load_dir, config=config)
model.to(device)
model.train()

# Initialize Trainer class and train
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = main_train_dataset,
    eval_dataset = main_val_dataset,
    compute_metrics = compute_metrics_main)
result = trainer.train()

# Evaluate model and save metrics and predictions
model.eval()
predictions = []
metrics = trainer.evaluate()
trainer.save_metrics('validation', metrics)
save_preds(f'{save_dir}/validation_predictions.json', predictions.tolist())

# Save trained model
model.save_pretrained(f'{save_dir}/model')
tokenizer.save_pretrained(f'{save_dir}/model')

Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaAbusiveClassification were not initialized from the model checkpoint at jcblaise/roberta-tagalog-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.out_proj.weig

Step,Training Loss


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


('./Models/hierarchical-initial/main/model\\tokenizer_config.json',
 './Models/hierarchical-initial/main/model\\special_tokens_map.json',
 './Models/hierarchical-initial/main/model\\vocab.json',
 './Models/hierarchical-initial/main/model\\merges.txt',
 './Models/hierarchical-initial/main/model\\added_tokens.json',
 './Models/hierarchical-initial/main/model\\tokenizer.json')

In [36]:
colnames = ["Accuracy", "Precision", "Recall", "F1"]
values = np.array(list(metrics.values())[1:-4])
val_df = pd.DataFrame(values.reshape(3, 4), columns=colnames, index=["Average", "Non-Abusive", "Abusive"])

print("Validation set metrics:")
val_df

Validation set metrics:


Unnamed: 0,Accuracy,Precision,Recall,F1
Average,0.814159,0.810454,0.814159,0.808711
Non-Abusive,0.770492,0.593658,0.770492,0.670613
Abusive,0.830303,0.689403,0.830303,0.753321


In [37]:
predictions = []
metrics = trainer.evaluate(main_test_dataset)
trainer.save_metrics('test', metrics)
save_preds(f'{save_dir}/test_predictions.json', predictions.tolist())

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [38]:
colnames = ["Accuracy", "Precision", "Recall", "F1"]
values = np.array(list(metrics.values())[1:-4])
test_df = pd.DataFrame(values.reshape(3, 4), columns=colnames, index=["Average", "Non-Abusive", "Abusive"])

print("Test set metrics:")
test_df

Test set metrics:


Unnamed: 0,Accuracy,Precision,Recall,F1
Average,0.837004,0.835006,0.837004,0.83556
Non-Abusive,0.774648,0.600079,0.774648,0.67628
Abusive,0.865385,0.748891,0.865385,0.802934


### Sub model (E v I)

In [15]:
save_dir = "./Models/hierarchical-initial/sub"

# Create directories
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

In [41]:
# Attach classification head and prepare model for trainer
config = RobertaConfig.from_pretrained(load_dir, num_labels=num_labels)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
training_args = TrainingArguments(
    output_dir = save_dir,
    report_to = "none")

# Initialize model
model = RobertaAbusiveClassification.from_pretrained(load_dir, config=config)
model.to(device)
model.train()

# Initialize Trainer class and train
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = sub_train_dataset,
    eval_dataset = sub_val_dataset,
    compute_metrics = compute_metrics_sub)
result = trainer.train()

# Evaluate model and save metrics and predictions
model.eval()
predictions = []
metrics = trainer.evaluate()
trainer.save_metrics('validation', metrics)
save_preds(f'{save_dir}/validation_predictions.json', predictions.tolist())

# Save trained model
model.save_pretrained(f'{save_dir}/model')
tokenizer.save_pretrained(f'{save_dir}/model')

Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaAbusiveClassification were not initialized from the model checkpoint at jcblaise/roberta-tagalog-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.out_proj.weig

Step,Training Loss


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


('./Models/hierarchical-initial/sub/model\\tokenizer_config.json',
 './Models/hierarchical-initial/sub/model\\special_tokens_map.json',
 './Models/hierarchical-initial/sub/model\\vocab.json',
 './Models/hierarchical-initial/sub/model\\merges.txt',
 './Models/hierarchical-initial/sub/model\\added_tokens.json',
 './Models/hierarchical-initial/sub/model\\tokenizer.json')

In [42]:
colnames = ["Accuracy", "Precision", "Recall", "F1"]
values = np.array(list(metrics.values())[1:-4])
val_df = pd.DataFrame(values.reshape(3, 4), columns=colnames, index=["Average", "Explicit", "Implicit"])

print("Validation set metrics:")
val_df

Validation set metrics:


Unnamed: 0,Accuracy,Precision,Recall,F1
Average,0.748344,0.748563,0.748344,0.748322
Explicit,0.74026,0.547984,0.74026,0.629773
Implicit,0.756757,0.572681,0.756757,0.651975


In [47]:
predictions = []
metrics = trainer.evaluate(sub_test_dataset)
trainer.save_metrics('test', metrics)
save_preds(f'{save_dir}/test_predictions.json', predictions.tolist())

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [49]:
colnames = ["Accuracy", "Precision", "Recall", "F1"]
values = np.array(list(metrics.values())[1:-4])
test_df = pd.DataFrame(values.reshape(3, 4), columns=colnames, index=["Average", "Explicit", "Implicit"])

print("Test set metrics:")
test_df

Test set metrics:


Unnamed: 0,Accuracy,Precision,Recall,F1
Average,0.774834,0.775529,0.774834,0.774637
Explicit,0.7625,0.581406,0.7625,0.659752
Implicit,0.788732,0.622099,0.788732,0.695575


**Test using only correct Abusive predictions**

In [50]:
predictions = load_preds("./Models/hierarchical-initial/main")
df = load_csv("test", True)

# Create filtered sub model test dataset from predictions of main model
NA_indices = df[df["Class"] == 0].index.tolist()
A_indices = np.argwhere(predictions != 0).flatten()
A_indices = [x for x in A_indices if x not in NA_indices]

df = df.iloc[A_indices]
filtered_test_dataset = format_dataset(df)
filtered_test_dataset

                                                                                                                       

Dataset({
    features: ['labels', '__index_level_0__', 'input_ids', 'attention_mask'],
    num_rows: 135
})

In [52]:
predictions = []
metrics = trainer.evaluate(filtered_test_dataset)
trainer.save_metrics('test_filtered', metrics)
save_preds(f'{save_dir}/test_filtered_predictions.json', predictions.tolist())

colnames = ["Accuracy", "Precision", "Recall", "F1"]
values = np.array(list(metrics.values())[1:-4])
test_df = pd.DataFrame(values.reshape(3, 4), columns=colnames, index=["Average", "Explicit", "Implicit"])

print("Filtered test set metrics:")
test_df

Filtered test set metrics:


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Accuracy,Precision,Recall,F1
Average,0.77037,0.77047,0.77037,0.769348
Explicit,0.769231,0.591716,0.769231,0.668896
Implicit,0.77193,0.595876,0.77193,0.672573


## Fine-tune pre-trained models

### Main model (NA v A)

In [45]:
save_dir = "./Models/hierarchical-tuning/main"

# Create directories
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

In [46]:
def objective(trial: optuna.Trial):
    config = RobertaConfig.from_pretrained(load_dir, num_labels=num_labels)
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    model = RobertaAbusiveClassification.from_pretrained(load_dir, config=config)
    model.to(device)
    model.train()
    
    training_args = TrainingArguments(         
        output_dir= save_dir,
        optim = "adamw_torch",
        learning_rate = trial.suggest_float("learning_rate", 4e-5, 0.01, log=True),         
        weight_decay = trial.suggest_float("weight_decay", 4e-5, 0.01, log=True),         
        num_train_epochs = trial.suggest_int("num_train_epochs", low=2, high=5),         
        per_device_train_batch_size = 8,         
        per_device_eval_batch_size = 8,
        report_to = "none")

    trainer = Trainer(
        model = model,
        args = training_args,
        train_dataset = main_train_dataset,
        eval_dataset = main_val_dataset)  
    
    result = trainer.train()     
    return result.training_loss

In [47]:
study = optuna.create_study(study_name="hp-search-robertaH-main", direction="minimize") 
study.optimize(func=objective, n_trials=128)

[I 2023-07-18 18:19:32,756] A new study created in memory with name: hp-search-robertaH-main
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaAbusiveClassification were not initialized from the model checkpoint at jcblaise/roberta-tagalog-base and are newly initialized: ['classifier.

Step,Training Loss


[I 2023-07-18 18:20:28,425] Trial 0 finished with value: 0.3698821677301163 and parameters: {'learning_rate': 7.924610166825681e-05, 'weight_decay': 0.004973586884825482, 'num_train_epochs': 2}. Best is trial 0 with value: 0.3698821677301163.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of

Step,Training Loss
500,0.8057


[I 2023-07-18 18:22:22,617] Trial 1 finished with value: 0.7969553022456348 and parameters: {'learning_rate': 0.004005065781711942, 'weight_decay': 9.006508469742897e-05, 'num_train_epochs': 4}. Best is trial 0 with value: 0.3698821677301163.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of

Step,Training Loss
500,0.2571


[I 2023-07-18 18:24:46,275] Trial 2 finished with value: 0.19745998418420777 and parameters: {'learning_rate': 7.126222232963731e-05, 'weight_decay': 0.008340276242823026, 'num_train_epochs': 5}. Best is trial 2 with value: 0.19745998418420777.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights 

Step,Training Loss
500,0.9589


[I 2023-07-18 18:27:08,608] Trial 3 finished with value: 0.9072831921111372 and parameters: {'learning_rate': 0.006707233406564497, 'weight_decay': 4.436410388862246e-05, 'num_train_epochs': 5}. Best is trial 2 with value: 0.19745998418420777.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights o

Step,Training Loss
500,0.7192


[I 2023-07-18 18:29:05,940] Trial 4 finished with value: 0.713288142268819 and parameters: {'learning_rate': 0.0017192959331252904, 'weight_decay': 0.0010407830203928796, 'num_train_epochs': 4}. Best is trial 2 with value: 0.19745998418420777.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights o

Step,Training Loss
500,0.4284


[I 2023-07-18 18:31:03,697] Trial 5 finished with value: 0.4160873567251335 and parameters: {'learning_rate': 0.0001654057477334289, 'weight_decay': 0.00023746647152694206, 'num_train_epochs': 4}. Best is trial 2 with value: 0.19745998418420777.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights

Step,Training Loss


[I 2023-07-18 18:31:59,427] Trial 6 finished with value: 0.4101974946215637 and parameters: {'learning_rate': 0.00013899472639991193, 'weight_decay': 0.005304613618452322, 'num_train_epochs': 2}. Best is trial 2 with value: 0.19745998418420777.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights 

Step,Training Loss
500,0.6744


[I 2023-07-18 18:34:25,428] Trial 7 finished with value: 0.6680062200790061 and parameters: {'learning_rate': 0.00040219566437886957, 'weight_decay': 0.00758268767330388, 'num_train_epochs': 5}. Best is trial 2 with value: 0.19745998418420777.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights o

Step,Training Loss
500,0.2481


[I 2023-07-18 18:36:24,186] Trial 8 finished with value: 0.23488052996029532 and parameters: {'learning_rate': 0.00011575342571692708, 'weight_decay': 0.0001740941315238374, 'num_train_epochs': 4}. Best is trial 2 with value: 0.19745998418420777.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss


[I 2023-07-18 18:37:19,712] Trial 9 finished with value: 0.38448525909194375 and parameters: {'learning_rate': 9.33634400533019e-05, 'weight_decay': 5.1161921405949965e-05, 'num_train_epochs': 2}. Best is trial 2 with value: 0.19745998418420777.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights

Step,Training Loss


[I 2023-07-18 18:38:44,320] Trial 10 finished with value: 0.28248833295396697 and parameters: {'learning_rate': 4.243235629951345e-05, 'weight_decay': 0.001590364953936504, 'num_train_epochs': 3}. Best is trial 2 with value: 0.19745998418420777.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights

Step,Training Loss
500,0.6694


[I 2023-07-18 18:41:10,303] Trial 11 finished with value: 0.6615009049723919 and parameters: {'learning_rate': 0.00044920118581327225, 'weight_decay': 0.0003936904798952658, 'num_train_epochs': 5}. Best is trial 2 with value: 0.19745998418420777.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss


[I 2023-07-18 18:42:34,506] Trial 12 finished with value: 0.2876944290964227 and parameters: {'learning_rate': 4.217057767004489e-05, 'weight_decay': 0.002474176034664449, 'num_train_epochs': 3}. Best is trial 2 with value: 0.19745998418420777.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights 

Step,Training Loss
500,0.6746


[I 2023-07-18 18:45:00,718] Trial 13 finished with value: 0.6646769358699484 and parameters: {'learning_rate': 0.00023137447554462284, 'weight_decay': 0.0006172353781939355, 'num_train_epochs': 5}. Best is trial 2 with value: 0.19745998418420777.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss
500,0.6871


[I 2023-07-18 18:46:55,575] Trial 14 finished with value: 0.6826297752839282 and parameters: {'learning_rate': 0.0009528450701958629, 'weight_decay': 0.009871765794006275, 'num_train_epochs': 4}. Best is trial 2 with value: 0.19745998418420777.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights 

Step,Training Loss


[I 2023-07-18 18:48:19,304] Trial 15 finished with value: 0.6771501813616071 and parameters: {'learning_rate': 0.0002897702970501656, 'weight_decay': 0.0027100474180854445, 'num_train_epochs': 3}. Best is trial 2 with value: 0.19745998418420777.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights

Step,Training Loss
500,0.2773


[I 2023-07-18 18:50:43,717] Trial 16 finished with value: 0.2116707730114012 and parameters: {'learning_rate': 8.1809579079647e-05, 'weight_decay': 0.00019543014021743718, 'num_train_epochs': 5}. Best is trial 2 with value: 0.19745998418420777.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights 

Step,Training Loss
500,0.2426


[I 2023-07-18 18:53:07,119] Trial 17 finished with value: 0.18468800863825288 and parameters: {'learning_rate': 6.932984986188197e-05, 'weight_decay': 0.0008049001529475193, 'num_train_epochs': 5}. Best is trial 17 with value: 0.18468800863825288.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss
500,0.6711


[I 2023-07-18 18:55:33,733] Trial 18 finished with value: 0.6630775394296288 and parameters: {'learning_rate': 0.0002334627981352316, 'weight_decay': 0.0009754062109808732, 'num_train_epochs': 5}. Best is trial 17 with value: 0.18468800863825288.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss
500,0.2392


[I 2023-07-18 18:58:00,866] Trial 19 finished with value: 0.1834207122487233 and parameters: {'learning_rate': 5.397933205908749e-05, 'weight_decay': 0.0029240823377069507, 'num_train_epochs': 5}. Best is trial 19 with value: 0.1834207122487233.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights

Step,Training Loss
500,0.6838


[I 2023-07-18 19:00:08,571] Trial 20 finished with value: 0.6793974754505587 and parameters: {'learning_rate': 0.0007469348076255606, 'weight_decay': 0.0021101819761789333, 'num_train_epochs': 4}. Best is trial 19 with value: 0.1834207122487233.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights

Step,Training Loss
500,0.2553


[I 2023-07-18 19:02:36,113] Trial 21 finished with value: 0.1955277148942302 and parameters: {'learning_rate': 6.55585783969359e-05, 'weight_decay': 0.003802131954419325, 'num_train_epochs': 5}. Best is trial 19 with value: 0.1834207122487233.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights o

Step,Training Loss
500,0.2536


[I 2023-07-18 19:05:05,997] Trial 22 finished with value: 0.1969628678228622 and parameters: {'learning_rate': 4.224726868718084e-05, 'weight_decay': 0.0034340025406912784, 'num_train_epochs': 5}. Best is trial 19 with value: 0.1834207122487233.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights

Step,Training Loss
500,0.2432


[I 2023-07-18 19:07:29,704] Trial 23 finished with value: 0.18680334987496972 and parameters: {'learning_rate': 6.719787603116116e-05, 'weight_decay': 0.0012172785293327283, 'num_train_epochs': 5}. Best is trial 19 with value: 0.1834207122487233.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss
500,0.3757


[I 2023-07-18 19:09:55,306] Trial 24 finished with value: 0.3376629707508517 and parameters: {'learning_rate': 0.00014385177938930688, 'weight_decay': 0.0011582051055848365, 'num_train_epochs': 5}. Best is trial 19 with value: 0.1834207122487233.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss
500,0.2426


[I 2023-07-18 19:11:48,672] Trial 25 finished with value: 0.23094418936205985 and parameters: {'learning_rate': 6.309044639875502e-05, 'weight_decay': 0.0016123469420706099, 'num_train_epochs': 4}. Best is trial 19 with value: 0.1834207122487233.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss
500,0.2951


[I 2023-07-18 19:14:11,911] Trial 26 finished with value: 0.23342696598597934 and parameters: {'learning_rate': 0.00011761808437261847, 'weight_decay': 0.0006403888579125334, 'num_train_epochs': 5}. Best is trial 19 with value: 0.1834207122487233.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss
500,0.3924


[I 2023-07-18 19:16:05,649] Trial 27 finished with value: 0.38218920840356585 and parameters: {'learning_rate': 0.0001872595115189173, 'weight_decay': 0.0017448129850598708, 'num_train_epochs': 4}. Best is trial 19 with value: 0.1834207122487233.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss
500,0.2389


[I 2023-07-18 19:18:28,924] Trial 28 finished with value: 0.18227613778938925 and parameters: {'learning_rate': 4.96993229605129e-05, 'weight_decay': 0.0027588312673944315, 'num_train_epochs': 5}. Best is trial 28 with value: 0.18227613778938925.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss


[I 2023-07-18 19:19:51,667] Trial 29 finished with value: 0.30282179753583177 and parameters: {'learning_rate': 9.958978227326512e-05, 'weight_decay': 0.005047859611128303, 'num_train_epochs': 3}. Best is trial 28 with value: 0.18227613778938925.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss
500,0.2495


[I 2023-07-18 19:22:16,241] Trial 30 finished with value: 0.19222722412052012 and parameters: {'learning_rate': 5.920236510540236e-05, 'weight_decay': 0.0033711908206409944, 'num_train_epochs': 5}. Best is trial 28 with value: 0.18227613778938925.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss
500,0.2409


[I 2023-07-18 19:24:36,205] Trial 31 finished with value: 0.18525776970655397 and parameters: {'learning_rate': 5.189043409590228e-05, 'weight_decay': 0.0023649190437152563, 'num_train_epochs': 5}. Best is trial 28 with value: 0.18227613778938925.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss
500,0.242


[I 2023-07-18 19:26:55,568] Trial 32 finished with value: 0.1867660457926585 and parameters: {'learning_rate': 4.1717068428804615e-05, 'weight_decay': 0.002405894087945979, 'num_train_epochs': 5}. Best is trial 28 with value: 0.18227613778938925.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss
500,0.2479


[I 2023-07-18 19:29:17,790] Trial 33 finished with value: 0.19035697126747075 and parameters: {'learning_rate': 9.108544838407165e-05, 'weight_decay': 0.004384160397597933, 'num_train_epochs': 5}. Best is trial 28 with value: 0.18227613778938925.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss
500,0.2411


[I 2023-07-18 19:31:36,757] Trial 34 finished with value: 0.18423658080567096 and parameters: {'learning_rate': 5.0934288654104495e-05, 'weight_decay': 0.006556021644526041, 'num_train_epochs': 5}. Best is trial 28 with value: 0.18227613778938925.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss
500,0.2461


[I 2023-07-18 19:33:28,656] Trial 35 finished with value: 0.2339344472813427 and parameters: {'learning_rate': 8.233770735727382e-05, 'weight_decay': 0.0057974285837120694, 'num_train_epochs': 4}. Best is trial 28 with value: 0.18227613778938925.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss
500,0.2661


[I 2023-07-18 19:35:47,518] Trial 36 finished with value: 0.20252274947058885 and parameters: {'learning_rate': 6.0036844504066754e-05, 'weight_decay': 0.007224190806372046, 'num_train_epochs': 5}. Best is trial 28 with value: 0.18227613778938925.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss
500,0.2657


[I 2023-07-18 19:37:40,978] Trial 37 finished with value: 0.2510950871204075 and parameters: {'learning_rate': 0.00014715705217138283, 'weight_decay': 0.006333583160528776, 'num_train_epochs': 4}. Best is trial 28 with value: 0.18227613778938925.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss
500,0.2981


[I 2023-07-18 19:40:00,436] Trial 38 finished with value: 0.23722442110678307 and parameters: {'learning_rate': 0.00010929170156865344, 'weight_decay': 0.004442197923440166, 'num_train_epochs': 5}. Best is trial 28 with value: 0.18227613778938925.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss
500,0.2405


[I 2023-07-18 19:41:53,494] Trial 39 finished with value: 0.2277482818616064 and parameters: {'learning_rate': 5.453168912215109e-05, 'weight_decay': 0.008379932134901057, 'num_train_epochs': 4}. Best is trial 28 with value: 0.18227613778938925.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights

Step,Training Loss
500,0.253


[I 2023-07-18 19:44:12,246] Trial 40 finished with value: 0.19427010134646766 and parameters: {'learning_rate': 7.958034346384701e-05, 'weight_decay': 0.003367863787973371, 'num_train_epochs': 5}. Best is trial 28 with value: 0.18227613778938925.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss
500,0.2411


[I 2023-07-18 19:46:34,587] Trial 41 finished with value: 0.18362452105471963 and parameters: {'learning_rate': 5.4325311175720166e-05, 'weight_decay': 0.0029542097361642916, 'num_train_epochs': 5}. Best is trial 28 with value: 0.18227613778938925.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weig

Step,Training Loss
500,0.2387


[I 2023-07-18 19:48:54,152] Trial 42 finished with value: 0.18336659517503323 and parameters: {'learning_rate': 5.011120511334064e-05, 'weight_decay': 0.005845490581182441, 'num_train_epochs': 5}. Best is trial 28 with value: 0.18227613778938925.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss
500,0.2412


[I 2023-07-18 19:51:17,702] Trial 43 finished with value: 0.18512012187699625 and parameters: {'learning_rate': 5.215637980952352e-05, 'weight_decay': 0.006012762688203755, 'num_train_epochs': 5}. Best is trial 28 with value: 0.18227613778938925.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss
500,0.2917


[I 2023-07-18 19:53:38,814] Trial 44 finished with value: 0.2254560857787168 and parameters: {'learning_rate': 0.0001131255620809632, 'weight_decay': 0.004853768089296231, 'num_train_epochs': 5}. Best is trial 28 with value: 0.18227613778938925.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights

Step,Training Loss
500,0.2444


[I 2023-07-18 19:55:58,782] Trial 45 finished with value: 0.18812341582506223 and parameters: {'learning_rate': 4.11637675500724e-05, 'weight_decay': 0.008794258109385864, 'num_train_epochs': 5}. Best is trial 28 with value: 0.18227613778938925.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights

Step,Training Loss


[I 2023-07-18 19:56:54,186] Trial 46 finished with value: 0.3708636348408864 and parameters: {'learning_rate': 7.912955733088905e-05, 'weight_decay': 0.003031180643776904, 'num_train_epochs': 2}. Best is trial 28 with value: 0.18227613778938925.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights

Step,Training Loss
500,0.2398


[I 2023-07-18 19:58:46,619] Trial 47 finished with value: 0.22875200424875533 and parameters: {'learning_rate': 5.287134012974465e-05, 'weight_decay': 0.007015865496549271, 'num_train_epochs': 4}. Best is trial 28 with value: 0.18227613778938925.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss
500,0.3844


[I 2023-07-18 20:01:06,078] Trial 48 finished with value: 0.3368850077005257 and parameters: {'learning_rate': 0.00012327426917912044, 'weight_decay': 0.004542828046159868, 'num_train_epochs': 5}. Best is trial 28 with value: 0.18227613778938925.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss
500,0.2508


[I 2023-07-18 20:03:25,123] Trial 49 finished with value: 0.1897489265391701 and parameters: {'learning_rate': 8.877639907290019e-05, 'weight_decay': 0.009645257175467693, 'num_train_epochs': 5}. Best is trial 28 with value: 0.18227613778938925.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights

Step,Training Loss
500,0.6393


[I 2023-07-18 20:05:17,146] Trial 50 finished with value: 0.6378028948504225 and parameters: {'learning_rate': 0.0001732624330425262, 'weight_decay': 0.0018856743063435598, 'num_train_epochs': 4}. Best is trial 28 with value: 0.18227613778938925.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss
500,0.2554


[I 2023-07-18 20:07:36,008] Trial 51 finished with value: 0.19997906075384383 and parameters: {'learning_rate': 6.868908539994634e-05, 'weight_decay': 0.003000584538330522, 'num_train_epochs': 5}. Best is trial 28 with value: 0.18227613778938925.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss
500,0.2399


[I 2023-07-18 20:09:54,862] Trial 52 finished with value: 0.1845054949136605 and parameters: {'learning_rate': 4.7518879436206416e-05, 'weight_decay': 0.00262254657922696, 'num_train_epochs': 5}. Best is trial 28 with value: 0.18227613778938925.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights

Step,Training Loss
500,0.2381


[I 2023-07-18 20:12:13,714] Trial 53 finished with value: 0.18086845928565004 and parameters: {'learning_rate': 4.817391461366549e-05, 'weight_decay': 0.003826880119155053, 'num_train_epochs': 5}. Best is trial 53 with value: 0.18086845928565004.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss
500,0.2528


[I 2023-07-18 20:14:32,976] Trial 54 finished with value: 0.19584749336529494 and parameters: {'learning_rate': 4.0034456406379294e-05, 'weight_decay': 0.003787198443277496, 'num_train_epochs': 5}. Best is trial 53 with value: 0.18086845928565004.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss
500,0.2395


[I 2023-07-18 20:16:58,320] Trial 55 finished with value: 0.18438070268559276 and parameters: {'learning_rate': 5.368659845986421e-05, 'weight_decay': 0.005363712402617912, 'num_train_epochs': 5}. Best is trial 53 with value: 0.18086845928565004.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss
500,0.2414


[I 2023-07-18 20:19:25,348] Trial 56 finished with value: 0.18610255807862247 and parameters: {'learning_rate': 7.365922779127664e-05, 'weight_decay': 0.004053825035772263, 'num_train_epochs': 5}. Best is trial 53 with value: 0.18086845928565004.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss
500,0.2389


[I 2023-07-18 20:21:56,613] Trial 57 finished with value: 0.1833980994116991 and parameters: {'learning_rate': 4.759049997129343e-05, 'weight_decay': 0.006920681710240379, 'num_train_epochs': 5}. Best is trial 53 with value: 0.18086845928565004.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights

Step,Training Loss
500,0.241


[I 2023-07-18 20:24:21,887] Trial 58 finished with value: 0.1844817018150387 and parameters: {'learning_rate': 6.418371307457702e-05, 'weight_decay': 0.002039200188309822, 'num_train_epochs': 5}. Best is trial 53 with value: 0.18086845928565004.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights

Step,Training Loss


[I 2023-07-18 20:25:50,611] Trial 59 finished with value: 0.2910577683221726 and parameters: {'learning_rate': 9.171697293066189e-05, 'weight_decay': 0.002900613770087015, 'num_train_epochs': 3}. Best is trial 53 with value: 0.18086845928565004.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights

Step,Training Loss
500,0.2485


[I 2023-07-18 20:28:21,307] Trial 60 finished with value: 0.19140552721525494 and parameters: {'learning_rate': 4.553616784811231e-05, 'weight_decay': 0.0014347390760707135, 'num_train_epochs': 5}. Best is trial 53 with value: 0.18086845928565004.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss
500,0.2536


[I 2023-07-18 20:30:43,636] Trial 61 finished with value: 0.19336253198465905 and parameters: {'learning_rate': 4.98345712702031e-05, 'weight_decay': 0.007331311488680101, 'num_train_epochs': 5}. Best is trial 53 with value: 0.18086845928565004.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights

Step,Training Loss
500,0.247


[I 2023-07-18 20:33:07,399] Trial 62 finished with value: 0.1898801911146121 and parameters: {'learning_rate': 6.061617625858567e-05, 'weight_decay': 0.006052779951502128, 'num_train_epochs': 5}. Best is trial 53 with value: 0.18086845928565004.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights

Step,Training Loss
500,0.2379


[I 2023-07-18 20:35:29,320] Trial 63 finished with value: 0.18122225040779974 and parameters: {'learning_rate': 4.805115185289545e-05, 'weight_decay': 0.003751265018573267, 'num_train_epochs': 5}. Best is trial 53 with value: 0.18086845928565004.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss
500,0.2454


[I 2023-07-18 20:37:55,746] Trial 64 finished with value: 0.19077994913086854 and parameters: {'learning_rate': 7.149907683870555e-05, 'weight_decay': 0.0038298458094841357, 'num_train_epochs': 5}. Best is trial 53 with value: 0.18086845928565004.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss
500,0.3054


[I 2023-07-18 20:40:34,426] Trial 65 finished with value: 0.24659560497542074 and parameters: {'learning_rate': 0.00010322855542217611, 'weight_decay': 0.002252680539119426, 'num_train_epochs': 5}. Best is trial 53 with value: 0.18086845928565004.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss
500,0.2405


[I 2023-07-18 20:43:36,404] Trial 66 finished with value: 0.18439474034130124 and parameters: {'learning_rate': 4.045275655413196e-05, 'weight_decay': 0.004954316570510438, 'num_train_epochs': 5}. Best is trial 53 with value: 0.18086845928565004.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss
500,0.2419


[I 2023-07-18 20:46:23,746] Trial 67 finished with value: 0.1857559673768237 and parameters: {'learning_rate': 6.133686797411941e-05, 'weight_decay': 0.0032898562815378823, 'num_train_epochs': 5}. Best is trial 53 with value: 0.18086845928565004.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss
500,0.2393


[I 2023-07-18 20:49:10,645] Trial 68 finished with value: 0.18246625706665498 and parameters: {'learning_rate': 4.910577559803989e-05, 'weight_decay': 0.0026257854868008508, 'num_train_epochs': 5}. Best is trial 53 with value: 0.18086845928565004.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss
500,0.2311


[I 2023-07-18 20:51:28,813] Trial 69 finished with value: 0.2201843996693317 and parameters: {'learning_rate': 7.498318945178837e-05, 'weight_decay': 0.0019539446036914874, 'num_train_epochs': 4}. Best is trial 53 with value: 0.18086845928565004.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss
500,0.2471


[I 2023-07-18 20:54:18,317] Trial 70 finished with value: 0.19011476703156205 and parameters: {'learning_rate': 4.607858458872111e-05, 'weight_decay': 0.0025232635170691414, 'num_train_epochs': 5}. Best is trial 53 with value: 0.18086845928565004.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss
500,0.2476


[I 2023-07-18 20:57:07,300] Trial 71 finished with value: 0.18943612916128977 and parameters: {'learning_rate': 4.672979741604707e-05, 'weight_decay': 0.0037440875384343207, 'num_train_epochs': 5}. Best is trial 53 with value: 0.18086845928565004.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss
500,0.2385


[I 2023-07-18 20:59:55,375] Trial 72 finished with value: 0.18498776084498356 and parameters: {'learning_rate': 5.896542285618048e-05, 'weight_decay': 0.002922627777798093, 'num_train_epochs': 5}. Best is trial 53 with value: 0.18086845928565004.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss
500,0.2513


[I 2023-07-18 21:02:42,209] Trial 73 finished with value: 0.1941499416093181 and parameters: {'learning_rate': 9.13026515877592e-05, 'weight_decay': 0.005368094354210921, 'num_train_epochs': 5}. Best is trial 53 with value: 0.18086845928565004.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights 

Step,Training Loss
500,0.2393


[I 2023-07-18 21:05:42,177] Trial 74 finished with value: 0.1842680235554401 and parameters: {'learning_rate': 5.373435031996541e-05, 'weight_decay': 0.004340105997568169, 'num_train_epochs': 5}. Best is trial 53 with value: 0.18086845928565004.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights

Step,Training Loss
500,0.2465


[I 2023-07-18 21:08:36,445] Trial 75 finished with value: 0.191411152459625 and parameters: {'learning_rate': 7.140047110841365e-05, 'weight_decay': 0.008190942285422068, 'num_train_epochs': 5}. Best is trial 53 with value: 0.18086845928565004.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights 

Step,Training Loss
500,0.2452


[I 2023-07-18 21:10:59,814] Trial 76 finished with value: 0.18948120031141696 and parameters: {'learning_rate': 4.007097028579556e-05, 'weight_decay': 0.0022702764111552387, 'num_train_epochs': 5}. Best is trial 53 with value: 0.18086845928565004.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss
500,0.2411


[I 2023-07-18 21:13:20,914] Trial 77 finished with value: 0.18558114668480435 and parameters: {'learning_rate': 4.721427376920618e-05, 'weight_decay': 0.0016149501724582892, 'num_train_epochs': 5}. Best is trial 53 with value: 0.18086845928565004.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss
500,0.2381


[I 2023-07-18 21:15:42,027] Trial 78 finished with value: 0.18440715710919603 and parameters: {'learning_rate': 5.882782743115332e-05, 'weight_decay': 0.0034097456983804026, 'num_train_epochs': 5}. Best is trial 53 with value: 0.18086845928565004.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss
500,0.2424


[I 2023-07-18 21:18:02,513] Trial 79 finished with value: 0.18407635581224485 and parameters: {'learning_rate': 6.64612603686472e-05, 'weight_decay': 0.004241362787477925, 'num_train_epochs': 5}. Best is trial 53 with value: 0.18086845928565004.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights

Step,Training Loss
500,0.2596


[I 2023-07-18 21:19:55,871] Trial 80 finished with value: 0.24796104072628164 and parameters: {'learning_rate': 8.193872549718587e-05, 'weight_decay': 0.006501999545152725, 'num_train_epochs': 4}. Best is trial 53 with value: 0.18086845928565004.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss
500,0.2586


[I 2023-07-18 21:22:16,252] Trial 81 finished with value: 0.197885425108716 and parameters: {'learning_rate': 6.30971646817012e-05, 'weight_decay': 0.00426423695348275, 'num_train_epochs': 5}. Best is trial 53 with value: 0.18086845928565004.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of

Step,Training Loss
500,0.2404


[I 2023-07-18 21:24:36,736] Trial 82 finished with value: 0.18332342025928927 and parameters: {'learning_rate': 4.896890713118737e-05, 'weight_decay': 0.00529509661412302, 'num_train_epochs': 5}. Best is trial 53 with value: 0.18086845928565004.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights

Step,Training Loss
500,0.2414


[I 2023-07-18 21:26:59,482] Trial 83 finished with value: 0.1842342123949438 and parameters: {'learning_rate': 4.858415015951296e-05, 'weight_decay': 0.005198213578240168, 'num_train_epochs': 5}. Best is trial 53 with value: 0.18086845928565004.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights

Step,Training Loss
500,0.2394


[I 2023-07-18 21:29:20,376] Trial 84 finished with value: 0.18312837450127853 and parameters: {'learning_rate': 5.354562726372148e-05, 'weight_decay': 0.005751052409867561, 'num_train_epochs': 5}. Best is trial 53 with value: 0.18086845928565004.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss
500,0.2354


[I 2023-07-18 21:31:40,873] Trial 85 finished with value: 0.18100569732207106 and parameters: {'learning_rate': 4.511334404090645e-05, 'weight_decay': 0.00759321031218791, 'num_train_epochs': 5}. Best is trial 53 with value: 0.18086845928565004.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights

Step,Training Loss
500,0.2617


[I 2023-07-18 21:34:02,384] Trial 86 finished with value: 0.20076152722638352 and parameters: {'learning_rate': 4.542295961890322e-05, 'weight_decay': 0.008137970183647597, 'num_train_epochs': 5}. Best is trial 53 with value: 0.18086845928565004.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss
500,0.2546


[I 2023-07-18 21:36:23,900] Trial 87 finished with value: 0.19338493365094178 and parameters: {'learning_rate': 8.352058218051856e-05, 'weight_decay': 0.007045433493028381, 'num_train_epochs': 5}. Best is trial 53 with value: 0.18086845928565004.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss
500,0.3089


[I 2023-07-18 21:38:44,387] Trial 88 finished with value: 0.25620376185366983 and parameters: {'learning_rate': 9.87414931389036e-05, 'weight_decay': 0.009710537922941265, 'num_train_epochs': 5}. Best is trial 53 with value: 0.18086845928565004.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights

Step,Training Loss
500,0.2479


[I 2023-07-18 21:41:06,102] Trial 89 finished with value: 0.1906604766845703 and parameters: {'learning_rate': 4.006691675018917e-05, 'weight_decay': 0.00611738949238798, 'num_train_epochs': 5}. Best is trial 53 with value: 0.18086845928565004.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights 

Step,Training Loss


[I 2023-07-18 21:42:01,606] Trial 90 finished with value: 0.36523348586003584 and parameters: {'learning_rate': 5.61243162776826e-05, 'weight_decay': 0.005502240505224668, 'num_train_epochs': 2}. Best is trial 53 with value: 0.18086845928565004.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights

Step,Training Loss
500,0.242


[I 2023-07-18 21:44:22,096] Trial 91 finished with value: 0.18465860463622819 and parameters: {'learning_rate': 4.999485310312523e-05, 'weight_decay': 0.005025845916411321, 'num_train_epochs': 5}. Best is trial 53 with value: 0.18086845928565004.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss


[I 2023-07-18 21:45:45,049] Trial 92 finished with value: 0.28746505010695683 and parameters: {'learning_rate': 6.766890665662244e-05, 'weight_decay': 0.007872830905543015, 'num_train_epochs': 3}. Best is trial 53 with value: 0.18086845928565004.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss
500,0.2505


[I 2023-07-18 21:48:05,531] Trial 93 finished with value: 0.19206904683794293 and parameters: {'learning_rate': 4.5787511567216e-05, 'weight_decay': 0.006773033674893683, 'num_train_epochs': 5}. Best is trial 53 with value: 0.18086845928565004.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights 

Step,Training Loss
500,0.2405


[I 2023-07-18 21:50:29,011] Trial 94 finished with value: 0.18383516799238392 and parameters: {'learning_rate': 5.6070973075223206e-05, 'weight_decay': 0.004952213754526865, 'num_train_epochs': 5}. Best is trial 53 with value: 0.18086845928565004.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss
500,0.2482


[I 2023-07-18 21:52:49,544] Trial 95 finished with value: 0.19174411637442454 and parameters: {'learning_rate': 7.65020677857344e-05, 'weight_decay': 0.005997063625425274, 'num_train_epochs': 5}. Best is trial 53 with value: 0.18086845928565004.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights

Step,Training Loss
500,0.2403


[I 2023-07-18 21:55:11,714] Trial 96 finished with value: 0.18433703515762673 and parameters: {'learning_rate': 5.2793267309974666e-05, 'weight_decay': 0.003783773008383783, 'num_train_epochs': 5}. Best is trial 53 with value: 0.18086845928565004.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss
500,0.24


[I 2023-07-18 21:57:32,203] Trial 97 finished with value: 0.18326947169196336 and parameters: {'learning_rate': 4.459893950589771e-05, 'weight_decay': 0.00473637668307955, 'num_train_epochs': 5}. Best is trial 53 with value: 0.18086845928565004.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights

Step,Training Loss
500,0.2553


[I 2023-07-18 21:59:53,918] Trial 98 finished with value: 0.19503930134880812 and parameters: {'learning_rate': 4.3248087029260145e-05, 'weight_decay': 0.004562260183994904, 'num_train_epochs': 5}. Best is trial 53 with value: 0.18086845928565004.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss
500,0.24


[I 2023-07-18 22:02:14,611] Trial 99 finished with value: 0.18280355267058637 and parameters: {'learning_rate': 6.379678698365954e-05, 'weight_decay': 0.005686313376725664, 'num_train_epochs': 5}. Best is trial 53 with value: 0.18086845928565004.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss
500,0.2421


[I 2023-07-18 22:04:34,900] Trial 100 finished with value: 0.18561281727668935 and parameters: {'learning_rate': 6.443812617733438e-05, 'weight_decay': 0.005665103172422751, 'num_train_epochs': 5}. Best is trial 53 with value: 0.18086845928565004.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss
500,0.239


[I 2023-07-18 22:06:55,796] Trial 101 finished with value: 0.18242444239164654 and parameters: {'learning_rate': 4.9154864605821146e-05, 'weight_decay': 0.00755265963074593, 'num_train_epochs': 5}. Best is trial 53 with value: 0.18086845928565004.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss
500,0.2385


[I 2023-07-18 22:09:16,289] Trial 102 finished with value: 0.18445865265408853 and parameters: {'learning_rate': 5.898757210948116e-05, 'weight_decay': 0.008991560986406752, 'num_train_epochs': 5}. Best is trial 53 with value: 0.18086845928565004.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss
500,0.2439


[I 2023-07-18 22:11:39,442] Trial 103 finished with value: 0.18841005590625276 and parameters: {'learning_rate': 7.273529825569935e-05, 'weight_decay': 0.007796691432995937, 'num_train_epochs': 5}. Best is trial 53 with value: 0.18086845928565004.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss
500,0.2393


[I 2023-07-18 22:13:59,735] Trial 104 finished with value: 0.18289495697595123 and parameters: {'learning_rate': 5.05681898343097e-05, 'weight_decay': 0.0046251304184578915, 'num_train_epochs': 5}. Best is trial 53 with value: 0.18086845928565004.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss
500,0.2534


[I 2023-07-18 22:16:22,446] Trial 105 finished with value: 0.19589326865690992 and parameters: {'learning_rate': 4.299333205761585e-05, 'weight_decay': 0.004529968566001756, 'num_train_epochs': 5}. Best is trial 53 with value: 0.18086845928565004.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss
500,0.2385


[I 2023-07-18 22:18:42,758] Trial 106 finished with value: 0.18280123517029268 and parameters: {'learning_rate': 5.577570780040395e-05, 'weight_decay': 0.0034862954582248486, 'num_train_epochs': 5}. Best is trial 53 with value: 0.18086845928565004.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weig

Step,Training Loss
500,0.2382


[I 2023-07-18 22:21:03,485] Trial 107 finished with value: 0.18244318065786722 and parameters: {'learning_rate': 5.51559864948835e-05, 'weight_decay': 0.002643909656085254, 'num_train_epochs': 5}. Best is trial 53 with value: 0.18086845928565004.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss
500,0.2408


[I 2023-07-18 22:23:28,186] Trial 108 finished with value: 0.18536956686722605 and parameters: {'learning_rate': 5.753964561671079e-05, 'weight_decay': 0.002669324196000331, 'num_train_epochs': 5}. Best is trial 53 with value: 0.18086845928565004.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss
500,0.2685


[I 2023-07-18 22:25:48,618] Trial 109 finished with value: 0.21001248538942266 and parameters: {'learning_rate': 8.457625581118079e-05, 'weight_decay': 0.003332031696457107, 'num_train_epochs': 5}. Best is trial 53 with value: 0.18086845928565004.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss
500,0.2425


[I 2023-07-18 22:28:12,189] Trial 110 finished with value: 0.1837569224207025 and parameters: {'learning_rate': 6.643968854605702e-05, 'weight_decay': 0.0036798326826178313, 'num_train_epochs': 5}. Best is trial 53 with value: 0.18086845928565004.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss
500,0.2386


[I 2023-07-18 22:30:32,515] Trial 111 finished with value: 0.18195708066897284 and parameters: {'learning_rate': 4.434254866901661e-05, 'weight_decay': 0.0031338732464839297, 'num_train_epochs': 5}. Best is trial 53 with value: 0.18086845928565004.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weig

Step,Training Loss
500,0.2401


[I 2023-07-18 22:32:57,132] Trial 112 finished with value: 0.18414229808893418 and parameters: {'learning_rate': 5.2802081374011296e-05, 'weight_decay': 0.0027759762026779503, 'num_train_epochs': 5}. Best is trial 53 with value: 0.18086845928565004.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some wei

Step,Training Loss
500,0.2666


[I 2023-07-18 22:35:18,054] Trial 113 finished with value: 0.205402784419239 and parameters: {'learning_rate': 4.017490066176095e-05, 'weight_decay': 0.004013769778992708, 'num_train_epochs': 5}. Best is trial 53 with value: 0.18086845928565004.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights

Step,Training Loss
500,0.2404


[I 2023-07-18 22:37:39,580] Trial 114 finished with value: 0.1840863625806077 and parameters: {'learning_rate': 6.121436423607594e-05, 'weight_decay': 0.0025101572879081915, 'num_train_epochs': 5}. Best is trial 53 with value: 0.18086845928565004.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss
500,0.2411


[I 2023-07-18 22:40:01,531] Trial 115 finished with value: 0.18485660768093023 and parameters: {'learning_rate': 5.221039093893928e-05, 'weight_decay': 0.0031176400706957293, 'num_train_epochs': 5}. Best is trial 53 with value: 0.18086845928565004.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weig

Step,Training Loss
500,0.242


[I 2023-07-18 22:42:22,040] Trial 116 finished with value: 0.1864206572224323 and parameters: {'learning_rate': 7.30772260585931e-05, 'weight_decay': 0.002183880645624178, 'num_train_epochs': 5}. Best is trial 53 with value: 0.18086845928565004.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights

Step,Training Loss
500,0.2465


[I 2023-07-18 22:44:43,160] Trial 117 finished with value: 0.18846008472872855 and parameters: {'learning_rate': 4.6376885589829765e-05, 'weight_decay': 0.003586098914701958, 'num_train_epochs': 5}. Best is trial 53 with value: 0.18086845928565004.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weig

Step,Training Loss
500,0.2427


[I 2023-07-18 22:47:03,674] Trial 118 finished with value: 0.1852897454025154 and parameters: {'learning_rate': 5.6715461514112396e-05, 'weight_decay': 0.003365268440140757, 'num_train_epochs': 5}. Best is trial 53 with value: 0.18086845928565004.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss
500,0.2434


[I 2023-07-18 22:49:24,175] Trial 119 finished with value: 0.18792134371018948 and parameters: {'learning_rate': 6.582743640651842e-05, 'weight_decay': 0.0029858603931939443, 'num_train_epochs': 5}. Best is trial 53 with value: 0.18086845928565004.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weig

Step,Training Loss
500,0.251


[I 2023-07-18 22:51:45,308] Trial 120 finished with value: 0.19406637822775016 and parameters: {'learning_rate': 7.809885172098516e-05, 'weight_decay': 0.0024399479068225676, 'num_train_epochs': 5}. Best is trial 53 with value: 0.18086845928565004.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weig

Step,Training Loss
500,0.2515


[I 2023-07-18 22:54:12,800] Trial 121 finished with value: 0.19212580874450225 and parameters: {'learning_rate': 4.402334803947955e-05, 'weight_decay': 0.004487126154457663, 'num_train_epochs': 5}. Best is trial 53 with value: 0.18086845928565004.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss
500,0.2361


[I 2023-07-18 22:56:37,742] Trial 122 finished with value: 0.1813016859212316 and parameters: {'learning_rate': 4.475028245129669e-05, 'weight_decay': 0.004059019306439742, 'num_train_epochs': 5}. Best is trial 53 with value: 0.18086845928565004.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss
500,0.2385


[I 2023-07-18 22:59:02,835] Trial 123 finished with value: 0.1825867348147514 and parameters: {'learning_rate': 4.978164692574383e-05, 'weight_decay': 0.0038364719140930492, 'num_train_epochs': 5}. Best is trial 53 with value: 0.18086845928565004.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss
500,0.2388


[I 2023-07-18 23:01:28,085] Trial 124 finished with value: 0.18150537211195866 and parameters: {'learning_rate': 4.833932105790857e-05, 'weight_decay': 0.003999826217569698, 'num_train_epochs': 5}. Best is trial 53 with value: 0.18086845928565004.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss
500,0.2389


[I 2023-07-18 23:03:52,615] Trial 125 finished with value: 0.18251371043069023 and parameters: {'learning_rate': 4.454445496012115e-05, 'weight_decay': 0.003175834749367224, 'num_train_epochs': 5}. Best is trial 53 with value: 0.18086845928565004.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss
500,0.2373


[I 2023-07-18 23:06:17,858] Trial 126 finished with value: 0.18160812478316457 and parameters: {'learning_rate': 4.468676561693775e-05, 'weight_decay': 0.0026601842989247555, 'num_train_epochs': 5}. Best is trial 53 with value: 0.18086845928565004.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weig

Step,Training Loss


[I 2023-07-18 23:07:43,024] Trial 127 finished with value: 0.2907696296099135 and parameters: {'learning_rate': 4.012645327141113e-05, 'weight_decay': 0.002201506946379743, 'num_train_epochs': 3}. Best is trial 53 with value: 0.18086845928565004.


In [48]:
best_lr = float(study.best_params['learning_rate'])
best_weight_decay = float(study.best_params['weight_decay'])
best_epoch = int(study.best_params['num_train_epochs'])

print("Optimal hyperparamers:")
print(f'Learning rate\t: {best_lr}')
print(f'Weight decay\t: {best_weight_decay}')
print(f'Epochs\t\t: {best_epoch}')

Optimal hyperparamers:
Learning rate	: 4.817391461366549e-05
Weight decay	: 0.003826880119155053
Epochs		: 5


In [49]:
# Save best hyperparameters
data = {
    'learning_rate': best_lr,
    'weight_decay': best_weight_decay,
    'epoch': best_epoch
}

# Serializing json
json_object = json.dumps(data, indent=4)

# Writing to sample.json
with open(f'{save_dir}/best_hyperparameters.json', "w") as outfile:
    outfile.write(json_object)

### Sub model (E v I)

In [67]:
save_dir = "./Models/hierarchical-tuning/sub"

# Create directories
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

In [14]:
def objective(trial: optuna.Trial):
    config = RobertaConfig.from_pretrained(load_dir, num_labels=num_labels)
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    model = RobertaAbusiveClassification.from_pretrained(load_dir, config=config)
    model.to(device)
    model.train()
    
    training_args = TrainingArguments(         
        output_dir= save_dir,
        optim = "adamw_torch",
        learning_rate = trial.suggest_float("learning_rate", 4e-5, 0.01, log=True),         
        weight_decay = trial.suggest_float("weight_decay", 4e-5, 0.01, log=True),         
        num_train_epochs = trial.suggest_int("num_train_epochs", low=2, high=5),         
        per_device_train_batch_size = 8,         
        per_device_eval_batch_size = 8,
        report_to = "none")

    trainer = Trainer(
        model = model,
        args = training_args,
        train_dataset = sub_train_dataset,
        eval_dataset = sub_val_dataset)  
    
    result = trainer.train()     
    return result.training_loss

In [15]:
study = optuna.create_study(study_name="hp-search-robertaH-sub", direction="minimize") 
study.optimize(func=objective, n_trials=128)

[I 2023-07-20 04:10:45,256] A new study created in memory with name: hp-search-robertaH-sub
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaAbusiveClassification were not initialized from the model checkpoint at jcblaise/roberta-tagalog-base and are newly initialized: ['classifier.o

Step,Training Loss


[I 2023-07-20 04:11:43,626] Trial 0 finished with value: 0.6394448713822798 and parameters: {'learning_rate': 0.00012179589962964528, 'weight_decay': 0.00015712083708170174, 'num_train_epochs': 3}. Best is trial 0 with value: 0.6394448713822798.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights

Step,Training Loss


[I 2023-07-20 04:12:17,036] Trial 1 finished with value: 0.8888439698652788 and parameters: {'learning_rate': 0.0014413291881208384, 'weight_decay': 0.0002625848732460458, 'num_train_epochs': 2}. Best is trial 0 with value: 0.6394448713822798.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights o

Step,Training Loss


[I 2023-07-20 04:13:40,015] Trial 2 finished with value: 0.8469607960094105 and parameters: {'learning_rate': 0.0018928310011835563, 'weight_decay': 0.0006879330505381637, 'num_train_epochs': 5}. Best is trial 0 with value: 0.6394448713822798.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights o

Step,Training Loss


[I 2023-07-20 04:14:49,455] Trial 3 finished with value: 0.7574059746482156 and parameters: {'learning_rate': 0.000522992242679723, 'weight_decay': 0.0005482274450803609, 'num_train_epochs': 4}. Best is trial 0 with value: 0.6394448713822798.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of

Step,Training Loss


[I 2023-07-20 04:15:46,734] Trial 4 finished with value: 0.9754793571703362 and parameters: {'learning_rate': 0.004585798935881452, 'weight_decay': 0.004963925729071655, 'num_train_epochs': 3}. Best is trial 0 with value: 0.6394448713822798.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of 

Step,Training Loss


[I 2023-07-20 04:17:21,322] Trial 5 finished with value: 0.2293059609153054 and parameters: {'learning_rate': 5.904832927219576e-05, 'weight_decay': 0.00022823273035295797, 'num_train_epochs': 5}. Best is trial 5 with value: 0.2293059609153054.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights 

Step,Training Loss


[I 2023-07-20 04:18:17,859] Trial 6 finished with value: 0.754189462372751 and parameters: {'learning_rate': 0.0005393676628035888, 'weight_decay': 0.0002813454504662628, 'num_train_epochs': 3}. Best is trial 5 with value: 0.2293059609153054.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of

Step,Training Loss


[I 2023-07-20 04:19:13,461] Trial 7 finished with value: 1.2454569267504143 and parameters: {'learning_rate': 0.008685056057565633, 'weight_decay': 0.0002555924285355664, 'num_train_epochs': 3}. Best is trial 5 with value: 0.2293059609153054.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of

Step,Training Loss


[I 2023-07-20 04:20:31,789] Trial 8 finished with value: 0.8094717372547496 and parameters: {'learning_rate': 0.001505352549965447, 'weight_decay': 0.0013646640066446241, 'num_train_epochs': 4}. Best is trial 5 with value: 0.2293059609153054.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of

Step,Training Loss


[I 2023-07-20 04:21:27,387] Trial 9 finished with value: 0.8012845010468455 and parameters: {'learning_rate': 0.0008672150777834965, 'weight_decay': 0.0001250245123181237, 'num_train_epochs': 3}. Best is trial 5 with value: 0.2293059609153054.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights o

Step,Training Loss


[I 2023-07-20 04:22:58,520] Trial 10 finished with value: 0.2248676820234819 and parameters: {'learning_rate': 5.298146934977288e-05, 'weight_decay': 5.013904222994834e-05, 'num_train_epochs': 5}. Best is trial 10 with value: 0.2248676820234819.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights

Step,Training Loss


[I 2023-07-20 04:24:29,764] Trial 11 finished with value: 0.23111780340021307 and parameters: {'learning_rate': 4.136549225969705e-05, 'weight_decay': 5.4317922286836754e-05, 'num_train_epochs': 5}. Best is trial 10 with value: 0.2248676820234819.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss


[I 2023-07-20 04:26:01,099] Trial 12 finished with value: 0.23182966058904475 and parameters: {'learning_rate': 4.197312807744377e-05, 'weight_decay': 5.0557593724929624e-05, 'num_train_epochs': 5}. Best is trial 10 with value: 0.2248676820234819.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss


[I 2023-07-20 04:27:32,439] Trial 13 finished with value: 0.3531486511230469 and parameters: {'learning_rate': 0.00011747222395207439, 'weight_decay': 4.188874143894031e-05, 'num_train_epochs': 5}. Best is trial 10 with value: 0.2248676820234819.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss


[I 2023-07-20 04:28:45,544] Trial 14 finished with value: 0.3858473084189675 and parameters: {'learning_rate': 0.00011807819934237959, 'weight_decay': 9.679602208795652e-05, 'num_train_epochs': 4}. Best is trial 10 with value: 0.2248676820234819.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss


[I 2023-07-20 04:30:17,187] Trial 15 finished with value: 0.697346982088956 and parameters: {'learning_rate': 0.0002500487815826918, 'weight_decay': 7.589415979598756e-05, 'num_train_epochs': 5}. Best is trial 10 with value: 0.2248676820234819.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights 

Step,Training Loss


[I 2023-07-20 04:31:30,717] Trial 16 finished with value: 0.29282905838706275 and parameters: {'learning_rate': 6.989339469007923e-05, 'weight_decay': 9.917648903933095e-05, 'num_train_epochs': 4}. Best is trial 10 with value: 0.2248676820234819.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss


[I 2023-07-20 04:32:07,975] Trial 17 finished with value: 0.6544132666154341 and parameters: {'learning_rate': 0.0002408645655002566, 'weight_decay': 0.0001478589568997225, 'num_train_epochs': 2}. Best is trial 10 with value: 0.2248676820234819.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights

Step,Training Loss


[I 2023-07-20 04:33:39,224] Trial 18 finished with value: 0.24873495968905363 and parameters: {'learning_rate': 7.33560220896472e-05, 'weight_decay': 4.603455238215234e-05, 'num_train_epochs': 5}. Best is trial 10 with value: 0.2248676820234819.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights

Step,Training Loss


[I 2023-07-20 04:34:52,429] Trial 19 finished with value: 0.7348045869307085 and parameters: {'learning_rate': 0.00028246654363665597, 'weight_decay': 7.622989047938792e-05, 'num_train_epochs': 4}. Best is trial 10 with value: 0.2248676820234819.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss


[I 2023-07-20 04:36:23,678] Trial 20 finished with value: 0.24022638147527522 and parameters: {'learning_rate': 5.883358322244355e-05, 'weight_decay': 0.00017980609870406342, 'num_train_epochs': 5}. Best is trial 10 with value: 0.2248676820234819.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss


[I 2023-07-20 04:37:55,318] Trial 21 finished with value: 0.22527353113347834 and parameters: {'learning_rate': 4.469544254252516e-05, 'weight_decay': 4.0259428902822724e-05, 'num_train_epochs': 5}. Best is trial 10 with value: 0.2248676820234819.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss


[I 2023-07-20 04:39:26,968] Trial 22 finished with value: 0.23111293099143287 and parameters: {'learning_rate': 4.1367281882357746e-05, 'weight_decay': 6.43826548244507e-05, 'num_train_epochs': 5}. Best is trial 10 with value: 0.2248676820234819.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss


[I 2023-07-20 04:40:40,173] Trial 23 finished with value: 0.34145435419949616 and parameters: {'learning_rate': 8.98064154969122e-05, 'weight_decay': 8.049321648634743e-05, 'num_train_epochs': 4}. Best is trial 10 with value: 0.2248676820234819.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights

Step,Training Loss


[I 2023-07-20 04:42:11,628] Trial 24 finished with value: 0.23556112809614702 and parameters: {'learning_rate': 6.316192814451152e-05, 'weight_decay': 4.055431950622401e-05, 'num_train_epochs': 5}. Best is trial 10 with value: 0.2248676820234819.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss


[I 2023-07-20 04:43:42,852] Trial 25 finished with value: 0.4081850225275213 and parameters: {'learning_rate': 0.000166916724685419, 'weight_decay': 0.00010095558114342787, 'num_train_epochs': 5}. Best is trial 10 with value: 0.2248676820234819.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights

Step,Training Loss


[I 2023-07-20 04:44:56,287] Trial 26 finished with value: 0.2740499323064631 and parameters: {'learning_rate': 5.6568976160733515e-05, 'weight_decay': 6.950982877509242e-05, 'num_train_epochs': 4}. Best is trial 10 with value: 0.2248676820234819.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss


[I 2023-07-20 04:46:27,414] Trial 27 finished with value: 0.3043392181396484 and parameters: {'learning_rate': 9.485411301960028e-05, 'weight_decay': 0.0001293493431339494, 'num_train_epochs': 5}. Best is trial 10 with value: 0.2248676820234819.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights

Step,Training Loss


[I 2023-07-20 04:47:41,034] Trial 28 finished with value: 0.6172976493835449 and parameters: {'learning_rate': 0.0001699902186585125, 'weight_decay': 4.067040481114941e-05, 'num_train_epochs': 4}. Best is trial 10 with value: 0.2248676820234819.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights

Step,Training Loss


[I 2023-07-20 04:49:12,068] Trial 29 finished with value: 0.27853476784446024 and parameters: {'learning_rate': 9.380673931725938e-05, 'weight_decay': 0.0001826890509144229, 'num_train_epochs': 5}. Best is trial 10 with value: 0.2248676820234819.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss


[I 2023-07-20 04:50:43,617] Trial 30 finished with value: 0.7382892955433239 and parameters: {'learning_rate': 0.00014148675732671522, 'weight_decay': 8.364780814248145e-05, 'num_train_epochs': 5}. Best is trial 10 with value: 0.2248676820234819.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss


[I 2023-07-20 04:52:14,843] Trial 31 finished with value: 0.23335557417436079 and parameters: {'learning_rate': 4.08575848270174e-05, 'weight_decay': 6.407949269163214e-05, 'num_train_epochs': 5}. Best is trial 10 with value: 0.2248676820234819.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights

Step,Training Loss


[I 2023-07-20 04:53:45,886] Trial 32 finished with value: 0.22137754613702948 and parameters: {'learning_rate': 5.31858844353488e-05, 'weight_decay': 6.458023062365206e-05, 'num_train_epochs': 5}. Best is trial 32 with value: 0.22137754613702948.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss


[I 2023-07-20 04:54:23,053] Trial 33 finished with value: 0.49196217276833276 and parameters: {'learning_rate': 6.348548065577424e-05, 'weight_decay': 5.910914528671325e-05, 'num_train_epochs': 2}. Best is trial 32 with value: 0.22137754613702948.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss


[I 2023-07-20 04:55:54,590] Trial 34 finished with value: 0.2949615478515625 and parameters: {'learning_rate': 9.683822238950667e-05, 'weight_decay': 9.82755861591166e-05, 'num_train_epochs': 5}. Best is trial 32 with value: 0.22137754613702948.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights

Step,Training Loss


[I 2023-07-20 04:57:08,121] Trial 35 finished with value: 0.275304685939442 and parameters: {'learning_rate': 5.49338709049357e-05, 'weight_decay': 5.486950509655786e-05, 'num_train_epochs': 4}. Best is trial 32 with value: 0.22137754613702948.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights 

Step,Training Loss


[I 2023-07-20 04:58:39,356] Trial 36 finished with value: 0.25089558688077057 and parameters: {'learning_rate': 7.246394559929675e-05, 'weight_decay': 0.00020769270585251006, 'num_train_epochs': 5}. Best is trial 32 with value: 0.22137754613702948.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weig

Step,Training Loss


[I 2023-07-20 05:00:10,594] Trial 37 finished with value: 0.2198813178322532 and parameters: {'learning_rate': 5.2681075801798845e-05, 'weight_decay': 0.0001292251881188, 'num_train_epochs': 5}. Best is trial 37 with value: 0.2198813178322532.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights o

Step,Training Loss


[I 2023-07-20 05:01:41,832] Trial 38 finished with value: 0.3175597450949929 and parameters: {'learning_rate': 0.00013315130222831114, 'weight_decay': 0.00012228141075132837, 'num_train_epochs': 5}. Best is trial 37 with value: 0.2198813178322532.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss


[I 2023-07-20 05:02:55,042] Trial 39 finished with value: 0.27861634167757904 and parameters: {'learning_rate': 4.9286682500649294e-05, 'weight_decay': 4.0274766384888264e-05, 'num_train_epochs': 4}. Best is trial 37 with value: 0.2198813178322532.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weig

Step,Training Loss


[I 2023-07-20 05:04:26,279] Trial 40 finished with value: 0.26191655939275565 and parameters: {'learning_rate': 8.974609357692329e-05, 'weight_decay': 5.889497844047016e-05, 'num_train_epochs': 5}. Best is trial 37 with value: 0.2198813178322532.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss


[I 2023-07-20 05:05:57,418] Trial 41 finished with value: 0.22117240212180397 and parameters: {'learning_rate': 5.049496839754345e-05, 'weight_decay': 0.00014243318941164075, 'num_train_epochs': 5}. Best is trial 37 with value: 0.2198813178322532.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss


[I 2023-07-20 05:07:28,649] Trial 42 finished with value: 0.22065799019553445 and parameters: {'learning_rate': 5.080244663061905e-05, 'weight_decay': 0.00010373279100978797, 'num_train_epochs': 5}. Best is trial 37 with value: 0.2198813178322532.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss


[I 2023-07-20 05:08:59,683] Trial 43 finished with value: 0.2199276317249645 and parameters: {'learning_rate': 5.280239367691226e-05, 'weight_decay': 0.0003018823260346803, 'num_train_epochs': 5}. Best is trial 37 with value: 0.2198813178322532.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights

Step,Training Loss


[I 2023-07-20 05:10:31,431] Trial 44 finished with value: 0.24046845869584518 and parameters: {'learning_rate': 6.977354527256635e-05, 'weight_decay': 0.00030327788301452843, 'num_train_epochs': 5}. Best is trial 37 with value: 0.2198813178322532.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss


[I 2023-07-20 05:12:02,562] Trial 45 finished with value: 0.2250086004083807 and parameters: {'learning_rate': 5.481054388341209e-05, 'weight_decay': 0.0001419195271325179, 'num_train_epochs': 5}. Best is trial 37 with value: 0.2198813178322532.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights

Step,Training Loss


[I 2023-07-20 05:13:33,702] Trial 46 finished with value: 0.24997270757501774 and parameters: {'learning_rate': 7.381283630777896e-05, 'weight_decay': 0.00034102883676777616, 'num_train_epochs': 5}. Best is trial 37 with value: 0.2198813178322532.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss


[I 2023-07-20 05:14:46,821] Trial 47 finished with value: 0.2798408811742609 and parameters: {'learning_rate': 4.887512172559584e-05, 'weight_decay': 0.00021663677392270503, 'num_train_epochs': 4}. Best is trial 37 with value: 0.2198813178322532.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss


[I 2023-07-20 05:15:42,198] Trial 48 finished with value: 0.35273077993681934 and parameters: {'learning_rate': 4.1813559872384735e-05, 'weight_decay': 0.0004014531179805415, 'num_train_epochs': 3}. Best is trial 37 with value: 0.2198813178322532.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss


[I 2023-07-20 05:17:13,239] Trial 49 finished with value: 0.28710004633123226 and parameters: {'learning_rate': 0.00011103937760190242, 'weight_decay': 0.0001563832044109771, 'num_train_epochs': 5}. Best is trial 37 with value: 0.2198813178322532.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss


[I 2023-07-20 05:18:44,376] Trial 50 finished with value: 0.25644331845370205 and parameters: {'learning_rate': 8.273856474779728e-05, 'weight_decay': 0.0002591222189202084, 'num_train_epochs': 5}. Best is trial 37 with value: 0.2198813178322532.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss


[I 2023-07-20 05:20:15,921] Trial 51 finished with value: 0.2199227593161843 and parameters: {'learning_rate': 5.273412026094808e-05, 'weight_decay': 0.00011913597647235832, 'num_train_epochs': 5}. Best is trial 37 with value: 0.2198813178322532.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss


[I 2023-07-20 05:21:47,059] Trial 52 finished with value: 0.2205243717540394 and parameters: {'learning_rate': 5.06743057621254e-05, 'weight_decay': 0.00011670157057666935, 'num_train_epochs': 5}. Best is trial 37 with value: 0.2198813178322532.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights

Step,Training Loss


[I 2023-07-20 05:23:18,286] Trial 53 finished with value: 0.24059181213378905 and parameters: {'learning_rate': 7.088263757606901e-05, 'weight_decay': 0.00012101329070921747, 'num_train_epochs': 5}. Best is trial 37 with value: 0.2198813178322532.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss


[I 2023-07-20 05:24:49,738] Trial 54 finished with value: 0.23647675947709518 and parameters: {'learning_rate': 4.0006911761971285e-05, 'weight_decay': 0.0001770841318458715, 'num_train_epochs': 5}. Best is trial 37 with value: 0.2198813178322532.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss


[I 2023-07-20 05:26:21,271] Trial 55 finished with value: 0.22255748401988637 and parameters: {'learning_rate': 5.501135793181214e-05, 'weight_decay': 0.0002345022104562816, 'num_train_epochs': 5}. Best is trial 37 with value: 0.2198813178322532.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss


[I 2023-07-20 05:27:52,818] Trial 56 finished with value: 0.2613534753972834 and parameters: {'learning_rate': 8.100492937834238e-05, 'weight_decay': 0.00011216585228113661, 'num_train_epochs': 5}. Best is trial 37 with value: 0.2198813178322532.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss


[I 2023-07-20 05:29:24,157] Trial 57 finished with value: 0.22807168093594637 and parameters: {'learning_rate': 6.21925582424986e-05, 'weight_decay': 8.87029590056223e-05, 'num_train_epochs': 5}. Best is trial 37 with value: 0.2198813178322532.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights 

Step,Training Loss


[I 2023-07-20 05:30:37,401] Trial 58 finished with value: 0.4357729825106534 and parameters: {'learning_rate': 0.00010581595604639903, 'weight_decay': 0.00014259823981771784, 'num_train_epochs': 4}. Best is trial 37 with value: 0.2198813178322532.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss


[I 2023-07-20 05:32:08,406] Trial 59 finished with value: 0.2229660380970348 and parameters: {'learning_rate': 4.8565870224522806e-05, 'weight_decay': 0.00017361430673169898, 'num_train_epochs': 5}. Best is trial 37 with value: 0.2198813178322532.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss


[I 2023-07-20 05:33:21,619] Trial 60 finished with value: 0.31743634830821643 and parameters: {'learning_rate': 8.036416478608443e-05, 'weight_decay': 0.000110387610345767, 'num_train_epochs': 4}. Best is trial 37 with value: 0.2198813178322532.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights

Step,Training Loss


[I 2023-07-20 05:34:53,059] Trial 61 finished with value: 0.22847999225963245 and parameters: {'learning_rate': 5.024510525240658e-05, 'weight_decay': 7.932668168426583e-05, 'num_train_epochs': 5}. Best is trial 37 with value: 0.2198813178322532.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss


[I 2023-07-20 05:36:24,698] Trial 62 finished with value: 0.2298246210271662 and parameters: {'learning_rate': 6.275795841694023e-05, 'weight_decay': 9.47007433614218e-05, 'num_train_epochs': 5}. Best is trial 37 with value: 0.2198813178322532.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights 

Step,Training Loss


[I 2023-07-20 05:37:55,735] Trial 63 finished with value: 0.22116959311745382 and parameters: {'learning_rate': 4.834531665360019e-05, 'weight_decay': 0.00013899350317466585, 'num_train_epochs': 5}. Best is trial 37 with value: 0.2198813178322532.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss


[I 2023-07-20 05:39:26,664] Trial 64 finished with value: 0.22306365966796876 and parameters: {'learning_rate': 4.7472009825836246e-05, 'weight_decay': 0.00015470748553959564, 'num_train_epochs': 5}. Best is trial 37 with value: 0.2198813178322532.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weig

Step,Training Loss


[I 2023-07-20 05:40:58,308] Trial 65 finished with value: 0.23544852516867898 and parameters: {'learning_rate': 6.404121688451819e-05, 'weight_decay': 0.0002021616989976568, 'num_train_epochs': 5}. Best is trial 37 with value: 0.2198813178322532.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss


[I 2023-07-20 05:42:29,855] Trial 66 finished with value: 0.4565690127286044 and parameters: {'learning_rate': 0.00011165273663996094, 'weight_decay': 0.00013723932078754265, 'num_train_epochs': 5}. Best is trial 37 with value: 0.2198813178322532.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss


[I 2023-07-20 05:44:00,982] Trial 67 finished with value: 0.234659836509011 and parameters: {'learning_rate': 4.0520210841761806e-05, 'weight_decay': 0.00011819479509914513, 'num_train_epochs': 5}. Best is trial 37 with value: 0.2198813178322532.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss


[I 2023-07-20 05:44:38,464] Trial 68 finished with value: 0.5056764429265802 and parameters: {'learning_rate': 8.462287329176784e-05, 'weight_decay': 0.00026171500614180715, 'num_train_epochs': 2}. Best is trial 37 with value: 0.2198813178322532.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss


[I 2023-07-20 05:46:09,907] Trial 69 finished with value: 0.2523301211270419 and parameters: {'learning_rate': 5.787966310849831e-05, 'weight_decay': 0.00019143398482726875, 'num_train_epochs': 5}. Best is trial 37 with value: 0.2198813178322532.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss


[I 2023-07-20 05:47:23,231] Trial 70 finished with value: 0.2859658761457963 and parameters: {'learning_rate': 4.5268198015713595e-05, 'weight_decay': 0.00010011112738753083, 'num_train_epochs': 4}. Best is trial 37 with value: 0.2198813178322532.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss


[I 2023-07-20 05:48:54,365] Trial 71 finished with value: 0.22819708043878728 and parameters: {'learning_rate': 5.2014348931437665e-05, 'weight_decay': 7.345654526922629e-05, 'num_train_epochs': 5}. Best is trial 37 with value: 0.2198813178322532.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss


[I 2023-07-20 05:49:49,962] Trial 72 finished with value: 0.36139950607762195 and parameters: {'learning_rate': 6.818105581761739e-05, 'weight_decay': 0.0001627152112217352, 'num_train_epochs': 3}. Best is trial 37 with value: 0.2198813178322532.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss


[I 2023-07-20 05:51:21,095] Trial 73 finished with value: 0.2288122697310014 and parameters: {'learning_rate': 5.764367650913628e-05, 'weight_decay': 6.957811931375768e-05, 'num_train_epochs': 5}. Best is trial 37 with value: 0.2198813178322532.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights

Step,Training Loss


[I 2023-07-20 05:52:52,130] Trial 74 finished with value: 0.25108739679509945 and parameters: {'learning_rate': 7.720986988485611e-05, 'weight_decay': 8.936934400150926e-05, 'num_train_epochs': 5}. Best is trial 37 with value: 0.2198813178322532.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss


[I 2023-07-20 05:54:23,472] Trial 75 finished with value: 0.22377260381525213 and parameters: {'learning_rate': 4.6864219698446566e-05, 'weight_decay': 0.00012945502156618883, 'num_train_epochs': 5}. Best is trial 37 with value: 0.2198813178322532.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weig

Step,Training Loss


[I 2023-07-20 05:55:55,017] Trial 76 finished with value: 0.2813674579967152 and parameters: {'learning_rate': 9.128630866512443e-05, 'weight_decay': 0.00010493548063337683, 'num_train_epochs': 5}. Best is trial 37 with value: 0.2198813178322532.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss


[I 2023-07-20 05:57:26,449] Trial 77 finished with value: 0.23081363331187854 and parameters: {'learning_rate': 6.14971018526839e-05, 'weight_decay': 8.157216306830927e-05, 'num_train_epochs': 5}. Best is trial 37 with value: 0.2198813178322532.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights

Step,Training Loss


[I 2023-07-20 05:58:57,901] Trial 78 finished with value: 0.21921402324329722 and parameters: {'learning_rate': 5.1987537900858175e-05, 'weight_decay': 6.7077703057999e-05, 'num_train_epochs': 5}. Best is trial 78 with value: 0.21921402324329722.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss


[I 2023-07-20 06:00:29,032] Trial 79 finished with value: 0.2946685097434304 and parameters: {'learning_rate': 9.93585235733613e-05, 'weight_decay': 4.855352743251498e-05, 'num_train_epochs': 5}. Best is trial 78 with value: 0.21921402324329722.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights

Step,Training Loss


[I 2023-07-20 06:02:00,167] Trial 80 finished with value: 0.22419641668146306 and parameters: {'learning_rate': 4.620020338820088e-05, 'weight_decay': 0.0001398964935026361, 'num_train_epochs': 5}. Best is trial 78 with value: 0.21921402324329722.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss


[I 2023-07-20 06:03:31,709] Trial 81 finished with value: 0.22115230560302734 and parameters: {'learning_rate': 5.062284800773517e-05, 'weight_decay': 6.82656875198654e-05, 'num_train_epochs': 5}. Best is trial 78 with value: 0.21921402324329722.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss


[I 2023-07-20 06:05:02,841] Trial 82 finished with value: 0.236056171764027 and parameters: {'learning_rate': 4.0176787718232307e-05, 'weight_decay': 5.265749907113116e-05, 'num_train_epochs': 5}. Best is trial 78 with value: 0.21921402324329722.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss


[I 2023-07-20 06:06:33,980] Trial 83 finished with value: 0.24080621545965022 and parameters: {'learning_rate': 7.089963693463916e-05, 'weight_decay': 0.00010977097856672207, 'num_train_epochs': 5}. Best is trial 78 with value: 0.21921402324329722.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weig

Step,Training Loss


[I 2023-07-20 06:08:05,115] Trial 84 finished with value: 0.21981684944846414 and parameters: {'learning_rate': 5.262182112332654e-05, 'weight_decay': 8.760140161263835e-05, 'num_train_epochs': 5}. Best is trial 78 with value: 0.21921402324329722.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss


[I 2023-07-20 06:09:36,140] Trial 85 finished with value: 0.22550099112770775 and parameters: {'learning_rate': 5.783337129073822e-05, 'weight_decay': 6.43605932096282e-05, 'num_train_epochs': 5}. Best is trial 78 with value: 0.21921402324329722.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss


[I 2023-07-20 06:11:07,587] Trial 86 finished with value: 0.24673023223876953 and parameters: {'learning_rate': 6.758303772343045e-05, 'weight_decay': 8.555121817304224e-05, 'num_train_epochs': 5}. Best is trial 78 with value: 0.21921402324329722.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss


[I 2023-07-20 06:12:39,127] Trial 87 finished with value: 0.22468195828524504 and parameters: {'learning_rate': 4.585914455005596e-05, 'weight_decay': 7.582389978079777e-05, 'num_train_epochs': 5}. Best is trial 78 with value: 0.21921402324329722.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss


[I 2023-07-20 06:14:10,261] Trial 88 finished with value: 0.25165713917125354 and parameters: {'learning_rate': 7.716474099238878e-05, 'weight_decay': 5.995673833842912e-05, 'num_train_epochs': 5}. Best is trial 78 with value: 0.21921402324329722.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss


[I 2023-07-20 06:15:41,800] Trial 89 finished with value: 0.22087492509321732 and parameters: {'learning_rate': 5.32963171296944e-05, 'weight_decay': 9.877783461760156e-05, 'num_train_epochs': 5}. Best is trial 78 with value: 0.21921402324329722.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss


[I 2023-07-20 06:17:12,947] Trial 90 finished with value: 0.3434345592151989 and parameters: {'learning_rate': 0.0001267249779175708, 'weight_decay': 9.489686029468634e-05, 'num_train_epochs': 5}. Best is trial 78 with value: 0.21921402324329722.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss


[I 2023-07-20 06:18:44,078] Trial 91 finished with value: 0.22309230457652698 and parameters: {'learning_rate': 5.3886920114156096e-05, 'weight_decay': 0.00011594414402334387, 'num_train_epochs': 5}. Best is trial 78 with value: 0.21921402324329722.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some wei

Step,Training Loss


[I 2023-07-20 06:20:15,210] Trial 92 finished with value: 0.21967986713756216 and parameters: {'learning_rate': 5.243885657397386e-05, 'weight_decay': 7.47269319332358e-05, 'num_train_epochs': 5}. Best is trial 78 with value: 0.21921402324329722.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss


[I 2023-07-20 06:21:46,340] Trial 93 finished with value: 0.22974295182661578 and parameters: {'learning_rate': 6.269980698988333e-05, 'weight_decay': 6.863199742588803e-05, 'num_train_epochs': 5}. Best is trial 78 with value: 0.21921402324329722.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss


[I 2023-07-20 06:23:17,782] Trial 94 finished with value: 0.2242218017578125 and parameters: {'learning_rate': 5.453681284878393e-05, 'weight_decay': 4.601866261031492e-05, 'num_train_epochs': 5}. Best is trial 78 with value: 0.21921402324329722.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss


[I 2023-07-20 06:24:49,018] Trial 95 finished with value: 0.22586382085626774 and parameters: {'learning_rate': 4.3828984007141666e-05, 'weight_decay': 8.574824926871459e-05, 'num_train_epochs': 5}. Best is trial 78 with value: 0.21921402324329722.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weig

Step,Training Loss


[I 2023-07-20 06:26:20,256] Trial 96 finished with value: 0.27706558921120383 and parameters: {'learning_rate': 8.564451069821654e-05, 'weight_decay': 5.749744324506912e-05, 'num_train_epochs': 5}. Best is trial 78 with value: 0.21921402324329722.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss


[I 2023-07-20 06:27:51,290] Trial 97 finished with value: 0.24173492084849965 and parameters: {'learning_rate': 7.211368563410699e-05, 'weight_decay': 7.322689066631774e-05, 'num_train_epochs': 5}. Best is trial 78 with value: 0.21921402324329722.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss


[I 2023-07-20 06:29:22,835] Trial 98 finished with value: 0.21992553364146838 and parameters: {'learning_rate': 5.28101204029203e-05, 'weight_decay': 0.00010508302605770287, 'num_train_epochs': 5}. Best is trial 78 with value: 0.21921402324329722.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss


[I 2023-07-20 06:30:53,970] Trial 99 finished with value: 0.23501397913152522 and parameters: {'learning_rate': 6.439776954599698e-05, 'weight_decay': 0.00016564738182731154, 'num_train_epochs': 5}. Best is trial 78 with value: 0.21921402324329722.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weig

Step,Training Loss


[I 2023-07-20 06:32:25,101] Trial 100 finished with value: 0.2939550919966264 and parameters: {'learning_rate': 9.714954410645831e-05, 'weight_decay': 9.689932587974197e-05, 'num_train_epochs': 5}. Best is trial 78 with value: 0.21921402324329722.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss


[I 2023-07-20 06:33:56,232] Trial 101 finished with value: 0.2206469102339311 and parameters: {'learning_rate': 5.3033432644257116e-05, 'weight_decay': 0.0001084449634632147, 'num_train_epochs': 5}. Best is trial 78 with value: 0.21921402324329722.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weig

Step,Training Loss


[I 2023-07-20 06:35:27,370] Trial 102 finished with value: 0.22577545859596945 and parameters: {'learning_rate': 4.3902346640056985e-05, 'weight_decay': 0.00012275465222151222, 'num_train_epochs': 5}. Best is trial 78 with value: 0.21921402324329722.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some we

Step,Training Loss


[I 2023-07-20 06:36:58,802] Trial 103 finished with value: 0.22586458379572089 and parameters: {'learning_rate': 5.776246550283369e-05, 'weight_decay': 0.00010615721954717422, 'num_train_epochs': 5}. Best is trial 78 with value: 0.21921402324329722.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some wei

Step,Training Loss


[I 2023-07-20 06:38:30,363] Trial 104 finished with value: 0.2527122670953924 and parameters: {'learning_rate': 7.690300593141282e-05, 'weight_decay': 8.91819103482948e-05, 'num_train_epochs': 5}. Best is trial 78 with value: 0.21921402324329722.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

Step,Training Loss


[I 2023-07-20 06:40:01,588] Trial 105 finished with value: 0.22075788324529474 and parameters: {'learning_rate': 5.15126478659457e-05, 'weight_decay': 0.00010699453087309892, 'num_train_epochs': 5}. Best is trial 78 with value: 0.21921402324329722.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weig

Step,Training Loss


[I 2023-07-20 06:41:33,228] Trial 106 finished with value: 0.24171374927867542 and parameters: {'learning_rate': 6.561020517047997e-05, 'weight_decay': 0.00013188265760420592, 'num_train_epochs': 5}. Best is trial 78 with value: 0.21921402324329722.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some wei

Step,Training Loss


[I 2023-07-20 06:43:04,363] Trial 107 finished with value: 0.22459474043412642 and parameters: {'learning_rate': 4.4980530026105856e-05, 'weight_decay': 0.00015486415521081626, 'num_train_epochs': 5}. Best is trial 78 with value: 0.21921402324329722.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some we

Step,Training Loss


[I 2023-07-20 06:44:35,610] Trial 108 finished with value: 0.23475213484330612 and parameters: {'learning_rate': 4.047713700229522e-05, 'weight_decay': 8.138881146925515e-05, 'num_train_epochs': 5}. Best is trial 78 with value: 0.21921402324329722.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weig

Step,Training Loss


[I 2023-07-20 06:46:06,734] Trial 109 finished with value: 0.23091298883611505 and parameters: {'learning_rate': 5.88775191525558e-05, 'weight_decay': 0.00011756972431995589, 'num_train_epochs': 5}. Best is trial 78 with value: 0.21921402324329722.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weig

Step,Training Loss


[I 2023-07-20 06:47:37,765] Trial 110 finished with value: 0.2212909698486328 and parameters: {'learning_rate': 5.016105687419691e-05, 'weight_decay': 0.00017421096632171386, 'num_train_epochs': 5}. Best is trial 78 with value: 0.21921402324329722.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weig

Step,Training Loss


[I 2023-07-20 06:49:09,310] Trial 111 finished with value: 0.22404521595348012 and parameters: {'learning_rate': 5.3695166747104785e-05, 'weight_decay': 0.00010847084444876678, 'num_train_epochs': 5}. Best is trial 78 with value: 0.21921402324329722.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some we

Step,Training Loss


[I 2023-07-20 06:50:40,442] Trial 112 finished with value: 0.21941814422607422 and parameters: {'learning_rate': 5.203652039286952e-05, 'weight_decay': 0.00010096834499966604, 'num_train_epochs': 5}. Best is trial 78 with value: 0.21921402324329722.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some wei

Step,Training Loss


[I 2023-07-20 06:52:11,688] Trial 113 finished with value: 0.23067240281538529 and parameters: {'learning_rate': 6.158262539154388e-05, 'weight_decay': 7.541946685371663e-05, 'num_train_epochs': 5}. Best is trial 78 with value: 0.21921402324329722.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weig

Step,Training Loss


[I 2023-07-20 06:53:42,826] Trial 114 finished with value: 0.24282538674094462 and parameters: {'learning_rate': 6.943082905934179e-05, 'weight_decay': 0.00013209219209708395, 'num_train_epochs': 5}. Best is trial 78 with value: 0.21921402324329722.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some wei

Step,Training Loss


[I 2023-07-20 06:55:14,369] Trial 115 finished with value: 0.26188879880038174 and parameters: {'learning_rate': 8.400625335929575e-05, 'weight_decay': 9.768621467136994e-05, 'num_train_epochs': 5}. Best is trial 78 with value: 0.21921402324329722.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weig

Step,Training Loss


[I 2023-07-20 06:56:45,396] Trial 116 finished with value: 0.22244522788307883 and parameters: {'learning_rate': 4.813613074913867e-05, 'weight_decay': 0.00021983392121920577, 'num_train_epochs': 5}. Best is trial 78 with value: 0.21921402324329722.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some wei

Step,Training Loss


[I 2023-07-20 06:58:16,431] Trial 117 finished with value: 0.2356375434181907 and parameters: {'learning_rate': 4.028815018405466e-05, 'weight_decay': 8.615237586028639e-05, 'num_train_epochs': 5}. Best is trial 78 with value: 0.21921402324329722.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss


[I 2023-07-20 06:59:29,547] Trial 118 finished with value: 0.2881923588839444 and parameters: {'learning_rate': 4.476383875375226e-05, 'weight_decay': 0.00014949576489529183, 'num_train_epochs': 4}. Best is trial 78 with value: 0.21921402324329722.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weig

Step,Training Loss


[I 2023-07-20 07:01:00,772] Trial 119 finished with value: 0.228364389592951 and parameters: {'learning_rate': 5.194487560909912e-05, 'weight_decay': 0.00011881014945273752, 'num_train_epochs': 5}. Best is trial 78 with value: 0.21921402324329722.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss


[I 2023-07-20 07:02:32,022] Trial 120 finished with value: 0.257341176813299 and parameters: {'learning_rate': 7.665986816146161e-05, 'weight_decay': 0.00010735036289795258, 'num_train_epochs': 5}. Best is trial 78 with value: 0.21921402324329722.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss


[I 2023-07-20 07:04:03,356] Trial 121 finished with value: 0.2241172443736683 and parameters: {'learning_rate': 5.652744470694867e-05, 'weight_decay': 9.564286066093018e-05, 'num_train_epochs': 5}. Best is trial 78 with value: 0.21921402324329722.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss


[I 2023-07-20 07:05:34,595] Trial 122 finished with value: 0.22075824737548827 and parameters: {'learning_rate': 5.1514355732157105e-05, 'weight_decay': 0.00019306757181108066, 'num_train_epochs': 5}. Best is trial 78 with value: 0.21921402324329722.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some we

Step,Training Loss


[I 2023-07-20 07:07:05,934] Trial 123 finished with value: 0.24173016981645065 and parameters: {'learning_rate': 6.569355773390657e-05, 'weight_decay': 0.00018704460210534714, 'num_train_epochs': 5}. Best is trial 78 with value: 0.21921402324329722.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some wei

Step,Training Loss


[I 2023-07-20 07:08:37,267] Trial 124 finished with value: 0.22245490334250712 and parameters: {'learning_rate': 4.8109061414976304e-05, 'weight_decay': 0.00012982289385572355, 'num_train_epochs': 5}. Best is trial 78 with value: 0.21921402324329722.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some we

Step,Training Loss


[I 2023-07-20 07:10:08,514] Trial 125 finished with value: 0.23073130520907315 and parameters: {'learning_rate': 6.043650233918616e-05, 'weight_decay': 0.000146147137394288, 'num_train_epochs': 5}. Best is trial 78 with value: 0.21921402324329722.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss


[I 2023-07-20 07:11:40,160] Trial 126 finished with value: 0.22063302126797762 and parameters: {'learning_rate': 5.07962158420747e-05, 'weight_decay': 0.0001101862105202829, 'num_train_epochs': 5}. Best is trial 78 with value: 0.21921402324329722.
Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss


[I 2023-07-20 07:12:35,346] Trial 127 finished with value: 0.36433893261533795 and parameters: {'learning_rate': 7.354600939060409e-05, 'weight_decay': 7.818822118884811e-05, 'num_train_epochs': 3}. Best is trial 78 with value: 0.21921402324329722.


In [16]:
best_lr = float(study.best_params['learning_rate'])
best_weight_decay = float(study.best_params['weight_decay'])
best_epoch = int(study.best_params['num_train_epochs'])

print("Optimal hyperparamers:")
print(f'Learning rate\t: {best_lr}')
print(f'Weight decay\t: {best_weight_decay}')
print(f'Epochs\t\t: {best_epoch}')

Optimal hyperparamers:
Learning rate	: 5.1987537900858175e-05
Weight decay	: 6.7077703057999e-05
Epochs		: 5


In [17]:
# Save best hyperparameters
data = {
    'learning_rate': best_lr,
    'weight_decay': best_weight_decay,
    'epoch': best_epoch
}

# Serializing json
json_object = json.dumps(data, indent=4)

# Writing to sample.json
with open(f'{save_dir}/best_hyperparameters.json', "w") as outfile:
    outfile.write(json_object)

## Training the best models

### Main model (NA v A)

In [18]:
# Variables
load_dir = "jcblaise/roberta-tagalog-base"
save_dir = "./Models/hierarchical-final/main"
hyperparameter_dir = "./Models/hierarchical-tuning/main"

# Create directories
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

In [19]:
# Load saved hyperparameters
with open(f'{hyperparameter_dir}/best_hyperparameters.json') as f:
    hyperparameters = json.load(f)

hyperparameters

{'learning_rate': 4.817391461366549e-05,
 'weight_decay': 0.003826880119155053,
 'epoch': 5}

In [20]:
config = RobertaConfig.from_pretrained(load_dir, num_labels=num_labels)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = RobertaAbusiveClassification.from_pretrained(load_dir, config=config)
model.to(device)
model.train()

training_args = TrainingArguments(
    output_dir= save_dir,
    learning_rate=hyperparameters['learning_rate'],
    weight_decay=hyperparameters['weight_decay'],
    num_train_epochs=hyperparameters['epoch'],
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    report_to = "none")

# Initialize Trainer class and train
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = main_train_dataset,
    eval_dataset = main_val_dataset,
    compute_metrics = compute_metrics_main)
result = trainer.train()

# Evaluate model and save metrics and predictions
model.eval()
predictions = []
metrics = trainer.evaluate()
trainer.save_metrics('validation', metrics)
save_preds(f'{save_dir}/validation_predictions.json', predictions.tolist())

# Save trained model
model.save_pretrained(f'{save_dir}/model')
tokenizer.save_pretrained(f'{save_dir}/model')

Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaAbusiveClassification were not initialized from the model checkpoint at jcblaise/roberta-tagalog-base and are newly initialized: ['classifier.out_proj.bias', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight', 'classifier.out_p

Step,Training Loss
500,0.2572


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


('./Models/hierarchical-final/main/model\\tokenizer_config.json',
 './Models/hierarchical-final/main/model\\special_tokens_map.json',
 './Models/hierarchical-final/main/model\\vocab.json',
 './Models/hierarchical-final/main/model\\merges.txt',
 './Models/hierarchical-final/main/model\\added_tokens.json',
 './Models/hierarchical-final/main/model\\tokenizer.json')

In [21]:
colnames = ["Accuracy", "Precision", "Recall", "F1"]
values = np.array(list(metrics.values())[1:-4])
val_df = pd.DataFrame(values.reshape(3, 4), columns=colnames, index=["Average", "Non-Abusive", "Abusive"])

print("Validation set metrics:")
val_df

Validation set metrics:


Unnamed: 0,Accuracy,Precision,Recall,F1
Average,0.818584,0.815152,0.818584,0.815419
Non-Abusive,0.757576,0.573921,0.757576,0.653083
Abusive,0.84375,0.711914,0.84375,0.772246


In [22]:
predictions = []
metrics = trainer.evaluate(main_test_dataset)
trainer.save_metrics('test', metrics)
save_preds(f'{save_dir}/test_predictions.json', predictions.tolist())

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [23]:
colnames = ["Accuracy", "Precision", "Recall", "F1"]
values = np.array(list(metrics.values())[1:-4])
test_df = pd.DataFrame(values.reshape(3, 4), columns=colnames, index=["Average", "Non-Abusive", "Abusive"])

print("Test set metrics:")
test_df

Test set metrics:


Unnamed: 0,Accuracy,Precision,Recall,F1
Average,0.828194,0.82878,0.828194,0.82847
Non-Abusive,0.74026,0.547984,0.74026,0.629773
Abusive,0.873333,0.762711,0.873333,0.814282


### Sub model (E v I)

In [53]:
# Variables
load_dir = "jcblaise/roberta-tagalog-base"
save_dir = "./Models/hierarchical-final/sub"
hyperparameter_dir = "./Models/hierarchical-tuning/sub"

# Create directories
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

In [54]:
# Load saved hyperparameters
#with open(f'{hyperparameter_dir}/best_hyperparameters.json') as f:

with open(f'{hyperparameter_dir}/best_hyperparameters.json') as f:
    hyperparameters = json.load(f)

hyperparameters

{'learning_rate': 5.1987537900858175e-05,
 'weight_decay': 6.7077703057999e-05,
 'epoch': 5}

In [80]:
# Attach classification head and prepare model for trainer
config = RobertaConfig.from_pretrained(load_dir, num_labels=num_labels)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
training_args = TrainingArguments(
    output_dir = save_dir,
    learning_rate=hyperparameters['learning_rate'],
    weight_decay=hyperparameters['weight_decay'],
    num_train_epochs=hyperparameters['epoch'],
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    report_to = "none")

# Initialize model
model = RobertaAbusiveClassification.from_pretrained(load_dir, config=config)
model.to(device)
model.train()

# Initialize Trainer class and train
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = sub_train_dataset,
    eval_dataset = sub_val_dataset,
    compute_metrics = compute_metrics_sub)
result = trainer.train()

# Evaluate model and save metrics and predictions
model.eval()
predictions = []
metrics = trainer.evaluate()
trainer.save_metrics('validation', metrics)
save_preds(f'{save_dir}/validation_predictions.json', predictions.tolist())

# Save trained model
model.save_pretrained(f'{save_dir}/model')
tokenizer.save_pretrained(f'{save_dir}/model')

Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaAbusiveClassification: ['lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAbusiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaAbusiveClassification were not initialized from the model checkpoint at jcblaise/roberta-tagalog-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.

Step,Training Loss


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


('./Models/hierarchical-final/sub/model\\tokenizer_config.json',
 './Models/hierarchical-final/sub/model\\special_tokens_map.json',
 './Models/hierarchical-final/sub/model\\vocab.json',
 './Models/hierarchical-final/sub/model\\merges.txt',
 './Models/hierarchical-final/sub/model\\added_tokens.json',
 './Models/hierarchical-final/sub/model\\tokenizer.json')

In [82]:
colnames = ["Accuracy", "Precision", "Recall", "F1"]
values = np.array(list(metrics.values())[1:-4])
val_df = pd.DataFrame(values.reshape(3, 4), columns=colnames, index=["Average", "Explicit", "Implicit"])

print("Validation set metrics:")
val_df

Validation set metrics:


Unnamed: 0,Accuracy,Precision,Recall,F1
Average,0.781457,0.785683,0.781457,0.780765
Explicit,0.75,0.5625,0.75,0.642857
Implicit,0.820896,0.673869,0.820896,0.740152


In [56]:
predictions = []
metrics = trainer.evaluate(sub_test_dataset)
trainer.save_metrics('test', metrics)
save_preds(f'{save_dir}/test_predictions.json', predictions.tolist())

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [58]:
colnames = ["Accuracy", "Precision", "Recall", "F1"]
values = np.array(list(metrics.values())[1:-4])
test_df = pd.DataFrame(values.reshape(3, 4), columns=colnames, index=["Average", "Explicit", "Implicit"])

print("Test set metrics:")
test_df

Test set metrics:


Unnamed: 0,Accuracy,Precision,Recall,F1
Average,0.768212,0.771899,0.768212,0.767293
Explicit,0.741176,0.549343,0.741176,0.631002
Implicit,0.80303,0.644858,0.80303,0.715304


**Test using only correct Abusive predictions**

In [59]:
predictions = load_preds("./Models/hierarchical-final/main")
df = load_csv("test", True)

# Create filtered sub model test dataset from predictions of main model
NA_indices = df[df["Class"] == 0].index.tolist()
A_indices = np.argwhere(predictions != 0).flatten()
A_indices = [x for x in A_indices if x not in NA_indices]

df = df.iloc[A_indices]
filtered_test_dataset = format_dataset(df)
filtered_test_dataset

                                                                                                                       

Dataset({
    features: ['labels', '__index_level_0__', 'input_ids', 'attention_mask'],
    num_rows: 131
})

In [60]:
predictions = []
metrics = trainer.evaluate(filtered_test_dataset)
trainer.save_metrics('test_filtered', metrics)
save_preds(f'{save_dir}/test_filtered_predictions.json', predictions.tolist())

colnames = ["Accuracy", "Precision", "Recall", "F1"]
values = np.array(list(metrics.values())[1:-4])
test_df = pd.DataFrame(values.reshape(3, 4), columns=colnames, index=["Average", "Explicit", "Implicit"])

print("Filtered test set metrics:")
test_df

Filtered test set metrics:


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Accuracy,Precision,Recall,F1
Average,0.755725,0.757545,0.755725,0.752032
Explicit,0.746988,0.557991,0.746988,0.638803
Implicit,0.770833,0.594184,0.770833,0.671078
