In [1]:
pip install datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [None]:
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")  

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]



+ datapreprocessing

In [35]:
def load_iob_data(filepath):
    tokens, labels = [], []
    sentence_tokens, sentence_labels = [], []

    with open(filepath, "r") as file:
        for line in file:
            line = line.strip()

            # Check for blank line (indicates end of sentence)
            if not line:
                if sentence_tokens:
                    tokens.append(sentence_tokens)
                    labels.append(sentence_labels)
                    sentence_tokens, sentence_labels = [], []
            else:
                # Token format: word POS_tag IOB_label
                parts = line.split()
                if len(parts) == 3:
                    token, pos_tag, label = parts
                    sentence_tokens.append(token)
                    sentence_labels.append(label)
                else:
                    raise ValueError(f"Unexpected format in line: {line}")

    # Add the last sentence if it exists
    if sentence_tokens:
        tokens.append(sentence_tokens)
        labels.append(sentence_labels)

    return tokens, labels

In [36]:
# Load each data split
train_tokens, train_labels = load_iob_data("fold1/train.txt")
val_tokens, val_labels = load_iob_data("fold1/val.txt")
test_tokens, test_labels = load_iob_data("fold1/test.txt")

print(f"Loaded {len(train_tokens)} sentences for training.")
print(f"Loaded {len(val_tokens)} sentences for validation.")
print(f"Loaded {len(test_tokens)} sentences for testing.")

Loaded 1992 sentences for training.
Loaded 850 sentences for validation.
Loaded 864 sentences for testing.


In [37]:
# Create a set of unique labels and a label-to-id mapping
unique_labels = set(label for sentence_labels in train_labels + val_labels + test_labels for label in sentence_labels)
label_to_id = {label: idx for idx, label in enumerate(sorted(unique_labels))}
id_to_label = {idx: label for label, idx in label_to_id.items()}

print("label_to_id:", label_to_id)
print("id_to_label:", id_to_label)

label_to_id: {'B-ART': 0, 'B-CON': 1, 'B-LOC': 2, 'B-MAT': 3, 'B-PER': 4, 'B-SPE': 5, 'I-ART': 6, 'I-CON': 7, 'I-LOC': 8, 'I-MAT': 9, 'I-PER': 10, 'I-SPE': 11, 'O': 12}
id_to_label: {0: 'B-ART', 1: 'B-CON', 2: 'B-LOC', 3: 'B-MAT', 4: 'B-PER', 5: 'B-SPE', 6: 'I-ART', 7: 'I-CON', 8: 'I-LOC', 9: 'I-MAT', 10: 'I-PER', 11: 'I-SPE', 12: 'O'}


In [38]:
# Tokenize and align labels with tokenized data
def tokenize_and_align_labels(tokens, labels):
    tokenized_inputs = tokenizer(tokens, truncation=True, is_split_into_words=True)

    aligned_labels = []
    for i, label in enumerate(labels):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to original word IDs
        label_ids = []
        previous_word_idx = None

        if word_ids is None:
            print(f"Warning: No word_ids generated for sentence {i}. Tokens: {tokens[i]}")
            continue

        # Align each token's label
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)  # Ignore special tokens
            elif word_idx != previous_word_idx:
                label_ids.append(label_to_id.get(label[word_idx], -100))  # Convert label to ID
            else:
                label_ids.append(-100)  # Ignore sub-tokens
            previous_word_idx = word_idx

        aligned_labels.append(label_ids)

    if len(aligned_labels) != len(tokenized_inputs["input_ids"]):
        print(f"Mismatch in length for tokens and labels. Tokens: {len(tokenized_inputs['input_ids'])}, Labels: {len(aligned_labels)}")
        raise ValueError(f"Mismatch in length for tokens and labels: {len(tokenized_inputs['input_ids'])} vs {len(aligned_labels)}")

    tokenized_inputs["labels"] = aligned_labels
    return tokenized_inputs

In [39]:
# Tokenize and align the datasets
train_data = tokenize_and_align_labels(train_tokens, train_labels)
val_data = tokenize_and_align_labels(val_tokens, val_labels)
test_data = tokenize_and_align_labels(test_tokens, test_labels)

# Convert tokenized data into Hugging Face Dataset format
train_dataset = Dataset.from_dict(train_data)
val_dataset = Dataset.from_dict(val_data)
test_dataset = Dataset.from_dict(test_data)

# Display dataset structure for verification
print(train_dataset[0])

{'input_ids': [101, 15982, 1407, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1], 'labels': [-100, 12, 12, 12, -100]}


In [40]:
print(train_dataset)
print(val_dataset)
print(test_dataset)

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 1992
})
Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 850
})
Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 864
})


baseline model

In [41]:
unique_labels = set()
for example in train_dataset:
    unique_labels.update(example['labels'])

num_labels = len(unique_labels)
print("Number of unique labels:", num_labels)

Number of unique labels: 14


In [42]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments, DataCollatorForTokenClassification
import torch
from datasets import Dataset

# Load the pre-trained tokenizer and model for token classification
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
model = AutoModelForTokenClassification.from_pretrained("bert-base-cased", num_labels=14)  # Set num_labels to match your dataset

# Set up training arguments
training_args = TrainingArguments(
    output_dir="./results",           # Directory for saving results
    evaluation_strategy="epoch",      # Evaluate at the end of each epoch
    learning_rate=5e-5,               # Default learning rate
    per_device_train_batch_size=8,    # Default batch size for training
    per_device_eval_batch_size=8,     # Default batch size for evaluation
    num_train_epochs=3,               # Default number of epochs
    weight_decay=0.0,                 # Default weight decay (no regularization)
    logging_dir="./logs",             # Directory for logs
    logging_steps=500,                # Log every 500 steps
    report_to="tensorboard",          # Report to TensorBoard by default
    fp16=False,                       # Disable mixed precision by default
    gradient_accumulation_steps=1,    # No gradient accumulation by default
)


# Define metrics for evaluation (you can adjust this function to compute additional metrics like precision, recall)
def compute_metrics(p):
    predictions, labels = p
    predictions = torch.tensor(predictions) if not isinstance(predictions, torch.Tensor) else predictions
    labels = torch.tensor(labels) if not isinstance(labels, torch.Tensor) else labels

    # Get predicted labels and filter out padding (labels == -100)
    predictions = torch.argmax(predictions, axis=-1)
    active_predictions = predictions[labels != -100]
    active_labels = labels[labels != -100]

    # Compute accuracy
    accuracy = (active_predictions == active_labels).sum().item() / active_labels.numel()

    return {"accuracy": accuracy}


# Use the DataCollatorForTokenClassification to handle padding
data_collator = DataCollatorForTokenClassification(tokenizer)

# Set up the Trainer with the model, training arguments, datasets, and evaluation metrics
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,  # Your training dataset
    eval_dataset=val_dataset,  # Your validation dataset
    compute_metrics=compute_metrics,  # Function to compute metrics
    data_collator=data_collator,  # This will handle padding for you
)

# Train the model
trainer.train()

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.195894,0.943738
2,No log,0.194992,0.950522
3,0.143200,0.227527,0.946759


TrainOutput(global_step=747, training_loss=0.10660283217627999, metrics={'train_runtime': 139.2552, 'train_samples_per_second': 42.914, 'train_steps_per_second': 5.364, 'total_flos': 200648682299520.0, 'train_loss': 0.10660283217627999, 'epoch': 3.0})

In [43]:
# Evaluate the model on the test set
test_results = trainer.evaluate(eval_dataset=test_dataset)

# Print the evaluation results (metrics such as accuracy)
print(test_results)

{'eval_loss': 0.15888415277004242, 'eval_accuracy': 0.9640189314021782, 'eval_runtime': 1.5983, 'eval_samples_per_second': 540.567, 'eval_steps_per_second': 67.571, 'epoch': 3.0}


 hyperparameter optimization

In [12]:
pip install optuna

Collecting optuna
  Downloading optuna-4.0.0-py3-none-any.whl.metadata (16 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.14.0-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.6-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.0.0-py3-none-any.whl (362 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m362.8/362.8 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.14.0-py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.5/233.5 kB[0m [31m24.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading Mako-1.3.6-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: M

In [44]:
import gc
import torch
import optuna
from transformers import TrainingArguments, Trainer

gc.collect()
torch.cuda.empty_cache()  # Only if using a GPU

def objective(trial):
    learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 5e-5)
    batch_size = trial.suggest_categorical("batch_size", [4, 8])
    weight_decay = trial.suggest_uniform("weight_decay", 0.0, 0.1)

    training_args = TrainingArguments(
        output_dir="./results_hyper",
        evaluation_strategy="epoch",
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=2,
        weight_decay=weight_decay,
        logging_dir="./logs_hyper",
        report_to="tensorboard",
        gradient_accumulation_steps=2,
        fp16=True,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset.select(range(int(len(val_dataset) * 0.2))),
        compute_metrics=compute_metrics,
        data_collator=data_collator,
    )

    trainer.train()
    eval_result = trainer.evaluate()
    accuracy = eval_result["eval_accuracy"]
    return accuracy

# Run the optimization
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=20)


[I 2024-11-10 16:25:20,273] A new study created in memory with name: no-name-a8b983ef-ba60-4190-85c3-2211851fc5e3
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 5e-5)
  weight_decay = trial.suggest_uniform("weight_decay", 0.0, 0.1)
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.233944,0.943469
2,No log,0.271608,0.944957


[I 2024-11-10 16:26:56,180] Trial 0 finished with value: 0.944956858077953 and parameters: {'learning_rate': 2.247209011867037e-05, 'batch_size': 4, 'weight_decay': 0.034729169627198575}. Best is trial 0 with value: 0.944956858077953.
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 5e-5)
  weight_decay = trial.suggest_uniform("weight_decay", 0.0, 0.1)
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.294009,0.945552
2,No log,0.325048,0.945254


[I 2024-11-10 16:28:39,853] Trial 1 finished with value: 0.9452543885748289 and parameters: {'learning_rate': 2.5459812381564283e-05, 'batch_size': 4, 'weight_decay': 0.025628416282990587}. Best is trial 1 with value: 0.9452543885748289.
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 5e-5)
  weight_decay = trial.suggest_uniform("weight_decay", 0.0, 0.1)
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.407567,0.936328
2,No log,0.379968,0.940791


[I 2024-11-10 16:30:04,618] Trial 2 finished with value: 0.94079143112169 and parameters: {'learning_rate': 1.2125822331870064e-05, 'batch_size': 4, 'weight_decay': 0.07612760465788357}. Best is trial 1 with value: 0.9452543885748289.
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 5e-5)
  weight_decay = trial.suggest_uniform("weight_decay", 0.0, 0.1)
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.39642,0.939899
2,No log,0.399336,0.939006


[I 2024-11-10 16:31:35,053] Trial 3 finished with value: 0.9390062481404344 and parameters: {'learning_rate': 3.359311883199352e-05, 'batch_size': 4, 'weight_decay': 0.08298326844229478}. Best is trial 1 with value: 0.9452543885748289.
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 5e-5)
  weight_decay = trial.suggest_uniform("weight_decay", 0.0, 0.1)
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.516857,0.936924
2,No log,0.468348,0.940494


[I 2024-11-10 16:33:11,086] Trial 4 finished with value: 0.9404939006248141 and parameters: {'learning_rate': 1.0777132847101411e-05, 'batch_size': 4, 'weight_decay': 0.03865925288993745}. Best is trial 1 with value: 0.9452543885748289.
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 5e-5)
  weight_decay = trial.suggest_uniform("weight_decay", 0.0, 0.1)
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.401454,0.946742
2,No log,0.379075,0.946742


[I 2024-11-10 16:34:23,127] Trial 5 finished with value: 0.9467420410592086 and parameters: {'learning_rate': 4.336734071856019e-05, 'batch_size': 4, 'weight_decay': 0.0473495216113404}. Best is trial 5 with value: 0.9467420410592086.
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 5e-5)
  weight_decay = trial.suggest_uniform("weight_decay", 0.0, 0.1)
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.448785,0.943172
2,No log,0.446654,0.945254


[I 2024-11-10 16:35:38,284] Trial 6 finished with value: 0.9452543885748289 and parameters: {'learning_rate': 1.0200914316890729e-05, 'batch_size': 4, 'weight_decay': 0.07504459949089802}. Best is trial 5 with value: 0.9467420410592086.
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 5e-5)
  weight_decay = trial.suggest_uniform("weight_decay", 0.0, 0.1)
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy
0,No log,0.36345,0.94823
1,No log,0.380532,0.947337


[I 2024-11-10 16:36:30,894] Trial 7 finished with value: 0.9473371020529604 and parameters: {'learning_rate': 3.1770367715212835e-05, 'batch_size': 8, 'weight_decay': 0.09561987597262896}. Best is trial 7 with value: 0.9473371020529604.
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 5e-5)
  weight_decay = trial.suggest_uniform("weight_decay", 0.0, 0.1)
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy
0,No log,0.422896,0.947635
1,No log,0.433054,0.947635


[I 2024-11-10 16:37:17,388] Trial 8 finished with value: 0.9476346325498364 and parameters: {'learning_rate': 1.310499186412251e-05, 'batch_size': 8, 'weight_decay': 0.007430315249135899}. Best is trial 8 with value: 0.9476346325498364.
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 5e-5)
  weight_decay = trial.suggest_uniform("weight_decay", 0.0, 0.1)
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.392797,0.945254
2,No log,0.401571,0.948825


[I 2024-11-10 16:38:29,153] Trial 9 finished with value: 0.94882475453734 and parameters: {'learning_rate': 4.6204380410082856e-05, 'batch_size': 4, 'weight_decay': 0.08729508072711857}. Best is trial 9 with value: 0.94882475453734.
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 5e-5)
  weight_decay = trial.suggest_uniform("weight_decay", 0.0, 0.1)
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy
0,No log,0.35797,0.949717
1,No log,0.367052,0.94823


[I 2024-11-10 16:39:10,759] Trial 10 finished with value: 0.9482296935435882 and parameters: {'learning_rate': 4.8350648197824486e-05, 'batch_size': 8, 'weight_decay': 0.0638899564413803}. Best is trial 9 with value: 0.94882475453734.
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 5e-5)
  weight_decay = trial.suggest_uniform("weight_decay", 0.0, 0.1)
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy
0,No log,0.413874,0.949717
1,No log,0.422243,0.944659


[I 2024-11-10 16:39:52,401] Trial 11 finished with value: 0.9446593275810771 and parameters: {'learning_rate': 4.760032448380101e-05, 'batch_size': 8, 'weight_decay': 0.061382122984850004}. Best is trial 9 with value: 0.94882475453734.
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 5e-5)
  weight_decay = trial.suggest_uniform("weight_decay", 0.0, 0.1)
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy
0,No log,0.380888,0.947635
1,No log,0.415778,0.947635


[I 2024-11-10 16:40:34,235] Trial 12 finished with value: 0.9476346325498364 and parameters: {'learning_rate': 4.9698876051929795e-05, 'batch_size': 8, 'weight_decay': 0.06092199222042975}. Best is trial 9 with value: 0.94882475453734.
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 5e-5)
  weight_decay = trial.suggest_uniform("weight_decay", 0.0, 0.1)
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy
0,No log,0.482991,0.946445
1,No log,0.443524,0.94823


[I 2024-11-10 16:41:17,690] Trial 13 finished with value: 0.9482296935435882 and parameters: {'learning_rate': 3.7879173389512784e-05, 'batch_size': 8, 'weight_decay': 0.09453678711443217}. Best is trial 9 with value: 0.94882475453734.
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 5e-5)
  weight_decay = trial.suggest_uniform("weight_decay", 0.0, 0.1)
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy
0,No log,0.479295,0.942279
1,No log,0.476666,0.946147


[I 2024-11-10 16:41:59,274] Trial 14 finished with value: 0.9461469800654567 and parameters: {'learning_rate': 2.7524258387239552e-05, 'batch_size': 8, 'weight_decay': 0.06278065056508216}. Best is trial 9 with value: 0.94882475453734.
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 5e-5)
  weight_decay = trial.suggest_uniform("weight_decay", 0.0, 0.1)
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy
0,No log,0.576799,0.944957
1,No log,0.572889,0.946147


[I 2024-11-10 16:42:44,154] Trial 15 finished with value: 0.9461469800654567 and parameters: {'learning_rate': 1.5404335125118692e-05, 'batch_size': 8, 'weight_decay': 0.08191162928395553}. Best is trial 9 with value: 0.94882475453734.
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 5e-5)
  weight_decay = trial.suggest_uniform("weight_decay", 0.0, 0.1)
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.552442,0.944362
2,No log,0.561109,0.941982


[I 2024-11-10 16:44:03,162] Trial 16 finished with value: 0.9419815531091937 and parameters: {'learning_rate': 1.8396705709198982e-05, 'batch_size': 4, 'weight_decay': 0.0994108976863678}. Best is trial 9 with value: 0.94882475453734.
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 5e-5)
  weight_decay = trial.suggest_uniform("weight_decay", 0.0, 0.1)
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy
0,No log,0.622568,0.943469
1,No log,0.579676,0.944659


[I 2024-11-10 16:44:43,355] Trial 17 finished with value: 0.9446593275810771 and parameters: {'learning_rate': 3.9332048011524026e-05, 'batch_size': 8, 'weight_decay': 0.07086966588645473}. Best is trial 9 with value: 0.94882475453734.
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 5e-5)
  weight_decay = trial.suggest_uniform("weight_decay", 0.0, 0.1)
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy
0,No log,0.552884,0.950015
1,No log,0.561755,0.946445


[I 2024-11-10 16:45:29,937] Trial 18 finished with value: 0.9464445105623326 and parameters: {'learning_rate': 3.091434572180203e-05, 'batch_size': 8, 'weight_decay': 0.052744756597826396}. Best is trial 9 with value: 0.94882475453734.
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 5e-5)
  weight_decay = trial.suggest_uniform("weight_decay", 0.0, 0.1)
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.497013,0.949122
2,No log,0.456063,0.946445


[I 2024-11-10 16:46:43,997] Trial 19 finished with value: 0.9464445105623326 and parameters: {'learning_rate': 3.991226373609179e-05, 'batch_size': 4, 'weight_decay': 0.08570473266855978}. Best is trial 9 with value: 0.94882475453734.


In [46]:
print(study.best_params)

{'learning_rate': 4.6204380410082856e-05, 'batch_size': 4, 'weight_decay': 0.08729508072711857}


In [45]:
# Retrieve the best hyperparameters
best_params = study.best_params

# Set up TrainingArguments with the best hyperparameters
training_args = TrainingArguments(
    output_dir="./final_model",
    evaluation_strategy="epoch",
    learning_rate=best_params["learning_rate"],
    per_device_train_batch_size=best_params["batch_size"],
    per_device_eval_batch_size=best_params["batch_size"],
    num_train_epochs=2,  # You may increase this for final training if needed
    weight_decay=best_params["weight_decay"],
    logging_dir="./logs_final",
    report_to="tensorboard",
    gradient_accumulation_steps=2,
    fp16=True,
)

# Initialize the Trainer with best parameters
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,  # Full training dataset
    eval_dataset=val_dataset,  # Full validation dataset for final evaluation during training
    compute_metrics=compute_metrics,
    data_collator=data_collator,
)

# Retrain the model on the full training set
trainer.train()

# Evaluate on the test set
test_results = trainer.evaluate(test_dataset)

# Print final test results
print("Test set results:", test_results)


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.516397,0.947842
2,No log,0.524311,0.946018


Test set results: {'eval_loss': 0.34223097562789917, 'eval_accuracy': 0.9643040428807664, 'eval_runtime': 3.1307, 'eval_samples_per_second': 275.978, 'eval_steps_per_second': 68.994, 'epoch': 2.0}


Extend the evaluation function

In [16]:
pip install seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16161 sha256=cb878261cdb6923b13147aef8ed88774e82d5da624043029ef5063b2ce67b96a
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [47]:
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report
import torch

# Define a label map for easier access to label names
label_map = {i: label for i, label in enumerate(model.config.id2label)}

def compute_metrics(p):
    predictions, labels = p
    predictions = torch.tensor(predictions) if not isinstance(predictions, torch.Tensor) else predictions
    labels = torch.tensor(labels) if not isinstance(labels, torch.Tensor) else labels

    # Get predicted labels and filter out padding (-100)
    predictions = torch.argmax(predictions, axis=-1)

    true_labels = []
    pred_labels = []

    # Convert predictions and labels to string labels using label_map, filtering out padding tokens
    for i in range(len(labels)):
        true_labels_seq = []
        pred_labels_seq = []
        for j in range(len(labels[i])):
            if labels[i][j] != -100:  # Only process tokens that are not padding
                true_label = label_map[int(labels[i][j].item())] if isinstance(labels[i][j], torch.Tensor) else label_map[labels[i][j]]
                pred_label = label_map[int(predictions[i][j].item())] if isinstance(predictions[i][j], torch.Tensor) else label_map[predictions[i][j]]
                true_labels_seq.append(str(true_label))
                pred_labels_seq.append(str(pred_label))
        true_labels.append(true_labels_seq)
        pred_labels.append(pred_labels_seq)

    # Debug print to verify true_labels and pred_labels structure
    print("Sample true_labels (first 3):", true_labels[:3])
    print("Sample pred_labels (first 3):", pred_labels[:3])

    # Compute metrics using seqeval
    try:
        precision = precision_score(true_labels, pred_labels)
        recall = recall_score(true_labels, pred_labels)
        f1 = f1_score(true_labels, pred_labels)
        report = classification_report(true_labels, pred_labels)
        print("Classification Report:\n", report)
    except Exception as e:
        print(f"Error calculating metrics: {e}")
        return {}

    return {
        "precision": precision,
        "recall": recall,
        "f1": f1
    }

# Set up the Trainer with the updated compute_metrics function
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
)

# Train the model
trainer.train()

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.50947,0.60019,0.684078,0.639394
2,No log,0.50108,0.600371,0.699207,0.646031


Sample true_labels (first 3): [['12', '4', '12', '12', '12', '12', '12', '12', '12', '12', '12', '12', '12'], ['12', '12', '12', '12', '12', '12'], ['12', '12', '12', '12', '12', '12', '12', '12', '12', '12', '12', '12', '12', '12', '12', '12', '4', '12', '12', '12', '12', '4', '10', '12', '12']]
Sample pred_labels (first 3): [['12', '4', '12', '12', '12', '12', '12', '12', '12', '12', '12', '12', '12'], ['12', '12', '12', '12', '12', '12'], ['12', '12', '12', '12', '12', '12', '12', '12', '12', '12', '12', '7', '12', '12', '12', '12', '12', '12', '12', '12', '12', '12', '12', '12', '12']]




Classification Report:
               precision    recall  f1-score   support

           0       0.57      0.57      0.57       127
           1       0.00      0.00      0.00         9
           2       0.60      0.68      0.64      1719
           _       0.60      0.72      0.65       921

   micro avg       0.60      0.68      0.64      2776
   macro avg       0.44      0.49      0.47      2776
weighted avg       0.60      0.68      0.64      2776

Sample true_labels (first 3): [['12', '4', '12', '12', '12', '12', '12', '12', '12', '12', '12', '12', '12'], ['12', '12', '12', '12', '12', '12'], ['12', '12', '12', '12', '12', '12', '12', '12', '12', '12', '12', '12', '12', '12', '12', '12', '4', '12', '12', '12', '12', '4', '10', '12', '12']]
Sample pred_labels (first 3): [['12', '4', '12', '12', '12', '12', '12', '12', '12', '12', '12', '12', '12'], ['12', '12', '12', '12', '12', '12'], ['12', '12', '12', '12', '12', '12', '12', '12', '12', '12', '12', '7', '12', '12', '12', '12',



Classification Report:
               precision    recall  f1-score   support

           0       0.63      0.61      0.62       127
           1       0.00      0.00      0.00         9
           2       0.60      0.69      0.64      1719
           _       0.59      0.74      0.66       921

   micro avg       0.60      0.70      0.65      2776
   macro avg       0.46      0.51      0.48      2776
weighted avg       0.60      0.70      0.65      2776



TrainOutput(global_step=498, training_loss=0.0062527460266787365, metrics={'train_runtime': 121.7657, 'train_samples_per_second': 32.719, 'train_steps_per_second': 4.09, 'total_flos': 112098262098288.0, 'train_loss': 0.0062527460266787365, 'epoch': 2.0})

In [28]:
# Evaluate the model on the test set
test_results = trainer.evaluate(test_dataset)
print("Test set results:", test_results)


Sample true_labels (first 3): [['12', '12', '12', '12', '12', '12', '12', '12', '12', '12', '12', '12', '12', '12', '12', '12', '12', '12', '12', '12', '12', '12', '12', '12', '12'], ['12', '12', '12', '12', '12', '12', '12', '12', '12'], ['12', '12', '12', '12', '12', '12']]
Sample pred_labels (first 3): [['12', '12', '12', '12', '12', '12', '12', '12', '12', '12', '12', '12', '12', '12', '12', '12', '12', '12', '12', '12', '12', '12', '12', '12', '12'], ['12', '12', '12', '12', '12', '12', '12', '12', '12'], ['12', '12', '12', '12', '12', '12']]
Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.74      0.76        74
           2       0.68      0.75      0.71      1549
           _       0.60      0.75      0.67       735

   micro avg       0.66      0.75      0.70      2358
   macro avg       0.69      0.75      0.71      2358
weighted avg       0.66      0.75      0.70      2358

Test set results: {'eval_loss': 0.4911172

Macro-average F1 score

In [48]:
from sklearn.metrics import precision_score, recall_score, f1_score
import torch
import numpy as np

# Define the function to compute metrics
def compute_metrics(p):
    predictions, labels = p
    predictions = torch.tensor(predictions) if not isinstance(predictions, torch.Tensor) else predictions
    labels = torch.tensor(labels) if not isinstance(labels, torch.Tensor) else labels

    # Get predicted labels by taking argmax (this is for token classification)
    predictions = torch.argmax(predictions, axis=-1)

    # Convert predictions and labels to numpy arrays for metric computation
    predictions = predictions.cpu().numpy()
    labels = labels.cpu().numpy()

    # Filter out padding (labels == -100)
    active_predictions = predictions[labels != -100]
    active_labels = labels[labels != -100]

    # Store true labels and predicted labels for later use
    true_labels = active_labels
    pred_labels = active_predictions

    # Compute Accuracy (optional)
    accuracy = (active_predictions == active_labels).sum().item() / active_labels.size  # Use .size instead of .numel()

    # Compute Precision, Recall, F1 Score per class
    precision = precision_score(true_labels, pred_labels, average=None, labels=np.unique(true_labels))
    recall = recall_score(true_labels, pred_labels, average=None, labels=np.unique(true_labels))
    f1 = f1_score(true_labels, pred_labels, average=None, labels=np.unique(true_labels))

    # Compute Macro and Micro F1 scores
    macro_f1 = f1.mean()  # Macro-average F1 score
    micro_f1 = f1_score(true_labels, pred_labels, average='micro')  # Micro-average F1 score

    # Log all metrics
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")
    print(f"Macro F1: {macro_f1}")
    print(f"Micro F1: {micro_f1}")

    # Return the metrics for logging
    return {
        "accuracy": accuracy,
        "macro_f1": macro_f1,
        "micro_f1": micro_f1
    }

# Set up Trainer with the compute_metrics function
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,  # Your training dataset
    eval_dataset=val_dataset,  # Your validation dataset
    compute_metrics=compute_metrics,  # Function to compute metrics
    data_collator=data_collator,  # This will handle padding for you
)

# Evaluate the model
eval_result = trainer.evaluate()


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Precision: [0.49698795 0.39849624 0.80921053 0.48484848 0.76785714 0.38461538
 0.47058824 0.33333333 0.77419355 0.         0.93809524 1.
 0.97854536]
Recall: [0.6547619  0.65030675 0.82550336 0.4        0.89317507 0.64516129
 0.22068966 0.3        0.7826087  0.         0.69858156 0.35714286
 0.9732182 ]
F1 Score: [0.56506849 0.49417249 0.81727575 0.43835616 0.82578875 0.48192771
 0.30046948 0.31578947 0.77837838 0.         0.80081301 0.52631579
 0.97587451]
Macro F1: 0.5630946157547269
Micro F1: 0.9465313800376218
