In [1]:
#1. build our own dataset

In [2]:
from datasets import Dataset, DatasetDict
import pandas as pd

# map POS to POS_id
cnt = 0
POS_id = {}
POS_ls = ['NN', 'IN', 'NNP', 'DT', 'NNS', 'JJ', 'COMMA', 'CD', '.', 'VBD', 'RB','VB', 'CC', 'VBN', 'VBZ', 
          'VBG', 'TO', 'PRP', 'VBP', 'POS', 'PRP$','MD', '$', '``', "''", 'WDT', ':', 'JJR', 'RP', 'RBR', 
          'WP', 'NNPS','JJS', ')', '(', 'EX', 'RBS', 'WRB', '-', 'UH', 'WP$', 'PDT', '/', '#', 'LS', 'SYM', 'FW', 'AUX']
for pos in POS_ls:
    POS_id[pos] = cnt
    cnt += 1

# map BIO to BIO_id
cnt = 0
BIO_id = {}
BIO_ls = ['O', 'B-NP', 'I-NP', 'B-PP', 'B-ADVP', 'B-ADJP', 'B-SBAR', 'B-CONJP',
       'I-ADJP', 'I-PP', 'I-ADVP', 'I-CONJP', 'B-INTJ', 'I-SBAR', 'B-LST',
       'B-VP', 'B-PRT', 'I-INTJ', 'I-VP']
for bio in BIO_ls:
    BIO_id[bio] = cnt
    cnt += 1

# map label to BIO_id
Label_id = {"ARG0":0,"ARG1":1,"ARG2":2,"PRED":3,"SUPPORT":4}
def mapLabel(label):
    return Label_id[label] if label in Label_id else 5

# build datasets
def condense_df(file):
    df = pd.DataFrame()
    with open(file, 'r') as file:
        ls = [i.split('\t') for i in file.read().split('\n')]
        df = pd.DataFrame(ls)

    df['id'] = df.index
    df[0].replace('', None, inplace=True)
    df.dropna(axis=0, subset = [0], inplace = True)
    df['BIO'] = df[2].map(BIO_id)
    df['POS'] = df[1].map(POS_id)
    df['label'] = df[5].map(mapLabel)
    df['id'] = df[4].map(int)
    df.drop(columns = [1, 2, 3, 4, 5, 6], inplace = True)
    condense = df.groupby('id').apply(lambda x: [list(x[0]),list(x['POS']), list(x['BIO']), list(x['label'])]).apply(pd.Series)
    condense.columns =['tokens','POS','BIO','label']
    return condense


train = Dataset.from_pandas(condense_df("Partitive-Files/%_nombank.clean.train"))
eval_ = Dataset.from_pandas(condense_df("Partitive-Files/%_nombank.clean.dev"))
test = Dataset.from_pandas(condense_df("Partitive-Files/%_nombank.clean.test"))
datasets = DatasetDict({"train": train, "validation":eval_, "test":test})
datasets

DatasetDict({
    train: Dataset({
        features: ['tokens', 'POS', 'BIO', 'label', 'id'],
        num_rows: 2174
    })
    validation: Dataset({
        features: ['tokens', 'POS', 'BIO', 'label', 'id'],
        num_rows: 83
    })
    test: Dataset({
        features: ['tokens', 'POS', 'BIO', 'label', 'id'],
        num_rows: 150
    })
})

In [3]:
# 2. tokenize

In [4]:
from transformers import AutoTokenizer

model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if not word_id:
            new_labels.append(-100)
        else:
            if word_id != current_word:# Start of a new word!
                current_word = word_id       
            label = labels[word_id]
            new_labels.append(label)

    return new_labels

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    all_labels = examples["label"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

tokenized_datasets = datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns = datasets["train"].column_names,
)

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [5]:
# 3. train while evaluate

In [6]:
small_train = tokenized_datasets["train"].shuffle(seed=42).select(range(100))
small_eval = tokenized_datasets["validation"].shuffle(seed=42).select(range(30))
small_test = tokenized_datasets["test"].shuffle(seed=42).select(range(30))

In [7]:
label_names = ["ARG0", "ARG1", "ARG2", "PRED", "SUPPORT", "None"]

In [8]:
from transformers import DataCollatorForTokenClassification, AutoModelForTokenClassification, TrainingArguments, Trainer
import evaluate
import numpy as np

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
metric = evaluate.load("seqeval")

def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }
id2label = {str(i): label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}

model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)

args = TrainingArguments(
    output_dir = "bert-finetuned",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
)

trainer = Trainer(
    model = model,
    args = args,
    train_dataset = tokenized_datasets["train"],#small_train,#
    eval_dataset = tokenized_datasets["validation"],#small_eval,#
    data_collator = data_collator,
    compute_metrics = compute_metrics,
    tokenizer = tokenizer,
)
trainer.train()



Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cas

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.069533,0.725177,0.834694,0.776091,0.969301
2,0.085500,0.065171,0.776876,0.781633,0.779247,0.971418
3,0.085500,0.065721,0.749071,0.822449,0.784047,0.971066


***** Running Evaluation *****
  Num examples = 83
  Batch size = 8
Saving model checkpoint to bert-finetuned/checkpoint-272
Configuration saved in bert-finetuned/checkpoint-272/config.json
Model weights saved in bert-finetuned/checkpoint-272/pytorch_model.bin
tokenizer config file saved in bert-finetuned/checkpoint-272/tokenizer_config.json
Special tokens file saved in bert-finetuned/checkpoint-272/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 83
  Batch size = 8
Saving model checkpoint to bert-finetuned/checkpoint-544
Configuration saved in bert-finetuned/checkpoint-544/config.json
Model weights saved in bert-finetuned/checkpoint-544/pytorch_model.bin
tokenizer config file saved in bert-finetuned/checkpoint-544/tokenizer_config.json
Special tokens file saved in bert-finetuned/checkpoint-544/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 83
  Batch size = 8
Saving model checkpoint to bert-finetuned/checkpoint-816
Configuration saved i

TrainOutput(global_step=816, training_loss=0.06685576251908845, metrics={'train_runtime': 577.1336, 'train_samples_per_second': 11.301, 'train_steps_per_second': 1.414, 'total_flos': 207936455044752.0, 'train_loss': 0.06685576251908845, 'epoch': 3.0})

In [11]:
# loading the model you previously trained
model = AutoModelForTokenClassification.from_pretrained("bert-finetuned/checkpoint-816")
BATCH_SIZE = 1
# arguments for Trainer
test_args = TrainingArguments(
    output_dir = "bert-finetuned-testing",
    do_train = False,
    do_predict = True,
    per_device_eval_batch_size = BATCH_SIZE,   
    dataloader_drop_last = False    
)

# init trainer
trainer = Trainer(
          model = model, 
          args = test_args, 
          compute_metrics = compute_metrics)

test_results = trainer.predict(tokenized_datasets["test"])

loading configuration file bert-finetuned/checkpoint-816/config.json
Model config BertConfig {
  "_name_or_path": "bert-finetuned/checkpoint-816",
  "architectures": [
    "BertForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "ARG0",
    "1": "ARG1",
    "2": "ARG2",
    "3": "PRED",
    "4": "SUPPORT",
    "5": "None"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "ARG0": "0",
    "ARG1": "1",
    "ARG2": "2",
    "None": "5",
    "PRED": "3",
    "SUPPORT": "4"
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.24.0",
  "type_vocab_size": 2,
  "use_cache": 

In [12]:
test_results.metrics

{'test_loss': 0.05072753131389618,
 'test_precision': 0.801123595505618,
 'test_recall': 0.8176605504587156,
 'test_f1': 0.8093076049943245,
 'test_accuracy': 0.9759927797833935,
 'test_runtime': 7.1178,
 'test_samples_per_second': 21.074,
 'test_steps_per_second': 21.074}