In [1]:
#1. build our own dataset

In [2]:
from datasets import Dataset, DatasetDict
import pandas as pd
import numpy as np

# map POS to POS_id
cnt = 0
POS_id = {}
POS_ls = ['NN', 'IN', 'NNP', 'DT', 'NNS', 'JJ', 'COMMA', 'CD', '.', 'VBD', 'RB','VB', 'CC', 'VBN', 'VBZ', 
          'VBG', 'TO', 'PRP', 'VBP', 'POS', 'PRP$','MD', '$', '``', "''", 'WDT', ':', 'JJR', 'RP', 'RBR', 
          'WP', 'NNPS','JJS', ')', '(', 'EX', 'RBS', 'WRB', '-', 'UH', 'WP$', 'PDT', '/', '#', 'LS', 'SYM', 'FW', 'AUX']
for pos in POS_ls:
    POS_id[pos] = cnt
    cnt += 1

# map BIO to BIO_id
cnt = 0
BIO_id = {}
BIO_ls = ['O', 'B-NP', 'I-NP', 'B-PP', 'B-ADVP', 'B-ADJP', 'B-SBAR', 'B-CONJP',
       'I-ADJP', 'I-PP', 'I-ADVP', 'I-CONJP', 'B-INTJ', 'I-SBAR', 'B-LST',
       'B-VP', 'B-PRT', 'I-INTJ', 'I-VP']
for bio in BIO_ls:
    BIO_id[bio] = cnt
    cnt += 1

# map label to BIO_id
Label_id = {"ARG0":0,"ARG1":1,"ARG2":2,"PRED":3,"SUPPORT":4}
def mapLabel(label):
    return Label_id[label] if label in Label_id else 5

# build datasets
def condense_df(file):
    df = pd.DataFrame()
    with open(file, 'r') as file:
        ls = [i.split('\t') for i in file.read().split('\n')]
        df = pd.DataFrame(ls)

    df['id'] = df.index
    df[0].replace('', np.nan, inplace=True)
    df.dropna(axis=0, subset = [0], inplace = True)
    df['BIO'] = df[2].map(BIO_id)
    df['POS'] = df[1].map(POS_id)
    df['label'] = df[5].map(mapLabel)
    df['id'] = df[4].map(int)
    df.drop(columns = [1, 2, 3, 4, 5, 6], inplace = True)
    condense = df.groupby('id').apply(lambda x: [list(x[0]),list(x['POS']), list(x['BIO']), list(x['label'])]).apply(pd.Series)
    condense.columns =['tokens','POS','BIO','label']
    return condense


train = Dataset.from_pandas(condense_df("Partitive-Files/%_nombank.clean.train"))
eval_ = Dataset.from_pandas(condense_df("Partitive-Files/%_nombank.clean.dev"))
test = Dataset.from_pandas(condense_df("Partitive-Files/%_nombank.clean.test"))
datasets = DatasetDict({"train": train, "validation":eval_, "test":test})
datasets

DatasetDict({
    train: Dataset({
        features: ['tokens', 'POS', 'BIO', 'label', 'id'],
        num_rows: 2174
    })
    validation: Dataset({
        features: ['tokens', 'POS', 'BIO', 'label', 'id'],
        num_rows: 83
    })
    test: Dataset({
        features: ['tokens', 'POS', 'BIO', 'label', 'id'],
        num_rows: 150
    })
})

In [3]:
# 2. tokenize

In [4]:
from transformers import AutoTokenizer

model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if not word_id:
            new_labels.append(6)
        else:
            if word_id != current_word:# Start of a new word!
                current_word = word_id       
            label = labels[word_id]
            new_labels.append(label)

    return new_labels

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    all_labels = examples["label"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

tokenized_datasets = datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns = datasets["train"].column_names,
)

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [5]:
# 3. train while evaluate

In [6]:
from transformers import DataCollatorForTokenClassification, AutoModelForTokenClassification, TrainingArguments, Trainer
import evaluate
import numpy as np

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
metric = evaluate.load("seqeval")

small_train = tokenized_datasets["train"].shuffle(seed=42).select(range(100))
small_eval = tokenized_datasets["validation"].shuffle(seed=42).select(range(30))
small_test = tokenized_datasets["test"].shuffle(seed=42).select(range(30))

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

from torch.utils.data import DataLoader
train_dataloader = DataLoader(
    tokenized_datasets["train"], shuffle=True, batch_size=1, collate_fn=data_collator
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"], batch_size=1, collate_fn=data_collator
)
test_dataloader = DataLoader(
    tokenized_datasets["test"], batch_size=1, collate_fn=data_collator
)

In [7]:
label_names = ["ARG0", "ARG1", "ARG2", "PRED", "SUPPORT", "None", "-100"]

In [8]:
# def compute_metrics(eval_preds):
#     logits, labels = eval_preds
#     predictions = np.argmax(logits, axis=-1)

#     # Remove ignored index (special tokens) and convert to labels
#     true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
#     true_predictions = [
#         [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
#         for prediction, label in zip(predictions, labels)
#     ]
#     all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
#     return {
#         "precision": all_metrics["overall_precision"],
#         "recall": all_metrics["overall_recall"],
#         "f1": all_metrics["overall_f1"],
#         "accuracy": all_metrics["overall_accuracy"],
#     }
# id2label = {str(i): label for i, label in enumerate(label_names)}
# label2id = {v: k for k, v in id2label.items()}

# model = AutoModelForTokenClassification.from_pretrained(
#     model_checkpoint,
#     id2label=id2label,
#     label2id=label2id,
#     output_attentions=True,
#     output_hidden_states=True
# )

# args = TrainingArguments(
#     output_dir = "bert-finetuned",
#     evaluation_strategy="epoch",
#     save_strategy="epoch",
#     learning_rate=2e-5,
#     num_train_epochs=3,
#     weight_decay=0.01,
# )

# trainer = Trainer(
#     model = model,
#     args = args,
#     train_dataset = tokenized_datasets["train"],#small_train,#
#     eval_dataset = tokenized_datasets["validation"],#small_eval,#
#     data_collator = data_collator,
#     compute_metrics = compute_metrics,
#     tokenizer = tokenizer,
# )




In [9]:
from transformers import AutoConfig, AutoModel
from transformers.modeling_outputs import TokenClassifierOutput

import torch.nn as nn
class CustomModel(nn.Module):
    def __init__(self,checkpoint,num_labels): 
        super(CustomModel,self).__init__() 
        self.num_labels = num_labels 

        #Load Model with given checkpoint and extract its body
        self.model = AutoModel.from_pretrained(checkpoint,config=AutoConfig.from_pretrained(checkpoint, output_attentions=True,output_hidden_states=True))
        self.dropout = nn.Dropout(0.1) 
        self.classifier = nn.Linear(768,num_labels) # load and initialize weights

    def forward(self, input_ids=None, token_type_ids=None, attention_mask=None,labels=None):
        #Extract outputs from the body
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
#         print("input_ids:::"+str(input_ids.shape))
#         print("labels:::"+str(labels.shape))
        #Add custom layers
        sequence_output = self.dropout(outputs[0]) #outputs[0]=last hidden state
#         print("sequence_output:::"+str(sequence_output.shape))

        logits = self.classifier(sequence_output[:,:,:].view(-1, 768)) # calculate losses
        self.logits = logits
#         print("logits::"+str(logits.shape))

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        return TokenClassifierOutput(loss=loss, logits=logits, hidden_states=outputs.hidden_states,attentions=outputs.attentions)

In [10]:
import torch
from tqdm import tqdm
from transformers import AdamW,get_scheduler
from datasets import load_metric
metric = evaluate.load("seqeval")

num_epochs = 3
model_cc = CustomModel(checkpoint=model_checkpoint,num_labels=len(label_names)).cuda()
optimizer = AdamW(model_cc.parameters(), lr=2e-5)
num_training_steps = num_epochs * len(train_dataloader)
progress_bar_train = tqdm(range(num_training_steps))
progress_bar_eval = tqdm(range(num_epochs * len(eval_dataloader)))
f1_best = 0
resume_flag = True
best_net = None

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

for epoch in range(num_epochs):
    if resume_flag:
        model_cc.load_state_dict(torch.load("sstcls_best.dat"))
    model_cc.train()
    for batch in train_dataloader:
        batch = {k: v.cuda() for k, v in batch.items()}
        outputs = model_cc(batch['input_ids'], token_type_ids=batch['token_type_ids'], attention_mask=batch['attention_mask'], labels=batch['labels'])
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar_train.update(1)
    model_cc.eval()
    f1_now = []
    for batch in eval_dataloader:
        batch = {k: v.cuda() for k, v in batch.items()}
        with torch.no_grad():
            outputs = model_cc(batch['input_ids'], token_type_ids=batch['token_type_ids'], attention_mask=batch['attention_mask'], labels=batch['labels'])

        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)

        true_labels = [[label_names[l] for l in label if l != 6] for label in batch["labels"]]
        true_predictions = [
            [label_names[p] for (p, l) in zip(prediction, label) if l != 6]
            for prediction, label in zip([predictions], batch["labels"])
        ]
        
        all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
        metric.add_batch(predictions=true_predictions, references=true_labels)
        progress_bar_eval.update(1)
        f1_now.append(all_metrics["overall_f1"])
        
    if np.mean(f1_now) > f1_best:
        torch.save(model_cc.state_dict(), 'sstcls_best.dat')
        f1_best = np.mean(f1_now)
        print("the best f1 is now: "+ str(np.mean(f1_now)))
        best_net = model_cc
        

    print(metric.compute())
        


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  0%|                                                 | 0/6522 [00:00<?, ?it/s]
  0%|                                                  | 0/249 [00:00<?, ?it/

the best f1 is now: 0.8013660748738027
{'RED': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1}, 'one': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 2}, 'overall_precision': 1.0, 'overall_recall': 1.0, 'overall_f1': 1.0, 'overall_accuracy': 1.0}


 36%|█████████████▋                        | 2340/6522 [01:56<03:26, 20.25it/s]
 67%|█████████████████████████▎            | 4347/6522 [03:33<01:44, 20.91it/s][A
 34%|█████████████▊                           | 84/249 [03:33<15:14,  5.54s/it][A
 37%|███████████████▏                         | 92/249 [03:33<09:11,  3.51s/it][A
 40%|████████████████                        | 100/249 [03:33<05:44,  2.31s/it][A
 43%|█████████████████▎                      | 108/249 [03:33<03:39,  1.56s/it][A
 47%|██████████████████▋                     | 116/249 [03:34<02:21,  1.06s/it][A
 50%|███████████████████▉                    | 124/249 [03:34<01:31,  1.36it/s][A
 53%|█████████████████████▏                  | 132/249 [03:34<00:59,  1.96it/s][A
 56%|██████████████████████▍                 | 140/249 [03:34<00:39,  2.79it/s][A
 59%|███████████████████████▊                | 148/249 [03:34<00:25,  3.95it/s][A
 63%|█████████████████████████               | 156/249 [03:34<00:16,  5.54it/s][A
 66%|██

{'RED': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1}, 'one': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 2}, 'overall_precision': 1.0, 'overall_recall': 1.0, 'overall_f1': 1.0, 'overall_accuracy': 1.0}


 70%|██████████████████████████▋           | 4584/6522 [03:46<01:32, 20.87it/s]
100%|██████████████████████████████████████| 6522/6522 [05:20<00:00, 20.77it/s][A
 67%|██████████████████████████▊             | 167/249 [05:20<06:51,  5.01s/it][A

 73%|█████████████████████████████▍          | 183/249 [05:20<02:25,  2.20s/it][A
 77%|██████████████████████████████▋         | 191/249 [05:20<01:26,  1.50s/it][A
 80%|███████████████████████████████▉        | 199/249 [05:20<00:51,  1.03s/it][A
 83%|█████████████████████████████████▎      | 207/249 [05:20<00:30,  1.40it/s][A
 86%|██████████████████████████████████▌     | 215/249 [05:20<00:16,  2.00it/s][A
 90%|███████████████████████████████████▊    | 223/249 [05:21<00:09,  2.84it/s][A
 93%|█████████████████████████████████████   | 231/249 [05:21<00:04,  4.02it/s][A
 96%|██████████████████████████████████████▍ | 239/249 [05:21<00:01,  5.63it/s][A
 99%|███████████████████████████████████████▋| 247/249 [05:21<00:00,  7.80it/s][A

{'RED': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1}, 'one': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 2}, 'overall_precision': 1.0, 'overall_recall': 1.0, 'overall_f1': 1.0, 'overall_accuracy': 1.0}


In [11]:
# trainer.train()

In [12]:
# # loading the model you previously trained
# model = AutoModelForTokenClassification.from_pretrained("bert-finetuned/checkpoint-816")
# BATCH_SIZE = 1
# # arguments for Trainer
# test_args = TrainingArguments(
#     output_dir = "bert-finetuned-testing",
#     do_train = False,
#     do_predict = True,
#     per_device_eval_batch_size = BATCH_SIZE,   
#     dataloader_drop_last = False    
# )

# # init trainer
# trainer = Trainer(
#           model = model, 
#           args = test_args, 
#           compute_metrics = compute_metrics)

# test_results = trainer.predict(tokenized_datasets["test"])

In [13]:
def predict(net, dataloader, gpu):
    net.eval()

    f1_now = []
    precision_now = []
    recall_now = []
    accuracy_now = []
    for batch in eval_dataloader:
        batch = {k: v.cuda() for k, v in batch.items()}
        with torch.no_grad():
            outputs = model_cc(batch['input_ids'], token_type_ids=batch['token_type_ids'], attention_mask=batch['attention_mask'], labels=batch['labels'])

        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)

        true_labels = [[label_names[l] for l in label if l != 6] for label in batch["labels"]]
        true_predictions = [
            [label_names[p] for (p, l) in zip(prediction, label) if l != 6]
            for prediction, label in zip([predictions], batch["labels"])
        ]
        
        all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
        metric.add_batch(predictions=true_predictions, references=true_labels)
        progress_bar_eval.update(1)
        f1_now.append(all_metrics["overall_f1"])
        precision_now.append(all_metrics["overall_precision"])
        recall_now.append(all_metrics["overall_recall"])
        accuracy_now.append(all_metrics["overall_accuracy"])


    return np.mean(f1_now), np.mean(precision_now), np.mean(recall_now), np.mean(accuracy_now),



In [14]:
preds = predict(best_net, test_dataloader, 0)
print("test_f1_score:::"+str(preds[0]))
print("test_precision_score:::"+str(preds[1]))
print("test_recall_score:::"+str(preds[2]))
print("test_accuracy_score:::"+str(preds[3]))


255it [05:21, 10.47it/s]                                                       [A
263it [05:21, 14.13it/s][A
271it [05:21, 18.64it/s][A
279it [05:21, 24.03it/s][A
287it [05:21, 29.84it/s][A
295it [05:22, 36.20it/s][A
303it [05:22, 42.48it/s][A
311it [05:22, 48.83it/s][A
319it [05:22, 55.11it/s][A
327it [05:22, 58.97it/s][A

test_f1_score:::0.7975521782938886
test_precision_score:::0.7754407642518912
test_recall_score:::0.8283199230488388
test_accuracy_score:::0.967577520670473


100%|██████████████████████████████████████| 6522/6522 [05:36<00:00, 20.77it/s]
332it [05:36, 58.97it/s][A