In [1]:
#1. build our own dataset

In [2]:
from datasets import Dataset, DatasetDict
from transformers import DataCollatorForTokenClassification, AutoModelForTokenClassification, TrainingArguments, Trainer
import pandas as pd
import numpy as np

# Map every part-of-speech tag to a dense integer id (its position in POS_ls).
POS_ls = ['NN', 'IN', 'NNP', 'DT', 'NNS', 'JJ', 'COMMA', 'CD', '.', 'VBD', 'RB','VB', 'CC', 'VBN', 'VBZ', 
          'VBG', 'TO', 'PRP', 'VBP', 'POS', 'PRP$','MD', '$', '``', "''", 'WDT', ':', 'JJR', 'RP', 'RBR', 
          'WP', 'NNPS','JJS', ')', '(', 'EX', 'RBS', 'WRB', '-', 'UH', 'WP$', 'PDT', '/', '#', 'LS', 'SYM', 'FW', 'AUX']
# enumerate() replaces the manual counter, so no scratch variable leaks into the kernel namespace.
POS_id = {tag: index for index, tag in enumerate(POS_ls)}

# Map every BIO chunk tag to a dense integer id (its position in BIO_ls).
BIO_ls = ['O', 'B-NP', 'I-NP', 'B-PP', 'B-ADVP', 'B-ADJP', 'B-SBAR', 'B-CONJP',
       'I-ADJP', 'I-PP', 'I-ADVP', 'I-CONJP', 'B-INTJ', 'I-SBAR', 'B-LST',
       'B-VP', 'B-PRT', 'I-INTJ', 'I-VP']
# enumerate() replaces the manual counter, so no scratch variable leaks into the kernel namespace.
BIO_id = {tag: index for index, tag in enumerate(BIO_ls)}

In [3]:
# map each label string (ARG0, ARG1, ...) to its integer id; any other value maps to 5
import glob
Label_id = {
    "ARG0": 0,
    "ARG1": 1,
    "ARG2": 2,
    "PRED": 3,
    "SUPPORT": 4,
}


def mapLabel(label):
    """Return the integer id of `label`, or 5 when it is not a key of Label_id."""
    return Label_id.get(label, 5)

# build datasets
def condense_df(target):
    """Load tab-separated annotation files and collapse them to one row per sentence.

    Args:
        target: glob pattern (relative to ``nombank_train_dev_test/``) selecting
            the files to read, e.g. ``"*.train"``.

    Returns:
        A DataFrame indexed by ``id`` (integer value of input column 4) with the
        list-valued columns ``tokens`` (column 0), ``BIO`` (column 2 mapped
        through BIO_id), ``POS`` (column 1 mapped through POS_id) and ``label``
        (column 5 mapped through mapLabel).

    Raises:
        FileNotFoundError: if no file matches the pattern.
    """
    frames = []
    for path in glob.glob("nombank_train_dev_test/" + target):
        # Distinct names for the path and the handle: the original rebound
        # the loop variable to the open file object.
        with open(path, 'r') as handle:
            rows = [line.split('\t') for line in handle.read().split('\n')]
        frames.append(pd.DataFrame(rows))

    if not frames:
        raise FileNotFoundError(
            "no files match 'nombank_train_dev_test/" + target + "'"
        )

    # NOTE(review): rows from all matching files are grouped by column 4 only;
    # if several files reuse the same ids their rows end up in one group - confirm.
    df = pd.concat(frames)

    df[0] = df[0].replace('', None)
    # Rows without a value in column 4 (e.g. blank separator lines) are discarded.
    df = df.dropna(axis=0, subset=[4])
    df['BIO'] = df[2].map(BIO_id)
    df['POS'] = df[1].map(POS_id)
    df['label'] = df[5].map(mapLabel)
    df['id'] = df[4].map(int)
    df = df.drop(columns=[1, 2, 3, 4, 5, 6])

    # The list order here must match the column names assigned below. The
    # original emitted POS before BIO while naming the columns BIO, POS, which
    # silently swapped the two features.
    condense = df.groupby('id').apply(
        lambda group: [
            list(group[0]),
            list(group['BIO']),
            list(group['POS']),
            list(group['label']),
        ]
    ).apply(pd.Series)
    condense.columns = ['tokens', 'BIO', 'POS', 'label']
    return condense


# Build one Dataset per split; the file extension selects the split.
train, eval_, test = (
    Dataset.from_pandas(condense_df(pattern))
    for pattern in ("*.train", "*.dev", "*.test")
)

datasets = DatasetDict({"train": train, "validation": eval_, "test": test})
datasets

DatasetDict({
    train: Dataset({
        features: ['tokens', 'BIO', 'POS', 'label', 'id'],
        num_rows: 84169
    })
    validation: Dataset({
        features: ['tokens', 'BIO', 'POS', 'label', 'id'],
        num_rows: 3235
    })
    test: Dataset({
        features: ['tokens', 'BIO', 'POS', 'label', 'id'],
        num_rows: 5382
    })
})

In [4]:
# 2. tokenize

In [20]:

from transformers import AutoTokenizer

model_checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Extra ids (one past the last real tag id) given to tokens that have no source word.
NonePOS, NoneBIO = len(POS_ls), len(BIO_ls)
def align_labels_with_tokens(labels, POSs, BIOs, word_ids):
    """Expand word-level annotations to one entry per tokenizer token.

    Args:
        labels: per-word label ids of one sentence.
        POSs: per-word POS ids of the same sentence.
        BIOs: per-word BIO ids of the same sentence.
        word_ids: for every token, the index of the word it came from, or
            None when the token has no source word.

    Returns:
        A tuple ``(new_labels, POS_labels, BIO_labels)`` of lists, each as long
        as ``word_ids``. Tokens whose word id is None receive -100, NonePOS and
        NoneBIO respectively; every other token copies its word's values.
    """
    new_labels = []
    POS_labels = []
    BIO_labels = []
    for word_id in word_ids:
        # Test for None explicitly: word index 0 is falsy, so `not word_id`
        # wrongly treated every piece of the first word as a wordless token.
        if word_id is None:
            new_labels.append(-100)
            POS_labels.append(NonePOS)
            BIO_labels.append(NoneBIO)
        else:
            new_labels.append(labels[word_id])
            POS_labels.append(POSs[word_id])
            BIO_labels.append(BIOs[word_id])

    return new_labels, POS_labels, BIO_labels

def tokenize_and_align_labels(examples, max_length=48):
    """Tokenize a batch of pre-split sentences and align their annotations.

    Args:
        examples: batch with the list-valued columns ``tokens``, ``BIO``,
            ``POS`` and ``label`` (one inner list per sentence).
        max_length: fixed token length every sequence is truncated/padded to.

    Returns:
        The tokenizer output extended with ``BIOL``, ``POSL`` and ``labels``,
        each holding one aligned list per sentence.
    """
    # Pad to a fixed length rather than to the longest sequence of this map
    # batch: the auxiliary POSL/BIOL features are not tokenizer inputs, so they
    # must already have a uniform length when examples from different map
    # batches are collated together.
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
        padding="max_length",
        max_length=max_length,
    )
    all_BIO = examples["BIO"]
    all_POS = examples["POS"]
    all_labels = examples["label"]

    new_labels = []
    POS_labels = []
    BIO_labels = []
    for i in range(len(all_labels)):
        word_ids = tokenized_inputs.word_ids(i)
        new_label, POS_label, BIO_label = align_labels_with_tokens(
            all_labels[i], all_POS[i], all_BIO[i], word_ids
        )
        new_labels.append(new_label)
        POS_labels.append(POS_label)
        BIO_labels.append(BIO_label)

    tokenized_inputs["BIOL"] = BIO_labels
    tokenized_inputs["POSL"] = POS_labels
    tokenized_inputs["labels"] = new_labels

    return tokenized_inputs

# Tokenize every split in batches; the raw columns are dropped so only the
# tokenizer outputs and the aligned label features remain.
raw_columns = datasets["train"].column_names
tokenized_datasets = datasets.map(tokenize_and_align_labels, batched=True, remove_columns=raw_columns)

  0%|          | 0/85 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

In [6]:
# 3. train while evaluate

In [None]:
import evaluate
import numpy as np

from torch.utils.data import DataLoader

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
metric = evaluate.load("seqeval")

BatchSize = 64

# Only the training split is shuffled; validation and test keep dataset order.
train_dataloader = DataLoader(
    tokenized_datasets["train"],
    shuffle=True,
    batch_size=BatchSize,
    collate_fn=data_collator,
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"],
    batch_size=BatchSize,
    collate_fn=data_collator,
)
test_dataloader = DataLoader(
    tokenized_datasets["test"],
    batch_size=BatchSize,
    collate_fn=data_collator,
)

In [None]:
# Class names indexed by label id.
label_names = ["ARG0", "ARG1", "ARG2", "PRED", "SUPPORT", "None"]
POS_len, BIO_len = len(POS_ls), len(BIO_ls)
# Input width of the classification layer.
# NOTE(review): presumably encoder hidden size + 2 * (POS_len + 1) - confirm before changing.
feature_dim = 866

In [None]:
from transformers import AutoConfig, AutoModel
from transformers.modeling_outputs import TokenClassifierOutput

import torch.nn as nn
class CustomModel(nn.Module):
    """Pretrained encoder plus a linear token classifier over extra one-hot features.

    Each token's last hidden state is concatenated with a one-hot POS vector
    and a one-hot BIO vector, and the result (``feature_dim`` wide) is fed to a
    single linear layer that produces ``num_labels`` logits per token.
    """

    def __init__(self,checkpoint,num_labels): 
        """Load the encoder from `checkpoint` and create the classification head."""
        super(CustomModel,self).__init__() 
        self.num_labels = num_labels 

        # Load the encoder body; attentions and hidden states are requested so
        # they can be passed through in the output of forward().
        self.model = AutoModel.from_pretrained(
            checkpoint,
            config=AutoConfig.from_pretrained(checkpoint, output_attentions=True, output_hidden_states=True),
        )
        self.dropout = nn.Dropout(0.1) 
        self.classifier = nn.Linear(feature_dim,num_labels)

    def forward(self, input_ids, token_type_ids, attention_mask, labels, POS, BIO):
        """Run the encoder, append the one-hot features and classify every token.

        Args:
            input_ids: token ids passed to the encoder.
            token_type_ids: accepted for call-site compatibility; not used.
            attention_mask: attention mask passed to the encoder.
            labels: target label ids, or None to skip the loss.
            POS: integer POS ids used to index the one-hot table.
            BIO: integer BIO ids used to index the one-hot table.

        Returns:
            TokenClassifierOutput whose ``logits`` are flattened to
            ``(-1, num_labels)`` and whose ``loss`` is the cross-entropy against
            ``labels`` (None when ``labels`` is None).
        """
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)

        sequence_output = self.dropout(outputs[0]) #outputs[0]=last hidden state

        # Build the identity table once, on the same device/dtype as the
        # encoder output, so the concatenation also works off-CPU.
        # NOTE(review): both POS and BIO use POS_len+1 columns, which is what
        # feature_dim accounts for - confirm before narrowing the BIO width.
        one_hot_table = torch.eye(
            POS_len+1, dtype=sequence_output.dtype, device=sequence_output.device
        )
        POS_f = one_hot_table[POS]
        BIO_f = one_hot_table[BIO]

        sequence_output = torch.cat((sequence_output,POS_f, BIO_f),2)
        logits = self.classifier(sequence_output.reshape(-1, feature_dim))
        self.logits = logits

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        return TokenClassifierOutput(loss=loss, logits=logits, hidden_states=outputs.hidden_states,attentions=outputs.attentions)

In [None]:
import torch
from tqdm import tqdm
from transformers import AdamW,get_scheduler
from datasets import load_metric
metric = evaluate.load("seqeval")

num_epochs = 3
model_cc = CustomModel(checkpoint=model_checkpoint,num_labels=len(label_names))#.cuda()
optimizer = AdamW(model_cc.parameters(), lr=2e-5)
num_training_steps = num_epochs * len(train_dataloader)
progress_bar_train = tqdm(range(num_training_steps),miniters=2)
progress_bar_eval = tqdm(range(num_epochs * len(eval_dataloader)),miniters=2)
f1_best = 0
resume_flag = False
best_net = None

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

for epoch in range(num_epochs):
    if resume_flag:
        model_cc.load_state_dict(torch.load("sstcls_best.dat"))

    # ---- train for one epoch ----
    model_cc.train()
    for batch in train_dataloader:
        batch = {k: v for k, v in batch.items()}#.cuda()
        outputs = model_cc(batch['input_ids'], token_type_ids=batch['token_type_ids'], attention_mask=batch['attention_mask'],labels=batch['labels'], POS=batch['POSL'], BIO=batch['BIOL'])

        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar_train.update(1)

    # ---- evaluate on the validation split ----
    model_cc.eval()
    for batch in eval_dataloader:
        batch = {k: v for k, v in batch.items()}#.cuda()
        with torch.no_grad():
            outputs = model_cc(batch['input_ids'], token_type_ids=batch['token_type_ids'], attention_mask=batch['attention_mask'],labels=batch['labels'], POS=batch['POSL'], BIO=batch['BIOL'])

        gold = batch["labels"]
        # Reshape to the labels' real shape instead of (BatchSize, -1): the last
        # batch of a split is usually smaller than BatchSize.
        predictions = torch.argmax(outputs.logits, dim=-1).reshape(gold.shape)

        gold_rows = gold.tolist()
        true_labels = [[label_names[l] for l in row if l != -100] for row in gold_rows]
        true_predictions = [
            [label_names[p] for (p, l) in zip(pred_row, gold_row) if l != -100]
            for pred_row, gold_row in zip(predictions.tolist(), gold_rows)
        ]

        # Only accumulate here and compute once per epoch. Calling
        # compute(predictions=...) and then add_batch() on the same batch leaks
        # that batch into the next computation and leaves just the final batch
        # for the epoch-level compute().
        metric.add_batch(predictions=true_predictions, references=true_labels)
        progress_bar_eval.update(1)

    all_metrics = metric.compute()
    f1_now = all_metrics["overall_f1"]

    if best_net is None or f1_now > f1_best:
        torch.save(model_cc.state_dict(), 'sstcls_best.dat')
        f1_best = f1_now
        print("the best f1 is now: "+ str(f1_now))
        best_net = model_cc

    print(all_metrics)

# best_net is the same object as model_cc, so after further epochs it holds the
# latest weights; restore the best checkpoint so it really is the best model.
if best_net is not None:
    best_net.load_state_dict(torch.load('sstcls_best.dat'))

In [None]:
def predict(net, dataloader, gpu):
    """Evaluate `net` on `dataloader` and return overall seqeval scores.

    Args:
        net: model to evaluate; called like CustomModel.forward.
        dataloader: iterable of collated batches providing ``input_ids``,
            ``token_type_ids``, ``attention_mask``, ``labels``, ``POSL`` and
            ``BIOL``.
        gpu: unused; kept so existing call sites keep working.

    Returns:
        Tuple ``(f1, precision, recall, accuracy)`` computed over all batches
        of `dataloader`; four NaNs when the dataloader yields no batch.
    """
    net.eval()

    n_batches = 0
    # Use the arguments, not the globals model_cc / eval_dataloader: the
    # original always scored the training-loop model on the validation split.
    for batch in dataloader:
        batch = {k: v for k, v in batch.items()}#.cuda()
        with torch.no_grad():
            outputs = net(batch['input_ids'], token_type_ids=batch['token_type_ids'], attention_mask=batch['attention_mask'], labels=batch['labels'], POS=batch['POSL'], BIO=batch['BIOL'])

        gold = batch["labels"]
        # Reshape to the labels' real shape; the final batch may be smaller
        # than BatchSize.
        predictions = torch.argmax(outputs.logits, dim=-1).reshape(gold.shape)

        gold_rows = gold.tolist()
        true_labels = [[label_names[l] for l in row if l != -100] for row in gold_rows]
        true_predictions = [
            [label_names[p] for (p, l) in zip(pred_row, gold_row) if l != -100]
            for pred_row, gold_row in zip(predictions.tolist(), gold_rows)
        ]

        # Accumulate only; the scores are computed once over the whole split.
        metric.add_batch(predictions=true_predictions, references=true_labels)
        n_batches += 1

    if n_batches == 0:
        nan = float("nan")
        return nan, nan, nan, nan

    all_metrics = metric.compute()
    return (
        all_metrics["overall_f1"],
        all_metrics["overall_precision"],
        all_metrics["overall_recall"],
        all_metrics["overall_accuracy"],
    )

In [None]:
# Score the selected model on the test split and print one line per metric.
preds = predict(best_net, test_dataloader, 0)
for score_name, score in zip(("f1", "precision", "recall", "accuracy"), preds):
    print("test_" + score_name + "_score:::" + str(score))