In [1]:
#1. build our own dataset

In [2]:
from datasets import Dataset, DatasetDict
from transformers import DataCollatorForTokenClassification, AutoModelForTokenClassification, TrainingArguments, Trainer
import pandas as pd
import numpy as np

# map POS to POS_id
cnt = 0
POS_id = {}
POS_ls = ['NN', 'IN', 'NNP', 'DT', 'NNS', 'JJ', 'COMMA', 'CD', '.', 'VBD', 'RB','VB', 'CC', 'VBN', 'VBZ', 
          'VBG', 'TO', 'PRP', 'VBP', 'POS', 'PRP$','MD', '$', '``', "''", 'WDT', ':', 'JJR', 'RP', 'RBR', 
          'WP', 'NNPS','JJS', ')', '(', 'EX', 'RBS', 'WRB', '-', 'UH', 'WP$', 'PDT', '/', '#', 'LS', 'SYM', 'FW', 'AUX']
for pos in POS_ls:
    POS_id[pos] = cnt
    cnt += 1

# map BIO to BIO_id
cnt = 0
BIO_id = {}
BIO_ls = ['O', 'B-NP', 'I-NP', 'B-PP', 'B-ADVP', 'B-ADJP', 'B-SBAR', 'B-CONJP',
       'I-ADJP', 'I-PP', 'I-ADVP', 'I-CONJP', 'B-INTJ', 'I-SBAR', 'B-LST',
       'B-VP', 'B-PRT', 'I-INTJ', 'I-VP']
for bio in BIO_ls:
    BIO_id[bio] = cnt
    cnt += 1
print(len(POS_ls))
print(len(BIO_ls))

48
19


In [3]:


# map label to BIO_id
Label_id = {"ARG0":0,"ARG1":1,"ARG2":2,"PRED":3,"SUPPORT":4}
def mapLabel(label):
    return Label_id[label] if label in Label_id else 5

# build datasets
def condense_df(file):
    df = pd.DataFrame()
    with open(file, 'r') as file:
        ls = [i.split('\t') for i in file.read().split('\n')]
        df = pd.DataFrame(ls)

    df['id'] = df.index
    df[0].replace('', np.nan, inplace=True)
    df.dropna(axis=0, subset = [0], inplace = True)
    df['BIO'] = df[2].map(BIO_id)
    df['POS'] = df[1].map(POS_id)
    df['label'] = df[5].map(mapLabel)
    df['id'] = df[4].map(int)
    df.drop(columns = [1, 2, 3, 4, 5, 6], inplace = True)
    condense = df.groupby('id').apply(lambda x: [list(x[0]),list(x['POS']), list(x['BIO']), list(x['label'])]).apply(pd.Series)
    condense.columns =['tokens','POS','BIO','label']
    return condense


train = Dataset.from_pandas(condense_df("Partitive-Files/%_nombank.clean.train"))
eval_ = Dataset.from_pandas(condense_df("Partitive-Files/%_nombank.clean.dev"))
test = Dataset.from_pandas(condense_df("Partitive-Files/%_nombank.clean.test"))


datasets = DatasetDict({"train": train, "validation":eval_, "test":test})
datasets

DatasetDict({
    train: Dataset({
        features: ['tokens', 'POS', 'BIO', 'label', 'id'],
        num_rows: 2174
    })
    validation: Dataset({
        features: ['tokens', 'POS', 'BIO', 'label', 'id'],
        num_rows: 83
    })
    test: Dataset({
        features: ['tokens', 'POS', 'BIO', 'label', 'id'],
        num_rows: 150
    })
})

In [4]:
# 2. tokenize

In [5]:

from transformers import AutoTokenizer

model_checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def align_labels_with_tokens(labels, POSs, BIOs, word_ids):
    new_labels = []
    POS_labels = []
    BIO_labels = []
    current_word = None
    for word_id in word_ids:
        if not word_id:
            new_labels.append(-100)
            POS_labels.append(-100)
            BIO_labels.append(-100)
        else:
            if word_id != current_word:# Start of a new word!
                current_word = word_id       
            new_labels.append(labels[word_id])
            POS_labels.append(POSs[word_id])
            BIO_labels.append(BIOs[word_id])

    return new_labels, POS_labels, BIO_labels

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    all_labels = examples["label"]
    all_POS = examples["POS"]
    all_BIO = examples["BIO"]
    
    new_labels = []
    POS_labels = []
    BIO_labels = []
    for i in range(len(all_labels)):
        word_ids = tokenized_inputs.word_ids(i)
        new_label, POS_label,BIO_label = align_labels_with_tokens(all_labels[i], all_POS[i], all_BIO[i], word_ids)
        new_labels.append(new_label)
        POS_labels.append(POS_label)
        BIO_labels.append(BIO_label)

    tokenized_inputs["labels"] = new_labels
    tokenized_inputs["POS"] = POS_labels
    tokenized_inputs["BIO"] = BIO_labels
    return tokenized_inputs

tokenized_datasets = datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns = datasets["train"].column_names,
)
"""
from transformers import AutoTokenizer

model_checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if not word_id:
            new_labels.append(-100)
        else:
            if word_id != current_word:# Start of a new word!
                current_word = word_id       
            label = labels[word_id]
            new_labels.append(label)

    return new_labels

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    all_labels = examples["label"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

tokenized_datasets = datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns = datasets["train"].column_names,
)
"""

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [6]:
datasets

DatasetDict({
    train: Dataset({
        features: ['tokens', 'POS', 'BIO', 'label', 'id'],
        num_rows: 2174
    })
    validation: Dataset({
        features: ['tokens', 'POS', 'BIO', 'label', 'id'],
        num_rows: 83
    })
    test: Dataset({
        features: ['tokens', 'POS', 'BIO', 'label', 'id'],
        num_rows: 150
    })
})

In [7]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 2174
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 83
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 150
    })
})

In [None]:
#for target in ['train','validation','test']:
#    for add in ['POS', 'BIO']:
#        tokenized_datasets[target] = tokenized_datasets[target].add_column(add, datasets[target][add])

In [None]:
#tokenized_datasets['train'][100]['POS']
#print(len(tokenized_datasets['train'][100]['POS']))

KeyError: 'POS'

In [None]:
print(len(tokenized_datasets['train'][100]['attention_mask']))

55


In [None]:
tokenized_datasets['train'][100]['input_ids']
print(len(tokenized_datasets['train'][100]['input_ids']))

55


In [None]:
a = pd.DataFrame(tokenized_datasets["train"])
a 

Unnamed: 0,input_ids,token_type_ids,attention_mask,labels
0,"[101, 2021, 2055, 2423, 1003, 1997, 1996, 2529...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, -100, 5, 5, 3, 5, 5, 1, 1, 5, 5, 5, 5, ..."
1,"[101, 1996, 5416, 4636, 2097, 15697, 1999, 215...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, -100, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, ..."
2,"[101, 1996, 2597, 1997, 1996, 2142, 2163, 4012...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, -100, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, ..."
3,"[101, 1996, 2034, 8893, 9781, 8079, 2550, 2478...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, -100, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, ..."
4,"[101, 2720, 1012, 3389, 15154, 2056, 1996, 889...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, -100, -100, 5, 5, 5, 5, 5, 5, 5, 5, 5, ..."
...,...,...,...,...
2169,"[101, 3653, 2696, 2595, 4082, 5618, 2013, 7026...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, -100, -100, -100, 5, 1, 5, 5, 5, 4, 5, ..."
2170,"[101, 12594, 3123, 1022, 1012, 1022, 1003, 200...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, -100, 4, 5, 5, 5, 3, 5, 5, 5, 5, 5, 5, ..."
2171,"[101, 1996, 2194, 2056, 1996, 4284, 2443, 1037...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, -100, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, ..."
2172,"[101, 1996, 2194, 2056, 1996, 4284, 2443, 1037...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, -100, 5, 5, 5, 5, 5, 5, 5, 3, 4, 5, 5, ..."


In [None]:
len(POS_len)

In [None]:
POS_len = list(a['POS'].map(len))
BIO_len = list(a['BIO'].map(len))
input_ids_len = list(a['input_ids'].map(len))
token_type_ids_len = list(a['token_type_ids'].map(len))
attention_mask_len = list(a['attention_mask'].map(len))
labels_len = list(a['labels'].map(len))


In [None]:
POS_len == token_type_ids_len

In [13]:
from torch.utils.data import DataLoader
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
train_dataloader = DataLoader(
    tokenized_datasets["train"], shuffle=True, batch_size=8, collate_fn=data_collator#
)

In [None]:
tokenized_datasets["train"]

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 2174
})

In [14]:
for i,batch in enumerate(train_dataloader):
    pass
    """
    print("input_ids",batch['input_ids'].size())
    print("token_type_ids",batch['token_type_ids'].size())
    print("attention_mask",batch['attention_mask'].size())
    print("labels",batch['labels'].size())
    print("POS",batch['POS'].size())
    print("BIO",batch['BIO'].size())
    """

In [None]:
print(tokenized_datasets['train'])
print(tokenized_datasets['validation'])
print(tokenized_datasets['test'])

In [None]:
#tokenized_datasets['test'][0]

In [None]:
mxLen, mxIdx = 0, 0
for idx, x in enumerate(tokenized_datasets["train"]['input_ids']):
    if mxLen < len(x):
        mxLen = len(x)
        mxIdx = idx

print(mxLen)
print(mxIdx)

In [None]:
example = datasets['train'][286]['tokens']
tokenized_input = tokenizer(example, is_split_into_words=True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
print("the original length is: ", len(example))
print("the original example is: ", example)
print("the tokenized output is: ", tokens)
print("the tokenized length is: ", len(tokens))

In [None]:
tokenized_datasets

In [None]:
import pandas as pd
import missingno as msno
msno.matrix(pd.DataFrame(tokenized_datasets["train"]['labels']))

In [None]:
# 3. train while evaluate

In [None]:
aa
import evaluate
import numpy as np

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
metric = evaluate.load("seqeval")

small_train = tokenized_datasets["train"].shuffle(seed=42).select(range(64))
small_eval = tokenized_datasets["validation"].shuffle(seed=42).select(range(16))
small_test = tokenized_datasets["test"].shuffle(seed=42).select(range(8))


BatchSize = 8
from torch.utils.data import DataLoader
train_dataloader = DataLoader(
    small_train, shuffle=True, batch_size=BatchSize, collate_fn=data_collator#tokenized_datasets["train"]
)
eval_dataloader = DataLoader(
    small_eval, batch_size=BatchSize, collate_fn=data_collator#tokenized_datasets["validation"]
)
test_dataloader = DataLoader(
    small_test, batch_size=BatchSize, collate_fn=data_collator#tokenized_datasets["test"]
)

In [None]:
#len(small_train['POS'])

In [None]:
#len(small_train['POS'][2])

In [None]:
label_names = ["ARG0", "ARG1", "ARG2", "PRED", "SUPPORT", "None"]

In [None]:
from transformers import AutoConfig, AutoModel
from transformers.modeling_outputs import TokenClassifierOutput

import torch.nn as nn
class CustomModel(nn.Module):
    def __init__(self,checkpoint,num_labels): 
        super(CustomModel,self).__init__() 
        self.num_labels = num_labels 

        #Load Model with given checkpoint and extract its body
        self.model = AutoModel.from_pretrained(checkpoint,config=AutoConfig.from_pretrained(checkpoint, output_attentions=True,output_hidden_states=True))
        self.dropout = nn.Dropout(0.1) 
        self.classifier = nn.Linear(768,num_labels) # load and initialize weights

    def forward(self, input_ids, token_type_ids, attention_mask, labels):
        #TODO: add 2 more parameters as: forward(self, input_ids, token_type_ids, attention_mask, POS, BIO, labels)
        #Extract outputs from the body
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
#         print("input_ids:::"+str(input_ids.shape))
#         print("labels:::"+str(labels.shape))
        #Add custom layers
        sequence_output = self.dropout(outputs[0]) #outputs[0]=last hidden state
#         print("sequence_output:::"+str(sequence_output.shape))
        #sequence_output = outputs
        print(sequence_output.size())
        """
        TODO: concat
        After using print(sequence_output.size()) here, we have:
        torch.Size([8, 44, 768])
        torch.Size([8, 50, 768]) etc.
        第一维度是全局batch_size我设置成8
        第二维是一句话里面token的个数，是经过分割OOV过后的token数量
        第三维是Bert的输出
        
        假设POS 48维，BIO 10维，那么总的第三维在concat后的总维数是 768+48+10，concat不会改变第一维和第二维的长度
        
        logits = self.classifier(sequence_output[:,:,:].view(-1, 768+48+10)) # calculate losses
        """
        
        logits = self.classifier(sequence_output[:,:,:].view(-1, 768)) # calculate losses
        self.logits = logits
        #print("logits::"+str(logits.shape))

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        return TokenClassifierOutput(loss=loss, logits=logits, hidden_states=outputs.hidden_states,attentions=outputs.attentions)

In [None]:
import torch
from tqdm import tqdm
from transformers import AdamW,get_scheduler
from datasets import load_metric
metric = evaluate.load("seqeval")

num_epochs = 3
model_cc = CustomModel(checkpoint=model_checkpoint,num_labels=len(label_names))#.cuda()
optimizer = AdamW(model_cc.parameters(), lr=2e-5)
num_training_steps = num_epochs * len(train_dataloader)
progress_bar_train = tqdm(range(num_training_steps),miniters=2)
progress_bar_eval = tqdm(range(num_epochs * len(eval_dataloader)),miniters=2)
f1_best = 0
resume_flag = False
best_net = None

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

for epoch in range(num_epochs):
    if resume_flag:
        model_cc.load_state_dict(torch.load("sstcls_best.dat"))
    model_cc.train()
    for batch in train_dataloader:
        batch = {k: v for k, v in batch.items()}#.cuda()
        outputs = model_cc(batch['input_ids'], token_type_ids=batch['token_type_ids'], attention_mask=batch['attention_mask'],labels=batch['labels'])
        #TODO: add 2 more parameters as followed:
        #model_cc(batch['input_ids'], token_type_ids=batch['token_type_ids'], attention_mask=batch['attention_mask'],POS = batch['POS'], BIO = batch['BIO'], labels=batch['labels'])
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar_train.update(1)
    model_cc.eval()
    f1_now = []
    for batch in eval_dataloader:
        batch = {k: v for k, v in batch.items()}#.cuda()
        with torch.no_grad():
            outputs = model_cc(batch['input_ids'], token_type_ids=batch['token_type_ids'], attention_mask=batch['attention_mask'],labels=batch['labels'])
            #TODO: add 2 more parameters as followed:
            #model_cc(batch['input_ids'], token_type_ids=batch['token_type_ids'], attention_mask=batch['attention_mask'],POS = batch['POS'], BIO = batch['BIO'], labels=batch['labels'])
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)

        true_labels = [[label_names[l] for l in label if l != -100] for label in batch["labels"]]
        
        reshaped_predictions = torch.reshape(predictions, (BatchSize,-1))
        true_predictions = [
            [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(reshaped_predictions, batch["labels"])
        ]
        
        all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
        metric.add_batch(predictions=true_predictions, references=true_labels)
        progress_bar_eval.update(1)
        f1_now.append(all_metrics["overall_f1"])
        
    if np.mean(f1_now) > f1_best or not best_net:
        torch.save(model_cc.state_dict(), 'sstcls_best.dat')
        f1_best = np.mean(f1_now)
        print("the best f1 is now: "+ str(np.mean(f1_now)))
        best_net = model_cc
        

    print(metric.compute())

In [None]:
def predict(net, dataloader, gpu):
    net.eval()

    f1_now = []
    precision_now = []
    recall_now = []
    accuracy_now = []
    for batch in eval_dataloader:
        batch = {k: v for k, v in batch.items()}#.cuda()
        with torch.no_grad():
            outputs = model_cc(batch['input_ids'], token_type_ids=batch['token_type_ids'], attention_mask=batch['attention_mask'], labels=batch['labels'])
            #TODO: add 2 more parameters as followed:
            #model_cc(batch['input_ids'], token_type_ids=batch['token_type_ids'], attention_mask=batch['attention_mask'],POS = batch['POS'], BIO = batch['BIO'], labels=batch['labels'])
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)

        true_labels = [[label_names[l] for l in label if l != -100] for label in batch["labels"]]
        
        reshaped_predictions = torch.reshape(predictions, (BatchSize,-1))
        true_predictions = [
            [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(reshaped_predictions, batch["labels"])
        ]
        
        all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
        metric.add_batch(predictions=true_predictions, references=true_labels)
        progress_bar_eval.update(1)
        f1_now.append(all_metrics["overall_f1"])
        precision_now.append(all_metrics["overall_precision"])
        recall_now.append(all_metrics["overall_recall"])
        accuracy_now.append(all_metrics["overall_accuracy"])


    return np.mean(f1_now), np.mean(precision_now), np.mean(recall_now), np.mean(accuracy_now),



In [None]:
preds = predict(best_net, test_dataloader, 0)
print("test_f1_score:::"+str(preds[0]))
print("test_precision_score:::"+str(preds[1]))
print("test_recall_score:::"+str(preds[2]))
print("test_accuracy_score:::"+str(preds[3]))