In [1]:
#1. build our own dataset

In [2]:
from datasets import Dataset, DatasetDict
import pandas as pd

# Map each POS tag to an integer id: its position in POS_ls.
# The ids are built with enumerate() so no counter variable is left behind at module level.
POS_ls = ['NN', 'IN', 'NNP', 'DT', 'NNS', 'JJ', 'COMMA', 'CD', '.', 'VBD', 'RB','VB', 'CC', 'VBN', 'VBZ', 
          'VBG', 'TO', 'PRP', 'VBP', 'POS', 'PRP$','MD', '$', '``', "''", 'WDT', ':', 'JJR', 'RP', 'RBR', 
          'WP', 'NNPS','JJS', ')', '(', 'EX', 'RBS', 'WRB', '-', 'UH', 'WP$', 'PDT', '/', '#', 'LS', 'SYM', 'FW', 'AUX']
POS_id = {pos: idx for idx, pos in enumerate(POS_ls)}

# Map each BIO chunk tag to an integer id: its position in BIO_ls.
BIO_ls = ['O', 'B-NP', 'I-NP', 'B-PP', 'B-ADVP', 'B-ADJP', 'B-SBAR', 'B-CONJP',
       'I-ADJP', 'I-PP', 'I-ADVP', 'I-CONJP', 'B-INTJ', 'I-SBAR', 'B-LST',
       'B-VP', 'B-PRT', 'I-INTJ', 'I-VP']
BIO_id = {bio: idx for idx, bio in enumerate(BIO_ls)}

# Map a role-label string to its integer class id (any other value maps to 5).
Label_id = {"ARG0":0,"ARG1":1,"ARG2":2,"PRED":3,"SUPPORT":4}
def mapLabel(label):
    """Return the id stored in Label_id for `label`, or 5 when it is not a key."""
    return Label_id.get(label, 5)

# build datasets
def condense_df(file):
    """Read a tab-separated token file and collapse it to one row per sentence.

    Every line is split on tabs. Lines whose first field is empty (the blank
    separator lines) are discarded. For the remaining lines the code reads:
    column 0 = token, column 1 = POS tag, column 2 = BIO tag,
    column 4 = sentence number, column 5 = role label (may be missing).
    Columns 3 and 6 are not used.

    Args:
        file: path of the file to read.

    Returns:
        pandas.DataFrame indexed by sentence number (index name ``id``) with
        the list-valued columns ``tokens``, ``POS``, ``BIO`` and ``label``.
        Tags that are not keys of POS_id / BIO_id become NaN; labels are
        converted with mapLabel.
    """
    # A separate name for the handle keeps the `file` argument intact.
    with open(file, 'r') as handle:
        rows = [line.split('\t') for line in handle.read().split('\n')]
    df = pd.DataFrame(rows)

    # Filter separator lines with a boolean mask instead of
    # `df[0].replace('', None, inplace=True)` + dropna: what replace() does with
    # value=None depends on the pandas version, and an in-place edit of the
    # `df[0]` selection is not guaranteed to reach `df` itself.
    df = df[df[0].notna() & (df[0] != '')].copy()

    df['BIO'] = df[2].map(BIO_id)
    df['POS'] = df[1].map(POS_id)
    df['label'] = df[5].map(mapLabel)
    df['id'] = df[4].map(int)
    df = df.rename(columns={0: 'tokens'})

    # One list per sentence and per column; groupby keeps the original token order.
    condense = df.groupby('id')[['tokens', 'POS', 'BIO', 'label']].agg(list)
    return condense


# Build one Dataset per split from the condensed frames, then bundle them in a DatasetDict.
split_paths = {
    "train": "Partitive-Files/%_nombank.clean.train",
    "validation": "Partitive-Files/%_nombank.clean.dev",
    "test": "Partitive-Files/%_nombank.clean.test",
}
train, eval_, test = (
    Dataset.from_pandas(condense_df(path)) for path in split_paths.values()
)
datasets = DatasetDict(dict(zip(split_paths, (train, eval_, test))))
datasets

DatasetDict({
    train: Dataset({
        features: ['tokens', 'POS', 'BIO', 'label', 'id'],
        num_rows: 2174
    })
    validation: Dataset({
        features: ['tokens', 'POS', 'BIO', 'label', 'id'],
        num_rows: 83
    })
    test: Dataset({
        features: ['tokens', 'POS', 'BIO', 'label', 'id'],
        num_rows: 150
    })
})

In [3]:
import pandas as pd
import nltk
import re

def construct_dataset(filename, missing_label=-100):
    """Read a one-token-per-line file into sentences and per-sentence ARG1 positions.

    A line with at least one whitespace-separated field is a token line: the
    first field is the word and, when the last field is ``ARG1``, the word's
    0-based position in its sentence is that sentence's label. A blank (or
    whitespace-only) line ends the current sentence. A final sentence that is
    not followed by a blank line is still returned.

    Exactly one label is produced per sentence, so ``corpus[k]`` and
    ``labels[k]`` always belong together:
      * if a sentence has several ARG1 tokens, the first one is used;
      * if it has none, ``missing_label`` is used.

    Args:
        filename: path of the file to read.
        missing_label: label stored for sentences without an ARG1 token.
            The default -100 is the default ``ignore_index`` of
            ``torch.nn.CrossEntropyLoss``.

    Returns:
        (corpus, labels): list of space-joined sentences and list of int labels,
        both of the same length.
    """
    corpus = []
    labels = []
    tokens = []         # words of the sentence currently being read
    arg1_index = None   # position of the first ARG1 token in that sentence

    with open(filename, 'r') as f:
        for line in f:
            fields = line.split()
            if fields:
                if fields[-1] == 'ARG1' and arg1_index is None:
                    arg1_index = len(tokens)
                # Only a token that IS the placeholder becomes a comma; words that
                # merely contain "COMMA" are left untouched.
                tokens.append(',' if fields[0] == 'COMMA' else fields[0])
            else:
                # Every blank line closes a sentence (possibly an empty one), so the
                # sentence count equals the number of blank lines in the file.
                corpus.append(' '.join(tokens))
                labels.append(missing_label if arg1_index is None else arg1_index)
                tokens, arg1_index = [], None

    # Flush a trailing sentence that has no terminating blank line.
    if tokens:
        corpus.append(' '.join(tokens))
        labels.append(missing_label if arg1_index is None else arg1_index)

    return corpus, labels

In [4]:
# Plain-text view of each split: X_* receives the sentence list and y_* the label list
# returned by construct_dataset.
X_train, y_train = construct_dataset('Partitive-Files/%_nombank.clean.train')
X_dev, y_dev = construct_dataset('Partitive-Files/%_nombank.clean.dev')
X_test, y_test = construct_dataset('Partitive-Files/%_nombank.clean.test')

In [5]:
# Load the pretrained bert-base-cased encoder.
# NOTE(review): `bert_model` is not referenced by any later cell visible in this notebook;
# TokenClassifier loads its own copy of the same checkpoint.
from transformers import BertModel
bert_model = BertModel.from_pretrained('bert-base-cased')

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
 # 2. Data loader and tokenization

In [7]:
import torch
from torch.utils.data import Dataset
from transformers import BertTokenizer

# nltk.download('punkt')

class MyDataset(Dataset):
    """Torch dataset that pairs each sentence with its label and encodes it on access.

    Items are ``(token_ids, attention_mask, label)`` where both tensors have
    exactly ``maxlen`` entries.
    """

    def __init__(self, corpus, labels, maxlen):
        """Keep the sentences, their labels and the fixed length; load the tokenizer."""
        self.corpus = corpus
        self.labels = labels
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
        self.maxlen = maxlen

    def __len__(self):
        """Number of sentences in the corpus."""
        return len(self.corpus)

    def __getitem__(self, index):
        """Encode sentence `index` to a fixed-length id tensor plus its attention mask."""
        text = self.corpus[index]
        target = self.labels[index]

        pieces = ['[CLS]', *self.tokenizer.tokenize(text), '[SEP]']

        shortfall = self.maxlen - len(pieces)
        if shortfall > 0:
            # Too short: right-pad up to maxlen.
            pieces.extend(['[PAD]'] * shortfall)
        else:
            # At or over the limit: cut and make sure the sequence still ends with [SEP].
            pieces = pieces[:self.maxlen-1] + ['[SEP]']

        ids = torch.tensor(self.tokenizer.convert_tokens_to_ids(pieces))

        # Positions whose id is 0 are masked out (this relies on [PAD] having id 0).
        mask = (ids != 0).long()

        return ids, mask, target



In [8]:
from torch.utils.data import DataLoader

# Every sentence is padded/truncated to 365 ids. The same number is the classifier's
# output size (presumably chosen together — confirm before changing either).
train_set = MyDataset(X_train, y_train, maxlen = 365)
dev_set = MyDataset(X_dev, y_dev, maxlen = 365)

# Shuffle the training batches each epoch so optimisation does not follow file order.
# The dev loader stays in file order.
train_loader = DataLoader(train_set, batch_size = 8, shuffle = True, num_workers = 1)
dev_loader = DataLoader(dev_set, batch_size = 8, num_workers = 1)

In [9]:
import torch
import torch.nn as nn
from transformers import BertModel

class TokenClassifier(nn.Module):
    """BERT encoder followed by a linear layer applied to the first position's output.

    Despite the name, one vector of ``num_classes`` logits is produced per
    input sequence (not one per token).
    """

    def __init__(self, num_classes=365):
        """Load bert-base-cased and create the linear head.

        Args:
            num_classes: number of logits per sequence. Defaults to 365, the
                value that was previously hard-coded.
        """
        super(TokenClassifier, self).__init__()
        self.bert_layer = BertModel.from_pretrained('bert-base-cased')

        # Take the input width from the encoder config instead of hard-coding 768.
        self.cls_layer = nn.Linear(self.bert_layer.config.hidden_size, num_classes)

    def forward(self, seq, attn_masks):
        """Return the logits for a batch of token-id sequences.

        Args:
            seq: token ids passed to the encoder.
            attn_masks: attention mask passed to the encoder.

        Returns:
            Output of the linear head for position 0 of the encoder's last hidden state.
        """
        outputs = self.bert_layer(seq, attention_mask = attn_masks)
        cont_reps = outputs.last_hidden_state

        # Position 0 along the sequence axis is used as the sequence summary.
        cls_rep = cont_reps[:, 0]

        logits = self.cls_layer(cls_rep)

        return logits

In [10]:
import torch
# CUDA device index; later cells pass it to every .cuda(gpu) call.
gpu = 0
# NOTE(review): the result of this check is discarded, so it does not guard the .cuda() call below.
torch.cuda.is_available()
# Build the classifier and move it to the selected GPU in one step.
net = TokenClassifier().cuda(gpu)
print("Done creating classifier.")

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Done creating classifier.


In [30]:
import torch.nn as nn
import torch.optim as optim

# Cross-entropy between the classifier's logits and the integer labels.
criterion = nn.CrossEntropyLoss()
# Adam over every parameter of `net`, i.e. the BERT encoder is fine-tuned together with the head.
opti = optim.Adam(net.parameters(), lr = 2e-5)

import time

def train(net, criterion, opti, train_loader, dev_loader, max_eps):
    """Fine-tune `net` and return it with the best epoch's weights loaded.

    NOTE(review): an identical `train` is defined again in the next cell and
    shadows this one; consider keeping a single definition.

    After every epoch the model is scored with evaluate() on `dev_loader`.
    Whenever the development accuracy improves, the weights are written to
    ``sstcls_<epoch>.dat``. Uses the notebook-level `gpu` device index.

    Args:
        net: model called as ``net(seq, attn_masks)``.
        criterion: loss called as ``criterion(logits, labels)``.
        opti: optimizer over ``net``'s parameters.
        train_loader: iterable of ``(seq, attn_masks, labels)`` batches.
        dev_loader: loader handed to evaluate().
        max_eps: number of epochs.

    Returns:
        `net` with the best checkpoint's weights loaded, or None if the
        development accuracy never rose above 0.
    """
    best_acc = 0
    st = time.time()
    best_net = None
    best_ckpt = None  # file holding the weights of the best epoch so far
    for ep in range(max_eps):
        # evaluate() switches the model to eval mode at the end of every epoch;
        # switch back so dropout is active while training.
        net.train()

        for it, (seq, attn_masks, labels) in enumerate(train_loader):

            opti.zero_grad()
            seq, attn_masks, labels = seq.cuda(gpu), attn_masks.cuda(gpu), labels.cuda(gpu)
            logits = net(seq, attn_masks)

            loss = criterion(logits.squeeze(-1), labels)

            loss.backward()

            opti.step()

            if it % 10 == 0:

                acc = get_accuracy_from_logits(logits, labels)
                print("Iteration {} of epoch {} complete. Loss: {}; Accuracy: {}; Time taken (s): {}".format(it, ep+1, loss.item(), acc, (time.time()-st)))
                st = time.time()

        dev_acc, dev_loss = evaluate(net, criterion, dev_loader)
        print("Epoch {} complete! Development Accuracy: {}; Development Loss: {}".format(ep+1, dev_acc, dev_loss))
        if dev_acc > best_acc:
            print("Best development accuracy improved from {} to {}, saving model...".format(best_acc, dev_acc))
            best_acc = dev_acc
            best_ckpt = 'sstcls_{}.dat'.format(ep+1)
            torch.save(net.state_dict(), best_ckpt)

    # `best_net = net` would only alias the live model, which keeps training after
    # the best epoch. Reload the best checkpoint so the returned model really is the best one.
    if best_ckpt is not None:
        net.load_state_dict(torch.load(best_ckpt))
        best_net = net
    return best_net



In [31]:
import time

def train(net, criterion, opti, train_loader, dev_loader, max_eps):
    """Fine-tune `net` and return it with the best epoch's weights loaded.

    NOTE(review): this duplicates the `train` defined in the previous cell
    and replaces it when the cell runs; consider keeping a single definition.

    After every epoch the model is scored with evaluate() on `dev_loader`.
    Whenever the development accuracy improves, the weights are written to
    ``sstcls_<epoch>.dat``. Uses the notebook-level `gpu` device index.

    Args:
        net: model called as ``net(seq, attn_masks)``.
        criterion: loss called as ``criterion(logits, labels)``.
        opti: optimizer over ``net``'s parameters.
        train_loader: iterable of ``(seq, attn_masks, labels)`` batches.
        dev_loader: loader handed to evaluate().
        max_eps: number of epochs.

    Returns:
        `net` with the best checkpoint's weights loaded, or None if the
        development accuracy never rose above 0.
    """
    best_acc = 0
    st = time.time()
    best_net = None
    best_ckpt = None  # file holding the weights of the best epoch so far
    for ep in range(max_eps):
        # evaluate() switches the model to eval mode at the end of every epoch;
        # switch back so dropout is active while training.
        net.train()

        for it, (seq, attn_masks, labels) in enumerate(train_loader):

            opti.zero_grad()

            seq, attn_masks, labels = seq.cuda(gpu), attn_masks.cuda(gpu), labels.cuda(gpu)

            logits = net(seq, attn_masks)
            loss = criterion(logits.squeeze(-1), labels)

            loss.backward()

            opti.step()

            if it % 10 == 0:

                acc = get_accuracy_from_logits(logits, labels)
                print("Iteration {} of epoch {} complete. Loss: {}; Accuracy: {}; Time taken (s): {}".format(it, ep+1, loss.item(), acc, (time.time()-st)))
                st = time.time()

        dev_acc, dev_loss = evaluate(net, criterion, dev_loader)
        print("Epoch {} complete! Development Accuracy: {}; Development Loss: {}".format(ep+1, dev_acc, dev_loss))
        if dev_acc > best_acc:
            print("Best development accuracy improved from {} to {}, saving model...".format(best_acc, dev_acc))
            best_acc = dev_acc
            best_ckpt = 'sstcls_{}.dat'.format(ep+1)
            torch.save(net.state_dict(), best_ckpt)

    # `best_net = net` would only alias the live model, which keeps training after
    # the best epoch. Reload the best checkpoint so the returned model really is the best one.
    if best_ckpt is not None:
        net.load_state_dict(torch.load(best_ckpt))
        best_net = net
    return best_net

In [32]:


def get_accuracy_from_logits(logits, labels):
    """Return the fraction of rows whose arg-max along dim 1 equals the label.

    The result is a 0-dim float tensor, not a Python float.
    """
    matches = (logits.argmax(dim=1) == labels)
    return matches.float().mean()

def evaluate(net, criterion, dataloader):
    """Score `net` on `dataloader` without tracking gradients.

    The model is switched to eval mode and is left in that mode on return.
    Accuracy and loss are averaged over batches (each batch counts equally).

    Returns:
        (mean batch accuracy, mean batch loss)
    """
    net.eval()

    acc_total = 0
    loss_total = 0
    n_batches = 0

    with torch.no_grad():
        for seq, attn_masks, labels in dataloader:
            seq = seq.cuda(gpu)
            attn_masks = attn_masks.cuda(gpu)
            labels = labels.cuda(gpu)

            logits = net(seq, attn_masks)
            batch_loss = criterion(logits.squeeze(-1), labels)

            loss_total = loss_total + batch_loss.item()
            acc_total = acc_total + get_accuracy_from_logits(logits, labels)
            n_batches += 1

    return acc_total / n_batches, loss_total / n_batches



In [34]:
# Number of passes over the training data.
num_epoch = 5

# NOTE(review): the output recorded under this cell is a CUDA out-of-memory error, so
# `best_net` was not produced by this run; the following cells carry lower execution
# counts and depend on state from an earlier session.
best_net = train(net, criterion, opti, train_loader, dev_loader, num_epoch)

RuntimeError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 5.81 GiB total capacity; 3.11 GiB already allocated; 19.31 MiB free; 3.38 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [17]:
# Test split, encoded with the same maxlen as the train/dev sets.
test_set = MyDataset(X_test, y_test, maxlen = 365)
# No shuffling is requested; the results cell further down indexes predictions by sentence order.
test_loader = DataLoader(test_set, batch_size = 16, num_workers = 2)

In [20]:
# Mean batch accuracy and loss of `best_net` on the test split.
test_acc, test_loss = evaluate(best_net, criterion, test_loader)

In [21]:
def predict(net, dataloader, gpu):
    """Return the arg-max class index for every example yielded by `dataloader`.

    Args:
        net: model called as ``net(seq, attn_masks)``; it is put in eval mode.
        dataloader: iterable of ``(seq, attn_masks, labels)`` batches; the
            labels are ignored.
        gpu: device index passed to ``.cuda()``.

    Returns:
        Flat list of int predictions, in the order the batches were yielded.
    """
    net.eval()

    y_preds = []
    with torch.no_grad():
        for seq, attn_masks, _labels in dataloader:
            seq, attn_masks = seq.cuda(gpu), attn_masks.cuda(gpu)
            # No .squeeze() here: on a batch holding a single example it would drop
            # the batch axis and make argmax(dim=1) raise.
            logits = net(seq, attn_masks)
            y_preds.extend(logits.argmax(dim=1).tolist())

    return y_preds

In [22]:
# One predicted class index per test example, in loader order.
preds = predict(best_net, test_loader, gpu)

In [26]:
# Re-write the test file: drop the gold ARG1 tag from every token line and append
# 'ARG1' to the token whose in-sentence position equals the prediction for that sentence.
results = []
i = 0  # position of the current token inside its sentence
j = 0  # index of the current sentence in `preds`
with open('Partitive-Files/%_nombank.clean.test', 'r') as f:
    for line in f:
        line = line.strip('\n')

        if not line.strip():
            # Blank line = sentence boundary. It must be detected on the raw line:
            # ''.split('\t') is [''], which is truthy, so testing the split list
            # never fires and the counters would never advance.
            results.append(line + '\n')
            i = 0
            j += 1
            continue

        line_split = line.split('\t')
        if line_split[-1] == 'ARG1':
            line_split = line_split[:-1]
        # Guard the index so extra sentences in the file cannot raise IndexError.
        if j < len(preds) and preds[j] == i:
            line_split.append('ARG1')
        i += 1
        results.append('\t'.join(line_split) + '\n')

with open('partitive.txt', 'w') as f:
    f.writelines(results)

In [None]:
from transformers import AutoTokenizer

# Checkpoint name shared by the tokenizer here and the token-classification models loaded later.
model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def align_labels_with_tokens(labels, word_ids):
    """Expand word-level labels to one label per tokenizer position.

    Args:
        labels: sequence of labels, one per word.
        word_ids: for each token position, the index of the word it belongs
            to, or None for positions that belong to no word.

    Returns:
        List with ``labels[word_id]`` for every position that maps to a word
        and -100 for every position whose word id is None.
    """
    # Compare against None explicitly: word index 0 is falsy, so `not word_id`
    # would wrongly mask every piece of the first word.
    return [-100 if word_id is None else labels[word_id] for word_id in word_ids]

def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and attach token-aligned labels.

    Reads ``examples["tokens"]`` and ``examples["label"]``; the aligned labels
    are stored under the ``"labels"`` key of the tokenizer output, which is returned.
    """
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    tokenized_inputs["labels"] = [
        align_labels_with_tokens(sentence_labels, tokenized_inputs.word_ids(row))
        for row, sentence_labels in enumerate(examples["label"])
    ]
    return tokenized_inputs

# Tokenize every split in batches. The original columns are removed, so each example
# keeps only what tokenize_and_align_labels returns.
tokenized_datasets = datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns = datasets["train"].column_names,
)

In [None]:
# 3. Train while evaluating

In [None]:
# 30-example subsets of each split, taken after a shuffle with a fixed seed.
small_train, small_eval, small_test = (
    tokenized_datasets[split].shuffle(seed=42).select(range(30))
    for split in ("train", "validation", "test")
)

In [None]:
# Class names by id. Positions 0-4 follow Label_id; position 5 is the fallback id returned by mapLabel.
label_names = ["ARG0", "ARG1", "ARG2", "PRED", "SUPPORT", "None"]

In [None]:
# Debug dump: prints every tokenized training example.
# NOTE(review): this prints the whole split; consider printing a single example instead.
for it, input_ in enumerate(tokenized_datasets["train"]):
    print(input_)

In [None]:
import time

def train(net, criterion, opti, train_loader, dev_loader, max_eps):
    """Work-in-progress variant of the training loop that iterates the tokenized dataset.

    NOTE(review): this redefines the `train` function from the earlier cells
    and cannot run as written:
      * `labels = labels` reads the local name before it is assigned, which
        raises UnboundLocalError on the first iteration (presumably
        `input_['labels']` was intended — confirm);
      * `train_loader` is never used; examples come one at a time from the
        notebook-level `tokenized_datasets["train"]`;
      * nothing here converts `seq` / `attn_masks` to tensors or moves them to
        the GPU, while `.shape` is read from `labels` below — TODO confirm the
        intended input types.
    """

    best_acc = 0
    st = time.time()
    best_net = None
    for ep in range(max_eps):
        for it, input_ in enumerate(tokenized_datasets["train"]):
            seq = input_['input_ids']
            attn_masks = input_['attention_mask']
            # NOTE(review): self-assignment of an unassigned local -> UnboundLocalError.
            labels = labels
            
            opti.zero_grad()  

            # NOTE(review): no-op assignment; the other loops call .cuda(gpu) at this point.
            seq, attn_masks, labels = seq, attn_masks, labels

            logits = net(seq, attn_masks)
            # Debug output of the shapes handed to the loss.
            print("logits.squeeze(-1)::"+str(logits.squeeze(-1).shape))
            print("labels::"+str(labels.shape))
            loss = criterion(logits.squeeze(-1), labels)

            loss.backward()

            opti.step()
              
            if it % 10 == 0:
                
                acc = get_accuracy_from_logits(logits, labels)
                print("Iteration {} of epoch {} complete. Loss: {}; Accuracy: {}; Time taken (s): {}".format(it, ep+1, loss.item(), acc, (time.time()-st)))
                st = time.time()

        
        dev_acc, dev_loss = evaluate(net, criterion, dev_loader)
        print("Epoch {} complete! Development Accuracy: {}; Development Loss: {}".format(ep+1, dev_acc, dev_loss))
        if dev_acc > best_acc:
            print("Best development accuracy improved from {} to {}, saving model...".format(best_acc, dev_acc))
            best_acc = dev_acc
            # NOTE(review): this stores a reference to the live model, not a snapshot of its weights.
            best_net = net
            torch.save(net.state_dict(), 'sstcls_{}.dat'.format(ep+1))
    return best_net


In [None]:
from transformers import DataCollatorForTokenClassification, AutoModelForTokenClassification, TrainingArguments, BertModel, Trainer
import evaluate
import numpy as np

# Collator for token-classification batches, built from the notebook's tokenizer.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
# seqeval metric loaded through the `evaluate` library.
# NOTE(review): `import evaluate` above rebinds the name of the evaluate() function defined earlier in this notebook.
metric = evaluate.load("seqeval")

def compute_metrics(eval_preds):
    """Turn (logits, labels) into overall precision / recall / F1 / accuracy.

    Positions whose label is -100 are dropped from both the references and
    the predictions before the ids are converted to names via `label_names`.
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Reference label names, ignored positions removed.
    true_labels = []
    for label_row in labels:
        true_labels.append([label_names[l] for l in label_row if l != -100])

    # Predicted label names at the same (non-ignored) positions.
    true_predictions = []
    for pred_row, label_row in zip(predictions, labels):
        true_predictions.append(
            [label_names[p] for (p, l) in zip(pred_row, label_row) if l != -100]
        )

    scores = metric.compute(predictions=true_predictions, references=true_labels)
    return dict(
        precision=scores["overall_precision"],
        recall=scores["overall_recall"],
        f1=scores["overall_f1"],
        accuracy=scores["overall_accuracy"],
    )
# id2label maps int -> name and label2id maps name -> int. With string keys in
# id2label, the inverted label2id would carry string ids instead of integers.
id2label = dict(enumerate(label_names))
label2id = {label: idx for idx, label in id2label.items()}

# Pretrained encoder with a token-classification head configured for these labels.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)

# Evaluate and checkpoint once per epoch.
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
args = TrainingArguments(
    output_dir = "bert-finetuned",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
)

trainer = Trainer(
    model = model,
    args = args,
    train_dataset = tokenized_datasets["train"],#small_train,#
    eval_dataset = tokenized_datasets["validation"],#small_eval,#
    data_collator = data_collator,
    compute_metrics = compute_metrics,
    tokenizer = tokenizer,
)
trainer.train()



In [None]:
# Custom Trainer 2
# NOTE(review): the bare name on the next line is an expression statement with no effect.
AutoModelForTokenClassification

class CustomTrainModel(AutoModelForTokenClassification):
    """Placeholder subclass of AutoModelForTokenClassification.

    NOTE(review): `from_pretrained` on an Auto class presumably still returns
    the concrete architecture's model class rather than this subclass, in
    which case `check_hidden` would not exist on the loaded model — confirm.
    """
    def check_hidden(self):
        """Stub; currently does nothing."""
        pass
    
# Debug dump: prints every tokenized training example (same loop as in an earlier cell).
for it, input_ in enumerate(tokenized_datasets["train"]):
    print(input_)

In [None]:
from transformers import DataCollatorForTokenClassification, AutoModelForTokenClassification, TrainingArguments, Trainer
import evaluate
import numpy as np

# Same collator and metric as in the previous Trainer cell, re-created here.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
metric = evaluate.load("seqeval")

def compute_metrics(eval_preds):
    """Compute overall seqeval scores from an ``(logits, labels)`` pair.

    NOTE(review): duplicates the `compute_metrics` defined in the previous
    Trainer cell.

    Label ids equal to -100 are skipped; the remaining ids (and the arg-max
    predictions at the same positions) are mapped to names with `label_names`.
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    def names_for_labels(label_row):
        # Names of the labels that are not the ignore index.
        return [label_names[l] for l in label_row if l != -100]

    def names_for_predictions(pred_row, label_row):
        # Names of the predictions at positions whose label is not the ignore index.
        return [label_names[p] for (p, l) in zip(pred_row, label_row) if l != -100]

    true_labels = [names_for_labels(label_row) for label_row in labels]
    true_predictions = [
        names_for_predictions(pred_row, label_row)
        for pred_row, label_row in zip(predictions, labels)
    ]

    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    summary = {}
    summary["precision"] = all_metrics["overall_precision"]
    summary["recall"] = all_metrics["overall_recall"]
    summary["f1"] = all_metrics["overall_f1"]
    summary["accuracy"] = all_metrics["overall_accuracy"]
    return summary
# id2label maps int -> name and label2id maps name -> int. With string keys in
# id2label, the inverted label2id would carry string ids instead of integers.
id2label = dict(enumerate(label_names))
label2id = {label: idx for idx, label in id2label.items()}

# Same checkpoint and label maps as the previous Trainer cell, loaded through CustomTrainModel.
model = CustomTrainModel.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)

# Evaluate and checkpoint once per epoch; this run writes to its own output directory.
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
args = TrainingArguments(
    output_dir = "bert-finetuned-custom",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=4,
    weight_decay=0.01,
)

trainer = Trainer(
    model = model,
    args = args,
    train_dataset = tokenized_datasets["train"],#small_train,#
    eval_dataset = tokenized_datasets["validation"],#small_eval,#
    data_collator = data_collator,
    compute_metrics = compute_metrics,
    tokenizer = tokenizer,
)
trainer.train()



In [None]:
# Load a checkpoint written by the first Trainer run (output_dir "bert-finetuned").
# NOTE(review): the checkpoint step number is hard-coded; it must match a directory that run actually produced.
model = AutoModelForTokenClassification.from_pretrained("bert-finetuned/checkpoint-816")
# One example per evaluation batch.
BATCH_SIZE = 1
# Arguments for a prediction-only Trainer (no training).
test_args = TrainingArguments(
    output_dir = "bert-finetuned-testing",
    do_train = False,
    do_predict = True,
    per_device_eval_batch_size = BATCH_SIZE,   
    dataloader_drop_last = False    
)

# Trainer used only for prediction; unlike the training cells, no data_collator or tokenizer is passed here.
trainer = Trainer(
          model = model, 
          args = test_args, 
          compute_metrics = compute_metrics)

# Predictions, label ids and metrics for the tokenized test split.
test_results = trainer.predict(tokenized_datasets["test"])

In [None]:
# Display the metrics computed by trainer.predict on the test split.
test_results.metrics