In [2]:
import torch
import torch.nn as nn
import pandas as pd
from transformers import AutoTokenizer

tokenizer=AutoTokenizer.from_pretrained('bert-base-uncased')
tokenizer.add_tokens(["<ROOT>", "<EMPTY>"], special_tokens=True)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch.manual_seed(42)
print(device)


BATCH_SIZE = 32

cpu


# Data


In [3]:
def is_projective(tree):
    for i in range(len(tree)):
        if tree[i] == -1:
            continue
        left = min(i, tree[i])
        right = max(i, tree[i])

        for j in range(0, left):
            if tree[j] > left and tree[j] < right:
                return False
        for j in range(left + 1, right):
            if tree[j] < left or tree[j] > right:
                return False
        for j in range(right + 1, len(tree)):
            if tree[j] > left and tree[j] < right:
                return False

    return True

In [4]:
tokenizer("tokenizer", add_special_tokens=False)

{'input_ids': [19204, 17629], 'token_type_ids': [0, 0], 'attention_mask': [1, 1]}

In [5]:
from datasets import load_dataset

train_dataset=load_dataset("universal_dependencies", "en_lines", split="train")
validation_dataset=load_dataset("universal_dependencies", "en_lines", split="validation")
test_dataset=load_dataset("universal_dependencies", "en_lines", split="test")
print(len(train_dataset))
print(len(validation_dataset))
print(len(test_dataset))


# remove non projective
train_dataset = train_dataset.filter(lambda x:is_projective([-1]+list(map(int,x['head'])))) 
validation_dataset = validation_dataset.filter(lambda x:is_projective([-1]+list(map(int,x['head']))))
test_dataset = test_dataset.filter(lambda x:is_projective([-1]+list(map(int,x['head']))))
print(len(train_dataset))
print(len(validation_dataset))
print(len(test_dataset))

Found cached dataset universal_dependencies (/home/matteo/.cache/huggingface/datasets/universal_dependencies/en_lines/2.7.0/1ac001f0e8a0021f19388e810c94599f3ac13cc45d6b5b8c69f7847b2188bdf7)
Found cached dataset universal_dependencies (/home/matteo/.cache/huggingface/datasets/universal_dependencies/en_lines/2.7.0/1ac001f0e8a0021f19388e810c94599f3ac13cc45d6b5b8c69f7847b2188bdf7)
Found cached dataset universal_dependencies (/home/matteo/.cache/huggingface/datasets/universal_dependencies/en_lines/2.7.0/1ac001f0e8a0021f19388e810c94599f3ac13cc45d6b5b8c69f7847b2188bdf7)
Loading cached processed dataset at /home/matteo/.cache/huggingface/datasets/universal_dependencies/en_lines/2.7.0/1ac001f0e8a0021f19388e810c94599f3ac13cc45d6b5b8c69f7847b2188bdf7/cache-f8354f14dcbe18f8.arrow
Loading cached processed dataset at /home/matteo/.cache/huggingface/datasets/universal_dependencies/en_lines/2.7.0/1ac001f0e8a0021f19388e810c94599f3ac13cc45d6b5b8c69f7847b2188bdf7/cache-cc45d20ff3a9cc5d.arrow
Loading cach

3176
1032
1035
2922
930
968


In [71]:
def generate_gold_path(sentence, gold):
    from arceagerparser import ArcEager, Oracle
    parser = ArcEager(sentence)
    oracle = Oracle(parser, gold)

    gold_configurations = []
    gold_moves = []

    while not parser.is_tree_final():
        # save configuration
        configuration = [
            parser.stack[ - 1],
        ]
        if len(parser.buffer) == 0:
            configuration.append(-1)
        else:
            configuration.append(parser.buffer[0])
        
        # save configuration    
        gold_configurations.append(configuration)

            # save gold move
        if oracle.is_left_arc_gold():
            gold_moves.append(0)
            parser.left_arc()
        elif oracle.is_right_arc_gold():
            gold_moves.append(1)
            parser.right_arc()
        elif oracle.is_shift_gold():
            gold_moves.append(2)
            parser.shift()
        elif oracle.is_reduce_gold():
            gold_moves.append(3)
            parser.reduce()
    
    return gold_configurations, gold_moves, 

def get_configurations(toks, heads, get_gold_path=False):
    # put sentence and gold tree in our format
        # gold_path and gold_moves are parallel arrays whose elements refer to parsing steps
    gold_configurations= (
        []
    )  # record two topmost stack tokens and first 2 buffer token for current step
    gold_moves = (
        []
    )  # contains oracle (canonical) move for current step: 0 is left, 1 right, 2 shift, 3 reduce
    gold_heads=[]
    for tokens, head in zip(toks, heads):
        conf=[]   
        mov=[]

        tokens = ["<ROOT>"] + tokens
        head = [-1] + list(map(int,head))

        if get_gold_path:  # only for training
            conf, mov=generate_gold_path(tokens, head)
            
            
        gold_configurations.append(conf)
        gold_moves.append(mov)
        gold_heads.append(head)

    return gold_configurations, gold_moves,gold_heads
  
def prepare_batch(batch_data,get_gold_path=False):
    global tokenizer
    tok_sentences= tokenizer(["<ROOT> "+bd["text"] for bd in batch_data], padding=True, return_tensors="pt") # FIXME : add ROOT token
    configurations, moves, gold = get_configurations(
        [bd["tokens"] for bd in batch_data],
        [bd["head"] for bd in batch_data],
        get_gold_path) 

    return tok_sentences, configurations, moves, gold



In [72]:
# processed_sample = tokenizer(train_dataset["text"]) # input_ids token_type_ids attention_mask

# processed_sample.update(get_oracledata(train_dataset["tokens"], train_dataset["head"])) # configurations moves

# processed_sample.keys()


## Dataloader


In [73]:
train_dataloader = torch.utils.data.DataLoader( # type:ignore
  train_dataset,
  batch_size=BATCH_SIZE, 
  shuffle=True,
  collate_fn=lambda x: prepare_batch(x, get_gold_path=True)
)

validation_dataloader = torch.utils.data.DataLoader( # type: ignore
  validation_dataset,
  batch_size=BATCH_SIZE,
  shuffle=True,
  collate_fn=lambda x: prepare_batch(x, get_gold_path=True)
)

test_dataloader = torch.utils.data.DataLoader( # type:ignore
  test_dataset,
  batch_size=BATCH_SIZE,
  shuffle=True,
  collate_fn=lambda x: prepare_batch(x, get_gold_path=False)
)

# NET


In [74]:
BATCH_SIZE = 32
DIM_CONFIG = 2
LSTM_ISBI = True
BERT_SIZE = 768
EMBEDDING_SIZE = BERT_SIZE
DIM_CONFIG = 2
LSTM_LAYERS = 1
MLP_SIZE = 200
CLASSES = 4
DROPOUT = 0.2
EPOCHS = 1 # 30
LR = 0.001  # learning rate
NUM_LABELS_OUT = 4

In [96]:
from transformers import AutoModel

#modelBert=AutoModel.from_pretrained('bert-base-uncased')

class BERTNet(nn.Module):
  def __init__(self,device) -> None:
    super().__init__()
    self.device=device
    
    self.embeddings = nn.Embedding(
        len(tokenizer), EMBEDDING_SIZE, padding_idx=0
    )
    
    self.bert = AutoModel.from_pretrained('bert-base-uncased')
    self.bert.resize_token_embeddings(len(tokenizer))
    self.w1=nn.Linear(DIM_CONFIG*BERT_SIZE, MLP_SIZE)
    self.w2=nn.Linear(MLP_SIZE, CLASSES)
    self.activation= nn.Tanh()
    self.softmax=nn.Softmax(dim=-1)
    self.dropout = nn.Dropout(DROPOUT)
    
  def forward(self, bertInput, configs):
    #x=[self.dropout(self.embeddings(torch.tensor(s).to(self.device))) for s in bertInput]
    bertInput=bertInput.to(self.device)
    input_ids=bertInput['input_ids'].to(self.device)
    attention_mask=bertInput['attention_mask'].to(self.device)
    
        # Apply the BERT model. This will return a sequence of hidden-states at the output of the last layer of the model.
    outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)

    # Get the last hidden state of the token `[CLS]` for each example. BERT gives this as the first token in the sequence.
    cls_output = outputs.last_hidden_state#[:, 0, :]

    # Apply dropout on cls_output (not on the input)
    x = self.dropout(cls_output)
    
    print(x.shape)
    

model = BERTNet(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## run model


In [97]:
def train(model, dataloader, criterion, optimizer):
    model.train()  # setup model for training mode
    total_loss = 0
    count = 0
    for batch in dataloader:
        optimizer.zero_grad()
        sentences, paths, moves, trees = batch
        print(sentences)        print(sentences[0])
        out = model(sentences, paths)
        ##out = model(input_ids=sentences['input_ids'].to(device), 
        ##    attention_mask=sentences['attention_mask'].to(device), 
        ##    paths)

        labels = torch.tensor(sum(moves, [])).to(
            device
        )  # sum(moves, []) flatten the array
        loss = criterion(out, labels)
        count += 1
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    return total_loss / count

def evaluate(gold, preds):
    total = 0
    correct = 0
    for g, p in zip(gold, preds):
        for i in range(1, len(g)):
            total += 1
            if g[i] == p[i]:
                correct += 1
    return correct / total

def test(model, dataloader):
    model.eval()
    gold = []
    preds = []
    for batch in dataloader:
        sentences, paths, moves, trees = batch
        with torch.no_grad():
            pred = model.infere(sentences)
            gold += trees
            preds += pred
    return evaluate(gold, preds)


criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LR)
for epoch in range(EPOCHS):
    print("Starting Epoch", epoch)
    avg_train_loss = train(model, train_dataloader, criterion, optimizer)
    val_uas = test(model, validation_dataloader)
    print(
        "Epoch: {:3d} | avg_train_loss: {:5.3f} | dev_uas: {:5.3f} |".format(
            epoch, avg_train_loss, val_uas
        )
    )
    #save the model on pytorch format

Starting Epoch 0
{'input_ids': tensor([[  101, 30522,  2019,  ...,     0,     0,     0],
        [  101, 30522,  2009,  ...,     0,     0,     0],
        [  101, 30522,  2023,  ...,     0,     0,     0],
        ...,
        [  101, 30522,  3513,  ...,     0,     0,     0],
        [  101, 30522,  1996,  ...,     0,     0,     0],
        [  101, 30522,  1996,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}
Encoding(num_tokens=82, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])
torch.Size([3

TypeError: cross_entropy_loss(): argument 'input' (position 1) must be Tensor, not NoneType