In [11]:
import torch
import torch.nn as nn
import pandas as pd
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
tokenizer.add_tokens(["<ROOT>", "<EMPTY>"], special_tokens=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.manual_seed(42)
print(device)


BATCH_SIZE = 32

cpu


# Data


In [12]:
def is_projective(tree):
    for i in range(len(tree)):
        if tree[i] == -1:
            continue
        left = min(i, tree[i])
        right = max(i, tree[i])

        for j in range(0, left):
            if tree[j] > left and tree[j] < right:
                return False
        for j in range(left + 1, right):
            if tree[j] < left or tree[j] > right:
                return False
        for j in range(right + 1, len(tree)):
            if tree[j] > left and tree[j] < right:
                return False

    return True

In [13]:
tokenizer("tokenizer", add_special_tokens=False)

{'input_ids': [19204, 17629], 'token_type_ids': [0, 0], 'attention_mask': [1, 1]}

In [14]:
from datasets import load_dataset

train_dataset = load_dataset("universal_dependencies", "en_lines", split="train")
validation_dataset = load_dataset(
    "universal_dependencies", "en_lines", split="validation"
)
test_dataset = load_dataset("universal_dependencies", "en_lines", split="test")
print(len(train_dataset))
print(len(validation_dataset))
print(len(test_dataset))


# remove non projective
train_dataset = train_dataset.filter(
    lambda x: is_projective([-1] + list(map(int, x["head"])))
)
validation_dataset = validation_dataset.filter(
    lambda x: is_projective([-1] + list(map(int, x["head"])))
)
test_dataset = test_dataset.filter(
    lambda x: is_projective([-1] + list(map(int, x["head"])))
)
print(len(train_dataset))
print(len(validation_dataset))
print(len(test_dataset))

Found cached dataset universal_dependencies (/home/matteo/.cache/huggingface/datasets/universal_dependencies/en_lines/2.7.0/1ac001f0e8a0021f19388e810c94599f3ac13cc45d6b5b8c69f7847b2188bdf7)
Found cached dataset universal_dependencies (/home/matteo/.cache/huggingface/datasets/universal_dependencies/en_lines/2.7.0/1ac001f0e8a0021f19388e810c94599f3ac13cc45d6b5b8c69f7847b2188bdf7)
Found cached dataset universal_dependencies (/home/matteo/.cache/huggingface/datasets/universal_dependencies/en_lines/2.7.0/1ac001f0e8a0021f19388e810c94599f3ac13cc45d6b5b8c69f7847b2188bdf7)
Loading cached processed dataset at /home/matteo/.cache/huggingface/datasets/universal_dependencies/en_lines/2.7.0/1ac001f0e8a0021f19388e810c94599f3ac13cc45d6b5b8c69f7847b2188bdf7/cache-f8354f14dcbe18f8.arrow
Loading cached processed dataset at /home/matteo/.cache/huggingface/datasets/universal_dependencies/en_lines/2.7.0/1ac001f0e8a0021f19388e810c94599f3ac13cc45d6b5b8c69f7847b2188bdf7/cache-cc45d20ff3a9cc5d.arrow
Loading cach

3176
1032
1035
2922
930
968


In [15]:
def generate_gold_path(sentence, gold):
    from arceagerparser import ArcEager, Oracle

    parser = ArcEager(sentence)
    oracle = Oracle(parser, gold)

    gold_configurations = []
    gold_moves = []

    while not parser.is_tree_final():
        # save configuration - index of token in sentence
        configuration = [
            parser.stack[-1],
        ]
        if len(parser.buffer) == 0:
            configuration.append(-1)
        else:
            configuration.append(parser.buffer[0])

        # save configuration
        gold_configurations.append(configuration)

        # save gold move
        if oracle.is_left_arc_gold():
            gold_moves.append(0)
            parser.left_arc()
        elif oracle.is_right_arc_gold():
            gold_moves.append(1)
            parser.right_arc()
        elif oracle.is_shift_gold():
            gold_moves.append(2)
            parser.shift()
        elif oracle.is_reduce_gold():
            gold_moves.append(3)
            parser.reduce()

    return (
        gold_configurations,
        gold_moves,
    )


def get_configurations(toks, heads, get_gold_path=False):
    # put sentence and gold tree in our format
    # gold_path and gold_moves are parallel arrays whose elements refer to parsing steps
    gold_configurations = (
        []
    )  # record two topmost stack tokens and first 2 buffer token for current step
    gold_moves = (
        []
    )  # contains oracle (canonical) move for current step: 0 is left, 1 right, 2 shift, 3 reduce
    gold_heads = []
    for tokens, head in zip(toks, heads):
        conf = []
        mov = []

        tokens = ["<ROOT>"] + tokens
        head = [-1] + list(map(int, head))

        if get_gold_path:  # only for training
            conf, mov = generate_gold_path(tokens, head)

        gold_configurations.append(conf)
        gold_moves.append(mov)
        gold_heads.append(head)

    return gold_configurations, gold_moves, gold_heads


def match_subtokens(l1, l2):
    # Create output list
    output = []
    # Initialize index for l2
    index = 0
    # Iterate through l1
    for token in l1:
        subtoken_indices = []
        # Get the indices of the subtokens
        while index < len(l2) and (not subtoken_indices or l2[index].startswith("#")):
            subtoken_indices.append(index)
            index += 1
        # Append subtoken indices to output
        output.append(subtoken_indices)
    return output


def tokens_tokenizer_correspondence(tokens, berttokens):
    global tokenizer
    correspondences = []

    for t, bt in zip(tokens, berttokens):
        correspondences.append(match_subtokens(t, tokenizer.convert_ids_to_tokens(bt)))
    return correspondences


def prepare_batch(batch_data, get_gold_path=False):
    global tokenizer
    tok_sentences = tokenizer(
        ["<ROOT> " + bd["text"] for bd in batch_data],
        padding=True,
        return_tensors="pt",
        add_special_tokens=False,
    )  # FIXME : add ROOT token
    configurations, moves, gold = get_configurations(
        [bd["tokens"] for bd in batch_data],
        [bd["head"] for bd in batch_data],
        get_gold_path,
    )
    correspondences = tokens_tokenizer_correspondence(
        [["<ROOT>"] + bd["tokens"] for bd in batch_data], tok_sentences["input_ids"]
    )

    return tok_sentences, configurations, moves, gold, correspondences

In [16]:
# processed_sample = tokenizer(train_dataset["text"]) # input_ids token_type_ids attention_mask

# processed_sample.update(get_oracledata(train_dataset["tokens"], train_dataset["head"])) # configurations moves

# processed_sample.keys()

## Dataloader


In [17]:
train_dataloader = torch.utils.data.DataLoader(  # type:ignore
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=lambda x: prepare_batch(x, get_gold_path=True),
)

validation_dataloader = torch.utils.data.DataLoader(  # type: ignore
    validation_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=lambda x: prepare_batch(x, get_gold_path=True),
)

test_dataloader = torch.utils.data.DataLoader(  # type:ignore
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=lambda x: prepare_batch(x, get_gold_path=False),
)

# NET


In [18]:
BATCH_SIZE = 32
DIM_CONFIG = 2
LSTM_ISBI = True
BERT_SIZE = 768
EMBEDDING_SIZE = BERT_SIZE
DIM_CONFIG = 2
LSTM_LAYERS = 1
MLP_SIZE = 200
CLASSES = 4
DROPOUT = 0.2
EPOCHS = 1  # 30
LR = 0.001  # learning rate
NUM_LABELS_OUT = 4

In [9]:
from transformers import AutoModel

# modelBert=AutoModel.from_pretrained('bert-base-uncased')


class BERTNet(nn.Module):
    def __init__(self, device) -> None:
        super().__init__()
        self.device = device

        self.embeddings = nn.Embedding(len(tokenizer), EMBEDDING_SIZE, padding_idx=0)

        self.bert = AutoModel.from_pretrained("bert-base-uncased")
        self.bert.resize_token_embeddings(len(tokenizer))
        self.w1 = nn.Linear(DIM_CONFIG * BERT_SIZE, MLP_SIZE)
        self.w2 = nn.Linear(MLP_SIZE, CLASSES)
        self.activation = nn.Tanh()
        self.softmax = nn.Softmax(dim=-1)
        self.dropout = nn.Dropout(DROPOUT)

    def get_mlp_input(self, configs, h, correspondences):
        def getAvgH(h, corr):
            avgH = torch.zeros(BERT_SIZE, requires_grad=False).to(self.device)
            for i in corr:
                avgH += h[i]
            avgH /= len(corr)
            return avgH

        c = 0
        mlp_input = []
        zero_tensor = torch.zeros(BERT_SIZE, requires_grad=False).to(self.device)
        for i, (conf, corr) in enumerate(zip(configs, correspondences)):
            c += len(conf)
            for j in conf:
                mlp_input.append(
                    torch.cat(
                        [
                            zero_tensor if j[0] == -1 else getAvgH(h[i], corr[j[0]]),
                            zero_tensor if j[1] == -1 else getAvgH(h[i], corr[j[1]]),
                        ]
                    )
                )

        return mlp_input

    def forward(self, bertInput, configs, correspondencens):
        # --------------------------------- BERT  ---------------------------------
        # x=[self.dropout(self.embeddings(torch.tensor(s).to(self.device))) for s in bertInput]
        bertInput = bertInput.to(self.device)
        input_ids = bertInput["input_ids"].to(self.device)
        attention_mask = bertInput["attention_mask"].to(self.device)

        # Apply the BERT model. This will return a sequence of hidden-states at the output of the last layer of the model.
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)

        # Get the last hidden state of the token `[CLS]` for each example. BERT gives this as the first token in the sequence.
        h = outputs.last_hidden_state

        # Apply dropout on cls_output (not on the input)
        # --------------------------------- LINEAR--------------------------------
        # mlp_input = self.get_mlp_input(configurations, h, correspondences)
        def getAvgH(h, corr):
            avgH = torch.zeros(BERT_SIZE, requires_grad=False).to(self.device)
            for i in corr:
                avgH += h[i]
            avgH /= len(corr)
            return avgH

        c = 0
        mlp_input = []
        zero_tensor = torch.zeros(BERT_SIZE, requires_grad=False).to(self.device)
        for i, (conf, corr) in enumerate(zip(configs, correspondencens)):
            c += len(conf)
            for j in conf:
                mlp_input.append(
                    torch.cat(
                        [
                            zero_tensor if j[0] == -1 else getAvgH(h[i], corr[j[0]]),
                            zero_tensor if j[1] == -1 else getAvgH(h[i], corr[j[1]]),
                        ]
                    )
                )
        mlp_input = torch.stack(mlp_input)

        out = self.softmax(self.w2(self.activation(self.w1(self.dropout(mlp_input)))))
        return out

    def get_configurations(self, parsers):
        configurations = []
        for parser in parsers:
            if parser.is_tree_final():
                conf = [-1, -1]
            else:
                conf = [
                    parser.stack[len(parser.stack) - 1],
                ]
                if len(parser.buffer) == 0:
                    conf.append(-1)
                else:
                    conf.append(parser.buffer[0])
            configurations.append([conf])
        # print(f"configurations {configurations}")
        return configurations

    def parsed_all(self, parsers):
        for parser in parsers:
            if not parser.is_tree_final():
                return False
        return True

    def infere(self, bertInput):
        from arceagerparser import ArcEager

        parsers = [ArcEager(tok) for tok in bertInput["tokens"]]

        bertInput = bertInput.to(self.device)
        input_ids = bertInput["input_ids"].to(self.device)
        attention_mask = bertInput["attention_mask"].to(self.device)
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)

        while not self.parsed_all(parsers):
            # get the current configuration and score next moves
            configurations = self.get_configurations(parsers)
            mlp_input = self.get_mlp_input(configurations, h)
            mlp_out = self.mlp(mlp_input)
            # take the next parsing step
            self.parse_step(parsers, mlp_out)

        # return the predicted dependency tree
        return [parser.arcs for parser in parsers]
        # In this function we select and perform the next move according to the scores obtained.
        # We need to be careful and select correct moves, e.g. don't do a shift if the buffer
        # is empty or a left arc if σ2 is the ROOT. For clarity sake we didn't implement
        # these checks in the parser so we must do them here. This renders the function quite ugly
        # 0 Lx; 1 Rx, 2 shifr; 3 reduce

    def parse_step(self, parsers, moves):
        moves_argm = moves.argmax(-1)
        for i in range(len(parsers)):
            noMove = False
            # Conditions
            cond_left = (
                len(parsers[i].stack)
                and len(parsers[i].buffer)
                and parsers[i].stack[-1] != 0
            )
            cond_right = len(parsers[i].stack) and len(parsers[i].buffer)
            cond_reduce = len(parsers[i].stack) and parsers[i].stack[-1] != 0
            cond_shift = len(parsers[i].buffer) > 0
            if parsers[i].is_tree_final():
                continue
            else:
                if moves_argm[i] == 0:
                    # ------------------------------ firdt condition to check is the left arc -> right arc -> shift -> reduce------------------------------
                    if cond_left:
                        parsers[i].left_arc()
                    else:
                        if cond_right:
                            parsers[i].right_arc()
                        elif cond_shift:
                            parsers[i].shift()
                        elif cond_reduce:
                            parsers[i].reduce()
                        else:
                            print("noMove was possible on left")
                # ------------------------------ firdt condition to check is the right arc -> shift -> reduce------------------------------
                if moves_argm[i] == 1:
                    # print("right")
                    if cond_right:
                        parsers[i].right_arc()
                    else:
                        if cond_shift:
                            parsers[i].shift()
                        elif cond_reduce:
                            parsers[i].reduce()
                        else:
                            print("noMove was possible on right")
                # ------------------------------ firdt condition to check is the shift -> reduce------------------------------
                if moves_argm[i] == 2:
                    if cond_shift:
                        parsers[i].shift()
                    elif cond_reduce:
                        parsers[i].reduce()
                    else:
                        print("noMove was possible on shift")
                # ------------------------------ firdt condition to check is the reduce and if no reduce was possible take in account the probabilities ------------------------------
                if moves_argm[i] == 3:
                    if cond_reduce:
                        parsers[i].reduce()
                    else:
                        if (
                            moves[i][0] > moves[i][1]
                            and moves[i][0] > moves[i][2]
                            and cond_left
                        ):
                            parsers[i].left_arc()
                        else:
                            if moves[i][1] > moves[i][2] and cond_right:
                                parsers[i].right_arc()
                            else:
                                if cond_shift:
                                    parsers[i].shift()
                                else:
                                    print(
                                        moves[i][0],
                                        moves[i][1],
                                        moves[i][2],
                                        cond_left,
                                        cond_right,
                                        cond_shift,
                                    )


model = BERTNet(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [10]:
print(train_dataset["tokens"][500])
a=tokenizer(train_dataset["text"][500], add_special_tokens=False)
)


SyntaxError: unmatched ')' (3786829655.py, line 3)

In [None]:
from transformers import AutoModel

# modelBert=AutoModel.from_pretrained('bert-base-uncased')


class BERTNet(nn.Module):
    def __init__(self, device) -> None:
        super().__init__()
        self.device = device

        self.embeddings = nn.Embedding(len(tokenizer), EMBEDDING_SIZE, padding_idx=0)

        self.bert = AutoModel.from_pretrained("bert-base-uncased")
        self.bert.resize_token_embeddings(len(tokenizer))
        self.w1 = nn.Linear(DIM_CONFIG * BERT_SIZE, MLP_SIZE)
        self.w2 = nn.Linear(MLP_SIZE, CLASSES)
        self.activation = nn.Tanh()
        self.softmax = nn.Softmax(dim=-1)
        self.dropout = nn.Dropout(DROPOUT)

    def forward(self, bertInput, configs, correspondencens):
        # --------------------------------- BERT  ---------------------------------
        # x=[self.dropout(self.embeddings(torch.tensor(s).to(self.device))) for s in bertInput]
        bertInput = bertInput.to(self.device)
        input_ids = bertInput["input_ids"].to(self.device)
        attention_mask = bertInput["attention_mask"].to(self.device)

        # Apply the BERT model. This will return a sequence of hidden-states at the output of the last layer of the model.
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)

        # Get the last hidden state of the token `[CLS]` for each example. BERT gives this as the first token in the sequence.
        h = outputs.last_hidden_state

        # Apply dropout on cls_output (not on the input)
        # --------------------------------- LINEAR--------------------------------
        def getAvgH(h, corr):
            avgH = torch.zeros(BERT_SIZE, requires_grad=False).to(self.device)
            for i in corr:
                avgH += h[i]
            avgH /= len(corr)
            return avgH

        c = 0
        mlp_input = []
        zero_tensor = torch.zeros(BERT_SIZE, requires_grad=False).to(self.device)
        for i, (conf, corr) in enumerate(zip(configs, correspondencens)):
            c += len(conf)
            for j in conf:
                mlp_input.append(
                    torch.cat(
                        [
                            zero_tensor if j[0] == -1 else getAvgH(h[i], corr[j[0]]),
                            zero_tensor if j[1] == -1 else getAvgH(h[i], corr[j[1]]),
                        ]
                    )
                )
        mlp_input = torch.stack(mlp_input)

        out = self.softmax(self.w2(self.activation(self.w1(self.dropout(mlp_input)))))
        return out

    def get_configurations(self, parsers):
        configurations = []
        for parser in parsers:
            if parser.is_tree_final():
                conf = [-1, -1, -1]
            else:
                conf = [
                    parser.stack[len(parser.stack) - 2],
                    parser.stack[len(parser.stack) - 1],
                ]
                if len(parser.buffer) == 0:
                    conf.append(-1)
                else:
                    conf.append(parser.buffer[0])
            configurations.append([conf])
        # print(f"configurations {configurations}")
        return configurations

    def infere(self, bertInput):
        from arceagerparser import ArcEager, Oracle

        parsers = [ArcEager(tok) for tok in tokens]
        tok_sentences = tokenizer(
            ["<ROOT> " + bd["text"] for bd in batch_data],
            padding=True,
            return_tensors="pt",
            add_special_tokens=False,
        )  # FIXME : add ROOT token

        bertInput = bertInput.to(self.device)
        input_ids = bertInput["input_ids"].to(self.device)
        attention_mask = bertInput["attention_mask"].to(self.device)
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)

        while not self.parsed_all(parsers):
            # get the current configuration and score next moves
            configurations = self.get_configurations(parsers)
            mlp_input = self.get_mlp_input(configurations, h)
            mlp_out = self.mlp(mlp_input)
            # take the next parsing step
            self.parse_step(parsers, mlp_out)

        # return the predicted dependency tree
        return [parser.arcs for parser in parsers]


model = BERTNet(device)

SyntaxError: invalid syntax (2492195962.py, line 63)

## run model


In [None]:
def train(model, dataloader, criterion, optimizer):
    model.train()  # setup model for training mode
    total_loss = 0
    count = 0
    for batch in dataloader:
        optimizer.zero_grad()
        sentences, paths, moves, trees, correspondences = batch
        out = model(sentences, paths, correspondences)
        ##out = model(input_ids=sentences['input_ids'].to(device),
        ##    attention_mask=sentences['attention_mask'].to(device),
        ##    paths)

        labels = torch.tensor(sum(moves, [])).to(
            device
        )  # sum(moves, []) flatten the array
        loss = criterion(out, labels)
        count += 1
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    return total_loss / count


def evaluate(gold, preds):
    total = 0
    correct = 0
    for g, p in zip(gold, preds):
        for i in range(1, len(g)):
            total += 1
            if g[i] == p[i]:
                correct += 1
    return correct / total


def test(model, dataloader):
    model.eval()
    gold = []
    preds = []
    for batch in dataloader:
        sentences, paths, moves, trees, correspondences = batch
        with torch.no_grad():
            pred = model.infere(sentences)
            gold += trees
            preds += pred
    return evaluate(gold, preds)


criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LR)
EPOCHS = 1
for epoch in range(EPOCHS):
    print("Starting Epoch", epoch)
    # avg_train_loss = train(model, train_dataloader, criterion, optimizer)
    # torch.save(model.state_dict(), f"model_e{epoch}.pt")
    torch.load(f"model_e{epoch}.pt")
    val_uas = test(model, validation_dataloader)
    print(
        "Epoch: {:3d} | avg_train_loss: {:5.3f} | dev_uas: {:5.3f} |".format(
            epoch, avg_train_loss, val_uas
        )
    )
    # save the model on pytorch format

Starting Epoch 0


AttributeError: 'BERTNet' object has no attribute 'infere'