In [3]:
from uniparse import Vocabulary
from uniparse.dataprovider import batch_by_buckets
import random
from collections import defaultdict
import numpy as np
from sklearn.utils import class_weight
from collections import defaultdict
import numpy as np
from uniparse import Model, Vocabulary

from uniparse.callbacks import ModelSaveCallback
from uniparse.dataprovider import batch_by_buckets

# uniparse intro

In [4]:
vocab = Vocabulary()
samples = vocab._read_conll("../data/ptb_conllu/train.conllu", tokenize=False)

In [5]:
sample = random.choice(samples)

In [6]:
sample[2][:5]

[1, 'DET', 'ADJ', 'NOUN', 'NOUN']

In [7]:
sample[4]

[1,
 'det',
 'amod',
 'nmod:tmod',
 'compound',
 'nsubjpass',
 'auxpass',
 'acl:relcl',
 'case',
 'det',
 'amod',
 'nmod',
 'cc',
 'nsubj',
 'conj',
 'punct',
 'punct',
 'discourse',
 'dep',
 'punct',
 'nsubj',
 'punct',
 'nsubj',
 'aux',
 'ccomp',
 'dobj',
 'case',
 'nmod',
 'cc',
 'compound',
 'conj',
 'punct',
 'punct',
 'nsubj',
 'root',
 'nsubj',
 'aux',
 'ccomp',
 'compound:prt',
 'cc',
 'conj',
 'punct',
 'punct',
 'xcomp',
 'dobj',
 'punct',
 'dobj',
 'punct',
 'advmod',
 'aux',
 'neg',
 'nsubj',
 'parataxis',
 'dobj',
 'cc',
 'conj',
 'nmod:poss',
 'amod',
 'amod',
 'compound',
 'compound',
 'dobj',
 'punct']

# extract parent -> head -> modifier from treebank

In [8]:
from collections import Counter
import pandas as pd

def count_3rd_order_sets(treebank_file, vocab):
    samples = vocab._read_conll(treebank_file, tokenize=False)
    counter = Counter()
    for sample in samples:
        extract_3O_paths(sample[3], sample[4], counter)
        
    factors = [{"afactor":"%s->%s->%s"%k, "count":v} for k,v in counter.items()]
    df = pd.DataFrame.from_dict(factors).sort_values("count", ascending=False)
    
    return df

def extract_3O_paths(tree, tags, counter, root_token="<root>"):    
    for m, h in enumerate(tree):
        p = tree[h]
        parent_tag, head_tag, modifier_tag = [tags[i] if i >= 0 else tags[0] for i in [p, h, m]]
        parent_tag, head_tag, modifier_tag = [t if t != 1 else root_token for t in [parent_tag, head_tag, modifier_tag]]
        counter.update([(parent_tag, head_tag, modifier_tag)])

def factor_to_str(t):
    return "->".join(t)

def get_factor_weights(dataframe):
    dataset = []
    factor_map = defaultdict(int)
    for i, row in dataframe.iterrows():
        factor = row["afactor"]
        factor_map[i] = factor
        for _ in range(row["count"]):
            dataset.append(i)
    classes = np.unique(dataset)
    weights = class_weight.compute_class_weight("balanced", classes, dataset)
    return {factor_map[c]:w for c,w in zip(classes,weights)}

def batch_to_weights(sample, factor_weights, tag_map):
    (words, tags), (trees, labels) = sample
    # batch :: (b, n)
    output = np.zeros(trees.shape)
    for b, tree in enumerate(trees):
        for m, h in enumerate(tree):
            p = tree[h]
            parent_tag, head_tag, modifier_tag = [
                tags[b, i] if i >= 0 else tags[b, 0]
                for i in [p, h, m]
            ]
            factor_ = (parent_tag, head_tag, modifier_tag)
            factor = [tag_map[e] for e in factor_]
            factor = factor_to_str(factor)
            try:
                output[b, m] = factor_weights[factor]
            except:
                print(factor_)
                raise
        
    return output

In [9]:
ptb_df = count_3rd_order_sets("../data/ptb_conllu/train.conllu", vocab)

In [10]:
print(ptb_df.shape)
ptb_df.head()

(5479, 2)


Unnamed: 0,afactor,count
26,<root>->root->punct,66432
38,<root>-><root>->root,37977
0,root-><root>-><root>,37660
41,<root>->root->nsubj,32967
1,root->nmod->case,17664


In [None]:
(ptb_df["count"] == 1).sum()/ptb_df.shape[0]

In [None]:
ptb_df.plot.hist(bins=100)

In [None]:
x = ptb_df["count"] < 10
x.astype(int).plot.hist()

# UD EWT

In [None]:
ud_en_df = count_3rd_order_sets("../data/en_ewt-ud-train.conllu")

In [None]:
print(ud_en_df.shape)
ud_en_df.head()

In [None]:
ud_factor_weights = get_factor_weights(ud_en_df)
ud_factor_weights

# lets test it all out

In [None]:
vocab = Vocabulary()
vocab.fit("../data/en_ewt-ud-train.conllu")

In [None]:
X = vocab.tokenize_conll("../data/en_ewt-ud-train.conllu")

In [None]:
from uniparse.dataprovider import batch_by_buckets
X = batch_by_buckets(X, batch_size=32, shuffle=True)

In [None]:
_, samples = X

In [None]:
x, y = samples[0]

In [None]:
label_weights = batch_to_weights(samples[0], ud_factor_weights, vocab.tag2id)

In [None]:
ud_sample_weights = [batch_to_weights(samples[0], ud_factor_weights, vocab.tag2id) for sample in samples]

In [None]:
ud_sample_weights[1].shape

In [None]:
X_hat = [(x+(w,),y)for (x,y), w in zip(X[1], ud_sample_weights)]
words, tags, weights = X_hat[0][0]

In [None]:
words.shape, weights.shape

# lets do it from the top

In [13]:
from uniparse.models.dynet.syntax_att import Parser

def train(train_file, dev_file, test_file, n_epochs, parameter_file, vocab_file, model_class):
    """Training procedure."""
    vocab = Vocabulary()
    vocab = vocab.fit(train_file)
    
    # "../data/en_ewt-ud-train.conllu"
    train_file_df = count_3rd_order_sets(train_file, vocab)
    
    # save vocab for reproducability later
    if vocab_file:
        print("> saving vocab to", vocab_file)
        vocab.save(vocab_file)

    # prep data
    print(">> Loading in data")
    train_data = vocab.tokenize_conll(train_file)
    dev_data = vocab.tokenize_conll(dev_file)
    test_data = vocab.tokenize_conll(test_file)

    train_batches = batch_by_buckets(train_data, batch_size=32, shuffle=True)
    dev_batches = batch_by_buckets(dev_data, batch_size=32, shuffle=True)
    test_batches = batch_by_buckets(test_data, batch_size=32, shuffle=False)

    indicies, samples = train_batches
    
    factor_weights = get_factor_weights(train_file_df)

    label_weights = [batch_to_weights(sample, factor_weights, vocab._rel2id) for sample in samples]
    X_hat = [(x+(w,),y) for (x,y), w in zip(train_batches[1], label_weights)]
    train_batches = (indicies, X_hat)

    model = model_class(vocab)

    save_callback = ModelSaveCallback(parameter_file)
    callbacks = [save_callback]

    # prep params
    parser = Model(model, optimizer="adam", vocab=vocab)

    parser.train(train_batches, dev_file, dev_batches, epochs=n_epochs, callbacks=callbacks, verbose=True)
    parser.load_from_file(parameter_file)

    metrics = parser.evaluate(test_file, test_batches, delete_output=False)
    test_UAS = metrics["nopunct_uas"]
    test_LAS = metrics["nopunct_las"]

    print(metrics)

    print()
    print(">>> Model maxed on dev at epoch", save_callback.best_epoch)
    print(">>> Test score:", test_UAS, test_LAS)

In [14]:
TRAIN = "../data/en_ewt-ud-train.conllu"
DEV = "../data/en_ewt-ud-dev.conllu"
TEST = "../data/en_ewt-ud-test.conllu"
train(TRAIN, DEV, TEST, 30, "model.params", "model.vocab", Parser)

> saving vocab to model.vocab
>> Loading in data


TypeError: 'method' object is not subscriptable