In [None]:
# Environment setup: pick the GPU and extend the import path.
import os
# Expose only GPU 1 to CUDA. This is set here, before torch is imported in later cells.
os.environ['CUDA_VISIBLE_DEVICES'] = '1'
import site
# Register the directory two levels up as a site directory
# (presumably where the `emr` and `regr` packages live — confirm).
site.addsitedir('../..')

## APIs to retrieve nodes in the graph

In [None]:
# Several equivalent ways to retrieve a node from the graph.

from emr.graph import graph

# Retrieve through attribute access (__getattr__)
print(graph.linguistic.phrase)
# Retrieve through a path-like key (__getitem__)
print(graph['linguistic/phrase'])
# Retrieve through the explicit container properties
print(graph.subs['linguistic'].concepts['phrase'])
# TODO: consider supporting wildcards the way the glob module does,
# for example: graph['*/phrase']

# The styles can be mixed: follow `sup` from the sub-graph and look the node up again
# (presumably `sup` is the enclosing graph — confirm).
print(graph.subs['linguistic'].sup.linguistic.phrase)
print(graph.subs['linguistic'].sup['linguistic/phrase'])

## Experimental Area

## Developing Area

### regr code

In [None]:
from typing import List, Dict, Iterable
from allennlp.models.model import Model
from allennlp.data.vocabulary import Vocabulary
from torch import Tensor
from typing import Dict


class BaseModel(Model):
    """Base model holding the metric and loss bookkeeping shared by subclasses.

    Attributes:
        field_name: maps the roles 'output', 'label' and 'mask' to the keys
            under which those tensors are looked up in the data dict.
        metrics: metric name -> metric object.  Each metric is called with
            (output, label, mask) and queried through ``get_metric``.
        loss_func: optional callable that receives the data dict and returns
            the loss; ``None`` until one is attached.
    """

    def __init__(self, vocab: Vocabulary) -> None:
        super().__init__(vocab)
        self.field_name = {'output': 'logits',
                           'label': 'label',
                           'mask': 'mask'
                          }
        self.metrics = {}
        # Start without a loss so `_update_loss` can be called safely before
        # a loss function has been attached to the model.
        self.loss_func = None

    def _update_metrics(self, data: Dict[str, Tensor]) -> Dict[str, Tensor]:
        """Feed every metric with output/label/mask and store its value in ``data``.

        Returns the same dict, extended with one entry per metric name.
        """
        for metric_name, metric in self.metrics.items():
            # TODO: consider when there are multiple output
            metric(data[self.field_name['output']],
                   data[self.field_name['label']],
                   data[self.field_name['mask']])
            data[metric_name] = metric.get_metric(False) # no reset
        return data

    def get_metrics(self, reset: bool=False) -> Dict[str, float]:
        """Collect the current metric values as a flat dict.

        A metric that returns an iterable is expanded into one entry per
        element, keyed ``'<name>[<index>]'``; any other value is stored
        under the metric name itself.
        """
        output = {}
        for metric_name, metric in self.metrics.items():
            out = metric.get_metric(reset)
            if isinstance(out, Iterable):
                for i, out_item in enumerate(out):
                    output['{}[{}]'.format(metric_name, i)] = out_item
            else:
                output[metric_name] = out
        return output

    def _update_loss(self, data: Dict[str, Tensor]) -> Dict[str, Tensor]:
        """Store ``loss_func(data)`` under 'loss' when a loss function is attached."""
        if self.loss_func is not None:
            data['loss'] = self.loss_func(data)
        return data

    def forward(
        self,
        **data: Tensor
    ) -> Dict[str, Tensor]:
        """Identity stub: return the keyword arguments as a dict, unchanged."""
        ##
        # This is an identical stub
        # something happen here to take the input to the output
        ##

        return data


In [None]:
import torch

# Toy tensors that differ in their two leading sizes: x is (2, 1, 3), y is (1, 2, 4)
x = torch.arange(6).view(2, 1, 3)
y = torch.arange(10, 18).view(1, 2, 4)
print(x, y, sep='\n')
# torch.cat does not broadcast, so both operands are tiled to (2, 2, *) first
# and then joined along the last axis.  The same idea also works by viewing
# both tensors as 4-D and concatenating on dim 3.
torch.cat((x.repeat(1, 2, 1), y.repeat(2, 1, 1)), dim=2)

In [None]:
from typing import Any, Union, List, Tuple, Dict, Callable
from allennlp.models.model import Model
import torch
from torch import Tensor
from torch.nn import Module
from allennlp.modules.token_embedders import Embedding
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.nn.util import get_text_field_mask
from regr import Graph, Concept


DataInstance = Dict[str, Tensor]
DataSource = List[DataInstance]
# NB: a data source is a list of instances, each a dict from name to the data
#     format used inside the modules.  For AllenNLP an Instance is a dict of
#     str -> Field; the real tensors only appear in the forward function.
ModelFunc = Callable[[DataInstance], DataInstance]
ModuleFunc = Callable[[DataInstance], Tensor]
# NB: a model function transforms a dict of str -> Tensor into an updated dict;
#     module objects in AllenNLP have a forward function of this form.
# NB 2: torch.nn.Module = Callable[[Any], Tensor]
#     The intended usage: construct the modules in make_model (more precisely,
#     in Library), capture them in callback functions, and invoke them once
#     the real data arrives and the callbacks are called.

class Library(object):
    """Factory of building blocks that can be bound to graph properties.

    Every factory method returns a ``(module, func)`` pair: ``module`` is the
    ``torch.nn.Module`` created for the block (``None`` when the block has
    none) and ``func`` maps a data dict to the tensor of that block.
    Upstream inputs are passed in as ``(func, conf)`` tuples; only the
    callable at index 0 is used here.
    """

    def __init__(self) -> None:
        self.modules = []  # (module, func) pairs; only fc_sm records itself here
        self.loss = None  # not read anywhere inside this class

    def data(
        self,
        data_name: str
    ) -> Tuple[None, ModuleFunc]:
        """Build a source that reads ``data[data_name]`` straight from the data dict.

        Returns ``(None, func)`` because no module is involved.
        """
        def func(data: DataInstance) -> Tensor:
            return data[data_name]

        return None, func

    def word2vec(
        self,
        input_func: Tuple[ModuleFunc, int],
        num_embeddings: int,
        embedding_dim: int,
        token_name: str,
    ) -> Tuple[Module, ModuleFunc]:
        """Build a text-field embedder applied to the output of ``input_func``.

        Args:
            input_func: ``(func, conf)`` tuple producing the embedder input.
            num_embeddings: size of the embedding table.
            embedding_dim: dimension of every embedding vector.
            token_name: key of the token embedder inside the text-field
                embedder, e.g. 'tokens'.
        """
        # token_name='tokens' is from data reader, name of TokenIndexer
        # seq_name='sentence' is from data reader, name of TextField
        # quite confusing, TODO: real want to get rid of them
        token_embedding = Embedding(
            num_embeddings=num_embeddings,
            embedding_dim=embedding_dim)
        word_embeddings = BasicTextFieldEmbedder({
            token_name: token_embedding})

        def func(data: DataInstance) -> Tensor:
            tensor = input_func[0](data)  # input_func is tuple(func, conf)
            return word_embeddings(tensor)

        return word_embeddings, func

    def cartesianprod_concat(
        self,
        input_func: Tuple[ModuleFunc, int]
    ) -> Tuple[Module, ModuleFunc]:
        """Build a block that pairs every position of a sequence with every other.

        The input of shape (b, l, f) is combined with itself into a tensor of
        shape (b, l, l, 2 * f) whose entry [b, i, j] is the concatenation of
        the features at positions i and j.
        """
        class Cpcat(Module):
            def forward(self, x, y):  # (b,l1,f1) x (b,l2,f2) -> (b, l1, l2, f1+f2)
                assert x.size(0) == y.size(0), 'batch sizes differ'
                l1, l2 = x.size(1), y.size(1)
                # torch.cat does not broadcast; expand() creates the
                # broadcast views without the copies repeat() would make.
                xx = x.unsqueeze(2).expand(-1, -1, l2, -1)
                yy = y.unsqueeze(1).expand(-1, l1, -1, -1)
                return torch.cat([xx, yy], dim=3)
        cpcat = Cpcat()

        def func(data: DataInstance) -> Tensor:
            tensor = input_func[0](data)  # input_func is tuple(func, conf)
            return cpcat(tensor, tensor)

        return cpcat, func

    def fc_sm(
        self,
        input_func: Tuple[ModuleFunc, int],
        input_dim: int,
        label_dim: int,
    ) -> Tuple[Module, ModuleFunc]:
        """Build a linear layer mapping ``input_dim`` features to ``label_dim`` scores.

        Only the linear layer is applied; no softmax is computed here.
        The resulting ``(module, func)`` pair is also appended to ``self.modules``.
        """
        fc = torch.nn.Linear(
            in_features=input_dim,
            out_features=label_dim)

        def func(data: DataInstance) -> Tensor:
            tensor = input_func[0](data)  # input_func is tuple(func, conf)
            return fc(tensor)

        self.modules.append((fc, func))
        return fc, func


class Scaffold(object):
    """Binds modules to concept properties and derives a model class and a loss from them."""

    def __init__(
        self
    ) -> None:
        # Every module handed to assign(), in call order (None entries included)
        self.modules = []

    def assign(
        self,
        concept: Concept,
        prop: str,
        module: Module,
        func: ModuleFunc
    ) -> None:
        """Attach ``func`` to ``concept[prop]`` together with a confidence flag.

        Args:
            concept: concept that receives the assignment.
            prop: name of the property being assigned.
            module: module behind ``func``, or ``None`` when there is none.
            func: callable mapping the data dict to the property's tensor.

        The stored callable also writes its result into the data dict under
        ``'<concept.fullname>[<prop>]'``.
        """
        self.modules.append(module)

        def wrap_func(data: DataInstance) -> Tensor:
            # TODO: add cache to avoid repeat computation of same function
            tensor = func(data)
            data[concept.fullname+'[{}]'.format(prop)] = tensor
            return tensor

        # No module means no parameter, i.e. a trusted source: confidence 1.
        # (The argument `module` is tested here, not the `Module` class.)
        conf = 1 if module is None else 0
        concept[prop] = wrap_func, conf

    def build(
        self,
        graph: Graph
    ) -> Model:
        """Create a model class whose instances register every assigned module.

        Note that the class itself is returned; the caller instantiates it
        with a vocabulary.
        """
        scaffold = self

        class ScaffoldedModel(BaseModel):
            def __init__(
                self_,
                vocab: Vocabulary
            ) -> None:
                model = self_
                BaseModel.__init__(model, vocab)

                from allennlp.training.metrics import F1Measure
                model.metrics = {
                    'p/r/f1': F1Measure(1)
                }

                # Register the modules under their position in assignment order
                for i, module in enumerate(scaffold.modules):
                    model.add_module(str(i), module)

            def forward(
                self_,
                **data: Tensor
            ) -> DataInstance:
                model = self_
                # just prototype
                # TODO: how to retrieve the sequence properly?
                # I used to have topological-sorting over the module graph in my old frameworks

                data[model.field_name['mask']] = get_text_field_mask(data['sentence'])

                # Calling the assigned function stores its output in `data`
                # under '<fullname>[label]', which is the key read back below.
                graph.people['label'][1][0](data)
                model.field_name['output'] = graph.people.fullname+'[label]'
                model.field_name['label'] = 'Peop_labels'

                data = model._update_metrics(data)
                data = model._update_loss(data)

                return data

        return ScaffoldedModel

    @staticmethod
    def _balance_mask(mask: Tensor, target: Tensor, bfactor: float) -> Tensor:
        """Return a float copy of ``mask`` re-weighted for class balance.

        Args:
            mask: tensor with the same shape as ``target``.
            target: boolean-like tensor marking the positive entries.
            bfactor: 1 leaves the mask unchanged, 0 balances fully.

        Entries selected by ``target`` are scaled by
        ``(neg + pos * bfactor) / (pos + neg)`` and the remaining entries by
        ``(pos + neg * bfactor) / (pos + neg)``, where ``pos`` / ``neg``
        count the positive / negative entries.  ``mask`` is not modified.
        """
        # Work on a float copy: an integer mask cannot hold fractional weights.
        weights = mask.float().clone()
        # `target == 0` instead of `1 - target`, which is rejected for bool tensors.
        negative = (target == 0)
        pos = target.sum().float()
        neg = negative.sum().float()
        total = pos + neg
        weights[target] *= (neg + pos * bfactor) / total
        weights[negative] *= (pos + neg * bfactor) / total
        return weights

    def get_loss(
        self,
        graph: Graph,
        model: BaseModel
    ) -> Callable[[DataInstance], Tensor]:
        """Build the loss over every property that has multiple assignments.

        For each entry yielded by ``graph.get_multiassign()`` the first
        assigned function provides the labels and the second one the
        predictions; a class-balanced, masked cross entropy between the two
        is accumulated.  Predictions of rank 3 and rank 4 are supported,
        other ranks contribute nothing.
        """
        mapr = list(graph.get_multiassign()) # generator will be consumed, use list

        def loss_func(
            data: DataInstance
        ) -> Tensor:
            from allennlp.nn.util import sequence_cross_entropy_with_logits
            bfactor = float(0.5) # [1 - no balance, 0 - balance]
            loss = 0
            for name, funcs in mapr:
                vals = [func(data) for func, conf in funcs]
                label, pred = vals[0], vals[1] # NB: the order!

                if pred.dim() == 3: # (b,l,c)
                    mask = data[model.field_name['mask']]
                    # class balance weighted
                    weights = self._balance_mask(mask, label > 0, bfactor)
                    loss += sequence_cross_entropy_with_logits(pred, label, weights)
                elif pred.dim() == 4: # (b,l1,l2,c)
                    # TODO: this is only self relation mask since we have only one input mask
                    mask = data[model.field_name['mask']].float() # (b,l,)
                    pair_mask = mask.unsqueeze(2) * mask.unsqueeze(1) # (b,l,l)
                    # label -> (b,l1,l2,), elements are -1 (padding) and 1 (label)
                    # TODO: nosure how to retrieve padding correctly
                    target = (label > 0)
                    ts = target.size()
                    # class balance weighted
                    weights = self._balance_mask(pair_mask, target, bfactor)
                    loss += sequence_cross_entropy_with_logits(
                        pred.view(ts[0], ts[1]*ts[2], -1),
                        target.view(ts[0], ts[1]*ts[2]),
                        weights.view(weights.size(0), -1)
                    )
                else:
                    pass

            return loss

        return loss_func

In [None]:
from allennlp.data.vocabulary import Vocabulary


DataSource = List[Dict[str, Tensor]]
# Keep this alias in step with the DataSource alias declared next to Library.


class Data(object):
    """Holds the dataset splits and a vocabulary built from all given splits."""

    def __init__(
        self,
        train_dataset: DataSource=None,
        valid_dataset: DataSource=None,
        test_dataset: DataSource=None,
    ) -> None:
        self.train_dataset = train_dataset
        self.valid_dataset = valid_dataset
        self.test_dataset = test_dataset

        # Pool the instances of every split that was provided
        pooled = []
        for split in (train_dataset, valid_dataset, test_dataset):
            if split is not None:
                pooled += split

        self.vocab = Vocabulary.from_instances(pooled)

    def __getitem__(self, name: str) -> str:
        """Return ``name`` itself: the identifier a module uses to fetch the data in forward."""
        return name

In [None]:
from allennlp.training.trainer import Trainer
import torch.optim as optim
from allennlp.data.iterators import BucketIterator

DEBUG_TRAINING = False  # when True, training stays on the CPU even if CUDA is available


def get_trainer(
    graph: Graph,
    model: BaseModel,
    data: Data,
    scaffold: Scaffold,
    lr=0.1, batch=128, epoch=1000, patience=10
) -> Trainer:
    """Attach the scaffold's loss to ``model`` and wrap everything in a Trainer.

    Args:
        graph: graph the loss is derived from.
        model: model to train; its ``loss_func`` attribute is set here.
        data: provides the training and validation datasets.
        scaffold: supplies the loss through ``get_loss``.
        lr: learning rate for Adam.
        batch: batch size of the bucket iterator.
        epoch: number of epochs handed to the trainer.
        patience: patience handed to the trainer.
    """
    # the loss comes from the scaffold
    model.loss_func = scaffold.get_loss(graph, model)

    # move to the GPU only when one is available and debugging is off
    use_gpu = torch.cuda.is_available() and not DEBUG_TRAINING
    if use_gpu:
        model = model.cuda()
    device = 0 if use_gpu else -1

    # optimizer and batching
    optimizer = optim.Adam(model.parameters(), lr=lr)
    sorting_keys = [('sentence', 'num_tokens')]
    iterator = BucketIterator(batch_size=batch, sorting_keys=sorting_keys)
    iterator.index_with(model.vocab)

    return Trainer(
        model=model,
        optimizer=optimizer,
        iterator=iterator,
        train_dataset=data.train_dataset,
        validation_dataset=data.valid_dataset,
        patience=patience,
        num_epochs=epoch,
        cuda_device=device,
    )


### example code

In [None]:
from emr.data import Conll04DatasetReader as Reader
#from emr.model import get_trainer
#from emr.graph import graph

# types for typing style function
from typing import List, Dict
from allennlp.data.vocabulary import Vocabulary
from allennlp.data.instance import Instance
from allennlp.models.model import Model
from regr import Graph
from allennlp.training.trainer import Trainer

In [None]:
from emr.data import Conll04Reader
from typing import Iterable, List, Tuple
from allennlp.data.dataset_readers import DatasetReader
from allennlp.data.tokenizers import Token
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data import Instance
from allennlp.data.fields import TextField, SequenceLabelField, AdjacencyField

# Module-level parser instance; EMRPeopWorkforOrgReader._read calls it with a file path.
conll04_reader = Conll04Reader()


class EMRPeopWorkforOrgReader(DatasetReader):
    """Dataset reader producing binary 'Peop' / 'Org' tags and 'Work_For' relation pairs."""

    def __init__(self) -> None:
        super().__init__(lazy=False)
        # The key 'tokens' is arbitrary here, but the modules used by word2vec
        # must be given the same name.
        self.token_indexers = {'tokens': SingleIdTokenIndexer()}

    def to_instance(
        self,
        sentence: Tuple[List[str], List[str]],
        relations=None,
    ) -> Instance:
        """Convert one sentence into an Instance.

        Args:
            sentence: pair of (words, labels); labels may be None.
            relations: optional relation records; the relation name is read
                from index 0 and the head / tail token index from the first
                element of items 1 and 2.

        The instance always holds 'sentence'.  With labels it also holds
        'Peop_labels' and 'Org_labels' (the strings 'True' / 'False' per
        token); with relations it holds 'relation_labels', the index pairs
        of the 'Work_For' relations.
        """
        texts, labels = sentence[0], sentence[1]
        text_field = TextField(
            [Token(word) for word in texts],
            self.token_indexers)
        fields = {'sentence': text_field}

        if labels is not None:
            # entity labels seen in the data: ['Other', 'Loc', 'Peop', 'Org', 'O']
            for entity in ('Peop', 'Org'):
                fields[entity + '_labels'] = SequenceLabelField(
                    [str(label == entity) for label in labels],
                    text_field)

        if relations is not None:
            # relation names seen in the data:
            # ['Live_In', 'OrgBased_In', 'Located_In', 'Work_For']
            work_for_pairs = []
            for rel in relations:
                head_tail = (rel[1][0], rel[2][0])
                if rel[0] == 'Work_For':
                    work_for_pairs.append(head_tail)
            # no labels are passed: the presence of a pair is the binary label
            fields['relation_labels'] = AdjacencyField(
                work_for_pairs,
                text_field,
            )

        return Instance(fields)

    def _read(
        self,
        file_path: str
    ) -> Iterable[Instance]:
        """Yield one Instance per sentence parsed from ``file_path``."""
        sentences, relations = conll04_reader(file_path)
        for sentence_parts, relation in zip(sentences, relations):
            words, _pos, labels = sentence_parts
            yield self.to_instance((words, labels), relation)


In [None]:
# Read the training split with the reader defined above (its constructor passes lazy=False).
reader = EMRPeopWorkforOrgReader()
train_dataset = reader.read("data/EntityMentionRelation/conll04_train.corp")

def printable(dataset, head=10, cond=(lambda ins : True)):
    """Turn the first ``head`` instances that satisfy ``cond`` into plain dicts.

    Each field is shown as ``field.indices`` when it is an AdjacencyField and
    as ``list(field)`` otherwise.
    """
    rows = []
    for ins in dataset[:head]:
        if not cond(ins):
            continue
        row = {}
        for k, v in ins.items():
            row[k] = v.indices if isinstance(v, AdjacencyField) else list(v)
        rows.append(row)
    return rows

# Among the first 50 instances, show those whose 'relation_labels' field holds at least one index pair.
printable(train_dataset, head=50, cond=(lambda ins: len(ins['relation_labels'].indices) > 0))

In [None]:
from regr import Graph, Concept


# Start from a clean state so that re-running this cell re-declares the graph
# (presumably clear() drops everything registered so far — confirm against regr).
Graph.clear()
Concept.clear()
with Graph('global') as graph:
    # Basic unit
    word = Concept(name='word')
    
    # Entity concepts, each declared to be a word
    people = Concept(name='people')
    organization = Concept(name='organization')
    people.be(word)
    organization.be(word)
    
    # A pair is a tuple of two words; workfor names its two roles explicitly
    pair = Concept(name='pair')
    pair.be((word,word))
    workfor = Concept(name='workfor')
    workfor.be({'employee':people, 'employer':organization})

# Display the declared graph as the cell output
graph

In [None]:
# Data settings: file names are joined onto relative_path when the data is read
relative_path = "data/EntityMentionRelation"
train_path = "conll04_train.corp"
valid_path = "conll04_test.corp"

# Model settings
EMBEDDING_DIM = 16
# NOTE(review): HIDDEN_DIM is not referenced by the code visible in this notebook — confirm it is still needed
HIDDEN_DIM = 8

# Training settings
LR = 0.1
BATCH = 128
EPOCH = 10 # shortened run; 1000 is the alternative value noted by the author
PATIENCE = 10

In [None]:
from emr.graph import Concept


# develop by an ML programmer to wire nodes in the graph and ML Models
# develop by an ML programmer to wire nodes in the graph and ML Models
def make_model(graph: Graph,
               data: Data,
               scaffold: Scaffold
              ) -> Model:
    """Wire data sources and modules to the concepts of ``graph`` and build the model.

    Args:
        graph: graph providing the concepts word, people, organization,
            pair and workfor.
        data: supplies the field identifiers and the vocabulary.
        scaffold: collects the assignments and builds the model class.

    Returns:
        The model instantiated with ``data.vocab``.
    """
    # Get every concept from the graph argument rather than from notebook
    # globals, so the function works on whichever graph it is given.
    word = graph.word
    people = graph.people
    organization = graph.organization
    pair = graph.pair
    workfor = graph.workfor

    # binding
    graph.release() # release anything bound before the new assignment
    lib = Library()

    # filling in data and label
    scaffold.assign(word, 'index', *lib.data(data['sentence']))
    scaffold.assign(people, 'label', *lib.data(data['Peop_labels']))
    scaffold.assign(organization, 'label', *lib.data(data['Org_labels']))
    scaffold.assign(workfor, 'label', *lib.data(data['relation_labels']))

    # building model
    scaffold.assign(word, 'w2v',
                    *lib.word2vec(
                        word['index'],
                        data.vocab.get_vocab_size('tokens'),
                        EMBEDDING_DIM,
                        'tokens'
                    ))
    scaffold.assign(people, 'label',
                    *lib.fc_sm(
                        word['w2v'],
                        EMBEDDING_DIM,
                        2
                    ))
    scaffold.assign(organization, 'label',
                    *lib.fc_sm(
                        word['w2v'],
                        EMBEDDING_DIM,
                        2
                    ))
    # TODO: pair['w2v'] should be inferred from word['w2v'] according to their relationship
    # but we specify it here to make it a bit easier for implementation
    scaffold.assign(pair, 'w2v',
                    *lib.cartesianprod_concat(
                        word['w2v']
                    ))
    # a pair concatenates two word vectors, hence twice the embedding size
    scaffold.assign(workfor, 'label',
                    *lib.fc_sm(
                        pair['w2v'],
                        EMBEDDING_DIM * 2,
                        2
                    ))
    # now the 'label' properties have multiple assignments,
    # and the loss should come from the inconsistency between them

    # get the model
    ModelCls = scaffold.build(graph) # or should it be model = graph.build()
    # NB: Links in the graph may be used to provide non-parameterized
    #     transformations, which is a core feature of our graph.
    #     Is there a better semantic interface design?
    model = ModelCls(data.vocab)

    return model

In [None]:
# Environment setup: fix the random seeds.

#import logging
# logging.basicConfig(level=logging.INFO)

def seed1():
    """Seed NumPy, Python's ``random`` module and PyTorch with the value 1."""
    import random
    import numpy as np
    import torch

    for seeder in (np.random.seed, random.seed, torch.manual_seed):
        seeder(1)

seed1()

In [None]:
import os

# data: read both splits and build the vocabulary over them
reader = EMRPeopWorkforOrgReader()
train_dataset = reader.read(os.path.join(relative_path, train_path))
valid_dataset = reader.read(os.path.join(relative_path, valid_path))
data = Data(train_dataset, valid_dataset)

scaffold = Scaffold()

# model from graph
model = make_model(graph, data, scaffold)

# trainer for model; pass the training settings from the configuration cell
# explicitly, otherwise get_trainer falls back to its own defaults
# (for example 1000 epochs instead of EPOCH)
trainer = get_trainer(graph, model, data, scaffold,
                      lr=LR, batch=BATCH, epoch=EPOCH, patience=PATIENCE)

# train the model
trainer.train()

# save the model weights and the vocabulary
with open("/tmp/model.th", 'wb') as fout:
    torch.save(model.state_dict(), fout)
data.vocab.save_to_files("/tmp/vocab")