In [None]:
# Environment setup.
import os
# Restrict the CUDA devices visible to this process to device '1'.
# This is set before any of the torch/allennlp imports in the cells below.
os.environ['CUDA_VISIBLE_DEVICES'] = '1'
import site
# Register the directory two levels up as a site directory so that packages
# living there can be imported (presumably the project packages `emr` and
# `regr` used below -- TODO confirm against the repository layout).
site.addsitedir('../..')

## APIs to retrieve nodes in the graph

In [None]:
# Different ways to retrieve a node (concept) from the graph.
# Each print below addresses `phrase` inside the `linguistic` sub-graph.

from emr.graph import graph

# retrieve by attribute access (__getattr__)
print(graph.linguistic.phrase)
# retrieve by path-like key (__getitem__)
print(graph['linguistic/phrase'])
# retrieve through the explicit `subs` / `concepts` containers
print(graph.subs['linguistic'].concepts['phrase'])
# TODO: consider supporting wildcards, in the way the `glob` module does.
# Example: graph['*/phrase']

# The styles can be mixed: `sup` is used here to step from the sub-graph
# back to its parent before descending again.
print(graph.subs['linguistic'].sup.linguistic.phrase)
print(graph.subs['linguistic'].sup['linguistic/phrase'])

## Experimental Area

## Developing Area

In [None]:
from emr.data import Conll04Reader
from typing import Iterable
from allennlp.data.dataset_readers import DatasetReader
from allennlp.data.tokenizers import Token
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data import Instance
from allennlp.data.fields import TextField, SequenceLabelField


# Shared corpus parser; NERPeopReader._read calls it with a file path.
conll04_reader = Conll04Reader()


class NERPeopReader(DatasetReader):
    """Dataset reader that yields one single-token instance per corpus word.

    Every instance has a ``'sentence'`` text field holding exactly one token
    and, when a label is supplied, a ``'label'`` field whose only tag is the
    string ``'True'`` or ``'False'`` depending on whether the word is
    labelled ``'Peop'``.
    """

    def __init__(self) -> None:
        super().__init__(lazy=False)
        # The indexer key 'tokens' is an arbitrary name, but make_model
        # passes the same name on to Library.word2vec, so keep them aligned.
        self.token_indexers = {'tokens': SingleIdTokenIndexer()}

    def word_to_instance(self, word: str, label: str=None) -> Instance:
        """Wrap a single word (and optionally its entity label) in an Instance.

        :param word: surface form of the token.
        :param label: entity tag from the corpus; ``None`` produces an
            instance without a ``'label'`` field.
        :return: the assembled Instance.
        """
        text_field = TextField([Token(word), ], self.token_indexers)
        fields = {'sentence': text_field}
        if label is not None:
            # Corpus tags are ['Other', 'Loc', 'Peop', 'Org', 'O'];
            # collapse them into a binary "is a person" tag.
            is_peop = str(label == 'Peop')
            fields['label'] = SequenceLabelField([is_peop, ], text_field)
        return Instance(fields)

    def _read(self, file_path: str) -> Iterable[Instance]:
        """Yield one instance per word of every sentence found in ``file_path``.

        Relations returned by the corpus parser are iterated alongside the
        sentences but are not used.
        """
        sentences, relations = conll04_reader(file_path)
        for sentence_info, _relation in zip(sentences, relations):
            tokens, _pos, tags = sentence_info
            for token, tag in zip(tokens, tags):
                yield self.word_to_instance(token, tag)


In [None]:
from typing import List, Dict
from allennlp.models.model import Model
from allennlp.data.vocabulary import Vocabulary
from torch import Tensor
from typing import Dict


class BaseModel(Model):
    """Base model whose ``forward`` passes a dict of named values through.

    Subclasses populate ``self.metrics`` and may get a ``loss_func``
    installed after construction. The ``_update_metrics`` / ``_update_loss``
    helpers then enrich the data dict with metric values and the loss.
    """

    def __init__(self, vocab: Vocabulary) -> None:
        super().__init__(vocab)
        # Keys under which the helpers look up tensors in the data dict.
        self.field_name = {'output': 'logits',
                           'label': 'label',
                           'mask': 'mask'
                          }
        # metric name -> metric object, called as metric(output, label, mask)
        self.metrics = {}
        # Callable mapping the data dict to a loss value. It is assigned
        # later (see get_trainer); initialising it here keeps _update_loss
        # from raising AttributeError when forward runs before that happens.
        self.loss_func = None

    def _update_metrics(self, data: Dict[str, Tensor]):
        """Feed output/label/mask to every metric and store the running value.

        :param data: data dict; must contain the keys named in ``field_name``.
        :return: the same dict, with one extra entry per metric name.
        """
        for metric_name, metric in self.metrics.items():
            metric(data[self.field_name['output']],
                   data[self.field_name['label']],
                   data[self.field_name['mask']])
            data[metric_name] = metric.get_metric(False) # no reset
        return data

    def get_metrics(self, reset: bool=False) -> Dict[str, float]:
        """Return the current value of every registered metric.

        :param reset: forwarded to each metric's ``get_metric``.
        """
        return {metric_name: metric.get_metric(reset)
                for metric_name, metric in self.metrics.items()}

    def _update_loss(self, data):
        """Store ``loss_func(data)`` under ``'loss'`` when a loss is installed.

        :return: the same dict, unchanged if ``loss_func`` is ``None``.
        """
        if self.loss_func is not None:
            data['loss'] = self.loss_func(data)
        return data

    def forward(
        self,
        **data: Tensor
    ) -> Dict[str, Tensor]:
        """Identity stub: return the keyword arguments as a dict.

        Subclasses override this with the real input-to-output computation.
        """
        return data


In [None]:
from typing import Any, Union, List, Tuple, Dict, Callable
from allennlp.models.model import Model
import torch
from torch import Tensor
from torch.nn import Module
from allennlp.modules.token_embedders import Embedding
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from regr import Graph, Concept


DataInstance = Dict[str, Tensor]
DataSource = List[DataInstance]
# NB: a DataSource is a list of instances (dict of str to whatever data
#     format the module consumes). For AllenNLP an Instance is a dict of
#     str:Field, and the real tensors only show up in the forward function.
ModelFunc = Callable[[DataInstance], DataInstance]
ModuleFunc = Callable[[DataInstance], Tensor]
# NB: a ModelFunc transforms a dict of str:Tensor into an updated dict;
#     module objects in AllenNLP have a forward function of this form.
#     (ModelFunc is not referenced by the code visible in this notebook.)
# NB 2: torch.nn.Module = Callable[[Any], Tensor]
#     The intended usage: construct the modules in make_model (precisely,
#     in Library), capture them in callback functions, and invoke them only
#     when the real data arrives and the callbacks are called.


class Library(object):
    """Factory for ``(module, callback)`` pairs used to wire up a model.

    Each factory method returns the parameterised module (or ``None`` when
    there is none) together with a callback that takes the data dict and
    returns a tensor. Upstream inputs are passed as ``input_func`` and are
    indexed with ``[0]`` to obtain the actual callable, i.e. they are
    expected to be ``(func, conf)`` tuples as stored by ``Scaffold.assign``.
    """

    def __init__(self) -> None:
        # NOTE(review): only fc_sm registers its pair here; data and
        # word2vec do not. Nothing visible in this notebook reads the list.
        self.modules = []
        self.loss = None

    def data(
        self,
        data_name: str
    ) -> Tuple[Module, ModuleFunc]:
        """Create a parameter-free source that reads ``data[data_name]``.

        :return: ``(None, callback)``; there is no module for raw data.
        """
        def fetch(data: DataInstance) -> Tensor:
            return data[data_name]

        return None, fetch

    def word2vec(
        self,
        input_func: ModuleFunc,
        num_embeddings: int,
        embedding_dim: int,
        token_name: str,
    ) -> Tuple[Module, ModuleFunc]:
        """Create a token-embedding stage fed by ``input_func``.

        :param input_func: upstream ``(func, conf)`` pair.
        :param num_embeddings: number of rows of the embedding table.
        :param embedding_dim: size of each embedding vector.
        :param token_name: key of the token indexer declared by the data
            reader (``'tokens'`` in this notebook).
        :return: ``(text field embedder, callback)``.
        """
        # TODO (kept from the original): token_name and the TextField name
        # leak reader details into the library; would be nice to drop them.
        embedder = BasicTextFieldEmbedder({
            token_name: Embedding(
                num_embeddings=num_embeddings,
                embedding_dim=embedding_dim),
        })

        def embed(data: DataInstance) -> Tensor:
            upstream = input_func[0] # input_func is tuple(func, conf)
            return embedder(upstream(data))

        return embedder, embed

    def fc_sm(
        self,
        input_func: ModuleFunc,
        input_dim: int,
        label_dim: int,
    ) -> Tuple[Module, ModuleFunc]:
        """Create a fully connected stage fed by ``input_func``.

        The callback returns the raw output of the linear layer; no softmax
        is applied inside this block.

        :param input_func: upstream ``(func, conf)`` pair.
        :param input_dim: size of the incoming feature vector.
        :param label_dim: number of output units.
        :return: ``(linear layer, callback)``; the pair is also appended to
            ``self.modules``.
        """
        linear = torch.nn.Linear(
            in_features=input_dim,
            out_features=label_dim)

        def project(data: DataInstance) -> Tensor:
            upstream = input_func[0]
            return linear(upstream(data))

        self.modules.append((linear, project))
        return linear, project


class Scaffold(object):
    """Binds graph concepts to callbacks/modules and builds a model from them.

    ``assign`` attaches a callback to a concept property, ``build`` creates
    a model class that owns all assigned modules, and ``get_loss`` derives a
    loss function from properties that were assigned more than once.
    """

    def __init__(
        self
    ) -> None:
        # Every module passed to assign(), in call order (entries may be
        # None for parameter-free sources).
        self.modules = []

    def assign(
        self,
        concept: Concept,
        prop: str,
        module: Module,
        func: ModuleFunc
    ) -> None:
        """Attach ``func`` to ``concept[prop]`` and remember ``module``.

        The stored callback also writes its result into the data dict under
        the key ``concept.fullname + '[prop]'``.

        :param concept: graph concept that receives the assignment.
        :param prop: property name on the concept.
        :param module: parameterised module behind ``func``, or ``None``
            when ``func`` only reads data.
        :param func: callback mapping the data dict to a tensor.
        """
        self.modules.append(module)

        def wrap_func(data: DataInstance) -> Tensor:
            tensor = func(data)
            data[concept.fullname+'[{}]'.format(prop)] = tensor
            return tensor

        # Confidence 1 marks a parameter-free, trusted source; 0 marks a
        # learned module. The test must be on the `module` argument: testing
        # the imported class `Module` is always False and made every
        # assignment look untrusted.
        conf = 1 if module is None else 0
        concept[prop] = wrap_func, conf

    def build(
        self,
        graph: Graph
    ) -> Model:
        """Create the model class for ``graph``.

        :return: the ``ScaffoldedModel`` *class* (not an instance); the
            caller instantiates it with a vocabulary.
        """
        scaffold = self

        class ScaffoldedModel(BaseModel):
            def __init__(
                self_,
                vocab: Vocabulary
            ) -> None:
                model = self_
                BaseModel.__init__(model, vocab)

                from allennlp.training.metrics import CategoricalAccuracy
                model.metrics = {'accuracy': CategoricalAccuracy()}

                # Register every assigned module under its position index.
                for i, module in enumerate(scaffold.modules):
                    model.add_module(str(i), module)

            def forward(
                self_,
                **data: Tensor
            ) -> DataInstance:
                model = self_
                # just prototype
                # TODO: how to retrieve the sequence properly?
                # A topological sort over the module graph would be the
                # general answer.

                # All-ones mask, created on the same device as the labels
                # so it can be combined with them without a transfer.
                labels = data['label']
                data['mask'] = torch.ones(labels.size(), device=labels.device)

                # Run the second assignment of people['label'] (index 1);
                # presumably the classifier assigned in make_model -- the
                # ordering is defined by regr, TODO confirm. The call stores
                # its result in the data dict as a side effect.
                graph.people['label'][1][0](data)
                model.field_name['output'] = graph.people.fullname+'[label]'

                data = model._update_metrics(data)
                data = model._update_loss(data)

                return data

        return ScaffoldedModel

    def get_loss(
        self,
        graph: Graph
    ) -> ModuleFunc:
        """Build the loss function from the graph's multiply-assigned properties.

        :return: a callable mapping the data dict to the loss tensor.
        """
        # Materialise now: the generator would be exhausted after one pass.
        mapr = list(graph.get_multiassign())

        from allennlp.nn.util import sequence_cross_entropy_with_logits

        def loss_func(
            data: DataInstance
        ) -> Tensor:
            # Evaluate every callback of every multiply-assigned property.
            vals = [func(data)
                    for _name, funcs in mapr
                    for func, _conf in funcs]

            # NB: the order! vals[0] is used as the gold label and vals[1]
            # as the logits, which relies on the assignment order.
            logits, gold = vals[1], vals[0]
            # Follow the logits to whatever device they live on.
            mask = data['mask'].to(logits.device)
            return sequence_cross_entropy_with_logits(
                logits, gold.view(-1, 1), mask)

        return loss_func

In [None]:
from allennlp.data.vocabulary import Vocabulary


DataSource = List[Dict[str, Tensor]]
# Keep this in sync with the DataSource alias declared next to Library.


class Data(object):
    """Holds the dataset splits and the vocabulary built from all of them."""

    def __init__(
        self,
        train_dataset: DataSource=None,
        valid_dataset: DataSource=None,
        test_dataset: DataSource=None,
    ) -> None:
        """Store the splits and index their instances into one vocabulary.

        :param train_dataset: training instances, or ``None``.
        :param valid_dataset: validation instances, or ``None``.
        :param test_dataset: test instances, or ``None``.
        """
        self.train_dataset = train_dataset
        self.valid_dataset = valid_dataset
        self.test_dataset = test_dataset

        # Pool the instances of every split that was actually provided.
        pooled = []
        for split in (train_dataset, valid_dataset, test_dataset):
            if split is not None:
                pooled.extend(split)

        self.vocab = Vocabulary.from_instances(pooled)

    def __getitem__(self, name: str) -> str:
        """Return the identifier a module uses in ``forward`` to fetch data.

        Currently this is simply the field name itself.
        """
        return name

## Baseline code

In [None]:
from emr.data import Conll04DatasetReader as Reader
from emr.model import get_trainer
#from emr.graph import graph

# types for typing style function
from typing import List, Dict
from allennlp.data.vocabulary import Vocabulary
from allennlp.data.instance import Instance
from allennlp.models.model import Model
from regr import Graph
from allennlp.training.trainer import Trainer

In [None]:
# --- data: corpus directory and the file names joined onto it ---
relative_path = "data/EntityMentionRelation"
train_path, valid_path = "conll04_train.corp", "conll04_test.corp"

# --- model: layer sizes ---
EMBEDDING_DIM, HIDDEN_DIM = 16, 8

# --- training: optimiser and schedule ---
LR, BATCH = 0.1, 128
EPOCH, PATIENCE = 10, 10  # original note next to EPOCH: 1000

In [None]:
from regr import Graph, Concept


# Reset before declaring, so that re-running this cell starts clean
# (presumably these clear class-level registries -- TODO confirm in regr).
Graph.clear()
Concept.clear()
# Declare a tiny graph with two concepts.
# NOTE: this rebinds the name `graph`, shadowing the `graph` imported from
# emr.graph in the retrieval demo near the top of the notebook.
with Graph('global') as graph:
    word = Concept(name='word')
    people = Concept(name='people')
    # Relate `people` to `word` (presumably an "is-a" relation -- confirm
    # against the regr documentation).
    people.be(word)

# Display the resulting graph as the cell output.
graph

In [None]:
from emr.graph import Concept


# Written by the ML programmer: wires graph nodes to data and ML modules.
def make_model(graph: Graph,
               data: Data,
               scaffold: Scaffold
              ) -> Model:
    """Bind data sources and modules to the graph and instantiate the model.

    The order of the ``assign`` calls matters: ``people['label']`` is
    assigned twice (gold label first, classifier second), and both the
    model's forward pass and the loss rely on that order.

    :param graph: graph exposing the ``word`` and ``people`` concepts.
    :param data: dataset wrapper providing field identifiers and the vocab.
    :param scaffold: scaffold that records the assignments.
    :return: a model instance built from the scaffold.
    """
    # Look the concepts up first, then drop any earlier bindings.
    word_concept, people_concept = graph.word, graph.people
    graph.release()
    library = Library()

    # Raw inputs: token indices and gold labels.
    index_source = library.data(data['sentence'])
    scaffold.assign(word_concept, 'index', *index_source)
    gold_source = library.data(data['label'])
    scaffold.assign(people_concept, 'label', *gold_source)

    # Learned stages: embedding followed by a 2-way classifier.
    embedding_stage = library.word2vec(
        word_concept['index'],
        data.vocab.get_vocab_size('tokens'),
        EMBEDDING_DIM,
        'tokens'
    )
    scaffold.assign(word_concept, 'w2v', *embedding_stage)
    classifier_stage = library.fc_sm(
        word_concept['w2v'],
        EMBEDDING_DIM,
        2
    )
    scaffold.assign(people_concept, 'label', *classifier_stage)
    # people['label'] now carries two assignments; the loss is derived from
    # the disagreement between them.

    # build() hands back a class, which is instantiated with the vocabulary.
    # Open design questions kept from the original: should this rather be
    # graph.build()? Links in the graph may provide non-parameterised
    # transformations -- is there a better semantic interface for that?
    return scaffold.build(graph)(data.vocab)

In [None]:
from allennlp.training.trainer import Trainer
import torch.optim as optim
from allennlp.data.iterators import BucketIterator

# While True, training stays on the CPU even when CUDA is available.
DEBUG_TRAINING = True


def get_trainer(
    graph: Graph,
    model: Model,
    data: Data,
    scaffold: Scaffold,
    lr=0.1, batch=128, epoch=1000, patience=10
) -> Trainer:
    """Install the loss on ``model`` and return a Trainer for it.

    :param graph: graph from which the scaffold derives the loss.
    :param model: model to train; receives a ``loss_func`` attribute.
    :param data: provides the train and validation datasets.
    :param scaffold: scaffold holding the assignments made for the model.
    :param lr: SGD learning rate.
    :param batch: batch size of the bucket iterator.
    :param epoch: number of training epochs.
    :param patience: passed to the Trainer as ``patience``.
    :return: the configured Trainer.
    """
    model.loss_func = scaffold.get_loss(graph)

    # Default to CPU; move to GPU 0 only outside of debug mode.
    device = -1
    if torch.cuda.is_available() and not DEBUG_TRAINING:
        device = 0
        model = model.cuda()

    sgd = optim.SGD(model.parameters(), lr=lr)

    # Batches are bucketed by the token count of the 'sentence' field.
    batcher = BucketIterator(
        batch_size=batch,
        sorting_keys=[('sentence', 'num_tokens')])
    batcher.index_with(model.vocab)

    return Trainer(model=model,
                   optimizer=sgd,
                   iterator=batcher,
                   train_dataset=data.train_dataset,
                   validation_dataset=data.valid_dataset,
                   patience=patience,
                   num_epochs=epoch,
                   cuda_device=device)


In [None]:
# environment setup

#import logging
# logging.basicConfig(level=logging.INFO)

def seed1(seed: int = 1) -> None:
    """Seed the Python, NumPy and PyTorch random number generators.

    :param seed: value passed to all three generators; defaults to 1,
        which is what the notebook has always used.
    """
    import random
    import numpy as np
    import torch

    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)

seed1()

In [None]:
import os

# data: read both splits and build the shared vocabulary
reader = NERPeopReader()
train_dataset = reader.read(os.path.join(relative_path, train_path))
valid_dataset = reader.read(os.path.join(relative_path, valid_path))
data = Data(train_dataset, valid_dataset)

scaffold = Scaffold()

# model from graph
model = make_model(graph, data, scaffold)

# trainer for model
# Pass the training settings from the configuration cell explicitly;
# otherwise get_trainer falls back to its own defaults (e.g. epoch=1000)
# and the values of LR / BATCH / EPOCH / PATIENCE are silently ignored.
trainer = get_trainer(graph, model, data, scaffold,
                      lr=LR, batch=BATCH, epoch=EPOCH, patience=PATIENCE)

# train the model
trainer.train()

# save the model weights and the vocabulary
with open("/tmp/model.th", 'wb') as fout:
    torch.save(model.state_dict(), fout)
data.vocab.save_to_files("/tmp/vocab")