In [1]:
# Environment setup.
import os
# Restrict CUDA to the device with index 1. This is set in the first cell,
# before later cells import torch / allennlp.
os.environ['CUDA_VISIBLE_DEVICES'] = '1'
import site
# Add the directory two levels up as a site directory so its packages become
# importable (presumably where `emr` and `regr` live — TODO confirm).
site.addsitedir('../..')

## APIs to retrieve nodes in the graph

In [2]:
# Ways to retrieve nodes (concepts) from the graph.
# Every print below shows the same `phrase` concept.

from emr.graph import graph

# retrieve by attribute access (__getattr__)
print(graph.linguistic.phrase)
# retrieve by path-style key (__getitem__)
print(graph['linguistic/phrase'])
# retrieve through the `subs` / `concepts` properties
print(graph.subs['linguistic'].concepts['phrase'])
# TODO: consider supporting wildcards, following the conventions of the glob module.
# Example: graph['*/phrase']

# The access styles can also be mixed: go down into the sub-graph, back up
# through `.sup` (presumably the parent graph), and look the concept up again.
print(graph.subs['linguistic'].sup.linguistic.phrase)
print(graph.subs['linguistic'].sup['linguistic/phrase'])

Concept(name='phrase', what={'out_rels': {Concept(name='word'): {Have(name='(phrase)-have-(0:word)', what={'dst': {0: Concept(name='word')},
 'src': Concept(name='phrase')})}}})
Concept(name='phrase', what={'out_rels': {Concept(name='word'): {Have(name='(phrase)-have-(0:word)', what={'dst': {0: Concept(name='word')},
 'src': Concept(name='phrase')})}}})
Concept(name='phrase', what={'out_rels': {Concept(name='word'): {Have(name='(phrase)-have-(0:word)', what={'dst': {0: Concept(name='word')},
 'src': Concept(name='phrase')})}}})
Concept(name='phrase', what={'out_rels': {Concept(name='word'): {Have(name='(phrase)-have-(0:word)', what={'dst': {0: Concept(name='word')},
 'src': Concept(name='phrase')})}}})
Concept(name='phrase', what={'out_rels': {Concept(name='word'): {Have(name='(phrase)-have-(0:word)', what={'dst': {0: Concept(name='word')},
 'src': Concept(name='phrase')})}}})


## Experimental Area

## Developing Area

In [3]:
from emr.data import Conll04Reader
from typing import Iterable
from allennlp.data.dataset_readers import DatasetReader
from allennlp.data.tokenizers import Token
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data import Instance
from allennlp.data.fields import TextField
from allennlp.data.fields.label_field import LabelField


# Module-level reader instance; NERPeopReader._read calls it with a file path
# and unpacks the result as (sentences, relations).
conll04_reader = Conll04Reader()


class NERPeopReader(DatasetReader):
    """Dataset reader that yields one AllenNLP ``Instance`` per word.

    Each instance has a single-token ``'sentence'`` text field and, when a
    label is supplied, a ``'label'`` field.
    """

    def __init__(self) -> None:
        super().__init__(lazy=False)
        # The key 'tokens' names the token indexer; modules that embed the
        # text field (e.g. the word2vec builder) must refer to the same name.
        self.token_indexers = {'tokens': SingleIdTokenIndexer()}

    def word_to_instance(
        self,
        word: str,
        label: str = None
    ) -> Instance:
        """Wrap one word (and optionally its label) into an ``Instance``.

        Args:
            word: The surface form of the word.
            label: The tag of the word, or ``None`` to omit the label field.
                Known tag values: ['Other', 'Loc', 'Peop', 'Org', 'O'].

        Returns:
            An ``Instance`` with a 'sentence' field and, if ``label`` is
            given, a 'label' field.
        """
        text_field = TextField([Token(word), ], self.token_indexers)
        if label is None:
            return Instance({'sentence': text_field})
        return Instance({'sentence': text_field, "label": LabelField(label)})

    def _read(
        self,
        file_path: str
    ) -> Iterable[Instance]:
        """Yield one instance per (word, label) pair found in ``file_path``.

        The file is parsed by the module-level ``conll04_reader``; the POS
        tags and relations it returns are not used here.
        """
        sentences, relations = conll04_reader(file_path)
        for sentence_entry, _relation in zip(sentences, relations):
            tokens, _pos_tags, tags = sentence_entry
            for token, tag in zip(tokens, tags):
                yield self.word_to_instance(token, tag)


Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [4]:
from typing import List, Dict
from allennlp.models.model import Model
from allennlp.data.vocabulary import Vocabulary
from torch import Tensor
from typing import Dict


class BaseModel(Model):
    """Base model that threads a dict of named tensors through its hooks.

    Subclasses override ``forward``; ``_update_metrics`` and ``_update_loss``
    are helpers that read from / write to the same data dict.

    Attributes are set in ``__init__``:
      * ``field_name`` maps the roles 'output', 'label' and 'mask' to the
        keys used in the data dict.
      * ``metrics`` maps a metric name to a metric object.
      * ``loss_func`` is an optional callable invoked as ``loss_func(**data)``.
    """

    def __init__(self, vocab: Vocabulary) -> None:
        super().__init__(vocab)
        self.field_name = {'output': 'logits',
                           'label': 'label',
                           'mask': 'metric_mask'
                          }
        self.metrics = {}
        # Default to "no loss" so `_update_loss` is a safe no-op until a loss
        # callable is assigned; without this default the attribute lookup in
        # `_update_loss` raises AttributeError.
        self.loss_func = None

    def _update_metrics(self, **data: Tensor) -> Dict[str, Tensor]:
        """Feed every metric and store its current value in ``data``.

        Each metric is called with the output, label and mask entries of
        ``data`` (keys taken from ``self.field_name``), then its value is
        written to ``data[metric_name]`` without resetting the metric.

        Returns:
            The same ``data`` dict, updated with one entry per metric.
        """
        for metric_name, metric in self.metrics.items():
            metric(data[self.field_name['output']],
                   data[self.field_name['label']],
                   data[self.field_name['mask']])
            data[metric_name] = metric.get_metric(False) # no reset
        return data

    def get_metrics(self, reset: bool=False) -> Dict[str, float]:
        """Return the current value of every metric, keyed by metric name.

        Args:
            reset: Passed through to each metric's ``get_metric``.
        """
        return {
            metric_name: metric.get_metric(reset)
            for metric_name, metric in self.metrics.items()
        }

    def _update_loss(self, **data: Tensor) -> Dict[str, Tensor]:
        """Store ``loss_func(**data)`` under ``data['loss']`` if a loss is set.

        Returns:
            The same ``data`` dict; unchanged when ``loss_func`` is ``None``.
        """
        if self.loss_func is not None:
            data['loss'] = self.loss_func(**data)
        return data

    def forward(
        self,
        **data: Tensor
    ) -> Dict[str, Tensor]:
        """Identity stub: return the input data unchanged.

        Subclasses replace this with the computation that maps the inputs
        to the outputs.
        """
        return data


In [5]:
from typing import Any, Union, List, Tuple, Dict, Callable
from allennlp.models.model import Model
import torch
from torch import Tensor
from torch.nn import Module
from allennlp.modules.token_embedders import Embedding
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from regr import Graph


# One data instance: field name -> tensor.
DataInstance = Dict[str, Tensor]
# A dataset: list of data instances.
DataSource = List[DataInstance]
# NB: a list of instances (dict of str to the data format used in a module).
#     For AllenNLP, an Instance is a dict of str:Field,
#     and the real tensors only show up inside the forward function.
# A model-level function: takes a data instance and returns a data instance.
ModelFunc = Callable[[DataInstance], DataInstance]
# A module-level function: takes a data instance and returns a tensor.
ModuleFunc = Callable[[DataInstance], Tensor]
# NB: models are transformations from a dict of str:Tensor to an updated dict;
#     model objects in AllenNLP have a forward function of this form.
# NB 2: torch.nn.Module = Callable[[Any], Tensor]
#     We should use them this way: construct them in make_model
#     (precisely, in Library), wrap them in callback functions, and call them
#     when the real data arrives and the callback functions are invoked.


class Library(object):
    """Factory of lazily evaluated property functions, assembled into a model.

    Every factory method (``data``, ``word2vec``, ``fc_sm``) returns a closure
    that maps a data instance (dict of field name to tensor) to a tensor, and
    appends that closure to ``self.modules`` in creation order. A closure
    evaluates its own input closure first, so calling the last one in a chain
    runs the whole chain.

    Closures that wrap a torch module expose it as ``func.module`` so that
    ``build`` can register it on the model; otherwise its parameters would be
    invisible to the optimizer and to ``state_dict``.
    """

    def __init__(self, vocab=None) -> None:
        self.modules = []
        self.loss = None
        self.vocab = vocab

    def data(
        self,
        data_name: str
    ) -> ModuleFunc:
        """Return a function that looks up ``data_name`` in a data instance."""
        def func(data: DataInstance) -> Tensor:
            # Pure lookup; nothing is written back into `data`.
            return data[data_name]

        self.modules.append(func)
        return func

    def word2vec(
        self,
        input_prop: ModuleFunc,
        token_name: str,
        embedding_dim: int,
    ) -> ModuleFunc:
        """Return a function that embeds the text field produced by ``input_prop``.

        Args:
            input_prop: Function yielding the indexed text field.
            token_name: Name of the token indexer used by the data reader
                (e.g. 'tokens'); it is used both as the vocabulary namespace
                and as the embedder key. Not to be confused with the name of
                the text field itself (e.g. 'sentence').
            embedding_dim: Size of each embedding vector.
        """
        token_embedding = Embedding(
            num_embeddings=self.vocab.get_vocab_size(token_name),
            embedding_dim=embedding_dim)
        word_embeddings = BasicTextFieldEmbedder({
            token_name: token_embedding})

        def func(data: DataInstance) -> Tensor:
            return word_embeddings(input_prop(data))

        # Expose the parameterized module so build() can register it.
        func.module = word_embeddings
        self.modules.append(func)
        return func

    def fc_sm(
        self,
        input_prop: ModuleFunc,
        input_dim: int,
        label_dim: int,
    ) -> ModuleFunc:
        """Return a function applying a linear layer to the output of ``input_prop``.

        Only the linear projection is applied here; no softmax is computed.

        Args:
            input_prop: Function yielding the input tensor.
            input_dim: Size of the last dimension of the input.
            label_dim: Size of the last dimension of the output.
        """
        fc = torch.nn.Linear(
            in_features=input_dim,
            out_features=label_dim)

        def func(data: DataInstance) -> Tensor:
            return fc(input_prop(data))

        # Expose the parameterized module so build() can register it.
        func.module = fc
        self.modules.append(func)
        return func

    def build(
        self,
        graph: Graph
    ) -> Model:
        """Assemble the functions created so far into a model instance.

        Args:
            graph: The graph the properties were bound to; currently only
                used to print its multiple assignments.

        Returns:
            A ``BaseModel`` instance whose ``forward`` evaluates the most
            recently created function and stores the result under the
            model's output field name.
        """
        print(list(graph.get_multiassign()))

        # Snapshot the functions so the model matches what is registered below.
        pipeline = list(self.modules)
        trainable = [func.module for func in pipeline if hasattr(func, 'module')]

        class NewModel(BaseModel):
            def __init__(self_, vocab) -> None:
                super().__init__(vocab)
                # Register wrapped torch modules so their parameters are
                # trained and saved with the model.
                self_.submodules = torch.nn.ModuleList(trainable)

            def forward(
                self_,
                **data: Any
            ) -> Dict[str, Tensor]:
                # just prototype
                # TODO: how to retrieve the sequence properly?
                # (e.g. a topological sort over the module graph)
                # Each function takes the data dict only and evaluates its own
                # inputs, so calling the last one runs the whole chain.
                if pipeline:
                    data[self_.field_name['output']] = pipeline[-1](data)
                return data

        return NewModel(self.vocab)


NameError: name 'Graph' is not defined

In [None]:
from allennlp.data.vocabulary import Vocabulary


# A dataset: list of instances, each mapping a field name to a tensor.
# Must be kept consistent with the DataSource alias defined next to Library.
DataSource = List[Dict[str, Tensor]]


class Data(object):
    """Holds the train / validation / test datasets and their shared vocabulary."""

    def __init__(
        self,
        train_dataset: DataSource=None,
        valid_dataset: DataSource=None,
        test_dataset: DataSource=None,
    ) -> None:
        """Store the datasets and build one vocabulary over all of them.

        Args:
            train_dataset: Training instances, or ``None``.
            valid_dataset: Validation instances, or ``None``.
            test_dataset: Test instances, or ``None``.
        """
        self.train_dataset = train_dataset
        self.valid_dataset = valid_dataset
        self.test_dataset = test_dataset

        # Pool the instances of every dataset that was provided.
        pooled = []
        for split in (train_dataset, valid_dataset, test_dataset):
            if split is not None:
                pooled += split

        self.vocab = Vocabulary.from_instances(pooled)

    def __getitem__(self, name: str) -> str:
        """Return the identifier a module uses in forward to fetch this field.

        The identifier is simply the field name itself.
        """
        return name

## Baseline code

In [None]:
from emr.data import Conll04DatasetReader as Reader
from emr.model import get_trainer
#from emr.graph import graph

# types for typing style function
from typing import List, Dict
from allennlp.data.vocabulary import Vocabulary
from allennlp.data.instance import Instance
from allennlp.models.model import Model
from regr import Graph
from allennlp.training.trainer import Trainer

In [None]:
# data setting
# The train / validation file names below are joined onto relative_path
# when the datasets are read.
relative_path = "data/EntityMentionRelation"
train_path = "conll04_train.corp"
valid_path = "conll04_test.corp"

# model setting
# EMBEDDING_DIM is used by make_model for the embedding size and the input
# size of the linear layer.
EMBEDDING_DIM = 16
# NOTE(review): HIDDEN_DIM is not referenced anywhere in the visible cells.
HIDDEN_DIM = 8

# training setting
# NOTE(review): none of these are passed explicitly in the visible cells;
# presumably they are consumed by get_trainer — TODO confirm.
LR = 0.1
BATCH = 16
EPOCH = 10 # 1000 (presumably the value for a full run — TODO confirm)
PATIENCE = 10

In [None]:
from regr import Graph, Concept


# Reset Graph and Concept before (re)defining the graph, presumably so that
# re-running this cell does not collide with earlier definitions — TODO confirm.
Graph.clear()
Concept.clear()
# Concepts created inside the `with` block belong to the 'global' graph
# (make_model later reads them back as graph.word and graph.people).
with Graph('global') as graph:
    word = Concept(name='word')
    people = Concept(name='people')
    # presumably declares an "is-a" relation: people is a word — TODO confirm
    people.be(word)

# Display the graph as the cell output.
graph

In [None]:
from emr.graph import Concept

from allennlp.data.vocabulary import Vocabulary

# Developed by an ML programmer to wire the nodes in the graph to ML models.
def make_model(graph: Graph,
               data: Data,
               lib: Library,
              ) -> Model:
    """Bind data and model components to the graph's concepts and build a model.

    Args:
        graph: Graph containing the `word` and `people` concepts.
        data: Data holder; indexing it yields the identifier of a data field.
        lib: Library used to create the property functions and the model.

    Returns:
        The model produced by ``lib.build(graph)``, with its ``loss``
        attribute set to the result of ``graph()``.
    """
    # get concepts from graph
    word = graph.word
    people = graph.people

    # binding
    graph.release() # release anything bound before the new assignment
    # filling in data and label
    word['index'] = lib.data(data['sentence']) # (batch, len)
    people['label'] = lib.data(data['label']) # (batch, len,)

    # building model
    # The token name must match the indexer name used by the data reader
    # ('tokens'); it selects both the vocabulary namespace and the embedder key.
    word['w2v'] = lib.word2vec(word['index'], 'tokens', EMBEDDING_DIM) # (batch, len, EMBEDDING_DIM)
    people['label'] = lib.fc_sm(word['w2v'], EMBEDDING_DIM, 1) # (batch, len, 1)
    # now people['label'] has multiple assignments,
    # and the loss should come from the inconsistency between them

    # get the model
    model = lib.build(graph) # or should it be model = graph.build()
    # NB: Links in the graph may be used to provide non-parameterized
    #     transformations, which is a core feature of our graph.
    #     Is there a better semantic interface design?

    # get the loss
    # NOTE(review): BaseModel._update_loss reads `loss_func`, not `loss` —
    # confirm which attribute is intended and what `graph()` returns.
    model.loss = graph()

    return model

In [None]:
# environment setup

#import logging
# logging.basicConfig(level=logging.INFO)

def seed1(seed: int = 1) -> None:
    """Seed the NumPy, Python and PyTorch random number generators.

    Args:
        seed: Seed value applied to all three generators. Defaults to 1.
    """
    import random
    import numpy as np
    import torch

    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)

seed1()

In [None]:
import os

# data
# Read the train and validation files and build a shared vocabulary over both.
reader = NERPeopReader()
train_dataset = reader.read(os.path.join(relative_path, train_path))
valid_dataset = reader.read(os.path.join(relative_path, valid_path))
data = Data(train_dataset, valid_dataset)

# model from graph
# `graph` is the variable defined in the graph-definition cell above.
lib = Library(data.vocab)
model = make_model(graph, data, lib)

# trainer for model
trainer = get_trainer(model, data) # TODO: update the interface with new Data class

# train the model
trainer.train()

# save the model
# NOTE(review): `torch` is not imported in this cell; it relies on the
# `import torch` executed in an earlier cell.
# NOTE(review): output locations are hard-coded under /tmp.
with open("/tmp/model.th", 'wb') as fout:
    torch.save(model.state_dict(), fout)
data.vocab.save_to_files("/tmp/vocab")