# MeTaL CDR Relation Extraction

In [None]:
# Notebook setup: load IPython's autoreload extension, switch it to mode 2,
# and ask matplotlib to render figures inline in the notebook.
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
import os
import sys

import metal
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader


In [None]:
# Report the library and interpreter versions this notebook is running with.
version_report = [
    ('PyTorch: ', torch.__version__),
    ('MeTaL:   ', metal.__version__),
    ('Python:  ', sys.version),
    ('Python:  ', sys.version_info),
]
for label, value in version_report:
    print(label, value)


## Initialize CDR Dataset

In [None]:
from metal.contrib.backends.wrapper import SnorkelDataset

# Connection string handed to SnorkelDataset.splits (here a local "cdr.db").
db_conn_str   = "cdr.db"
# Candidate definition: a relation name followed by its two argument names.
candidate_def = ['ChemicalDisease', ['chemical', 'disease']]

# Build the train/dev/test datasets in one call.
# NOTE(review): max_seq_len presumably caps the token length of each
# example -- confirm against SnorkelDataset.
train, dev, test = SnorkelDataset.splits(db_conn_str, 
                                         candidate_def, 
                                         max_seq_len=125)

# Report the number of examples in each split.
print(f'[TRAIN] {len(train)}')
print(f'[DEV]   {len(dev)}')
print(f'[TEST]  {len(test)}')


## Train End Model (Randomly Initialized Embeddings)

In [None]:
from metal.end_model import EndModel
from metal.modules import LSTMModule
# Use the GPU only when PyTorch reports that CUDA is available.
use_cuda = torch.cuda.is_available()

# LSTM input module whose embedding table is sized from the training
# vocabulary (no pretrained vectors are passed here).
# freeze=False -- presumably leaves the embeddings trainable; TODO confirm.
lstm = LSTMModule(embed_size=50, 
                  hidden_size=100, 
                  vocab_size=train.word_dict.len(),
                  lstm_reduction='attention', 
                  dropout=0.5, 
                  num_layers=1, 
                  freeze=False)

# NOTE(review): the layer sizes [200, 2] presumably mean a 200-dim LSTM
# output (2 x hidden_size if the LSTM is bidirectional) feeding a 2-class
# output layer -- confirm against EndModel / LSTMModule.
end_model = EndModel([200, 2], input_module=lstm, seed=123, use_cuda=use_cuda)

# Override the default training configuration: learning rate, the metric
# used for validation, the batch size and the number of epochs.
end_model.config['train_config']['optimizer_config']['optimizer_common']['lr'] = 0.01
end_model.config['train_config']['validation_metric'] = 'f1'
end_model.config['train_config']['batch_size'] = 32
end_model.config['train_config']['n_epochs'] = 5


In [None]:
# Train on the training split, passing the dev split as dev_data.
end_model.train_model(train, dev_data=dev)


In [None]:
# Score the trained model on the test split with precision, recall and F1.
score = end_model.score(test, metric=['precision', 'recall', 'f1'])


## Train End Model (Pretrained Embeddings)

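Download the [GloVe embeddings](http://nlp.stanford.edu/data/glove.6B.zip) and unzip them into a `glove.6B/` directory next to this notebook; the cells below read `glove.6B/glove.6B.50d.txt`.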



In [None]:
import string

class EmbeddingLoader(object):
    """
    Simple text file embedding loader. Works with GloVe and FastText.

    Every data line is expected to look like ``<token> <v1> ... <vdim>``,
    separated by single spaces. When ``fmt`` is anything other than
    ``'text'`` the first line is treated as a header and skipped.

    :param fpath: path to the embedding text file (must exist)
    :param fmt: ``'text'`` for files without a header line; any other value
        makes the reader skip the first line
    :param dim: embedding dimension; when falsy it is inferred from the first
        line (a two-field first line is read as a ``<count> <dim>`` header)
    :param normalize: if True, scale every vector to unit L2 norm

    Attributes set by the constructor:
        vocab   -- dict mapping token -> row index into ``vectors``
        vectors -- 2-D numpy array with one row per loaded token
        errors  -- tokens of lines skipped because their vector length
                   did not match ``dim``

    :raises AssertionError: if ``fpath`` does not exist
    :raises ValueError: if no usable vectors are found in the file
    """
    def __init__(self, fpath, fmt='text', dim=None, normalize=True):
        assert os.path.exists(fpath), "embedding file not found: {}".format(fpath)
        self.fpath = fpath
        self.dim = dim
        self.fmt = fmt
        # Tokens of malformed lines, filled in by _read().
        self.errors = []
        # infer dimension
        if not self.dim:
            # Plain "r" mode: the old "rU" flag is rejected by Python 3.11+
            # and universal newlines are already the text-mode default.
            with open(self.fpath, "r", encoding="utf-8") as fp:
                header = fp.readline().strip().split(' ')
            self.dim = len(header) - 1 if len(header) != 2 else int(header[-1])

        entries = list(self._read())
        if not entries:
            raise ValueError(
                "no {}-dimensional vectors found in {}".format(self.dim, self.fpath)
            )
        words, vectors = zip(*entries)
        # If a token occurs more than once, the last occurrence wins.
        self.vocab = {w: i for i, w in enumerate(words)}
        self.vectors = np.vstack(vectors)
        if normalize:
            norms = np.linalg.norm(self.vectors, axis=1, ord=2, keepdims=True)
            # Leave all-zero rows as zeros instead of dividing by zero (NaN).
            norms[norms == 0] = 1.0
            self.vectors = self.vectors / norms

    def _read(self):
        """
        Yield ``(token, vector)`` pairs from the embedding file.

        Lines whose vector length differs from ``self.dim`` are skipped and
        their token is appended to ``self.errors``.
        """
        start = 0 if self.fmt == "text" else 1
        with open(self.fpath, "r", encoding="utf-8") as fp:
            for i, line in enumerate(fp):
                if i < start:
                    continue
                fields = line.rstrip().split(' ')
                vec = np.array([float(x) for x in fields[1:]])
                if len(vec) != self.dim:
                    self.errors.append(fields[0])
                    continue
                yield (fields[0], vec)
            

def load_embeddings(vocab, embeddings):
    """
    Load pretrained embeddings

    Builds a ``(vocab.len(), emb_dim)`` float tensor. Every row starts from a
    Xavier-normal initialisation, row 0 is zeroed, and each vocabulary word
    that can be matched in ``embeddings.vocab`` has its row overwritten with
    the pretrained vector.

    :param vocab: vocabulary object; this function uses ``vocab.len()``,
        iterates ``vocab.d`` for the words and calls ``vocab.lookup(w)`` for
        each word's row index
    :param embeddings: object exposing ``vocab`` (word -> row index) and
        ``vectors`` (2-D array of pretrained vectors), e.g. EmbeddingLoader
    :return: the embedding matrix as a torch tensor
    """
    def get_word_match(w, word_dict):
        # Try progressively looser forms of the word: exact, lower-cased,
        # punctuation-stripped, then both. Returns -1 when nothing matches.
        stripped = w.strip(string.punctuation)
        for form in (w, w.lower(), stripped, stripped.lower()):
            if form in word_dict:
                return word_dict[form]
        return -1

    num_words = vocab.len()
    emb_dim   = embeddings.vectors.shape[1]
    # Fully qualified: the bare name `init` is never imported in this file.
    vecs      = torch.nn.init.xavier_normal_(torch.empty(num_words, emb_dim))
    # NOTE(review): index 0 is presumably the padding token -- confirm.
    vecs[0]   = torch.zeros(emb_dim)

    n = 0
    for w in vocab.d:
        idx = get_word_match(w, embeddings.vocab)
        if idx == -1:
            continue
        i = vocab.lookup(w)
        # Explicit dtype so float64 numpy rows are converted to match vecs.
        vecs[i] = torch.as_tensor(embeddings.vectors[idx], dtype=vecs.dtype)
        n += 1

    print("Loaded {:2.1f}% ({}/{}) pretrained embeddings".format(float(n) / num_words * 100.0, n, num_words ))
    return vecs
            

In [None]:
# Load the GloVe vectors from the unzipped download; fmt='text' means the
# file is read without skipping a header line.
emb_path  = "glove.6B/glove.6B.50d.txt"
embs  = EmbeddingLoader(emb_path, fmt='text')

In [None]:
from metal.contrib.backends.wrapper import SnorkelDataset

# Same connection string and candidate definition as the earlier dataset cell.
db_conn_str   = "cdr.db"
candidate_def = ['ChemicalDisease', ['chemical', 'disease']]

# Rebuild the splits, this time passing the pretrained embedding vocabulary.
# NOTE(review): pretrained_word_dict presumably controls which tokens enter
# the dataset's word dictionary -- confirm against SnorkelDataset.
train, dev, test = SnorkelDataset.splits(db_conn_str, 
                                         candidate_def, 
                                         pretrained_word_dict=embs.vocab, 
                                         max_seq_len=125)

# Report the number of examples in each split.
print(f'[TRAIN] {len(train)}')
print(f'[DEV]   {len(dev)}')
print(f'[TEST]  {len(test)}')


### Initialize pretrained embedding matrix

In [None]:
# Build the embedding matrix for the training vocabulary from the GloVe vectors.
wembs = load_embeddings(train.word_dict, embs)

In [None]:
from metal.end_model import EndModel
from metal.modules import LSTMModule
# Use the GPU only when PyTorch reports that CUDA is available.
use_cuda = torch.cuda.is_available()

# Same LSTM configuration as the randomly initialised run, except that the
# pretrained matrix is passed via `embeddings` instead of a vocab_size.
# embed_size=50 matches the 50-dimensional GloVe file loaded above.
# freeze=False -- presumably lets the embeddings be fine-tuned; TODO confirm.
lstm = LSTMModule(embed_size=50, 
                  hidden_size=100, 
                  embeddings=wembs,
                  lstm_reduction='attention', 
                  dropout=0.5, 
                  num_layers=1, 
                  freeze=False)

# NOTE(review): the layer sizes [200, 2] presumably mean a 200-dim LSTM
# output (2 x hidden_size if the LSTM is bidirectional) feeding a 2-class
# output layer -- confirm against EndModel / LSTMModule.
end_model = EndModel([200, 2], input_module=lstm, seed=123, use_cuda=use_cuda)

# Override the default training configuration: learning rate, the metric
# used for validation, the batch size and the number of epochs.
end_model.config['train_config']['optimizer_config']['optimizer_common']['lr'] = 0.01
end_model.config['train_config']['validation_metric'] = 'f1'
end_model.config['train_config']['batch_size'] = 32
end_model.config['train_config']['n_epochs'] = 5


In [None]:
# Train on the training split, passing the dev split as dev_data.
end_model.train_model(train, dev_data=dev)


In [None]:
# Score the trained model on the test split with precision, recall and F1.
score = end_model.score(test, metric=['precision', 'recall', 'f1'])
