In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported automatically before each cell runs (handy while editing
# project code alongside the notebook).
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the cell output.
%matplotlib inline

from snorkel import SnorkelSession

# Single Snorkel session object shared by the rest of the notebook: later
# cells use it for queries and pass it to the model and export calls.
session = SnorkelSession()

In [2]:
from snorkel.models import Document, Sentence

# Sanity check: report how many rows of each corpus entity the session sees.
for caption, entity_cls in (("Documents:", Document), ("Sentences:", Sentence)):
    print(caption, session.query(entity_cls).count())

Documents: 97
Sentences: 1038


In [3]:
import os

# Sub-directory of results/ that receives the output files of this run.
result_path = 'results/tafamidis97/'

# Create the directory up front so the export at the end of the notebook
# cannot fail merely because it is missing (e.g. on a fresh checkout).
# exist_ok=True keeps this cell idempotent when it is re-run.
os.makedirs(result_path, exist_ok=True)

In [4]:
from snorkel.models import Document, Sentence

# Report the corpus size before pulling the sentences into memory.
document_total = session.query(Document).count()
print("Documents:", document_total)
sentence_total = session.query(Sentence).count()
print("Sentences:", sentence_total)

# Materialise every Sentence row; the list is reused by the candidate
# extraction and by the final export.
print("Loading all sentences from db...")
sentence_query = session.query(Sentence)
all_sents = sentence_query.all()
print("Loading complete!")

loaded_total = len(all_sents)
print('Amount of sentences: {}'.format(loaded_total))

Documents: 97
Sentences: 1038
Loading all sentences from db...
Loading complete!
Amount of sentences: 1038


In [5]:
from snorkel.models import Candidate, candidate_subclass
from snorkel.candidates import PretaggedCandidateExtractor

# Candidate type for the binary relation, with the two arguments named
# 'gene' and 'disease'.
GeneDiseaseInteraction = candidate_subclass('GeneDiseaseInteraction', ['gene', 'disease'])

# Extractor configured with the 'Gene' and 'Disease' entity types
# (presumably matched against tags already stored on the sentences -
# confirm against the preprocessing step).
candidate_extractor = PretaggedCandidateExtractor(GeneDiseaseInteraction, ['Gene', 'Disease'])

# There is a single sentence collection, so enumerate() assigns it split 0.
# clear=True is passed so candidates of that split are cleared before the run
# (the recorded output shows "Clearing existing...").
for k, sents in enumerate([all_sents]):
    candidate_extractor.apply(sents, split=k, clear=True)
    split_candidates = session.query(GeneDiseaseInteraction).filter(
        GeneDiseaseInteraction.split == k
    )
    print("Number of candidates:", split_candidates.count())

Clearing existing...


  0%|          | 0/1038 [00:00<?, ?it/s]

Running UDF...


100%|██████████| 1038/1038 [00:01<00:00, 735.96it/s]

Number of candidates: 586





In [6]:
from snorkel.learning.pytorch import LSTM

# Instantiate the LSTM model wrapper with n_threads=10 and restore the saved
# model named 'gene_disease_interaction.lstm'. No training happens in this
# notebook; the model is only loaded and applied.
# NOTE(review): where load() looks for the saved model is not visible here -
# confirm the checkpoint location before running on another machine.
lstm = LSTM(n_threads=10)
lstm.load('gene_disease_interaction.lstm')

[LSTM] Loaded model <gene_disease_interaction.lstm>


In [7]:
print("Loading all candidates from db...")
# Fetch the candidates of split 0 (the split assigned during extraction),
# ordered by id so the list order is stable between runs.
all_cands = (
    session.query(GeneDiseaseInteraction)
    .filter(GeneDiseaseInteraction.split == 0)
    .order_by(GeneDiseaseInteraction.id)
    .all()
)
print("{} candidates load from db!".format(len(all_cands)))

Loading all candidates from db...
586 candidates load from db!


In [8]:
print("Applying LSTM to candidates...")
# Apply the loaded LSTM to every candidate and store the resulting marginals
# through the session (the recorded output reports "Saved 586 marginals").
# %time is a line magic, so only this single call is timed.
%time lstm.save_marginals(session, all_cands)
print("LSTM applied!")

Applying LSTM to candidates...
Saved 586 marginals
CPU times: user 5.02 s, sys: 116 ms, total: 5.14 s
Wall time: 1.61 s
LSTM applied!




In [9]:
from ksnorkel import KSUtils

# Column names of the TSV export, in output order; joined with tabs they form
# the header row handed to the export helper.
header_columns = [
    'document_id',
    'sentence_id',
    'cand_id',
    'gene_cid',
    'gene_span',
    'disease_cid',
    'disease_span',
]
header_str = '\t'.join(header_columns)
%time KSUtils.save_binary_relation_as_tsv(result_path + "gene_disease_interaction.tsv", session, all_cands, all_sents, header_str, 'gene_cid', 'disease_cid')

Storing candidate labels into result file: results/tafamidis97/gene_disease_interaction.tsv
Amount of candidates: 586
Load mariginals from db...
Marginals loaded!
Building sentence to document map...
Map built!
Saved 530 positive predicitions for binary relation!
CPU times: user 3.67 s, sys: 20 ms, total: 3.69 s
Wall time: 2.13 s
