In [1]:
import os
# Start every run from an empty database by deleting the leftover SQLite file.
# NOTE(review): presumably "snorkel.db" is the file SnorkelSession opens in the
# next cell — confirm if the database location is configured elsewhere.
print('remove snorkel db...')
try:
    os.remove("snorkel.db")
except FileNotFoundError:
    # A fresh checkout has no database yet; that is fine and must not abort
    # a "Restart & Run All".
    print('snorkel db not found, nothing to remove')
else:
    print('snorkel db removed')

remove snorkel db...
snorkel db removed


In [2]:
# Load IPython's autoreload extension and use mode 2 so that imported modules
# are reloaded before each cell executes (edits to helper modules are picked
# up without restarting the kernel).
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

from snorkel import SnorkelSession

# Session object through which the following cells query and store documents,
# sentences, candidates and marginals.
session = SnorkelSession()

In [3]:
from snorkel import SnorkelSession
from snorkel.parser import CorpusParser, Spacy, StanfordCoreNLPServer
from pubtator import PubTatorDocPreprocessor, PubTatorTagProcessor, PubTatorParser
from time import time

# Parse the PubTator corpus file(s) and store the result through CorpusParser.
num_procs = 1  # passed to CorpusParser.apply as `parallelism`

filelist = ['data/Tafamidis97_filtered.pubtator']

start_ts = time()

for fp in filelist:
    doc_preprocessor = PubTatorDocPreprocessor(fp, annotations=False)
    # NOTE(review): stop_on_err=False presumably lets parsing continue past a
    # document that fails — confirm against the pubtator module.
    parser = PubTatorParser(stop_on_err=False)
    corpus_parser = CorpusParser(parser=parser)
    # clear=False so that processing one file does not request clearing of
    # what was stored for earlier files in `filelist`.
    corpus_parser.apply(doc_preprocessor, parallelism=num_procs, clear=False)

# Take the end timestamp once, after all files are processed, and report the
# elapsed wall-clock time in seconds.
end_ts = time()
print("\nDONE in {}".format(end_ts - start_ts))

Running UDF...

DONE in 2.7572879791259766


In [4]:
from snorkel.models import Document, Sentence

# Sanity check after parsing: how many Document and Sentence rows the session
# can see.
n_documents = session.query(Document).count()
n_sentences = session.query(Sentence).count()
print("Documents:", n_documents)
print("Sentences:", n_sentences)

Documents: 97
Sentences: 1038


In [5]:
import os

# Sub-directory (below results/) that receives every output file of this run.
result_path = 'results/tafamidis97/'

# os.makedirs also creates the missing parent 'results/' directory (os.mkdir
# cannot), and exist_ok=True makes re-running the cell harmless. Any other
# OSError (e.g. missing permissions) now propagates here instead of being
# reported as "may exist" and failing later when a file is opened.
already_present = os.path.isdir(result_path)
os.makedirs(result_path, exist_ok=True)
if already_present:
    print ("Directory {} already exists".format(result_path))
else:
    print ("Successfully created the directory %s " % result_path)


Successfully created the directory results/tafamidis97/ 


In [6]:
from snorkel.models import Document

# Export a two-column TSV with one row per document: the document's id and
# its name (written under the 'pmid' header).
all_docs = session.query(Document).all()
print('Amount of docs: {}'.format(len(all_docs)))

mapping_rows = [('snorkel_id', 'pmid')]
mapping_rows.extend((doc.id, doc.name) for doc in all_docs)

with open(result_path + 'doc_mapping.tsv', 'w') as f:
    f.writelines('{}\t{}\n'.format(*row) for row in mapping_rows)

print('Finished')

Amount of docs: 97
Finished


In [7]:
from snorkel.models import Document, Sentence

# Report the corpus size again, then fetch every Sentence row into the list
# `all_sents` for use by the following cells.
n_documents = session.query(Document).count()
n_sentences = session.query(Sentence).count()
print("Documents:", n_documents)
print("Sentences:", n_sentences)

print("Loading all sentences from db...")
sentence_query = session.query(Sentence)
all_sents = sentence_query.all()
print("Loading complete!")

print('Amount of sentences: {}'.format(len(all_sents)))

Documents: 97
Sentences: 1038
Loading all sentences from db...
Loading complete!
Amount of sentences: 1038


In [8]:
from snorkel.models import Candidate, candidate_subclass
from snorkel.candidates import PretaggedCandidateExtractor

# Candidate class with the two arguments 'chemical' and 'disease', plus an
# extractor built for the entity types 'Chemical' and 'Disease'.
ChemicalDisease = candidate_subclass('ChemicalDisease', ['chemical', 'disease'])
candidate_extractor = PretaggedCandidateExtractor(
    ChemicalDisease,
    ['Chemical', 'Disease'],
)

# Only one split exists here: all sentences go into split 0.
sentence_splits = [all_sents]
for k, sents in enumerate(sentence_splits):
    candidate_extractor.apply(sents, split=k, clear=True)
    split_query = session.query(ChemicalDisease).filter(ChemicalDisease.split == k)
    print("Number of candidates:", split_query.count())

  5%|▌         | 52/1038 [00:00<00:01, 515.64it/s]

Clearing existing...
Running UDF...


100%|██████████| 1038/1038 [00:01<00:00, 896.44it/s]

Number of candidates: 124





In [9]:
from snorkel.learning.pytorch import LSTM

# Create the LSTM model object and load the previously saved model named
# 'chemical_disease.lstm'; no training happens in this notebook.
# NOTE(review): n_threads=10 is hard-coded — confirm it suits the target machine.
lstm = LSTM(n_threads=10)
lstm.load('chemical_disease.lstm')

[LSTM] Loaded model <chemical_disease.lstm>


In [10]:
# Fetch the ChemicalDisease candidates of split 0, ordered by id, into
# `all_cands`.
print("Loading all candidates from db...")
all_cands = (
    session.query(ChemicalDisease)
    .filter(ChemicalDisease.split == 0)
    .order_by(ChemicalDisease.id)
    .all()
)
print("{} candidates load from db!".format(len(all_cands)))

Loading all candidates from db...
124 candidates load from db!


In [11]:
# Apply the loaded LSTM to every candidate in `all_cands` and save the
# resulting marginals through the session; %time reports the duration of
# that single call.
print("Applying LSTM to candidates...")
%time lstm.save_marginals(session, all_cands)
print("LSTM applied!")

Applying LSTM to candidates...
Saved 124 marginals
CPU times: user 2.11 s, sys: 100 ms, total: 2.21 s
Wall time: 474 ms
LSTM applied!




In [12]:
from ksnorkel import KSUtils

# Tab-separated header line for the relation TSV written below.
header_str = '\t'.join((
    'document_id',
    'sentence_id',
    'cand_id',
    'chemical_cid',
    'chemical_span',
    'disease_cid',
    'disease_span',
))
# Write the chemical-disease relation file into result_path, timing the call.
# NOTE(review): the two trailing strings presumably name the entity-id
# attributes/columns for the two relation arguments — confirm against KSUtils.
%time KSUtils.save_binary_relation_as_tsv(result_path + "chemical_disease_association.tsv", session, all_cands, all_sents, header_str, 'chemical_cid', 'disease_cid')

Storing candidate labels into result file: results/tafamidis97/chemical_disease_association.tsv
Amount of candidates: 124
Load mariginals from db...
Marginals loaded!
Building sentence to document map...
Map built!
Saved 24 positive predicitions for binary relation!
CPU times: user 3.17 s, sys: 24 ms, total: 3.19 s
Wall time: 1.35 s
