In [None]:
import os

# Delete the `snorkel.db` file, if present, so the run starts without it.
# On a fresh checkout there is nothing to delete; that must not abort the
# notebook, so a missing file is reported instead of raising.
print('remove snorkel db...')
try:
    os.remove("snorkel.db")
except FileNotFoundError:
    print('snorkel db not found, nothing to remove')
else:
    print('snorkel db removed')

In [1]:
# IPython setup: enable autoreload (mode 2) and inline matplotlib rendering.
%load_ext autoreload
%autoreload 2
%matplotlib inline

from snorkel import SnorkelSession

# Session object that the later cells use for their database queries.
session = SnorkelSession()

In [2]:
from snorkel import SnorkelSession
from snorkel.parser import CorpusParser, Spacy, StanfordCoreNLPServer
from pubtator import PubTatorDocPreprocessor, PubTatorTagProcessor, PubTatorParser
from time import time

# Number of worker processes handed to the corpus parser.
num_procs = 1

# PubTator input files to parse.
filelist = ['data/SimCyp298_filtered.pubtator']

start_ts = time()

for file_idx, fp in enumerate(filelist):
    doc_preprocessor = PubTatorDocPreprocessor(fp, annotations=False)
    parser = PubTatorParser(stop_on_err=False)
    corpus_parser = CorpusParser(parser=parser)
    # clear=True asks the runner to clear existing parsed data first (see the
    # "Clearing existing..." output). Doing that for every file would leave
    # only the last file's documents, so clear only before the first one.
    corpus_parser.apply(doc_preprocessor, parallelism=num_procs, clear=(file_idx == 0))

end_ts = time()
print("\nDONE in {}".format(end_ts - start_ts))

Clearing existing...
Running UDF...

DONE in 9.000768899917603


In [3]:
from snorkel.models import Document, Sentence

# Report how many Document and Sentence rows the session currently sees.
for label, model in (("Documents:", Document), ("Sentences:", Sentence)):
    print(label, session.query(model).count())

Documents: 298
Sentences: 3452


In [4]:
import os

# Sub-directory (under results/) that the later cells write their result files to.
result_path = 'results/simcyp298/'

# makedirs also creates the missing parent 'results/' directory, and
# exist_ok=True makes a re-run harmless. Any real failure (e.g. permissions)
# now raises here instead of being reported as "may exist" and only surfacing
# when the first result file is opened.
already_present = os.path.isdir(result_path)
os.makedirs(result_path, exist_ok=True)
if already_present:
    print("Directory {} already exists".format(result_path))
else:
    print("Successfully created the directory %s " % result_path)


Creation of the directory results/simcyp298/ failed because it may exists


In [5]:
from snorkel.models import Document

# Export the mapping from Snorkel's internal document id to the document
# name (written under the 'pmid' column header) as a two-column TSV.
all_docs = session.query(Document).all()
print('Amount of docs: {}'.format(len(all_docs)))

mapping_file = result_path + 'doc_mapping.tsv'
row_template = '{}\t{}\n'
with open(mapping_file, 'w') as f:
    # Header row first, then one row per document.
    f.write(row_template.format('snorkel_id', 'pmid'))
    f.writelines(row_template.format(doc.id, doc.name) for doc in all_docs)
print('Finished')

Amount of docs: 298
Finished


In [6]:
from snorkel.models import Document, Sentence

# Row counts first, as a quick check before the bulk load.
sentence_query = session.query(Sentence)
print("Documents:", session.query(Document).count())
print("Sentences:", sentence_query.count())

# Pull every Sentence into memory; the later cells pass `all_sents` to the
# candidate extractors and to the TSV export.
print("Loading all sentences from db...")
all_sents = sentence_query.all()
print("Loading complete!")

print('Amount of sentences: {}'.format(len(all_sents)))

Documents: 298
Sentences: 3452
Loading all sentences from db...
Loading complete!
Amount of sentences: 3452


In [7]:
from snorkel.models import Candidate, candidate_subclass
from snorkel.candidates import PretaggedCandidateExtractor

ChemicalDisease = candidate_subclass('ChemicalDisease', ['chemical', 'disease'])
candidate_extractor = PretaggedCandidateExtractor(ChemicalDisease, ['Chemical', 'Disease'])

for k, sents in enumerate([all_sents]):
    candidate_extractor.apply(sents, split=k, clear=True)
    print("Number of candidates:", session.query(ChemicalDisease).filter(ChemicalDisease.split == k).count())
    

from snorkel.learning.pytorch import LSTM

lstm = LSTM(n_threads=10)
lstm.load('chemical_disease.lstm')

print("Loading all candidates from db...")
all_cands = session.query(ChemicalDisease).filter(ChemicalDisease.split == 0).order_by(ChemicalDisease.id).all()
print("{} candidates load from db!".format(len(all_cands)))


print("Applying LSTM to candidates...")
%time lstm.save_marginals(session, all_cands)
print("LSTM applied!")

from ksnorkel import KSUtils

header_str = '{}\t{}\t{}\t{}\t{}\t{}\t{}'.format('document_id', 'sentence_id', 'cand_id', 'chemical_cid', 'chemical_span', 'disease_cid', 'disease_span')
%time KSUtils.save_binary_relation_as_tsv(result_path + "chemical_disease_association.tsv", session, all_cands, all_sents, header_str, 'chemical_cid', 'disease_cid')

  2%|▏         | 60/3452 [00:00<00:05, 596.65it/s]

Clearing existing...
Running UDF...


100%|██████████| 3452/3452 [00:05<00:00, 641.26it/s]


Number of candidates: 1286
[LSTM] Loaded model <chemical_disease.lstm>
Loading all candidates from db...
1286 candidates load from db!
Applying LSTM to candidates...




Saved 1286 marginals
CPU times: user 11.6 s, sys: 440 ms, total: 12 s
Wall time: 3.04 s
LSTM applied!
Storing candidate labels into result file: results/simcyp298/chemical_disease_association.tsv
Amount of candidates: 1286
Load mariginals from db...
Marginals loaded!
Building sentence to document map...
Map built!
Saved 639 positive predicitions for binary relation!
CPU times: user 6.27 s, sys: 104 ms, total: 6.37 s
Wall time: 5.13 s


In [8]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

from snorkel import SnorkelSession

session = SnorkelSession()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [9]:
from snorkel.models import Candidate, candidate_subclass
from snorkel.candidates import PretaggedCandidateExtractor

ChemicalGeneInteraction = candidate_subclass('ChemicalGeneInteraction', ['chemical', 'gene'])
candidate_extractor = PretaggedCandidateExtractor(ChemicalGeneInteraction, ['Chemical', 'Gene'])

for k, sents in enumerate([all_sents]):
    candidate_extractor.apply(sents, split=k, clear=True)
    print("Number of candidates:", session.query(ChemicalGeneInteraction).filter(ChemicalGeneInteraction.split == k).count())
    
from snorkel.learning.pytorch import LSTM

lstm = LSTM(n_threads=10)
lstm.load('chemical_gene_interaction.lstm')


print("Loading all candidates from db...")
all_cands = session.query(ChemicalGeneInteraction).filter(ChemicalGeneInteraction.split == 0).order_by(ChemicalGeneInteraction.id).all()
print("{} candidates load from db!".format(len(all_cands)))

print("Applying LSTM to candidates...")
%time lstm.save_marginals(session, all_cands)
print("LSTM applied!")

from ksnorkel import KSUtils

header_str = '{}\t{}\t{}\t{}\t{}\t{}\t{}'.format('document_id', 'sentence_id', 'cand_id','chemical_cid', 'chemical_span', 'gene_cid', 'gene_span')
%time KSUtils.save_binary_relation_as_tsv(result_path + "chemical_gene_interaction.tsv", session, all_cands, all_sents, header_str, 'chemical_cid', 'gene_cid')

  2%|▏         | 63/3452 [00:00<00:05, 615.66it/s]

Clearing existing...
Running UDF...


100%|██████████| 3452/3452 [00:05<00:00, 602.19it/s]


Number of candidates: 3148
[LSTM] Loaded model <chemical_gene_interaction.lstm>
Loading all candidates from db...
3148 candidates load from db!
Applying LSTM to candidates...
Saved 3148 marginals
CPU times: user 18.1 s, sys: 992 ms, total: 19.1 s
Wall time: 7.11 s
LSTM applied!
Storing candidate labels into result file: results/simcyp298/chemical_gene_interaction.tsv
Amount of candidates: 3148
Load mariginals from db...
Marginals loaded!
Building sentence to document map...
Map built!
Saved 947 positive predicitions for binary relation!
CPU times: user 3.97 s, sys: 88 ms, total: 4.06 s
Wall time: 3.5 s


In [10]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

from snorkel import SnorkelSession

session = SnorkelSession()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [11]:
from snorkel.models import Candidate, candidate_subclass
from snorkel.candidates import PretaggedCandidateExtractor

GeneDiseaseInteraction = candidate_subclass('GeneDiseaseInteraction', ['gene', 'disease'])
candidate_extractor = PretaggedCandidateExtractor(GeneDiseaseInteraction, ['Gene', 'Disease'])

for k, sents in enumerate([all_sents]):
    candidate_extractor.apply(sents, split=k, clear=True)
    print("Number of candidates:", session.query(GeneDiseaseInteraction).filter(GeneDiseaseInteraction.split == k).count())
    
from snorkel.learning.pytorch import LSTM

lstm = LSTM(n_threads=10)
lstm.load('gene_disease_interaction.lstm')

print("Loading all candidates from db...")
all_cands = session.query(GeneDiseaseInteraction).filter(GeneDiseaseInteraction.split == 0).order_by(GeneDiseaseInteraction.id).all()
print("{} candidates load from db!".format(len(all_cands)))

print("Applying LSTM to candidates...")
%time lstm.save_marginals(session, all_cands)
print("LSTM applied!")

from ksnorkel import KSUtils

header_str = '{}\t{}\t{}\t{}\t{}\t{}\t{}'.format('document_id', 'sentence_id', 'cand_id','gene_cid', 'gene_span', 'disease_cid', 'disease_span')
%time KSUtils.save_binary_relation_as_tsv(result_path + "gene_disease_interaction.tsv", session, all_cands, all_sents, header_str, 'gene_cid', 'disease_cid')

  0%|          | 0/3452 [00:00<?, ?it/s]

Clearing existing...
Running UDF...


100%|██████████| 3452/3452 [00:01<00:00, 2366.32it/s]


Number of candidates: 327
[LSTM] Loaded model <gene_disease_interaction.lstm>
Loading all candidates from db...
327 candidates load from db!
Applying LSTM to candidates...
Saved 327 marginals
CPU times: user 2.59 s, sys: 0 ns, total: 2.59 s
Wall time: 968 ms
LSTM applied!
Storing candidate labels into result file: results/simcyp298/gene_disease_interaction.tsv
Amount of candidates: 327
Load mariginals from db...
Marginals loaded!
Building sentence to document map...
Map built!
Saved 296 positive predicitions for binary relation!
CPU times: user 3.3 s, sys: 16 ms, total: 3.32 s
Wall time: 719 ms


In [17]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

from snorkel import SnorkelSession

session = SnorkelSession()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [18]:
from snorkel.models import Candidate, candidate_subclass
from snorkel.candidates import PretaggedCandidateExtractor

from snorkel.models import Candidate, candidate_subclass
from snorkel.candidates import PretaggedCandidateExtractor
from snorkel.models import Candidate, candidate_subclass
from snorkel.candidates import PretaggedCandidateExtractor


GeneChemicalMetabolism = candidate_subclass('GeneChemicalMetabolism', ['gene', 'chemical'])
candidate_gene_chemical_metabolism_extractor = PretaggedCandidateExtractor(GeneChemicalMetabolism, ['Gene', 'Chemical'])

for k, sents in enumerate([all_sents]):
    candidate_gene_chemical_metabolism_extractor.apply(sents, split=k, clear=True)
    print("Number of candidates:", session.query(GeneChemicalMetabolism).filter(GeneChemicalMetabolism.split == k).count())
    
    
from snorkel.learning.pytorch import LSTM

lstm = LSTM(n_threads=10)
lstm.load('gene_chemical_metabolism.lstm')

print("Loading all candidates from db...")
all_cands = session.query(GeneChemicalMetabolism).filter(GeneChemicalMetabolism.split == 0).order_by(GeneChemicalMetabolism.id).all()
print("{} candidates load from db!".format(len(all_cands)))

print("Applying LSTM to candidates...")
%time lstm.save_marginals(session, all_cands)
print("LSTM applied!")

from ksnorkel import KSUtils

header_str = '{}\t{}\t{}\t{}\t{}\t{}\t{}'.format('document_id', 'sentence_id', 'cand_id','gene_cid', 'gene_span', 'chemical_cid', 'chemical_span')
%time KSUtils.save_binary_relation_as_tsv(result_path + "gene_chemical_metabolism.tsv", session, all_cands, all_sents, header_str, 'gene_cid', 'chemical_cid')

  2%|▏         | 68/3452 [00:00<00:04, 679.88it/s]

Clearing existing...
Running UDF...


100%|██████████| 3452/3452 [00:05<00:00, 640.92it/s]


Number of candidates: 3148
[LSTM] Loaded model <gene_chemical_metabolism.lstm>
Loading all candidates from db...
3148 candidates load from db!
Applying LSTM to candidates...
Saved 3148 marginals
CPU times: user 16.6 s, sys: 352 ms, total: 17 s
Wall time: 7.06 s
LSTM applied!
Storing candidate labels into result file: results/simcyp298/gene_chemical_metabolism.tsv
Amount of candidates: 3148
Load mariginals from db...
Marginals loaded!
Building sentence to document map...
Map built!
Saved 186 positive predicitions for binary relation!
CPU times: user 3.86 s, sys: 40 ms, total: 3.9 s
Wall time: 2.5 s


In [19]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

from snorkel import SnorkelSession

session = SnorkelSession()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [20]:
from snorkel.models import Candidate, candidate_subclass
from snorkel.candidates import PretaggedCandidateExtractor

from snorkel.models import Candidate, candidate_subclass
from snorkel.candidates import PretaggedCandidateExtractor
from snorkel.models import Candidate, candidate_subclass
from snorkel.candidates import PretaggedCandidateExtractor



ChemicalGeneInhibition = candidate_subclass('ChemicalGeneInhibition', ['chemical', 'gene'])
candidate_gene_chemical_inhibit_extractor = PretaggedCandidateExtractor(ChemicalGeneInhibition, ['Chemical', 'Gene'])


for k, sents in enumerate([all_sents]):
    candidate_gene_chemical_inhibit_extractor.apply(sents, split=k, clear=True)
    print("Number of candidates:", session.query(ChemicalGeneInhibition).filter(ChemicalGeneInhibition.split == k).count())
    
from snorkel.learning.pytorch import LSTM

lstm = LSTM(n_threads=10)
lstm.load('chemical_gene_inhibition.lstm')

print("Loading all candidates from db...")
all_cands = session.query(ChemicalGeneInhibition).filter(ChemicalGeneInhibition.split == 0).order_by(ChemicalGeneInhibition.id).all()
print("{} candidates load from db!".format(len(all_cands)))

print("Applying LSTM to candidates...")
%time lstm.save_marginals(session, all_cands)
print("LSTM applied!")

from ksnorkel import KSUtils

header_str = '{}\t{}\t{}\t{}\t{}\t{}\t{}'.format('document_id', 'sentence_id', 'cand_id','chemical_cid', 'chemical_span', 'gene_cid', 'gene_span')
%time KSUtils.save_binary_relation_as_tsv(result_path + "chemical_gene_inhibition.tsv", session, all_cands, all_sents, header_str, 'chemical_cid', 'gene_cid')

  0%|          | 0/3452 [00:00<?, ?it/s]

Clearing existing...
Running UDF...


100%|██████████| 3452/3452 [00:05<00:00, 643.54it/s]


Number of candidates: 3148
[LSTM] Loaded model <chemical_gene_inhibition.lstm>
Loading all candidates from db...
3148 candidates load from db!
Applying LSTM to candidates...
Saved 3148 marginals
CPU times: user 15.6 s, sys: 1.29 s, total: 16.9 s
Wall time: 6.93 s
LSTM applied!
Storing candidate labels into result file: results/simcyp298/chemical_gene_inhibition.tsv
Amount of candidates: 3148
Load mariginals from db...
Marginals loaded!
Building sentence to document map...
Map built!
Saved 105 positive predicitions for binary relation!
CPU times: user 3.14 s, sys: 32 ms, total: 3.18 s
Wall time: 2.44 s
