In [1]:
# Enable IPython's autoreload extension. Mode 2 reloads all imported modules
# before each cell is executed, so edits to the project's .py files take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os, glob
import numpy as np

# Project setup hook, called with the relative location of the LDA code.
# NOTE(review): presumably this makes the project/LDA modules importable
# (e.g. by extending sys.path), which would explain why it runs before the
# project imports that follow -- confirm before reordering the imports.
from nplinker_constants import nplinker_setup
LDA_PATH = '../../lda/code'
nplinker_setup(LDA_PATH=LDA_PATH)
from metabolomics import load_metadata, load_edges, make_families, load_spectra
from genomics import loadBGC_from_cluster_files
from strainmanager import StrainManager
from scoring import compute_all_scores_multi_np, metcalf_scoring_np

In [3]:
# Root directory of the dataset. Defaults to the original archive location, but
# can be redirected by setting the NPLINKER_DATASET environment variable so the
# notebook also runs on machines where the data is mounted elsewhere.
DATASET = os.environ.get('NPLINKER_DATASET') or '/mnt/archive/nplinker_data/crusemann'

# GNPS outputs: spectra (MGF), nodes file and pairs/edges file
MGF_FILE = os.path.join(DATASET, 'gnps/METABOLOMICS-SNETS-c36f90ba-download_clustered_spectra-main.mgf')
# NOTE(review): the double dot before "out" is kept exactly as found -- confirm
# it matches the file name on disk.
NODES_FILE = os.path.join(DATASET, 'gnps/0d51c5b6c73b489185a5503d319977ab..out')
EDGES_FILE = os.path.join(DATASET, 'gnps/9a93d720f69143bb9f971db39b5d2ba2.pairsinfo')

# BiG-SCAPE network output directory; each entry of FOLDERS names a sub-folder
# of ROOT_PATH that is searched for clustering and network files.
ROOT_PATH = os.path.join(DATASET, 'bigscape/bigscape_corason_crusemann_complete_annotated_mibigs_mix_automode_20180713/network_files/2018-07-13_16-34-11_hybrids_auto_crusemann_bgcs_automode_mix_mibig')
FOLDERS = ['NRPS','Others','PKSI','PKS-NRP_Hybrids','PKSother','RiPPs','Saccharides','Terpene']

# directory handed to loadBGC_from_cluster_files as antismash_dir
ANTISMASH_DIR = os.path.join(DATASET, 'antismash/justin-20181022/')

In [4]:
# Load the spectra from the GNPS MGF file.
spectra = load_spectra(MGF_FILE)
# Attach edge information from the pairs file to the loaded spectra.
# (presumably updates `spectra` in place, since the return value is unused -- confirm)
load_edges(spectra, EDGES_FILE)
# Derive families from the spectra.
# NOTE(review): likely relies on the edges loaded on the previous line, so keep this order.
families = make_families(spectra)
# Load metadata from the GNPS nodes file; `metadata` is iterated later in the
# notebook as the metabolomics-side source of strains.
metadata = load_metadata(spectra, NODES_FILE)

# For every BiG-SCAPE class folder, locate its clustering file and its network
# annotation file. Both lists end up index-aligned with FOLDERS.
input_files = []
ann_files = []
# no MiBIG BGC dictionary is supplied to the loader
mibig_bgc_dict = None

for folder in FOLDERS:
    fam_file = os.path.join(ROOT_PATH, folder)
    # glob returns matches in arbitrary order; sort so that taking the first
    # match is deterministic across runs and filesystems
    cluster_file = sorted(glob.glob(os.path.join(fam_file, folder + "_clustering*")))
    annotation_files = sorted(glob.glob(os.path.join(fam_file, "Network_*")))
    # fail with a message naming the missing file instead of a bare IndexError
    if not cluster_file:
        raise FileNotFoundError(
            'No "{}_clustering*" file found in {}'.format(folder, fam_file))
    if not annotation_files:
        raise FileNotFoundError(
            'No "Network_*" file found in {}'.format(fam_file))
    input_files.append(cluster_file[0])
    ann_files.append(annotation_files[0])

# Load GCFs, BGCs and the genomics-side strain list from the collected files.
gcf_list, bgc_list, strain_list = loadBGC_from_cluster_files(
    input_files,
    ann_files,
    antismash_dir=ANTISMASH_DIR,
    antismash_format='flat',
    mibig_bgc_dict=mibig_bgc_dict,
)

Loaded 5930 molecules


In [5]:
# Gather the strains from both data sources into a single de-duplicated set:
# first the genomics side (strain_list), then the metabolomics side (metadata).
merged_strains = set()
for strain_source in (strain_list, metadata):
    for strain in strain_source:
        merged_strains.add(strain)

strainmanager = StrainManager(merged_strains)
# the manager exposes the full strain collection as a numpy array
all_strains = strainmanager.all_strains_np
print('Loaded {} strains'.format(len(all_strains)))

# strain_prob_dict is needed further down to set up the RandomSpectrum objects
strain_prob_dict = strainmanager.generate_prob_dict(spectra)

Loaded 199 strains


In [6]:
# Give every GCF a random counterpart plus strain lookup tables for scoring.

# Convert bgc_list to a numpy array once, up front: numpy.random.choice turns
# Python lists into numpy arrays internally, and repeating that conversion on
# every call with a large list is far slower than doing it a single time here.
bgc_list = np.array(bgc_list)
for g in gcf_list:
    g.add_random(bgc_list)
    # build the lookup table (which strains are present) for the real GCF
    # first, then for its random counterpart, each from its own strains
    for gcf_obj in (g, g.random_gcf):
        gcf_obj.strains_lookup = strainmanager.generate_lookup_table(gcf_obj.strains)
    
# Spectra get the same treatment: a random counterpart and lookup tables.
for s in spectra:
    # TODO could probably convert this to numpy format as well
    s.add_random(strain_prob_dict)
    # NOTE(review): both the real and the random spectrum get a table built
    # from the REAL spectrum's strain_set. The GCF loop builds the random
    # object's table from the random object's own strains, so this looks like
    # a copy-paste slip -- confirm which attribute the random spectrum exposes
    # before changing it. Behaviour is deliberately left untouched here.
    for spectrum_obj in (s, s.random_spectrum):
        spectrum_obj.strains_lookup = strainmanager.generate_lookup_table(s.strain_set)

In [8]:
# Entry point of the new scoring code: score `spectra` against `gcf_list` over
# `all_strains` with the numpy Metcalf scoring function, random objects enabled.
m_scores = compute_all_scores_multi_np(
    spectra,
    gcf_list,
    all_strains,
    metcalf_scoring_np,
    do_random=True,
)


compute_all_scores_np on CPU 0, processing 742 spectra
compute_all_scores_np on CPU 1, processing 742 spectra
compute_all_scores_np on CPU 2, processing 741 spectra
compute_all_scores_np on CPU 3, processing 741 spectra
compute_all_scores_np on CPU 4, processing 741 spectra
compute_all_scores_np on CPU 5, processing 741 spectra
compute_all_scores_np on CPU 6, processing 741 spectra
compute_all_scores_np on CPU 7, processing 741 spectra
compute_all_scores_np on CPU 5, total time = 23.4s, 31.6 scores/sec
compute_all_scores_np on CPU 3, total time = 23.6s, 31.3 scores/sec
compute_all_scores_np on CPU 4, total time = 23.8s, 31.1 scores/sec
compute_all_scores_np on CPU 2, total time = 24.2s, 30.7 scores/sec
compute_all_scores_np on CPU 0, total time = 32.6s, 22.8 scores/sec
compute_all_scores_np on CPU 1, total time = 32.6s, 22.7 scores/sec
compute_all_scores_np on CPU 7, total time = 32.7s, 22.7 scores/sec
compute_all_scores_np on CPU 6, total time = 32.7s, 22.7 scores/sec
Total time: 57.1