In [12]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [1]:
import sys
from nplinker import NPLinker
from logconfig import LogConfig
from metabolomics import Spectrum
# (Re)load the autoreload extension and select mode 2, so edited modules are
# re-imported before each cell runs.
%reload_ext autoreload
%autoreload 2

In [2]:
# In a notebook, NPLinker is configured by handing the constructor either the name
# of a config file or a dict mirroring that file's structure. Editing the file and
# passing its name, as below, is usually the easier route.
npl = NPLinker('latest_api_demo_sr.toml')

# Constructing the object tries to discover the dataset files to load and complains
# when they are not as expected; load_data() then actually reads them in.
data_loaded = npl.load_data()
if not data_loaded:
    raise Exception('Failed to load data')
 

Loaded 5930 molecules
16:56:54 [INFO] metabolomics.py:314, Loading annotations from GNPS database matches
Found 0 MiBIG json files
# MiBIG BGCs = 1362, non-MiBIG BGCS = 4962, total bgcs = 6324, GCFs = 1795


In [3]:
# Scoring methods get their defaults from ~/.config/nplinker/nplinker.toml; the
# config file loaded above takes precedence over those defaults, and any method
# can still be reconfigured on the live NPLinker object, as shown here.

# Leave metcalf as the only active method, with a 99% significance percentile threshold.
print('Currently enabled scoring methods: {}'.format(npl.scoring.enabled()))
for method_name, keep_enabled in (('likescore', False), ('hg', False), ('metcalf', True)):
    getattr(npl.scoring, method_name).enabled = keep_enabled
# Optional per-method thresholds:
# npl.scoring.likescore.cutoff = <scoring cutoff threshold>
# npl.scoring.hg.prob = <probability threshold>
npl.scoring.metcalf.sig_percentile = 99
print('Currently enabled scoring methods: {}'.format(npl.scoring.enabled()))

Currently enabled scoring methods: [namespace(enabled=True, name='metcalf', sig_percentile=99)]
Currently enabled scoring methods: [namespace(enabled=True, name='metcalf', sig_percentile=99)]


In [13]:
# Spectra that carry GNPS annotations can be picked out with has_gnps_annotations():
spectra_with_gnps_matches = [s for s in npl.spectra if s.has_gnps_annotations()]
print('found {} spectra'.format(len(spectra_with_gnps_matches)))

# number of annotations attached to each of those spectra
for s in spectra_with_gnps_matches:
    print(len(s.get_gnps_annotations()))

# Show the annotations of the first matching spectrum. The guard avoids an
# IndexError when no spectrum has any GNPS annotation.
if spectra_with_gnps_matches:
    for gnps_anno in spectra_with_gnps_matches[0].get_gnps_annotations():
        # print as string
        print(gnps_anno)
        # access individual fields
        print(gnps_anno.id, gnps_anno.score, gnps_anno.name, gnps_anno.organism)
        # URLs for viewing spectrum info
        print(gnps_anno.png_url)
        print(gnps_anno.spec_url)

Completed generating scores


In [None]:
# Score every object with each enabled scoring method, which can take a long time.
# random_count sets how many randomised Spectrum <=> Strain mappings are generated
# along the way.
scores_generated = npl.process_dataset(random_count=10)
if not scores_generated:
    raise Exception('Failed to process dataset')
print('Completed generating scores')

In [5]:
# Once scoring has finished, links are retrieved per object: choose the object of
# interest (a list of objects is accepted too) and hand it to get_links together
# with the scoring method to use. Only objects meeting the scoring criteria come
# back (here: a significance percentile score of >= 99, as configured above).
test_gcf = npl.gcfs[8]
results = npl.get_links(test_gcf, npl.scoring.metcalf)
if test_gcf in results:
    print('Found results for {}!'.format(test_gcf))
    # links_for_obj lists what scored highly against this GCF; it returns every
    # object type unless type_ narrows it to one class (spectra only, here)
    test_gcf_links = npl.links_for_obj(test_gcf, npl.scoring.metcalf, type_=Spectrum)

    # report each linked object with its score and any strains it shares
    for obj, score in test_gcf_links:
        print('{} : score {}'.format(obj, score))
        # get_common_strains gives a dict keyed by (Spectrum, GCF) tuples whose
        # values are the lists of strain names the pair has in common
        common_strains = npl.get_common_strains(test_gcf, obj)
        if len(common_strains) == 0:
            print('   (no shared strains)')
        else:
            strain_names = next(iter(common_strains.values()))
            print('   {} shared strains: {}'.format(len(strain_names), strain_names))

    print('{} total links found'.format(len(test_gcf_links)))
else:
    print('No results found!')
        
    

Found results for GCF(id=8, class=NRPS, gcf_id=33)!
Spectrum(id=0, spectrum_id=1) : score 2492.0
   (no shared strains)
Spectrum(id=1, spectrum_id=3) : score 2492.0
   (no shared strains)
Spectrum(id=2, spectrum_id=41) : score 2492.0
   (no shared strains)
Spectrum(id=3, spectrum_id=58) : score 2492.0
   (no shared strains)
Spectrum(id=6, spectrum_id=95) : score 2492.0
   (no shared strains)
Spectrum(id=7, spectrum_id=97) : score 2492.0
   (no shared strains)
Spectrum(id=9, spectrum_id=100) : score 2492.0
   (no shared strains)
Spectrum(id=10, spectrum_id=111) : score 2492.0
   (no shared strains)
Spectrum(id=11, spectrum_id=119) : score 2492.0
   (no shared strains)
Spectrum(id=13, spectrum_id=146) : score 2492.0
   (no shared strains)
Spectrum(id=14, spectrum_id=148) : score 2492.0
   (no shared strains)
Spectrum(id=16, spectrum_id=156) : score 2492.0
   (no shared strains)
Spectrum(id=19, spectrum_id=171) : score 2492.0
   (no shared strains)
Spectrum(id=20, spectrum_id=176) : score

   (no shared strains)
Spectrum(id=737, spectrum_id=22263) : score 2492.0
   (no shared strains)
Spectrum(id=738, spectrum_id=22264) : score 2492.0
   (no shared strains)
Spectrum(id=739, spectrum_id=22316) : score 2492.0
   (no shared strains)
Spectrum(id=745, spectrum_id=22371) : score 2492.0
   (no shared strains)
Spectrum(id=746, spectrum_id=22395) : score 2492.0
   (no shared strains)
Spectrum(id=747, spectrum_id=22427) : score 2492.0
   (no shared strains)
Spectrum(id=749, spectrum_id=22442) : score 2492.0
   (no shared strains)
Spectrum(id=750, spectrum_id=22466) : score 2492.0
   (no shared strains)
Spectrum(id=751, spectrum_id=22474) : score 2492.0
   (no shared strains)
Spectrum(id=752, spectrum_id=22509) : score 2492.0
   (no shared strains)
Spectrum(id=753, spectrum_id=22513) : score 2492.0
   (no shared strains)
Spectrum(id=757, spectrum_id=22544) : score 2492.0
   (no shared strains)
Spectrum(id=758, spectrum_id=22563) : score 2492.0
   (no shared strains)
Spectrum(id=759

Spectrum(id=1894, spectrum_id=45404) : score 2492.0
   (no shared strains)
Spectrum(id=1895, spectrum_id=45405) : score 2492.0
   (no shared strains)
Spectrum(id=1896, spectrum_id=45435) : score 2492.0
   (no shared strains)
Spectrum(id=1897, spectrum_id=45447) : score 2492.0
   (no shared strains)
Spectrum(id=1898, spectrum_id=45454) : score 2492.0
   (no shared strains)
Spectrum(id=1899, spectrum_id=45464) : score 2492.0
   (no shared strains)
Spectrum(id=1902, spectrum_id=45497) : score 2492.0
   (no shared strains)
Spectrum(id=1905, spectrum_id=45515) : score 2492.0
   (no shared strains)
Spectrum(id=1909, spectrum_id=45575) : score 2492.0
   (no shared strains)
Spectrum(id=1910, spectrum_id=45590) : score 2492.0
   (no shared strains)
Spectrum(id=1912, spectrum_id=45659) : score 2492.0
   (no shared strains)
Spectrum(id=1914, spectrum_id=45700) : score 2492.0
   (no shared strains)
Spectrum(id=1915, spectrum_id=45783) : score 2492.0
   (no shared strains)
Spectrum(id=1916, spectru

Spectrum(id=2620, spectrum_id=56637) : score 2492.0
   (no shared strains)
Spectrum(id=2622, spectrum_id=56658) : score 2492.0
   (no shared strains)
Spectrum(id=2623, spectrum_id=56659) : score 2492.0
   (no shared strains)
Spectrum(id=2624, spectrum_id=56689) : score 2492.0
   (no shared strains)
Spectrum(id=2625, spectrum_id=56694) : score 2492.0
   (no shared strains)
Spectrum(id=2626, spectrum_id=56702) : score 2492.0
   (no shared strains)
Spectrum(id=2627, spectrum_id=56721) : score 2492.0
   (no shared strains)
Spectrum(id=2628, spectrum_id=56737) : score 2492.0
   (no shared strains)
Spectrum(id=2629, spectrum_id=56764) : score 2492.0
   (no shared strains)
Spectrum(id=2630, spectrum_id=56765) : score 2492.0
   (no shared strains)
Spectrum(id=2631, spectrum_id=56776) : score 2492.0
   (no shared strains)
Spectrum(id=2632, spectrum_id=56781) : score 2492.0
   (no shared strains)
Spectrum(id=2633, spectrum_id=56790) : score 2492.0
   (no shared strains)
Spectrum(id=2636, spectru

   (no shared strains)
Spectrum(id=3768, spectrum_id=74552) : score 2492.0
   (no shared strains)
Spectrum(id=3773, spectrum_id=74573) : score 2492.0
   (no shared strains)
Spectrum(id=3774, spectrum_id=74578) : score 2492.0
   (no shared strains)
Spectrum(id=3775, spectrum_id=74579) : score 2492.0
   (no shared strains)
Spectrum(id=3776, spectrum_id=74581) : score 2492.0
   (no shared strains)
Spectrum(id=3777, spectrum_id=74598) : score 2492.0
   (no shared strains)
Spectrum(id=3778, spectrum_id=74604) : score 2492.0
   (no shared strains)
Spectrum(id=3780, spectrum_id=74637) : score 2492.0
   (no shared strains)
Spectrum(id=3784, spectrum_id=74695) : score 2492.0
   (no shared strains)
Spectrum(id=3787, spectrum_id=74800) : score 2492.0
   (no shared strains)
Spectrum(id=3788, spectrum_id=74837) : score 2492.0
   (no shared strains)
Spectrum(id=3789, spectrum_id=74882) : score 2492.0
   (no shared strains)
Spectrum(id=3790, spectrum_id=74902) : score 2492.0
   (no shared strains)
Sp

   (no shared strains)
Spectrum(id=4386, spectrum_id=84473) : score 2492.0
   (no shared strains)
Spectrum(id=4387, spectrum_id=84477) : score 2492.0
   (no shared strains)
Spectrum(id=4388, spectrum_id=84509) : score 2492.0
   (no shared strains)
Spectrum(id=4389, spectrum_id=84514) : score 2492.0
   (no shared strains)
Spectrum(id=4390, spectrum_id=84516) : score 2492.0
   (no shared strains)
Spectrum(id=4391, spectrum_id=84528) : score 2492.0
   (no shared strains)
Spectrum(id=4394, spectrum_id=84543) : score 2492.0
   (no shared strains)
Spectrum(id=4396, spectrum_id=84562) : score 2492.0
   (no shared strains)
Spectrum(id=4397, spectrum_id=84570) : score 2492.0
   (no shared strains)
Spectrum(id=4399, spectrum_id=84578) : score 2492.0
   (no shared strains)
Spectrum(id=4400, spectrum_id=84599) : score 2492.0
   (no shared strains)
Spectrum(id=4401, spectrum_id=84613) : score 2492.0
   (no shared strains)
Spectrum(id=4406, spectrum_id=84628) : score 2492.0
   (no shared strains)
Sp

Spectrum(id=5300, spectrum_id=100065) : score 2492.0
   (no shared strains)
Spectrum(id=5301, spectrum_id=100068) : score 2492.0
   (no shared strains)
Spectrum(id=5302, spectrum_id=100078) : score 2492.0
   (no shared strains)
Spectrum(id=5303, spectrum_id=100088) : score 2492.0
   (no shared strains)
Spectrum(id=5304, spectrum_id=100168) : score 2492.0
   (no shared strains)
Spectrum(id=5305, spectrum_id=100176) : score 2492.0
   (no shared strains)
Spectrum(id=5306, spectrum_id=100179) : score 2492.0
   (no shared strains)
Spectrum(id=5307, spectrum_id=100185) : score 2492.0
   (no shared strains)
Spectrum(id=5308, spectrum_id=100198) : score 2492.0
   (no shared strains)
Spectrum(id=5309, spectrum_id=100204) : score 2492.0
   (no shared strains)
Spectrum(id=5310, spectrum_id=100213) : score 2492.0
   (no shared strains)
Spectrum(id=5311, spectrum_id=100219) : score 2492.0
   (no shared strains)
Spectrum(id=5312, spectrum_id=100223) : score 2492.0
   (no shared strains)
Spectrum(id=

In [6]:
# List every BGC whose name starts with 'KL370899'.
print(list(filter(lambda bgc: bgc.name.startswith('KL370899'), npl.bgcs)))

[BGC(name=KL370899.1.region002, strain=KL370899), BGC(name=KL370899.1.region001, strain=KL370899)]


# Code to compute the expectation and variance of the Metcalf score

## Input

- n: number of strains (spectra)
- m: number of strains (GCF)
- N: number of strains (total)


In [9]:
from data_linking import metcalf_scoring_from_counts
def metcalf_statistics(n_strains_metabolites,n_strains_genomics,n_strains_total):
    """Return the expected value and variance of the Metcalf score.

    The number of shared strains ``o`` is modelled as hypergeometric: a
    population of ``n_strains_total`` strains, ``n_strains_metabolites`` of
    which count as successes, sampled ``n_strains_genomics`` times. For every
    feasible overlap the score given by ``metcalf_scoring_from_counts`` is
    weighted by the probability of that overlap.

    Args:
        n_strains_metabolites: number of strains on the metabolomics side.
        n_strains_genomics: number of strains on the genomics side.
        n_strains_total: total number of strains.

    Returns:
        Tuple ``(expected_value, variance)`` of the score over all feasible
        overlaps. Both are 0 when no overlap value is feasible.
    """
    from scipy.stats import hypergeom
    min_overlap = max(0,n_strains_metabolites+n_strains_genomics-n_strains_total) # minimum possible strain overlap
    max_overlap = min(n_strains_metabolites,n_strains_genomics) # maximum possible strain overlap

    # (probability, score) for every feasible overlap size
    weighted_scores = []
    for o in range(int(min_overlap),int(max_overlap)+1):
        o_prob = hypergeom.pmf(o,n_strains_total,n_strains_metabolites,n_strains_genomics)
        score = metcalf_scoring_from_counts(n_strains_metabolites,n_strains_genomics,o,n_strains_total)
        weighted_scores.append((o_prob,score))

    expected_value = sum(prob*score for prob,score in weighted_scores)
    # Centred (two-pass) form: E[X^2] - E[X]^2 subtracts two large, nearly equal
    # numbers and can come out negative when the true variance is small. A sum of
    # non-negative terms cannot.
    variance = sum(prob*(score-expected_value)**2 for prob,score in weighted_scores)
    return expected_value,variance

In [10]:
# Example: expectation and variance for 10 metabolomics strains, 20 genomics
# strains and 140 strains in total.
metcalf_statistics(n_strains_metabolites=10, n_strains_genomics=20, n_strains_total=140)

(40.00000000000005, 505.0359712230204)