In [2]:
import sys, csv, os
# import the main NPLinker class
from nplinker.nplinker import NPLinker

# (Re)load IPython's autoreload extension and switch it to mode 2, which
# (per the IPython documentation) reloads all modules before executing each
# cell, so edits to imported source files should be picked up without
# restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [5]:
# An NPLinker instance is built from a configuration file, which supplies
# the dataset and parameters to use. The file name is kept in its own
# variable so it is easy to spot and change.
config_file = 'latest_api_demo.toml'
npl = NPLinker(config_file)

# Load the data. This call is intentionally the last expression in the cell
# so that its return value is displayed below the cell.
npl.load_data()

18:44:11 [INFO] config.py:105, Loading from local data in directory D:\current_datasets_07062020\current\carnegie_mibig_27112019
18:44:11 [INFO] loader.py:360, Loaded global strain IDs (0 total)
18:44:11 [INFO] loader.py:368, Loaded dataset strain IDs (25 total)
18:44:13 [INFO] metabolomics.py:612, 3107 molecules parsed from MGF file
D:\current_datasets_07062020\current\carnegie_mibig_27112019\clusterinfo_summary\6f4bf34641bf4bcf8711042bc8cb086f.tsv False
18:44:13 [INFO] metabolomics.py:632, quantification table exists, new-style GNPS dataset
18:44:14 [INFO] metabolomics.py:572, Merged nodes data (new-style), total lines = 3107
18:44:14 [INFO] loader.py:346, Loading provided annotation files (D:\current_datasets_07062020\current\carnegie_mibig_27112019\DB_result)
18:44:14 [INFO] genomics.py:368, Found 1816 MiBIG json files
18:44:15 [INFO] genomics.py:275, # MiBIG BGCs = 317, non-MiBIG BGCS = 1632, total bgcs = 1949, GCFs = 327, strains=1841
18:44:15 [INFO] genomics.py:332, Filtering Mi

True

In [13]:
# Show the names of the scoring methods that are available
# (currently these are defined as classes in nplinker/scoring/methods.py).
item_template = ' - {}'
print('Available scoring methods:')
for method_name in npl.scoring_methods:
    print(item_template.format(method_name))

# Passing a method name to .scoring_method() hands back an *instance*
# of that particular method.
mc = npl.scoring_method('metcalf')
rs = npl.scoring_method('rosetta')

# A method instance may expose parameters that you are free to change
# (which ones depends on the method itself).
#
# Metcalf scoring currently exposes:
# - cutoff (float): the scoring threshold; links scoring below it are excluded
# - standardised (bool): True (default) for standardised scores, False for regular ones
mc.cutoff = 3.5
mc.standardised = True

# Rosetta scoring is more complex and is not described in detail here,
# but it exposes a few parameters too.
#
# bgc_to_gcf: when True (default) the genomics output consists of GCF
# objects rather than BGCs, mostly to fit in better with methods
# like Metcalf.
rs.bgc_to_gcf = True
# Cutoff thresholds can be applied separately to the spectral match
# scores and to the BGC match scores.
rs.spec_score_cutoff = 0.2
rs.bgc_score_cutoff = 0.2

Available scoring methods:
 - metcalf
 - testscore
 - rosetta


In [14]:
#
# Different ways of asking nplinker to generate scoring results
#

# 1. Simplest case: one set of objects and one scoring method. When the 3rd
# parameter of get_links is omitted, results from different methods are ANDed
# together by default, which makes no difference with a single method.
gcf_subset = npl.gcfs[:10]
results = npl.get_links(gcf_subset, mc)

# "results" is an instance of the LinkCollection class, which is defined in
# nplinker/scoring/methods.py.
#
# Handy attributes/methods:
# - len(results) or .source_count: how many of the input objects were found to have links
count_template = 'Number of results: {}'
print(count_template.format(len(results)))
# - .sources: a list of those objects
objects_with_links = results.sources
# - .links: a dict with structure {input_object: {linked_object: ObjectLink}}
objects_and_link_info = results.links
# - .get_all_targets(): a flat list of *all* the linked objects (for all sources)
all_targets = results.get_all_targets()

18:52:30 [DEBUG] methods.py:432, MetcalfScoring: standardised = True
18:52:31 [DEBUG] methods.py:506, MetcalfScoring: input_type=GCF, result_type=Spec/MolFam, inputs=10, results=(3, 37)
18:52:31 [DEBUG] methods.py:565, MetcalfScoring found 3 results
18:52:31 [DEBUG] methods.py:567, MetcalfScoring: completed
Number of results: 3


In [15]:
# Each link is represented by an ObjectLink object with these basic attributes:
# - .source: the input object given to the method
# - .target: the object it was linked to
# - .methods: a list of the methods that found this link
# - .shared_strains: a list of Strain objects (possibly empty) shared between .source and .target
# - .data(<method_object>): the output of <method_object> for this link (e.g. any score values)
#
# Method-specific info for a link can also be retrieved by subscripting the
# ObjectLink with the method object, e.g. metcalf_link_data = object_link[mc]

# Output templates, defined once outside the loops.
summary_template = 'Results for object: {}, {} total links, {} methods used'
link_template = '  --> [{}] {} | {} | shared strains = {}'

# Walk over the link information in results.links. <obj> is one of the objects
# originally passed to get_links and <result> is a dict with structure
# {linked_object: ObjectLink}.
for obj, result in results.links.items():
    # show the object, how many links it has, and how many methods were used to get them
    print(summary_template.format(obj, len(result), results.method_count))

    # Different methods may produce very different "scores", so sorted links are
    # requested for a specific method object (plus the source object). For metcalf
    # scoring this returns the ObjectLinks sorted by score, descending by default.
    sorted_links = results.get_sorted_links(mc, obj)

    # Print some information for every link of this object.
    # link_data[<method_object>] is the per-link data generated by that method:
    # for metcalf it is simply the link score as a floating point value, while
    # other methods may return more complex objects. Every method also has a
    # format_data method intended to give a relatively short human-readable
    # summary of that data, as a quick way to print and examine results.
    for link_data in sorted_links:
        method_names = ','.join(found_by.name for found_by in link_data.methods)
        target = link_data.target
        score_text = mc.format_data(link_data[mc])
        shared_count = len(link_data.shared_strains)
        print(link_template.format(method_names, target, score_text, shared_count))

    # If ordering does not matter, you can instead iterate directly over the
    # linked objects like this:
    #for link_target, link_data in result.items():
    #    print(link_target, link_data)
    

Results for object: GCF(id=0, class=NRPS, gcf_id=1807, strains=1), 14 total links, 1 methods used
  --> [metcalf] Spectrum(id=55, spectrum_id=7701, strains=1) | 4.0000 | shared strains = 1
  --> [metcalf] Spectrum(id=2663, spectrum_id=240439, strains=2) | 4.0000 | shared strains = 1
  --> [metcalf] Spectrum(id=2665, spectrum_id=240461, strains=2) | 4.0000 | shared strains = 1
  --> [metcalf] Spectrum(id=2668, spectrum_id=241050, strains=4) | 4.0000 | shared strains = 1
  --> [metcalf] Spectrum(id=2679, spectrum_id=242220, strains=3) | 4.0000 | shared strains = 1
  --> [metcalf] Spectrum(id=2682, spectrum_id=242381, strains=3) | 4.0000 | shared strains = 1
  --> [metcalf] Spectrum(id=2685, spectrum_id=243020, strains=4) | 4.0000 | shared strains = 1
  --> [metcalf] Spectrum(id=2826, spectrum_id=279198, strains=3) | 4.0000 | shared strains = 1
  --> [metcalf] Spectrum(id=2972, spectrum_id=298372, strains=2) | 4.0000 | shared strains = 1
  --> [metcalf] Spectrum(id=2980, spectrum_id=30012

In [16]:
# A LinkCollection can filter the results it contains through a small set of
# utility methods:
# - .filter_no_shared_strains(): remove any links where the linked objects do not share strains
# - .filter_sources(callable), .filter_targets(callable), .filter_links(callable): each one
#     simply does callable(object) and filters out the objects for which the return value is
#     False/0. The <objects> are, respectively: the original input objects (sources), their
#     linked objects (targets), and the ObjectLink objects (links).
#
# Notes:
# - all of these methods modify the original LinkCollection in-place
# - any original result left without links after filtering is removed automatically. For
#    example, a source object that starts off with 2 links but has 0 once a filter has run
#    will no longer appear in the LinkCollection afterwards.


def keep_source(gcf):
    """Arbitrary example predicate for sources: true when the id is even."""
    return gcf.id % 2 == 0


def keep_target(spec):
    """Arbitrary example predicate for targets.

    NOTE(review): `id % 1 == 0` is true for every integer id, so this looks
    like a filter that keeps everything -- confirm that is the intent.
    """
    return spec.id % 1 == 0


def keep_link(link):
    """Arbitrary example predicate for links: metcalf data for the link above 3.6."""
    return link[mc] > 3.6


# Examples:
# a) exclude any sources for which an arbitrary function is false (here sources are GCFs)
results.filter_sources(keep_source)
# b) exclude any linked objects for which an arbitrary function is false (here these are Spectrums)
results.filter_targets(keep_target)
# c) exclude any links for which an arbitrary function is false (the "link" here is an ObjectLink)
results.filter_links(keep_link)

18:52:32 [DEBUG] methods.py:107, filter_sources: 3 => 2


In [18]:
# 2. One set of objects, two different methods, results ANDed together: only
# objects that have links according to both of the supplied methods are
# returned.
#
# When 2 or more scoring methods are given together with a single set of
# objects, that same set is used as the input to every method.
shared_inputs = npl.gcfs[:10]
methods_to_combine = [rs, mc]
results = npl.get_links(shared_inputs, methods_to_combine, and_mode=True)

18:52:40 [INFO] methods.py:334, RosettaScoring got 10 GCFs input, converted to 20 BGCs
18:52:40 [DEBUG] methods.py:376, RosettaScoring found 0 results
18:52:40 [DEBUG] methods.py:432, MetcalfScoring: standardised = True
18:52:40 [DEBUG] methods.py:506, MetcalfScoring: input_type=GCF, result_type=Spec/MolFam, inputs=10, results=(3, 37)
18:52:40 [DEBUG] methods.py:565, MetcalfScoring found 3 results
18:52:40 [DEBUG] methods.py:52, Merging 3 results from method metcalf in AND mode
18:52:40 [DEBUG] methods.py:567, MetcalfScoring: completed


In [19]:
# 3. Like #2, but demonstrating that each method can be given its own
# (potentially different) set of objects, and ORing the results instead so
# that different methods can return different numbers of results.
# The two input sets are sliced separately so they remain distinct lists;
# presumably they are matched to the methods by position -- confirm against
# the get_links documentation.
first_inputs = npl.gcfs[:10]
second_inputs = npl.gcfs[:10]
results = npl.get_links([first_inputs, second_inputs], [rs, mc], and_mode=False)

18:52:50 [INFO] methods.py:334, RosettaScoring got 10 GCFs input, converted to 20 BGCs
18:52:50 [DEBUG] methods.py:376, RosettaScoring found 0 results
18:52:50 [DEBUG] methods.py:432, MetcalfScoring: standardised = True
18:52:50 [DEBUG] methods.py:506, MetcalfScoring: input_type=GCF, result_type=Spec/MolFam, inputs=10, results=(3, 37)
18:52:50 [DEBUG] methods.py:565, MetcalfScoring found 3 results
18:52:50 [DEBUG] methods.py:49, Merging 3 results from method metcalf in OR mode
18:52:50 [DEBUG] methods.py:567, MetcalfScoring: completed
