## Use NPClassScore on a local dataset

In [1]:
import os
import sys
import glob
# if running from clone of the git repo
sys.path.append('../../prototype')

# import the main NPLinker class. normally this is all that's required to work
# with NPLinker in a notebook environment
from nplinker.nplinker import NPLinker
from nplinker.nplinker import Spectrum  # to be able to separate molfams and spectrums from each other in results

In [2]:
# Load your local dataset.
#
# The dataset root directory can be changed without editing this notebook by
# setting the NPLINKER_DATASET_ROOT environment variable before starting the
# kernel. When the variable is not set, the original location below is used.
dataset_root = os.environ.get(
    'NPLINKER_DATASET_ROOT',
    '/mnt/scratch/louwe015/NPLinker/own/nplinker_shared/crusemann_3ids_AS6-AS3_30-11/')
npl = NPLinker({'dataset': {'root': dataset_root}})
# kept as the last expression so the cell still displays the return value
npl.load_data()

12:22:58 [INFO] config.py:121, Loading from local data in directory /mnt/scratch/louwe015/NPLinker/own/nplinker_shared/crusemann_3ids_AS6-AS3_30-11/
12:22:58 [INFO] loader.py:84, Trying to discover correct bigscape directory under /mnt/scratch/louwe015/NPLinker/own/nplinker_shared/crusemann_3ids_AS6-AS3_30-11/bigscape
12:22:58 [INFO] loader.py:87, Found network files directory: /mnt/scratch/louwe015/NPLinker/own/nplinker_shared/crusemann_3ids_AS6-AS3_30-11/bigscape/network_files/2021-12-02_16-48-06_hybrids_glocal
12:22:58 [INFO] loader.py:222, Updating bigscape_dir to discovered location /mnt/scratch/louwe015/NPLinker/own/nplinker_shared/crusemann_3ids_AS6-AS3_30-11/bigscape/network_files/2021-12-02_16-48-06_hybrids_glocal
12:22:58 [INFO] loader.py:633, Loaded global strain IDs (0 total)
12:22:58 [INFO] loader.py:644, Loaded dataset strain IDs (159 total)
12:23:02 [INFO] metabolomics.py:699, 13667 molecules parsed from MGF file
12:23:03 [INFO] metabolomics.py:716, Found older-style GNP

True

In [3]:
# Basic functionality
# ===================
#
# With all data loaded, the NPLinker object offers a set of simple methods and
# properties for getting at objects and metadata. A few examples follow; the
# complete API description is at https://nplinker.readthedocs.io/en/latest/.

# configuration/dataset metadata, printed one value per line in this order:
# - config:     a copy of the configuration as parsed from the .toml file (dict)
# - data_dir:   the path to the directory where various nplinker data files are
#               located (e.g. the default configuration file template) (str)
# - dataset_id: a dataset ID, derived from the path for local datasets or the
#               paired platform ID for datasets loaded from that source (str)
# - root_dir:   the root directory for the current dataset (str)
for metadata_attribute in ('config', 'data_dir', 'dataset_id', 'root_dir'):
    print(getattr(npl, metadata_attribute))

# objects
# - each of the 4 object types is directly available as a list; only the
#   number of entries in each list is printed here
object_lists = (
    ('BGCs:', 'bgcs'),
    ('GCFs:', 'gcfs'),                    # contains GCF objects
    ('Spectra:', 'spectra'),              # contains Spectrum objects
    ('Molecular Families:', 'molfams'),   # contains MolecularFamily objects
)
for object_label, object_attribute in object_lists:
    print(object_label, len(getattr(npl, object_attribute)))

{'loglevel': 'INFO', 'logfile': '', 'log_to_stdout': True, 'repro_file': '', 'dataset': {'root': '/mnt/scratch/louwe015/NPLinker/own/nplinker_shared/crusemann_3ids_AS6-AS3_30-11/', 'overrides': {}, 'platform_id': ''}, 'antismash': {'antismash_format': 'default', 'ignore_spaces': False}, 'docker': {'run_bigscape': True, 'extra_bigscape_parameters': ''}, 'webapp': {'tables_metcalf_threshold': 2.0}, 'scoring': {'rosetta': {}}}
../../prototype/nplinker/data

/mnt/scratch/louwe015/NPLinker/own/nplinker_shared/crusemann_3ids_AS6-AS3_30-11/
BGCs: 5869
GCFs: 1581
Spectra: 13667
Molecular Families: 8346


### Run NPClassScore on data

In [4]:
# Score with NPClassScore only
npcl = npl.scoring_method('npclassscore')

'''
Now npcl is an instance of the class that implements NPClassScore scoring. Once
you have such an instance, you may change any of the parameters it exposes.
In the case of NPClassScore scoring, the following parameters are currently exposed:
- cutoff (float): the scoring threshold, default 0.25. Links with scores less than this are excluded
- method (str): the chemical class prediction tool that is used, default is mix. Choose from .method_options:
      mix - use all tools (first canopus then molnetenhancer), main - use main method (canopus),
      canopus - use canopus, molnetenhancer - use molnetenhancer

Less important parameters:
- equal_targets (bool): targets are on equal level, default is False. I.e. if input object is GCF,
       target is spectra and not MFs.
- both_targets (bool): take both targets from the other side, default is False. I.e. if input object
        is GCF, target both spectra and MF as targets.
- num_results (int): how many scores do you want to show for each link. Default is 1 showing
       only NPClassScore (the best) score.
- filter_missing_scores (bool): filter out spectra without a score due to missing spectrum classes,
        default is False.'''
npcl.cutoff = 0.25

results = npl.get_links(npl.gcfs, npcl, and_mode=True)

# What get_links hands back is a LinkCollection instance: a wrapper around the
# outcome of the scoring operation. Its most useful properties/methods are
# demonstrated below.
#
# - the number of input objects that ended up with links: len(results), also
#   available as .source_count
n_sources_with_links = len(results)
print('Number of results: {}'.format(n_sources_with_links))
# - those input objects themselves, as a list: .sources
objects_with_links = results.sources
# - the link details, a dict shaped like {input_object: {linked_object: ObjectLink}}: .links
objects_and_link_info = results.links
# - one flat list holding *all* linked objects across all sources: .get_all_targets()
all_targets = results.get_all_targets()
# - the scoring methods that were passed to get_links: .methods
print(results.methods)

12:26:24 [INFO] methods.py:968, Set up NPClassScore scoring
12:26:24 [INFO] methods.py:970, Please choose one of the methods from ['mix', 'main', 'canopus', 'molnetenhancer']
12:26:24 [INFO] methods.py:976, Currently the method 'mix' is selected
12:26:24 [INFO] methods.py:982, Running NPClassScore...
12:26:24 [INFO] methods.py:996, Using Metcalf scoring to get shared strains
12:26:24 [INFO] methods.py:459, MetcalfScoring.setup (bgcs=5869, gcfs=1581, spectra=13667, molfams=8346, strains=154)
12:26:25 [INFO] methods.py:499, MetcalfScoring.setup completed
12:27:12 [INFO] methods.py:1003, Calculating NPClassScore for 1581 objects to 13667 targets (1784369 pairwise interactions that share at least 1 strain). This might take a while.
12:29:49 [INFO] methods.py:1052, NPClassScore completed in 204.7s
Number of results: 1581
{<nplinker.scoring.methods.NPClassScoring object at 0x7f08fa1b8e48>}


In [18]:
# Inspect the links of a single object - here a GCF encoding staurosporine
obj = npl.gcfs[534]

result = results.links[obj]
summary_template = 'Results for object: {}, {} total links, {} methods used\n'
print(summary_template.format(obj, len(result), results.method_count))

# only the first entry of the list returned by get_sorted_links is shown
sorted_links = results.get_sorted_links(npcl, obj)
link_data = sorted_links[0]
print("ObjectLink: ", link_data)

# gather the individual pieces of the one-line link summary before printing
method_names = ','.join(method.name for method in link_data.methods)
formatted_score = npcl.format_data(link_data[npcl])
n_shared_strains = len(link_data.shared_strains)
link_template = '  --> [{}] {} | {} | shared strains = {}'
print(link_template.format(method_names, link_data.target, formatted_score, n_shared_strains))

# the raw NPClassScore data stored on the link, without format_data applied
print("   unfiltered direct result from NPClassScore:", link_data[npcl])

Results for object: GCF(id=534, class=Others, gcf_id=511, strains=54), 644 total links, 1 methods used

ObjectLink:  ObjectLink(source=GCF(id=534, class=Others, gcf_id=511, strains=54), target=Spectrum(id=88, spectrum_id=424, strains=2), #methods=1)
  --> [npclassscore] Spectrum(id=88, spectrum_id=424, strains=2) | 0.781 | shared strains = 1
   unfiltered direct result from NPClassScore: [(0.780952380952381, 'as_classes', 'cf_superclass', 'indole', 'Organoheterocyclic compounds')]


### Run NPClassScore and Metcalf scoring

In [6]:
# Metcalf scoring is obtained through scoring_method too, then configured
mc = npl.scoring_method('metcalf')
mc.cutoff = 2.5
mc.standardised = True

# With and_mode=True a link is only kept when it passes the cutoff of both methods
methods_both = [mc, npcl]
results_both = npl.get_links(npl.gcfs, methods_both, and_mode=True)

n_results_both = len(results_both)
print('Number of results for Metcalf and NPClassScore scoring: {}'.format(n_results_both))
print(results_both.methods)

12:31:02 [INFO] methods.py:982, Running NPClassScore...
12:31:02 [INFO] methods.py:996, Using Metcalf scoring to get shared strains
12:31:49 [INFO] methods.py:1003, Calculating NPClassScore for 1581 objects to 13667 targets (1784369 pairwise interactions that share at least 1 strain). This might take a while.
12:34:44 [INFO] methods.py:1052, NPClassScore completed in 221.6s
Number of results for Metcalf and NPClassScore scoring: 1574
{<nplinker.scoring.methods.MetcalfScoring object at 0x7f0892776898>, <nplinker.scoring.methods.NPClassScoring object at 0x7f08fa1b8e48>}


In [17]:
# Show the combined Metcalf + NPClassScore links for the same obj as before.
# .get() yields None when obj has no entry in the links dict, so the lookup
# result is checked before its length is taken or its links are sorted.
obj_links_both = results_both.links.get(obj)
print('Results for object: {}, {} total links, {} methods used'.format(
    obj, len(obj_links_both) if obj_links_both else 0, results_both.method_count))

if obj_links_both:
    # sort results based on metcalf scoring
    sorted_links_both = results_both.get_sorted_links(mc, obj)
    # only links whose target is a Spectrum are printed; other targets are skipped
    spectrum_links_both = [link for link in sorted_links_both
                           if isinstance(link.target, Spectrum)]
    # i numbers the printed (spectrum) results, starting at 0
    for i, both_link_data in enumerate(spectrum_links_both):
        print('{}  --> [{}] {} | mc:{} npcl:{} | shared strains = {}'.format(
            i,
            ','.join(method.name for method in both_link_data.methods),
            both_link_data.target,
            mc.format_data(both_link_data[mc]),
            npcl.format_data(both_link_data[npcl]),
            len(both_link_data.shared_strains)))
        print("   unfiltered results:", both_link_data[mc], both_link_data[npcl])
else:
    print("\nNo result for obj", obj)

Results for object: GCF(id=534, class=Others, gcf_id=511, strains=54), 21 total links, 2 methods used
0  --> [metcalf,npclassscore] Spectrum(id=3632, spectrum_id=89513, strains=67) | mc:8.9996 npcl:0.781 | shared strains = 50
   unfiltered results: 8.99963318035332 [(0.780952380952381, 'as_classes', 'cf_superclass', 'indole', 'Organoheterocyclic compounds')]
1  --> [metcalf,npclassscore] Spectrum(id=4070, spectrum_id=95003, strains=21) | mc:4.7266 npcl:0.702 | shared strains = 17
   unfiltered results: 4.726582782023565 [(0.7021276595744681, 'as_classes', 'npc_pathway', 'indole', 'Alkaloids')]
2  --> [metcalf,npclassscore] Spectrum(id=3544, spectrum_id=87806, strains=27) | mc:4.6625 npcl:0.702 | shared strains = 20
   unfiltered results: 4.6624688447848435 [(0.7021276595744681, 'as_classes', 'npc_pathway', 'indole', 'Alkaloids')]
3  --> [metcalf,npclassscore] Spectrum(id=4060, spectrum_id=94966, strains=27) | mc:4.6625 npcl:0.702 | shared strains = 20
   unfiltered results: 4.662468844

### Use only the feature-based scores - Rosetta and NPClassScore

In [8]:
# Rosetta scoring is obtained through scoring_method as well
ros = npl.scoring_method('rosetta')

# With and_mode=True a link is only kept when it passes the cutoff of both methods
methods_feat = [ros, npcl]
results_feat = npl.get_links(npl.gcfs, methods_feat, and_mode=True)

n_results_feat = len(results_feat)
print('Number of results for Rosetta and NPClassScore scoring: {}'.format(n_results_feat))
print(results_feat.methods)

12:34:48 [INFO] methods.py:329, RosettaScoring setup
12:34:48 [INFO] rosetta.py:376, Trying to load cached Rosetta hits data
12:34:48 [INFO] rosetta.py:379, Loaded cached Rosetta hits for dataset  at /mnt/scratch/louwe015/NPLinker/own/nplinker_shared/crusemann_3ids_AS6-AS3_30-11/rosetta/RosettaHits.pckl
12:34:48 [INFO] methods.py:346, RosettaScoring setup completed
12:34:48 [INFO] methods.py:393, RosettaScoring got 1581 GCFs input, converted to 5869 BGCs
12:34:51 [INFO] methods.py:982, Running NPClassScore...
12:34:51 [INFO] methods.py:996, Using Metcalf scoring to get shared strains
12:35:37 [INFO] methods.py:1003, Calculating NPClassScore for 1581 objects to 13667 targets (1784369 pairwise interactions that share at least 1 strain). This might take a while.
12:38:15 [INFO] methods.py:1052, NPClassScore completed in 204.6s
Number of results for Rosetta and NPClassScore scoring: 31
{<nplinker.scoring.methods.NPClassScoring object at 0x7f08fa1b8e48>, <nplinker.scoring.methods.RosettaSco

In [9]:
# Look up the Rosetta + NPClassScore links for the same obj as before
result_feat = results_feat.links.get(obj)
# the number of links is only computed when the lookup found links; a falsy
# lookup result (e.g. None) is passed to the summary line unchanged
link_count = len(result_feat) if result_feat else result_feat
print('Results for object: {}, {} total links, {} methods used'.format(
    obj, link_count, results_feat.method_count))

if not result_feat:
    print("\nNo result for obj", obj)
else:
    # sort results based on rosetta scoring (ros is the method handed to get_sorted_links)
    sorted_links_feat = results_feat.get_sorted_links(ros, obj)

    i = 0  # running index of the printed (spectrum) results
    for feat_link_data in sorted_links_feat:
        # links whose target is not a Spectrum are not printed
        if not isinstance(feat_link_data.target, Spectrum):
            continue
        method_names = ','.join(method.name for method in feat_link_data.methods)
        print('{}  --> [{}] {} | ros:{} npcl:{} | shared strains = {}'.format(
            i,
            method_names,
            feat_link_data.target,
            ros.format_data(feat_link_data[ros]),
            npcl.format_data(feat_link_data[npcl]),
            len(feat_link_data.shared_strains)))
        print("   unfiltered results:", feat_link_data[ros], feat_link_data[npcl])
        i += 1

Results for object: GCF(id=534, class=Others, gcf_id=511, strains=54), None total links, 2 methods used

No result for obj GCF(id=534, class=Others, gcf_id=511, strains=54)


In [10]:
# pick an obj that does have links: the first key of the links dict
objects_with_feat_links = list(results_feat.links)
obj_feat = objects_with_feat_links[0]
print(obj_feat)

GCF(id=1459, class=Others, gcf_id=1754, strains=39)


In [11]:
# Look up the Rosetta + NPClassScore links for obj_feat
result_feat = results_feat.links.get(obj_feat)
# the number of links is only computed when the lookup found links; a falsy
# lookup result (e.g. None) is passed to the summary line unchanged
link_count = len(result_feat) if result_feat else result_feat
print('Results for object: {}, {} total links, {} methods used'.format(
    obj_feat, link_count, results_feat.method_count))

if not result_feat:
    print("\nNo result for obj", obj_feat)
else:
    # sort results based on rosetta scoring (ros is the method handed to get_sorted_links)
    sorted_links_feat = results_feat.get_sorted_links(ros, obj_feat)

    i = 0  # running index of the printed (spectrum) results
    for feat_link_data in sorted_links_feat:
        # links whose target is not a Spectrum are not printed
        if not isinstance(feat_link_data.target, Spectrum):
            continue
        method_names = ','.join(method.name for method in feat_link_data.methods)
        print('{}  --> [{}] {} | ros:{} npcl:{} | shared strains = {}'.format(
            i,
            method_names,
            feat_link_data.target,
            ros.format_data(feat_link_data[ros]),
            npcl.format_data(feat_link_data[npcl]),
            len(feat_link_data.shared_strains)))
        print("   unformatted results:", feat_link_data[ros], feat_link_data[npcl])
        i += 1

Results for object: GCF(id=1459, class=Others, gcf_id=1754, strains=39), 1 total links, 2 methods used
0  --> [rosetta,npclassscore] Spectrum(id=8987, spectrum_id=166494, strains=1) | ros:3 hits npcl:0.412 | shared strains = 1
   unformatted results: [RosettaHit: 166494<-->NZ_KB900331.1.region001 via (CCMSLIB00000222303 (0.507), BGC0000054 (4.155)), RosettaHit: 166494<-->NZ_KB896267.1.region001 via (CCMSLIB00000222303 (0.507), BGC0000054 (4.159)), RosettaHit: 166494<-->NZ_KB900270.1.region001 via (CCMSLIB00000222303 (0.507), BGC0000054 (4.088))] [(0.4117647058823529, 'as_classes', 'cf_subclass', 'oligosaccharide', 'Carbohydrates and carbohydrate conjugates')]
