In [1]:
from class_linking import NPLinker_classes

In [2]:
# Load the local Crusemann dataset.
# The dataset root is kept in one named constant so it is easy to spot and change.
DATASET_ROOT = '/mnt/scratch/louwe015/NPLinker/own/nplinker_shared/crus_full_new_AS3_03-09/'
nplinker_config = {'dataset': {'root': DATASET_ROOT}}

npl = NPLinker_classes(nplinker_config)
npl.load_data()
# Left as the final expression so the notebook displays whatever it returns.
npl.read_class_info()

15:29:46 [INFO] config.py:121, Loading from local data in directory /mnt/scratch/louwe015/NPLinker/own/nplinker_shared/crus_full_new_AS3_03-09/
15:29:46 [INFO] loader.py:80, Trying to discover correct bigscape directory under /mnt/scratch/louwe015/NPLinker/own/nplinker_shared/crus_full_new_AS3_03-09/bigscape
15:29:46 [INFO] loader.py:83, Found network files directory: /mnt/scratch/louwe015/NPLinker/own/nplinker_shared/crus_full_new_AS3_03-09/bigscape/network_files/2021-07-16_08-32-34_hybrids_glocal
15:29:46 [INFO] loader.py:212, Updating bigscape_dir to discovered location /mnt/scratch/louwe015/NPLinker/own/nplinker_shared/crus_full_new_AS3_03-09/bigscape/network_files/2021-07-16_08-32-34_hybrids_glocal
15:29:46 [INFO] loader.py:571, Loaded global strain IDs (0 total)
15:29:46 [INFO] loader.py:582, Loaded dataset strain IDs (159 total)
15:29:50 [INFO] metabolomics.py:699, 13667 molecules parsed from MGF file
15:29:51 [INFO] metabolomics.py:716, Found older-style GNPS dataset, no quanti

['cf_kingdom',
 'cf_superclass',
 'cf_class',
 'cf_subclass',
 'cf_direct_parent',
 'npc_class',
 'npc_superclass',
 'npc_pathway',
 'npc_isglycoside']

In [3]:
# Basic functionality
# ===================
#
# With all data loaded, the NPLinker object offers a collection of simple methods and
# properties for accessing objects and metadata. A few examples follow; the complete
# API description is at https://nplinker.readthedocs.io/en/latest/.

# Configuration/dataset metadata, printed one value per line in this order:
# - config:     a copy of the configuration as parsed from the .toml file (dict)
# - data_dir:   the path to the directory where various nplinker data files are
#               located (e.g. the default configuration file template) (str)
# - dataset_id: a dataset ID, derived from the path for local datasets or the paired
#               platform ID for datasets loaded from that source (str)
# - root_dir:   the root directory for the current dataset (str)
for metadata_attribute in ('config', 'data_dir', 'dataset_id', 'root_dir'):
    # Each attribute is looked up right before it is printed.
    print(getattr(npl, metadata_attribute))

# Objects: the 4 object types are directly accessible as collections on `npl`.
# `gcfs` holds GCF objects, `spectra` holds Spectrum objects and `molfams` holds
# MolecularFamily objects; only the size of each collection is reported here.
object_collections = (
    ('BGCs:', 'bgcs'),
    ('GCFs:', 'gcfs'),
    ('Spectra:', 'spectra'),
    ('Molecular Families:', 'molfams'),
)
for label, collection_attribute in object_collections:
    print(label, len(getattr(npl, collection_attribute)))

{'loglevel': 'INFO', 'logfile': '', 'log_to_stdout': True, 'repro_file': '', 'dataset': {'root': '/mnt/scratch/louwe015/NPLinker/own/nplinker_shared/crus_full_new_AS3_03-09/', 'overrides': {}, 'platform_id': ''}, 'antismash': {'antismash_format': 'default', 'ignore_spaces': False}, 'docker': {'run_bigscape': True, 'extra_bigscape_parameters': ''}, 'webapp': {'tables_metcalf_threshold': 2.0}, 'scoring': {'rosetta': {}}}
../prototype/nplinker/data

/mnt/scratch/louwe015/NPLinker/own/nplinker_shared/crus_full_new_AS3_03-09/
BGCs: 5905
GCFs: 1263
Spectra: 13667
Molecular Families: 8346


In [4]:
# Ask NPLinker for the scoring method registered under the name 'metcalf'.
mc = npl.scoring_method('metcalf')

# `mc` is an instance of the class implementing Metcalf scoring, and any parameter
# it exposes may be changed on that instance. Currently exposed for Metcalf scoring:
# - cutoff (float): the scoring threshold; links scoring below it are excluded
# - standardised (bool): True selects standardised scores (the default), False regular ones
mc.cutoff = 2.5
mc.standardised = True

# Compute links for every GCF using the configured Metcalf method.
results = npl.get_links(npl.gcfs, mc, and_mode=True)

# What comes back from get_links is a LinkCollection instance: a wrapper around the
# outcome of the scoring operation with several useful properties/methods.
#
# - len(results) (or .source_count) gives the number of input objects that had links
linked_source_count = len(results)
print('Number of results: {}'.format(linked_source_count))
# - .sources lists those input objects
objects_with_links = results.sources
# - .links is a dict shaped like {input_object: {linked_object: ObjectLink}}
objects_and_link_info = results.links
# - .get_all_targets() gives a flat list of *all* linked objects (across all sources)
all_targets = results.get_all_targets()
# - .methods lists the scoring methods that were passed to get_links
methods = results.methods

15:32:57 [INFO] methods.py:436, MetcalfScoring.setup (bgcs=5905, gcfs=1263, spectra=13667, molfams=8346, strains=142)
15:32:58 [INFO] methods.py:475, MetcalfScoring.setup completed
Number of results: 1263


In [13]:
# Example of a good score: a (predicted) NRP GCF linked to a (predicted)
# peptide-like spectrum.
nrp_gcf = list(npl.gcfs)[0]
peptide_like_spectrum = list(npl.spectra)[500]
npl.class_linking_score(nrp_gcf, peptide_like_spectrum)

[(0.7553763440860215,
  'mibig_classes',
  'cf_superclass',
  'NRP',
  'Organic acids and derivatives'),
 (0.4583333333333333,
  'mibig_classes',
  'cf_subclass',
  'NRP',
  'Amino acids, peptides, and analogues'),
 (0.4435483870967742,
  'mibig_classes',
  'cf_class',
  'NRP',
  'Carboxylic acids and derivatives'),
 (0.017241379310344827,
  'mibig_classes',
  'npc_superclass',
  'NRP',
  'Nucleosides'),
 (0.014466546112115732,
  'mibig_classes',
  'npc_pathway',
  'NRP',
  'Carbohydrates')]

In [14]:
# Example of a bad score: a (predicted) NRP GCF linked to a (predicted)
# alkaloid-like spectrum.
nrp_gcf = list(npl.gcfs)[0]
alkaloid_like_spectrum = list(npl.spectra)[0]
npl.class_linking_score(nrp_gcf, alkaloid_like_spectrum)

[(0.17902350813743217, 'mibig_classes', 'npc_pathway', 'NRP', 'Alkaloids'),
 (0.03763440860215054, 'mibig_classes', 'cf_superclass', 'NRP', 'Benzenoids'),
 (0.03225806451612903,
  'mibig_classes',
  'cf_class',
  'NRP',
  'Benzene and substituted derivatives')]