In [None]:
import sys
from nplinker.nplinker import NPLinker
%reload_ext autoreload
%autoreload 2

In [None]:
# In a notebook, NPLinker is configured either from a config file (pass its name) or from
# a dict mirroring the structure of that file. Editing the file and passing the filename,
# as done here, is usually the easier option.
npl = NPLinker('latest_api_demo.toml')

# Creating the object only attempts to discover the dataset files (complaining if they are
# not as expected); the data files themselves are loaded by the call below.
data_loaded = npl.load_data()
if not data_loaded:
    raise Exception('Failed to load data')
 

In [None]:
# Scoring methods are defined and configured in the default configuration file at
# ~/.config/nplinker/nplinker.toml, are overridden by the config file loaded above, and
# can still be changed freely once the NPLinker object exists, as shown here.
enabled_msg = 'Currently enabled scoring methods: {}'
print(enabled_msg.format(npl.scoring.enabled()))

# Leave only metcalf scoring enabled. The other methods expose their own thresholds:
# npl.scoring.likescore.cutoff = <scoring cutoff threshold>
# npl.scoring.hg.prob = <probability threshold>
for disabled_method in (npl.scoring.likescore, npl.scoring.hg):
    disabled_method.enabled = False

# metcalf stays on, with a 99% significance percentile threshold
npl.scoring.metcalf.enabled = True
npl.scoring.metcalf.sig_percentile = 99

print(enabled_msg.format(npl.scoring.enabled()))

In [None]:
from nplinker.annotations import GNPS_KEY

# Spectrum.is_library is true if the spectrum has GNPS annotation data, so it can be used
# to pick out the annotated spectra.
spectra_with_gnps_matches = list(filter(lambda s: s.is_library, npl.spectra))
print('found {} spectra'.format(len(spectra_with_gnps_matches)))

for spec in spectra_with_gnps_matches:
    # Explicit access: for GNPS annotations the value is a list holding a single dict keyed
    # by column name. For other annotation sources, where a spectrum ID may appear on
    # multiple rows, the list has one entry per line, each a similar dict.
    annotation_data = spec.annotations[GNPS_KEY][0]
    # Shortcut for the explicit access above
    annotation_data = spec.gnps_annotations

    print(spec)
    for key, value in annotation_data.items():
        print(' -- {} = {}'.format(key, value))

    # Report any annotations keyed by the carnegie_rosetta_hits.tsv file name
    crh = 'carnegie_rosetta_hits.tsv'
    if crh not in spec.annotations:
        continue
    print('Spectrum has {} rosetta hits'.format(len(spec.annotations[crh])))

In [None]:
# Generates scores for all objects with every enabled scoring method, so this can take a
# long time. random_count sets how many randomised Spectrum <=> Strain mappings are
# generated during the process.
dataset_processed = npl.process_dataset(random_count=10)
if not dataset_processed:
    raise Exception('Failed to process dataset')

print('Completed generating scores')

In [None]:
# Spectrum is needed for the type_ filter below but was never imported in this notebook,
# which made this cell fail with a NameError on a fresh kernel.
# NOTE(review): assumes the class lives in nplinker.metabolomics -- confirm against the
# installed nplinker version.
from nplinker.metabolomics import Spectrum

# To get results once the scores are generated, first select an object you're interested
# in, then call get_links with a specific scoring method. You can also pass a list of
# objects as the first parameter. The method returns a list which contains only those
# objects that satisfy the scoring criteria (so here only those with a significance
# percentile score of >= 99 as set above).
test_gcf = npl.gcfs[8]
results = npl.get_links(test_gcf, npl.scoring.metcalf)
if test_gcf not in results:
    print('No results found!')
else:
    print('Found results for {}!'.format(test_gcf))
    # To get the objects that scored highly against this GCF, use links_for_obj. By
    # default it will return all objects; the type_ parameter can be used to filter
    # by class, so here it will only return spectra.
    test_gcf_links = npl.links_for_obj(test_gcf, npl.scoring.metcalf, type_=Spectrum)

    # Print the objects and their scores, plus common strains
    for obj, score in test_gcf_links:
        print('{} : score {}'.format(obj, score))
        # Returns a dict indexed by (Spectrum, GCF) tuples, with the values being
        # lists of strain names shared between the two
        common_strains = npl.get_common_strains(test_gcf, obj)
        if common_strains:
            # only one pair was queried, so take the first value
            strain_names = list(common_strains.values())[0]
            print('   {} shared strains: {}'.format(len(strain_names), strain_names))
        else:
            print('   (no shared strains)')

    print('{} total links found'.format(len(test_gcf_links)))
        
    

## Rosetta-stone linking (nplinker version)

In [None]:
from nplinker.scoring.rosetta import rosetta

# Build the Rosetta object from the loaded dataset's directories and ID, then run it
# over the loaded spectra and BGCs.
rosetta_args = (npl.data_dir, npl.root_dir, npl.dataset_id)
ro = rosetta.Rosetta(*rosetta_args)
rhits = ro.run(npl.spectra, npl.bgcs)

rosetta_hit_count = len(rhits)
print('Rosetta hits: {}'.format(rosetta_hit_count))

In [None]:
# Broken at the moment.
# NOTE(review): the author flags this call as currently not working, so this cell may
# raise and stop a "Run All"; skip it until generate_bgc_summary_scores is fixed.
ro.generate_bgc_summary_scores()

In [None]:
import csv

# NOTE: _spec_hits is a private attribute of the Rosetta object; it is read directly here
# only to dump the hits for testing. It is iterated below as a mapping of spectrum -> hits.
spec_hits = ro._spec_hits

# Write this out as a .tsv file to test the DB loading.
# newline='' hands control of line endings to the csv module; without it, platforms that
# translate '\n' on write (e.g. Windows) end up with an extra '\r' on every row.
with open('carnegie_rosetta_hits.tsv', 'w', newline='') as f:
    writer = csv.writer(f, delimiter='\t')
    heads = ['#Scan#', 'GNPS_ID', 'Score']
    writer.writerow(heads)
    for spec, hits in spec_hits.items():
        for hit in hits:
            # one row per hit; presumably hit[0] is the GNPS ID and hit[1] the score,
            # going by the column headers -- TODO confirm
            writer.writerow([spec.spectrum_id, hit[0], hit[1]])

In [None]:
# Print every Rosetta hit. The hits returned by ro.run(...) are stored in `rhits`;
# the previously used name `rosetta_hits` is never defined in this notebook and raised
# a NameError.
for hit in rhits:
    print(hit)

## Todo:

- At the moment we get lots of hits per (GNPS, MIBiG) pair because they are in lots of BGCs
- We should also pass the scores (both of the spectral match and of the knownclusterblast match) through to the output
- Parameterise (at least) two parameters in the spectral matching: the score threshold and ms1_tol. At the moment, ms1_tol will only find things with near-identical MS1 m/z, which precludes analogues.
- The code for getting the knownclusterblast name and parsing the knownclusterblast file is horrific... :-)