In [24]:
import sys
from nplinker.nplinker import NPLinker
%reload_ext autoreload
%autoreload 2

In [25]:
# NPLinker can be configured in a notebook either from the name of a config file or
# from a dict mirroring the structure of that file. Editing the file and passing its
# name, as done here, is usually the easier option.
npl = NPLinker('latest_api_demo_sr.toml')

# Constructing the object attempts to discover the dataset files and complains if they
# are not as expected; load_data() then actually reads them. A falsy return value is
# treated as a failure and stops the notebook here.
loaded_ok = npl.load_data()
if not loaded_ok:
    raise Exception('Failed to load data')
 

12:58:10 [INFO] loader.py:303, Loaded global strain IDs (162 total)
12:58:10 [INFO] loader.py:311, Loaded dataset strain IDs (169 total)
12:58:11 [INFO] metabolomics.py:230, load_spectra loaded 3107 molecules
12:58:11 [INFO] metabolomics.py:410, Merged nodes data (new-style), total lines = 3107
12:58:13 [INFO] loader.py:286, Loading provided annotation files (/Users/simon/nplinker_datasets/carnegie_nomibig_27112019/DB_result)
12:58:13 [INFO] genomics.py:365, Found 1816 MiBIG json files
12:58:14 [INFO] genomics.py:272, # MiBIG BGCs = 0, non-MiBIG BGCS = 1949, total bgcs = 1949, GCFs = 83, strains=1985
12:58:14 [INFO] genomics.py:328, Filtering MiBIG BGCs: removing 0 GCFs and 0 BGCs
12:58:14 [INFO] genomics.py:279, # after filtering, total bgcs = 133, GCFs = 83, strains=169
12:58:14 [INFO] loader.py:164, Strains filtered down to total of 17


In [None]:
# Scoring methods are defined and configured in the default configuration file at
# ~/.config/nplinker/nplinker.toml, are overridden by the config file loaded above,
# and can also be changed freely once the NPLinker object exists, as shown here.

enabled_msg = 'Currently enabled scoring methods: {}'
scoring = npl.scoring

# show the state before making any changes
print(enabled_msg.format(scoring.enabled()))

# leave only metcalf scoring enabled, with a 99% significance percentile threshold
scoring.likescore.enabled = False
# scoring.likescore.cutoff = <scoring cutoff threshold>
scoring.hg.enabled = False
# scoring.hg.prob = <probability threshold>
scoring.metcalf.enabled = True
scoring.metcalf.sig_percentile = 99

# ...and the state after
print(enabled_msg.format(scoring.enabled()))

In [None]:
from nplinker.annotations import GNPS_KEY

# Spectrum.is_library is true when the spectrum has GNPS annotation data, so it can
# be used to pick out the annotated spectra.
spectra_with_gnps_matches = list(filter(lambda s: s.is_library, npl.spectra))
print('found {} spectra'.format(len(spectra_with_gnps_matches)))

# name under which carnegie_rosetta_hits.tsv annotations are looked up
crh = 'carnegie_rosetta_hits.tsv'

for spec in spectra_with_gnps_matches:
    # For GNPS annotations this is a list holding a single dict keyed by column name.
    # For other annotation sources, where a spectrum ID may appear on multiple rows,
    # there is one list entry per line, each a similar dict keyed by column name.
    annotation_data = spec.annotations[GNPS_KEY][0]
    # shortcut for the lookup above
    annotation_data = spec.gnps_annotations

    print(spec)
    for column_and_value in annotation_data.items():
        print(' -- {} = {}'.format(*column_and_value))

    # report any carnegie_rosetta_hits.tsv annotations attached to this spectrum
    if crh in spec.annotations:
        print('Spectrum has {} rosetta hits'.format(len(spec.annotations[crh])))

In [None]:
# Generates scores for all objects with every enabled scoring method, so this can be
# quite lengthy. random_count sets how many randomised Spectrum <=> Strain mappings
# are generated during the process.
processed_ok = npl.process_dataset(random_count=10)
if not processed_ok:
    raise Exception('Failed to process dataset')

print('Completed generating scores')

In [None]:
# Spectrum is used as the type_ filter below. It is imported here so that the cell
# runs on a fresh kernel instead of failing with a NameError.
from nplinker.metabolomics import Spectrum

# to get results once the scores are generated, first select an object you're interested
# in, then call get_links with a specific scoring method. You can also pass a list of
# objects as the first parameter. The method returns a list which contains only those
# objects that satisfy the scoring criteria (so here only those with a significance
# percentile score of >= 99 as set above)
test_gcf = npl.gcfs[8]
results = npl.get_links(test_gcf, npl.scoring.metcalf)
if test_gcf not in results:
    print('No results found!')
else:
    print('Found results for {}!'.format(test_gcf))
    # to get the objects that scored highly against this GCF, use links_for_obj. By
    # default it will return all objects, the type_ parameter can be used to filter
    # by class, so here it will only return spectra
    test_gcf_links = npl.links_for_obj(test_gcf, npl.scoring.metcalf, type_=Spectrum)

    # print the objects and their scores, plus common strains
    for obj, score in test_gcf_links:
        print('{} : score {}'.format(obj, score))
        # returns a dict indexed by (Spectrum, GCF) tuples, with
        # the values being lists of strain names shared between the two
        common_strains = npl.get_common_strains(test_gcf, obj)
        if len(common_strains) > 0:
            # a single (GCF, object) pair was queried, so take the first (only) entry
            strain_names = list(common_strains.values())[0]
            print('   {} shared strains: {}'.format(len(strain_names), strain_names))
        else:
            print('   (no shared strains)')

    print('{} total links found'.format(len(test_gcf_links)))
        
    

## Rosetta-stone linking (nplinker version)

In [40]:
from nplinker.scoring.rosetta import rosetta

# Build the Rosetta linker for the currently loaded dataset.
# NOTE(review): ignore_genomic_cache=True presumably forces the genomic side to be
# recomputed rather than read from a cache - confirm against rosetta.py.
ro = rosetta.Rosetta(
    npl.data_dir,
    npl.root_dir,
    npl.dataset_id,
    ignore_genomic_cache=True,
)

# run() is given every spectrum and BGC in the dataset and returns the hits found
rhits = ro.run(npl.spectra, npl.bgcs)
hit_count = len(rhits)
print('Rosetta hits: {}'.format(hit_count))

14:20:38 [INFO] rosetta.py:178, Found pickled SpecLib for dataset !
14:20:38 [INFO] rosetta.py:90, Searching for spectral hits 0/3107
14:20:39 [INFO] rosetta.py:90, Searching for spectral hits 100/3107
14:20:40 [INFO] rosetta.py:90, Searching for spectral hits 200/3107
14:20:41 [INFO] rosetta.py:90, Searching for spectral hits 300/3107
14:20:43 [INFO] rosetta.py:90, Searching for spectral hits 400/3107
14:20:44 [INFO] rosetta.py:90, Searching for spectral hits 500/3107
14:20:45 [INFO] rosetta.py:90, Searching for spectral hits 600/3107
14:20:46 [INFO] rosetta.py:90, Searching for spectral hits 700/3107
14:20:48 [INFO] rosetta.py:90, Searching for spectral hits 800/3107
14:20:49 [INFO] rosetta.py:90, Searching for spectral hits 900/3107
14:20:50 [INFO] rosetta.py:90, Searching for spectral hits 1000/3107
14:20:51 [INFO] rosetta.py:90, Searching for spectral hits 1100/3107
14:20:52 [INFO] rosetta.py:90, Searching for spectral hits 1200/3107
14:20:53 [INFO] rosetta.py:90, Searching for sp

In [55]:
# list every Rosetta hit returned by ro.run(), one per line
for rosetta_hit in rhits:
    print(rosetta_hit)

RosettaHit: 66209<-->KRD175.Scaffold_2.region003 via (CCMSLIB00000565239 (0.505), BGC0000893 (0.062))
RosettaHit: 66209<-->KRD197.Scaffold_16.region002 via (CCMSLIB00000565239 (0.505), BGC0000893 (0.065))
RosettaHit: 66209<-->KRD162.Scaffold_3.region002 via (CCMSLIB00000565239 (0.505), BGC0000893 (0.062))
RosettaHit: 66209<-->KRD162.Scaffold_15.region002 via (CCMSLIB00000565239 (0.505), BGC0000893 (0.057))
RosettaHit: 66494<-->KRD197.Scaffold_4.region001 via (CCMSLIB00000569825 (0.519), BGC0000209 (0.069))
RosettaHit: 69424<-->KRD175.Scaffold_2.region003 via (CCMSLIB00000567446 (0.655), BGC0000893 (0.062))
RosettaHit: 69424<-->KRD197.Scaffold_16.region002 via (CCMSLIB00000567446 (0.655), BGC0000893 (0.065))
RosettaHit: 69424<-->KRD162.Scaffold_3.region002 via (CCMSLIB00000567446 (0.655), BGC0000893 (0.062))
RosettaHit: 69424<-->KRD162.Scaffold_15.region002 via (CCMSLIB00000567446 (0.655), BGC0000893 (0.057))
RosettaHit: 69424<-->KRD175.Scaffold_2.region003 via (CCMSLIB00000205127 (0.56

In [39]:
# broken atm
# first key of the raw BGC hits; not used again within this cell
k = list(ro._bgc_hits.keys())[0]

# generate_bgc_summary_scores() is consumed below as a two-level mapping:
# outer key -> {inner key -> score}
summary_scores = ro.generate_bgc_summary_scores()
for bgc, mibig_scores in summary_scores.items():
    for mibig_id, score in mibig_scores.items():
        print(bgc, '\t', mibig_id, score)

BGC(name=KRD026.Scaffold_10.region001, strain=Strain(KRD026) [1 aliases]) 	 BGC0000362 0.3694871794871795
BGC(name=KRD012.Scaffold_7.region001, strain=Strain(KRD012) [1 aliases]) 	 BGC0000717 0.09746987951807229
BGC(name=KRD012.Scaffold_7.region001, strain=Strain(KRD012) [1 aliases]) 	 BGC0000431 0.19818181818181815
BGC(name=KRD012.Scaffold_7.region001, strain=Strain(KRD012) [1 aliases]) 	 BGC0001752 0.13722222222222225
BGC(name=KRD022.Scaffold_2.region001, strain=Strain(KRD022) [1 aliases]) 	 BGC0000717 0.013253012048192772
BGC(name=KRD026.Scaffold_13.region001, strain=Strain(KRD026) [1 aliases]) 	 BGC0000717 0.02939759036144578
BGC(name=KRD070.Scaffold_12.region001, strain=Strain(KRD070) [1 aliases]) 	 BGC0000717 0.08807228915662649
BGC(name=KRD070.Scaffold_12.region001, strain=Strain(KRD070) [1 aliases]) 	 BGC0000431 0.1981818181818182
BGC(name=KRD070.Scaffold_12.region001, strain=Strain(KRD070) [1 aliases]) 	 BGC0001752 0.13722222222222225
BGC(name=KRD077.Scaffold_20.region001, str

In [37]:
# Inspect the raw hit data behind one summary score: take the second-to-last key of
# summary_scores and look up its entry for BGC0000644.
scored_keys = list(summary_scores.keys())
hit_details = ro._bgc_hits[scored_keys[-2]]['BGC0000644']

# the individual gene-level hits, then the two gene collections stored alongside them
individual_hits = hit_details['individual_hits']
print(individual_hits)
print(hit_details['all_mibig_genes'])
print(hit_details['all_bgc_genes'])

[{'source_bgc_gene': 'ctg19_101', 'mibig_bgc_gene': 'ABD24398.1', 'identity_percent': 76, 'blast_score': 530}, {'source_bgc_gene': 'ctg19_102', 'mibig_bgc_gene': 'ABD24399.1', 'identity_percent': 89, 'blast_score': 510}, {'source_bgc_gene': 'ctg19_103', 'mibig_bgc_gene': 'ABD24400.1', 'identity_percent': 88, 'blast_score': 967}, {'source_bgc_gene': 'ctg19_104', 'mibig_bgc_gene': 'ABD24401.1', 'identity_percent': 84, 'blast_score': 180}, {'source_bgc_gene': 'ctg19_105', 'mibig_bgc_gene': 'ABD24402.1', 'identity_percent': 87, 'blast_score': 723}, {'source_bgc_gene': 'ctg19_106', 'mibig_bgc_gene': 'ABD24403.1', 'identity_percent': 80, 'blast_score': 384}]
['ABD24398.1', 'ABD24399.1', 'ABD24400.1', 'ABD24401.1', 'ABD24402.1']
{'ctg19_93', 'ctg19_100', 'ctg19_104', 'ctg19_101', 'ctg19_102', 'ctg19_99', 'ctg19_103', 'ctg19_111', 'ctg19_95', 'ctg19_109', 'ctg19_106', 'ctg19_108', 'ctg19_98', 'ctg19_105', 'ctg19_96', 'ctg19_94', 'ctg19_110', 'ctg19_107', 'ctg19_97'}


In [None]:
import csv

# Spectral hits collected by the Rosetta run; iterated below as a mapping from a
# spectrum object to a sequence of hits.
spec_hits = ro._spec_hits

# Write this out as a .tsv file to test the DB loading.
# newline='' is what the csv module requires for files handed to csv.writer; without
# it, platforms that translate '\n' on write end up with an extra '\r' per row.
with open('carnegie_rosetta_hits.tsv', 'w', newline='') as f:
    writer = csv.writer(f, delimiter='\t')
    heads = ['#Scan#', 'GNPS_ID', 'Score']
    writer.writerow(heads)
    for spec, hits in spec_hits.items():
        for hit in hits:
            # hit[0] and hit[1] are written to the GNPS_ID and Score columns respectively
            writer.writerow([spec.spectrum_id, hit[0], hit[1]])

In [None]:
# Print every Rosetta hit. The results of ro.run() are stored in `rhits`; the name
# `rosetta_hits` is never defined in this notebook and raised a NameError.
for hit in rhits:
    print(hit)

## Todo:

- At the moment we get lots of hits per (GNPS, MiBIG) pair, because the same pair is reached via lots of different BGCs
- We also should percolate the scores (both of the spectral match and the knownclusterblast) to the output
- Parameterise (at least) two settings in the spectral matching: the score threshold and `ms1_tol`. At the moment, `ms1_tol` will only find things with near-identical MS1 m/z, which precludes analogues.
- The code for getting the knownclusterblast name and parsing the knownclusterblast file is horrific... :-)