In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
from nplinker.nplinker import NPLinker
%reload_ext autoreload
%autoreload 2



In [12]:
# NPLinker is configured in a notebook environment either from the name of a config file
# or from a dict that mirrors the structure of that file. Editing the file and passing
# its name is usually the easier option:
npl = NPLinker('latest_api_demo.toml')

# Creating the object attempts to discover the files to be loaded from the dataset and
# complains if they're not as expected. The data files themselves are loaded next.
data_loaded = npl.load_data()
if not data_loaded:
    raise Exception('Failed to load data')
 

10:11:11 [INFO] loader.py:280, Loaded global strain IDs (162 total)
10:11:11 [INFO] loader.py:288, Loaded dataset strain IDs (169 total)
Loaded 3107 molecules
/mnt/archive/nplinker_data/current/carnegie_mibig_27112019/quantification_table_reformatted/91b05b3ce3da4f3d82bd6cbe82e7f11d.csv
10:11:14 [INFO] loader.py:263, Loading provided annotation files (/mnt/archive/nplinker_data/current/carnegie_mibig_27112019/DB_result)
Found 1816 MiBIG json files
# MiBIG BGCs = 317, non-MiBIG BGCS = 1632, total bgcs = 1949, GCFs = 327, strains=1985
10:11:14 [INFO] genomics.py:334, Filtering MiBIG BGCs: removing 246 GCFs and 316 BGCs
# after filtering, total bgcs = 134, GCFs = 81, strains=170


In [13]:
# Scoring methods are defined and configured in the default configuration file at
# ~/.config/nplinker/nplinker.toml; the config file loaded above overrides those defaults,
# and the settings can still be changed after the NPLinker object has been created, e.g.:
scoring = npl.scoring

# show what is enabled before changing anything
print('Currently enabled scoring methods: {}'.format(scoring.enabled()))

# ensure only metcalf scoring is enabled, and set a 99% significance percentile threshold
scoring.likescore.enabled = False
# scoring.likescore.cutoff = <scoring cutoff threshold>
scoring.hg.enabled = False
# scoring.hg.prob = <probability threshold>
scoring.metcalf.enabled = True
scoring.metcalf.sig_percentile = 99

# ... and show the result of the changes
print('Currently enabled scoring methods: {}'.format(scoring.enabled()))

Currently enabled scoring methods: [namespace(enabled=True, name='metcalf', sig_percentile=99)]
Currently enabled scoring methods: [namespace(enabled=True, name='metcalf', sig_percentile=99)]


In [16]:
from nplinker.annotations import GNPS_KEY

# .is_library is true for a spectrum that has GNPS annotation data, so it can be
# used to pick out the annotated spectra
spectra_with_gnps_matches = list(filter(lambda s: s.is_library, npl.spectra))
print('found {} spectra'.format(len(spectra_with_gnps_matches)))

# name under which the carnegie_rosetta_hits.tsv annotations are looked up
crh = 'carnegie_rosetta_hits.tsv'

for spec in spectra_with_gnps_matches:
    # For GNPS annotations the entry is a list holding a single dict keyed by column
    # name. For other annotation sources, where a spectrum ID may appear on multiple
    # rows, the list has one entry per line, each a similar dict keyed by column name.
    annotation_data = spec.annotations[GNPS_KEY][0]
    # shortcut for the above
    annotation_data = spec.gnps_annotations
    print(spec)
    for k, v in annotation_data.items():
        print(' -- {} = {}'.format(k, v))
    # check for carnegie_rosetta_hits.tsv annotations
    if crh in spec.annotations:
        print('Spectrum has {} rosetta hits'.format(len(spec.annotations[crh])))

found 258 spectra
Organism = GNPS-LIBRARY
MQScore = 0.800337
SpectrumID = CCMSLIB00000531495
Compound_Name = Benzalkonium chloride (C12)
png_url = https://metabolomics-usi.ucsd.edu/png/?usi=mzspec:GNPSLIBRARY:CCMSLIB00000531495
json_url = https://metabolomics-usi.ucsd.edu/json/?usi=mzspec:GNPSLIBRARY:CCMSLIB00000531495
svg_url = https://metabolomics-usi.ucsd.edu/svg/?usi=mzspec:GNPSLIBRARY:CCMSLIB00000531495
spectrum_url = https://metabolomics-usi.ucsd.edu/spectrum/?usi=mzspec:GNPSLIBRARY:CCMSLIB00000531495
Organism = GNPS-NIST14-MATCHES
MQScore = 0.9491
SpectrumID = CCMSLIB00003138064
Compound_Name = Spectral Match to Ile-Ala from NIST14
png_url = https://metabolomics-usi.ucsd.edu/png/?usi=mzspec:GNPSLIBRARY:CCMSLIB00003138064
json_url = https://metabolomics-usi.ucsd.edu/json/?usi=mzspec:GNPSLIBRARY:CCMSLIB00003138064
svg_url = https://metabolomics-usi.ucsd.edu/svg/?usi=mzspec:GNPSLIBRARY:CCMSLIB00003138064
spectrum_url = https://metabolomics-usi.ucsd.edu/spectrum/?usi=mzspec:GNPSLIBR

In [None]:
# Generate scores for all objects with every enabled scoring method; this can be
# quite lengthy. random_count sets how many randomised instances of
# Spectrum <=> Strain mappings are generated during the process.
dataset_processed = npl.process_dataset(random_count=10)
if not dataset_processed:
    raise Exception('Failed to process dataset')
print('Completed generating scores')

In [None]:
# Spectrum is needed for the type_ filter further down; without this import the cell
# fails with a NameError on a fresh kernel.
# NOTE(review): import path assumed from the nplinker package layout -- confirm.
from nplinker.metabolomics import Spectrum

# to get results once the scores are generated, first select an object you're interested
# in, then call get_links with a specific scoring method. You can also pass a list of
# objects as the first parameter. The method returns a list which contains only those
# objects that satisfy the scoring criteria (so here only those with a significance
# percentile score of >= 99 as set above)
test_gcf = npl.gcfs[8]
results = npl.get_links(test_gcf, npl.scoring.metcalf)
if test_gcf not in results:
    print('No results found!')
else:
    print('Found results for {}!'.format(test_gcf))
    # to get the objects that scored highly against this GCF, use links_for_obj. By
    # default it will return all objects, the type_ parameter can be used to filter
    # by class, so here it will only return spectra
    test_gcf_links = npl.links_for_obj(test_gcf, npl.scoring.metcalf, type_=Spectrum)

    # print the objects and their scores, plus common strains
    for obj, score in test_gcf_links:
        print('{} : score {}'.format(obj, score))
        # returns a dict indexed by (Spectrum, GCF) tuples, with
        # the values being lists of strain names shared between the two
        common_strains = npl.get_common_strains(test_gcf, obj)
        if len(common_strains) > 0:
            strain_names = list(common_strains.values())[0]
            print('   {} shared strains: {}'.format(len(strain_names), strain_names))
        else:
            print('   (no shared strains)')

    print('{} total links found'.format(len(test_gcf_links)))
        
    

## Rosetta-stone linking

This is an example of how we would do the linking based upon Grimur's magic dictionary.

Firstly, make the spectral library object.

This makes use of code in my molnet repository.

The following is quite slow, so the SpecLib object could be pickled once and loaded back in instead of being rebuilt each time.

In [None]:
# make the molnet code importable before importing from it
sys.path.append('/Users/simon/git/molnet/code')
from spec_lib import SpecLib
from mnet import sqrt_normalise

# the following file can be found on uist at /srv/data/mibig-links/20190805/matched_mibig_gnps_update.mgf
mgf_file = '/Users/simon/git/molnet/lib/matched_mibig_gnps_update.mgf'

# build the spectral library from the MGF file, then load and filter it
s = SpecLib(mgf_file)
s._load_mgf()
s.filter()

Now do the spectral matching from the spectrum objects nplinker has loaded.

Note to Simon: might want to set the `ms1_tol` parameter of `spectral_match` to a high value to find analogues.

In [None]:
# parameters passed to SpecLib.spectral_match for every spectrum
MS1_TOL = 100
SCORE_THRESH = 0.5

# maps each spectrum with at least one library match to its list of hits
spec_hits = {}
for i, sp in enumerate(npl.spectra):
    # spectral matching reads sp.normalised_peaks, so create it when absent
    if not hasattr(sp, 'normalised_peaks'):
        sp.normalised_peaks = sqrt_normalise(sp.peaks)
    hits = s.spectral_match(sp, ms1_tol=MS1_TOL, score_thresh=SCORE_THRESH)
    if len(hits) > 0:
        spec_hits[sp] = hits
    # progress indicator: print the index of every 100th spectrum
    if i % 100 == 0:
        print(i)

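Spectral matching needs normalised peaks -- not sure where it's best to put this. The next cell defines a local `sqrt_normalise`; note that the matching cell above already calls `sqrt_normalise` (imported there from `mnet`), so this local definition only takes effect for cells run after it.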

In [None]:
import math
def sqrt_normalise(peaks):
    """Square-root transform peak intensities and scale them by sqrt(total intensity).

    Each intensity is replaced by sqrt(intensity) / sqrt(sum of all intensities),
    so the squares of the returned intensities sum to 1.

    Args:
        peaks: iterable of (mz, intensity) pairs.

    Returns:
        A new list of (mz, normalised_intensity) tuples in the input order. An
        empty input gives an empty list. When the total intensity is zero there
        is nothing to scale by, so the square-rooted intensities are returned
        unscaled instead of dividing by zero.

    Raises:
        ValueError: from math.sqrt if an intensity is negative.
    """
    total = 0.0
    rooted = []
    for mz, intensity in peaks:
        rooted.append((mz, math.sqrt(intensity)))
        total += intensity
    if total == 0:
        # avoids ZeroDivisionError for spectra whose peaks all have zero intensity
        return rooted
    norm_facc = math.sqrt(total)
    return [(mz, root / norm_facc) for mz, root in rooted]

In [None]:
# number of spectra that had at least one spectral library match
print(len(spec_hits))

Write this out as a .tsv file to test the DB loading

In [None]:
import csv

# Write one row per (spectrum, library hit) pair: spectrum id, hit[0] and hit[1],
# under the GNPS_ID and Score columns respectively.
# newline='' leaves line endings to the csv writer, as the csv module requires;
# without it every row ends in '\r\r\n' on platforms that translate '\n'.
with open('carnegie_rosetta_hits.tsv', 'w', newline='') as f:
    writer = csv.writer(f, delimiter='\t')
    heads = ['#Scan#', 'GNPS_ID', 'Score']
    writer.writerow(heads)
    for spec, hits in spec_hits.items():
        for hit in hits:
            writer.writerow([spec.spectrum_id, hit[0], hit[1]])

## load the rosetta stone

This file is available from /srv/data/mibig-links/20190805/matched_mibig_gnps_update.csv

In [None]:
import csv
rosetta_file = '/Users/simon/git/molnet/lib/matched_mibig_gnps_update.csv'

# Two-way lookup built from the rosetta table: column 0 is used as the GNPS id and
# column 3 as the MiBIG id; each id maps to the list of ids it is paired with.
gnps2mibig = {}
mibig2gnps = {}

# newline='' is what the csv module expects, so that newlines embedded in quoted
# fields are interpreted correctly
with open(rosetta_file, 'r', newline='') as f:
    reader = csv.reader(f)
    heads = next(reader)  # first row is the header
    for line in reader:
        if not line:
            # csv.reader yields [] for blank lines; indexing them would fail
            continue
        gnps = line[0]
        mibig = line[3]
        gnps2mibig.setdefault(gnps, []).append(mibig)
        mibig2gnps.setdefault(mibig, []).append(gnps)

In [None]:
def _kcb_read_block(lines, stop_prefix, kcb_file):
    """Collect non-blank, right-stripped lines until one starts with stop_prefix.

    The terminating line is consumed but not returned. Raises ValueError
    (naming kcb_file) if the input ends before the terminating line is seen.
    """
    block = []
    for line in lines:
        if line.startswith(stop_prefix):
            return block
        stripped = line.rstrip()
        if stripped:
            block.append(stripped)
    raise ValueError('{}: reached end of file before a line starting with {!r}'.format(
        kcb_file, stop_prefix))


def parse_kcb(kcb_file):
    """Parse a knownclusterblast text file into per-MiBIG-BGC gene hit lists.

    The file is read as consecutive sections: the query cluster gene table
    (starting at the first 'Table of genes' line), the 'Significant hits' list,
    and a 'Details' section in which each hit is introduced by a '>>' line.

    Args:
        kcb_file: path of the knownclusterblast text file.

    Returns:
        None when the file contains no '>>' detail entries. Otherwise a dict
        mapping each BGC id from the details section to a list of dicts, one per
        row of that entry's Blast hit table, with keys:
            'source_bgc_gene'  - first column (query gene)
            'mibig_bgc_gene'   - second column (subject gene)
            'identity_percent' - third column, as int
            'blast_score'      - fourth column, as int
            'all_bgc_genes'    - set of gene ids from the query cluster table
            'all_mibig_genes'  - set of gene ids from the entry's subject
                                 cluster table

    Raises:
        ValueError: if the file ends before the expected section headers, or if
            a detail entry lacks one of its two table header lines.
        AssertionError: if detail entries are not in the same order as the
            'Significant hits' list.
    """
    subject_table_header = 'Table of genes, locations, strands and annotations of subject cluster:'
    blast_table_header = 'Table of Blast hits (query gene, subject gene, %identity, blast score, %coverage, e-value):'

    with open(kcb_file, 'r') as f:
        # skip everything up to and including the query cluster table header
        _kcb_read_block(f, 'Table of genes', kcb_file)
        # first block: the genes of the query cluster
        top_block = _kcb_read_block(f, 'Significant', kcb_file)
        # second block: the list of significant hits
        second_block = _kcb_read_block(f, 'Details', kcb_file)

        # advance to the first detail entry; a file without one has no hits
        for line in f:
            if line.startswith('>>'):
                break
        else:
            return None

        # split the remainder into one list of non-blank lines per '>>' entry
        details = []
        current = []
        for line in f:
            if line.startswith('>>'):
                details.append(current)
                current = []
            else:
                stripped = line.rstrip()
                if stripped:
                    current.append(stripped)
        if current:
            details.append(current)

    # firstly, extract the genes from the BGC -- stored in the first block
    bgc_genes = set(line.split()[0] for line in top_block)

    # secondly, extract the BGCs that are mentioned in the significant hits list
    mibig_bgcs = []
    for line in second_block:
        tokens = line.split()
        mibig_bgcs.append((tokens[1], tokens[2]))

    hits = {}
    for i, detail in enumerate(details):
        current_bgc_id = detail[0].split()[1]
        # they should be in the same order as the significant hits list
        assert current_bgc_id == mibig_bgcs[i][0]
        table_pos = detail.index(subject_table_header)
        pos = detail.index(blast_table_header)
        # gene ids of the subject cluster: first column of the rows between the
        # two table headers
        mibig_genes = set(line.split()[0] for line in detail[table_pos + 1:pos])
        gene_hits = []
        for line in detail[pos + 1:]:
            tokens = line.split()
            gene_hits.append({
                'source_bgc_gene': tokens[0],
                'mibig_bgc_gene': tokens[1],
                'identity_percent': int(tokens[2]),
                'blast_score': int(tokens[3]),
                'all_bgc_genes': bgc_genes,
                'all_mibig_genes': mibig_genes,
            })
        hits[current_bgc_id] = gene_hits
    return hits

In [None]:
# os is not imported by the notebook's import cell, so bring it in here
import os

# maps each BGC to the parse_kcb result of its knownclusterblast file
bgc_hits = {}
missing_kcb_files = 0
for bgc in npl.bgcs:
    bgc_file = bgc.antismash_file
    if not bgc_file:
        continue
    # the knownclusterblast output sits in a subdirectory next to the genbank file
    base_path = os.path.join(os.path.dirname(bgc_file), 'knownclusterblast')
    genbank_file = os.path.basename(bgc_file)
    # turn '<name>.regionXXX.<ext>' into '<name>_cX.txt'; splitting on the last
    # '.region' keeps names that contain 'region' elsewhere intact
    start_name, marker, region_part = genbank_file.rpartition('.region')
    if not marker:
        # not a region file, so there is no file name to derive
        continue
    number = int(region_part.split('.')[0])
    kcb_name = os.path.join(base_path, '{}_c{}.txt'.format(start_name, number))
    if not os.path.isfile(kcb_name):
        # skip instead of aborting the whole loop on one absent file
        missing_kcb_files += 1
        continue
    hits = parse_kcb(kcb_name)
    if hits:
        bgc_hits[bgc] = hits
print("{} BGCs have one or more known cluster blast hits".format(len(bgc_hits)))
if missing_kcb_files:
    print("{} BGCs skipped: knownclusterblast file not found".format(missing_kcb_files))

In [None]:
# make a reverse dictionary: MiBIG BGC id -> set of BGCs that have a hit against it
mibig2bgc = {}
for bgc, kcb_hits in bgc_hits.items():
    for mibig_bgc_id in kcb_hits:
        mibig2bgc.setdefault(mibig_bgc_id, set()).add(bgc)

In [None]:
# Chain the lookups spectrum -> GNPS id -> MiBIG id -> BGC; every complete chain is
# recorded as a (spectrum, gnps_id, mibig_id, bgc) tuple.
rosetta_hits = []
for spec, matches in spec_hits.items():
    for gnps_id, score in matches:
        # .get: a GNPS id missing from the rosetta table simply yields no links
        # instead of raising KeyError
        for mibig_id in gnps2mibig.get(gnps_id, ()):
            for bgc in mibig2bgc.get(mibig_id, ()):
                rosetta_hits.append((spec, gnps_id, mibig_id, bgc))
print("Found {} rosetta hits".format(len(rosetta_hits)))

In [None]:
# one line per hit: spectrum <--> BGC, followed by the GNPS and MiBIG ids linking them
for spec, gnps_id, mibig_id, bgc in rosetta_hits:
    print(spec,"<-->",bgc," via (",gnps_id,",",mibig_id,")")

## Todo:

- At the moment we get lots of hits per (GNPS, MiBIG) pair because the same pair can be linked to lots of BGCs.
- We should also percolate the scores (both of the spectral match and of the knownclusterblast hit) through to the output.
- Parameterise (at least) two parameters in the spectral matching: the score threshold and `ms1_tol`. At the moment, `ms1_tol` will only find things with near-identical MS1 m/z, which precludes analogues.
- The code for getting the knownclusterblast file name and parsing the knownclusterblast file is horrific... :-)

In [None]:
def process_bgc_hit(hit):
    """Compress the knownclusterblast hits of one BGC into one score per MiBIG entry.

    For each MiBIG entry the identity percentages of all its gene hits are
    summed (as fractions) and divided by the number of genes in the MiBIG
    entry, i.e. the score represents how much of the MiBIG entry is reflected
    in the source BGC.

    Args:
        hit: dict mapping a MiBIG BGC id to a list of gene-hit dicts. Each dict
            must provide 'identity_percent', and the first dict of each list
            must provide 'all_mibig_genes' (a sized collection of gene ids).

    Returns:
        dict mapping each MiBIG BGC id to its score. Entries with no gene hits,
        or with an empty 'all_mibig_genes', score 0.0. Scores can exceed 1 when
        more hit rows than MiBIG genes contribute to the sum.

    Raises:
        KeyError: if a gene-hit dict lacks one of the required keys.
    """
    scores = {}
    for mibig_id, gene_hits in hit.items():
        if not gene_hits:
            # no hit rows, so nothing of this MiBIG entry is reflected
            scores[mibig_id] = 0.0
            continue
        n_mibig_genes = len(gene_hits[0]['all_mibig_genes'])
        if n_mibig_genes == 0:
            # avoid dividing by zero when the gene set is empty
            scores[mibig_id] = 0.0
            continue
        total_hit_identity = sum(
            hit_gene['identity_percent'] / 100.0 for hit_gene in gene_hits)
        scores[mibig_id] = total_hit_identity / n_mibig_genes
    return scores
# bgc_hit_summary_scores maps each BGC to the result of process_bgc_hit: one entry
# per MiBIG BGC linked to that BGC, each with a single summary score
bgc_hit_summary_scores = {
    bgc: process_bgc_hit(hit) for bgc, hit in bgc_hits.items()
}