# Summary
Prospect some ways to load CANOPUS output, load MIBiG known class links, and establish class linking (scores) in the NPLinker object. To try this, we use a version of the Crusemann dataset (see Crüsemann et al. (2016) or MolNetEnhancer paper). Many parts of this notebook originate from the demo notebook.

In [324]:
import sys, csv, os
# if running from clone of the git repo
sys.path.append('../prototype')

# import the main NPLinker class. normally this all that's required to work
# with NPLinker in a notebook environment
import pandas as pd
import glob
from nplinker.nplinker import NPLinker
from collections import Counter

In [2]:
# load local crusemann data
npl = NPLinker({'dataset': {'root': '/mnt/scratch/louwe015/NPLinker/own/nplinker_shared/crus_full_new_AS3_03-09/'}})
npl.load_data()

11:15:43 [INFO] config.py:121, Loading from local data in directory /mnt/scratch/louwe015/NPLinker/own/nplinker_shared/crus_full_new_AS3_03-09/
11:15:43 [INFO] loader.py:80, Trying to discover correct bigscape directory under /mnt/scratch/louwe015/NPLinker/own/nplinker_shared/crus_full_new_AS3_03-09/bigscape
11:15:43 [INFO] loader.py:83, Found network files directory: /mnt/scratch/louwe015/NPLinker/own/nplinker_shared/crus_full_new_AS3_03-09/bigscape/network_files/2021-07-16_08-32-34_hybrids_glocal
11:15:43 [INFO] loader.py:212, Updating bigscape_dir to discovered location /mnt/scratch/louwe015/NPLinker/own/nplinker_shared/crus_full_new_AS3_03-09/bigscape/network_files/2021-07-16_08-32-34_hybrids_glocal
11:15:43 [INFO] loader.py:571, Loaded global strain IDs (0 total)
11:15:43 [INFO] loader.py:582, Loaded dataset strain IDs (159 total)
11:15:50 [INFO] metabolomics.py:699, 13667 molecules parsed from MGF file
11:15:52 [INFO] metabolomics.py:716, Found older-style GNPS dataset, no quanti

True

In [3]:
# Basic functionality
# ===================
#
# Once you have an NPLinker object with all data loaded, there are a collection of simple
# methods and properties you can use to access objects and metadata. Some examples are 
# given below, see https://nplinker.readthedocs.io/en/latest/ for a complete API description.

# configuration/dataset metadata
# - a copy of the configuration as parsed from the .toml file (dict)
print(npl.config) 
# - the path to the directory where various nplinker data files are located (e.g. the 
#   default configuration file template) (str)
print(npl.data_dir)
# - a dataset ID, derived from the path for local datasets or the paired platform ID
#   for datasets loaded from that source (str)
print(npl.dataset_id)
# - the root directory for the current dataset (str)
print(npl.root_dir)

# objects
# - you can directly access lists of each of the 4 object types:
print('BGCs:', len(npl.bgcs))
print('GCFs:', len(npl.gcfs)) # contains GCF objects
print('Spectra:', len(npl.spectra)) # contains Spectrum objects
print('Molecular Families:', len(npl.molfams)) # contains MolecularFamily objects

{'loglevel': 'INFO', 'logfile': '', 'log_to_stdout': True, 'repro_file': '', 'dataset': {'root': '/mnt/scratch/louwe015/NPLinker/own/nplinker_shared/crus_full_new_AS3_03-09/', 'overrides': {}, 'platform_id': ''}, 'antismash': {'antismash_format': 'default', 'ignore_spaces': False}, 'docker': {'run_bigscape': True, 'extra_bigscape_parameters': ''}, 'webapp': {'tables_metcalf_threshold': 2.0}, 'scoring': {'rosetta': {}}}
../prototype/nplinker/data

/mnt/scratch/louwe015/NPLinker/own/nplinker_shared/crus_full_new_AS3_03-09/
BGCs: 5905
GCFs: 1263
Spectra: 13667
Molecular Families: 8346


In [4]:
mc = npl.scoring_method('metcalf')

# Now mc is an instance of the class that implements Metcalf scoring. Once
# you have such an instance, you may change any of the parameters it exposes.
# In the case of Metcalf scoring, the following parameters are currently exposed:
# - cutoff (float): the scoring threshold. Links with scores less than this are excluded
# - standardised (bool): set to True to use standardised scores (default), False for regular
mc.cutoff = 2.5
mc.standardised = True

results = npl.get_links(npl.gcfs, mc, and_mode=True) 

# get_links returns an instance of a class called LinkCollection. This provides a wrapper
# around the results of the scoring operation and has various useful properties/methods:
#
# - len(results) or .source_count will tell you how many of the input_objects were found to have links
print('Number of results: {}'.format(len(results)))
# - .sources is a list of those objects
objects_with_links = results.sources
# - .links is a dict with structure {input_object: {linked_object: ObjectLink}} 
objects_and_link_info = results.links
# - .get_all_targets() will return a flat list of *all* the linked objects (for all sources)
all_targets = results.get_all_targets() 
# - .methods is a list of the scoring methods passed to get_links
methods = results.methods

11:21:14 [INFO] methods.py:436, MetcalfScoring.setup (bgcs=5905, gcfs=1263, spectra=13667, molfams=8346, strains=142)
11:21:24 [INFO] methods.py:456, MetcalfScoring.setup invalidating cached data!
11:21:24 [INFO] methods.py:465, MetcalfScoring.setup preprocessing dataset (this may take some time)
11:25:35 [INFO] methods.py:475, MetcalfScoring.setup completed
Number of results: 1263


In [78]:
spec_results = npl.get_links(npl.spectra, mc, and_mode=True)
print('Number of results: {}'.format(len(spec_results)))

Number of results: 11978


In [5]:
### strange there are only 48 strains... -> ah there are only spectra for 48 strains so that makes sense
# not applicable anymore with new crusemann version (current)
b_strains = [bgc.strain.id for bgc in npl.bgcs]
bs_set = set(b_strains)
s_strains = [list(spec.strains) for spec in npl.spectra]
s_strains = [strain.id for s in s_strains for strain in s if s]
ss_set = set(s_strains)
len(bs_set), len(ss_set)

(142, 142)

In [109]:
mf_results = npl.get_links(npl.molfams, mc, and_mode=True)

ValueError: setting an array element with a sequence.

## Reading canopus output
Expect files to be present in the data folder that are called:
- cluster_index_classifications.txt -> for the spectra (cluster indices in gnps)
- component_index_classifications.txt -> for the molfams (component indices in gnps)

For now just read the files to some dicts

TODO:
Alter canopus output such that header names have cf_ and npc_ in front of them

In [110]:
[bgc.product_prediction for bgc in npl.bgcs][:10]

['nrps',
 'nrps.t1pks.otherks',
 'cf_fatty_acid.nrps.t1pks',
 'nrps',
 'nrps',
 'nrps',
 'nrps',
 'nrps',
 'nrps',
 'nrps.t1pks']

In [7]:
npl.root_dir

'/mnt/scratch/louwe015/NPLinker/own/nplinker_shared/crus_full_new_AS3_03-09/'

In [15]:
exampl_s = 133415
exampl_mf = 519
c_spec = None
for spec in npl.spectra:
    if spec.spectrum_id == exampl_s:  # 21-hydroxyrosamicin hit from MF 519
        c_spec = spec
c_spec.metadata

{'precursormass': 580.34497,
 'parentintensity': None,
 'charge': 0,
 'mslevel': '2',
 'precursorintensity': '267664.000000',
 'filename': 'specs_ms.pklbin',
 'parentrt': 822.603027,
 'activation': 'CID',
 'instrument': 'ion trap',
 'title': 'Scan Number: 133415',
 'scans': '133415',
 'parentmass': 580.34497,
 'singlechargeprecursormass': 580.34497,
 'cluster_index': 133415,
 'files': {'CNT360_A1_B.mzXML': 'CNT360_A1_B.mzXML',
  'CNT360_A1_M.mzXML': 'CNT360_A1_M.mzXML',
  'S237-3-Bu.mzXML': 'S237-3-Bu.mzXML'}}

In [31]:
ci_classes_file = os.path.join(npl.root_dir, 'cluster_index_classifications.txt')
os.path.exists(ci_classes_file)
ci_classes = {} # for now make a dict {ci: [[(class,score)]]}
with open(ci_classes_file) as inf:
    ci_classes_header = inf.readline().strip().split("\t")
    print(ci_classes_header)
    for line in inf:
        line = line.strip('\n').split("\t")
        classes_list = []
        for lvl in line[3:]:
            lvl_list = []
            for l_class in lvl.split("; "):
                if l_class:
                    l_class = l_class.split(":")
                    c_tup = tuple([l_class[0], float(l_class[1])])
                else:
                    c_tup = None  # default value for class value
                lvl_list.append(c_tup)
            classes_list.append(lvl_list)
        ci_classes[line[1]] = classes_list

print(line)
len(classes_list), len(ci_classes_header[3:])  #example

['componentindex', 'cluster index', 'formula', 'kingdom', 'superclass', 'class', 'subclass', 'level 5', 'level 6', 'level 7', 'level 8', 'level 9', 'level 10', 'level 11', 'pathway', 'superclass', 'class']
['7161', '186177', 'C46H65N3O3', 'Organic compounds:1.000', 'Lipids and lipid-like molecules:0.815; Organic acids and derivatives:0.811; Organic oxygen compounds:1.000; Hydrocarbon derivatives:1.000; Organic nitrogen compounds:1.000; Organopnictogen compounds:0.917', 'Fatty Acyls:0.718; Carboxylic acids and derivatives:0.671; Organooxygen compounds:0.995; Organic oxides:0.843; Organonitrogen compounds:1.000', 'Fatty amides:0.787; Carboxylic acid derivatives:0.641; Carbonyl compounds:0.765', 'N-acyl amines:0.645; Carboxylic acid amides:0.710', 'Secondary carboxylic acid amides:0.758', '', '', '', '', '', 'Alkaloids:0.826', 'Tryptophan alkaloids:0.393', '']


(14, 14)

In [37]:
ci_classes[str(exampl_s)]

[[('Organic compounds', 1.0)],
 [('Organic oxygen compounds', 1.0),
  ('Organic nitrogen compounds', 1.0),
  ('Organoheterocyclic compounds', 0.894),
  ('Hydrocarbon derivatives', 1.0),
  ('Organopnictogen compounds', 0.993)],
 [('Organooxygen compounds', 0.991),
  ('Organonitrogen compounds', 1.0),
  ('Oxanes', 0.928),
  ('Oxacyclic compounds', 0.872)],
 [('Carbohydrates and carbohydrate conjugates', 0.991),
  ('Alcohols and polyols', 0.986),
  ('Cyclohexylamines', 0.778),
  ('Amines', 0.976),
  ('Ethers', 0.902)],
 [('Aminosaccharides', 0.984),
  ('Glycosyl compounds', 0.948),
  ('Cyclic alcohols and derivatives', 0.589),
  ('Secondary alcohols', 0.937),
  ('Monosaccharides', 0.664),
  ('Alkanolamines', 0.891),
  ('Acetals', 0.966),
  ('Primary alcohols', 0.583),
  ('Primary amines', 0.846)],
 [('Aminoglycosides', 0.981),
  ('O-glycosyl compounds', 0.928),
  ('Cyclitols and derivatives', 0.582),
  ('Cyclohexanols', 0.736),
  ('1,2-aminoalcohols', 0.905),
  ('Monoalkylamines', 0.864)]

In [35]:
# for component indices
compi_classes_file = os.path.join(npl.root_dir, 'component_index_classifications.txt')
os.path.exists(compi_classes_file)
compi_classes = {} # for now make a dict {ci: [[(class,score)]]}
with open(compi_classes_file) as inf:
    compi_classes_header = inf.readline().strip().split("\t")
    print(compi_classes_header)
    for line in inf:
        line = line.strip('\n').split("\t")
        classes_list = []
        for lvl in line[2:]:
            lvl_list = []
            for l_class in lvl.split("; "):
                if l_class:
                    l_class = l_class.split(":")
                    c_tup = tuple([l_class[0], float(l_class[1])])
                else:
                    c_tup = None  # default value for class value
                lvl_list.append(c_tup)
            classes_list.append(lvl_list)
        compi_classes[line[0]] = classes_list

print(line)  #example
len(compi_classes_header[2:]) == len(classes_list)

['componentindex', 'size', 'kingdom', 'superclass', 'class', 'subclass', 'level 5', 'level 6', 'level 7', 'level 8', 'level 9', 'level 10', 'level 11', 'pathway', 'superclass', 'class']
['7161', '2', 'Organic compounds:1.000', 'Organic acids and derivatives:1.000; Lipids and lipid-like molecules:1.000; Organoheterocyclic compounds:0.500; Organic nitrogen compounds:1.000; Organic oxygen compounds:1.000; Hydrocarbon derivatives:1.000; Organopnictogen compounds:1.000', 'Carboxylic acids and derivatives:1.000; Fatty Acyls:1.000; Heteroaromatic compounds:0.500; Azacyclic compounds:0.500; Organonitrogen compounds:1.000; Organooxygen compounds:1.000; Organic oxides:1.000', 'Amino acids, peptides, and analogues:0.500; Fatty amides:1.000; Carboxylic acid derivatives:1.000; Amines:0.500; Carbonyl compounds:1.000', 'Peptides:0.500; Amino acids and derivatives:0.500; N-acyl amines:1.000; Carboxylic acid amides:1.000', 'Dipeptides:0.500; Alpha amino acids and derivatives:0.500; Secondary carboxylic

True

In [36]:
compi_classes[str(exampl_mf)]

[[('Organic compounds', 1.0)],
 [('Organic oxygen compounds', 1.0),
  ('Organic acids and derivatives', 0.69),
  ('Phenylpropanoids and polyketides', 0.345),
  ('Organoheterocyclic compounds', 0.828),
  ('Benzenoids', 0.241),
  ('Organic nitrogen compounds', 1.0),
  ('Hydrocarbon derivatives', 1.0),
  ('Organopnictogen compounds', 1.0),
  ('Organosulfur compounds', 0.276)],
 [('Organooxygen compounds', 1.0),
  ('Carboxylic acids and derivatives', 0.69),
  ('Macrolides and analogues', 0.241),
  ('Diazines', 0.241),
  ('Imidolactams', 0.241),
  ('Oxanes', 0.517),
  ('Heteroaromatic compounds', 0.276),
  ('Organonitrogen compounds', 1.0),
  ('Lactones', 0.414),
  ('Azacyclic compounds', 0.483),
  ('Oxacyclic compounds', 0.621),
  ('Organic oxides', 0.759)],
 [('Carbohydrates and carbohydrate conjugates', 0.552),
  ('Amino acids, peptides, and analogues', 0.655),
  ('Pyrimidines and pyrimidine derivatives', 0.241),
  ('Alcohols and polyols', 0.793),
  ('Amines', 1.0),
  ('Carboxylic acid d

In [23]:
len(compi_classes[str(exampl_mf)]), len(compi_classes_header[2:])

(14, 14)

## Find some known links: rosamicin and compare classes
They are distributed across some MFs, but these are the GNPS hits:

cluster index	componentindex	LibraryID	SpectrumID	UniqueFileSources	UniqueFileSourcesCount	number of spectra	parent mass	precursor charge	precursor mass	sum(precursor intensity)

- 108350	4992	21-hydroxyrosamicin	CCMSLIB00000531536	CNT360_A1_B.mzXML|CNT360_A1_M.mzXML	2	3	514.295	0	514.295	654526
- 112392	4992	21-hydroxyrosamicin	CCMSLIB00000531536	S237-3-Bu.mzXML	1	2	526.373	0	526.373	52755
- 116284	-1	18-dihydro-21-hydroxyrosamicin	CCMSLIB00000531539	S237-3-EA.mzXML	1	3	538.376	0	538.376	103159
- 130529	2434	21-hydroxyrosamicin	CCMSLIB00000531536	S237-3-EA.mzXML	1	2	568.384	0	568.384	217749
- 133415	519	21-hydroxyrosamicin	CCMSLIB00000531536	CNT360_A1_B.mzXML|CNT360_A1_M.mzXML|S237-3-Bu.mzXML	3	7	580.345	0	580.345	1.36658e+06
- 134244	519	21-hydroxyrosamicin	CCMSLIB00000531536	S237-3-Bu.mzXML|T133-3-Bu.mzXML|T609-3-Me.mzXML|T133-3-Me.mzXML|CNT360_A1_B.mzXML|T609-3-Bu.mzXML|S237-3-EA.mzXML|S237-3-Me.mzXML	8	65	582.363	1	582.363	5.22762e+06
- 134524	519	21-hydroxyrosamicin	CCMSLIB00000531536	T029-3-Me.mzXML|S237-3-Bu.mzXML|T609-3-Me.mzXML|Y231-3-Me.mzXML|CNT360_A1_B.mzXML|CNT360_A1_M.mzXML|H646-3-Me.mzXML|T609-3-Bu.mzXML|S237-3-EA.mzXML|S237-3-Me.mzXML	10	42	584.365	1	584.365	4.64563e+06
- 134525	519	21-hydroxyrosamicin	CCMSLIB00000531536	S237-3-Bu.mzXML|CNT360_A1_B.mzXML|CNT360_A1_M.mzXML|T609-3-Bu.mzXML|S237-3-EA.mzXML|S237-3-Me.mzXML	6	26	584.376	1	584.376	2.93404e+06
- 134831	232	18-dihydro-21-hydroxyrosamicin	CCMSLIB00000531539	CNT360_A1_B.mzXML|CNT360_A1_M.mzXML	2	8	586.352	0	586.352	1.5787e+06
- 141312	2434	21-hydroxyrosamicin	CCMSLIB00000531536	S237-3-EA.mzXML	1	3	597.377	0	597.377	214136
- 141394	-1	18-dihydro-14-hydroxyrosamicin	CCMSLIB00000531538	T609-3-Bu.mzXML	1	2	598.355	0	598.355	59832
- 141398	2434	21-hydroxyrosamicin	CCMSLIB00000531536	T609-3-Bu.mzXML|S237-3-EA.mzXML|S237-3-Bu.mzXML	3	3	598.358	0	598.358	161635
- 146965	-1	21-hydroxyrosamicin	CCMSLIB00000531536	CNT360_A1_B.mzXML	1	2	618.376	0	618.376	290717
- 166419	2186	18-dihydro-21-hydroxyrosamicin	CCMSLIB00000531539	T849-3-Me.mzXML|H646-3-Me.mzXML|T850-3-Me.mzXML	3	4	671.346	1	671.346	353451

MFs:
- 4992
- 2434
- 519
- 232
- 2186

In [74]:
rosa_mfs = list(map(str, [4992, 2434, 519, 232, 2186]))
# print([compi_classes[x][3][:2] for x in rosa_mfs])  # best 2 subclasses
# print([compi_classes[x][11:] for x in rosa_mfs])  # all npc levels
for x in rosa_mfs:
    cur_mf = [m for m in npl.molfams if m.family_id == int(x)][0]
    x_size = len(cur_mf.spectra)
    print(f'\nMF {x} size {x_size}')
    print('CF subclasses (top2)')
    print(compi_classes[x][3][:2])
    print('NPC classes (pathway, superclass, class)')
    print(compi_classes[x][11:])

### MF 519 seems best to continue with?


MF 4992 size 2
CF subclasses (top2)
[('Carbohydrates and carbohydrate conjugates', 0.5), ('Amino acids, peptides, and analogues', 0.5)]
NPC classes (pathway, superclass, class)
[[('Carbohydrates', 0.5)], [('Polyols', 0.5)], [('Amino cyclitols', 0.5)]]

MF 2434 size 5
CF subclasses (top2)
[('Amino acids, peptides, and analogues', 0.4), ('Carbohydrates and carbohydrate conjugates', 0.2)]
NPC classes (pathway, superclass, class)
[[('Polyketides', 0.4), ('Alkaloids', 0.2), ('Amino acids and Peptides', 0.2)], [('Macrolides', 0.2), ('Small peptides', 0.2)], [('Erythromycins', 0.2)]]

MF 519 size 33
CF subclasses (top2)
[('Carbohydrates and carbohydrate conjugates', 0.552), ('Amino acids, peptides, and analogues', 0.655)]
NPC classes (pathway, superclass, class)
[[('Polyketides', 0.517)], [('Macrolides', 0.448)], [('Erythromycins', 0.448)]]

MF 232 size 16
CF subclasses (top2)
[('Carbohydrates and carbohydrate conjugates', 0.375), ('Amino acids, peptides, and analogues', 0.375)]
NPC classes 

In [83]:
#explore some spectra from MF 519: 133415, 134244, 134524, 134525
spectra_mf519_inds = [133415, 134244, 134524, 134525]
spectra_mf519 = [s for s in npl.spectra if s.spectrum_id in spectra_mf519_inds]
print(spectra_mf519)
[spec_results.links[s] for s in spectra_mf519]

[Spectrum(id=6593, spectrum_id=133415, strains=2), Spectrum(id=6661, spectrum_id=134244, strains=4), Spectrum(id=6696, spectrum_id=134524, strains=6), Spectrum(id=6697, spectrum_id=134525, strains=3)]


[{GCF(id=32, class=NRPS, gcf_id=959, strains=7): ObjectLink(source=Spectrum(id=6593, spectrum_id=133415, strains=2), target=GCF(id=32, class=NRPS, gcf_id=959, strains=7), #methods=1),
  GCF(id=95, class=Others, gcf_id=3128, strains=4): ObjectLink(source=Spectrum(id=6593, spectrum_id=133415, strains=2), target=GCF(id=95, class=Others, gcf_id=3128, strains=4), #methods=1),
  GCF(id=107, class=PKS-NRP_Hybrids, gcf_id=3827, strains=8): ObjectLink(source=Spectrum(id=6593, spectrum_id=133415, strains=2), target=GCF(id=107, class=PKS-NRP_Hybrids, gcf_id=3827, strains=8), #methods=1),
  GCF(id=168, class=NRPS, gcf_id=5183, strains=1): ObjectLink(source=Spectrum(id=6593, spectrum_id=133415, strains=2), target=GCF(id=168, class=NRPS, gcf_id=5183, strains=1), #methods=1),
  GCF(id=170, class=NRPS, gcf_id=5243, strains=1): ObjectLink(source=Spectrum(id=6593, spectrum_id=133415, strains=2), target=GCF(id=170, class=NRPS, gcf_id=5243, strains=1), #methods=1),
  GCF(id=702, class=Others, gcf_id=4798,

In [105]:
all519 = [m.spectra for m in npl.molfams if m.family_id == 519][0]

In [106]:
# cur_spec = spectra_mf519[2]

for cur_spec in all519:
    sorted_links = spec_results.get_sorted_links(mc, cur_spec)
    print('Results for object: {}, {} total links, {} methods used'.format(cur_spec, len(sorted_links), spec_results.method_count))
    for link_data in sorted_links:

        print('  --> [{}] {} | {} | shared strains = {}'.format(','.join(method.name for method in link_data.methods), 
                                                                    link_data.target, 
                                                                    mc.format_data(link_data[mc]), 
                                                                    len(link_data.shared_strains)))

Results for object: Spectrum(id=5623, spectrum_id=117462, strains=1), 16 total links, 1 methods used
  --> [metcalf] GCF(id=170, class=NRPS, gcf_id=5243, strains=1) | 11.8743 | shared strains = 1
  --> [metcalf] GCF(id=747, class=Others, gcf_id=5229, strains=1) | 11.8743 | shared strains = 1
  --> [metcalf] GCF(id=748, class=Others, gcf_id=5230, strains=1) | 11.8743 | shared strains = 1
  --> [metcalf] GCF(id=749, class=Others, gcf_id=5231, strains=1) | 11.8743 | shared strains = 1
  --> [metcalf] GCF(id=750, class=Others, gcf_id=5232, strains=1) | 11.8743 | shared strains = 1
  --> [metcalf] GCF(id=751, class=Others, gcf_id=5233, strains=1) | 11.8743 | shared strains = 1
  --> [metcalf] GCF(id=752, class=Others, gcf_id=5236, strains=1) | 11.8743 | shared strains = 1
  --> [metcalf] GCF(id=753, class=Others, gcf_id=5237, strains=1) | 11.8743 | shared strains = 1
  --> [metcalf] GCF(id=754, class=Others, gcf_id=5238, strains=1) | 11.8743 | shared strains = 1
  --> [metcalf] GCF(id=755, 

In [192]:
cur_spec = [s for s in npl.spectra if s.spectrum_id == 134524][0]
cur_res = spec_results.get_sorted_links(mc, cur_spec)
print('Results for object: {}, {} total links, {} methods used\n'.format(cur_spec, len(cur_res), results.method_count))

gnps_dict = cur_spec.gnps_annotations
gnps_hit = None
if gnps_dict:
    gnps_hit = gnps_dict.get('Compound_Name')
print(f"\tGNPS hit: {gnps_hit}")
target_ci_classes = ci_classes.get(str(cur_spec.spectrum_id))
if target_ci_classes:  # if parent mass is too high, there is no classes ofc (maxmz 850)
    print("\tCF subclass: {}\n\tNPC classes: {}".format(
        target_ci_classes[3][:2], target_ci_classes[11:]))
else:
    print("\t--no spectrum classes--")
target_compi_classes = compi_classes.get(str(cur_spec.family_id))
print(f"\n\tMF {cur_spec.family_id} | size {len(cur_spec.family.spectra)} | grouped strains {len(cur_spec.family.strains)}")
if target_compi_classes:  # if parent mass is too high, there is no classes ofc (maxmz 850)
    print("\tMF CF subclass: {}\n\tMF NPC classes: {}\n".format(
        target_compi_classes[3][:2], target_compi_classes[11:]))
else:
    print("\t--no molfam classes--\n")

for i, link_data in enumerate(cur_res[:37]):

    print('{}.\t[{}] {} | {} | shared strains = {}'.format(i+1,
                                                                ','.join(method.name for method in link_data.methods), 
                                                                link_data.target, 
                                                                mc.format_data(link_data[mc]), 
                                                                len(link_data.shared_strains)))
    cur_gcf = link_data.target
    print(f"\tBiG-SCAPE class: {cur_gcf.bigscape_class} " +\
      f"| BGC classes: {Counter([b.product_prediction for b in cur_gcf.bgcs]).most_common()}\n")

Results for object: Spectrum(id=6696, spectrum_id=134524, strains=6), 37 total links, 1 methods used

	GNPS hit: 21-hydroxyrosamicin 
	CF subclass: [('Carbohydrates and carbohydrate conjugates', 0.433), ('Amino acids, peptides, and analogues', 0.957)]
	NPC classes: [[('Polyketides', 0.977)], [('Macrolides', 0.909)], [('Erythromycins', 0.799)]]

	MF 519 | size 33 | grouped strains 11
	MF CF subclass: [('Carbohydrates and carbohydrate conjugates', 0.552), ('Amino acids, peptides, and analogues', 0.655)]
	MF NPC classes: [[('Polyketides', 0.517)], [('Macrolides', 0.448)], [('Erythromycins', 0.448)]]

1.	[metcalf] GCF(id=100, class=NRPS, gcf_id=3501, strains=1) | 4.7610 | shared strains = 1
	BiG-SCAPE class: NRPS | BGC classes: [('nrps', 1)]

2.	[metcalf] GCF(id=168, class=NRPS, gcf_id=5183, strains=1) | 4.7610 | shared strains = 1
	BiG-SCAPE class: NRPS | BGC classes: [('nrps', 1)]

3.	[metcalf] GCF(id=170, class=NRPS, gcf_id=5243, strains=1) | 4.7610 | shared strains = 1
	BiG-SCAPE class

## Find some known links (staurosporine...) and compare classes
Not completely sure but from other notebook this seems to be one of the staurosporine GCFs:

Results for object: GCF(id=504, class=Others, gcf_id=3327, strains=3), 34 total links, 1 methods used
  --> [metcalf] Spectrum(id=1707, spectrum_id=27268, strains=1) | 3.8730 | shared strains = 1

In [253]:
cur_gcf = npl.gcfs[504]
cur_bgcs = [bgc for bgc in cur_gcf.bgcs if bgc.strain in cur_gcf.strains]
cur_gcf.bigscape_class, len(cur_gcf.bgcs), cur_bgcs

('Others',
 43,
 [BGC(id=2696, name=2515154186_c00001_B103DRA...cluster023, strain=Strain(Salinispora arenicola CNT798) [5 aliases], asid=c00007_B103DRA.., region=-1),
  BGC(id=2697, name=2516143022_scaffold1.cluster012, strain=Strain(Salinispora arenicola CNS991) [5 aliases], asid=scaffold1, region=-1),
  BGC(id=2698, name=34967.assembled_unknown.cluster033, strain=Strain(Salinispora arenicola CNH643) [4 aliases], asid=unknown_6, region=-1),
  BGC(id=2699, name=34969.assembled_unknown.cluster037, strain=Strain(Salinispora arenicola CNQ884) [4 aliases], asid=unknown_15, region=-1),
  BGC(id=2700, name=35121.assembled_unknown.cluster028, strain=Strain(Salinispora arenicola CNY281) [4 aliases], asid=unknown_7, region=-1),
  BGC(id=2701, name=35123.assembled_unknown.cluster032, strain=Strain(Salinispora arenicola CNH718) [4 aliases], asid=unknown_4, region=-1),
  BGC(id=2702, name=35125.assembled_unknown.cluster031, strain=Strain(Salinispora arenicola CNY486) [4 aliases], asid=unknown_6, 

In [None]:
cur_spec = npl.spectra[1707]
cur_spec_id = cur_spec.spectrum_id

cur_spec, cur_spec_id, ci_classes[str(cur_spec_id)]

In [112]:
cur_bgcs[0].product_prediction

'indole'

In [346]:
cur_res = results.get_sorted_links(mc, cur_gcf)
gcf_as_classes = Counter([b.product_prediction for b in cur_gcf.bgcs]).most_common()
npc_names = ["npc_pathway", "npc_superclass", "npc_class"]
print('Results for object: {}, {} total links, {} methods used'.format(cur_gcf, len(cur_res), results.method_count))
print(f"BiG-SCAPE class: {cur_gcf.bigscape_class} " +\
      f"| BGC classes: {gcf_as_classes}\n")
for i, link_data in enumerate(cur_res[:6]):

    print('{}.\t[{}] {} | {} | shared strains = {}'.format(i+1,
                                                                ','.join(method.name for method in link_data.methods), 
                                                                link_data.target, 
                                                                mc.format_data(link_data[mc]), 
                                                                len(link_data.shared_strains)))
    as_class = gcf_as_classes[0][0]
    gnps_dict = link_data.target.gnps_annotations
    gnps_hit = None
    if gnps_dict:
        gnps_hit = gnps_dict.get('Compound_Name')
    print(f"\tGNPS hit: {gnps_hit}")
    target_ci_classes = ci_classes.get(str(link_data.target.spectrum_id))
    if target_ci_classes:  # if parent mass is too high, there is no classes ofc (maxmz 850)
        cf_subclass = target_ci_classes[3][0]
        print("\tCF subclass score: {:.3f}\t{}".format(
            class_linking_tables["as_classes"]["cf_subclass"][as_class].get(cf_subclass[0], 0),
            cf_subclass))
        
        for name, cl_tup in zip(npc_names, target_ci_classes[11:]):
            npc_score = 0
            if cl_tup[0]:
                npc_score = class_linking_tables["as_classes"][name][as_class].get(cl_tup[0][0], 0)
            print("\tNPC {} score: {:.3f}\t{}".format(
                  name, npc_score, cl_tup[0]))
    else:
        print("\t--no spectrum classes--")
    target_compi_classes = compi_classes.get(str(link_data.target.family_id))
    print(f"\n\tMF {link_data.target.family_id} | size {len(link_data.target.family.spectra)} | grouped strains {len(link_data.target.family.strains)}")
    if target_compi_classes:  # if parent mass is too high, there is no classes ofc (maxmz 850)
        mf_cf_subclass = target_ci_classes[3][0]
        print("\tMF CF subclass score: {:.3f}\t{}".format(
            class_linking_tables["as_classes"]["cf_subclass"][as_class].get(mf_cf_subclass[0], 0),
            cf_subclass))
        for name, mf_cl_tup in zip(npc_names, target_compi_classes[11:]):
            mf_npc_score = 0
            if mf_cl_tup[0]:
                mf_npc_score = class_linking_tables["as_classes"][name][as_class].get(mf_cl_tup[0][0], 0)
            print("\tMF NPC {} score: {:.3f}\t{}".format(
                  name, mf_npc_score, mf_cl_tup[0]))
    else:
        print("\t--no molfam classes--")

Results for object: GCF(id=504, class=Others, gcf_id=3327, strains=43), 64 total links, 1 methods used
BiG-SCAPE class: Others | BGC classes: [('indole', 42), ('cf_putative', 1)]

1.	[metcalf] Spectrum(id=3140, spectrum_id=82711, strains=62) | 7.4217 | shared strains = 39
	GNPS hit: Spectral Match to Staurosporine from NIST14
	CF subclass score: 0.011	('Carbohydrates and carbohydrate conjugates', 0.988)
	NPC npc_pathway score: 0.007	('Carbohydrates', 0.994)
	NPC npc_superclass score: 0.010	('Polyols', 1.0)
	NPC npc_class score: 0.010	('Amino cyclitols', 1.0)

	MF 243 | size 14 | grouped strains 73
	MF CF subclass score: 0.011	('Carbohydrates and carbohydrate conjugates', 0.988)
	MF NPC npc_pathway score: 0.128	('Amino acids and Peptides', 0.5)
	MF NPC npc_superclass score: 0.029	('Small peptides', 0.333)
	MF NPC npc_class score: 0.010	('Amino cyclitols', 0.25)
2.	[metcalf] Spectrum(id=3632, spectrum_id=89513, strains=66) | 7.3030 | shared strains = 40
	GNPS hit: 7-OH-staurosporine
	CF 

In [157]:
print(link_data.target.family_id, link_data.target.family.id)

1032 7957


# Loading known MIBiG links

### Reading mibig classes

In [325]:
mibig_classes = glob.glob(os.path.join(npl.root_dir, "MIBiG*_compounds_with_AS_BGC_CF_NPC_classes.txt"))[0]
print(mibig_classes)

classes_dict = {}
with open(mibig_classes) as inf:
    header = inf.readline()
    print(header)
    for line in inf:
        elems = line.strip().split("\t")
        chem_id = elems.pop(0)
        class_base = elems.pop(0).split(',')
        classes = [cls.partition(':')[0] for cls in class_base]
        sub_classes = [cls for cls in class_base if cls.split(":")[1]]
        as_classes = elems.pop(0).split(',')

        bgc_classes = [classes, sub_classes, as_classes]        
        chem_classes = [chem_cls.split('; ') for chem_cls in elems[2:]]
        classes_dict[chem_id] = [bgc_classes, chem_classes]

print(classes_dict[chem_id])

/mnt/scratch/louwe015/NPLinker/own/nplinker_shared/crus_full_new_AS3_03-09/MIBiG2.0_compounds_with_AS_BGC_CF_NPC_classes.txt
compound_name	class:subclass	as_classes	smiles	inchi_key	cf_kingdom	cf_superclass	cf_class	cf_subclass	cf_direct_parent	npc_class	npc_superclass	npc_pathway	npc_isglycoside

[[['Other'], [], ['phosphonate']], [[''], [''], [''], [''], [''], [''], [''], [''], ['0']]]


In [326]:
# creating legend from the header -> some renaiming is maybe necessary for easy linking
s_h = header.strip().split('\t')
legend_bgc = ['mibig_classes']+s_h[1:3]
legend_chem = s_h[5:]
print(legend_bgc, legend_chem)

['mibig_classes', 'class:subclass', 'as_classes'] ['cf_kingdom', 'cf_superclass', 'cf_class', 'cf_subclass', 'cf_direct_parent', 'npc_class', 'npc_superclass', 'npc_pathway', 'npc_isglycoside']


In [215]:
# Which AS classes relate to Alkaloid class? -> based on MIBiG AS entries of alkaloids just taking "indole" seems sufficient
as_alks = []
for line in classes_dict.values():
    if "Alkaloid" in line[0][0]:
        as_alks.append(line[0][2])
from collections import Counter
Counter([cls for as_class in as_alks for cls in as_class]).most_common()

[('indole', 40),
 ('', 27),
 ('other', 26),
 ('NRPS', 15),
 ('T1PKS', 5),
 ('LAP', 1),
 ('thiopeptide', 1),
 ('NRPS-like', 1),
 ('bacteriocin', 1)]

### Linking MIBiG classes

In [327]:
from collections import defaultdict
def rec_dd():
    """Initialises a recurring defaultdict"""
    return defaultdict(rec_dd)

In [332]:
# aggregate pairwise class matrices for all compounds
result = rec_dd()
for chem_id, classes in classes_dict.items():
    bgc_classes, chem_classes = classes
    
    for i, bgc_cat in enumerate(legend_bgc):
        init_bgc_class = bgc_classes[i]
        if not init_bgc_class or init_bgc_class == ['']:
            continue
        
        #group pks-nrp hybrids for MIBiG classes
        hyb_count = len([1 for init_bgc_c in init_bgc_class if any([test in init_bgc_c.lower() for test in ['nrp', 'pks', 'polyketide']])])
        bgc_class = []
        if hyb_count >= 2:
            bgc_class.append("PKS-NRP_Hybrids")
            for init_bgc_c in init_bgc_class:
                if any([test in init_bgc_c.lower() for test in ['nrp', 'pks', 'polyketide']]):
                    pass
                else:
                    bgc_class.append(init_bgc_c)
        else:
            bgc_class = init_bgc_class[:]
#         print(bgc_class, init_bgc_class)
        
        for j, chem_cat in enumerate(legend_chem):
            chem_class = chem_classes[j]
            if not chem_class or chem_class == ['']:
                continue
            
            for bgc_c in bgc_class:
                for chem_c in chem_class:
                    try:
                        result[bgc_cat][chem_cat][bgc_c][chem_c] += 1
                    except TypeError:
                        result[bgc_cat][chem_cat][bgc_c][chem_c] = 1

In [333]:
res = result['as_classes']['npc_pathway']
df_res = pd.DataFrame(res, dtype=int)
df_res['indole']

Polyketides                        6.000
Shikimates and Phenylpropanoids      NaN
Alkaloids                         99.000
Amino acids and Peptides          18.000
Terpenoids                        17.000
Fatty acids                          NaN
Carbohydrates                      1.000
Name: indole, dtype: float64

In [334]:
# MiBIG BGC class and NPC pathway
df_mibig_npc_pway = pd.DataFrame.from_dict(result['mibig_classes']['npc_pathway'], dtype=int)
df_mibig_npc_pway

Unnamed: 0,Polyketide,Other,Alkaloid,Saccharide,Terpene,PKS-NRP_Hybrids,NRP,RiPP
Polyketides,548,35,1.0,64.0,19.0,177,115.0,4.0
Shikimates and Phenylpropanoids,20,21,,3.0,1.0,6,9.0,
Alkaloids,72,96,97.0,,49.0,58,99.0,2.0
Amino acids and Peptides,24,78,12.0,9.0,2.0,187,320.0,113.0
Terpenoids,35,5,,4.0,136.0,2,,
Fatty acids,14,19,,,,12,2.0,
Carbohydrates,9,46,,60.0,1.0,1,8.0,


In [335]:
# "Rules" read column -> row (bgc -> metabolite)

df_mibig_npc_pway/df_mibig_npc_pway.sum(axis=0)

Unnamed: 0,Polyketide,Other,Alkaloid,Saccharide,Terpene,PKS-NRP_Hybrids,NRP,RiPP
Polyketides,0.759,0.117,0.009,0.457,0.091,0.4,0.208,0.034
Shikimates and Phenylpropanoids,0.028,0.07,,0.021,0.005,0.014,0.016,
Alkaloids,0.1,0.32,0.882,,0.236,0.131,0.179,0.017
Amino acids and Peptides,0.033,0.26,0.109,0.064,0.01,0.422,0.579,0.95
Terpenoids,0.048,0.017,,0.029,0.654,0.005,,
Fatty acids,0.019,0.063,,,,0.027,0.004,
Carbohydrates,0.012,0.153,,0.429,0.005,0.002,0.014,


In [336]:
# Make linking object: dict similar to result but a table with fractions at the end
bgc_keys = result.keys()
metab_keys = list(result.values())[0].keys()
print(bgc_keys, metab_keys)
pd.DataFrame.from_dict(result['mibig_classes']['npc_pathway'], dtype=int)

dict_keys(['mibig_classes', 'class:subclass', 'as_classes']) dict_keys(['cf_kingdom', 'cf_superclass', 'cf_class', 'cf_direct_parent', 'npc_class', 'npc_superclass', 'npc_pathway', 'npc_isglycoside', 'cf_subclass'])


Unnamed: 0,Polyketide,Other,Alkaloid,Saccharide,Terpene,PKS-NRP_Hybrids,NRP,RiPP
Polyketides,548,35,1.0,64.0,19.0,177,115.0,4.0
Shikimates and Phenylpropanoids,20,21,,3.0,1.0,6,9.0,
Alkaloids,72,96,97.0,,49.0,58,99.0,2.0
Amino acids and Peptides,24,78,12.0,9.0,2.0,187,320.0,113.0
Terpenoids,35,5,,4.0,136.0,2,,
Fatty acids,14,19,,,,12,2.0,
Carbohydrates,9,46,,60.0,1.0,1,8.0,


In [337]:
# read resulting tables column to row
# bgc -> chem: d[bgc_key][chem_key][bgc_class][chem_class]
# and vice versa for chem -> bgc
# todo: maybe make an object called class_linking that stores all these things as well as dict keys to loop through
class_linking_tables = {}
class_linking_counts = {}  # store the counts in df/get rid of defaultdicts
for bgc_key, bgc_result in result.items():
    for chem_key, elem_result in bgc_result.items():
        # init entries in dict
        if not bgc_key in class_linking_tables:
            class_linking_tables[bgc_key] = {}
            class_linking_counts[bgc_key] = {}
        if not chem_key in class_linking_tables:
            class_linking_tables[chem_key] = {}
            class_linking_counts[chem_key] = {}
        # add linking tables as DataFrames
        elem_df = pd.DataFrame.from_dict(elem_result, dtype=int)
        class_linking_tables[bgc_key][chem_key] = elem_df/elem_df.sum(axis=0)
        class_linking_counts[bgc_key][chem_key] = elem_df
        class_linking_tables[chem_key][bgc_key] = elem_df/elem_df.sum(axis=1)
        class_linking_counts[chem_key][bgc_key] = elem_df

In [338]:
# for staurosporine link for example?
b_key = "as_classes"
s_key = "npc_pathway"
b_cls = 'indole'
s_cls = "Alkaloids"
class_linking_tables[b_key][s_key][b_cls][s_cls], class_linking_counts[b_key][s_key][b_cls][s_cls]

(0.7021276595744681, 99.0)

# Get score for class combination
For now to get a class-link I envision to enter a gcf/bgc object and a spectrum/molfam object and return a matrix of bgc classes vs chemical classes and their scores for each level. This can then be stored in the link object (or a new link object).

In npl object init scoring tables for both CF and NPC. This could be a dict of dicts of matrix, so it would be npl.class_link_table['mibig_class']['npc_pathway'][bgc_class][spectrum_class]. Then make a func with bgc bigscape and antismash classes, and spectra NPC and CF classes to get the link table. 

Add classes to spectrum/molfam objects, for both NPC and CF. Maybe spectrum.classes and spectrum.gnps_classes and spectrum.molnetenhancer_classes. These can then have .npc and .cf

- For BiG-SCAPE classes get the MIBiG classes. Small conversion is needed here (MIBiG-BiG-SCAPE):
    - Polyketide <=> PKSI, PKSother
    - NRP <=> NRPS
    - RiPP <=> RiPPs
    - Saccharide <=> Saccharides
    - Other <=> Others
    - Terpene and PKS-NRP_hybrids (defined in loading func) are the same
    - Alkaloid is not classified by BiG-SCAPE -> take AS indole class?
    
Add missing classes (i.e. not present in MIBiG) to scoring tables to avoid errors.

Also, make a general classes attribute to spectra/molfams, like annotations that is a dict of {'gnps': gnps_classes, 'molnetenh': molnetenh_classes, 'canopus': canopus_classes}, apart from also the gnps_classes etc. And then loop through this dict to get the possible scores for all available spectrum classes.

In [339]:
# MIBiG BiG-SCAPE conversion -> put in bigscape class and get out correct mibig class to look up links
# Do something with Alkaloid
bigscape_mibig_conversion = {'PKSI': 'Polyketide', 'PKSother': 'Polyketide',
                             'NRPS': 'NRP', 'RiPPs': 'RiPP', 'Saccharides': 'Saccharide',
                             'Others': 'Other', 'Terpene': 'Terpene', 'PKS-NRP_hybrid': 'PKS-NRP_hybrids'}

In [340]:
from collections import defaultdict, Counter

class Class_links(object):
    '''Holds all info concerning class links (based on known bgc-compound links in MIBiG)
    '''
    def __init__(self, mibig_classes_file):
        self._mibig_classes_file = mibig_classes_file
        self._read_mibig_classes()
        self._get_class_counts()
        self._get_scoring_tables()
        pd.options.display.float_format = "{:,.3f}".format  # adjust pd formatting

    def _read_mibig_classes(self):
        # read mibig file to dict of list {chem_id: [bgc_classes, chem_classes]}
        classes_dict = {}
        with open(self._mibig_classes_file) as inf:
            header = inf.readline()
            for line in inf:
                elems = line.strip().split("\t")
                chem_id = elems.pop(0)
                class_base = elems.pop(0).split(',')
                classes = [cls.partition(':')[0] for cls in class_base]
                sub_classes = [cls for cls in class_base if cls.split(":")[1]]
                as_classes = elems.pop(0).split(',')

                bgc_classes = [classes, sub_classes, as_classes]        
                chem_classes = [chem_cls.split('; ') for chem_cls in elems[2:]]
                classes_dict[chem_id] = [bgc_classes, chem_classes]
        self._mibig_classes = classes_dict
        # add header info
        s_h = header.strip().split('\t')

        self._bgc_class_names = ['mibig_classes']+s_h[1:3]
        self._chem_class_names = s_h[5:]

        return self._mibig_classes
    
    def _get_class_counts(self):
        # aggregate pairwise class matrices for all compounds
        
        def _rec_dd():
            """Initialises a recurring defaultdict"""
            return defaultdict(_rec_dd)
        
        result = _rec_dd()
        for chem_id, (bgc_classes, chem_classes) in self._mibig_classes.items():

            for i, bgc_cat in enumerate(self.bgc_class_names):
                init_bgc_class = bgc_classes[i]
                if not init_bgc_class or init_bgc_class == ['']:
                    continue

                # group pks-nrp hybrids for MIBiG classes and as_classes
#                 if i == 0:
                hyb_count = len([1 for init_bgc_c in init_bgc_class \
                                 if any([test in init_bgc_c.lower() for test in ['nrp', 'pks', 'polyketide']])])
                bgc_class = []
                if hyb_count >= 2:
                    bgc_class.append("PKS-NRP_Hybrids")
                    for init_bgc_c in init_bgc_class:
                        if any([test in init_bgc_c.lower() for test in ['nrp', 'pks', 'polyketide']]):
                            pass
                        else:
                            bgc_class.append(init_bgc_c)
                else:
                    bgc_class = init_bgc_class[:]

                for j, chem_cat in enumerate(self.chem_class_names):
                    chem_class = chem_classes[j]
                    if not chem_class or chem_class == ['']:
                        continue

                    for bgc_c in bgc_class:
                        for chem_c in chem_class:
                            try:
                                result[bgc_cat][chem_cat][bgc_c][chem_c] += 1
                            except TypeError:
                                result[bgc_cat][chem_cat][bgc_c][chem_c] = 1
        self._class_count_dict = result
        return result
    
    def _get_scoring_tables(self):
        # makes dict of pd.DataFrames
        # read resulting tables column to row
        # bgc -> chem: d[bgc_key][chem_key][bgc_class][chem_class]
        # and vice versa for chem -> bgc
        class_linking_tables = {}
        class_linking_counts = {}  # store the counts in df/get rid of defaultdicts
        for bgc_key, bgc_chem_counts in self._class_count_dict.items():
            for chem_key, counts in bgc_chem_counts.items():
                # init entries in dict
                if not bgc_key in class_linking_tables:
                    class_linking_tables[bgc_key] = {}
                    class_linking_counts[bgc_key] = {}
                if not chem_key in class_linking_tables:
                    class_linking_tables[chem_key] = {}
                    class_linking_counts[chem_key] = {}
                # add linking tables as DataFrames
                counts_df = pd.DataFrame.from_dict(counts, dtype=int)
                class_linking_tables[bgc_key][chem_key] = counts_df/counts_df.sum(axis=0)
                class_linking_counts[bgc_key][chem_key] = counts_df
                class_linking_tables[chem_key][bgc_key] = counts_df/counts_df.sum(axis=1)
                class_linking_counts[chem_key][bgc_key] = counts_df
        self._class_links = class_linking_tables
        self._class_links_counts = class_linking_counts
        return class_linking_tables
    
    @property
    def class_links(self):
        return self._class_links
    
    @property
    def class_links_counts(self):
        return self._class_links_counts
    
    @property
    def bgc_class_names(self):
        return self._bgc_class_names
    
    @property
    def chem_class_names(self):
        return self._chem_class_names
        
class_link_obj = Class_links(mibig_classes)

In [342]:
class_link_obj = Class_links(mibig_classes)
class_link_obj.class_links['mibig_classes']['npc_pathway']

Unnamed: 0,Polyketide,Other,Alkaloid,Saccharide,Terpene,PKS-NRP_Hybrids,NRP,RiPP
Polyketides,0.759,0.117,0.009,0.457,0.091,0.4,0.208,0.034
Shikimates and Phenylpropanoids,0.028,0.07,,0.021,0.005,0.014,0.016,
Alkaloids,0.1,0.32,0.882,,0.236,0.131,0.179,0.017
Amino acids and Peptides,0.033,0.26,0.109,0.064,0.01,0.422,0.579,0.95
Terpenoids,0.048,0.017,,0.029,0.654,0.005,,
Fatty acids,0.019,0.063,,,,0.027,0.004,
Carbohydrates,0.012,0.153,,0.429,0.005,0.002,0.014,


In [343]:
class_link_obj.class_links['mibig_classes']['npc_pathway']['Polyketide']['Polyketides']

0.7590027700831025

In [344]:
class_link_obj._mibig_classes['BGC0000825_staurosporine']

[[['Alkaloid'], [], ['indole']],
 [['Organic compounds'],
  ['Organoheterocyclic compounds'],
  ['Indoles and derivatives'],
  ['Carbazoles'],
  ['Indolocarbazoles'],
  ['Carbazole alkaloids'],
  ['Tryptophan alkaloids'],
  ['Alkaloids'],
  ['0']]]

In [345]:
class_link_obj.chem_class_names

['cf_kingdom',
 'cf_superclass',
 'cf_class',
 'cf_subclass',
 'cf_direct_parent',
 'npc_class',
 'npc_superclass',
 'npc_pathway',
 'npc_isglycoside']

## Function for linking
For now put in the dicts with scores for the spectrum obj, replace with attributes in spectrum obj later

For testing, use my staurosporine link of interest and of course Class_links object:
- GCF(id=504, class=Others, gcf_id=3327, strains=43)
- Spectrum(id=3632, spectrum_id=89513, strains=66)

Outline:
- (for now) get spectrum classes through class dicts + spectrum_ids
- get antismash classes for a gcf with small subroutine looking at the as_classes of the bgcs
- loop through the classes from the Class_links object, and convert them to indices in spectrum classes
- think of way to come from either side, i.e. bgc or chem
- use bigscape-mibig class conversion for bigscape class
- report (for now) all posible scores, and the highest score?

In [349]:
gcf_in = [gcf for gcf in npl.gcfs if gcf.id == 504][0]
spec_in = [spec for spec in npl.spectra if spec.id == 3632][0]
gcf_in, spec_in

(GCF(id=504, class=Others, gcf_id=3327, strains=43),
 Spectrum(id=3632, spectrum_id=89513, strains=66))

In [374]:
# get antismash classes for a gcf -> integrate into gcf object
def get_gcf_as_classes(gcf, cutoff = 0.5):
    '''Get antismash classes for a gcf if antismash class occurs in more than <cutoff> of gcf'''
    gcf_size = len(gcf.bgcs)
    unlist_all_products = [product for bgc in gcf.bgcs for product in bgc.product_prediction.split('.')]
    sorted_as_classes = Counter(unlist_all_products).most_common()
    # keep if in more than half of bgcs?
    cutoff = 0.5
    size_cutoff = gcf_size * cutoff
    filtered_as_classes = []
    for product in sorted_as_classes:
        if product[1] >= size_cutoff:
            filtered_as_classes.append(product[0])
    return filtered_as_classes

get_gcf_as_classes(gcf_in), get_gcf_as_classes(npl.gcfs[1])

(['indole'], ['nrps', 't1pks', 'otherks'])

In [381]:
dir(spec_in)

['__class__',
 '__cmp__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_jcamp',
 '_losses',
 'add_strain',
 'annotations',
 'edges',
 'family',
 'family_id',
 'get_growth_medium',
 'get_metadata_value',
 'gnps_annotations',
 'gnps_id',
 'growth_media',
 'has_annotations',
 'has_loss',
 'has_strain',
 'id',
 'is_library',
 'keep_top_k',
 'losses',
 'max_ms2_intensity',
 'metadata',
 'n_peaks',
 'normalised_peaks',
 'parent_mz',
 'peaks',
 'precursor_mz',
 'rt',
 'set_annotations',
 'spectrum_id',
 'strains',
 'to_jcamp_str',
 'total_ms2_intensity']

In [391]:
# TODO have this automatically from canopus output -> store in npl object when reading canopus output
ci_classes_names = [f"cf_{elem}" for elem in ci_classes_header[3:-3]] +\
                   [f"npc_{elem}" for elem in ci_classes_header[-3:]]
compi_classes_names = [f"cf_{elem}" for elem in compi_classes_header[2:-3]] +\
                      [f"npc_{elem}" for elem in compi_classes_header[-3:]]
ci_classes_names_inds = {elem:i for i,elem in enumerate(ci_classes_names)}
compi_classes_names_inds = {elem:i for i,elem in enumerate(compi_classes_names)}
ci_classes_names, compi_classes_names, ci_classes_names_inds, compi_classes_names_inds

(['cf_kingdom',
  'cf_superclass',
  'cf_class',
  'cf_subclass',
  'cf_level 5',
  'cf_level 6',
  'cf_level 7',
  'cf_level 8',
  'cf_level 9',
  'cf_level 10',
  'cf_level 11',
  'npc_pathway',
  'npc_superclass',
  'npc_class'],
 ['cf_kingdom',
  'cf_superclass',
  'cf_class',
  'cf_subclass',
  'cf_level 5',
  'cf_level 6',
  'cf_level 7',
  'cf_level 8',
  'cf_level 9',
  'cf_level 10',
  'cf_level 11',
  'npc_pathway',
  'npc_superclass',
  'npc_class'],
 {'cf_kingdom': 0,
  'cf_superclass': 1,
  'cf_class': 2,
  'cf_subclass': 3,
  'cf_level 5': 4,
  'cf_level 6': 5,
  'cf_level 7': 6,
  'cf_level 8': 7,
  'cf_level 9': 8,
  'cf_level 10': 9,
  'cf_level 11': 10,
  'npc_pathway': 11,
  'npc_superclass': 12,
  'npc_class': 13},
 {'cf_kingdom': 0,
  'cf_superclass': 1,
  'cf_class': 2,
  'cf_subclass': 3,
  'cf_level 5': 4,
  'cf_level 6': 5,
  'cf_level 7': 6,
  'cf_level 8': 7,
  'cf_level 9': 8,
  'cf_level 10': 9,
  'cf_level 11': 10,
  'npc_pathway': 11,
  'npc_superclass': 

In [404]:
# from gcf side first?
use_num_spec_classes = 1  # number of classes to use per lvl
# bgc_as = get_gcf_as_classes(gcf_in)  # list
# bgc_bigscape = gcf_in.bigscape_class  # str

# maybe as gcf.classes?
gcf_in_classes = {"bigscape_class": gcf_in.bigscape_class,  # str
                  "as_classes": get_gcf_as_classes(gcf_in)}  # list
spec_classes = ci_classes[str(spec_in.spectrum_id)]  # list of list of tuples/None
# spec_classes = compi_classes[str(spec_in.family_id)]  # use also molfam classes for spec?
for bgc_class_name in class_link_obj.bgc_class_names:
    if bgc_class_name == "mibig_classes":
        # treat specially as bigscape class needs to be translated to mibig class
        bigscape_class = gcf_in_classes["bigscape_class"]
        bgc_class = bigscape_mibig_conversion.get(bigscape_class)  # convert bigscape class to mibig class
    else:
        bgc_class = gcf_in_classes.get(bgc_class_name)
    if bgc_class:
        if isinstance(bgc_class, list):
            bgc_class = bgc_class[0]
        for chem_class_name in class_link_obj.chem_class_names:
            spec_class_i = ci_classes_names_inds.get(chem_class_name)  # does info exist for spectrum?
            if spec_class_i:
                spec_class_options = spec_classes[spec_class_i][0]  # get class
                if spec_class_options:  # if there is a class at this lvl
                    spec_class = spec_class_options[0]  # for now only use first
                    print(bgc_class_name, bgc_class, chem_class_name, spec_class)
                    score = class_link_obj.class_links[bgc_class_name][chem_class_name].get(bgc_class,{}).get(spec_class)
                    print(score)
# bgc_as, bgc_bigscape, spec_classes

mibig_classes Other cf_superclass Organoheterocyclic compounds
0.35135135135135137
mibig_classes Other cf_class Diazines
0.01937984496124031
mibig_classes Other cf_subclass Pyrimidines and pyrimidine derivatives
0.017543859649122806
mibig_classes Other npc_pathway Alkaloids
0.32
as_classes indole cf_superclass Organoheterocyclic compounds
0.780952380952381
as_classes indole cf_class Diazines
nan
as_classes indole cf_subclass Pyrimidines and pyrimidine derivatives
None
as_classes indole npc_pathway Alkaloids
0.7021276595744681
