In [1]:
import os
import glob
from pprint import pprint
import json
import pickle
import re
from collections import OrderedDict
from Bio import SeqIO

In [2]:
from rdkit import Chem as chem
from rdkit.Chem import AllChem, Draw
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline

In [3]:
import imp
helper = imp.load_source('helper', './pks/helper.py')
domain = imp.load_source('domain', './pks/domain.py')
#pks = imp.load_source('pks', './pks/pks.py')

In [4]:
# Directory that is searched for MiBiG JSON annotation files
file_path = './mibig'
# Every '*.json' file directly inside file_path; later cells iterate over this list
file_names = glob.glob(os.path.join(file_path, '*.json'))
# Report how many JSON files were found
print(len(file_names))

1396


In [5]:
# Bootstrap Django so the ORM models in pks.models can be used from this notebook.
# The order matters: the project must be on sys.path and the settings module must
# be named before django.setup() runs, and the models are imported only afterwards.
# NOTE(review): '/clusterCAD' is an absolute path — confirm it matches the
# environment this notebook is run in.
import os, sys
sys.path.insert(0, '/clusterCAD')
# setdefault keeps an already-exported DJANGO_SETTINGS_MODULE untouched
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "clusterCAD.settings")
import django
django.setup()
import pks.models

# Load data

In [6]:
# Collect (MiBiG id, nucleotide accession) pairs for every MiBiG entry whose
# polyketide subclass annotation marks it as a (modular) type I PKS
type_i_labels = {'Modular type I', 'Modular Type I', 'Type I'}
t1pks = []
for file_name in file_names:
    with open(file_name) as json_file:
        json_data = json.load(json_file)
    try:
        general_params = json_data['general_params']
        if type_i_labels & set(general_params['Polyketide']['pks_subclass']):
            accession = general_params['loci']['nucl_acc'][0]['Accession']
            # The MiBiG id is the file name without directory and extension
            mibig_id = file_name.split('/')[-1].split('.')[0]
            t1pks.append((mibig_id, accession))
    except KeyError:
        # Entries missing any of the expected keys are not polyketide clusters
        # we can use, so they are skipped
        continue
# Number of clusters found
print('%d potential type I modular PKS clusters found!' %(len(t1pks)))

352 potential type I modular PKS clusters found!


In [7]:
# antiSMASH output from Tyler
# Later cells read '<MiBiG id>.embl' files from this directory
antismash_file_path = './mibig/antismash/split_files'

# Processing functions

In [8]:
def process_subunit_modules(sec_met):
    '''Parse the 'sec_met' qualifier entries of one CDS into a dict of modules.

    Only entries that start with 'NRPS/PKS Domain: <name> (<start>-<stop>).' are
    interpreted; every other entry (e.g. 'Type: t1pks') is skipped. Parsing stops
    at the first domain whose name is not one of the recognised catalytic domains.

    A module is closed when an 'ACP' or 'PCP' domain is reached. It is stored only
    if it contains an 'AT' domain; modules without an AT are discarded.
    'Thioesterase', 'PKS_Docking_Cterm' and 'Condensation_LCL' domains are appended
    to the module stored immediately before them. If the module immediately before
    them was discarded, they are dropped rather than overwriting an earlier valid
    module. If no module has been stored yet, they are recorded under key -1.

    Args:
        sec_met: list of strings as recorded in feature.qualifiers['sec_met'].

    Returns:
        dict mapping the module index within the subunit (starting from zero) to
        an OrderedDict of the domains in that module. Each key is a domain name
        and each value is a list whose first element is {'start': int, 'stop': int};
        AT and KR domains carry a second element, a dict built from the
        '<note>: <value>' items that follow the first ';' of the entry.
    '''
    # These are the catalytic domains that we want to recognize
    recognized_domains = ['KS', 'AT', 'KR', 'DH', 'ER', 'ACP', 'Thioesterase',
                          'cMT', 'oMT', 'CAL', 'PCP',
                          'Heterocyclization', 'AMP-binding',
                          'Condensation_DCL', 'Condensation_LCL',
                          'PKS_Docking_Nterm', 'PKS_Docking_Cterm']
    # Domains stored with boundaries only
    # ('Heterocyclization' used to be misspelt here, so it was silently dropped)
    plain_domains = ['KS', 'DH', 'ER', 'ACP', 'cMT', 'oMT', 'CAL', 'PCP',
                     'Heterocyclization', 'AMP-binding',
                     'Condensation_DCL', 'Condensation_LCL',
                     'PKS_Docking_Nterm', 'PKS_Docking_Cterm']
    # Domains stored together with their specificity annotations
    annotated_domains = ['AT', 'KR']
    # Domains that close a module
    carrier_domains = ['ACP', 'PCP']
    # Domains that may come after the ACP/PCP and belong to the previous module
    trailing_domains = ['Thioesterase', 'PKS_Docking_Cterm', 'Condensation_LCL']

    subunit = {}
    module_index = 0          # key of the module currently being assembled
    module_domains = []       # (name, data) pairs of the module being assembled
    old_module_domains = []   # pairs of the last stored module; empty when the
                              # preceding module was discarded or none exists yet

    for entry in sec_met:
        # Split entry into a list; the first item describes the catalytic domain
        entrysplit = [item.strip() for item in entry.split(';') if item != '']
        if not entrysplit:
            continue
        domainsplit = entrysplit[0].split()
        if ' '.join(domainsplit[:2]) != 'NRPS/PKS Domain:' or len(domainsplit) <= 2:
            continue

        # Normalise the domain name depending on how it is formatted
        rawname = domainsplit[2]
        if rawname.split('_')[0] == 'PKS':
            if rawname in ['PKS_Docking_Nterm', 'PKS_Docking_Cterm']:
                domaintype = rawname
            else:
                # We trim off the leading 'PKS_'
                # Assume 'DH2' and 'DHt' are the same as 'DH'
                domaintype = rawname.split('_')[-1].replace('DHt', 'DH').replace('DH2', 'DH')
        elif rawname == 'CAL_domain':
            # Special case of 'CAL' domain
            domaintype = 'CAL'
        else:
            domaintype = rawname

        if domaintype not in recognized_domains:
            # Stop looking for additional catalytic domains once an unrecognized
            # one is encountered; callers cross-check the subunit against the
            # MiBiG JSON file
            break

        # Get the boundaries of the catalytic domain, written as '(start-stop).'
        boundaries = [int(bound) for bound in
                      domainsplit[3].replace('(', '').replace(')', '').replace('.', '').split('-')]

        if domaintype in plain_domains:
            module_domains.append((domaintype, [{'start': boundaries[0], 'stop': boundaries[1]}]))
        # Include substrate and stereospecificity annotations for AT and KR domains
        if domaintype in annotated_domains:
            notesdict = {}
            for note in entrysplit[1:]:
                item = note.split(': ')
                notesdict[item[0]] = item[1]
            module_domains.append((domaintype, [{'start': boundaries[0], 'stop': boundaries[1]}, notesdict]))

        # End of the module has been reached if the domain is 'ACP' or 'PCP'
        if domaintype in carrier_domains:
            domains_present = [d[0] for d in module_domains]
            # A module without an AT cannot perform a chain extension, so it is
            # excluded from the subunit
            if 'AT' in domains_present:
                subunit[module_index] = OrderedDict(module_domains)
                old_module_domains = module_domains
                module_index += 1
            else:
                old_module_domains = []
            module_domains = []

        if domaintype in trailing_domains:
            previous_index = module_index - 1
            # Attach to the previous module. When that module was discarded
            # (old_module_domains is empty) but an earlier valid module already
            # occupies the slot, skip the write so the valid module is not
            # replaced by a dict holding only this trailing domain.
            if old_module_domains or previous_index not in subunit:
                old_module_domains.append((domaintype, [{'start': boundaries[0], 'stop': boundaries[1]}]))
                subunit[previous_index] = OrderedDict(old_module_domains)
            module_domains = []

    return subunit

In [9]:
def get_gene_data(record):
    '''Extract per-gene data from a record read with SeqIO.read().

    Only features of type 'CDS' that carry both a 'protein_id' and a 'gene'
    qualifier are considered.

    Args:
        record: object exposing a 'features' sequence; each feature needs 'type',
            'qualifiers' (mapping of qualifier name to list of values) and
            'location' (with start.position and end.position).

    Returns:
        list with one list per gene, in feature order:
            [protein id, gene name, product, [start, end], translation]
        for standalone genes, or
            [protein id, gene name, product, [start, end], modules, translation]
        when process_subunit_modules() returned a non-empty dict for the gene.
        'product' is '' when the feature has no 'product' qualifier.

    Raises:
        KeyError: if a selected feature has no 'translation' qualifier.
    '''
    # antiSMASH subtype annotations that mark a gene as a potential PKS subunit
    subunit_subtypes = ['NRPS/PKS subtype: Type I Modular PKS',
                        'NRPS/PKS subtype: PKS-like protein',
                        'NRPS/PKS subtype: PKS/NRPS-like protein',
                        'NRPS/PKS subtype: Hybrid PKS-NRPS']

    gene_data = []

    for feature in record.features:
        qualifiers = feature.qualifiers
        # Only the "CDS" features with an id and a name are potentially genes
        if feature.type != 'CDS' or 'protein_id' not in qualifiers or 'gene' not in qualifiers:
            continue

        location = feature.location

        # Reset the description for every gene; previously a gene without a
        # 'product' qualifier inherited the description of the preceding gene
        # (or raised if it was the first one)
        if 'product' in qualifiers:
            description = qualifiers['product'][0]
        else:
            description = ''

        # Feature may not be a PKS subunit and therefore may not have modules
        subunit_modules = None
        if 'sec_met' in qualifiers and len(qualifiers['sec_met']) > 3:
            # The fourth sec_met entry holds the NRPS/PKS subtype
            if qualifiers['sec_met'][3] in subunit_subtypes:
                subunit_modules = process_subunit_modules(qualifiers['sec_met'])

        gene = [qualifiers['protein_id'][0],
                qualifiers['gene'][0],
                description,
                [location.start.position, location.end.position]]

        # Without module information the gene is assumed to be a standalone enzyme
        if subunit_modules:
            gene.append(subunit_modules)

        gene.append(qualifiers['translation'][0])
        gene_data.append(gene)

    return gene_data

In [10]:
def check_json_module_validity(module_list):
    '''Tell whether a module listed in a MiBiG JSON file is usable.

    A module counts as valid when its domain names include an acyltransferase
    ('AT' or 'Acyltransferase') and a carrier protein ('ACP', 'PCP', 'T' or
    'Thiolation (ACP/PCP)'). The presence of a ketosynthase is not checked.
    At least as of February 24, 2017, clusters annotated as Type I modular PKSs
    spell these domains either as
    ['KS', 'AT', 'T'] or
    ['Ketosynthase', 'Acyltransferase', 'Thiolation (ACP/PCP)'].

    Args:
        module_list: iterable of domain-name strings for one module.

    Returns:
        True if both an AT and a carrier domain are named, False otherwise.
    '''
    domain_names = set(module_list)
    has_at = bool(domain_names & {'AT', 'Acyltransferase'})
    has_carrier = bool(domain_names & {'ACP', 'PCP', 'T', 'Thiolation (ACP/PCP)'})
    return has_at and has_carrier

def process_cluster(record, cluster_ref, mibig_json):
    '''Create Subunit and Module rows for one cluster.

    Gene data is taken from the record via get_gene_data(); the order of the
    subunits and the cyclization flag are taken from the MiBiG JSON file.
    Standalone genes (those without module data) are currently not stored.

    Args:
        record: record passed on to get_gene_data().
        cluster_ref: pks.models.Cluster instance the new subunits are attached to.
        mibig_json: path of the MiBiG JSON file describing the same cluster.

    Returns:
        None. The function returns early, without creating anything, when the
        record yields no genes, when the JSON file lists no valid subunit, or
        when a subunit named in the JSON file is not found among the genes.

    Raises:
        Exception: if 'lin_cycl_pk' is neither 'Cyclic' nor 'Linear', or if
            building a module's domains fails with an AssertionError.
    '''
    gene_data = get_gene_data(record)
    if len(gene_data) == 0:
        return

    # We make two dictionaries because sometimes the subunit name in the MiBiG JSON files
    # is the gene name, e.g. eryA1, and sometimes it is the accession number, e.g. A0000000
    unordered_subunits = {}      # protein id -> (name, desc, start, stop, modules, translation)
    unordered_subunits_alt = {}  # gene name  -> (id,   desc, start, stop, modules, translation)

    #####################
    # Basic information #
    #####################

    # Each entry in gene_data is a list
    # [protein id, gene, product, [location start, location end], subunit dict (optional), translation]
    counter = 1
    for gene in gene_data:
        geneid = gene[0].strip()
        genename = gene[1].strip()
        genedesc = gene[2].strip()
        genestart = gene[3][0]
        genestop = gene[3][1]
        genetranslation = gene[-1].strip()

        # The length of the entry differentiates between standalones and subunits
        if len(gene) == 6:
            # Take care of duplicated gene names by appending a running number
            if genename in unordered_subunits_alt:
                genename = genename + '_' + str(counter)
                counter += 1

            genesubunitdata = gene[-2]
            unordered_subunits[geneid] = (genename, genedesc, genestart, genestop,
                                          genesubunitdata, genetranslation)
            unordered_subunits_alt[genename] = (geneid, genedesc, genestart, genestop,
                                                genesubunitdata, genetranslation)
        else:
            # Standalones lack the subunit entry; they are not stored yet
            assert len(gene) == 5, gene

    #########################################
    # JSON file has cyclization information #
    #########################################

    with open(mibig_json) as json_file:
        mibig_data = json.load(json_file)

    # 'lin_cycl_pk' is expected to be either 'Cyclic' or 'Linear'; a missing
    # annotation is treated as linear
    try:
        lin_cycl_pk = mibig_data['general_params']['Polyketide']['lin_cycl_pk']
        if lin_cycl_pk == 'Cyclic':
            cyclize = True
        elif lin_cycl_pk == 'Linear':
            cyclize = False
        else:
            raise Exception("lin_cycl_pk expected to be 'Cyclic' or 'Linear'.")
    except KeyError:
        cyclize = False

    #####################################
    # JSON file has subunit information #
    #####################################

    # Re-read the subunits in the order given by the JSON file, keeping only the
    # leading run of subunits whose modules are all valid
    try:
        ordered_subunits = []
        for json_subunit in mibig_data['general_params']['Polyketide']['mod_pks_genes']:
            subunit_name = re.sub(r'\s+', '', json_subunit['mod_pks_gene'])
            subunit_modules = json_subunit['pks_module']

            valid_subunit = True
            for json_module in subunit_modules:
                if not check_json_module_validity(json_module['pks_domains']):
                    valid_subunit = False
            if valid_subunit:
                ordered_subunits.extend(subunit_name.split(','))
            else:
                # Loop is broken once first invalid subunit is encountered
                break

        # If no valid subunits, report the annotated domains and give up
        if len(ordered_subunits) == 0:
            print('\tNo valid subunits!')
            for json_subunit in mibig_data['general_params']['Polyketide']['mod_pks_genes']:
                for json_module in json_subunit['pks_module']:
                    print(json_module['pks_domains'])
            return

        # A JSON gene name may be e.g. 'eryA1, A000000'. Names of 8 or more
        # characters are taken to be accession numbers; if the first or second
        # name is one, only accession-like names are kept for consistent naming
        if len(ordered_subunits[0]) >= 8:
            ordered_subunits = [entry for entry in ordered_subunits if len(entry) >= 8]
        if len(ordered_subunits) > 1:
            if len(ordered_subunits[1]) >= 8:
                ordered_subunits = [entry for entry in ordered_subunits if len(entry) >= 8]
        # The accession number sometimes has a version number and sometimes does not
        if len(ordered_subunits[0].split('.')) == 1 and len(ordered_subunits[0]) == 8:
            ordered_subunits = [entry + '.1' for entry in ordered_subunits]

        # Every ordered subunit has to be known under its id or its name
        known_subunits = set(list(unordered_subunits.keys()) + list(unordered_subunits_alt.keys()))
        for ordered_name in ordered_subunits:
            if ordered_name not in known_subunits:
                print('Missing subunit: "%s"' %(ordered_name))
                for json_gene in mibig_data['general_params']['Polyketide']['mod_pks_genes']:
                    if json_gene['mod_pks_gene'] == ordered_name:
                        for json_module in json_gene['pks_module']:
                            print(json_module['pks_domains'])
                print(unordered_subunits.keys())
                return

        # Accession-like names index the id dictionary, everything else the
        # gene-name dictionary
        if len(ordered_subunits[0]) >= 8:
            alt = False
        else:
            alt = True

    # Fall back to the sorted gene names if the ordering cannot be taken from the JSON file
    except Exception:
        ordered_subunits = list(unordered_subunits_alt.keys())
        ordered_subunits.sort()
        alt = True

    ####################################
    # This does the subunit reordering #
    ####################################
    for subunit_key in ordered_subunits:
        # subunitdata has form (id or name, description, start, stop, module dict, sequence)
        if not alt:
            # subunit_key is the protein id, subunitdata[0] the gene name
            subunitdata = unordered_subunits[subunit_key]
            accession = subunit_key
            name = subunitdata[0]
        else:
            # subunit_key is the gene name, subunitdata[0] the protein id
            subunitdata = unordered_subunits_alt[subunit_key]
            accession = subunitdata[0]
            name = subunit_key

        subunit = pks.models.Subunit(cluster=cluster_ref,
                                     genbankAccession=accession,
                                     name=name,
                                     start=subunitdata[2],
                                     stop=subunitdata[3],
                                     sequence=subunitdata[-1])
        subunit.save()

        # These are the modules of the subunit
        moduledata = subunitdata[-2]

        for modules_seen, module_key in enumerate(list(moduledata.keys())):
            moduledict = OrderedDict(moduledata[module_key])
            # The first module entry of each subunit is flagged as loading
            loading = modules_seen == 0
            # A module carrying a thioesterase is terminal
            terminal = 'Thioesterase' in moduledict
            try:
                # Only modules with a carrier domain and an AT are stored
                domains_present = moduledict.keys()
                if 'ACP' in domains_present or 'PCP' in domains_present:
                    if 'AT' in domains_present or 'ATL' in domains_present:
                        module = pks.models.Module(subunit=subunit, loading=loading, terminal=terminal)
                        module.save()
                        module.buildDomains(moduledict, cyclic=cyclize)
            except AssertionError as e:
                # Identify the subunit by its key; concatenating the model
                # instance itself with a string raised a TypeError here and hid
                # the original error
                subunit_label = '%s %s' % (subunit_key, subunitdata[1])
                print(moduledict)
                print(type(e).__name__, e.args, subunit_label)
                raise Exception(type(e).__name__, e.args, subunit_label)


In [None]:
# Testing specific entries
# Only entry[0] (the MiBiG id) is used in this cell; entry[1] is informational.
# NOTE(review): the batch-run output in this notebook lists BGC0000055 with
# erythromycin compounds and accession AM420293, so the 'tylactone' label and
# accession below look inconsistent with the id — confirm which cluster is meant.
entry = ('BGC0000055', 'U78289')  # tylactone
#entry = ('BGC0000033', 'AF497482') # calicheamicin

# Load the MiBiG JSON of the entry (kept in mibig_data for interactive inspection;
# it is not used further in this cell)
with open(os.path.join(file_path, entry[0] + '.json')) as json_file:
    mibig_data = json.load(json_file)
#pprint(test_data.keys())
#pprint(test_data['general_params'].keys())
#pprint(test_data['general_params'])
#pprint([gene_ref['mod_pks_gene'] for gene_ref in test_data['general_params']['Polyketide']['mod_pks_genes']])
# Read the antiSMASH annotation of the same entry
record = SeqIO.read(os.path.join(antismash_file_path, entry[0] + '.embl'), "embl")

# antismash_data = get_gene_data(record)
# for subunit in antismash_data:
#     if len(subunit) == 6:
#         print(subunit)

# TYLER: CLUSTER CREATION
# Remove every stored cluster first, one delete() call per instance
[cluster.delete() for cluster in pks.models.Cluster.objects.all()]
# The GenBank accession is taken from the last whitespace-separated token of the
# record's 'comment' annotation, with a trailing '.' removed
cluster_ref = pks.models.Cluster(
    genbankAccession = record.annotations['comment'].split()[-1].strip().strip('.'), \
    mibigAccession = record.id, \
    description= record.description, \
    sequence= record.seq)
cluster_ref.save()

# Populate the subunits and modules of the freshly saved cluster
process_cluster(record, cluster_ref, os.path.join(file_path, entry[0] + '.json'))

In [None]:
# Remove every stored cluster first so that re-running this cell starts clean
for cluster in pks.models.Cluster.objects.all():
    cluster.delete()

# Iterate over list of type I modular PKSs
for i, entry in enumerate(t1pks):
    print('%d: %s' %(i, entry))
    json_path = os.path.join(file_path, entry[0] + '.json')

    # This prints the product compound(s) of the cluster
    with open(json_path) as json_file:
        mibig_data = json.load(json_file)
    pprint([compound['compound'] for compound in mibig_data['general_params']['compounds']])

    # Read in cluster data
    record = SeqIO.read(os.path.join(antismash_file_path, entry[0] + '.embl'), "embl")

    try:
        cluster_ref = pks.models.Cluster(
            genbankAccession=record.annotations['comment'].split()[-1].strip().strip('.'),
            mibigAccession=record.id,
            description=record.description,
            sequence=record.seq)
        cluster_ref.save()

        process_cluster(record, cluster_ref, json_path)
    except Exception as error:
        # Keep the batch going, but report the failure instead of hiding it so
        # that clusters which were skipped or only partially imported are visible
        print('\tFailed to process %s: %s: %s' % (entry[0], type(error).__name__, error))


   

0: ('BGC0000001', 'JF752342')
['Abyssomicin C', 'Atrop-abyssomicin C']
1: ('BGC0000002', 'CP007155')
['aculeximycin']
2: ('BGC0000003', 'AB179766')
['AF-toxin']
3: ('BGC0000004', 'AB196490')
['aflatoxin']
4: ('BGC0000005', 'AF452809')
['aflatoxin']
5: ('BGC0000006', 'AY510451')
['aflatoxin']
6: ('BGC0000007', 'AY510452')
['aflatoxin']
7: ('BGC0000008', 'AY510453')
['aflatoxin']
8: ('BGC0000009', 'AY510454')
['aflatoxin']
9: ('BGC0000010', 'AY510455')
['aflatoxin']
10: ('BGC0000011', 'AY092402')
['aflatoxin', 'sterigmatocystin']
11: ('BGC0000012', 'AB120221')
['alternapyrone']
12: ('BGC0000013', 'BN001304')
['alternariol']
13: ('BGC0000014', 'DQ897667')
['ambruticin']
14: ('BGC0000015', 'AF357202')
['amphotericin']




15: ('BGC0000017', 'FJ477836')
['Anatoxin-a', 'Homoanatoxin-a']
16: ('BGC0000018', 'EU220288')
['angolamycin']
17: ('BGC0000019', 'EU232693')
['angolamycin']
18: ('BGC0000020', 'AF453501')
['ansamitocin']
19: ('BGC0000021', 'JF819834')
['apoptolidin']
20: ('BGC0000022', 'AACD01000015')
['asperfuranone']
21: ('BGC0000023', 'AM850130')
['Aurafuron A']
22: ('BGC0000024', 'AJ575648')
['aureothin']
23: ('BGC0000025', 'AB032523')
['avermectin']
24: ('BGC0000028', 'GU390405.1')
['Bafilomycin B1']
25: ('BGC0000029', 'FJ872523')
['BE-14106']
26: ('BGC0000030', 'HF679027')
['bikaverin']
27: ('BGC0000031', 'AJ580915')
['Borrelidin']
28: ('BGC0000032', 'HM452329')
['calcimycin']
29: ('BGC0000033', 'AF497482')
['calicheamicin']
30: ('BGC0000034', 'AY310323')
['candicidin']
31: ('BGC0000035', 'AY509120')
['chalcomycin']
32: ('BGC0000036', 'DQ116941')
['chlorothricin', 'deschlorothricin']
33: ('BGC0000037', 'BN001301')
['cichorine']
34: ('BGC0000038', 'AL645882')
['coelimycin P1']
35: ('BGC0000039', 



46: ('BGC0000052', 'AY899214')
['ECO-02301']
47: ('BGC0000053', 'GP697151')
['elaiophylin']
48: ('BGC0000054', 'AY623658')
['erythromycin']
49: ('BGC0000055', 'AM420293')
['Erythromycin A', 'Erythromycin B', 'Erythromycin C', 'Erythromycin D']
50: ('BGC0000056', 'AY267372')
['esperamicin']
51: ('BGC0000058', 'AB469193')
['FD-891']
52: ('BGC0000059', 'BA000030')
['filipin']
53: ('BGC0000060', 'HQ434551')
['fostriecin']
54: ('BGC0000061', 'AY310323')
['FR-008']
55: ('BGC0000062', 'AF155773')
['fumonisin']
56: ('BGC0000063', 'EU449979')
['fumonisin']
57: ('BGC0000064', 'AY604568')
['fusarin']
58: ('BGC0000066', 'AY179507')
['geldanamycin']
59: ('BGC0000067', 'DQ249341')
['geldanamycin']
60: ('BGC0000068', 'DQ914285')
['geldanamycin']
61: ('BGC0000069', 'KF479198')
['gephyronic acid']
62: ('BGC0000072', 'KM361622')
['gulmirecin A']
63: ('BGC0000073', 'AB241068')
['halstoctacosanolide']
64: ('BGC0000074', 'AY947889')
['herbimycin']
65: ('BGC0000075', 'JX504844')
['hygrocin A', 'hygrocin B']



87: ('BGC0000098', 'DQ176595')
['monacolin K']
88: ('BGC0000099', 'HM070047')
['monascorubrin']
89: ('BGC0000100', 'AF440781')
['monensin']
90: ('BGC0000102', 'AB089954')
['mycinamicin']
91: ('BGC0000103', 'BX649209')
['mycolactone']
92: ('BGC0000105', 'AF521085')
['nanchangmycin']
93: ('BGC0000106', 'GQ452266')
['naphthomycin']
94: ('BGC0000107', 'BN001302')
['naphthopyrone']
95: ('BGC0000108', 'HQ386234')
['natamycin']
96: ('BGC0000109', 'AB363939')
['nemadectin']
97: ('BGC0000110', 'AM778535')
['Neoaureothin', 'orinocin', 'SNF4435C', 'SNF4435D']
98: ('BGC0000111', 'AB097904')
['neocarzilin']
99: ('BGC0000113', 'AF016585')
['niddamycin']
100: ('BGC0000114', 'DQ354110')
['nigericin']
101: ('BGC0000115', 'AF263912')
['nystatin A1']
102: ('BGC0000116', 'EU108007')
['nystatin-like Pseudonocardia polyene']
103: ('BGC0000117', 'AB070940')
['oligomycin']
104: ('BGC0000118', 'AB303063')
['pactamycin']
105: ('BGC0000121', 'KC145148')
['pestheic acid']
106: ('BGC0000122', 'KF739396')
['phenyln



138: ('BGC0000158', 'EU035755')
['tautomycetin']
139: ('BGC0000159', 'EF990140')
['tautomycin']
140: ('BGC0000160', 'CH476602')
['terreic acid']
141: ('BGC0000162', 'EU443633')
['tetrocarcin A']
142: ('BGC0000163', 'FJ462704')
['tetronasin']


In [None]:
# delete clusters with no modules
for cluster in pks.models.Cluster.objects.all():
    for subunit in cluster.subunits():
        if len(subunit.modules()) == 0:
            subunit.delete()
    if len(cluster.subunits()) == 0:
        cluster.delete()

In [None]:
# delete clusters with no computable product
for cluster in pks.models.Cluster.objects.all():
    try:
        cluster.computeProduct()
    except:
        cluster.delete()