In [1]:
import os
import glob
from pprint import pprint
import json
import pickle
import re
from collections import OrderedDict
from Bio import SeqIO

In [2]:
from rdkit import Chem as chem
from rdkit.Chem import AllChem, Draw
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline

In [3]:
import os, sys
sys.path.insert(0, '/clusterCAD')
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "clusterCAD.settings")
import django
django.setup()
import pks.models

In [4]:
file_path = './mibig'
file_names = glob.glob(os.path.join(file_path, '*.json'))
print('%d MiBiG entries found!' %(len(file_names)))

1396 MiBiG entries found!


# Load data

In [5]:
# Print PKS subtype labels there present
labels = []
for file_name in file_names:
    with open(file_name) as json_file:
        json_data = json.load(json_file)
    try:
        labels.extend(json_data['general_params']['Polyketide']['pks_subclass'])
    except KeyError:
        pass
print(set(labels))

{'Type III', 'Modular type I', 'Iterative typeI', 'Trans-AT type I', 'Modular Type I', 'Iterative type I', 'Type II', 'Other', 'PUFA synthase or related', 'Enediyne type I', 'Type I'}


In [6]:
# This gets all the modular type I PKSs contained in MiBiG
t1pks = []
for file_name in file_names:
    with open(file_name) as json_file:
        json_data = json.load(json_file)
    try:        
        if len(set(['Modular type I', 'Modular Type I', 'Type I']).intersection(set(json_data['general_params']['Polyketide']['pks_subclass']))) > 0:
            accession = json_data['general_params']['loci']['nucl_acc'][0]['Accession']
            t1pks.append((file_name.split('/')[-1].split('.')[0], accession))
    except KeyError:
        pass
# Number of clusters found
print('%d potential type I modular PKS clusters found!' %(len(t1pks)))

352 potential type I modular PKS clusters found!


In [7]:
# antiSMASH output
antismash_file_path = './mibig/antismash/split_files'

# Proccessing functions

In [8]:
def process_subunit_modules(sec_met): 
    '''This function takes as input annotated features for a PKS subunit (stored  as feature.qualifiers['sec_met'] 
        after annotation by antiSMASH), and returns a OrderedDict of OrderedDict objects corresponding to the 
        annotated modules contained by the subunit. They keys of the subunit OrderedDict are the index of the 
        module within the subunit. This function assumes that feature.type=='CDS' and that feature.qualifiers 
        has the key 'sec_met'. If the first entry of 'sec_met' is not 'Type: t1pks' then nothing is returned.
    '''
    # Initialize dict for the subunit
    # keys: module number
    # values: OrderedDict of domains in module. within the OrderedDict, the key is the domain main, while the
    #         value is a length two list where the first element is a dictionary {start:, stop:} and the 
    #         second element is specificity dictionary 
    subunit = {}
    
    # This is for the current module (function processes subunit which may have more than one module)
    module_index = 0  # key for module
    module_domains = [] # list of domains in module
    old_module_domains = [] # pre-initialize in case subunit starts with a domain
                            # that is expected to end the module
    
    # This is how domains appear in sec_met:
    # ['PKS_AT', 'PKS_KS', 'PKS_KR', 'PKS_DH', 'PKS_ER', 'ACP', 'Thioesterase']
    # Iterate over the entries in sec_met, and add them to the module_domains list 
    for entry in sec_met:    
        # Split entry into a list
        entrysplit = [item.strip() for item in entry.split(';') if item != '']
        # Split part of entry that is expected to describe catalytic domain
        domainsplit = entrysplit[0].split()
        # This is just different ways of processing the name of the domain depending
        # on how the name of the domain is formatted
        if ' '.join(domainsplit[:2]) == 'NRPS/PKS Domain:' and len(domainsplit) > 2:
            # Note that we want to make sure that there is a leading 'PKS_' before we do our trimming
            if domainsplit[2].split('_')[0] == 'PKS':
                if domainsplit[2] in ['PKS_Docking_Nterm', 'PKS_Docking_Cterm']:
                    domaintype = domainsplit[2]
                else:
                    # Trim off the leading 'PKS_'
                    # Assume 'DH2' and 'DHt' are the same as 'DH' 
                    domaintype = domainsplit[2].split('_')[-1].replace('DHt', 'DH').replace('DH2', 'DH')
            # Special case of 'CAL' domain
            elif domainsplit[2] == 'CAL_domain':
                domaintype = 'CAL'
            else:
                domaintype = domainsplit[2]
        else:
            continue
        
        # These are the catalytic domains that we want to recognize
        if domaintype not in ['KS', 'AT', 'KR', 'DH', 'ER', 'ACP', 'Thioesterase', 
                              'cMT', 'oMT', 'CAL', 'PCP', 
                              'Heterocyclization', 'AMP-binding', 
                              'Condensation_DCL', 'Condensation_LCL',
                              'PKS_Docking_Nterm', 'PKS_Docking_Cterm']:
            # Break out of for loop and stop looking for additional catalytic domains if 
            # we encounter a domain that we don't recognize
            # We end up excluding any subunit that has a non-recognized catalytic domain
            break    
        # Get the boundaries of the catalytic domain
        boundaries = [int(bound) for bound in domainsplit[3].replace('(', '').replace(')', '').replace('.', '').split('-')]
        
        # Here, we add each domain to a list, which will be converted to an OrderedDict
        # based on whether or not the domain is expected to have substrate specificity annotations
        if domaintype in ['KS', 'DH', 'ER', 'ACP', 'cMT', 'oMT', 'CAL', 'PCP',
                          'Heterocylization', 'AMP-binding', 
                          'Condensation_DCL', 'Condensation_LCL',
                          'PKS_Docking_Nterm', 'PKS_Docking_Cterm']:
            module_domains.append((domaintype, [{'start': boundaries[0], 'stop': boundaries[1]}]))
        # Include substrate and stereospecificity annotations for AT and KR domains respectively
        if domaintype in ['AT', 'KR']:
            notesdict = {}
            for note in entrysplit[1:]:
                item = note.split(': ')
                notesdict[item[0]] = item[1]
            module_domains.append((domaintype, [{'start': boundaries[0], 'stop': boundaries[1]}, notesdict]))
                
        # End of the module has been reached of the domain is 'ACP' or 'PCP
        if domaintype in ['ACP', 'PCP']:
            domains_present = [d[0] for d in module_domains]
            # Make sure every module has an AT, or else it isn't a valid module and we just ignore it
            # This means it will be excluded from the subunit, which makes sense since we can't 
            # really perform a polyketide chain extension without an AT
            if 'AT' in domains_present:            
                subunit[module_index] = OrderedDict(module_domains)
                old_module_domains = module_domains
                module_index += 1
            else:
                old_module_domains = []
            module_domains = []
        # These domains may come after the ACP or PCP, so if they are encountered, we add
        # them to previous module and keep going forward
        if domaintype in ['Thioesterase', 'PKS_Docking_Cterm', 'Condensation_LCL']:
            # Overwrite previous subunit, or else will have duplicate entries
            old_module_domains.append((domaintype, [{'start': boundaries[0], 'stop': boundaries[1]}]))
            subunit[module_index - 1] = OrderedDict(old_module_domains)
            module_domains = []
            
    return subunit

In [9]:
def get_gene_data(record):
    '''Takes as input a record read in from an MiBiG GenBank file using SeqIO.read() and outputs list of sublists
        containing PKS data from that record. Each entry in list is a sublist that represents a subunit or standalone.
    '''
    # Get list to hold information about all genes that are in the record
    gene_data = []
    
    # Only the "CDS" features are potentially gene
    for feature in record.features:
        # These are the features we are interested in
        if feature.type == 'CDS' and 'protein_id' in feature.qualifiers.keys() and 'gene' in feature.qualifiers.keys(): 
            # This gets the location of the feature
            location = feature.location
            # General information about gene
            if 'product' in feature.qualifiers.keys():
                description = feature.qualifiers['product'][0]
            gene_data.append([feature.qualifiers['protein_id'][0],
                              feature.qualifiers['gene'][0],
                             ])
            # Feature may not be a PKS module and therefore may not have have subunits 
            # (this will be overwritten if it does have subunits)
            subunit_modules = None
            # Information if gene is PKS subunit
            if 'sec_met' in feature.qualifiers.keys() and len(feature.qualifiers['sec_met']) > 3:
                if feature.qualifiers['sec_met'][3] in ['NRPS/PKS subtype: Type I Modular PKS', 
                                                        'NRPS/PKS subtype: PKS-like protein',
                                                        'NRPS/PKS subtype: PKS/NRPS-like protein',
                                                        'NRPS/PKS subtype: Hybrid PKS-NRPS']:
                    subunit_modules = process_subunit_modules(feature.qualifiers['sec_met'])

            # Append description and position of gene within nucleotide sequence
            gene_data[-1].extend([description, [location.start.position, location.end.position]])

            # Subunit information (if it doesn't have subunit information, assumed to be a standalone enzyme)
            if subunit_modules:
                gene_data[-1].append(subunit_modules)

            # Amino acid sequence
            gene_data[-1].append(feature.qualifiers['translation'][0])

    return gene_data

In [10]:
def check_json_module_validity(module_list):
    '''Function that makes sure module specified in JSON file is valid,
       that is to say, make sure that it contains KS, AT, and ACP or PCP.
       These domains have the following possible annotations in type I modular PKSs as of February 24, 2017:
       ['KS', 'AT', 'T']
       ['Ketosynthase', 'Acyltransferase', 'Thiolation (ACP/PCP)']
    '''
    at_check = len(set(['AT', 'Acyltransferase']).intersection(set(module_list)))
    acp_check = len(set(['ACP', 'PCP', 'T', 'Thiolation (ACP/PCP)']).intersection(set(module_list)))

    if at_check and acp_check:
        return True
    else:
        return False

def process_cluster(record, cluster_ref, mibig_json):
    '''Takes in a record and corresponding MiBiG json file and enters cluster information into Django database.
    '''
    # Get information about the gene
    gene_data = get_gene_data(record)
    if len(gene_data) == 0:
        return

    # Initalize lists for subunits and standalones
    # We make two dictionaries because sometimes the subunit name in the MiBiG JSON files
    # is the gene name, e.g. eryA1, and sometimes it is the accession number, e.g. A0000000
    unordered_subunits = {}
    unordered_subunits_alt = {}
    standalones = []
     
    # Recall that each entry in gene_data is a list
    # [protein id, gene, product, [location start, location end], subunit dict (optional), translation]
    
    #####################
    # Basic information #
    #####################
    
    counter = 1
    for gene in gene_data:
        geneid = gene[0].strip()
        genename = gene[1].strip()
        genedesc = gene[2].strip()
        genestart = gene[3][0]
        genestop = gene[3][1]
        genetranslation = gene[-1].strip()

        # Just use length of gene_data to differentiate between standalones and subunits
        if len(gene) == 6:
            # We do this to take care of duplicated gene names, as is the case wity tylactone (BGC0000166)
            if genename in unordered_subunits_alt.keys():
                genename = genename + '_' + str(counter)
                counter += 1
 
            # Get subunit data from gene
            genesubunitdata = gene[-2]
            # Here we use the two dictionary options to save the unordered subunits
            # Sometimes MiBiG uses geneid and sometimes it uses genename to reference subunits
            unordered_subunits[geneid] = (genename, genedesc, genestart, genestop,
                                            genesubunitdata, genetranslation)
            unordered_subunits_alt[genename] = (geneid, genedesc, genestart, genestop,
                                                genesubunitdata, genetranslation)
        else:
            # Standalones lack subunit and orphan entries
            assert len(gene) == 5, gene
            
            #####################
            # CREATE STANDALONE #
            #####################

#            pks.models.Standalone(cluster=cluster_ref)
#            standalones.append(pks.Standalone(geneid, genename, genedesc, 
#                                              genestart, genestop, genetranslation))
        
    #########################################
    # JSON file has cyclization information #
    #########################################

    # Get ordered version of subunits from corresponding JSON file
    with open(mibig_json) as json_file:
        mibig_data = json.load(json_file)
    
    # Get PKS cyclization information
    # this will be either 'Cyclic' or 'Linear'
    try:
        lin_cycl_pk = mibig_data['general_params']['Polyketide']['lin_cycl_pk']
        if lin_cycl_pk == 'Cyclic':
            cyclize = True
        elif lin_cycl_pk == 'Linear':
            cyclize = False
        else:
            raise Exception("lin_cycl_pk expected to be 'Cyclic' or 'Linear'.")
    except KeyError:
        cyclize = False
            
    #############################################
    # Case 1: Use JSON file subunit information #
    #############################################
        
    # Note that all gene data has now been processed, want to reprocess to get right ordering 
    # We strip out subunits that have invalid modules
    try:
        ordered_subunits = []
        for subunit in mibig_data['general_params']['Polyketide']['mod_pks_genes']:
            subunit_name = re.sub(r'\s+', '', subunit['mod_pks_gene'])
            subunit_modules = subunit['pks_module']

            valid_subunit = True
            # This checks if the module is valid
            for module in subunit_modules:
                if not check_json_module_validity(module['pks_domains']):
                    valid_subunit = False
            if valid_subunit:
                ordered_subunits.extend(subunit_name.split(','))
            else:
                # Loop is broken once first invalid subunit is encountered
                break
        # If no valid subunits, then raise exception to use alphabetical ordering
        if len(ordered_subunits) == 0:
            raise Exception
#             print('\tNo valid subunits!')
#             for subunit in mibig_data['general_params']['Polyketide']['mod_pks_genes']:
#                 subunit_name = re.sub(r'\s+', '', subunit['mod_pks_gene'])
#                 subunit_modules = subunit['pks_module']
#                 for module in subunit_modules:
#                     print(module['pks_domains'])
#             return
        # This makes sure the subunit accession naming is consistent
        # The purpose of these two 'if' statements is because there may be cases in the MiBiG JSON file
        # where the name of the gene is for example, 'eryA1, A000000' and we want to keep consistant naming
        if len(ordered_subunits[0]) >= 8:
            ordered_subunits = [entry for entry in ordered_subunits if len(entry) >= 8]
        if len(ordered_subunits) > 1:
            if len(ordered_subunits[1]) >= 8:
                ordered_subunits = [entry for entry in ordered_subunits if len(entry) >= 8]
        # This is because sometimes the accession number under which the gene is recorded sometimes
        # has a version number, and sometimes does not
        if len(ordered_subunits[0].split('.')) == 1 and len(ordered_subunits[0]) == 8:
            ordered_subunits = [entry + '.1' for entry in ordered_subunits]

        # Check if subunit is in either dictionary
        for isubunit,subunit in enumerate(ordered_subunits):
            if subunit not in set(list(unordered_subunits.keys()) + list(unordered_subunits_alt.keys())):
                print('Missing subunit: "%s"' %(subunit))
                for gene in mibig_data['general_params']['Polyketide']['mod_pks_genes']:
                    if gene['mod_pks_gene'] == subunit:
                        module = gene['pks_module']
                        for entry in module:
                            print(entry['pks_domains'])
                print(unordered_subunits.keys())
                return
    #    print([gene_ref for gene_ref in mibig_data['general_params']['Polyketide']['mod_pks_genes']])

        # Determine whether to use standard or alternative dict
        if len(ordered_subunits[0]) >= 8:
            alt = False
        else:
            alt = True
    
    #########################################
    # Case 2: Alphabetical subunit ordering #
    #########################################
    
    # Just use unordered gene order if the gene ordering is not already in the JSON file 
    except Exception:
        ordered_subunits = list(unordered_subunits_alt.keys())
        ordered_subunits.sort()
        alt = True

    ####################################
    # This does the subunit reordering #
    ####################################
    # This is just to make sure laoding module is assigned correctly
    modules_seen = 0
    for subunit_key in ordered_subunits:
        # subunit data has form (id, description, start, stop, module dict, sequence)
        if not alt:
            subunitdata = unordered_subunits[subunit_key]
        else:
            subunitdata = unordered_subunits_alt[subunit_key]
     
        if not alt:
            # subunit = id
            # subunit[0] = name
            # id, name, description, start, stop, sequence
            subunit = pks.models.Subunit(cluster=cluster_ref,
                                         genbankAccession=subunit_key,
                                         name=subunitdata[0],
                                         start=subunitdata[2],
                                         stop=subunitdata[3],
                                         sequence=subunitdata[-1])
            subunit.save()
            # subunits.append(pks.Subunit(subunit, subunitdata[0], subunitdata[1],
            #                            subunitdata[2], subunitdata[3], subunitdata[-1],
            #                            modules))
        else:
            # subunit = name
            # subunit[0] = id
            subunit = pks.models.Subunit(cluster=cluster_ref,
                                         genbankAccession=subunitdata[0],
                                         name=subunit_key,
                                         start=subunitdata[2],
                                         stop=subunitdata[3],
                                         sequence=subunitdata[-1])
            subunit.save()
                
                # subunits.append(pks.Subunit(subunitdata[0], subunit, subunitdata[1],
                #                            subunitdata[2], subunitdata[3], subunitdata[-1],
                #                            modules))
        
        # This is the modules for the subunit
        moduledata = subunitdata[-2]
        
        # We do this so we can lump in the loading didomain and TE on the first and last modules respectively
        modulekeys = list(moduledata.keys())
        imodule = 0
        while imodule < len(modulekeys):
            # Get info
            keys = list(moduledata[modulekeys[imodule]].keys())
            values = moduledata[modulekeys[imodule]].values()
            # Process info according to loading or not
            if modules_seen == 0:
                loading = True                
            else: 
                loading = False
            moduledict = OrderedDict([(k,v) for k,v in zip(keys,values)])
            # Determine whether module is terminal or not
            if 'Thioesterase' in list(moduledata[modulekeys[imodule]].keys()):
                terminal = True
            else:
                terminal = False
            imodule += 1
            modules_seen += 1
            try:
                # This is to make sure we don't add subunits with invalid modules
                # The check for errors here is to compare agains the predicted chemcial structure
                domains_present = moduledict.keys()
                if 'ACP' in domains_present or 'PCP' in domains_present:
                    if 'AT' in domains_present or 'ATL' in domains_present:
                        # modules.append(pks.Module(moduledict, loading=loading, terminal=terminal))
                        module = pks.models.Module(subunit=subunit, loading=loading, terminal=terminal)
                        module.save()
                        module.buildDomains(moduledict, cyclic=cyclize)
            except AssertionError as e:
                print(moduledict)
                print(type(e).__name__, e.args, subunit + ' ' + subunitdata[1])
                raise Exception(type(e).__name__, e.args, subunit + ' ' + subunitdata[1])
                break

# Process all clusters

In [11]:
# This cleans up the dictionary
[cluster.delete() for cluster in pks.models.Cluster.objects.all()]

# Iterate over list of type I modular PKSs
for i in range(len(t1pks)):
    print('%d: %s' %(i, t1pks[i]))
    entry = t1pks[i]

     # This prints the accession number and product compound of the cluster
    with open(os.path.join(file_path, entry[0] + '.json')) as json_file:
        mibig_data = json.load(json_file)
        pprint([compound['compound'] for compound in mibig_data['general_params']['compounds']])

    # Read in cluster data
    record = SeqIO.read(os.path.join(antismash_file_path, entry[0] + '.embl'), "embl")
    try:
        cluster_ref = pks.models.Cluster(
            genbankAccession=record.annotations['comment'].split()[-1].strip().strip('.'), \
            mibigAccession=record.id, \
            description=record.description.replace(' biosynthetic gene cluster', ''), \
            sequence=record.seq)
        cluster_ref.save()
        # Processes subunits and modules belonging to cluster
        process_cluster(record, cluster_ref, os.path.join(file_path, entry[0] + '.json'))
    except Exception:
        pass

0: ('BGC0000971', 'FR687018')
['Cinnabaramide']
1: ('BGC0000170', 'ACJE01000012')
['yanuthone D']
2: ('BGC0000958', 'NZ_FR873698')
['Antimycin']
3: ('BGC0000013', 'BN001304')
['alternariol']
4: ('BGC0001042', 'FJ809786')
['sanglifehrin A']
5: ('BGC0000116', 'EU108007')
['nystatin-like Pseudonocardia polyene']
6: ('BGC0000083', 'GQ274954')
['Lactimidomycin',
 '8,9-dihydro-LTM',
 '8,9-dihydro-8S-hydroxy-LTM',
 '8,9-dihydro-9R-hydroxy-LTM']
7: ('BGC0000992', 'AY700570')
['fusaridione A']
8: ('BGC0001172', 'KF585133')
['Chlorizidine A']
9: ('BGC0001000', 'AY974560')
['hectochlorin']
10: ('BGC0001160', 'CP000951')
['1-nonadecene', '(14Z)-1,14-Nonadecadiene']
11: ('BGC0000969', 'AM179409')
['Chondramid A']
12: ('BGC0001243', 'FR717895')
['botryenalol',
 'botrydial',
 'dihydrobotrydial',
 'botryendial',
 'beta-O-Methyl-dihydrobotrydialone',
 'botcinic acid',
 'botcinin A']
13: ('BGC0000003', 'AB179766')
['AF-toxin']
14: ('BGC0001359', 'LN997801')




['PM100117', 'PM100118']
15: ('BGC0001258', 'AF549411')
['1,8-dihydroxynaphthalene']
16: ('BGC0000035', 'AY509120')
['chalcomycin']
17: ('BGC0000121', 'KC145148')
['pestheic acid']
18: ('BGC0000141', 'AJ871581')
['rubradirin']
19: ('BGC0001101', 'AF484556')
['leinamycin']
20: ('BGC0000148', 'AY007564')
['spinosad']
21: ('BGC0000029', 'FJ872523')
['BE-14106']
22: ('BGC0001119', 'HF563079.1')
['divergolide A', 'divergolide B', 'divergolide C', 'divergolide D']
23: ('BGC0000979', 'GQ385959')
['cylindrospermopsin']
24: ('BGC0000159', 'EF990140')
['tautomycin']
25: ('BGC0001300', 'KT368179')
['anthracimycin']
26: ('BGC0001033', 'CP003355')
['paenilamicin']
27: ('BGC0001236', 'KT282101')
['thiotetroamide']
28: ('BGC0001064', 'JX477167')
['Cylindrocyclophane D', 'Cylindrocyclophane E', 'Cylindrocyclophane F']
29: ('BGC0000039', 'AB072893')
['compactin']
30: ('BGC0000064', 'AY604568')
['fusarin']
31: ('BGC0000043', 'GQ412749')
['curacin A']
32: ('BGC0001051', 'GQ981380')
['Thuggacin A']
33: (



39: ('BGC0001013', 'DL080157')
['meridamycin']
40: ('BGC0000980', 'GQ385960')
['cylindrospermopsin']
41: ('BGC0000020', 'AF453501')
['ansamitocin']
42: ('BGC0000155', 'FJ943499')
['T-toxin']
43: ('BGC0001021', 'AL123456')
['mycobactin']
44: ('BGC0000011', 'AY092402')
['aflatoxin', 'sterigmatocystin']




45: ('BGC0000054', 'AY623658')
['erythromycin']
46: ('BGC0000031', 'AJ580915')
['Borrelidin']
47: ('BGC0001011', 'DQ351275')
['meridamycin']
48: ('BGC0001161', 'KF550301')
['1-heptadecene']
49: ('BGC0001218', 'KP737857')
['fumosorinone']
50: ('BGC0001055', 'AM236324')
['yersiniabactin']
51: ('BGC0000072', 'KM361622')
['gulmirecin A']
52: ('BGC0000137', 'CP000850')
['rifamycin']
53: ('BGC0000995', 'HM047288')
['FR901464']
54: ('BGC0000017', 'FJ477836')
['Anatoxin-a', 'Homoanatoxin-a']
55: ('BGC0001195', 'KP339942')
['nocardiopsin']
56: ('BGC0000084', 'JQ793783')
['laidlomycin']
57: ('BGC0000972', 'AM229678')
['N-myristoyl-D-asparagine',
 'cis-7-tetradecenoyl-D-asparagine',
 '(R)-N1-((S)-5-oxohexan-2-yl)-2-tetradecanamidosuccinamide']
58: ('BGC0001390', 'LC125467')
['LL-Z1272beta']
59: ('BGC0001017', 'AF183408.1')
['microcystin LR']
60: ('BGC0001262', 'AB872924')
['AK-toxin']
61: ('BGC0001212', 'KT067736')
['nannocystin a']
62: ('BGC0000380', 'HM639990')
['Leupyrrin']
63: ('BGC0000118',



75: ('BGC0001144', 'GG698521')
['neosartoricin']
76: ('BGC0000004', 'AB196490')
['aflatoxin']
77: ('BGC0000977', 'AB506492')
['cyclopiazonic acid']
78: ('BGC0000988', 'AF210843')
['epothilone']
79: ('BGC0000086', 'AB449340')
['lasalocid']
80: ('BGC0000100', 'AF440781')
['monensin']
81: ('BGC0000007', 'AY510452')
['aflatoxin']
82: ('BGC0000142', 'CP000667')
['salinilactam']
83: ('BGC0000008', 'AY510453')
['aflatoxin']
84: ('BGC0000146', 'AB514562')
['solanapyrone']
85: ('BGC0000079', 'FJ545274')
['indanomycin']
86: ('BGC0000068', 'DQ914285')
['geldanamycin']
87: ('BGC0000134', 'EU520419')
['radicicol']
88: ('BGC0000129', 'CM000174')
['pyripyropene']
89: ('BGC0001287', 'LN831790')
['chaxamycin A', 'chaxamycin B', 'chaxamycin C', 'chaxamycin D']
90: ('BGC0001022', 'AF319998')
['myxalamid']
91: ('BGC0000108', 'HQ386234')
['natamycin']
92: ('BGC0000954', 'AM946600')
['Ajudazol']
93: ('BGC0001066', 'AM992894')
['Kendomycin']
94: ('BGC0000058', 'AB469193')
['FD-891']
95: ('BGC0001355', 'LN879



100: ('BGC0001203', 'KP830094')
['clarexpoxcin']
101: ('BGC0000098', 'DQ176595')
['monacolin K']
102: ('BGC0000095', 'KC608848')
['micromonolactam']
103: ('BGC0001245', 'KJ434938')
['lasiodiplodin']
104: ('BGC0001304', 'EQ963475')
['aflavarin']
105: ('BGC0000102', 'AB089954')
['mycinamicin']
106: ('BGC0000002', 'CP007155')
['aculeximycin']




107: ('BGC0000090', 'EU827593')
['Macbecin I', 'Macbecin II']
108: ('BGC0000076', 'EU520417')
['hypothemycin']
109: ('BGC0001394', 'CP004025')
['phenalamide']
110: ('BGC0000005', 'AF452809')
['aflatoxin']
111: ('BGC0001054', 'FN667742')
['Xenocoumacin 1', 'Xenocoumacin II']
Missing subunit: "XNC1_1704"
['Ketosynthase', 'Acyltransferase', 'Ketoreductase', 'Thiolation (ACP/PCP)']
dict_keys(['CBJ89760.1', 'CBJ89766.1', 'CBJ89764.1'])
112: ('BGC0001360', 'HG792019')
['mycophenolic acid']
113: ('BGC0000037', 'BN001301')
['cichorine']
114: ('BGC0001169', 'KF874660.1')
['piericidin A1']
115: ('BGC0000105', 'AF521085')
['nanchangmycin']
116: ('BGC0000042', 'AB818354')
['cremimycin']
117: ('BGC0000171', 'FR878059')
['9-methylstreptimidone']
118: ('BGC0001031', 'JX315603')
['oocydin A']
119: ('BGC0001027', 'AP006618')
['nocobactin NA']
120: ('BGC0001270', 'Y15279')
['gibberellin A3',
 'gibberellin A1',
 'gibberellin A7',
 'gibberellin A4',
 'gibberellin A9']
121: ('BGC0001373', 'LC079035')
['sor



124: ('BGC0001246', 'KJ434939')
['trans-resorcylide']
125: ('BGC0000077', 'EU520418')
['hypothemycin']
126: ('BGC0001281', 'NC_026500')
['ustilagic acid']
127: ('BGC0000135', 'AB568601')
['reveromycin']
128: ('BGC0001032', 'JX315604')
['oocydin A']
129: ('BGC0000177', 'GQ274953')
['iso-migrastatin',
 'migrastatin',
 'Dorrigocin A',
 'Dorrigocin B',
 '13-epi-Dorrigocin A']
130: ('BGC0001005', 'KF866134')
['locillomycin']
131: ('BGC0001253', 'AB444613')
['ACT-Toxin I']
132: ('BGC0001383', 'KF815729')
['macrolactin 1a', 'macrolactin 1b', 'macrolactin 1c', 'macrolactin 3a']
133: ('BGC0001125', 'KM078884')
['puwainaphycin A', 'puwainaphycin B', 'puwainaphycin C', 'puwainaphycin D']
134: ('BGC0000052', 'AY899214')
['ECO-02301']
135: ('BGC0000162', 'EU443633')
['tetrocarcin A']
136: ('BGC0001070', 'AM746336')
['kirromycin']




137: ('BGC0000981', 'GQ385961')
['cylindrospermopsin']
138: ('BGC0001276', 'D85860')
['6-methylsalicyclic acid']
139: ('BGC0000117', 'AB070940')
['oligomycin']
140: ('BGC0000099', 'HM070047')
['monascorubrin']
141: ('BGC0001045', 'JQ045344')
['Spiruchostatin A']
142: ('BGC0001358', 'LSBH01000002')
['leucinostatin A', 'leucinostatin B']
143: ('BGC0000143', 'DQ630728')
['salinomycin']
144: ('BGC0000082', 'EU301739')
['kijanimicin']
145: ('BGC0001403', 'CM000172')
['trypacidin']
146: ('BGC0001349', 'KP742963')
['heronamide A',
 'heronamide B',
 'heronamide C',
 'heronamide D',
 'heronamide E',
 'heronamide F']
147: ('BGC0000132', 'EF140903')
['pyrrolomycin']
148: ('BGC0001381', 'KP161205')
['brasilinolide A', 'brasilinolide B', 'brasilinolide C']
149: ('BGC0001159', 'JX157625')
['marinopyrrole A',
 'marynopyrrole E',
 'marynopyrrole D',
 'marynopyrrole C',
 'marinopyrrole B']
150: ('BGC0000073', 'AB241068')
['halstoctacosanolide']
151: ('BGC0001266', 'GU930713')
['grayanic acid']
152: ('B



185: ('BGC0000955', 'FR831800')
['Althiomycin']
186: ('BGC0000110', 'AM778535')
['Neoaureothin', 'orinocin', 'SNF4435C', 'SNF4435D']
187: ('BGC0000040', 'DQ149987')
['concanamycin A']
188: ('BGC0001065', 'JN671974')
['herboxidiene']
189: ('BGC0001053', 'AJ620477')
['tubulysin']
190: ('BGC0000173', 'JX173632')
['bongkrekic acid', 'isobongkrekic acid']
191: ('BGC0000125', 'AJ278573')
['pimaricin']
192: ('BGC0001273', 'AY540947')
['asperlactone']
193: ('BGC0000978', 'EU140798')
['cylindrospermopsin']
194: ('BGC0000963', 'AF210249')
['bleomycin']
195: ('BGC0000973', 'HE575208')
['collismycin A']
196: ('BGC0000062', 'AF155773')
['fumonisin']
197: ('BGC0001265', 'AB176546')
['melanin']
198: ('BGC0001034', 'HE616533')
['Pellasoren']
199: ('BGC0000160', 'CH476602')
['terreic acid']
200: ('BGC0000140', 'GQ332353')
['RK-682']
201: ('BGC0001216', 'KP719128')
['splenocin']
202: ('BGC0000127', 'AF081920')
['pyoluteorin']
203: ('BGC0000001', 'JF752342')
['Abyssomicin C', 'Atrop-abyssomicin C']
204: 



215: ('BGC0000128', 'AY394844')
['pyoluteorin']
216: ('BGC0001378', 'KT716443')
['tiancimycin']
217: ('BGC0000059', 'BA000030')
['filipin']
218: ('BGC0000984', 'CP000884')
['delftibactin A', 'delftibactin B']
219: ('BGC0001264', 'LC011911')
['betaenone A', 'betaenone B', 'betaenone C']
220: ('BGC0001010', 'AJ557546')
['Melithiazol A']
221: ('BGC0001252', 'AB725683')
['ACR-toxin']
222: ('BGC0001202', 'KP830093')
['landepoxcin']
223: ('BGC0001163', 'GL890825')
['1-heptadecene']
224: ('BGC0000014', 'DQ897667')
['ambruticin']
225: ('BGC0000012', 'AB120221')
['alternapyrone']
226: ('BGC0000018', 'EU220288')
['angolamycin']
227: ('BGC0000163', 'FJ462704')
['tetronasin']
228: ('BGC0001401', 'JF739169')
['melleolide F', "6'-Chloromelleolide F", "6'-Bromomelleolide F"]
229: ('BGC0001257', 'AF151533')
['1,3,6,8-tetrahydroxynaphthalene']
230: ('BGC0001400', 'CH476608')
['citreoviridin']
231: ('BGC0000055', 'AM420293')
['Erythromycin A', 'Erythromycin B', 'Erythromycin C', 'Erythromycin D']
232: (



233: ('BGC0001001', 'AY522504')
['Jamaicamide', 'Jamaicamide b', 'jamaicamide c']
234: ('BGC0000122', 'KF739396')
['phenylnannolone A', 'phenylnannolone B', 'phenylnannolone C']
235: ('BGC0001183', 'JX306680')
['lobophorin']
236: ('BGC0000106', 'GQ452266')
['naphthomycin']
237: ('BGC0001367', 'CP001804')
['haliamide']
238: ('BGC0001296', 'LC061217')
['streptazone E']
239: ('BGC0001405', 'CH408033')
['chaetoviridin', 'chaetomugilin']
240: ('BGC0001012', 'DQ885223')
['meridamycin']
241: ('BGC0001293', 'KT327068')
['cyclizidine']
242: ('BGC0000131', 'EF140902')
['pyrrolomycin']
243: ('BGC0000047', 'AY118081')
['dihydrochalcomycin']
244: ('BGC0000063', 'EU449979')
['fumonisin']
245: ('BGC0000993', 'EF210776')
['FK228']
246: ('BGC0000952', 'FR681999.1')
['pristinamycin']
247: ('BGC0000078', 'AB767280')
['incednine']




248: ('BGC0001044', 'AY553235')
['sirodesmin']
249: ('BGC0000030', 'HF679027')
['bikaverin']
250: ('BGC0001298', 'KF683117')
['4-Z-annimycin']
251: ('BGC0000147', 'U24241')
['Soraphen']
252: ('BGC0000061', 'AY310323')
['FR-008']
253: ('BGC0001272', 'EU086466')
['elsinochrome B', 'elsinochrome C']
254: ('BGC0000022', 'AACD01000015')
['asperfuranone']
255: ('BGC0001102', 'CP000085')
['Malleilactone']
256: ('BGC0001268', 'JX308619')
['fusarin']
257: ('BGC0000982', 'AY834753')
['cystothiazole A']
258: ('BGC0000021', 'JF819834')
['apoptolidin']




259: ('BGC0001230', 'BK009377')
['Salinamide A',
 'Salinamide B',
 'Salinamide C',
 'Salinamide D',
 'Salinamide E',
 'Salinamide F',
 'Desmethylsalinamide C',
 'Desmethylsalinamide E']
260: ('BGC0000115', 'AF263912')
['nystatin A1']
261: ('BGC0001024', 'AF188287')
['Myxothiazol']
262: ('BGC0001269', 'AM886292')
['gibberellin A3',
 'gibberellin A1',
 'gibberellin A7',
 'gibberellin A4',
 'gibberellin A9']
263: ('BGC0001038', 'JX424761')
['pyralomicin 1a']
264: ('BGC0000383', 'BX470251')
['Luminmycin']
265: ('BGC0001247', 'AY649543')
['cercosporin']
266: ('BGC0001057', 'DQ019316')
['zearalenone']
267: ('BGC0001058', 'EU670723')
['zorbamycin']
Missing subunit: "ACG60781.1"
['KS', 'AT', 'KR', 'T']
dict_keys(['ACG60777.1', 'ACG60770.1', 'ACG60783.1'])
268: ('BGC0000089', 'AF141925')
['lovastatin']
269: ('BGC0001060', 'EF552206')
['dynemicin']




270: ('BGC0000997', 'AM412319')
['glidobactin']
271: ('BGC0000025', 'AB032523')
['avermectin']
272: ('BGC0000153', 'AJ421825')
['Stigmatellin A']
273: ('BGC0000989', 'EU414841')
['epothilone']
274: ('BGC0000048', 'DQ149246')
['dothistromin']
275: ('BGC0000994', 'AF235504')
['FK520']
276: ('BGC0000060', 'HQ434551')
['fostriecin']
277: ('BGC0001099', 'GU479979')
['Kalimantacin A']
278: ('BGC0001162', 'KF550302')
['1-heptadecene']
279: ('BGC0000346', 'KF647219')
['epoxomicin']
280: ('BGC0000091', 'KF711829')
['marineosin']
281: ('BGC0001052', 'GU385216')
['tirandamycin']
282: ('BGC0001020', 'HQ257512')
['muraymycin']
283: ('BGC0000186', 'CP000085')
['thailandamide A', 'thailandamide B', 'thailandamide lactone']
284: ('BGC0001018', 'AB481215')
['micropeptin K139']
285: ('BGC0001352', 'LN879414')
['Tu 3010']
286: ('BGC0001120', 'CP000086')
['burkholderic acid']
Missing subunit: "ABC36203.1"
dict_keys([])
287: ('BGC0001244', 'KM365454')
['(-)-Mellein']
288: ('BGC0001030', 'AY557343')
['ochra



289: ('BGC0000138', 'AY442225')
['rimocidin']
290: ('BGC0000165', 'HQ011923.1')
['tiacumicin B']
291: ('BGC0000123', 'AY354515')
['phoslactomycin B']
292: ('BGC0001063', 'AL645882')
['undecylprodigiosin']
293: ('BGC0001009', 'AJ557546')
['melithiazol']
294: ('BGC0001048', 'EF032505')
['tallysomycin A']
295: ('BGC0000149', 'AM407731')
['Spirangien A1']
296: ('BGC0000961', 'CP000085')
['bactobolin']
297: ('BGC0000096', 'BD420675')
['midecamycin']
298: ('BGC0000957', 'KF813023')
['ansatrienin (mycotrienin)']
299: ('BGC0001036', 'KF386858')
['polyoxypeptin']
300: ('BGC0001357', 'KT826756')
['carbamidocyclophane A',
 'carbamidocyclophane B',
 'carbamidocyclophane C',
 'carbamidocyclophane D',
 'carbamidocyclophane E',
 'carbamidocyclophane F',
 'carbamidocyclophane H',
 'carbamidocyclophane M',
 'carbamidocyclophane N',
 'carbamidocyclophane O',
 'carbamidocyclophane P',
 'carbamidocyclophane Q',
 'carbamidocyclophane R',
 'carbamidocyclophane S',
 'carbamidocyclophane T',
 'carbamidocyclop



309: ('BGC0000032', 'HM452329')
['calcimycin']
310: ('BGC0000085', 'AB088224')
['lankamycin']
311: ('BGC0000109', 'AB363939')
['nemadectin']
312: ('BGC0001259', 'EU086466')
['elsinochrome A']
313: ('BGC0000114', 'DQ354110')
['nigericin']
314: ('BGC0001014', 'AL646052')
['micacocidin']
315: ('BGC0001046', 'FN433113')
['Streptolydigin']
316: ('BGC0000010', 'AY510455')
['aflatoxin']




317: ('BGC0000038', 'AL645882')
['coelimycin P1']
318: ('BGC0000033', 'AF497482')
['calicheamicin']
319: ('BGC0000080', 'DQ897668')
['jerangolid A', 'jerangolid D']
320: ('BGC0000019', 'EU232693')
['angolamycin']
321: ('BGC0000144', 'JN033543')
['salinomycin']
322: ('BGC0001288', 'LC021382')
['maklamicin']
323: ('BGC0001019', 'KF657738')
['Microsclerodermin M']
324: ('BGC0000345', 'KF647220')
['eponemycin']
325: ('BGC0000066', 'AY179507')
['geldanamycin']
326: ('BGC0001275', 'AY941322')
['6-methylsalicyclic acid']
327: ('BGC0000045', 'JX971534')
['dehydrocurvularin']
328: ('BGC0000087', 'FM173265')
['lasalocid']
329: ('BGC0000962', 'AF516145')
['barbamide']
330: ('BGC0001271', 'BN001308')
['emericellin']
331: ('BGC0000113', 'AF016585')
['niddamycin']
332: ('BGC0000145', 'EF397502')
['salinosporamide A']
333: ('BGC0001071', 'GQ979609')
['nosperin']
334: ('BGC0000991', 'AF217189')
['epothilone']




335: ('BGC0001072', 'AF324838.2')
['Simocyclinone D8']
336: ('BGC0000067', 'DQ249341')
['geldanamycin']
337: ('BGC0001039', 'HM436809')
['pyridomycin']
338: ('BGC0000157', 'DQ983361')
['tautomycetin']
339: ('BGC0001385', 'KU928136')
['byssochlamic acid', 'agnestadrides']
340: ('BGC0000051', 'KC894072')
['ebelactone']
341: ('BGC0000960', 'EU240558')
['azinomycin B']
342: ('BGC0000107', 'BN001302')
['naphthopyrone']
343: ('BGC0000041', 'AF098795')
['coronafacic acid']
344: ('BGC0000999', 'EF028635')
['heat-stable antifungal factor']
345: ('BGC0001404', 'AM920436')
['sorbicillin']
346: ('BGC0001377', 'KT762610')
['uncialamycin']
347: ('BGC0000393', 'CP000113')
['Myxoprincomide-c506']
348: ('BGC0001026', 'GQ176852')
['NG-391']
349: ('BGC0001280', 'LC011911')
['betaenone C', 'betaenone A']
350: ('BGC0001242', 'HE613440')
['fusarubin']
351: ('BGC0000074', 'AY947889')
['herbimycin']


# Manually reorder clusters

In [12]:
# BE-14106
cluster = pks.models.Cluster.objects.get(mibigAccession='BGC0000029.1')
cluster.reorderSubunits(['becB', 'becD', 'becE', 'becF', 'becG'])
pks.models.setCyclization(cluster, False)
# Cremimycin
cluster = pks.models.Cluster.objects.get(mibigAccession='BGC0000042.1')
cluster.reorderSubunits(['cmiP7', 'cmiP8', 'cmiP1', 'cmiP4', 'cmiP5', 'cmiP6'])
# Meilingmycin
cluster = pks.models.Cluster.objects.get(mibigAccession='BGC0000093.1')
cluster.deleteSubunit('pks1')
cluster.deleteSubunit('pks2')
# ML-449
cluster = pks.models.Cluster.objects.get(mibigAccession='BGC0000097.1')
cluster.reorderSubunits(['mlaB', 'mlaD', 'mlaE', 'mlaF', 'mlaG'])
pks.models.setCyclization(cluster, False)
# Nystatin
cluster = pks.models.Cluster.objects.get(mibigAccession='BGC0000115.1')
cluster.reorderSubunits(['nysA', 'nysB', 'nysC', 'nysI', 'nysJ', 'nysK'])
# Tiacumicin
cluster = pks.models.Cluster.objects.get(mibigAccession='BGC0000165.1')
cluster.deleteSubunit('tiaB')
# Simocyclinone
cluster = pks.models.Cluster.objects.get(mibigAccession='BGC0001072.1')
cluster.reorderSubunits(['simC1A', 'simC1B', 'simC1C'])
# Brasilinolide
cluster = pks.models.Cluster.objects.get(mibigAccession='BGC0001381.1')
cluster.reorderSubunits(['nbrI', 'nbrJ', 'nbrK', 'nbrL', 'nbrG', 'nbrH'])

<QuerySet [<Subunit: nbrI pks subunit>, <Subunit: nbrJ pks subunit>, <Subunit: nbrK pks subunit>, <Subunit: nbrL pks subunit>, <Subunit: nbrG pks subunit>, <Subunit: nbrH pks subunit>]>

# Testing cluster chemical operations

In [13]:
acc = 'BGC0000029.1'
cluster = pks.models.Cluster.objects.get(mibigAccession=acc)
pprint(cluster.architecture())
pprint(cluster.computeProduct())

[[<Subunit: becB pks subunit>,
  [[<Module: pks module 12539>,
    <QuerySet [<KS: domain>, <AT: substrate mal, loading True>, <KR: type U>, <ACP: domain>]>],
   [<Module: pks module 12540>,
    <QuerySet [<KS: domain>, <AT: substrate mmal, loading False>, <KR: type B1>, <ACP: domain>]>]]],
 [<Subunit: becD pks subunit>,
  [[<Module: pks module 12537>,
    <QuerySet [<KS: domain>, <AT: substrate mal, loading False>, <KR: type C2>, <ACP: domain>]>],
   [<Module: pks module 12538>,
    <QuerySet [<KS: domain>, <AT: substrate mal, loading False>, <KR: type U>, <ACP: domain>]>]]],
 [<Subunit: becE pks subunit>,
  [[<Module: pks module 12533>,
    <QuerySet [<KS: domain>, <AT: substrate mal, loading False>, <KR: type B1>, <ACP: domain>]>]]],
 [<Subunit: becF pks subunit>,
  [[<Module: pks module 12534>,
    <QuerySet [<KS: domain>, <AT: substrate mmal, loading False>, <KR: type B1>, <ACP: domain>]>],
   [<Module: pks module 12535>,
    <QuerySet [<KS: domain>, <AT: substrate mal, loading Fa

# Validate and then delete clusters lacking valid products

In [14]:
# Delete clusters with no computable product
for cluster in pks.models.Cluster.objects.all():
    try:
        cluster.computeProduct()
    except:
        acc = cluster.mibigAccession
        print('%s: %s' %(acc, cluster.description))
        cluster.delete()
#        cluster = pks.models.Cluster.objects.get(mibigAccession=acc)
#        cluster.computeProduct()

BGC0001011.1: Meridamycin
BGC0001212.1: Nannocystin
BGC0000954.1: Ajudazol
BGC0000164.1: Tetronomycin
BGC0001070.1: Kirromycin
BGC0001344.1: Tubulysin
BGC0000974.1: Crocacin
BGC0001059.1: Zwittermycin A
BGC0001034.1: Pellasoren
BGC0001010.1: Melithiazol
BGC0000163.1: Tetronasin
BGC0001293.1: Cyclizidine
BGC0000982.1: Cystothiazole A
BGC0000021.1: Apoptolidin
BGC0001024.1: Myxothiazol
BGC0000153.1: Stigmatellin


# Validate and then delete clusters with less than three modules

In [15]:
# Delete clusters with no modules
for cluster in pks.models.Cluster.objects.all():
    for subunit in cluster.subunits():
        if len(subunit.modules()) == 0:
            subunit.delete()
    if len(cluster.subunits()) == 0:
        cluster.delete()
        
# Delete clusters with less than three modules
for cluster in pks.models.Cluster.objects.all():
    nmodules = 0
    for subunit in cluster.subunits():
        nmodules += len(subunit.modules())
    if nmodules < 3:
        cluster.delete()

# Compare Django object clusters with IPython Notebook object clusters

In [16]:
notebook_avail = ['LC021382.1', 'AL645882.2', 'JN671974.1', 'AB089954.2', 'AY834753.1', 'AF453501.1', 'FN547928.1', 'HQ386234.1', 'FJ872523.1', 'HQ434551.1', 'HM452329.1', 'AF217189.1', 'JX504844.1', 'AM179409.1', 'AF016585.1', 'JF819834.1', 'KU245058.1', 'AJ557546.1', 'LC061217.1', 'AJ441056.1', 'AY354515.1', 'AB568601.1', 'KT209587.1', 'AF079138.1', 'AM420293.1', 'FJ545274.1', 'AY007564.1', 'HM639990.1', 'CP007155.1', 'AB449340.1', 'KU568466.1', 'AB767280.1', 'AF521085.1', 'EU035755.1', 'EF990140.1', 'GQ981381.1', 'EU108007.1', 'JX545234.1', 'KF585133.1', 'KT327068.1', 'X86780.1', 'JQ793783.1', 'AY623658.2', 'AJ871581.1', 'AB818354.1', 'AF319998.1', 'BA000030.3', 'EU220288.1', 'DQ885223.1', 'AF440781.1', 'AF188287.1', 'DQ351275.1', 'AB086653.1', 'AF263912.1', 'KC894072.1', 'KT067736.1', 'AJ421825.1', 'AB435553.1', 'DQ176871.1', 'U78289.1', 'AB241068.1', 'FJ809786.1', 'AM850130.1', 'KM361622.1', 'EU414841.1', 'BX649209.1', 'FJ872525.1', 'AJ278573.1', 'AY179507.1', 'AY509120.1', 'AF324838.2', 'AB088224.2', 'GQ385961.1', 'GQ452266.1', 'FN433113.1', 'AY118081.2', 'KC013978.1', 'BN001209.1', 'DQ897668.1', 'FJ952082.1', 'AF040570.3', 'DQ897667.1', 'AB469193.1', 'GU063811.1', 'AF098795.1', 'AY310323.2', 'AY310323.2', 'AB363939.1', 'LN997801.1', 'JN033543.1', 'FM173265.1', 'KF479198.1', 'EU827593.1', 'AM407731.1', 'AJ557546.1', 'JF752342.1', 'AF357202.1', 'AJ580915.1', 'GQ981380.1', 'HQ011923.1', 'AF263245.1', 'AM778535.1', 'AF235504.1', 'LN831790.1', 'AY947889.1', 'EU140798.1', 'KP161205.1', 'U24241.2', 'AF210843.1']
notebook_avail = set(notebook_avail)

In [17]:
django_avail = []
for cluster in pks.models.Cluster.objects.all():
    django_avail.append(cluster.genbankAccession)
django_avail = set(django_avail)

In [18]:
only_notebook = notebook_avail.difference(django_avail)
only_django = django_avail.difference(notebook_avail)
print(only_notebook)
print(only_django)

{'DQ351275.1', 'KT327068.1', 'AY834753.1', 'AJ421825.1', 'AJ557546.1', 'KT067736.1', 'JF819834.1', 'AF188287.1', 'FN547928.1', 'AL645882.2'}
{'KF739396.1', 'AM988861.1', 'HF563079.1'}


# Compare Django object clusters with available final structures

In [19]:
django_avail = []
for cluster in pks.models.Cluster.objects.all():
    django_avail.append(cluster.mibigAccession.split('.')[0])
django_avail = set(django_avail)
struct_avail = set(pickle.load(open('./mibig_rdkitmol_finalstructs.p', 'rb')).keys())

print(django_avail.difference(struct_avail))

set()


# Check sequence queries

In [20]:
from Bio.Seq import Seq
from Bio.Alphabet import generic_dna

acc = 'BGC0000055.1'
cluster = pks.models.Cluster.objects.get(mibigAccession=acc)
subunit = cluster.subunits()[0]
translated= Seq(subunit.getNucleotideSequence(), generic_dna).translate()
# translated = Seq(subunit.getNucleotideSequence(), generic_dna).reverse_complement().translate()
direct = subunit.getAminoAcidSequence()

n = 10
print(translated[:n])
print(direct[:n])
print(translated[len(translated)-n-1:])
print(direct[len(direct)-n:])

print(len(translated))
print(len(direct))


module = subunit.modules()[0]
for domain in module.domains():
    print(domain)
    translated = Seq(domain.getNucleotideSequence(), generic_dna).translate()
    # translated = Seq(domain.getNucleotideSequence(), generic_dna).reverse_complement().translate()
    direct = domain.getAminoAcidSequence()

    n = 10
    print(translated[:n])
    print(direct[:n])
    print(translated[len(translated)-n:])
    print(direct[len(direct)-n:])

    print(len(translated))
    print(len(direct))

VADLSKLSDS
MADLSKLSDS
EALGRELDGD*
EALGRELDGD
3546
3545
substrate prop, loading True
VFPGQGAQWA
VFPGQGAQWA
AAVVPTLQRG
AAVVPTLQRG
294
294
domain
AVVNGETAAL
AVVNGETAAL
PRALAEALAA
PRALAEALAA
66
66


# Converting cluster to JSON file

In [21]:
acc = 'BGC0000029.1'
cluster = pks.models.Cluster.objects.get(mibigAccession=acc)
pprint(pks.models.clusterDict(cluster))
pks.models.clusterJSON('./corrections', cluster)

{'description': 'BE-14106',
 'genbankAccession': 'FJ872523.1',
 'mibigAccession': 'BGC0000029.1',
 'architecture': {'becB': {0: {'AT': OrderedDict([('substrate', 'mal')]),
                               'KR': {'active': False,
                                      'type': 'U'}},
                           1: {'AT': OrderedDict([('substrate', 'mmal')]),
                               'KR': {'active': True,
                                      'type': 'B1'}}},
                  'becD': {0: {'AT': OrderedDict([('substrate', 'mal')]),
                               'KR': {'active': True,
                                      'type': 'C2'}},
                           1: {'AT': OrderedDict([('substrate', 'mal')]),
                               'KR': {'active': True,
                                      'type': 'U'}}},
                  'becE': {0: {'AT': OrderedDict([('substrate', 'mal')]),
                               'KR': {'active': True,
                                      'type'

In [22]:
pks.models.correctCluster('corrections/BGC0000029.1.json')

In [23]:
def correctCluster(filepath):
    '''Function that takes as input a JSON file representing the changeable parameters
       describing a PKS cluster and updates the database entry to reflect the information
       contained in the JSON file.
    '''
    corr = json.loads(open(filepath).read(), object_pairs_hook=OrderedDict)
    cluster = pks.models.Cluster.objects.get(mibigAccession=corr['mibigAccession'])
    # Reorder subunits if necessary
    newOrder = [str(x) for x in corr['architecture'].keys()]
    cluster.reorderSubunits(newOrder)
    # Change domain properties if necessary
    for s,sdict in corr['architecture'].items():
        for m,mdict in sdict.items():
            for d,ddict in mdict.items():
                if d == 'AT':
                    pks.models.setSpecificity(cluster, s, int(m), d, ddict['substrate'])
                elif d == 'KR':
                    pks.models.setSpecificity(cluster, s, int(m), d, ddict['type'])
                    pks.models.setActive(cluster, s, int(m), d, ddict['active'])
                elif d in ['DH', 'ER']:
                    pks.models.setActive(cluster, s, int(m), d, ddict['active'])
                else:
                    assert d == 'TE'
                    cyclic = ddict['cyclic']
    if cyclic:
        pks.models.setCyclization(cluster, cyclic)

correctCluster('corrections/BGC0000029.1.json')

In [31]:
acc = 'BGC0000029.1'
cluster = pks.models.Cluster.objects.get(mibigAccession=acc)
#pprint(cluster.architecture())
pprint(cluster.computeProduct())

for subunit in cluster.subunits():
    for module in subunit.modules():
        for domain in module.domains():
            try:
                print(type(domain), domain.active)
            except:
                pass

[<rdkit.Chem.rdchem.Mol object at 0x7fe5b8154ba8>,
 <rdkit.Chem.rdchem.Mol object at 0x7fe5b713d588>,
 <rdkit.Chem.rdchem.Mol object at 0x7fe5b713d5f8>,
 <rdkit.Chem.rdchem.Mol object at 0x7fe5b713d668>,
 <rdkit.Chem.rdchem.Mol object at 0x7fe5b713d6d8>,
 <rdkit.Chem.rdchem.Mol object at 0x7fe5b713d748>,
 <rdkit.Chem.rdchem.Mol object at 0x7fe5b713d7b8>,
 <rdkit.Chem.rdchem.Mol object at 0x7fe5b70ecdd8>]
<class 'pks.models.KR'> False
<class 'pks.models.KR'> True
<class 'pks.models.KR'> True
<class 'pks.models.KR'> True
<class 'pks.models.KR'> True
<class 'pks.models.KR'> True
<class 'pks.models.KR'> True
<class 'pks.models.KR'> True
