In [1]:
# Bootstrap Django outside of manage.py so the project's ORM models can be
# imported from this notebook. Statement order matters: the project path and
# the settings module have to be in place before django.setup() runs.
import os, sys
# NOTE(review): hard-coded absolute path; presumably the clusterCAD project
# root on the machine this notebook was run on — confirm before reuse.
sys.path.insert(0, '/clusterCAD')
# setdefault keeps an already-exported DJANGO_SETTINGS_MODULE untouched.
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "clusterCAD.settings")
import django
django.setup()

In [2]:
import pks.models
from model_utils.managers import InheritanceManager

Get all non-loading modules with a full reductive loop (DH, ER and KR domains)

In [79]:
# Start with an empty result list, then fetch every Module; the cell's final
# expression displays how many modules there are.
fullLoopModules = []
allModules = pks.models.Module.objects.all()
len(allModules)

1022

In [80]:
# Collect every non-loading module whose domain list contains more than two
# reductive domains (DH, ER, KR).
# The result list is rebuilt from scratch here so that re-running this cell
# does not append every matching module a second time.
reductiveTypes = ('DH', 'ER', 'KR')
fullLoopModules = []
for module in allModules:
    # Loading modules are never kept, so skip them before querying domains.
    if module.loading:
        continue
    # repr() of a domain is what gets compared against the short type names.
    domainTypes = [repr(x) for x in module.domains().select_subclasses()]
    reductiveDomainCount = sum(1 for domainType in domainTypes if domainType in reductiveTypes)
    if reductiveDomainCount > 2:
        fullLoopModules.append(module)

In [81]:
# Display how many modules were collected in fullLoopModules.
len(fullLoopModules)

135

In [69]:
# Spot-check the attributes used for the export on the first collected module.
# Only the value of the last expression (the MIBiG accession) is displayed.
firstModule = fullLoopModules[0]
firstModule.product.smiles
firstModule.order
firstModule.subunit.cluster.description
firstModule.subunit.cluster.mibigAccession


'BGC0000001.1'

In [82]:
# Write one tab-separated line per full-loop module to the file clusterSmiles:
# the product SMILES, then "<MIBiG accession>_module_<module order>".
# The context manager closes the file even if an attribute lookup or a write
# fails part-way through, which the manual open()/close() pair did not.
with open("clusterSmiles", "w") as smilesFile:
    for module in fullLoopModules:
        smilesFile.write("%s\t%s_module_%s\n" % (module.product.smiles, module.subunit.cluster.mibigAccession, module.order))

# Extract didomain sequences

In [26]:
from Bio import Seq, SeqRecord, SeqIO
from Bio.Alphabet import IUPAC

In [155]:
# Fetch the queryset of all modules again; the extraction cells below iterate over it.
allModules = pks.models.Module.objects.all()

In [7]:
# NOTE(review): ksATModules is not assigned in any cell visible in this
# notebook, so on a fresh kernel this raises NameError unless the name is
# defined elsewhere — restore the defining cell or remove this one.
len(ksATModules)

1010

In [159]:
# For each pair of domain types, find the non-loading modules in which the two
# types occur back to back in the module's domain list, and write the subunit
# sequence running from the start of the first domain to the stop of the
# second into "<type1><type2>.fasta".
didomainTypes = [['AT', 'DH'], ['AT', 'KR'], ['KR', 'ACP'], ['ACP', 'TE']]
for didomainType in didomainTypes:
    firstType, secondType = didomainType
    name = firstType + secondType
    myModules = []    # modules containing the adjacent pair
    myDidomains = []  # the matching (first domain, second domain), parallel to myModules
    for module in allModules:
        # Loading modules are excluded; skip them before querying domains.
        if module.loading:
            continue
        # Evaluate the queryset once and reuse the same objects for both the
        # type check and the coordinates.
        domains = list(module.domains().select_subclasses())
        domainTypes = [repr(x) for x in domains]
        for i in range(len(domains) - 1):
            if domainTypes[i] == firstType and domainTypes[i + 1] == secondType:
                # Keep the adjacent pair that actually matched rather than the
                # first domain of each type found anywhere in the module, and
                # stop at the first match so a module is written only once.
                myModules.append(module)
                myDidomains.append((domains[i], domains[i + 1]))
                break
    with open(name + '.fasta', 'w') as f:
        for module, (domain1, domain2) in zip(myModules, myDidomains):
            assert domain1.start < domain2.stop
            sequence = module.subunit.getAminoAcidSequence()
            # NOTE(review): the (start - 1):stop slice presumably converts
            # 1-based inclusive domain coordinates to a Python slice — confirm.
            sequence = sequence[(domain1.start - 1):domain2.stop]
            moduleTitle = name + '_' + module.subunit.cluster.mibigAccession + '_' + str(module.subunit.name) + '_mod' + str(module.order)
            # Re-join whitespace-separated pieces with underscores so the
            # FASTA id contains no spaces.
            moduleTitle = "_".join(moduleTitle.split())
            sseq = SeqRecord.SeqRecord(Seq.Seq(sequence, IUPAC.protein),
                                       id=moduleTitle,
                                       name='',
                                       description=''
                                      )
            SeqIO.write(sseq, f, "fasta")

# Extract all TEs

In [160]:
from pks.models import TE
# Queryset of every TE object; iterated when writing TE.fasta.
tes = TE.objects.all()

In [166]:
# Write every TE's amino-acid sequence to TE.fasta, one record per TE.
# The record id is "TE_<MIBiG accession>_<subunit name>_mod<module order>",
# with "_cyclic" appended when te.cyclic is truthy.
with open('TE.fasta', 'w') as f:
    for te in tes:
        teModule = te.module
        titleParts = [
            'TE_',
            teModule.subunit.cluster.mibigAccession,
            '_',
            str(teModule.subunit.name),
            '_mod',
            str(teModule.order),
        ]
        # Whitespace-separated pieces of the title are re-joined with underscores.
        recordId = "_".join("".join(titleParts).split())
        if te.cyclic:
            recordId += "_cyclic"
        record = SeqRecord.SeqRecord(
            Seq.Seq(te.getAminoAcidSequence(), IUPAC.protein),
            id=recordId,
            name='',
            description='',
        )
        SeqIO.write(record, f, "fasta")

# Get all active reductive loop structures

In [143]:
# Keep the modules whose domains at positions 1-5 are exactly
# AT, DH, ER, KR, ACP and which pass every additional filter below.
myModules = []
for module in allModules:
    domains = module.domains()
    domainTypes = [repr(x) for x in domains]
    # The checks are chained with `and` in their original order, so the later
    # attribute lookups only run once the earlier checks have passed.
    keepModule = (
        domainTypes[1:6] == ['AT', 'DH', 'ER', 'KR', 'ACP']
        and domains[1].substrate in ['mal', 'mmal']
        and domains[4].type in ['A1', 'A2', 'B1', 'B2', 'U']
        and not module.loading
        # DH, ER and KR (positions 2-4) must all be flagged active.
        and [domains[k].active for k in (2, 3, 4)] == [True, True, True]
    )
    if keepModule:
        myModules.append(module)

In [146]:
# For every selected module, write the subunit sequence lying between the AT
# domain (position 1) and the ACP domain (position 5) to DHERKR.fasta, and
# record the length of each written sequence in `lengths`.
name = "DHERKR"
lengths = []
with open(name + '.fasta', 'w') as f:
    for module in myModules:
        # Evaluate the queryset once; both lookups below then read the same
        # result list instead of each issuing its own query.
        domains = list(module.domains())
        domainAT = domains[1]
        domainACP = domains[5]
        assert domainAT.stop < domainACP.start
        sequence = module.subunit.getAminoAcidSequence()
        # NOTE(review): slice bounds presumably assume 1-based inclusive domain
        # coordinates, giving the residues strictly between AT and ACP — confirm.
        sequence = sequence[domainAT.stop:(domainACP.start - 1)]
        lengths.append(len(sequence))
        moduleTitle = name + '_' + module.subunit.cluster.mibigAccession + '_' + str(module.subunit.name) + '_mod' + str(module.order)
        # Re-join whitespace-separated pieces with underscores, as the didomain
        # and TE cells do, so a subunit name containing spaces cannot split
        # the FASTA id.
        moduleTitle = "_".join(moduleTitle.split())
        sseq = SeqRecord.SeqRecord(Seq.Seq(sequence, IUPAC.protein),
                                   id=moduleTitle,
                                   name='',
                                   description=''
                                  )
        SeqIO.write(sseq, f, "fasta")

In [154]:
# Summary: total module count, selected module count, longest and shortest
# extracted sequence, summed length, and summed length times three.
# NOTE(review): the factor of 3 presumably converts residues to nucleotides — confirm.
totalLength = sum(lengths)
[len(allModules), len(myModules), max(lengths), min(lengths), totalLength, totalLength * 3]

[1022, 120, 1252, 1032, 139086, 417258]

# Save list of all DH domains

In [64]:
from pks.models import DH
# Build one "<MIBiG accession>_mod<module order>" label per DH object and
# display how many labels were produced.
allDH = DH.objects.all()
allDHlabels = []
for dh in allDH:
    dhModule = dh.module
    allDHlabels.append(dhModule.subunit.cluster.mibigAccession + '_mod' + str(dhModule.order))
len(allDHlabels)

591

In [52]:
import pickle
# Persist the DH labels to allDH.p. The context manager flushes and closes the
# file as soon as the dump finishes instead of leaving the handle open until
# it happens to be garbage-collected.
with open("allDH.p", "wb") as labelFile:
    pickle.dump(allDHlabels, labelFile)

In [54]:
# Load the list of DH labels stored in productionDHs.p (this file is not
# produced by any cell visible in this notebook).
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load productionDHs.p if it comes from a trusted source.
with open("productionDHs.p", "rb") as productionFile:
    productDHs = pickle.load(productionFile)

In [65]:
# Labels that occur in productDHs but not among the labels built from this
# database, kept in productDHs order.
# Membership is tested against a set so each lookup is constant time instead
# of a scan over the whole allDHlabels list.
knownDHlabels = set(allDHlabels)
onlyInProduction = [domain for domain in productDHs if domain not in knownDHlabels]

In [66]:
# Display the labels that were found only in productDHs.
onlyInProduction

['BGC0001359.1_mod1',
 'BGC0000035.1_mod4',
 'BGC0000054.1_mod1',
 'BGC0000069.1_mod3',
 'BGC0000166.1_mod1',
 'BGC0001396.1_mod1',
 'BGC0001396.1_mod5',
 'BGC0000102.1_mod4',
 'BGC0000094.1_mod1',
 'BGC0000047.1_mod4',
 'BGC0000060.1_mod5',
 'BGC0000149.1_mod4',
 'BGC0000085.1_mod1',
 'BGC0001046.1_mod0',
 'BGC0000018.1_mod6']