# Export DH sequences from ClusterCAD

Note: this notebook must be run inside a working ClusterCAD installation.

Tyler W. H. Backman

In [1]:
# Bootstrap Django so the ClusterCAD ORM models can be used from this notebook.
# The statements below are order-dependent: path, then settings, then setup, then models.
import os, sys
import re
# put the ClusterCAD project directory at the front of the import path
sys.path.insert(0, '/clusterCAD')
# select the ClusterCAD settings module unless DJANGO_SETTINGS_MODULE is already set
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "clusterCAD.settings")
import django
# initialise Django before the model module below is imported
django.setup()
import pks.models

In [2]:
from Bio import Seq, SeqRecord, SeqIO
from Bio.Alphabet import IUPAC
from Bio import pairwise2
import pandas as pd

Get all PKS modules from ClusterCAD

In [3]:
# every Module record available through the ClusterCAD ORM;
# the trailing len() is the cell output (total number of modules)
allModules = pks.models.Module.objects.all()
len(allModules)

737

Identify only modules that have the following properties:
* must contain the following domains: 'AT', 'DH', 'KR', 'ACP'
* KR must be active, i.e. one of the types 'A1', 'A2', 'A', 'B1', 'B2', 'B', 'U'
* module must be a non-loading module
* DH must be active

In [4]:
# Select the modules whose DH region will be exported.
# A module is kept only if it is non-loading, contains AT, DH, KR and ACP
# domains, has a KR of an active type, and has an active DH.

# domain types that every selected module must contain
requiredDomainTypes = {'AT', 'DH', 'KR', 'ACP'}
# KR type annotations that count as active
activeKRTypes = ('A1', 'A2', 'A', 'B1', 'B2', 'B', 'U')

myModules = []
for module in allModules:
    # require module to be non-loading (checked first: it needs no domain lookup)
    if module.loading:
        continue

    domains = module.domains()
    domainTypes = {repr(x) for x in domains}

    # require module to contain an AT, DH, KR, and ACP at least
    if not requiredDomainTypes <= domainTypes:
        continue

    # require KR to be active; the first KR found in the module is inspected
    thiskr = next(d for d in domains if repr(d) == 'KR')
    if thiskr.type not in activeKRTypes:
        continue

    # require DH to be active; the first DH found in the module is inspected
    thisdh = next(d for d in domains if repr(d) == 'DH')
    if not thisdh.active:
        continue

    myModules.append(module)

# sanity check: no module id was selected more than once
assert len({x.id for x in myModules}) == len(myModules), 'duplicate modules selected'
len(myModules)

330

Export DH sequences to a FASTA file. Each exported sequence runs from the residue immediately after the end of the AT to the residue immediately before the start of the domain that follows the DH (ER or KR).

In [5]:
# Write one FASTA record per selected module and collect per-module metadata.
# Each record holds the slice of the subunit amino-acid sequence taken between
# the AT domain's stop coordinate and the start of the domain following the DH.
# NOTE(review): IUPAC comes from Bio.Alphabet, which Biopython 1.78+ no longer
# ships -- confirm the Biopython version used by the ClusterCAD environment.
name = "DHs"
lengths = []      # length of every exported sequence, in export order
sequences = {}    # module.id -> exported amino-acid sequence
annotations = []  # one pandas Series of metadata per exported module

# labels for the entries of each annotation Series
annotationIndex = ['mibig_accession',
                   'subunit_name',
                   'module_number',
                   'at_substrate',
                   'length_aa_residues',
                   'intermediate_smiles',
                   'name_in_fasta']

with open(name + '.fasta', 'w') as fastaHandle:
    for module in myModules:

        # first AT and first DH of this module
        domains = module.domains()
        domainAT = [d for d in domains if repr(d) == 'AT'][0]
        domainDH = [d for d in domains if repr(d) == 'DH'][0]

        # the domain listed directly after the DH
        dhPosition = list(domains).index(domainDH)
        nextDomain = domains[dhPosition + 1]

        # the AT has to end before the post-DH domain begins
        assert domainAT.stop < nextDomain.start

        # cut the region between the two domains out of the subunit sequence
        subunitSequence = module.subunit.getAminoAcidSequence()
        sequence = subunitSequence[domainAT.stop:(nextDomain.start - 1)]
        lengths.append(len(sequence))

        # subunit name with all whitespace removed
        subunitName = re.sub(r'\s+', '', str(module.subunit.name))

        # record identifier: <name>_<accession>_<subunit>_mod<order>
        accession = module.subunit.cluster.mibigAccession
        moduleTitle = '_'.join((name, accession, subunitName)) + '_mod' + str(module.order)
        sequences[module.id] = sequence

        # append this module's record to the FASTA file
        record = SeqRecord.SeqRecord(
            Seq.Seq(sequence, IUPAC.protein),
            id=moduleTitle,
            name='',
            description='',
        )
        SeqIO.write(record, fastaHandle, "fasta")

        # metadata for this module, in the same order as annotationIndex
        annotationValues = [accession,
                            subunitName,
                            module.order,
                            domainAT.substrate,
                            len(sequence),
                            module.product.smiles,
                            moduleTitle]
        annotations.append(pd.Series(data = annotationValues,
                                     index = annotationIndex,
                                     copy = True))

Print summary statistics for the exported DH sequences

In [6]:
# Summary of the export. print() stringifies its arguments itself,
# so the values are passed directly.

# how many modules were scanned and how many passed the filters
print('Total modules in ClusterCAD:', len(allModules))
print('DH domains meeting export criteria:', len(myModules))

# extremes of the exported amino-acid sequence lengths
print('Longest DH AA sequence:', max(lengths))
print('Shortest DH AA sequence:', min(lengths))

# total exported residues, and three times that figure reported as DNA length
totalResidues = sum(lengths)
print('Sum of DH AA sequences:', totalResidues)
print('Total DNA sequence length:', totalResidues * 3)

Total modules in ClusterCAD: 737
DH domains meeting export criteria: 330
Longest DH AA sequence: 805
Shortest DH AA sequence: 287
Sum of DH AA sequences: 181361
Total DNA sequence length: 544083
