In [2]:
from typing import Dict, List
import re
from itertools import product
from Bio.Seq import Seq
from dnachisel import *

# Codon tables from Cocoputs

Search for each codon table by GenBank taxonomy ID, then copy and paste the table into the cell below:
* P. putida = 160488
* E. coli = 469008
* C. glutamicum = 196627
* S. aureofaciens = 1894

In [3]:
cglut_table = '''
TTT     13.27   (37433) TCT     11.03   (31096) TAT      7.43   ( 20939)        TGT      2.25   ( 6354)
TTC     22.76   (64195) TCC     21.45   (60484) TAC     14.68   ( 41401)        TGC      3.96   (11158)
TTA      4.98   (14050) TCA      8.12   (22905) TAA      1.69   (  4776)        TGA      0.47   ( 1334)
TTG     19.46   (54889) TCG      7.51   (21166) TAG      0.93   (  2631)        TGG     13.92   (39253)

CTT     16.62   (46863) CCT     11.23   (31663) CAT      6.63   ( 18706)        CGT     13.74   (38735)
CTC     21.89   (61727) CCC      9.66   (27255) CAC     14.15   ( 39909)        CGC     25.14   (70885)
CTA      5.78   (16311) CCA     16.86   (47532) CAA     12.93   ( 36460)        CGA      6.50   (18321)
CTG     27.85   (78549) CCG     10.42   (29388) CAG     20.80   ( 58658)        CGG      4.78   (13479)

ATT     21.79   (61454) ACT     12.53   (35335) AAT     10.89   ( 30697)        AGT      4.81   (13562)
ATC     34.28   (96667) ACC     32.73   (92309) AAC     21.84   ( 61593)        AGC     10.25   (28916)
ATA      1.67   ( 4716) ACA      7.54   (21253) AAA     13.67   ( 38553)        AGA      2.11   ( 5938)
ATG     22.11   (62340) ACG      8.69   (24505) AAG     20.89   ( 58920)        AGG      3.11   ( 8768)

GTT     20.88   (58895) GCT     25.64   (72292) GAT     33.31   ( 93921)        GGT     24.38   (68750)
GTC     22.31   (62903) GCC     27.28   (76920) GAC     26.57   ( 74940)        GGC     34.27   (96654)
GTA      8.06   (22741) GCA     31.60   (89108) GAA     35.99   (101503)        GGA     15.08   (42529)
GTG     28.97   (81687) GCG     23.32   (65757) GAG     27.97   ( 78880)        GGG      6.55   (18465)
'''
ecoli_table = '''
TTT     22.24   (114170)        TCT      8.35   ( 42886)        TAT     15.96   ( 81936)        TGT      5.10   ( 26164)
TTC     16.60   ( 85242)        TCC      8.61   ( 44203)        TAC     12.16   ( 62447)        TGC      6.40   ( 32845)
TTA     13.73   ( 70493)        TCA      6.90   ( 35445)        TAA      2.05   ( 10523)        TGA      0.92   (  4703)
TTG     13.67   ( 70177)        TCG      8.90   ( 45694)        TAG      0.21   (  1063)        TGG     15.23   ( 78200)

CTT     10.96   ( 56257)        CCT      6.92   ( 35523)        CAT     12.72   ( 65279)        CGT     21.17   (108685)
CTC     11.12   ( 57113)        CCC      5.38   ( 27624)        CAC      9.65   ( 49544)        CGC     22.24   (114199)
CTA      3.83   ( 19687)        CCA      8.39   ( 43049)        CAA     15.27   ( 78398)        CGA      3.46   ( 17756)
CTG     53.63   (275324)        CCG     23.61   (121211)        CAG     28.92   (148464)        CGG      5.13   ( 26332)

ATT     30.63   (157252)        ACT      8.85   ( 45411)        AAT     17.31   ( 88890)        AGT      8.55   ( 43903)
ATC     25.32   (130003)        ACC     23.39   (120097)        AAC     21.53   (110550)        AGC     16.01   ( 82217)
ATA      4.06   ( 20845)        ACA      6.70   ( 34391)        AAA     33.57   (172330)        AGA      1.84   (  9450)
ATG     28.12   (144372)        ACG     14.40   ( 73923)        AAG     10.14   ( 52038)        AGG      1.01   (  5190)

GTT     18.38   ( 94356)        GCT     15.33   ( 78727)        GAT     32.02   (164381)        GGT     24.98   (128230)
GTC     15.24   ( 78233)        GCC     25.80   (132480)        GAC     19.04   ( 97771)        GGC     29.93   (153678)
GTA     10.99   ( 56425)        GCA     20.26   (104039)        GAA     39.89   (204803)        GGA      7.75   ( 39810)
GTG     26.56   (136348)        GCG     34.26   (175894)        GAG     17.73   ( 91034)        GGG     10.96   ( 56270)
'''
putida_table = '''
TTT      6.95   ( 12556)        TCT      2.31   (  4174)        TAT      6.93   (12523) TGT      1.83   ( 3302)
TTC     28.62   ( 51695)        TCC      8.36   ( 15101)        TAC     18.59   (33569) TGC      8.34   (15062)
TTA      1.31   (  2359)        TCA      2.72   (  4917)        TAA      0.65   ( 1176) TGA      2.03   ( 3663)
TTG     16.52   ( 29836)        TCG     13.82   ( 24968)        TAG      0.34   (  617) TGG     14.46   (26116)

CTT      6.79   ( 12271)        CCT      5.80   ( 10475)        CAT      7.84   (14161) CGT     12.79   (23096)
CTC     14.64   ( 26446)        CCC     11.29   ( 20382)        CAC     15.55   (28092) CGC     36.71   (66307)
CTA      2.47   (  4455)        CCA      6.85   ( 12375)        CAA     10.76   (19441) CGA      2.97   ( 5368)
CTG     75.38   (136143)        CCG     24.66   ( 44541)        CAG     36.06   (65119) CGG     10.03   (18107)

ATT      9.33   ( 16849)        ACT      4.36   (  7878)        AAT      6.19   (11184) AGT      5.13   ( 9265)
ATC     34.49   ( 62287)        ACC     31.75   ( 57351)        AAC     23.56   (42552) AGC     23.82   (43027)
ATA      1.91   (  3457)        ACA      3.25   (  5876)        AAA      9.50   (17155) AGA      0.95   ( 1719)
ATG     22.80   ( 41181)        ACG      8.34   ( 15070)        AAG     24.41   (44086) AGG      2.04   ( 3689)

GTT      6.69   ( 12082)        GCT     11.50   ( 20770)        GAT     15.96   (28832) GGT     16.21   (29271)
GTC     21.05   ( 38025)        GCC     58.57   (105774)        GAC     37.42   (67586) GGC     49.64   (89650)
GTA      7.38   ( 13335)        GCA     13.20   ( 23836)        GAA     27.40   (49479) GGA      2.54   ( 4584)
GTG     37.12   ( 67032)        GCG     28.12   ( 50783)        GAG     29.21   (52758) GGG     11.75   (21217)
'''
aurefaciens_table = '''
TTT      0.29   (  3461)        TCT      0.43   (   5184)       TAT      0.49   (  5813)        TGT      0.57   (  6811)
TTC     26.11   (312272)        TCC     18.34   ( 219311)       TAC     19.38   (231808)        TGC      7.27   ( 86912)
TTA      0.04   (   492)        TCA      0.83   (   9868)       TAA      0.13   (  1543)        TGA      2.41   ( 28861)
TTG      2.64   ( 31522)        TCG     13.45   ( 160834)       TAG      0.52   (  6249)        TGG     15.07   (180286)

CTT      1.33   ( 15958)        CCT      1.31   (  15608)       CAT      1.05   ( 12570)        CGT      4.08   ( 48796)
CTC     35.24   (421521)        CCC     23.09   ( 276209)       CAC     22.14   (264738)        CGC     37.60   (449717)
CTA      0.45   (  5413)        CCA      1.28   (  15367)       CAA      1.43   ( 17043)        CGA      2.64   ( 31526)
CTG     67.81   (810998)        CCG     38.43   ( 459612)       CAG     27.56   (329643)        CGG     32.52   (388958)

ATT      0.48   (  5777)        ACT      1.08   (  12964)       AAT      0.54   (  6505)        AGT      1.33   ( 15882)
ATC     28.45   (340310)        ACC     46.54   ( 556602)       AAC     17.18   (205475)        AGC     14.29   (170913)
ATA      0.41   (  4921)        ACA      1.07   (  12852)       AAA      0.62   (  7432)        AGA      0.44   (  5240)
ATG     14.59   (174527)        ACG     12.60   ( 150662)       AAG     17.74   (212153)        AGG      2.83   ( 33872)

GTT      1.41   ( 16866)        GCT      3.09   (  36913)       GAT      2.69   ( 32176)        GGT      7.48   ( 89509)
GTC     45.83   (548149)        GCC     87.87   (1050908)       GAC     54.55   (652444)        GGC     64.40   (770248)
GTA      1.58   ( 18914)        GCA      4.89   (  58464)       GAA      8.00   ( 95624)        GGA      5.57   ( 66597)
GTG     35.67   (426603)        GCG     45.40   ( 543026)       GAG     47.08   (563099)        GGG     18.34   (219290)
'''

# Code to convert cocoputs to dnachisel format

In [4]:
# Maps each one-letter amino-acid code ("*" stands for the stop signal) to the
# list of DNA codons that encode it. Codons are written as space-separated
# strings and split into lists when the mapping is built.
AA_TO_CODON: Dict[str, List[str]] = {
    residue: codons.split()
    for residue, codons in (
        ("*", "TAA TAG TGA"),
        ("A", "GCA GCC GCG GCT"),
        ("C", "TGC TGT"),
        ("D", "GAC GAT"),
        ("E", "GAA GAG"),
        ("F", "TTC TTT"),
        ("G", "GGA GGC GGG GGT"),
        ("H", "CAC CAT"),
        ("I", "ATA ATC ATT"),
        ("K", "AAA AAG"),
        ("L", "CTA CTC CTG CTT TTA TTG"),
        ("M", "ATG"),
        ("N", "AAC AAT"),
        ("P", "CCA CCC CCG CCT"),
        ("Q", "CAA CAG"),
        ("R", "AGA AGG CGA CGC CGG CGT"),
        ("S", "AGC AGT TCA TCC TCG TCT"),
        ("T", "ACA ACC ACG ACT"),
        ("V", "GTA GTC GTG GTT"),
        ("W", "TGG"),
        ("Y", "TAC TAT"),
    )
}


def translate(seq: str) -> str:
    """Return the translation of the nucleotide string ``seq`` as plain text.

    The input is wrapped in a Biopython ``Seq``, translated with
    ``Seq.translate()`` using its default arguments, and converted back
    to ``str``.
    """
    protein = Seq(seq).translate()
    return str(protein)


def read_codon_table(table: str) -> Dict[str, float]:
    """Parse a pasted codon usage table into raw codon counts.

    Every entry is expected to look like ``TTT  22.24  (114170)``: a
    three-letter codon, a number, and the count in parentheses (optionally
    padded with spaces after the opening parenthesis). Only the count is
    kept.

    The text is upper-cased and ``U`` is replaced by ``T`` before matching,
    so lower-case and RNA-alphabet tables are accepted as well.

    Args:
        table: The raw table text.

    Returns:
        A dict mapping each upper-case DNA codon to its integer count.
        Text that does not match the entry layout is ignored, so an empty
        or unrelated string yields an empty dict.
    """
    # Normalise case first so that lower-case "u" is also converted to "T".
    normalized = table.upper().replace("U", "T")
    # Raw string: "\s" and "\d" are regex classes, not Python string escapes.
    # The decimal point is matched literally and the fraction is optional.
    entry_pattern = re.compile(
        r"([ATCG]{3})\s+\d+(?:\.\d+)?\s+\(\s*(\d+)\)"
    )
    return {
        codon: int(count)
        for codon, count in entry_pattern.findall(normalized)
    }


def convert_cocoputs_table_to_dnachisel(
    table: str,
) -> Dict[str, Dict[str, float]]:
    """Convert a raw codon table into per-amino-acid codon fractions.

    The table text is parsed with ``read_codon_table``. For every amino acid
    in ``AA_TO_CODON``, each codon's count is divided by the summed count of
    all codons listed for that amino acid and rounded to three decimals.

    Args:
        table: The raw table text understood by ``read_codon_table``.

    Returns:
        A nested dict ``{amino_acid: {codon: fraction}}``.

    Raises:
        KeyError: If a codon listed in ``AA_TO_CODON`` is missing from the
            parsed table.
        ZeroDivisionError: If all codons of an amino acid have a count of 0.
    """
    counts = read_codon_table(table)
    usage: Dict[str, Dict[str, float]] = {}
    for residue, synonyms in AA_TO_CODON.items():
        # Total occurrences of this residue across all of its codons.
        total = sum(counts[codon] for codon in synonyms)
        usage[residue] = {
            codon: round(counts[codon] / total, 3) for codon in synonyms
        }
    return usage

# Examples

In [5]:
# Convert every pasted table in one pass; the names on the left are bound in
# the same order as the raw tables listed on the right.
(
    ecoli_dnachisel,
    putida_dnachisel,
    cglut_dnachisel,
    aurefaciens_dnachisel,
) = (
    convert_cocoputs_table_to_dnachisel(raw_table)
    for raw_table in (ecoli_table, putida_table, cglut_table, aurefaciens_table)
)

In [7]:
# Show the raw codon counts parsed from the E. coli table.
read_codon_table(ecoli_table)

{'TTT': 114170,
 'TCT': 42886,
 'TAT': 81936,
 'TGT': 26164,
 'TTC': 85242,
 'TCC': 44203,
 'TAC': 62447,
 'TGC': 32845,
 'TTA': 70493,
 'TCA': 35445,
 'TAA': 10523,
 'TGA': 4703,
 'TTG': 70177,
 'TCG': 45694,
 'TAG': 1063,
 'TGG': 78200,
 'CTT': 56257,
 'CCT': 35523,
 'CAT': 65279,
 'CGT': 108685,
 'CTC': 57113,
 'CCC': 27624,
 'CAC': 49544,
 'CGC': 114199,
 'CTA': 19687,
 'CCA': 43049,
 'CAA': 78398,
 'CGA': 17756,
 'CTG': 275324,
 'CCG': 121211,
 'CAG': 148464,
 'CGG': 26332,
 'ATT': 157252,
 'ACT': 45411,
 'AAT': 88890,
 'AGT': 43903,
 'ATC': 130003,
 'ACC': 120097,
 'AAC': 110550,
 'AGC': 82217,
 'ATA': 20845,
 'ACA': 34391,
 'AAA': 172330,
 'AGA': 9450,
 'ATG': 144372,
 'ACG': 73923,
 'AAG': 52038,
 'AGG': 5190,
 'GTT': 94356,
 'GCT': 78727,
 'GAT': 164381,
 'GGT': 128230,
 'GTC': 78233,
 'GCC': 132480,
 'GAC': 97771,
 'GGC': 153678,
 'GTA': 56425,
 'GCA': 104039,
 'GAA': 204803,
 'GGA': 39810,
 'GTG': 136348,
 'GCG': 175894,
 'GAG': 91034,
 'GGG': 56270}

In [6]:
# Show the converted E. coli table (amino acid -> codon -> fraction).
ecoli_dnachisel

{'*': {'TAA': 0.646, 'TAG': 0.065, 'TGA': 0.289},
 'A': {'GCA': 0.212, 'GCC': 0.27, 'GCG': 0.358, 'GCT': 0.16},
 'C': {'TGC': 0.557, 'TGT': 0.443},
 'D': {'GAC': 0.373, 'GAT': 0.627},
 'E': {'GAA': 0.692, 'GAG': 0.308},
 'F': {'TTC': 0.427, 'TTT': 0.573},
 'G': {'GGA': 0.105, 'GGC': 0.407, 'GGG': 0.149, 'GGT': 0.339},
 'H': {'CAC': 0.431, 'CAT': 0.569},
 'I': {'ATA': 0.068, 'ATC': 0.422, 'ATT': 0.51},
 'K': {'AAA': 0.768, 'AAG': 0.232},
 'L': {'CTA': 0.036,
  'CTC': 0.104,
  'CTG': 0.501,
  'CTT': 0.102,
  'TTA': 0.128,
  'TTG': 0.128},
 'M': {'ATG': 1.0},
 'N': {'AAC': 0.554, 'AAT': 0.446},
 'P': {'CCA': 0.189, 'CCC': 0.121, 'CCG': 0.533, 'CCT': 0.156},
 'Q': {'CAA': 0.346, 'CAG': 0.654},
 'R': {'AGA': 0.034,
  'AGG': 0.018,
  'CGA': 0.063,
  'CGC': 0.406,
  'CGG': 0.094,
  'CGT': 0.386},
 'S': {'AGC': 0.279,
  'AGT': 0.149,
  'TCA': 0.12,
  'TCC': 0.15,
  'TCG': 0.155,
  'TCT': 0.146},
 'T': {'ACA': 0.126, 'ACC': 0.439, 'ACG': 0.27, 'ACT': 0.166},
 'V': {'GTA': 0.154, 'GTC': 0.214, '

In [1]:
# DEFINE THE OPTIMIZATION PROBLEM
#
# This cell relies on the names brought in by `from dnachisel import *` and on
# ecoli_dnachisel / aurefaciens_dnachisel, so the import cell and the
# codon-table conversion cell must be run first. On a fresh kernel it fails
# with a NameError.

# Fixed seed so the random starting sequence is the same on every run.
RANDOM_SEED = 123
initial_sequence = random_dna_sequence(30, seed=RANDOM_SEED)

problem = DnaOptimizationProblem(
    sequence=initial_sequence,
    constraints=[
        AvoidPattern("BsaI_site"),
        # NOTE(review): window=50 is longer than the 30-nt sequence; confirm
        # that the GC-content constraint is actually evaluated here.
        EnforceGCContent(mini=0.3, maxi=0.7, window=50),
        EnforceTranslation(),
    ],
    objectives=[
        # NOTE(review): no `method` is given, so CodonOptimize uses its
        # default. The original-species table appears to be used only by
        # method="harmonize_rca"; confirm which method is intended.
        CodonOptimize(
            codon_usage_table=ecoli_dnachisel,
            original_codon_usage_table=aurefaciens_dnachisel,
        )
    ],
)

# SOLVE THE CONSTRAINTS, OPTIMIZE WITH RESPECT TO THE OBJECTIVE

problem.resolve_constraints()
problem.optimize()

# PRINT SUMMARIES TO CHECK THAT CONSTRAINTS PASS

print(problem.constraints_text_summary())
print(problem.objectives_text_summary())

# GET THE FINAL SEQUENCE (AS STRING OR ANNOTATED BIOPYTHON RECORDS)

final_sequence = problem.sequence  # string
final_record = problem.to_record(with_sequence_edits=True)


NameError: name 'random_dna_sequence' is not defined

In [6]:
# Show the starting sequence that was passed to the optimization problem.
initial_sequence

'CCTAGTGCGAAACGCTCCAAGTCCGTCAAC'

In [7]:
# Show the sequence held by the problem after resolve_constraints() and optimize().
final_sequence

'CCGAGCGCGAAACGCAGCAAAAGCGTGAAC'

In [8]:
# Translate the starting sequence to see the protein it encodes.
translate(initial_sequence)

'PSAKRSKSVN'

In [9]:
# Translate the optimized sequence; compare with the translation of
# initial_sequence to check that the encoded protein is unchanged.
translate(final_sequence)

'PSAKRSKSVN'