# Cloning of insertion parts for insertion of pfa chain in *C. cinerea*

### Imports

In [None]:
from teemi.design.combinatorial_design import DesignAssembly, Assembly
import os
from IPython.display import display
import pandas as pd
from pathlib import Path


# Notebook bootstrap: if the kernel was started inside the "notebooks" folder,
# step up one level to the project root so the `src` import below and the
# relative data paths resolve. The folder-name check is what makes this cell
# safe to run repeatedly.
launch_dir_name = Path.cwd().name
if launch_dir_name.lower() == "notebooks":
    os.chdir("..")
from src.smart_functions import read_fasta_to_dseqrecords

## Genetic elements

### Homologous recombination PABA marker

The PABA marker contains a homologous arm to restore PABA auxotrophy by introducing an amino acid change from glutamic acid to arginine. Correct homologous recombination can then be achieved and validated with selective plating and diagnostic PCR.

### PUFA Synthesis pathway

The PUFA synthesis pathway is from *Aetherobacter fasciculatus* (SBSr002) and contains pfa1, pfa2, pfa3 and a pptase for DHA synthesis.

## Primer creation using TEEMI

### Load promoter & terminator sub-libraries and PUFA sequences

In [2]:
# Input FASTA files; paths are relative to the project root.
top_promoters = "data/promoter_terminator_library/subset_promoter.fasta"
top_terminators = "data/promoter_terminator_library/subset_terminator.fasta"

m_paba_fa = "data/insert_sequences/PABA.fasta"
cds_fa = "data/insert_sequences/pufa_optimized.fasta"  # pfa123 and pptase

# Each call returns a pair that is unpacked as (records, names) here
# (naming follows the variables; see src.smart_functions for the exact types).

# Regulatory parts: promoter and terminator sub-libraries.
promoters, promoter_names = read_fasta_to_dseqrecords(top_promoters)
terminators, terminator_names = read_fasta_to_dseqrecords(top_terminators)

# Insert sequences: the coding sequences and the PABA marker records.
cds_records, cds_names = read_fasta_to_dseqrecords(cds_fa)
m_paba, m_paba_names = read_fasta_to_dseqrecords(m_paba_fa)

#### A quick count of sequences and their length

In [7]:
# Sanity check: how many parts were loaded per category, plus example lengths.
print(f"Promoters: {len(promoters)}, CDS: {len(cds_records)}, Terminators: {len(terminators)}")
print("Example promoter length:", len(promoters[0].seq), "bp")
# The terminator example must be measured on the terminator library, not on
# the promoter library.
print("Example terminator length:", len(terminators[0].seq), "bp")

# Length of every coding sequence, in the order they were read from the FASTA.
lengths = [len(rec.seq) for rec in cds_records]
print("CDS lengths: " + " ".join(f"{n}bp," for n in lengths))

Promoters: 4, CDS: 4, Terminators: 2
Example promoter length: 1000 bp
Example terminator length: 1000 bp
CDS lengths: 1605bp, 6666bp, 7908bp, 786bp,


### Putting it all together in a list for DesignAssembly

In [4]:
# Full combinatorial layout: first PABA record, then one
# promoter / CDS / terminator cassette for each of the four coding sequences,
# then the second PABA record. Every element is a list of candidate parts for
# that position; the promoter and terminator libraries are shared across
# cassettes.
list_of_seqs = [[m_paba[0]]]
for cds_idx in range(4):
    list_of_seqs += [promoters, [cds_records[cds_idx]], terminators]
list_of_seqs.append([m_paba[1]])

### DesignAssembly

In the interest of reasonable complexity, it has been decided that constructs will be generated for four promoters and two terminators. On top of this, only variants where the promoter and terminator are the same for all genes will be generated, as the variant count would otherwise be far too big and require more computational power and lab work. In this case the selection of promoter and terminator sub-libraries is mostly arbitrary and should be tailored to results from the quantitative fluorescence assay as constructed in *combinatorial_library_and_assembly*.<br>

This assembly can easily be scaled up if one wishes to test out specific combinations of different promoters for genes but this is outside the bounds of the current project.

In [5]:
# Settings passed straight through to DesignAssembly.
TARGET_TM = 68
LIMIT = 13
OVERLAP = 35

# Per-run result tables, collected in loop order and concatenated afterwards.
all_variants = []
all_primers = []
all_pcrs = []

# One DesignAssembly run per (promoter, terminator) pair. Within a run the
# same promoter and the same terminator flank every one of the four CDSs.
for prom_idx, promoter in enumerate(promoters):
    for term_idx, terminator in enumerate(terminators):
        print(f"Running DesignAssembly for promoter {prom_idx+1}/{len(promoters)} and terminator {term_idx+1}/{len(terminators)}")

        # Same layout as the full library, but with exactly one candidate
        # part per position.
        list_of_seqs_per = [[m_paba[0]]]
        for cds_idx in range(4):
            list_of_seqs_per += [[promoter], [cds_records[cds_idx]], [terminator]]
        list_of_seqs_per.append([m_paba[1]])

        design = DesignAssembly(
            list_of_seqs_per,
            list_of_pads=[],
            positions_of_pads=[],
            target_tm=TARGET_TM,
            limit=LIMIT,
            overlap=OVERLAP,
        )
        run_tables = (
            design.show_variants_lib_df(),
            design.primer_list_to_dataframe(),
            design.pcr_list_to_dataframe(),
        )

        # Tag every row with the pair it came from so the runs stay
        # distinguishable once the tables are concatenated.
        for table in run_tables:
            table['promoter_index'] = prom_idx
            table['terminator_index'] = term_idx

        for collected, table in zip((all_variants, all_primers, all_pcrs), run_tables):
            collected.append(table)

# Merge the per-run tables; fall back to an empty frame if nothing was run.
variants_df, primers_df, pcrs_df = (
    pd.concat(tables, ignore_index=True) if tables else pd.DataFrame()
    for tables in (all_variants, all_primers, all_pcrs)
)

# Row counts followed by a preview of each combined table.
print(f"Variants: {len(variants_df)}")
print(f"Primers:  {len(primers_df)}")
print(f"PCRs:     {len(pcrs_df)}")
for combined in (variants_df, primers_df, pcrs_df):
    display(combined.head())

Running DesignAssembly for promoter 1/4 and terminator 1/2
Running DesignAssembly for promoter 1/4 and terminator 2/2
Running DesignAssembly for promoter 2/4 and terminator 1/2
Running DesignAssembly for promoter 2/4 and terminator 2/2
Running DesignAssembly for promoter 3/4 and terminator 1/2
Running DesignAssembly for promoter 3/4 and terminator 2/2
Running DesignAssembly for promoter 4/4 and terminator 1/2
Running DesignAssembly for promoter 4/4 and terminator 2/2
Variants: 8
Primers:  192
PCRs:     112


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,Systematic_name,Variant,promoter_index,terminator_index
0,PABA-UP,PKG1_promoter,pfa1,PKG1_terminator,PKG1_promoter,pfa2,PKG1_terminator,PKG1_promoter,pfa3,PKG1_terminator,PKG1_promoter,PPtase_BBa_K5300011,PKG1_terminator,PABA-DW,"(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)",0,0,0
1,PABA-UP,PKG1_promoter,pfa1,TEF1_terminator,PKG1_promoter,pfa2,TEF1_terminator,PKG1_promoter,pfa3,TEF1_terminator,PKG1_promoter,PPtase_BBa_K5300011,TEF1_terminator,PABA-DW,"(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)",0,0,1
2,PABA-UP,ADH1_promoter,pfa1,PKG1_terminator,ADH1_promoter,pfa2,PKG1_terminator,ADH1_promoter,pfa3,PKG1_terminator,ADH1_promoter,PPtase_BBa_K5300011,PKG1_terminator,PABA-DW,"(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)",0,1,0
3,PABA-UP,ADH1_promoter,pfa1,TEF1_terminator,ADH1_promoter,pfa2,TEF1_terminator,ADH1_promoter,pfa3,TEF1_terminator,ADH1_promoter,PPtase_BBa_K5300011,TEF1_terminator,PABA-DW,"(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)",0,1,1
4,PABA-UP,DED1_promoter,pfa1,PKG1_terminator,DED1_promoter,pfa2,PKG1_terminator,DED1_promoter,pfa3,PKG1_terminator,DED1_promoter,PPtase_BBa_K5300011,PKG1_terminator,PABA-DW,"(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)",0,2,0


Unnamed: 0,id,anneals to,sequence,annealing temperature,length,price(DKK),description,footprint,len_footprint,promoter_index,terminator_index
0,P001,PABA-UP,"(T, T, C, T, T, C, T, G, G, C, A, T, C, T, T, ...",68.36,26,46.8,Anneals to PABA-UP,"(T, T, C, T, T, C, T, G, G, C, A, T, C, T, T, ...",26,0,0
1,P002,PABA-UP,"(A, T, C, A, C, G, A, C, C, A, G, A, T, A, A, ...",68.58,45,81.0,"Anneals to PABA-UP, overlaps to 1036bp_PCR_prod","(C, C, T, C, T, C, T, T, A, C, T, C, C, C, G, ...",27,0,0
2,P003,PKG1_promoter,"(G, G, A, C, G, G, G, A, G, T, A, A, G, A, G, ...",68.28,44,79.2,"Anneals to PKG1_promoter, overlaps to PABA-UP","(G, T, G, T, T, A, T, C, T, G, G, T, C, G, T, ...",26,0,0
3,P004,PKG1_promoter,"(G, C, G, G, C, C, G, A, T, G, G, C, G, G, A, ...",68.18,50,90.0,"Anneals to PKG1_promoter, overlaps to 1641bp_P...","(T, G, T, G, G, A, T, T, G, T, G, A, A, G, A, ...",32,0,0
4,P005,pfa1,"(T, C, T, T, C, T, T, C, A, C, A, A, T, C, C, ...",66.97,35,63.0,"Anneals to pfa1, overlaps to PKG1_promoter","(A, T, G, T, C, C, G, C, C, A, T, C, G, G, C, ...",17,0,0


Unnamed: 0,pcr_number,template,forward_primer,reverse_primer,f_tm,r_tm,promoter_index,terminator_index
0,PCR1,PABA-UP,P001,P002,68.36,68.58,0,0
1,PCR2,PKG1_promoter,P003,P004,68.28,68.18,0,0
2,PCR3,pfa1,P005,P006,66.97,66.91,0,0
3,PCR4,PKG1_terminator,P007,P008,68.03,68.16,0,0
4,PCR5,PKG1_promoter,P009,P010,68.28,68.18,0,0


#### Saving output as CSVs

In [6]:
# Write the three combined tables to <cwd>/data/constructs as CSV files.
out_dir = os.getcwd()
constructs_dir = os.path.join(out_dir, "data", "constructs")
# DataFrame.to_csv does not create missing folders, so make sure the target
# directory exists (no-op when it is already there).
os.makedirs(constructs_dir, exist_ok=True)

variants_csv = os.path.join(constructs_dir, "full_construct_variants_library.csv")
primers_csv = os.path.join(constructs_dir, "full_construct_primers_list.csv")
pcrs_csv = os.path.join(constructs_dir, "full_construct_pcr_plan.csv")

# index=False: the row index is not written to the files.
variants_df.to_csv(variants_csv, index=False)
primers_df.to_csv(primers_csv, index=False)
pcrs_df.to_csv(pcrs_csv, index=False)

## Notes on assembly of full constructs:
As mentioned in the report, reusing the same promoter multiple times gives rise to incorrect assemblies. This could be solved by introducing small differences in each promoter copy. With a more sophisticated machine learning model it might be possible to identify which base pairs have the lowest impact on promoter/terminator strength and mutate these. This would ensure that the construct is assembled correctly *in vivo* while maintaining optimal expression.