# Parsing GenBank File For Promoter Sequences

In [1]:
# Input locations: the data directory (kept as a string with a trailing slash,
# since paths are built by string concatenation) and the GenBank genome file.
data_dir = "../data/"
gbfile = "{}{}".format(data_dir, "GCF_002163715.1_ASM216371v1_genomic.gbff")

In [2]:
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import SeqFeature, FeatureLocation
import pandas as pd

In [None]:
def _gene_key(feature):
    """Return the key a gene is stored under: its old_locus_tag if annotated, else its locus_tag."""
    qualifiers = feature.qualifiers
    if 'old_locus_tag' in qualifiers:
        return qualifiers['old_locus_tag'][0]
    return qualifiers['locus_tag'][0]


def _upstream_window(location, basepairs):
    """Return (begin, end) of the window of `basepairs` upstream of a gene, or None without a strand."""
    # int(position) is used instead of the `.position` attribute so that the code
    # also runs on Biopython releases where that attribute is no longer available.
    if location.strand == -1:
        begin = int(location.end)
        return (begin, begin + basepairs)
    if location.strand == 1:
        end = int(location.start)
        # clamp at 0: a negative slice start would wrap around to the end of the sequence
        return (max(0, end - basepairs), end)
    return None


def parse_genbank_file_for_utr(gbfile:str,
                               basepairs:int=200,
                               end_basepairs:int=180,
                               start_basepairs:int=60,
                               filter_facing_seqs:bool=True)->tuple:
    """parse_genbank_file_for_utr

    Parse a GenBank file and collect putative promoter (5'UTR) regions of its genes.

    Every feature of type 'gene' is stored in feature_dict under its old_locus_tag
    (or its locus_tag if no old_locus_tag is annotated) as
    [location, (window_begin, window_end), window_sequence, record_index], where the
    window is the `basepairs` long stretch upstream of the gene, taken from the record
    the gene belongs to and reverse-complemented for genes on the -1 strand.
    Genes without a defined strand are reported and skipped.

    Afterwards each gene (except the first and the last one) is compared with its
    upstream neighbour: the following gene for the -1 strand, the preceding gene for
    the +1 strand. A gene becomes a target when the distance between both genes lies
    within [start_basepairs, end_basepairs], both genes are located on the same
    record and they do not overlap. If filter_facing_seqs is set, pairs located on
    opposite strands are skipped because they might share one promoter region.

    :param gbfile: path to the GenBank file
        :type str
    :param basepairs: length of the upstream window stored for every gene
        :type int
    :param end_basepairs: maximal distance to the neighbouring gene
        :type int
    :param start_basepairs: minimal distance to the neighbouring gene
        :type int
    :param filter_facing_seqs: skip gene pairs that are located on opposite strands
        :type bool

    :returns feature_dict, records, target_promotor_seqs, ordered_keys, seqs
        feature_dict: described above
        records: list of all records of the GenBank file
        target_promotor_seqs: key -> [boundary of the neighbouring gene, boundary of the gene]
        ordered_keys: keys of feature_dict in the order the genes occur in the file
        seqs: key -> upstream window of every target in genome orientation
              (not reverse-complemented)
        :type tuple(dict, list, dict, list, dict)

    """
    records = list(SeqIO.parse(gbfile,'genbank'))

    print("[+] Working on {} genbank records".format(len(records)))
    print("[+] Extracting all sequences with location from end plus {}bp".format(basepairs))
    feature_dict = {}
    # list inherits all gene identifiers from start to end of the gb file
    ordered_keys = []
    for index, rec in enumerate(records):
        for feature in rec.features:
            try:
                if feature.type != 'gene':
                    continue
                key = _gene_key(feature)
                loc = _upstream_window(feature.location, basepairs)
                if loc is None:
                    print("[-] ERROR: gene {} has no defined strand".format(key))
                    continue
                # slice the record the gene belongs to, not always the first record
                seq = rec.seq[loc[0]:loc[1]]
                if feature.location.strand == -1:
                    seq = seq.reverse_complement()
            except (AttributeError, KeyError, IndexError, TypeError, ValueError) as e:
                # best effort: report the malformed feature and go on with the next one
                print("[-] ERROR: {}".format(e))
                continue
            feature_dict[key] = [feature.location,loc,seq,index]
            ordered_keys.append(key)

    # parsing feature dict - evaluating the distance between neighbouring genes
    target_promotor_seqs = {}
    print("[+] Try Parsing Results For Putative Promotor Sequences")
    print("\t[+] Applied filters: sequences have to be smaller than {} and bigger than {}".format(end_basepairs,start_basepairs))
    if filter_facing_seqs:
        print("\t[+] Filter out end facing sequences (-1 1 | 1 -1) that might share putative promotor sequences")
    # the first and the last gene are not evaluated, they lack one neighbour
    for index in range(1, len(ordered_keys)-1):
        key = ordered_keys[index]
        cds = feature_dict[key]
        strand = cds[0].strand

        if strand == -1:  # negative strand: the upstream neighbour is the next gene
            key_before = ordered_keys[index+1]
            cds_before = feature_dict[key_before]
            start = int(cds[0].end)
            start_next_gene = int(cds_before[0].start)
        else:  # positive strand: the upstream neighbour is the gene before
            key_before = ordered_keys[index-1]
            cds_before = feature_dict[key_before]
            start = int(cds[0].start)
            start_next_gene = int(cds_before[0].end)

        # coordinates of genes on different records cannot be compared
        if cds_before[3] != cds[3]:
            continue

        # filter out sequences that "look on each other"
        if filter_facing_seqs and cds_before[0].strand == -strand:
            continue

        distance = abs(start_next_gene - start)
        if not (start_basepairs <= distance <= end_basepairs):
            continue

        # the neighbour has to end before the upstream region of the gene starts
        if strand == -1:
            separated = start < start_next_gene
        else:
            separated = start_next_gene < start

        if separated:
            print("\t[+] ",key, key_before, distance, strand, cds_before[0].strand)
            target_promotor_seqs[key] = [start_next_gene, start]
        else:
            print("\t\t[+] unusual overlapping sequences: {} - {} - strand: {}".format(key,key_before, strand))

    # upstream windows of all targets in genome orientation, built once after the scan
    seqs = {}
    for key in target_promotor_seqs:
        loc = feature_dict[key][1]
        seqs[key] = records[feature_dict[key][3]].seq[loc[0]:loc[1]]

    print("[+] DONE")
    return feature_dict, records, target_promotor_seqs, ordered_keys, seqs

In [None]:
def extract_sequences_based_on_target_promotor_seqs_dict(feature_dict:dict,
                                                         target_promotor_seqs:dict,
                                                         records:list)->dict:
    """extract_sequences_based_on_target_promotor_seqs_dict

    Helper function that cuts the region between a target gene and its
    neighbouring gene out of the GenBank records, based on the results of
    parse_genbank_file_for_utr.

    For targets on the +1 strand the two stored positions are used as
    (begin, end); for targets on the -1 strand they are swapped and the region
    is reverse-complemented. Targets with any other strand value are skipped.
    Errors during the extraction of a target are printed and the target is
    left out of the result.

    :param feature_dict: key -> [location, ..., ..., record_index]
        :type dict
    :param target_promotor_seqs: key -> [position, position]
        :type dict
    :param records: records of the GenBank file
        :type list

    :returns seqs: key -> extracted sequence as string
        :type dict

    """
    extracted = {}

    print("[+] Trying to extract putative promotor sequences")
    for target in target_promotor_seqs:
        strand = feature_dict[target][0].strand
        if strand == 1:
            begin = target_promotor_seqs[target][0]
            end = target_promotor_seqs[target][1]
            reverse = False
        elif strand == -1:
            begin = target_promotor_seqs[target][1]
            end = target_promotor_seqs[target][0]
            reverse = True
        else:
            continue

        try:
            region = records[feature_dict[target][3]].seq[begin:end]
            if reverse:
                region = region.reverse_complement()
            extracted[target] = str(region)
        except Exception as e:
            print("[-] ERROR: {}".format(e))
    print("[+] DONE")
    return extracted

In [None]:
# GenBank file was downloaded from the NCBI RefSeq ftp fileserver on 19.09.2022.
# res = (feature_dict, records, target_promotor_seqs, ordered_keys, seqs)
res = parse_genbank_file_for_utr(gbfile, start_basepairs=60, end_basepairs=170)
seqs = extract_sequences_based_on_target_promotor_seqs_dict(
    feature_dict=res[0],
    target_promotor_seqs=res[2],
    records=res[1],
)

In [None]:
# res[2] is the target_promotor_seqs dictionary
n_target_promotors = len(res[2])
print("[+] Number of target promotor sequences: {}".format(n_target_promotors))

In [None]:
# number of targets for which a sequence could be extracted
n_extracted_seqs = len(seqs)
print("[+] Number of extracted sequences: {}".format(n_extracted_seqs))

In [None]:
# Turn the {identifier: sequence} dictionary into a two column table:
# one row per identifier ('aep') with its extracted sequence ('seq').
df_seqs = (
    pd.DataFrame(seqs, index=['seq'])
    .transpose()
    .reset_index()
)
df_seqs.columns = ['aep', 'seq']
df_seqs.head()

In [None]:
# Excel sheet that is merged with the extracted sequences on its 'aep' column below.
deseq2_excel = "".join([data_dir, "excel_sheet_ordered_degs_curvibacter_aep_to_wp.xlsx"])
deseq2_df = pd.read_excel(deseq2_excel)
deseq2_df.head()

In [None]:
# Keep only genes that have both a DESeq2 entry and an extracted sequence (inner join on 'aep').
promotor_stength_df = pd.merge(deseq2_df, df_seqs, on='aep')
# Replace ',' by '.' in the Wp_Number identifiers.
# NOTE(review): presumably needed to match the identifiers of the raw read count table - confirm.
f = lambda s: s.replace(",",".")
promotor_stength_df['Wp_Number'] = promotor_stength_df['Wp_Number'].map(f)

In [None]:
# rows that survived the merge with the transcriptome table
n_merged_promoters = len(promotor_stength_df)
print("[+] Number of target promoters after merging with transcriptome dataframe: {}".format(n_merged_promoters))

In [None]:
print("[*] Extracting sequences that do not reside in the merged dataframe ...")
print("\t[*] Those sequences are mainly composed of tRNAs or other small nucleotide sequences")
# identifiers that have a putative promotor utr but do not reside in the transcriptome dataframe;
# the lookup set is built once instead of rebuilding a list for every tested identifier
merged_aeps = set(promotor_stength_df.aep)
diff = [d for d in df_seqs.aep.values if d not in merged_aeps]
print("\t[*] DONE")
#e.g. AEP_00144 is a tRNA

In [None]:
# Raw read counts; the columns are renamed to the identifier column plus the
# three G and the three Hydra sample columns.
raw_reads_path = "".join([data_dir, "raw_read_counts.csv"])
raw_reads_df = pd.read_csv(raw_reads_path)
raw_reads_df.columns = ["Wp_Number"] + ["G1","G2","G3"] + ["Hydra1","Hydra2","Hydra3"]
raw_reads_df.head()

In [None]:
# Attach the raw read counts (inner join on 'Wp_Number').
promotor_stength_raw_df = pd.merge(promotor_stength_df, raw_reads_df, on="Wp_Number")
# Mean read count over the three G samples and the same value scaled by its maximum.
g_sample_mean = promotor_stength_raw_df[['G1','G2','G3']].mean(axis=1)
promotor_stength_raw_df['readCountMeanGSamples'] = g_sample_mean
promotor_stength_raw_df['readCountMeanNormalized'] = (
    promotor_stength_raw_df['readCountMeanGSamples']
    / promotor_stength_raw_df['readCountMeanGSamples'].max()
)
promotor_stength_raw_df.head()

In [None]:
# rows that survived the merge with the raw read counts
n_raw_merged = len(promotor_stength_raw_df)
print("[+] Length of dataframe after merging raw read data: {}".format(n_raw_merged))

In [None]:
# Columns that are kept for the promoter selection.
target_columns = ['Wp_Number','aep','readCountMeanGSamples','readCountMeanNormalized','log2FoldChange','seq']
df_targets = promotor_stength_raw_df[target_columns]

In [None]:
# show the first five rows of the target table
df_targets.head(5)

In [None]:
# store the target table (the dataframe index is written as first column)
df_targets.to_csv(path_or_buf="../results/readCountsNormalizedWpToAEP.csv")

In [None]:
# independent copy of the target table; result_df is extended and filtered below
result_df = df_targets.copy(deep=True)

# Filtering for high/low expressed genes

In [None]:
# rank all regions by their normalized mean read count, highest first
ranked_targets = df_targets.sort_values(by='readCountMeanNormalized', ascending=False)
#top 400 expressed regions
high_400 = ranked_targets[:400]
#all remaining regions (the 100 regions are picked from these in a later cell)
other_seqs = ranked_targets[400:]

In [None]:
# show the first five rows of the top 400 regions
high_400.head(5)

In [None]:
#creating the sequence length column for the remaining regions and for the top 400 regions
def seq_length(seq):
    """Return the number of characters of a sequence."""
    return len(seq)

# high_400 and other_seqs are slices of a sorted dataframe; assign() returns new
# dataframes instead of writing into these slices (avoids the SettingWithCopyWarning)
other_seqs = other_seqs.assign(seq_length=other_seqs['seq'].apply(seq_length))
high_400 = high_400.assign(seq_length=high_400['seq'].apply(seq_length))

In [None]:
# the 100 remaining regions with the shortest sequences
# NOTE(review): the selection uses 'seq_length', not the read counts - confirm that this is intended
low_100 = other_seqs.sort_values(by='seq_length').head(100)

In [None]:
# stack the top 400 regions on top of the 100 selected remaining regions
read_level_result_df = pd.concat(objs=[high_400, low_100])

## Plotting genome location of target sequences

In [None]:
import numpy as np
import matplotlib.pyplot as plt

#values for plotting
ordered_gene_list = res[3] # ordered_keys: gene identifiers in the order of the GenBank file
result_gene_list = list(result_df['aep'])
# set for the membership test below, a list would be scanned once per gene
result_gene_set = set(result_gene_list)
steps = 1/len(ordered_gene_list)

# one evenly spaced x value per gene; y is constant so all genes are drawn on one line
x_values=np.arange(1,len(ordered_gene_list)+1)*steps
y_values=np.repeat(1,len(x_values))
# x values of the genes that are part of the result dataframe
x2_values=[value for key,value in zip(ordered_gene_list,x_values) if key in result_gene_set]
y2_values=list(np.repeat(1,len(x2_values)))

In [None]:
# all genes as large markers, the selected genes as small red markers on top
plt.figure(figsize=(30, 8))
plt.scatter(x_values, y_values, s=1000)
plt.scatter(x2_values, y2_values, s=5, c='r')
plt.ylim(0.9, 1.1)
plt.grid()

In [None]:
# identifiers of the target promoters: text after the first '>' of every FASTA header line
with open("../data/target_promoter_sequences.faa","r") as ffile:
    target_promoters = [
        line.rstrip().split(">")[1]
        for line in ffile
        if line.startswith(">")
    ]

In [None]:
# number of header lines found in the FASTA file
len(target_promoters)

# Generate Translation Table For Target Promoter Regions

In [3]:
# all records of the GenBank file
records = list(SeqIO.parse(gbfile,'genbank'))

In [4]:
# table with the "AEP nr" column that is used for the lookups below
aep_to_cpl = pd.read_csv(filepath_or_buffer="../data/cpls_to_aep.csv")

In [6]:
# values of the "AEP nr" column as plain list
aeps = aep_to_cpl["AEP nr"].tolist()

In [21]:
# Collect locus tag / protein id pairs of all features whose locus tag is listed in aeps.
# The set is built once; testing against the list would scan it for every feature.
aep_lookup = set(aeps)
count = 0
wp_numbers = []
aep_numbers = []
for rec in records:
    for feature in rec.features:
        qualifiers = feature.qualifiers
        if "locus_tag" not in qualifiers:
            continue
        locus_tag = qualifiers["locus_tag"][0]
        if locus_tag not in aep_lookup or "protein_id" not in qualifiers:
            continue
        aep_numbers.append(locus_tag)
        wp_numbers.append(qualifiers["protein_id"][0])
        count += 1
# the control entry has no feature in the GenBank file and is added by hand
aep_numbers.append('J23100 control')
wp_numbers.append("J23100")

In [25]:
# translation table: rows of aep_to_cpl extended by the matching "WP nr" (inner join on "AEP nr")
aep_to_wp = pd.DataFrame({"AEP nr":aep_numbers,"WP nr":wp_numbers})
aep_wp_to_cpl = pd.merge(aep_to_cpl, aep_to_wp, on="AEP nr")
aep_wp_to_cpl.to_csv("../data/aep_wp_to_cpl.csv")

In [30]:
def _wp_number(sseqid):
    """Return the first two '_' separated parts of a subject id, joined by '_'."""
    parts = sseqid.split("_")
    return parts[0] + "_" + parts[1]

blast_results = pd.read_csv("../results/Promoter_Manuscript_BLAST_Results_TF_Motifs.csv", index_col=0)
blast_results.head()
# e.g. "WP_087497238.1_GCF_002163715.1_ASM216371v1" -> "WP_087497238.1"
blast_results["WP nr"] = blast_results.sseqid.apply(_wp_number)

In [51]:
# number of BLAST hits per query sequence
for qseq in blast_results.qseqid.unique():
    hits_of_query = blast_results[blast_results["qseqid"] == qseq]
    print(qseq, len(hits_of_query))

tr|I6XGD8|DosR 28
sp|P25084|LASR_PSEAE 5
tr|Q92L12|Q92L12_RHIME 5
sp|Q87KN2|LEXA_VIBPA 3
tr|A0A0H2ZQ65|A0A0H2ZQ65_STRP2 5


In [53]:
# number of distinct "WP nr" values (missing values would be counted as one value)
blast_results["WP nr"].nunique(dropna=False)

38

In [56]:
# rows whose "WP nr" already occurred in an earlier row
blast_results.loc[blast_results["WP nr"].duplicated()]

Unnamed: 0,qseqid,sseqid,pident,evalue,bitscore,slen,qgi,sgi,sacc,staxids,...,scomnames,stitle,organism_name_taxdb,genus,family,superfamily,order,class,phylum,WP nr
15,tr|I6XGD8|DosR,WP_087497238.1_GCF_002163715.1_ASM216371v1,34.524,0.00022,38.1,330,0,0,WP_087497238.1_GCF_002163715.1_ASM216371v1,1844971,...,Curvibacter sp. AEP1-3,response regulator transcription factor [Curvi...,Curvibacter sp. AEP1-3,Curvibacter,Comamonadaceae,unknown,Burkholderiales,Betaproteobacteria,Pseudomonadota,WP_087497238.1
30,sp|P25084|LASR_PSEAE,WP_087496564.1_GCF_002163715.1_ASM216371v1,36.765,3.9e-07,46.6,267,0,0,WP_087496564.1_GCF_002163715.1_ASM216371v1,1844971,...,Curvibacter sp. AEP1-3,transcriptional regulator EpsA [Curvibacter sp...,Curvibacter sp. AEP1-3,Curvibacter,Comamonadaceae,unknown,Burkholderiales,Betaproteobacteria,Pseudomonadota,WP_087496564.1
31,sp|P25084|LASR_PSEAE,WP_087497238.1_GCF_002163715.1_ASM216371v1,42.105,1.85e-06,44.7,330,0,0,WP_087497238.1_GCF_002163715.1_ASM216371v1,1844971,...,Curvibacter sp. AEP1-3,response regulator transcription factor [Curvi...,Curvibacter sp. AEP1-3,Curvibacter,Comamonadaceae,unknown,Burkholderiales,Betaproteobacteria,Pseudomonadota,WP_087497238.1
32,sp|P25084|LASR_PSEAE,WP_157673178.1_GCF_002163715.1_ASM216371v1,37.5,0.000398,37.4,205,0,0,WP_157673178.1_GCF_002163715.1_ASM216371v1,1844971,...,Curvibacter sp. AEP1-3,response regulator transcription factor [Curvi...,Curvibacter sp. AEP1-3,Curvibacter,Comamonadaceae,unknown,Burkholderiales,Betaproteobacteria,Pseudomonadota,WP_157673178.1
33,tr|Q92L12|Q92L12_RHIME,WP_087495460.1_GCF_002163715.1_ASM216371v1,28.177,7.75e-14,64.3,237,0,0,WP_087495460.1_GCF_002163715.1_ASM216371v1,1844971,...,Curvibacter sp. AEP1-3,autoinducer binding domain-containing protein ...,Curvibacter sp. AEP1-3,Curvibacter,Comamonadaceae,unknown,Burkholderiales,Betaproteobacteria,Pseudomonadota,WP_087495460.1
34,tr|Q92L12|Q92L12_RHIME,WP_087496729.1_GCF_002163715.1_ASM216371v1,27.273,2.04e-13,63.2,237,0,0,WP_087496729.1_GCF_002163715.1_ASM216371v1,1844971,...,Curvibacter sp. AEP1-3,autoinducer binding domain-containing protein ...,Curvibacter sp. AEP1-3,Curvibacter,Comamonadaceae,unknown,Burkholderiales,Betaproteobacteria,Pseudomonadota,WP_087496729.1
36,tr|Q92L12|Q92L12_RHIME,WP_087496564.1_GCF_002163715.1_ASM216371v1,40.385,2.91e-05,40.0,267,0,0,WP_087496564.1_GCF_002163715.1_ASM216371v1,1844971,...,Curvibacter sp. AEP1-3,transcriptional regulator EpsA [Curvibacter sp...,Curvibacter sp. AEP1-3,Curvibacter,Comamonadaceae,unknown,Burkholderiales,Betaproteobacteria,Pseudomonadota,WP_087496564.1
37,tr|Q92L12|Q92L12_RHIME,WP_232460039.1_GCF_002163715.1_ASM216371v1,32.203,0.000113,37.0,98,0,0,WP_232460039.1_GCF_002163715.1_ASM216371v1,1844971,...,Curvibacter sp. AEP1-3,response regulator transcription factor [Curvi...,Curvibacter sp. AEP1-3,Curvibacter,Comamonadaceae,unknown,Burkholderiales,Betaproteobacteria,Pseudomonadota,WP_232460039.1


In [55]:
# distinct subject accessions of the rows with a repeated "WP nr"
blast_results.loc[blast_results["WP nr"].duplicated(), "sacc"].unique()

array(['WP_087497238.1_GCF_002163715.1_ASM216371v1',
       'WP_087496564.1_GCF_002163715.1_ASM216371v1',
       'WP_157673178.1_GCF_002163715.1_ASM216371v1',
       'WP_087495460.1_GCF_002163715.1_ASM216371v1',
       'WP_087496729.1_GCF_002163715.1_ASM216371v1',
       'WP_232460039.1_GCF_002163715.1_ASM216371v1'], dtype=object)

In [36]:
# print every BLAST hit whose "WP nr" is also part of the translation table
for wp_number in blast_results["WP nr"]:
    if wp_number not in wp_numbers:
        continue
    print(wp_number)

# Plotting Circular Genome Plot

In [None]:
from pycirclize import Circos
from pycirclize.parser import Genbank
from matplotlib.patches import Patch
from matplotlib.lines import Line2D

In [None]:
# parse the GenBank file with the pycirclize Genbank parser (used for the circular plot below)
gbk = Genbank(gbfile)

In [None]:
# promoter library table; its 'name' column is compared with the locus tags in the plot below
df_500 = pd.read_excel(io="../data/curvibacter_first_promoter_lib.xlsx")

In [None]:
# only the two columns needed for labelling the plot are kept
aep_to_cpl = pd.read_csv("../data/cpls_to_aep.csv").loc[:, ["AEP nr", "strain ID"]]

In [None]:
def _locus_key(feature):
    """Return the old_locus_tag of a feature if annotated, else its locus_tag."""
    qualifiers = feature.qualifiers
    if 'old_locus_tag' in qualifiers:
        return qualifiers['old_locus_tag'][0]
    return qualifiers['locus_tag'][0]


def _select_features(features, library_names, promoter_names):
    """Split features into (members of the library, members of the target promoters)."""
    in_library, in_promoters = [], []
    for feature in features:
        key = _locus_key(feature)
        if key in library_names:
            in_library.append(feature)
        if key in promoter_names:
            in_promoters.append(feature)
    return in_library, in_promoters


circos = Circos(sectors={gbk.name: gbk.range_size})
# raw string: "\i" and "\ " are mathtext commands, not Python escape sequences
circos.text(r"$\it{Curvibacter\ sp.}$" + " AEP1-3\n" + f"\n{gbk.name}", size=9, r=20)
sector = circos.get_sector(gbk.name)

major_ticks_interval = 500000
minor_ticks_interval = 100000
outer_track = sector.add_track((80, 85))
outer_track.xticks_by_interval(
    major_ticks_interval, label_formatter=lambda v: f"{v/ 10 ** 6:.1f} Mb"
)
outer_track.xticks_by_interval(minor_ticks_interval, tick_length=1, show_label=False)
f_cds_track = sector.add_track((60, 69), r_pad_ratio=0.1)
r_cds_track = sector.add_track((70, 79), r_pad_ratio=0.1)
highlight_cds_track = sector.add_track((95,100), r_pad_ratio=0.1)

# lookup sets are built once instead of converting the column for every feature
library_names = set(df_500["name"])
promoter_names = set(target_promoters)

# extracting and plotting forward genes
f_cds_feats = gbk.extract_features("CDS", target_strand=1)
f_plot_features, f_highlights = _select_features(f_cds_feats, library_names, promoter_names)
f_cds_track.genomic_features(f_plot_features, fc="red", lw=0.2, edgecolor="red")

# extracting and plotting reverse genes
r_cds_feats = gbk.extract_features("CDS", target_strand=-1)
r_plot_features, r_highlights = _select_features(r_cds_feats, library_names, promoter_names)
r_cds_track.genomic_features(r_plot_features, fc="blue", lw=0.2, edgecolor="blue")

# highlight genes: forward genes first, then reverse genes
highlights = f_highlights + r_highlights
highlight_cds_track.genomic_features(highlights, fc="orange", lw=1, edgecolor="orange")

# fetch labels for highlight genes; genes with a gene name get a larger label
labels, label_pos_list = [], []
labels_cpl, label_pos_list_cpl = [], []

for feat in highlights:
    # the label is placed in the middle of the gene
    label_pos = (int(feat.location.start) + int(feat.location.end)) / 2

    # NOTE(review): highlights are selected by old_locus_tag when present, but this
    # lookup always uses locus_tag - confirm which tag the "AEP nr" column holds
    cpl_id = aep_to_cpl[aep_to_cpl["AEP nr"] == feat.qualifiers["locus_tag"][0]]["strain ID"].values
    cpl_number = cpl_id[0] if len(cpl_id) > 0 else ""

    gene_name = feat.qualifiers.get("gene", [None])[0]
    if gene_name is not None:
        labels.append(f"{gene_name} - {cpl_number}")
        label_pos_list.append(label_pos)
    else:
        labels_cpl.append(cpl_number)
        label_pos_list_cpl.append(label_pos)

highlight_cds_track.xticks(label_pos_list, labels, label_size=8, label_orientation="vertical")
highlight_cds_track.xticks(label_pos_list_cpl, labels_cpl, label_size=5, label_orientation="vertical")

# plot gc content as deviation from the genome wide gc content
gc_content_track = sector.add_track((40, 55))

pos_list, gc_contents = gbk.calc_gc_content()
gc_contents = gc_contents - gbk.calc_genome_gc_content()
positive_gc_contents = np.where(gc_contents > 0, gc_contents, 0)
negative_gc_contents = np.where(gc_contents < 0, gc_contents, 0)
# symmetric value range so that zero lies in the middle of the track
abs_max_gc_content = np.max(np.abs(gc_contents))
vmin, vmax = -abs_max_gc_content, abs_max_gc_content
gc_content_track.fill_between(
    pos_list, positive_gc_contents, 0, vmin=vmin, vmax=vmax, color="black"
)
gc_content_track.fill_between(
    pos_list, negative_gc_contents, 0, vmin=vmin, vmax=vmax, color="grey"
)


fig = circos.plotfig()

# Add legend
handles = [
    Patch(color="red", label="Forward 5'UTR Regions"),
    Patch(color="blue", label="Reverse 5'UTR Regions"),
    Patch(color="orange", label="Analyzed 5'UTR Regions"),
    Line2D([], [], color="black", label="Positive GC Content", marker="^", ms=6, ls="None"),
    Line2D([], [], color="grey", label="Negative GC Content", marker="v", ms=6, ls="None")
]
_ = circos.ax.legend(handles=handles, bbox_to_anchor=(0.5, 0.475), loc="center", fontsize=8)

# save the figure returned by plotfig() instead of relying on pyplot's current figure
fig.savefig("../results/circos_curvibacter_genome.png", dpi=400)

# Trimming Length Of Sequences

In [None]:
#import the list of promoter sequences in 5' to 3' direction in a list
raw_library = list(result_df['seq'])

# determine maximum length of promoter site to be used for expression and the maximum size of oligos to order
promoter_max_len = 98
synthesis_maxlen = 150
# determine sequences for cloning to be attached to the promoters
# designed in snapgene by maurice mager
upstream_cloningsite = "tcgagtacgacttcggtctcaGGAGc"
downstream_cloningsite = "cAATGtgagaccgaacgtcagtgatc"

# random sequence to fill synthesis order, will be cut during cloning and has no impact on construct
# filling of random nucleotides behind cloning sites if sequence too small e.g. < 98
fillup = "ATCGATCGCTAGCTAGCTAGCATCGACTATCGTCGATCGATCGATGCATGCATCTGTACGATCGACTAGCTAGTCGACTATCGACTGACTGACTGACTG"

aep_seq={}
for aep,seq in zip(result_df['aep'],result_df['seq']):
    #save the original sequence to get raw_library.index in the end
    rawstring = seq
    # cut off first base to match final RBS distance in construct - why?
    seq = seq[:-1]
    # trimming too long promoters
    if len(seq) > promoter_max_len:
        print("[+] Trimming sequence: {} with length: {}".format(aep,len(seq)))
        overlength = len(seq) - promoter_max_len
        seq = seq[overlength:]
        
    # adding cloning sites to both ends
    seq = upstream_cloningsite + seq
    seq += downstream_cloningsite
    
    #fill up synthesis for order of equal length oligos
    seq = fillup + seq
    overlength_syn = len(seq) - synthesis_maxlen
    seq = seq[overlength_syn:]

    aep_seq[aep]=seq


# checks if size of the list, size of each oligo is correct
for aep in aep_seq.keys():
    if len(aep_seq[aep]) == synthesis_maxlen:
        print("[+] {}".format(aep))
        print("\t[+] length:" + str(len(aep_seq[aep])))
    else:
        print("[-] ERROR: oligo for {} has not the correct length".format(aep))
if len(aep_seq.keys()) == len(raw_library):
    print("[+] size of order is equal to library")
else:
    print("[-] ERROR: size of order doesnt match original library")

In [None]:
# Turn the {identifier: oligo} dictionary into a two column table:
# one row per identifier ('aep') with its oligo ('trimmed_seq').
trimmed_seqs_df = (
    pd.DataFrame(aep_seq, index=['seq'])
    .transpose()
    .reset_index()
)
trimmed_seqs_df.columns = ['aep', 'trimmed_seq']
trimmed_seqs_df.head()

In [None]:
# add the 'trimmed_seq' column to the result table (inner join on 'aep')
result_df = pd.merge(result_df, trimmed_seqs_df, on='aep')

In [None]:
# show the first five rows of the result table
result_df.head(5)

## Checking Trimming Procedure

In [None]:
def pContent(cont):
    """Print the string representation of cont (used to show full, untruncated cell values)."""
    print(str(cont))

In [None]:
# print the original sequence of AEP_00006
result_df.loc[result_df['aep'] == 'AEP_00006', 'seq'].apply(pContent)

In [None]:
# print the oligo built for AEP_00006, for comparison with the original sequence
result_df.loc[result_df['aep'] == 'AEP_00006', 'trimmed_seq'].apply(pContent)

## Filtering for restriction sites

In [None]:
# restriction size filtering:
bbs1="GAAGAC"
bsmb1="CGTCTC"
bsa1="GGTCTC"

In [None]:
def _contains_site(sequence, site):
    """Return True if site occurs on either strand of sequence (case-insensitive)."""
    sequence = str(sequence).upper()
    site = site.upper()
    # a site on the opposite strand shows up as its reverse complement in the sequence
    reverse_site = site.translate(str.maketrans("ACGT", "TGCA"))[::-1]
    return site in sequence or reverse_site in sequence


filtered=[]
rows_to_drop=[]
for aep,seq in zip(result_df['aep'],result_df['seq']):
    if _contains_site(seq, bsa1):
        filtered.append(aep)
        print("[+] Found BSA1 Restriction site: {} in {}".format(bsa1,aep))
        idx=result_df[result_df['aep'] == aep].index.to_list()
        if len(idx) > 1:
            raise ValueError("[-] Multiple rows for one target")
        print("[+] Removing row {} from dataframe".format(idx[0]))
        rows_to_drop.append(idx[0])
    # BBS1 and BSMB1 sites are only reported, those sequences stay in the dataframe
    if _contains_site(seq, bbs1):
        print("\t[+] Found BBS1 Restriction site: {} in {}".format(bbs1,aep))
    if _contains_site(seq, bsmb1):
        print("\t[+] Found BSMB1 Restriction site: {} in {}".format(bsmb1,aep))
#deletion of sequences with BSA1 restriction sites, done once after the scan
result_df=result_df.drop(rows_to_drop)
print("[-] Number of promotors with restriction sites: {}".format(len(filtered)))

In [None]:
# rows left after removing the sequences with BSA1 restriction sites
n_final_rows = len(result_df)
print("[+] length of final dataframe: {}".format(n_final_rows))

In [None]:
# write the final result table into the data directory
data_dir_excel = "{}{}".format(data_dir, "target_promotors.xlsx")
result_df.to_excel(data_dir_excel)

In [None]:
# identifiers of additional promotor sequences
additional_promotor_seqs = [
    "AEP_" + number
    for number in ("03486", "03466", "03465", "03459", "02900")
]