In [1]:
# Import packages

import re
import pandas as pd
from Bio import Entrez
from Bio import SeqIO
from Bio.Seq import Seq
from Bio import SeqFeature
from Bio.SeqFeature import SeqFeature, FeatureLocation

# Contact address that Biopython attaches to Entrez requests.
# Replace the placeholder below with your own email before running the notebook.
Entrez.email ='Your Email'

In [2]:
# J23119 E. coli promoter; prepended to the 5' end of every prepared oligo

five_seq = 'gcttccggcttgattctaaagatctttgacagctagctcagtcctaggtataatactagt'

# 3' primer binding site; appended to the 3' end of every prepared oligo

three_seq = 'gggcacaaattttctgtccg'

# Oligo synthesis length maximum (in nucleotides); access_NCBI uses it to
# decide how many extra nucleotides can be fetched around each aptamer

TWIST_max_nt_synthsis = 300

In [3]:
# Specify the file path to the FASTA downloaded from Rfam
file_path = '20230524_RF01734.fa'

# Read every line of the FASTA file. The path is taken from file_path so that
# changing it above actually changes which file is read, and the context
# manager closes the file even if reading fails.
with open(file_path, 'r') as file:
    dna_seq = file.readlines()

In [4]:
# Initialize lists (one entry per FASTA record, kept in file order)

ID = []
Sequence = []
Start = []
Stop = []

# Organize Rfam FASTA file into DataFrame columns.
# A header line is split as '<ID>/<start>-<stop> <rest>': the text before the
# first '/' is stored as the ID (including the leading '>'), and the two
# coordinates are stored as strings.

for line in dna_seq:
    line = line.strip()  # drop the trailing newline so it never reaches the table
    if not line:
        continue  # blank lines carry no record data
    if line.startswith('>'):  # Header line with the accession code
        header_fields = line.split('/')
        ID.append(header_fields[0])
        genome_start = header_fields[1].split('-')
        Start.append(genome_start[0])
        genome_stop = genome_start[1].split(' ')
        Stop.append(genome_stop[0])
        # Open a new sequence entry so the four lists always stay the same length
        Sequence.append('')
    elif Sequence:
        # Sequence text may be wrapped over several lines; join it onto the
        # record opened by the most recent header
        Sequence[-1] += line
    else:
        raise ValueError('Sequence data found before the first FASTA header')

In [5]:
# Create DataFrame: one row per FASTA record, one column per parsed field

fasta_columns = {
    'Accession': ID,
    'Start': Start,
    'Stop': Stop,
    'Sequence': Sequence,
}
fasta_df = pd.DataFrame(data=fasta_columns)

In [6]:
def access_NCBI(accession, start, stop, strand, aptmer_length):
    """
    Fetch a genomic region from NCBI, extended downstream of the aptamer.

    The region is lengthened by as many nucleotides as still fit into an oligo
    of TWIST_max_nt_synthsis nucleotides once five_seq and three_seq are
    accounted for (minus one). The extension is therefore not a fixed number;
    it depends on the aptamer length.

    Input:
        accession = NCBI nucleotide accession to fetch from
        start, stop = lower and upper genomic coordinates of the region
                      (int or numeric string)
        strand = 'Positive' extends the region past stop; any other value is
                 treated as the negative strand, which extends the region
                 before start and reverse-complements the result
        aptmer_length = aptamer length in nucleotides (parameter name kept as
                        originally spelled so existing callers keep working)

    Output:
        sequence = fetched nucleotides as a str
        extend_length = number of nucleotides the region was extended by
    """

    # Use the argument itself; the body previously read a notebook-level
    # variable with a different spelling and ignored this parameter.
    extend_length = TWIST_max_nt_synthsis - (len(five_seq) +
                                             len(three_seq) +
                                             aptmer_length) - 1

    on_positive_strand = strand == 'Positive'
    if on_positive_strand:
        stop = int(stop) + extend_length
    else:  # For scaffolds on the Negative Strand
        start = int(start) - extend_length

    handle = Entrez.efetch(db="nucleotide",
                           id=accession,
                           rettype="gb",
                           retmode="text",
                           seq_start=start,
                           seq_stop=stop)
    try:
        record = SeqIO.read(handle, "genbank")
    finally:
        # Release the connection even when the record cannot be parsed
        handle.close()

    sequence = record.seq
    if not on_positive_strand:
        sequence = sequence.reverse_complement()

    return str(sequence), extend_length

In [7]:
# Define lists (each receives one entry per row of fasta_df, in row order)
nucleotides_list = []
aptamer_length_list = []
strand_list = []
codon_start_list = []
aptamer_downstream_gene_list = []
nt_len = []
extended_list = []

# Run NCBI information collection for each row

for index, row in fasta_df.iterrows():
    # Drop the first character of the stored ID (the '>' of the FASTA header)
    accession = row['Accession'][1:]

    if re.search(r'^U', accession) is not None:
        # Accessions starting with 'U' are not looked up; placeholders keep
        # every list aligned with the rows of fasta_df
        nucleotides_list.append('tbd')
        aptamer_length_list.append('tbd')
        strand_list.append('tbd')
        nt_len.append('tbd')
        extended_list.append('tbd')
        continue

    # Order the coordinates so that start <= stop and remember the strand
    if int(row['Start']) < int(row['Stop']):
        start, stop, strand = row['Start'], row['Stop'], 'Positive'
    else:
        start, stop, strand = row['Stop'], row['Start'], 'Negative'

    aptamer_length = int(stop) - int(start)

    # Follow along with the search
    print('Currently looking for: '+ accession+' with aptamer length of '+str(aptamer_length))

    nucleotides, extended = access_NCBI(accession, start, stop, strand, aptamer_length)

    # Save information into list
    nucleotides_list.append(nucleotides)
    aptamer_length_list.append(aptamer_length)
    strand_list.append(strand)
    nt_len.append(len(nucleotides))
    extended_list.append(extended)

In [None]:
# Add gathered information to DataFrame (one new column per collected list)

gathered_columns = {
    'Extended Sequences': nucleotides_list,
    'Strand': strand_list,
}
for column_name, column_values in gathered_columns.items():
    fasta_df[column_name] = column_values

In [None]:
# Remove gathered sequences that contain an indeterminate nucleotide ('N').
# regex=False matches 'N' as a literal substring, and .copy() makes the
# filtered table an independent DataFrame so that adding columns to it later
# does not raise pandas' SettingWithCopyWarning.

fasta_df = fasta_df[~fasta_df['Extended Sequences'].str.contains('N', regex=False)].copy()

In [None]:
# Add on experimental sequences
# The first 5 nucleotides of the leader are removed because they can interfere
# with aptamer structure (the aptamer starts at position 10)

trimmed_sequences = fasta_df['Extended Sequences'].str.slice(start=5)
fasta_df['Twist Prepped Sequence'] = five_seq + trimmed_sequences + three_seq

In [None]:
# Calculate length of each prepared oligo (number of characters)

prepped_sequences = fasta_df['Twist Prepped Sequence']
fasta_df['Twist Length'] = prepped_sequences.map(len)

In [None]:
# Remove duplicate oligos: rows sharing the same extended sequence are
# collapsed to their first occurrence (pandas' default for `keep`)

fasta_df = fasta_df.drop_duplicates(subset='Extended Sequences')

In [None]:
# Export prepared oligos to Excel (the DataFrame index is not written)

fasta_df.to_excel(excel_writer='20240408_Twist_Order.xlsx', index=False)