In [11]:
# Packages

import re
import pandas as pd
from Bio import SeqIO

In [38]:
# Column layout for the parsed aptamer alignment table

column_names = ["ID", "Start", "Stop", "Aptamer alignment"]

# Rows collected from the alignment file, one list per sequence line

data = []

# Read the .sto file from the nhmmer aptamer alignment.
# Lines containing '#' and lines of four characters or fewer (newline
# included) are skipped; every remaining line is expected to look like
# "<ID>/<start>-<stop> <alignment>".

with open("Predicted_RF01734_profile.sto", "r") as handle:
    for raw_line in handle:
        if '#' in raw_line or len(raw_line) <= 4:
            continue
        fields = raw_line.strip().split("/")
        # Text before the first space is the coordinate range, the rest is the alignment
        coords, alignment = fields[1].partition(' ')[::2]
        bounds = coords.split('-')
        data.append([fields[0], bounds[0], bounds[1], alignment])

sto_df = pd.DataFrame(data, columns=column_names)

In [39]:
# Translation table that deletes the gap characters ('.' and '-') and all digits,
# so that only the sequence letters of a .sto alignment string remain

translation_table = str.maketrans({character: '' for character in '.-1234567890'})

# Reduce every alignment string to its bare sequence:
# drop gaps/digits, drop spaces, then upper-case

sequence_list = [
    aligned.translate(translation_table).replace(' ', '').upper()
    for aligned in sto_df['Aptamer alignment']
]

# Store the sequences next to the alignments they came from

sto_df['alignment sequence'] = sequence_list

In [40]:
# Initialize list

fasta_data = []

# Open FASTA file of terminators and convert it into a DataFrame.
# The file is opened in a `with` block so the handle is closed once parsing
# finishes (it was previously left open for the lifetime of the kernel).

with open('ARNold_Fluoride_terminators.fasta', 'r') as fasta_file:

    # SeqIO.parse yields records lazily, so it must be consumed while the file is open

    records = SeqIO.parse(fasta_file, "fasta")

    for record in records:

        # Append sequence ID and sequence to the list

        fasta_data.append([record.id, str(record.seq)])

fasta_df = pd.DataFrame(fasta_data, columns=["ID", "Full Sequence"])

In [41]:
# Inner-join the aptamer STO table with the ARNold FASTA table on 'ID', so each
# aligned aptamer row also carries the full sequence it belongs to

merged_df = sto_df.merge(fasta_df, how='inner', on='ID')

In [42]:
# Gather the sequence not aligned to the fluoride aptamer: for every row, keep
# the part of the full FASTA sequence from index 'Stop' (parsed from the STO
# file, converted to int) to the end.
# NOTE(review): assumes 'Stop' is the last aligned position counted from 1 on
# the same strand as 'Full Sequence' - TODO confirm

unaligned_sequence = [
    whole_sequence[int(stop_position):]
    for stop_position, whole_sequence in zip(merged_df['Stop'], merged_df['Full Sequence'])
]

# Add post-aptamer alignment sequence to DataFrame

merged_df['Unaligned Sequence'] = unaligned_sequence

In [44]:
# Export FASTA file of post-aptamer alignment sequences:
# one '>ID' header line followed by one sequence line per row of merged_df

with open('Predicted_Terminators_Post_Aptamer.fasta', 'w') as f:
    f.writelines(
        '>' + str(seq_id) + '\n' + str(tail) + '\n'
        for seq_id, tail in zip(merged_df['ID'], merged_df['Unaligned Sequence'])
    )

In [45]:
# Column layout for the parsed post-aptamer alignment table

column_names = ["ID", "Start 3' end alignment", "Stop 3' end alignment", "EP Alignment"]

# Rows collected from the alignment file, one list per sequence line

data = []

# Read the .sto file from the MUSCLE post-aptamer alignment.
# Lines containing '#' and lines of four characters or fewer (newline
# included) are skipped; every remaining line is expected to look like
# "<ID>/<start>-<stop> <alignment>".

with open("Predicted_Post_Aptamer_Alignment.sto", "r") as handle:
    for raw_line in handle:
        if '#' in raw_line or len(raw_line) <= 4:
            continue
        fields = raw_line.strip().split("/")
        # Text before the first space is the coordinate range, the rest is the alignment
        coords, alignment = fields[1].partition(' ')[::2]
        bounds = coords.split('-')
        data.append([fields[0], bounds[0], bounds[1], alignment])

alignment_df = pd.DataFrame(data, columns=column_names)

In [46]:
# Inner-join the aptamer/FASTA table with the post-aptamer STO table on 'ID', so
# each row carries both the aptamer alignment and the post-aptamer alignment

merged_alignment_df = merged_df.merge(alignment_df, how='inner', on='ID')

In [47]:
# Keep only the first row for each ID

df_no_duplicates = merged_alignment_df.drop_duplicates(subset=['ID'])

# Export an STO file in which each row is the aptamer alignment immediately
# followed by the post-aptamer (EP) alignment

with open('Predicted_Whole_Fluoride_Alignment.sto', 'w') as f:

    # Header block, terminated by an empty line
    f.writelines([
        '# STOCKHOLM 1.0\n',
        '#=GF Alignment: cmalign to RF01734 for aptamer region\n',
        '#=GF Alignment Program MuscleWS for post-aptamer\n',
        '\n',
    ])

    row_values = zip(
        df_no_duplicates['ID'],
        df_no_duplicates['Aptamer alignment'],
        df_no_duplicates['EP Alignment'],
    )
    for accession, aptamer_part, post_aptamer_part in row_values:
        # Concatenate both alignments, remove spaces, then write upper-case U as T
        combined = (str(aptamer_part) + str(post_aptamer_part)).replace(' ', '').replace('U', 'T')
        f.write(str(accession) + '\t' + combined + '\n')

    # End-of-alignment marker
    f.write('//')