In [1]:
# Packages

import pandas as pd
import re
import os
import sys
from Bio import SeqIO

# Install Selenium and chromedriver binary

import chromedriver_binary
import selenium
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options

# Set chromedriver-binary version

!{sys.executable} -m pip install selenium chromedriver-binary==127.0.6533.119
#!{sys.executable} -m pip install --upgrade --force-reinstall chromedriver-binary-auto

# Set chrome_options

chrome_options = Options()
chrome_options.add_argument("--disable-extensions")
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--no-sandbox") # linux only
chrome_options.add_argument("--headless")

# Get the current working directory

cwd = os.getcwd()

In [2]:
def runARNold(file_path):

    '''
    Submit a FASTA file to the ARNold web server, then retrieve and organize the results.

    Input:
        file_path = path to the FASTA file of extended Rfam entries; it is typed
                    into the page's file-upload field

    Output:
        results = list of the text lines of the ARNold result panel; when a
                  sequence has two terminator hits, a blank line and a copy of
                  its header line are inserted before the second hit
        result_hairpins = list of dicts, one per '>id,position' header found in
                  the result HTML, with keys "sequence_id", "position",
                  "blue" (flat list of blue span texts) and
                  "red" (list of lists of red span texts)

    The headless Chrome session is always closed, even if the page interaction
    fails part-way through.
    '''

    results_xpath = "//div[@id='content']/fieldset/span"

    # Opens a Chrome window (headless)
    driver = webdriver.Chrome(options=chrome_options)

    try:
        # Opens ARNOLD website on Chrome window
        driver.get('http://rssf.i2bc.paris-saclay.fr/toolbox/arnold/index.php')

        # Changes 'Search Strand' from 'Both' to 'Forward'
        driver.find_element(By.ID, '5prime').click()

        # Finds submit/'Run' button
        submit = driver.find_element(By.NAME, 'Run')

        # Finds the file-upload field and types the FASTA path into it
        driver.find_element(By.NAME, 'TTPfile').send_keys(file_path)

        # Click submit
        submit.click()

        # Grab the result panel once and read both its plain text and its HTML
        # while the browser session is still alive
        element = driver.find_element(By.XPATH, results_xpath)
        results = element.text.split('\n')
        html_content = element.get_attribute('innerHTML')
    finally:
        # Close Chrome window, even when one of the steps above raised;
        # otherwise every failed run leaves a headless Chrome process behind
        driver.quit()

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')

    # Extract sequences and their corresponding blue and red spans
    result_hairpins = []

    for line in soup.prettify().split('<br>'):
        # Collapse every run of whitespace/newlines into a single space
        line = re.sub(r'[\n\s]+', ' ', line).strip()

        # Skip empty lines
        if not line:
            continue

        # prettify() escapes the '>' of each header as '&gt;', so splitting on
        # '&gt' leaves every header chunk starting with ';'
        for item in line.split('&gt'):
            # Extract sequence identifier and position from ';<id>,<position>'
            match = re.match(r';(\S+),(\d+)', item)
            if match:
                result_hairpins.append({
                    "sequence_id": match.group(1),
                    "position": match.group(2),
                    "blue": [],
                    "red": [],
                })

            soup_line = BeautifulSoup(item, 'html.parser')
            blue_texts = [span.text for span in soup_line.find_all('span', style="color: blue;")]
            red_texts = [span.text for span in soup_line.find_all('span', style="color: red;")]
            if blue_texts or red_texts:
                # Coloured spans belong to the most recently seen header.
                # "blue" is extended (flat list) while "red" gets the whole
                # list appended (nested list).
                result_hairpins[-1]["blue"].extend(blue_texts)
                result_hairpins[-1]["red"].append(red_texts)

    # Finding sequences with dual terminator hits
    # NOTE(review): the range is fixed at the list's length before any insert,
    # and results[i+2] is not bounds-checked; this loop is left unchanged
    # because the layout of ARNold's trailing lines cannot be confirmed from here.
    for i in range(0, len(results)): # go through list
        if results[i].startswith('   '): # identify sequences (hit lines start with three spaces)
            if not results[i+2].startswith('>'): # identify duplicated sequence
                results.insert(i+1, "")
                species = results[i-1]
                results.insert(i+2, species) # give second terminator the same accession number

    return results, result_hairpins

In [3]:
# Load the Excel sheet of extended fluoride sequences (with their coding region).
# Later cells read its 'Accession', 'Start' and 'Extended Sequences' columns.

df = pd.read_excel(io='Fluoride_CDS.xlsx')

In [4]:
# Write every row of the DataFrame as one FASTA record to send to ARNold.
# The header is '>Accession,Start' so both values can be recovered from the
# ARNold output later on.

with open('ARNold_Submit.fasta', 'w') as submit_fasta:
    for accession, start, sequence in zip(df['Accession'], df['Start'], df['Extended Sequences']):
        header = '>' + accession + ',' + str(start)
        submit_fasta.write(header + '\n' + sequence + '\n')

In [8]:
# Write the FASTA file used by the downstream R-scape cmsearch step.

# Keep only the first row of each accession: duplicate entries cause R-scape issues.

is_repeat = df.duplicated(subset=['Accession'], keep='first')
df = df[~is_repeat]

with open('Fluoride_extended.fasta', 'w') as extended_fasta:
    for accession, sequence in zip(df['Accession'], df['Extended Sequences']):
        extended_fasta.write('>' + accession + '\n' + sequence + '\n')

In [5]:
# Path of the FASTA file that will be submitted to ARNold, plus its absolute
# form (resolved against the current working directory) for the upload field.

fasta_file_path = 'ARNold_Submit.fasta'
absolute_path = os.path.normpath(os.path.join(os.getcwd(), fasta_file_path))

In [6]:
# Submit the FASTA file to ARNold.
# ARNold_results holds the text lines of the result panel;
# ARNold_hairpins holds one dict of coloured span texts per header.

ARNold_results, ARNold_hairpins = runARNold(file_path=absolute_path)

In [None]:
# Build the hairpin DataFrame from the ARNold results in one chain:
#   1. one row per hairpin dict
#   2. prefix every 'sequence_id' with ">"
#   3. rename the columns to the names used by the other DataFrames

hairpin_column_names = {
    "sequence_id": "Accession",
    "position": "Start",
    "blue": "Stem",
    "red": "Loops"
}

ARNold_df = (
    pd.DataFrame(ARNold_hairpins)
    .assign(sequence_id=lambda frame: ">" + frame['sequence_id'])
    .rename(columns=hairpin_column_names)
)

In [None]:
# Create DataFrame that can be combined with the active information collection
#
# Walk the ARNold text lines. Each header line ('>accession,start') opens a
# record; the first following non-empty line that is not the 'Total number'
# summary becomes that record's result. Every other non-header line is printed
# so it can be inspected.

accession_list = []
region_start_list = []
ARNold_result_list = []

awaiting_result = False  # True between a header line and the result line stored for it

for text_line in ARNold_results:
    if not text_line:
        continue

    if '>' in text_line:
        header_fields = text_line.split(',')
        accession_list.append(header_fields[0])
        region_start_list.append(header_fields[1])
        awaiting_result = True
        continue

    if not awaiting_result:
        print(text_line)
    elif 'Total number' not in text_line:
        ARNold_result_list.append(text_line)
        awaiting_result = False

# 'Start' stays a string here (no integer conversion is applied).
ARNold_All_df = pd.DataFrame({
    'Accession': accession_list,
    'Start': region_start_list,
    'ARNold Results': ARNold_result_list,
})


In [None]:
# Outer-join the ARNold result lines with the hairpin spans on the
# ('Accession', 'Start') key pair.

merge_keys = ['Accession', 'Start']
merged_df = ARNold_All_df.merge(ARNold_df, how='outer', on=merge_keys)

In [None]:
# Initialize list

fasta_data = []

# Open the FASTA file that was submitted to ARNold and convert it into a
# DataFrame. The `with` block guarantees the file handle is closed once all
# records have been read (it was previously left open).

with open(fasta_file_path, 'r') as fasta_file:

    records = SeqIO.parse(fasta_file, "fasta")

    for record in records:

        # Split the sequence ID ('accession,start') into Accession and Start

        id_parts = record.id.split(',')
        sequence_id = '>' + id_parts[0]  # same '>'-prefixed form as the ARNold DataFrames
        sequence_suffix = id_parts[1]

        # Append the sequence ID, start and sequence to the list

        fasta_data.append([sequence_id, sequence_suffix, str(record.seq)])

fasta_df = pd.DataFrame(fasta_data, columns=["Accession", 'Start', "Full Sequence"])

In [None]:
# Attach the full sequences to the ARNold results (inner join on the
# 'Accession' and 'Start' columns), then drop every row that has a NaN.

merged_2_df = (
    fasta_df
    .merge(merged_df, how='inner', on=['Accession', 'Start'])
    .dropna()
)

In [None]:
# Save the merged table as an Excel workbook, without the index column.

merged_2_df.to_excel(excel_writer='Fluoride_CDS_ARNold.xlsx', index=False)

In [None]:
# Create FASTA file of predicted terminators to create alignment for R-scape

# Exact text ARNold reports when nothing was predicted (trailing space included)
NO_TERMINATOR_MESSAGE = 'No predicted transcription terminator. '

# Only sequences whose length lies strictly between these bounds are written:
# by limiting sequences on length, the alignment will be cleaner
MIN_TERMINATOR_LEN = 90
MAX_TERMINATOR_LEN = 110

# One terminator per accession
merged_2_df = merged_2_df.drop_duplicates(subset=['Accession'], keep='first')

with open('ARNold_Fluoride_terminators.fasta','w') as f:
    for index, row in merged_2_df.iterrows():
        ARNold_result = row['ARNold Results']
        if ARNold_result == NO_TERMINATOR_MESSAGE:
            continue

        # Space-separated fields of the ARNold result line, empty pieces removed
        result = [item for item in ARNold_result.split(' ') if item]

        # The first field is read as the terminator start and the length of the
        # fourth field as the terminator length.
        # NOTE(review): presumably field 4 is the predicted terminator
        # sequence - confirm against the ARNold output format.
        terminator_start = int(result[0])
        terminator_end = len(result[3])
        riboswitch_length = terminator_start + terminator_end

        # Everything from the start of the full sequence through the end of the terminator
        terminator = row['Full Sequence'][:riboswitch_length]

        # The 'Stem'/'Loops' columns are deliberately not indexed here: the
        # values were never used, and indexing them could raise IndexError and
        # abort the export part-way through the file.
        if MIN_TERMINATOR_LEN < len(terminator) < MAX_TERMINATOR_LEN:
            f.write(row['Accession']+'\n'+terminator+'\n')