In [4]:
import os
import pandas as pd
import subprocess
os.environ["DATAPATH"] = "/Users/edricchoi/LucksLab Dropbox/LucksLab/Group_Members/EKC/Projects/Analysis_tools/rna_repos/RNAstructure/data_tables/"

In [9]:
def break_base_pairs(dot_bracket):
    pairs = []
    stack = []
    
    # Identify base pairs
    for i, char in enumerate(dot_bracket):
        if char == '(':
            stack.append(i)
        elif char == ')':
            if stack:
                pairs.append((stack.pop(), i))
    
    # Generate new structures with one broken base pair
    broken_structures = []
    for bp in pairs:
        i, j = bp
        new_structure = list(dot_bracket)
        new_structure[i] = '.'
        new_structure[j] = '.'
        broken_structures.append("".join(new_structure))
    
    return (broken_structures, pairs)

# read sequence and structures from 01.061.001.NN_penalty/fourU_thermometer/fourU_structures.db (fasta style >sequence\nstructure)
def parse_rna_data(file_path):
    entries = []
    with open(file_path, "r") as f:
        lines = [line.strip() for line in f if line.strip()]
    for i in range(0, len(lines), 3):
        if '>' not in lines[i]:
            raise ValueError("Invalid FASTA format")
        entry = {
            "name": lines[i][1:],  # Remove '>' from name
            "sequence": lines[i + 1],
            "structure": lines[i + 2]
        }
        entries.append(entry)
    
    return entries

def generate_db_all(entry):
    dot_bracket = entry["structure"]
    broken_structures, pairs = break_base_pairs(dot_bracket)
    # insert (0, 0) at the beginning of pairs
    pairs.insert(0, (-1, -1))

    # create new db file with >name\nsequence\nstructure followed by broken structures
    with open('temp.db', 'w') as f:
        f.write(f">{entry['name']}\n{entry['sequence']}\n{entry['structure']}\n")
        for i, structure in enumerate(broken_structures):
            f.write(f"{structure}\n")
    return 'temp.db', pairs

def calc_bp_penalties(db_file, temp = 37):
    entries = parse_rna_data(db_file)

    for entry in entries:
        name = entry['name']
        sequence = entry['sequence']

        # skip if contains "noprimer" because results are exactly the same
        if 'noprimer' in name:
            continue

        print(f'Processing {name} ...')

        dball_file, pairs = generate_db_all(entry)
        print(f'Generated db with single bp broken ...')

        # run efn2
        subprocess.run(['efn2', dball_file, 'temp_efn2_output.txt', '-t', str(temp + 273.15)])

        # read efn2_output into pd dataframe
        df = pd.read_csv('temp_efn2_output.txt', sep='Energy = ', header = None, engine = 'python')
        df[0] = pairs
        df['bp_res1'] = df[0].str[0]
        df['bp_base1'] = df['bp_res1'].apply(lambda x: sequence[x])
        df['bp_res2'] = df[0].str[1]
        df['bp_base2'] = df['bp_res2'].apply(lambda x: sequence[x])
        df['energy'] = df[1].str.split(' ± ').str[0].astype(float)
        df['err'] = df[1].str.split(' ± ').str[1].astype(float)
        df.drop(columns = [0, 1], inplace = True)
        
        # delete temp files
        os.remove('temp.db')
        os.remove('temp_efn2_output.txt')

        # output dir to base folder of db_file
        output_dir = f'{os.path.dirname(db_file)}/energy_calcs_{temp}'
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
            
        # save to csv in 01.061.001.NN_penalty/energy_calcs
        df.to_csv(f'{output_dir}/{name}_energy_calc.csv', index = False)
        print(f'Output saved to {output_dir}/{sequence}_energy_calc.csv\n')

def calc_bp_prob(db_file, temp = 37):
    entries = parse_rna_data(db_file)

    for entry in entries:
        name = entry['name']
        seq = entry['sequence']

        # Write a temporary seq file
        with open('temp.seq', 'w') as f:
            f.write(f";\n{name}\n")
            f.write(f"{seq}1\n")
        
        # Run partition and save to temporary pfs file
        subprocess.run(['partition', 'temp.seq', 'temp_partition_output.pfs', '-t', str(temp + 273.15)])

        # output dir to base folder of db_file
        output_dir = f'{os.path.dirname(db_file)}/pfunc_calcs_{temp}'
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
            
        # Save partition function output to a temporary file
        subprocess.run(['probabilityplot', 'temp_partition_output.pfs', f'{output_dir}/{name}_pfunc_bp_prob.txt', '--text'])
        subprocess.run(['probabilityplot', 'temp_partition_output.pfs', f'{output_dir}/{name}_pfunc_bp_prob.svg', '--svg'])

        # Remove temporary files
        os.remove('temp.seq')
        os.remove('temp_partition_output.pfs')

        # Read txt into dataframe
        df = pd.read_csv(f'{output_dir}/{name}_pfunc_bp_prob.txt', sep = '\t', skiprows=1)
        # i and j are 1-indexed
        df['bp_res1'] = df['i'] - 1
        df['bp_base1'] = df['bp_res1'].apply(lambda x: seq[x])
        df['bp_res2'] = df['j'] - 1
        df['bp_base2'] = df['bp_res2'].apply(lambda x: seq[x])
        df['prob'] = 10 ** -df['-log10(Probability)']
        df.drop(columns = ['i', 'j'], inplace = True)

        df.to_csv(f'{output_dir}/{name}_pfunc_bp_prob.csv', index = False)

In [6]:
## Calculate bp penalties (efn2) 37C

main_dir = '/Users/edricchoi/LucksLab Dropbox/LucksLab/Group_Members/EKC/Projects/EKC.01_SHAPE_Standardization/Experiments/01.061.calculate_energy_correlations/01.061.001.NN_penalty'
calc_bp_penalties(f'{main_dir}/fourU_thermometer/fourU_structures.db')
calc_bp_penalties(f'{main_dir}/HIV/HIV_structures.db')
calc_bp_penalties(f'{main_dir}/P4P6/P4P6_structures.db')

Processing fourU_A8C ...
Generated db with single bp broken ...
Initializing nucleic acids...done.
Calculating free energies...done.
efn2 complete.
Output saved to /Users/edricchoi/LucksLab Dropbox/LucksLab/Group_Members/EKC/Projects/EKC.01_SHAPE_Standardization/Experiments/01.061.calculate_energy_correlations/01.061.001.NN_penalty/fourU_thermometer/energy_calcs_37/gguguaagggugaaguguaAGGUUGACCUUUUGAAUAGUGAUUCAGGAGGUUAAUGGAAGuaaagguaaugaaggugaag_energy_calc.csv

Processing fourU_WT ...
Generated db with single bp broken ...
Initializing nucleic acids...done.
Calculating free energies...done.
efn2 complete.
Output saved to /Users/edricchoi/LucksLab Dropbox/LucksLab/Group_Members/EKC/Projects/EKC.01_SHAPE_Standardization/Experiments/01.061.calculate_energy_correlations/01.061.001.NN_penalty/fourU_thermometer/energy_calcs_37/gguguaagggugaaguguaAGGUUGAACUUUUGAAUAGUGAUUCAGGAGGUUAAUGGAAGuaaagguaaugaaggugaag_energy_calc.csv

Processing HIV-1_TAR_WT_GS ...
Generated db with single bp broken ...

In [7]:
## Calculate bp penalties (efn2) RT

calc_bp_penalties(f'{main_dir}/fourU_thermometer/fourU_structures.db', 20)
calc_bp_penalties(f'{main_dir}/HIV/HIV_structures.db', 25)
calc_bp_penalties(f'{main_dir}/P4P6/P4P6_structures.db', 23)

Processing fourU_A8C ...
Generated db with single bp broken ...
Initializing nucleic acids...done.
Setting temperature...done.
Calculating free energies...done.
efn2 complete.
Output saved to /Users/edricchoi/LucksLab Dropbox/LucksLab/Group_Members/EKC/Projects/EKC.01_SHAPE_Standardization/Experiments/01.061.calculate_energy_correlations/01.061.001.NN_penalty/fourU_thermometer/energy_calcs_20/gguguaagggugaaguguaAGGUUGACCUUUUGAAUAGUGAUUCAGGAGGUUAAUGGAAGuaaagguaaugaaggugaag_energy_calc.csv

Processing fourU_WT ...
Generated db with single bp broken ...
Initializing nucleic acids...done.
Setting temperature...done.
Calculating free energies...done.
efn2 complete.
Output saved to /Users/edricchoi/LucksLab Dropbox/LucksLab/Group_Members/EKC/Projects/EKC.01_SHAPE_Standardization/Experiments/01.061.calculate_energy_correlations/01.061.001.NN_penalty/fourU_thermometer/energy_calcs_20/gguguaagggugaaguguaAGGUUGAACUUUUGAAUAGUGAUUCAGGAGGUUAAUGGAAGuaaagguaaugaaggugaag_energy_calc.csv

Processing HI

In [10]:
## Calculate bp probabilities (pfunc) 37C

main_dir = '/Users/edricchoi/LucksLab Dropbox/LucksLab/Group_Members/EKC/Projects/EKC.01_SHAPE_Standardization/Experiments/01.061.calculate_energy_correlations/01.061.001.NN_penalty'
calc_bp_prob(f'{main_dir}/fourU_thermometer/fourU_structures.db', 37)
calc_bp_prob(f'{main_dir}/HIV/HIV_structures.db', 37)
calc_bp_prob(f'{main_dir}/P4P6/P4P6_structures.db', 37)

Initializing nucleic acids...done.
Single strand partition function complete.
Reading dot plot data...done.
Writing text file...done.
Probability dot plot complete.
Reading dot plot data...done.
Writing SVG image...done.
Probability dot plot complete.
Initializing nucleic acids...done.
Single strand partition function complete.
Reading dot plot data...done.
Writing text file...done.
Probability dot plot complete.
Reading dot plot data...done.
Writing SVG image...done.
Probability dot plot complete.
Initializing nucleic acids...done.
Single strand partition function complete.
Reading dot plot data...done.
Writing text file...done.
Probability dot plot complete.
Reading dot plot data...done.
Writing SVG image...done.
Probability dot plot complete.
Initializing nucleic acids...done.
Single strand partition function complete.
Reading dot plot data...done.
Writing text file...done.
Probability dot plot complete.
Reading dot plot data...done.
Writing SVG image...done.
Probability dot plot co

In [11]:
## Calculate bp probabilities (pfunc) RT

calc_bp_prob(f'{main_dir}/fourU_thermometer/fourU_structures.db', 20)
calc_bp_prob(f'{main_dir}/HIV/HIV_structures.db', 25)
calc_bp_prob(f'{main_dir}/P4P6/P4P6_structures.db', 23)

Initializing nucleic acids...done.
Setting temperature...done.
Single strand partition function complete.
Reading dot plot data...done.
Writing text file...done.
Probability dot plot complete.
Reading dot plot data...done.
Writing SVG image...done.
Probability dot plot complete.
Initializing nucleic acids...done.
Setting temperature...done.
Single strand partition function complete.
Reading dot plot data...done.
Writing text file...done.
Probability dot plot complete.
Reading dot plot data...done.
Writing SVG image...done.
Probability dot plot complete.
Initializing nucleic acids...done.
Setting temperature...done.
Single strand partition function complete.
Reading dot plot data...done.
Writing text file...done.
Probability dot plot complete.
Reading dot plot data...done.
Writing SVG image...done.
Probability dot plot complete.
Initializing nucleic acids...done.
Setting temperature...done.
Single strand partition function complete.
Reading dot plot data...done.
Writing text file...done