# Step 3-4: identify the stable regions, stem-loops, bulges and internal loops

In [3]:
import pandas as pd
import numpy as np
import math
from tqdm import tqdm
import glob
import os
import csv
import shutil
# Record the library versions this notebook was run with.
print("pandas == {}\nnumpy == {}".format(pd.__version__, np.__version__))

pandas == 0.24.2
numpy == 1.16.5


# Functions to recognize stem-loops, bulges and internal loops

In [2]:
# Define a function to identify any m x n bulge/internal loop
def find_m_n_bulge(df, start_line, end_line, m, n):
    """Collect the unpaired nucleotides of every m x n bulge/internal loop in a range.

    A hit starts at a row ``i`` of the selected range and requires:

    * rows ``i`` and ``i + 1`` have consecutive decreasing ``pair_num``;
    * the next ``m`` rows all have ``pair_num == 0``;
    * rows ``i + 2 + m`` and ``i + 3 + m`` again have consecutive decreasing
      ``pair_num``, and the partner numbering skips exactly ``n`` positions
      across the gap;
    * the motif is not closed directly on itself
      (``nuc_num + m + 3 != pair_num``).

    Rows whose ``pair_num`` is smaller than their ``nuc_num`` are skipped so
    the same motif is not reported a second time from its other strand.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with columns ``nuc_num``, ``nuc_id`` and ``pair_num``.
        Assumes a default 0-based index in which row ``k`` holds nucleotide
        ``k + 1`` (``nuc_id`` is looked up with ``df.at[number - 1, ...]``)
        -- TODO confirm against the caller.
    start_line, end_line : int
        1-based, inclusive row range to scan.
    m : int
        Number of rows with ``pair_num == 0`` between the two stems.
    n : int
        Number of positions skipped in the partner numbering.

    Returns
    -------
    list of tuple
        ``(number, nuc_id, bulge_id)`` for every unpaired position found,
        where ``bulge_id`` is ``"B<nuc_num of row i, plus 2>"``.

    Raises
    ------
    ValueError
        If the requested range falls outside the index of ``df``.
    """
    # Convert the 1-based line numbers to 0-based positions
    start_index = start_line - 1
    end_index = end_line - 1

    if start_index < df.index.min() or end_index > df.index.max():
        raise ValueError("Start or end line number is out of bounds")

    window = df.iloc[start_index:end_index + 1]
    # Extract the numeric columns once: ``.iloc[i][col]`` inside the loop
    # would build a fresh row Series on every single access.
    nuc = window["nuc_num"].values
    pair = window["pair_num"].values

    sequence_num = []
    # The pattern spans rows i .. i + 3 + m, so stop early enough to fit it
    for i in range(len(window) - (3 + m)):
        gap_end = i + 2 + m
        # The m rows between the two stems must all be unpaired
        if m >= 1 and (pair[i + 2:gap_end] != 0).any():
            continue
        # Only scan from the strand with the lower numbering; this avoids
        # counting the same bulge twice
        if pair[i] < nuc[i]:
            continue
        is_hit = (
            pair[i] == pair[i + 1] + 1
            and pair[i + 1] == pair[gap_end] + 1 + n
            and pair[gap_end] == pair[gap_end + 1] + 1
            and nuc[i] + m + 3 != pair[i]
        )
        if not is_hit:
            continue

        first = int(nuc[i])
        partner = int(pair[i])
        bulge_id = "B{}".format(first + 2)
        # m positions following the first stem, then n positions counted
        # downwards from the partner side
        unpaired = list(range(first + 2, first + m + 2))
        unpaired += range(partner - 2, partner - 2 - n, -1)
        for number in unpaired:
            sequence_num.append((number, df.at[number - 1, 'nuc_id'], bulge_id))
    return sequence_num

In [3]:
# Define a function to list all bulges/internal loops in a given range
def bulge_search(df, start_line, end_line):
    """List every bulge/internal loop of size m x n (0 <= m, n <= 10) in a range.

    Each size combination except 0 x 0 is passed to ``find_m_n_bulge``;
    combinations that produce no hit are left out of the result.

    Returns a list of dicts with keys ``"m"``, ``"n"`` and ``"sequence_num"``
    (the non-empty list returned by ``find_m_n_bulge`` for that size).
    """
    sizes = [(m, n) for m in range(11) for n in range(11) if (m, n) != (0, 0)]
    # Lazily evaluated, so the searches run in the same m-major order
    searched = (
        (m, n, find_m_n_bulge(df, start_line, end_line, m, n))
        for m, n in sizes
    )
    return [
        {"m": m, "n": n, "sequence_num": hits}
        for m, n, hits in searched
        if hits
    ]

In [4]:
# Define a function to identify any stem-loop with an m-nucleotide loop
def find_m_loop(df, start_line, end_line, m=4):
    """Collect the loop nucleotides of every stem-loop with an m-nucleotide loop.

    A hit starts at a row ``i`` of the selected range and requires:

    * rows ``i`` and ``i + 1`` have consecutive decreasing ``pair_num``;
    * the following ``m`` rows all have ``pair_num == 0``;
    * row ``i`` pairs with ``nuc_num + 3 + m`` and row ``i + 1`` pairs with
      its own ``nuc_num + 1 + m``, i.e. the two pairs close directly around
      the ``m`` unpaired rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with columns ``nuc_num``, ``nuc_id`` and ``pair_num``.
        Assumes a default 0-based index in which row ``k`` holds nucleotide
        ``k + 1`` (``nuc_id`` is looked up with ``df.at[number - 1, ...]``)
        -- TODO confirm against the caller.
    start_line, end_line : int
        1-based, inclusive row range to scan.
    m : int, default 4
        Number of unpaired nucleotides in the loop.

    Returns
    -------
    list of tuple
        De-duplicated ``(number, nuc_id, loop_id)`` tuples in ascending
        order, where ``loop_id`` is ``"L<number of the first loop
        nucleotide>"``.

    Raises
    ------
    ValueError
        If the requested range falls outside the index of ``df``.
    """
    # Convert the 1-based line numbers to 0-based positions
    start_index = start_line - 1
    end_index = end_line - 1

    if start_index < df.index.min() or end_index > df.index.max():
        raise ValueError("Start or end line number is out of bounds")

    window = df.iloc[start_index:end_index + 1]
    # Extract the numeric columns once instead of building a row Series for
    # every ``.iloc[i][col]`` access inside the loop.
    nuc = window["nuc_num"].values
    pair = window["pair_num"].values

    found = set()
    # The pattern spans rows i .. i + 1 + m; the bound mirrors find_m_n_bulge
    for i in range(len(window) - (3 + m)):
        # All m rows after the two closing pairs must be unpaired
        if (pair[i + 2:i + 2 + m] != 0).any():
            continue
        closes_loop = (
            pair[i] == pair[i + 1] + 1
            and pair[i] == nuc[i] + 3 + m
            and pair[i + 1] == nuc[i + 1] + 1 + m
        )
        if not closes_loop:
            continue
        first_loop_nt = int(nuc[i]) + 2
        loop_id = "L{}".format(first_loop_nt)
        for number in range(first_loop_nt, first_loop_nt + m):
            found.add((number, df.at[number - 1, 'nuc_id'], loop_id))
    # Sort so the result (and the CSV written from it) does not depend on
    # set iteration order, which varies with string hash randomization.
    return sorted(found)

In [5]:
# Define a function to list all stem-loops in a given range
def loop_search(df, start_line, end_line):
    """List every stem-loop with a loop of 1 to 10 nucleotides in a range.

    Each loop size is passed to ``find_m_loop``; sizes that produce no hit
    are left out of the result.

    Returns a list of dicts with keys ``"m"`` and ``"sequence_num"`` (the
    non-empty list returned by ``find_m_loop`` for that size).
    """
    # Lazily evaluated, so the searches run in ascending loop size
    searched = (
        (size, find_m_loop(df, start_line, end_line, size))
        for size in range(1, 11)
    )
    return [
        {"m": size, "sequence_num": hits}
        for size, hits in searched
        if hits
    ]

# Function to search for stable regions that contain at least one stem-loop

In [42]:
def search_stable_region(transcript_name, threshold_dp=1, threshold_c1=0, threshold_c2=0.301):
    """Find stable, stem-loop-containing regions of a transcript and write them to CSV.

    Inputs (all relative to the current working directory):

    * ``./data/results_<name>.map_*/merged_<name>.map_*.dp`` -- tab-separated,
      header on the second line, with columns ``i``, ``j`` and
      ``-log10(Probability)``;
    * ``./data/results_<name>.map_*/merged_<name>.map_*.ct`` -- whitespace
      separated, first line skipped; columns 0, 1 and 4 are used as
      ``nuc_num``, ``nuc_id`` and ``pair_num``;
    * ``./data/<name>.map`` -- tab-separated, no header; column 3 is
      concatenated to form each region's sequence.

    Outputs:

    * ``./data/<name>_stableseq.csv`` -- tab-delimited ``a``, ``b``, ``sequence``;
    * ``./data/<name>_bulge.csv`` -- ``m``, ``n``, ``number``, ``nuc_id``, ``bulge_id``;
    * ``./data/<name>_loop.csv`` -- ``m``, ``number``, ``nuc_id``, ``loop_id``.

    Parameters
    ----------
    transcript_name : str
        Name used to build all input and output paths.
    threshold_dp : float, default 1
        Only rows with ``-log10(Probability)`` below this value are kept.
    threshold_c1 : float, default 0
        A pair inside a region whose ``c1`` exceeds this adds 3 to the
        region's ``c_combine`` score.
    threshold_c2 : float, default 0.301
        A pair inside a region whose ``c2`` exceeds this adds 1 to the
        region's ``c_combine`` score.

    Returns
    -------
    None

    Raises
    ------
    FileNotFoundError
        If no ``.dp`` or no ``.ct`` file matches its pattern.
    Exception
        If more than one ``.dp`` or ``.ct`` file matches its pattern.
    """
    # ---- Load the base-pair probability table --------------------------------
    pattern = './data/results_{0}.map_*/merged_{0}.map_*.dp'.format(transcript_name)
    file_list = glob.glob(pattern)
    if not file_list:
        raise FileNotFoundError("No files found for pattern {0}".format(pattern))
    elif len(file_list) > 1:
        raise Exception("Multiple files found for pattern {0}, please specify further.".format(pattern))
    file_path = file_list[0]
    df = pd.read_csv(file_path, sep='\t', header=1)
    # Filter data based on the '-log10(Probability)' threshold
    df_rel = df[df['-log10(Probability)'] < threshold_dp]
    df_rel = df_rel.reset_index(drop=True)

    # ---- Per-pair metrics ------------------------------------------------------
    df_rel['c1'] = 0  # c1 counts alternative pairs that conflict with the range i..j
    df_rel['c2'] = df_rel['-log10(Probability)']  # c2 is the mean -log10(Probability) of pairs touching the range
    df_rel['stable_closing'] = 0  # 1 when (i+1, j-1), (i+2, j-2) and (i+3, j-3) are all present
    # NOTE: the nested iterrows loops below are quadratic in the number of rows.
    for index, row in df_rel.iterrows():
        a = row['i']
        b = row['j']
        probabilities = []
        if (
            ((df_rel['i'] == a + 1) & (df_rel['j'] == b - 1)).any() and
            ((df_rel['i'] == a + 2) & (df_rel['j'] == b - 2)).any() and
            ((df_rel['i'] == a + 3) & (df_rel['j'] == b - 3)).any()
        ):
            df_rel.at[index, 'stable_closing'] += 1
        for _, row2 in df_rel.iterrows():
            a2 = row2['i']
            b2 = row2['j']
            if not ((a2 == a) and (b2 == b)):  # only other rows count towards c1
                # Alternative pair that starts inside the range and ends outside it.
                # NOTE(review): a pair with i outside and j inside the range is
                # not counted here -- confirm this asymmetry is intended.
                if (a <= a2 <= b) and not (a <= b2 <= b):
                    df_rel.at[index, 'c1'] += 1
                # Alternative partner, within the range, for either end of this pair
                if ((a2 == a) and (a < b2 < b)) or ((a < a2 < b) and (b2 == b)):
                    df_rel.at[index, 'c1'] += 1
            if a <= a2 <= b or a <= b2 <= b:  # "<=" also includes the outer-loop row itself
                probabilities.append(row2['-log10(Probability)'])
        if probabilities:
            avg_probability = np.mean(probabilities)
            df_rel.at[index, 'c2'] = avg_probability

    # ---- Combined score per region ---------------------------------------------
    df_rel['c_combine'] = 0
    for index, row in df_rel.iterrows():
        a = row['i']
        b = row['j']
        c_count = 0
        for index2, row2 in df_rel.iterrows():
            if a <= row2['i'] <= b and a <= row2['j'] <= b:
                if row2['c1'] > threshold_c1:
                    c_count += 3
                if row2['c2'] > threshold_c2:
                    # With the "<= 1" cut-off below, a region tolerates one
                    # such pair and no pair that exceeds threshold_c1.
                    c_count += 1
        df_rel.at[index, 'c_combine'] = c_count
    df_filt = df_rel[(df_rel['c_combine'] <= 1) & (df_rel['stable_closing'] == 1)]
    df_filt = df_filt.reset_index(drop=True)

    # ---- Keep only regions not covered by another retained region --------------
    df_filt['max_range'] = 0
    for index, row in df_filt.iterrows():
        a = row['i']
        b = row['j']
        max_range_count = 0
        for index2, other_row in df_filt.iterrows():
            if index != index2:
                if other_row['i'] <= a and other_row['j'] >= b:
                    max_range_count += 1
        df_filt.at[index, 'max_range'] = max_range_count
    df_filt2 = df_filt[df_filt['max_range'] == 0]
    df_filt2 = df_filt2.reset_index(drop=True)

    # ---- Load the predicted structure (.ct) ------------------------------------
    pattern2 = './data/results_%s.map_*/merged_%s.map_*.ct' % (transcript_name, transcript_name)
    file_list2 = glob.glob(pattern2)
    # Report the .ct pattern here (the .dp pattern was checked above)
    if not file_list2:
        raise FileNotFoundError("No files found for pattern: " + pattern2)
    elif len(file_list2) > 1:
        raise Exception("Multiple files found for pattern: " + pattern2 + ", please specify further.")
    file_path2 = file_list2[0]
    fold_df = pd.read_csv(file_path2, sep='\\s+', header=None, skiprows=1)
    fold_df = fold_df.drop(columns=[2, 3, 5])
    fold_df.columns = ["nuc_num", "nuc_id", "pair_num"]

    # ---- Search each region for bulges/internal loops and stem-loops -----------
    bulge_seqs = []
    loop_seqs = []
    df_filt2['basepair'] = 0
    for index, row in df_filt2.iterrows():
        a = int(row['i'])  # the search functions take 1-based line numbers
        b = int(row['j'])
        bulge_seq = bulge_search(fold_df, a, b)
        flat_bulge_seq = [item for sublist in bulge_seq for item in sublist['sequence_num']]
        loop_seq = loop_search(fold_df, a, b)
        flat_loop_seq = [item for sublist in loop_seq for item in sublist['sequence_num']]
        if not flat_loop_seq:  # at least one loop must be found in the region
            df_filt2.loc[index, 'basepair'] = 0
            continue
        # Positions in the range that were not reported as bulge or loop members
        paired = b - a + 1 - len(flat_bulge_seq) - len(flat_loop_seq)
        df_filt2.loc[index, 'basepair'] = paired
        if paired < 8:  # need at least 4 base pairs throughout the region
            continue
        bulge_seqs.append(bulge_seq)
        loop_seqs.append(loop_seq)
    # Drop regions with fewer than 8 paired positions (this includes regions
    # without any loop, whose 'basepair' was set to 0 above)
    df_filt2 = df_filt2[df_filt2['basepair'] >= 8]
    bulge_seqs = [element for element in bulge_seqs if element]  # drop empty elements
    bulge_flist = [item for sublist in bulge_seqs for item in sublist]  # flatten by one level
    loop_seqs = [element for element in loop_seqs if element]
    loop_flist = [item for sublist in loop_seqs for item in sublist]

    # ---- Extract the sequence of every retained region -------------------------
    df_map_path = './data/{0}.map'.format(transcript_name)
    df_map = pd.read_csv(df_map_path, sep='\t', header=None)
    formatted_lines = []
    for index, row in df_filt2.iterrows():
        a = int(row['i']) - 1  # 0-based start for positional slicing
        b = int(row['j'])
        extracted_values = df_map.iloc[a:b, [3]].values.T
        sequence = ''.join(map(str, extracted_values.flatten()))
        formatted_lines.append([a + 1, b, sequence])

    # ---- Write the three result files ------------------------------------------
    # newline='' lets the csv module control line endings itself, as its
    # documentation requires.
    output_file_path = './data/{0}_stableseq.csv'.format(transcript_name)
    with open(output_file_path, 'w', newline='') as f:
        writer = csv.writer(f, delimiter='\t')
        writer.writerow(['a', 'b', 'sequence'])
        writer.writerows(formatted_lines)

    bulge_file_path = './data/{0}_bulge.csv'.format(transcript_name)
    with open(bulge_file_path, 'w', newline='') as f:
        writer = csv.writer(f)
        flat_list = []
        for item in bulge_flist:
            m_value = item['m']
            n_value = item.get('n', '')  # empty string if 'n' is absent
            for seq in item['sequence_num']:
                flat_list.append((m_value, n_value, seq[0], seq[1], seq[2]))
        writer.writerow(['m', 'n', 'number', 'nuc_id', 'bulge_id'])
        writer.writerows(flat_list)

    loop_file_path = './data/{0}_loop.csv'.format(transcript_name)
    with open(loop_file_path, 'w', newline='') as f:
        writer = csv.writer(f)
        flat_list = []
        for item in loop_flist:
            m_value = item['m']
            for seq in item['sequence_num']:
                flat_list.append((m_value, seq[0], seq[1], seq[2]))
        writer.writerow(['m', 'number', 'nuc_id', 'loop_id'])
        writer.writerows(flat_list)

In [8]:
# human invivo data
# Define the folder path and the output CSV file name
folder_name = "human_invivo_map"
data_folder = os.path.join('./', folder_name + '_result')
output_csv = os.path.join(data_folder, folder_name + '_list.csv')

# Names of all *.map files in the data folder, without the .map extension.
# os.listdir returns entries in arbitrary order, so sort for a reproducible list.
map_files = sorted(os.path.splitext(f)[0] for f in os.listdir(data_folder) if f.endswith('.map'))

# Write the list to a CSV file without a header, one name per row.
# newline='' lets the csv module control line endings itself.
with open(output_csv, 'w', newline='') as csvfile:
    csvwriter = csv.writer(csvfile)
    for map_file in map_files:
        csvwriter.writerow([map_file])

In [None]:
# Before running this cell, rename the folder "human_invivo_map_result" to "data":
# search_stable_region reads its inputs from ./data and writes its results there.
for map_file in tqdm(map_files):
    search_stable_region(map_file, threshold_dp=1, threshold_c1=0, threshold_c2=0.301)

In [22]:
base_folder = "data"
new_folder_name = os.path.join(base_folder, 'data_' + folder_name)
if not os.path.exists(new_folder_name):
    os.makedirs(new_folder_name)
# Gather every .csv and .map entry of the base folder, then move each one
# into the new sub-folder
movable = [entry for entry in os.listdir(base_folder) if entry.endswith(('.csv', '.map'))]
for filename in movable:
    source = os.path.join(base_folder, filename)
    destination = os.path.join(new_folder_name, filename)
    shutil.move(source, destination)

In [None]:
# human invitro data
# Define the folder path and the output CSV file name
folder_name = "human_invitro_map"
data_folder = os.path.join('./', folder_name + '_result')
output_csv = os.path.join(data_folder, folder_name + '_list.csv')

# Names of all *.map files in the data folder, without the .map extension.
# os.listdir returns entries in arbitrary order, so sort for a reproducible list.
map_files = sorted(os.path.splitext(f)[0] for f in os.listdir(data_folder) if f.endswith('.map'))

# Write the list to a CSV file without a header, one name per row.
# newline='' lets the csv module control line endings itself.
with open(output_csv, 'w', newline='') as csvfile:
    csvwriter = csv.writer(csvfile)
    for map_file in map_files:
        csvwriter.writerow([map_file])

In [None]:
# Before running this cell, rename the folder "human_invitro_map_result" to "data":
# search_stable_region reads its inputs from ./data and writes its results there.
for map_file in tqdm(map_files):
    search_stable_region(map_file, threshold_dp=1, threshold_c1=0, threshold_c2=0.301)

In [None]:
base_folder = "data"
new_folder_name = os.path.join(base_folder, 'data_' + folder_name)
if not os.path.exists(new_folder_name):
    os.makedirs(new_folder_name)
# Gather every .csv and .map entry of the base folder, then move each one
# into the new sub-folder
movable = [entry for entry in os.listdir(base_folder) if entry.endswith(('.csv', '.map'))]
for filename in movable:
    source = os.path.join(base_folder, filename)
    destination = os.path.join(new_folder_name, filename)
    shutil.move(source, destination)