In [None]:
import os
import csv
import RNA

In [None]:
# ------------------------------------------------------------------------------------------------------------------>
#                                           EIIP features
# ------------------------------------------------------------------------------------------------------------------>


# Lookup table giving one numeric value per DNA base (used as the EIIP feature).
# NOTE(review): the EIIP values usually quoted in the literature are
# A=0.1260, C=0.1340, G=0.0806, T=0.1335. The numbers below differ for C, G
# and T -- confirm they are intentional before relying on the generated files.
dna_to_number = {
    "A": 0.126,
    "T": 0.125,
    "C": 0.165,
    "G": 0.5,
}


def convert_sequence_to_numbers(sequence):
    """Encode a sequence as a string of mapped values, each followed by a comma.

    Every character of ``sequence`` that is a key of ``dna_to_number`` is
    replaced by ``str()`` of its mapped value plus a ``","``. Characters that
    are not keys of the table (for example ``U``, ``N`` or lower-case
    letters) are skipped silently.

    Args:
        sequence: Iterable of single-character bases, typically a ``str``.

    Returns:
        The concatenated ``"<value>,"`` tokens, e.g. ``"0.126,0.125,"`` for
        ``"AT"``; an empty string when nothing matches.
    """
    # The trailing comma is kept deliberately: the previous implementation
    # ended with a whitespace-only ``.strip()`` that never removed it, so
    # existing output files (and whatever parses them) already contain it.
    return "".join(
        str(dna_to_number[base]) + ","
        for base in sequence
        if base in dna_to_number
    )


# ------------------------------------------------------------------------------------------------------------------>
#                                           ENAC features
# ------------------------------------------------------------------------------------------------------------------>


def calculate_sliding_window_probabilities(sequence, k):
    """Return the nucleotide frequencies of every length-``k`` window.

    A window of width ``k`` is slid over ``sequence`` one position at a time.
    For each window the fraction of ``A``, ``C``, ``G`` and ``T`` characters
    (in that order) is computed as ``count / k``. Characters other than these
    four are not counted, so a row may sum to less than 1.

    Args:
        sequence: The sequence to scan (a ``str``).
        k: Window width; must be a positive integer.

    Returns:
        A list with ``len(sequence) - k + 1`` rows, each a list of four
        floats ``[fA, fC, fG, fT]``. The list is empty when ``sequence`` is
        shorter than ``k``.

    Raises:
        ValueError: If ``k`` is smaller than 1. A zero width would divide by
            zero and a negative width would produce meaningless windows.
    """
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k!r}")

    nucleotides = ('A', 'C', 'G', 'T')
    windows = (
        sequence[start:start + k]
        for start in range(len(sequence) - k + 1)
    )
    return [
        [window.count(base) / k for base in nucleotides]
        for window in windows
    ]

k = 5  # Window width (k-mer length) used for the ENAC sliding-window features



# ------------------------------------------------------------------------------------------------------------------>
#                                           Secondary Structure
# ------------------------------------------------------------------------------------------------------------------>



def predict_secondary_structure(rna_sequence):
    """Fold ``rna_sequence`` with ``RNA.fold`` and return the predicted structure.

    ``RNA.fold`` is unpacked as a two-item result (structure, energy); only
    the structure is handed back and the energy value is discarded.
    """
    structure, _min_free_energy = RNA.fold(rna_sequence)
    return structure


# ------------------------------------------------------------------------------------------------------------------>
#                                           NCP Features
# ------------------------------------------------------------------------------------------------------------------>

# Three binary lookup tables, one per per-base property used for the NCP
# features. Each maps the four DNA letters to 0 or 1.

# 1 for A and G, 0 for T and C.
ring_structure = {
    "A": 1,
    "T": 0,
    "C": 0,
    "G": 1,
}

# 1 for A and C, 0 for T and G.
amino = {
    "A": 1,
    "T": 0,
    "C": 1,
    "G": 0,
}

# 1 for C and G, 0 for A and T.
hydrogen = {
    "A": 0,
    "T": 0,
    "C": 1,
    "G": 1,
}


def _encode_with_table(sequence, table):
    """Return the mapped value of every base found in ``table``, each followed by ','.

    Bases that are not keys of ``table`` are skipped. The trailing comma is
    kept so the output matches what the three public converters have always
    produced.
    """
    return "".join(
        str(table[base]) + "," for base in sequence if base in table
    )


def convert_sequence_to_ring(sequence):
    """Encode ``sequence`` with ``ring_structure``, e.g. ``"AT"`` -> ``"1,0,"``."""
    return _encode_with_table(sequence, ring_structure)


def convert_sequence_to_amino(sequence):
    """Encode ``sequence`` with ``amino``, e.g. ``"AT"`` -> ``"1,0,"``."""
    return _encode_with_table(sequence, amino)


def convert_sequence_to_hydrogen(sequence):
    """Encode ``sequence`` with ``hydrogen``, e.g. ``"AC"`` -> ``"0,1,"``."""
    return _encode_with_table(sequence, hydrogen)


In [1]:




def get_output_folder_name(sequence_length):
    """Return the label ``"<lo>-<hi>"`` of the 100-wide bucket holding ``sequence_length``.

    ``lo`` is ``sequence_length`` rounded down to a multiple of 100 and
    ``hi`` is ``lo + 100``, so lengths 0..99 map to ``"0-100"``.
    """
    lower = sequence_length - sequence_length % 100
    return f"{lower}-{lower + 100}"

def _append_eiip_record(output_folder, rna_type, sequence):
    """Append one record (type row + encoded-sequence row) to its length-bucket CSV."""
    bucket = get_output_folder_name(len(sequence))
    output_filepath = os.path.join(output_folder, bucket + '.csv')
    with open(output_filepath, 'a', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow([f"RNA Type: {rna_type}"])
        writer.writerow([convert_sequence_to_numbers(sequence)])


def process_train_files(input_folder, output_folder):
    """Convert every ``Test_*`` file in ``input_folder`` into EIIP CSV files.

    Despite its name, this version only reads files whose names start with
    ``"Test_"``. Each file is parsed as FASTA-style text: a line starting
    with ``>`` begins a record and its last whitespace-separated token is
    used as the RNA type; all following lines up to the next header are
    concatenated into the sequence.

    For every record two rows are appended to
    ``<output_folder>/<bucket>.csv``, where the bucket comes from
    ``get_output_folder_name(len(sequence))``: ``"RNA Type: <type>"`` and the
    string returned by ``convert_sequence_to_numbers``.

    Args:
        input_folder: Directory containing the input files.
        output_folder: Directory for the CSV files; created if missing.
    """
    os.makedirs(output_folder, exist_ok=True)

    for filename in os.listdir(input_folder):
        if not filename.startswith("Test_"):
            continue

        with open(os.path.join(input_folder, filename), 'r') as infile:
            rna_type = ''
            sequence = ''
            for line in infile:
                if line.startswith('>'):
                    # A new header closes the record collected so far.
                    if sequence:
                        _append_eiip_record(output_folder, rna_type, sequence)
                        sequence = ''
                    rna_type = line.strip().split()[-1]
                else:
                    sequence += line.strip()

            # The last record has no following header. It is written with the
            # same two-row layout as every other record (the previous code
            # added an extra "Sequence <n>" row here only).
            if sequence:
                _append_eiip_record(output_folder, rna_type, sequence)

input_folder = "Original_data"  # Folder scanned for input files; only names starting with "Test_" are processed
output_folder = "EIIP_Test"  # Folder that receives the per-length-bucket EIIP CSV files

# Runs the conversion immediately when this cell executes (appends to files in output_folder)
process_train_files(input_folder, output_folder)


In [4]:


def get_output_filename(sequence_length):
    """Return the CSV file name ``"<lo>-<hi>.csv"`` for a sequence length.

    Lengths are grouped into 100-wide buckets: ``lo`` is the length rounded
    down to a multiple of 100 and ``hi`` is ``lo + 100``.
    """
    bucket_index = sequence_length // 100
    return "{}-{}.csv".format(bucket_index * 100, (bucket_index + 1) * 100)

def _append_enac_rows(output_subfolder, sequence, k):
    """Append the sliding-window frequency rows of one sequence to its bucket CSV."""
    output_filepath = os.path.join(
        output_subfolder, get_output_filename(len(sequence))
    )
    probabilities = calculate_sliding_window_probabilities(sequence, k)
    with open(output_filepath, 'a', newline='') as csvfile:
        csv.writer(csvfile).writerows(probabilities)


def process_train_files(input_folder, output_folder, k):
    """Write ENAC sliding-window features for every ``Train_*`` file.

    Each file in ``input_folder`` whose name starts with ``"Train_"`` is
    parsed as FASTA-style text (``>`` header lines separate records, the
    other lines are concatenated into the sequence). For every sequence the
    rows from ``calculate_sliding_window_probabilities(sequence, k)`` are
    appended to ``<output_folder>/<name>/<bucket>.csv``, where ``<bucket>``
    comes from ``get_output_filename(len(sequence))``.

    Header content is not written, and rows of consecutive sequences are
    appended back to back with no separator row.

    Args:
        input_folder: Directory containing the input files.
        output_folder: Root output directory; created if missing.
        k: Window width forwarded to ``calculate_sliding_window_probabilities``.
    """
    os.makedirs(output_folder, exist_ok=True)

    for filename in os.listdir(input_folder):
        if not filename.startswith("Train_"):
            continue

        input_filepath = os.path.join(input_folder, filename)
        # NOTE(review): dropping the last four characters assumes a
        # three-letter extension such as ".txt"; kept as-is so existing
        # output folder names do not change.
        output_subfolder = os.path.join(output_folder, filename[:-4])
        os.makedirs(output_subfolder, exist_ok=True)

        with open(input_filepath, 'r') as infile:
            sequence = ''
            for line in infile:
                if line.startswith('>'):
                    # A new header closes the sequence collected so far.
                    if sequence:
                        _append_enac_rows(output_subfolder, sequence, k)
                        sequence = ''
                else:
                    sequence += line.strip()

            # Flush the final sequence, which has no header after it.
            if sequence:
                _append_enac_rows(output_subfolder, sequence, k)

# Input and output folder paths
input_folder = "Original_data"  # Folder scanned for files whose names start with "Train_"
output_folder = "ENAC"  # Root output folder; one subfolder per input file is created beneath it


# Run the ENAC extraction now; k is the module-level window width set in an earlier cell
process_train_files(input_folder, output_folder, k)


In [5]:


def get_output_filename(sequence_length):
    """Map a sequence length to the name of its 100-wide bucket file.

    The result has the form ``"<lo>-<hi>.csv"`` with ``lo`` the largest
    multiple of 100 not exceeding the length and ``hi = lo + 100``.
    """
    hundreds, _remainder = divmod(sequence_length, 100)
    lower_bound = hundreds * 100
    upper_bound = lower_bound + 100
    return str(lower_bound) + "-" + str(upper_bound) + ".csv"

def _append_structure_record(output_subfolder, header, rna_sequence):
    """Append one record (header row + predicted-structure row) to its bucket CSV."""
    output_filepath = os.path.join(
        output_subfolder, get_output_filename(len(rna_sequence))
    )
    # Fold before opening the file so a failed prediction cannot leave a
    # header row without its structure row.
    ss = predict_secondary_structure(rna_sequence)
    with open(output_filepath, 'a', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["RNA Sequence Type:", header])
        writer.writerow(["Predicted Secondary Structure:", ss])


def process_train_files(input_folder, output_folder):
    """Predict secondary structures for every ``Test_*`` file in ``input_folder``.

    Despite its name, this version only reads files whose names start with
    ``"Test_"``. Each file is parsed as FASTA-style text: a line starting
    with ``>`` is a record header and the following lines are concatenated
    into the sequence. For every record two rows are appended to
    ``<output_folder>/<name>/<bucket>.csv``: the stripped header line of
    that record, and the result of ``predict_secondary_structure``.

    Args:
        input_folder: Directory containing the input files.
        output_folder: Root output directory; created if missing.
    """
    os.makedirs(output_folder, exist_ok=True)

    for filename in os.listdir(input_folder):
        if not filename.startswith("Test_"):
            continue

        input_filepath = os.path.join(input_folder, filename)
        # NOTE(review): dropping the last four characters assumes a
        # three-letter extension such as ".txt"; kept as-is so existing
        # output folder names do not change.
        output_subfolder = os.path.join(output_folder, filename[:-4])
        os.makedirs(output_subfolder, exist_ok=True)

        with open(input_filepath, 'r') as infile:
            # Header of the record currently being collected. It must be
            # remembered separately: when the next '>' line arrives, the
            # loop variable already holds the *next* record's header (the
            # previous code wrote that one, mislabelling every record).
            header = ''
            rna_sequence = ''
            for line in infile:
                if line.startswith('>'):
                    if rna_sequence:
                        _append_structure_record(
                            output_subfolder, header, rna_sequence
                        )
                        rna_sequence = ''
                    header = line.strip()
                else:
                    rna_sequence += line.strip()

            # Flush the final record, which has no header after it.
            if rna_sequence:
                _append_structure_record(output_subfolder, header, rna_sequence)

# Input and output folder paths
input_folder = "Original_data"  # Folder scanned for input files; only names starting with "Test_" are processed
output_folder = "Secondary_Structures_Test"  # Root output folder; one subfolder per input file is created beneath it

# Run the secondary-structure prediction now for the "Test_" files
process_train_files(input_folder, output_folder)



In [2]:
import os
import csv


def get_output_folder_name(sequence_length):
    """Return the 100-wide length bucket of ``sequence_length`` as ``"<lo>-<hi>"``.

    ``lo`` is the length rounded down to a multiple of 100; ``hi`` is
    ``lo + 100``.
    """
    start = 100 * (sequence_length // 100)
    bounds = (start, start + 100)
    return "-".join(map(str, bounds))

def _append_ncp_record(output_folder, rna_type, sequence):
    """Append one record (type row + ring/amino/hydrogen rows) to its bucket CSV."""
    bucket = get_output_folder_name(len(sequence))
    output_filepath = os.path.join(output_folder, bucket + '.csv')
    with open(output_filepath, 'a', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow([f"RNA Type: {rna_type}"])
        writer.writerow([convert_sequence_to_ring(sequence)])
        writer.writerow([convert_sequence_to_amino(sequence)])
        writer.writerow([convert_sequence_to_hydrogen(sequence)])


def process_train_files(input_folder, output_folder):
    """Convert every ``Test_*`` file in ``input_folder`` into NCP CSV files.

    Despite its name, this version only reads files whose names start with
    ``"Test_"``. Each file is parsed as FASTA-style text: a line starting
    with ``>`` begins a record and its last whitespace-separated token is
    used as the RNA type; the following lines are concatenated into the
    sequence.

    For every record four rows are appended to
    ``<output_folder>/<bucket>.csv`` (bucket from
    ``get_output_folder_name(len(sequence))``): ``"RNA Type: <type>"``
    followed by the ring, amino and hydrogen encodings of the sequence.

    Args:
        input_folder: Directory containing the input files.
        output_folder: Directory for the CSV files; created if missing.
    """
    os.makedirs(output_folder, exist_ok=True)

    for filename in os.listdir(input_folder):
        if not filename.startswith("Test_"):
            continue

        with open(os.path.join(input_folder, filename), 'r') as infile:
            rna_type = ''
            sequence = ''
            for line in infile:
                if line.startswith('>'):
                    # A new header closes the record collected so far.
                    if sequence:
                        _append_ncp_record(output_folder, rna_type, sequence)
                        sequence = ''
                    rna_type = line.strip().split()[-1]
                else:
                    sequence += line.strip()

            # Flush the final record, which has no header after it.
            if sequence:
                _append_ncp_record(output_folder, rna_type, sequence)

input_folder = "Original_data"  # Folder scanned for input files; only names starting with "Test_" are processed
output_folder = "NCP_Test"  # Folder that receives the per-length-bucket NCP CSV files

# Runs the conversion immediately when this cell executes (appends to files in output_folder)
process_train_files(input_folder, output_folder)
