In [2]:
#Importing all the necessary libraries
import pandas as pd
from Bio import SeqIO
from functions import *
import os
import time
import subprocess

In [93]:
# Capture the working directory at the moment this cell runs, as an absolute path.
BASE_DIR = os.getcwd()

### Make sure you have the following files ready:
1. A valid .bed file containing the genomic coordinates of the regions of interest
2. One or more FASTA files containing the DNA sequences relevant to the study

In [19]:
# Load the tab-separated .bed file of regions of interest into a DataFrame.
# NOTE(review): read_csv treats the first line as a header row by default —
# confirm that this .bed file actually starts with column names.
bed = pd.read_csv('CHM13_centric_transition.bed', sep = '\t') # This is an example path; change it to point at your own .bed file

In [20]:
# Ask the project helper for `number` positions of `length` bp based on the
# regions in `bed` (the output below shows 1000-bp Start/End windows).
# NOTE(review): the helper's name suggests random sampling and no seed is set
# here, so re-running this cell presumably yields different positions — confirm.
df = random_sequence_position_generator(length = 1000, number = 5, bed_file = bed) # Select the sequence length and the number of sequences

   Chromosome      Start        End              Region
0        chr1  145640216  145641216  centric_transition
1        chr1  146700594  146701594  centric_transition
2        chr1  146892080  146893080  centric_transition
3        chr1  146328948  146329948  centric_transition
4        chr1  144850695  144851695  centric_transition
5       chr10   45192297   45193297  centric_transition
6       chr10   44947068   44948068  centric_transition
7       chr10   43812043   43813043  centric_transition
8       chr10   43694666   43695666  centric_transition
9       chr10   44463655   44464655  centric_transition
10      chr11   56267648   56268648  centric_transition
11      chr11   58041871   58042871  centric_transition
12      chr11   57009309   57010309  centric_transition
13      chr11   58446528   58447528  centric_transition
14      chr11   58096330   58097330  centric_transition
15      chr12   39498897   39499897  centric_transition
16      chr12   39405636   39406636  centric_tra

In [21]:
#df.to_csv("centric_transition_region_positions.csv", index=False) #save it to a csv file

Get the DNA sequences from the fasta file

In [36]:
# Look up the DNA sequence for each position in `df` from the FASTA file.
# The output below shows the same columns as `df` plus a 'Sequence' column.
# 'CHM13.fasta' is a relative path, so it is resolved against the current working directory.
df_with_sequences = get_sequences(positions_df=df, fasta_file='CHM13.fasta')

  Chromosome      Start        End              Region  \
0       chr1  145640216  145641216  centric_transition   
1       chr1  146700594  146701594  centric_transition   
2       chr1  146892080  146893080  centric_transition   
3       chr1  146328948  146329948  centric_transition   
4       chr1  144850695  144851695  centric_transition   
5      chr10   45192297   45193297  centric_transition   
6      chr10   44947068   44948068  centric_transition   
7      chr10   43812043   43813043  centric_transition   
8      chr10   43694666   43695666  centric_transition   
9      chr10   44463655   44464655  centric_transition   

                                            Sequence  
0  CGCTTCTAGGCCCTTTTAAGAGAACAGAACTAGCAATGAATATTTT...  
1  GTAATGAAACTGTAGTCTCAGCTGGAAGCCTAGACATGAAATGGGT...  
2  ACCTGACCCTTGACCAGGGGATGCTGGGTTAACATGACTAAGGGTT...  
3  ATGGCTTAAAATATCATGTCATGAGTAGACAACTTCCAATCCTAGT...  
4  TCCTAAGTTTTAATTAAACGTTTAAACTAAAAACTGTACTCCTAGC...  
5  TCGCCTGCAAAGGGCTTAGCTTCTCATG

In [37]:
#df_with_sequences.to_csv("centric_transition_positions_and_sequences.csv", index=False) # Uncomment to save to a CSV file; the FASTA-conversion cell below reads this file

Convert the DNA sequences to fasta files

In [98]:
# Write each row of the saved positions/sequences table to its own FASTA file
# under ./input_seqs/<region>/, create the matching ./output/<region>/ folder,
# and record every FASTA path (relative to ./input_seqs/) in a new 'File' column.
seqDF = pd.read_csv("centric_transition_positions_and_sequences.csv")
file_path = "./input_seqs/"
os.makedirs(file_path, exist_ok=True)

file_names = []
for _, record in seqDF.iterrows():
    # Columns are addressed by position: 0=chromosome, 1=start, 2=end, 3=region, 4=sequence.
    chromosome, start, end, region, dna = (record.iloc[pos] for pos in range(5))
    sequence_id = f"{chromosome}_{region}_{start}_{end}"

    # One input sub-folder per region.
    os.makedirs(os.path.join(file_path, region), exist_ok=True)

    seq_filename = f"{region}/{sequence_id}.fa"
    file_names.append(seq_filename)
    write_fasta(file_path + seq_filename, {sequence_id: dna})

    # Create the per-region output folder for later results.
    os.makedirs(f"./output/{region}/", exist_ok=True)

seqDF['File'] = file_names

Running RNAfold

In [None]:
# Run RNAfold once per row of seqDF.
#
# Each FASTA file under input_seqs/ is fed to RNAfold on stdin and the folding
# result is written to output/<region>/<chromosome>_<region>_<start>_<end>.out.
# After every sequence a progress snapshot of seqDF (with a 'status' column:
# 1 = RNAfold exited with code 0, 0 = it failed) is saved to status.csv.
#
# The notebook's working directory is deliberately NOT changed: RNAfold is
# started with cwd=<output dir> instead, so relative paths used by other cells
# keep resolving against the same directory before and after this cell.
BASE_DIR2 = os.getcwd()

# Argument list executed without a shell, so file names taken from the
# DataFrame are never interpreted as shell syntax.
# NOTE(review): 'dna_mathews2004.par' is a relative path and is therefore
# resolved by RNAfold from its working directory (output/<region>), exactly as
# in the previous chdir-based version — confirm the parameter file is found.
command = ["RNAfold", "-d2", "-g", "--noLP", "-P", "dna_mathews2004.par", "--noconv"]

for index, row in seqDF.iterrows():
    filename = row.iloc[5]  # 'File' column: "<region>/<sequence_id>.fa"
    region = row.iloc[3]
    print(f"{filename} started at {time.asctime()}")

    output_dir = os.path.join(BASE_DIR2, "output", region)
    os.makedirs(output_dir, exist_ok=True)

    input_seq_file = os.path.join(BASE_DIR2, "input_seqs", filename)
    output_file = os.path.join(
        output_dir,
        f"{row.iloc[0]}_{row.iloc[3]}_{row.iloc[1]}_{row.iloc[2]}.out",
    )

    # Redirect stdin/stdout through explicit file handles (replaces the shell's
    # '<' and '>'); cwd makes any extra files RNAfold creates land in output_dir.
    with open(input_seq_file, "r") as fasta_in, open(output_file, "w") as fold_out:
        result = subprocess.run(command, stdin=fasta_in, stdout=fold_out, cwd=output_dir)

    succeeded = result.returncode == 0
    if not succeeded:
        print(f"WARNING: RNAfold exited with code {result.returncode} for {filename}")

    # Pause kept from the original workflow. subprocess.run already blocks
    # until RNAfold exits — NOTE(review): confirm whether it is still needed.
    time.sleep(8)

    seqDF.loc[index, "status"] = 1 if succeeded else 0
    seqDF.to_csv(os.path.join(BASE_DIR2, "status.csv"), index=False)

    print(filename, "done at", time.asctime())

Interpreting the results

In [None]:
# Parse every RNAfold .out file and store the extracted values in seqDF.
#
# Adds one column per name in val_list (initialised to ""), fills them for each
# row whose output file has content, and collects the index labels of rows
# whose output file is empty in `emptylist` (displayed at the end of the cell).
val_list = ['mfe', 'fete', 'ed']
for l in val_list:
    seqDF[l] = ""

emptylist = []
for index, row in seqDF.iterrows():
    # Positional columns: 0=chromosome, 1=start, 2=end, 3=region.
    # The path is anchored to BASE_DIR so it does not depend on the current
    # working directory.
    filepath = os.path.join(
        BASE_DIR,
        "output",
        str(row.iloc[3]),
        f"{row.iloc[0]}_{row.iloc[3]}_{row.iloc[1]}_{row.iloc[2]}.out",
    )
    with open(filepath, 'r') as file:
        file_contents = file.read()

    if file_contents == "":
        emptylist.append(index)
        continue

    # NOTE(review): assumes read_output returns the values in the same order
    # as val_list (mfe, fete, ed) — confirm against functions.read_output.
    arr = read_output(file_contents)
    for pos, val in enumerate(val_list):
        # Write to the current row with a single .loc call. A distinct loop
        # variable is used so the row label `index` is not overwritten.
        seqDF.loc[index, val] = arr[pos]

emptylist

In [None]:
# Save the final table under BASE_DIR so the file location does not depend on
# whatever the current working directory is when this cell runs.
seqDF.to_csv(os.path.join(BASE_DIR, "Final_Output.csv"), index = False)