To plot: number of potential stems, overlaps, and pseudoknots as a function of RNA length

In [1]:
# import packages:

import numpy as np
import pandas as pd
import math
import os
import glob
import matplotlib
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit

In [2]:
def potential_stems(seq_ps):
    """Enumerate candidate stems for one sequence file in ./data/archiveII/.

    The sequence is read from the third line of the file. Two positions may
    pair when their bases form AU, UA, GU, UG, GC or CG; the comparison is
    case-insensitive. For every start (row, col) with row < col, the walk
    moves inward (row + 1, col - 1) for as long as the bases keep pairing.
    Each time the run length reaches 3 or more, one record is emitted, so a
    run of 5 stacked pairs yields records of length 3, 4 and 5 for the same
    outer pair.

    Args:
        seq_ps: Name of the file inside ``./data/archiveII/``.

    Returns:
        A two-element list ``[stems, n]``. ``stems`` is a list of
        ``[row + 1, col + 1, length, col - row - 2 * length]`` lists (1-based
        outer pair, number of stacked pairs, and the fourth field exactly as
        computed here). ``n`` is the number of characters on the sequence
        line after surrounding whitespace is removed.

    Raises:
        IndexError: If the file has fewer than three lines.
    """
    with open("./data/archiveII/"+seq_ps) as file:
        lines = file.readlines()

    # readlines() keeps the trailing newline; strip it so it is neither
    # treated as a base nor counted in the reported length. Upper-casing
    # makes lowercase sequences pair too: the former test
    # `base == ("A" or "a")` only ever compared against "A".
    # NOTE(review): if the file format appends a terminator character to the
    # sequence line, it is still counted in the length — confirm against the
    # data files.
    rna = lines[2].strip().upper()
    n = len(rna)

    pairable = frozenset(("AU", "UA", "GU", "UG", "GC", "CG"))

    stems_potential = []
    for row in range(n):
        for col in range(row + 1, n):
            length = 0
            inner_row, inner_col = row, col
            # Walk inward while the two strands have not met and still pair.
            while inner_row < inner_col and rna[inner_row] + rna[inner_col] in pairable:
                length += 1
                inner_row += 1
                inner_col -= 1
                if length >= 3:
                    # NOTE(review): the fourth field is kept as originally
                    # computed; confirm whether it is meant to equal the
                    # number of bases enclosed by the innermost pair (that
                    # would be col - row - 2*length + 1).
                    stems_potential.append(
                        [row + 1, col + 1, length, col - row - 2 * length]
                    )

    return [stems_potential, n]

In [3]:
def overlaps_and_pseudoknots(sp):
    """Count pairs of stems that overlap and pairs that form a pseudoknot.

    Each stem is indexable as ``[i, j, length, ...]``. A stem occupies the
    positions ``i .. i + length - 1`` and ``j - length + 1 .. j``. For every
    unordered pair of stems:

    * if the two stems share at least one occupied position, the pair is
      counted as an overlap;
    * otherwise, if their outer positions interleave
      (``i_a < i_b < j_a < j_b`` or ``i_b < i_a < j_b < j_a``), the pair is
      counted as a pseudoknot.

    Args:
        sp: Sequence of stems as described above.

    Returns:
        Tuple ``(overlap, pseudoknot)`` with the two pair counts.
    """
    # Positions occupied by each stem, built once instead of once per pair.
    occupied = []
    for stem in sp:
        size = int(stem[2])
        leading = range(stem[0], stem[0] + size)
        trailing = range(stem[1] - size + 1, stem[1] + 1)
        occupied.append(set(leading) | set(trailing))

    overlap = 0
    pseudoknot = 0
    total = len(sp)

    for first in range(total):
        start_a, end_a = sp[first][0], sp[first][1]
        for second in range(first + 1, total):
            # Any shared position means the stems cannot coexist.
            if not occupied[first].isdisjoint(occupied[second]):
                overlap += 1
                continue

            start_b, end_b = sp[second][0], sp[second][1]
            crosses = (start_a < start_b < end_a < end_b) or (start_b < start_a < end_b < end_a)
            if crosses:
                pseudoknot += 1

    return (overlap, pseudoknot)

In [4]:
# Every file name in the archive directory that ends with ".seq".
seq = [name for name in os.listdir("./data/archiveII") if name.endswith('.seq')]

for i in seq:
    x = potential_stems(i)
    y = overlaps_and_pseudoknots(x[0])
    # One comma-separated row per sequence:
    # length, number of stems, number of overlaps, number of pseudoknots.
    row = ",".join(str(value) for value in (x[1], len(x[0]), y[0], y[1])) + "\n"
    # The file is opened in append mode for each row, so every finished
    # sequence is written out before the next one is processed.
    with open("preprocessing_all.txt", "a") as f:
        f.write(row)

KeyboardInterrupt: 

In [None]:
# Every file name in the archive directory that ends with ".seq".
seq = [name for name in os.listdir("./data/archiveII") if name.endswith('.seq')]

for i in seq:
    x = potential_stems(i)
    # One comma-separated row per sequence: length, number of stems.
    row = ",".join(str(value) for value in (x[1], len(x[0]))) + "\n"
    # Appended row by row, so each finished sequence is written out before
    # the next one is processed.
    with open("preprocessing_stems.txt", "a") as f:
        f.write(row)

In [None]:
def quadratic(x, a, b, c):
    """Evaluate the second-degree polynomial ``a*x**2 + b*x + c``.

    Used as a model function for ``curve_fit``; ``x`` may be a scalar or
    anything supporting ``**``, ``*`` and ``+`` (e.g. a NumPy array).
    """
    second_order = a*x**2
    first_order = b*x
    return second_order + first_order + c

def cubic(x, a, b, c, d):
    """Evaluate the third-degree polynomial ``a*x**3 + b*x**2 + c*x + d``.

    Used as a model function for ``curve_fit``; ``x`` may be a scalar or
    anything supporting ``**``, ``*`` and ``+`` (e.g. a NumPy array).
    """
    third_order = a*x**3
    second_order = b*x**2
    first_order = c*x
    return third_order + second_order + first_order + d

def exponent(x, a, b):
    """Evaluate the power law ``a * x**b``.

    Used as a model function for ``curve_fit``; ``a`` is the prefactor and
    ``b`` the power applied to ``x``.
    """
    powered = x**b
    return a*powered

def worst(x):
    """Evaluate ``(x**3 - 9*x**2 + 23*x - 15) / 24`` as ``(1/24) * (...)``.

    The cubic factors as ``(x - 1)(x - 3)(x - 5)``, so the result is zero at
    x = 1, 3 and 5. The plotting code draws this curve with the label
    'Worst Case'.
    """
    numerator = x**3 - 9*x**2 + 23*x - 15
    return (1/24)*numerator

In [None]:
# Load the stem counts as a headerless table; the fitting and plotting code
# below uses column 0 as x data (RNA length) and column 1 as y data (number
# of potential stems).
# NOTE(review): the loop earlier in this file appends its rows to
# "preprocessing_stems.txt", while this reads "preprocessing_stems.csv" —
# confirm the file is renamed/copied between the two steps, or align the names.
pp_stems = pd.read_csv('preprocessing_stems.csv', header=None)

In [None]:
# Column 0 supplies the x data and column 1 the y data for all three fits.
stem_lengths = np.array(pp_stems[0])
stem_counts = np.array(pp_stems[1])

# Fit each model with all-zero initial guesses and unbounded parameters.
# NOTE(review): a zero initial guess for the power law makes the initial
# derivative with respect to b vanish; confirm the fit converges as expected.
pars_q, cov_q = curve_fit(f=quadratic, xdata=stem_lengths, ydata=stem_counts, p0=[0, 0, 0], bounds=(-np.inf, np.inf))
pars_c, cov_c = curve_fit(f=cubic, xdata=stem_lengths, ydata=stem_counts, p0=[0, 0, 0, 0], bounds=(-np.inf, np.inf))
pars_e, cov_e = curve_fit(f=exponent, xdata=stem_lengths, ydata=stem_counts, p0=[0, 0], bounds=(-np.inf, np.inf))

# Report the fitted parameters, one model per line.
for label, pars in (("quadratic:", pars_q), ("cubic:    ", pars_c), ("exponent: ", pars_e)):
    print(label, pars)

In [None]:
# Scatter of stem count against RNA length, overlaid with the fitted power
# law and the worst-case curve, saved to 'stems_vs_length.png'.
plt.figure(figsize=(7.5, 7.5))

# Sample points at which the two curves are evaluated.
x = np.linspace(0, 3000, 100)

# Measured data: column 0 on the x axis, column 1 on the y axis.
observed_lengths = np.array(pp_stems[0])
observed_counts = np.array(pp_stems[1])
plt.plot(observed_lengths, observed_counts, 'ro')

# The quadratic and cubic fits (pars_q, pars_c) are not drawn; only the
# power-law fit and the worst-case curve are.
curve_style = dict(linewidth=2, color='black')
plt.plot(x, exponent(x, *pars_e), linestyle=':', label=r"""$y = ax^b$""", **curve_style)
plt.plot(x, worst(x), linestyle="-", label='Worst Case', **curve_style)

plt.xlabel('RNA Length')
plt.ylabel('Number of Potential Stems')
# Fixed view: [xmin, xmax, ymin, ymax].
plt.axis([-100, 3100, -10000, 390000])
# Scientific notation on the y axis regardless of magnitude.
plt.ticklabel_format(axis="y", style="sci", scilimits=(0,0))
plt.legend()

plt.savefig('stems_vs_length.png')