This notebook implements our RNA folding QUBO.

The QUBO takes the following form, where the binary variables $q_i$ and $q_j$ indicate whether stems $i$ and $j$ are included in the structure:

$$H = -\sum_i \{\alpha(k_i-\mu)^2-\beta(k_i-l_i)\}q_i + \sum_{i>j} \{ P_1\ln(e^2N_{SS})+P_2\ln \Big(\sum_k\lambda_{IL_n}^k\Big) + \delta_{ij}^{\prime} \}q_iq_j $$

Where $k_i$ is the nearest-neighbor stem energy of stem $i$, $\mu$ is the largest nearest-neighbor stem energy among the potential stems, $l_i$ is the loop energy of stem $i$, $N_{SS}$ is the number of single-stranded nucleotides involved in the pseudoknot between stems $i$ and $j$, $\lambda^k_{IL_n}$ is the in-line helix penalty for a stem of length $n$ involved in pseudoknotting, $\delta_{ij}^{\prime}$ is $\infty$ (implemented as a penalty of $10^6$) if stems $i$ and $j$ overlap, i.e. share nucleotides, and $0$ otherwise, and $\alpha$, $\beta$, $P_1$ and $P_2$ are tunable parameters.

In [None]:
# import packages:

import numpy as np
import pandas as pd
import math
import os
import glob

In [None]:
# function to return the stem energy based on nearest-neighbor interactions:

def stem_energy(sp):
    """Return the nearest-neighbor stacking energy of a stem.

    `sp` is the ordered list of base-pair labels of the stem ("AU", "CG",
    "GC", "UA", "GU" or "UG").  Every pair of consecutive labels contributes
    the tabulated stacking value; combinations that are not in the table
    contribute nothing.  Stems with fewer than two pairs have energy 0.
    """
    # stacking[current][previous] -> contribution of `current` stacked on `previous`
    stacking = {
        "AU": {"AU": 0.9, "CG": 2.2, "GC": 2.1, "UA": 1.1, "GU": 0.6, "UG": 1.4},
        "CG": {"AU": 2.1, "CG": 3.3, "GC": 2.4, "UA": 2.1, "GU": 1.4, "UG": 2.1},
        "GC": {"AU": 2.4, "CG": 3.4, "GC": 3.3, "UA": 2.2, "GU": 1.5, "UG": 2.5},
        "UA": {"AU": 1.3, "CG": 2.4, "GC": 2.1, "UA": 0.9, "GU": 1.0, "UG": 1.3},
        "GU": {"AU": 1.3, "CG": 2.5, "GC": 2.1, "UA": 1.4, "GU": 0.5, "UG": -1.3},
        "UG": {"AU": 1.0, "CG": 1.5, "GC": 1.4, "UA": 0.6, "GU": -0.3, "UG": 0.5},
    }

    total = 0
    # walk over (previous, current) neighbours in stem order
    for previous, current in zip(sp, sp[1:]):
        contribution = stacking.get(current, {}).get(previous)
        if contribution is not None:
            total += contribution
    return total

In [None]:
# function to read in .ct file and give a list of known structure stems:

def actual_stems(seq_ss, seq_ps):
    """Extract the stems of the known structure from a CT file.

    Parameters
    ----------
    seq_ss : name of the secondary-structure (.ct) file inside `subdirectory`.
    seq_ps : name of the primary-structure (.fasta) file inside `subdirectory`;
             the sequence is taken from its second line.

    Each CT line is split on whitespace; field 0 is used as the nucleotide
    index and field 4 as the index of its pairing partner (0 = unpaired).
    The k-th line of the CT file is assumed to describe the k-th character of
    the sequence.

    Returns
    -------
    A list of stems `[start, end, length, loop, energy]`, where `start`/`end`
    are the outermost paired positions, `loop` is `end - start - 2*length` and
    `energy` comes from `stem_energy`.  Only the 5' reading of each stem
    (`end > start`) is kept, so every stem is reported once.

    Relies on the module-level `subdirectory` and on `stem_energy`.
    """
    with open(subdirectory+"/"+seq_ss) as file:
        ss_lines = file.readlines()

    with open(subdirectory+"/"+seq_ps) as file:
        ps_lines = file.readlines()

    # Upper-case once so lower-case nucleotides are labelled as well.  The
    # previous test `base == ('G' or 'g')` reduces to `base == 'G'`, because
    # `or` returns its first truthy operand, so lower-case bases were dropped.
    rna = ps_lines[1].upper()
    canonical = {"GC", "CG", "GU", "UG", "AU", "UA"}

    stems_actual = []

    def finish(stem, length, pairs):
        # complete the record of a finished stem and keep its 5' reading only
        stem.append(length)
        stem.append(int(stem[1]-stem[0]-2*length))
        stem.append(stem_energy(pairs))
        if stem[1] > stem[0]:
            stems_actual.append(stem)

    stem = None         # [start, partner of start] of the stem in progress
    pairs = []          # base-pair labels of the stem in progress
    length = 0          # number of paired lines in the stem in progress
    prev_partner = 0    # partner index found on the previous line

    for i, raw in enumerate(ss_lines):
        line = raw.strip().split()
        partner = int(line[4])

        if partner == 0:
            # unpaired nucleotide: closes the stem in progress, if any
            if stem is not None:
                finish(stem, length, pairs)
                stem, pairs, length = None, [], 0
        else:
            # a paired nucleotide extends the current stem only when its
            # partner is directly adjacent to the previous partner
            if stem is not None and prev_partner - partner != 1:
                finish(stem, length, pairs)
                stem, pairs, length = None, [], 0
            if stem is None:
                stem = [int(line[0]), partner]
            pair = rna[i] + rna[partner-1]
            if pair in canonical:
                pairs.append(pair)
            # the stem length counts every paired line, canonical or not
            length += 1

        prev_partner = partner

    return stems_actual

In [None]:
# function to read in .fasta file and generate list of potential stems at least 3 base-pairs long:

def potential_stems(seq_ps):
    """Enumerate the potential stems of the sequence stored in a FASTA file.

    Parameters
    ----------
    seq_ps : name of the .fasta file inside `subdirectory`; the sequence is
             taken from its second line.

    For every pair of positions that can form a canonical or wobble pair
    (AU, UA, GC, CG, GU, UG) the helix is extended inwards as long as the next
    bases can pair too.  Each prefix of that helix with `length >= 2` and
    `end - start - 2*length >= 3` is recorded as a separate stem.

    NOTE(review): the cell comment says "at least 3 base-pairs long" but the
    threshold in the code is 2 - confirm which one is intended.

    Returns
    -------
    `[stems_potential, mu, rna, len(rna)]` where each stem is
    `[start, end, length, loop-size, stem energy]` (1-based positions), `mu`
    is the largest `stem_energy` over all fully extended helices and `rna` is
    the sequence as read from the file.

    Relies on the module-level `subdirectory` and on `stem_energy`.
    """
    with open(subdirectory+"/"+seq_ps) as file:
        lines = file.readlines()

    # Drop the line terminator: it is not a nucleotide and would otherwise be
    # counted in the returned length and written out as an extra CT row.
    rna = lines[1].strip()

    # Pairing is decided on an upper-cased copy; the previous test
    # `base == ("A" or "a")` only ever matched upper-case letters.
    bases = rna.upper()
    canonical = {"GC", "CG", "GU", "UG", "AU", "UA"}
    n = len(bases)

    stems_potential = []
    mu = 0

    for row in range(n):
        for col in range(row+1, n):
            if bases[row]+bases[col] not in canonical:
                continue

            sp = []                    # stem pairs
            length = 0
            temp_row = row
            temp_col = col
            stem = [row+1,col+1,0,0,0] # [start, end, length, loop-size, stem energy]

            # extend the helix inwards until the strands meet or pairing stops
            while temp_row < temp_col and bases[temp_row]+bases[temp_col] in canonical:
                sp.append(bases[temp_row]+bases[temp_col])
                length += 1
                temp_row += 1
                temp_col -= 1
                if length >= 2 and col-row-2*length >= 3:
                    stem[2] = int(length)
                    stem[3] = int(col-row-2*length)
                    stem[4] = stem_energy(sp)
                    stems_potential.append(stem.copy())

            # mu tracks the fully extended helix, even if it was not recorded
            helix_energy = stem_energy(sp)
            if helix_energy > mu:
                mu = helix_energy

    return [stems_potential, mu, rna, len(rna)]

In [None]:
# function to generate energy per in-line pseudoknotted helix of length n:

def pseudoknot_sub_penalty(length):
    """Return exp(0.572992*length + 0.219677), the per-helix pseudoknot penalty for a stem of `length` pairs."""
    exponent = 0.572992*length+0.219677
    return np.exp(exponent)

In [None]:
# function to generate list of potential stem pairs that form pseudoknots:

def potential_pseudoknots(stems_potential):
    """Score every unordered pair of stems for pseudoknotting.

    Returns one entry `[i, j, single_strand_term, helix_term]` per pair
    `i < j`, in the order produced by the nested loops.  Both terms are 0
    unless the two stems cross (one opens before the other and closes inside
    it).  For crossing stems:

    * `single_strand_term` is `ln(6.5**2 * nss)` when the single-stranded
      count `nss` is positive, else 0;
    * `helix_term` is the log of the length-weighted squared
      `pseudoknot_sub_penalty` of both stems.
    """
    pseudoknots_potential = []
    total = len(stems_potential)

    for i, stem1 in enumerate(stems_potential):
        for j in range(i + 1, total):
            stem2 = stems_potential[j]
            entry = [i, j, 0, 0]

            # decide which stem opens first; `first` is None when they do not cross
            if stem1[0] < stem2[0] < stem1[1] < stem2[1]:
                first, second = stem1, stem2
            elif stem2[0] < stem1[0] < stem2[1] < stem1[1]:
                first, second = stem2, stem1
            else:
                first = second = None

            if first is not None:
                # nucleotides in the three stretches between the interleaved strands
                nss = (second[0]-(first[0]+first[2])) + ((first[1]-first[2])-(second[0]+second[2]-1)) + ((second[1]-second[2])-first[1])
                if nss > 0:
                    entry[2] = np.log((6.5**2)*nss)
                entry[3] = np.log(stem1[2]*pseudoknot_sub_penalty(stem1[2])**2+stem2[2]*pseudoknot_sub_penalty(stem2[2])**2)

            pseudoknots_potential.append(entry)

    return pseudoknots_potential

In [None]:
# function to generate list of stem pairs that overlap:

def potential_overlaps(stems_potential):
    """Flag every unordered pair of stems that share at least one nucleotide.

    Returns one entry `[i, j, penalty]` per pair `i < j`; `penalty` is 1e6
    when any position of either strand of stem `i` is also used by either
    strand of stem `j`, and 0 otherwise.
    """
    overlap_penalty = 1e6

    def footprint(stem):
        # all positions occupied by the 5' strand and the 3' strand of a stem
        length = int(stem[2])
        five_prime = range(stem[0], stem[0]+length)
        three_prime = range(stem[1]-length+1, stem[1]+1)
        return set(five_prime) | set(three_prime)

    footprints = [footprint(stem) for stem in stems_potential]

    overlaps_potential = []
    for i, first in enumerate(footprints):
        for j in range(i+1, len(footprints)):
            clash = not first.isdisjoint(footprints[j])
            overlaps_potential.append([i, j, overlap_penalty if clash else 0])

    return overlaps_potential

In [None]:
def loop_penalty(ll):
    """Return the penalty for a loop of size `ll`.

    Loops of size 0-2 get a prohibitive penalty of 1000, sizes 3-6 have
    individual values, sizes of 7 and above share 4.1; any other value
    (e.g. a negative size) yields 0.
    """
    if ll >= 7:
        return 4.1
    by_size = {0: 1000, 1: 1000, 2: 1000, 3: 7.4, 4: 5.9, 5: 4.4, 6: 4.3}
    return by_size.get(ll, 0)

In [None]:
# function to generate the Hamiltonian of a given RNA structure from potential stems, overlaps, and pseudoknots:

def model(stems_potential, overlaps_potential, pseudoknots_potential, mu, alpha, beta, p1, p2):
    """Build the linear and quadratic QUBO coefficients.

    Returns `(L, Q)`:

    * `L[str(i)]` = alpha*(k_i - mu)**2 - beta*(k_i - loop_penalty(loop_i)),
      with `k_i` the stem energy (field 4) and `loop_i` the loop size
      (field 3) of stem `i`;
    * `Q[(str(i), str(j))]` for `i < j` = p1 * pseudoknot term 2
      + p2 * pseudoknot term 3 + overlap penalty.

    `overlaps_potential` and `pseudoknots_potential` are read positionally,
    so they must list the stem pairs in the same nested-loop order
    (`i` ascending, then `j > i` ascending).
    """
    linear = {
        str(index): alpha*((stem[4]-mu)**2)-beta*(stem[4]-loop_penalty(stem[3]))
        for index, stem in enumerate(stems_potential)
    }

    quadratic = {}
    pair = 0    # running position in the pairwise lists
    count = len(stems_potential)
    for i in range(count):
        for j in range(i+1, count):
            knot = pseudoknots_potential[pair]
            quadratic[(str(i), str(j))] = p1*knot[2] + p2*knot[3] + overlaps_potential[pair][2]
            pair += 1

    return linear, quadratic

In [None]:
# function to evaluate the energy of the known structure under the model Hamiltonian:

def energy(stems_actual, mu, alpha, beta, p1, p2):
    """Evaluate the model Hamiltonian on the known structure.

    Sums, for every actual stem, alpha*(k_i - mu)**2 - beta*(k_i - loop
    penalty), plus, for every pair of actual stems, the p1/p2-weighted
    pseudoknot terms returned by `potential_pseudoknots`.  No overlap term is
    added.
    """
    pseudoknots_actual = potential_pseudoknots(stems_actual)
    count = len(stems_actual)
    pair = 0    # running position in pseudoknots_actual
    cost = 0

    for i, stem in enumerate(stems_actual):
        k_i, loop_i = stem[4], stem[3]
        cost += alpha*((k_i-mu)**2)-beta*(k_i-loop_penalty(loop_i))
        for _ in range(i+1, count):
            knot = pseudoknots_actual[pair]
            cost += p1*knot[2] + p2*knot[3]
            pair += 1

    return cost

In [None]:
# function to compare actual and predicted structure based on comparison of base-pairs:

def evaluation_1(stems_actual, stems_potential):
    """Compare two structures base pair by base pair.

    Both arguments are lists of stems whose fields 0, 1 and 2 are start, end
    and length; `stems_potential` holds the predicted stems.  Returns
    `[ppv, sensitivity]` where ppv = C/(C+M) and sensitivity = C/(C+I), with
    C the predicted pairs found in the known structure, M the predicted pairs
    absent from it and I the known pairs that were not predicted.  A ratio
    with a zero denominator is reported as 0.
    """
    def expand(stems):
        # list every (5' base, 3' base) pair of every stem
        return [(stem[0]+offset, stem[1]-offset)
                for stem in stems
                for offset in range(0, stem[2])]

    bp_actual = expand(stems_actual)
    bp_predicted = expand(stems_potential)
    known = set(bp_actual)
    guessed = set(bp_predicted)

    C = sum(1 for bp in bp_predicted if bp in known)      # correctly predicted pairs
    M = len(bp_predicted) - C                             # predicted but not in the known structure
    I = sum(1 for bp in bp_actual if bp not in guessed)   # known but not predicted

    ppv = C/(C+M) if C+M != 0 else 0
    sensitivity = C/(C+I) if C+I != 0 else 0

    return [ppv, sensitivity]

In [None]:
# function to compare actual and predicted structure based on comparison of bases involved in pairing:

def evaluation_2(stems_actual, stems_predicted):
    """Compare two structures by the bases that take part in any pair.

    Both arguments are lists of stems whose fields 0, 1 and 2 are start, end
    and length.  Returns `[ppv, sensitivity]` where ppv = C/(C+M) and
    sensitivity = C/(C+I), with C the predicted paired bases that are paired
    in the known structure, M the predicted paired bases that are not, and I
    the known paired bases that were not predicted as paired.  A ratio with a
    zero denominator is reported as 0.
    """
    def paired_bases(stems):
        # list both bases of every pair of every stem
        bases = []
        for stem in stems:
            for offset in range(0, stem[2]):
                bases.extend((stem[0]+offset, stem[1]-offset))
        return bases

    pb_actual = paired_bases(stems_actual)
    pb_predicted = paired_bases(stems_predicted)
    known = set(pb_actual)
    guessed = set(pb_predicted)

    C = sum(1 for base in pb_predicted if base in known)      # correctly predicted paired bases
    M = len(pb_predicted) - C                                 # predicted but unpaired in the known structure
    I = sum(1 for base in pb_actual if base not in guessed)   # paired in the known structure, not predicted

    ppv = C/(C+M) if C+M != 0 else 0
    sensitivity = C/(C+I) if C+I != 0 else 0

    return [ppv, sensitivity]

In [None]:
def connectivity_table(bpRNA_id, sequence, stems, t):
    """Write a structure as a connectivity-table file.

    The file is `./results/cts/<bpRNA_id>_<t>_model3.ct`.  Its first line is
    the sequence length followed by the ID; each following line holds, for
    nucleotide `n` (1-based): `n`, the base, `n-1`, `n+1`, the position of its
    pairing partner (0 if unpaired) and `n` again.

    Parameters
    ----------
    bpRNA_id : identifier used in the header line and in the file name.
    sequence : the nucleotide sequence, one character per position.
    stems    : stems whose fields 0, 1 and 2 are start, end and length.
    t        : label inserted in the file name (the notebook passes
               "predicted" or "actual").

    The target directory must already exist.
    """
    # Resolve every paired position once instead of rescanning all stems for
    # each nucleotide.  Assignments follow the original scan order (stems in
    # order, 5' strand before 3' strand) so that, should two stems claim the
    # same position, the last one still wins.
    partner = {}
    for stem in stems:
        start, end, length = stem[0], stem[1], stem[2]
        for k in range(start, start+length):
            partner[k] = end+start-k
        for k in range(end-length+1, end+1):
            partner[k] = start+end-k

    # A single handle, closed on exit, replaces the handle that used to be
    # opened (and never closed) for every printed line.
    with open("./results/cts/"+bpRNA_id+"_"+t+"_model3.ct", "w") as out:
        print(len(sequence), bpRNA_id, file=out)
        for i in range(0, len(sequence)):
            print(i+1, sequence[i], i, i+2, partner.get(i+1, 0), i+1, file=out)

In [None]:
# connecting with D-Wave:

from dwave.cloud import Client

# SECURITY: no API token is written in the notebook.  The D-Wave client and
# samplers resolve credentials themselves, from the D-Wave configuration file
# (created with `dwave config create`) or from the DWAVE_API_TOKEN
# environment variable.  A token that has been committed with a notebook
# should be treated as leaked and revoked in the Leap dashboard.
client = Client.from_config()
client.get_solvers()

from dwave.system.samplers import DWaveSampler
from dwave.system.samplers import LeapHybridSampler
from dwave.system.composites import EmbeddingComposite

import dimod

# sampler_q: QPU restricted to Pegasus-topology solvers, with automatic embedding
sampler_q = EmbeddingComposite(DWaveSampler(solver={'topology__type': 'pegasus'}))
# sampler_h: Leap hybrid solver
sampler_h = LeapHybridSampler()

The following cell runs all structures of a given folder (`subdirectory`), where:

- `alpha` is to be tuned
- `beta` is to be tuned
- `p1` and `p2` are the pseudoknot penalties, and are to be tuned

1. First, the actual stems are found, and then their energies. 
2. Second, the potential stems are found, as well as their potential overlaps, nestings, and pseudoknots. 
3. Third, the QUBO model is built. 
4. Fourth, the model is run on D-Wave.
5. Fifth, connectivity table files are built for the actual and predicted structures.
6. Sixth, the predicted structure is evaluated against the actual structure using BP and PB Sensitivity and PPV. 

We will need to take this cell and modify it for both the training and testing steps to our protocol.

In [None]:
# Runs every structure found in `subdirectory` through the full pipeline:
# preprocessing -> QUBO -> D-Wave -> CT files -> evaluation.
#
# All result lists below stay index-aligned: a structure is appended to them
# only after every stage succeeded, so entry n of each list always refers to
# bprna_id[n].  Structures that fail at any stage are reported and skipped.

alpha = 1
beta = 1
p1 = 1
p2 = 1

subdirectory = "./data/woutPKs/s"

# os.listdir returns names in arbitrary order, so the two listings cannot be
# paired by position; they are sorted for reproducible runs and paired by ID.
ct = sorted(f for f in os.listdir(subdirectory) if f.endswith('.ct.txt'))
fasta = sorted(f for f in os.listdir(subdirectory) if f.endswith('.fasta.txt'))

# assumes a CT file and its FASTA file share the text before the first '.'
# (the same ID derivation used for bprna_id) - TODO confirm against the data
fasta_by_id = {f.split('.')[0]: f for f in fasta}

bprna_id       = [] # IDs

a_stems        = [] # actual structure stems
a_energies     = [] # actual structure energies

p_stems        = [] # potential structure stems
p_overlaps     = [] # potential structure overlaps
p_pseudoknots  = [] # potential structure pseudoknots

models         = [] # models

problem = []        # initiate list of problems
prediction = []     # initiate list of predictions
evaluation = []     # initiate list of evaluations
min_time = []       # initiate list of time to solution

for ct_file in ct:

    structure_id = ct_file.split('.')[0]
    fasta_file = fasta_by_id.get(structure_id)
    if fasta_file is None:
        print("no FASTA file found for:", structure_id, "- skipping...")
        continue

    try:                                                            # try/except here b/c some wrong CT files
        print("building model for:", structure_id)

        potential = potential_stems(fasta_file)                     # find potential stems of structure
        overlaps = potential_overlaps(potential[0])                 # find potential overlaps
        pseudoknots = potential_pseudoknots(potential[0])           # find potential pseudoknots

        actual = actual_stems(ct_file, fasta_file)                  # find actual stems of structure
        actual_energy = energy(actual, potential[1], alpha, beta, p1, p2)   # compute energy of actual structure

        qubo = model(potential[0], overlaps, pseudoknots, potential[1], alpha, beta, p1, p2)

    except Exception as err:
        print("error in preprocessing, skipping...", err)
        continue

    try:
        print("running model for:", structure_id)
        bqm = dimod.BinaryQuadraticModel(qubo[0], qubo[1], vartype = 'BINARY', offset = 0.0)

        #sampleset = sampler_q.sample(bqm, num_reads=1000)
        #time_limit = "placeholder"
        sampleset = sampler_h.sample(bqm)              # hybrid
        time_limit = sampler_h.min_time_limit(bqm)     # hybrid

        # Take the lowest-energy sample.  Looping over sampleset.data() and
        # keeping the last record would keep the highest-energy one as soon
        # as the sampler returns more than one sample.
        best = sampleset.first
        results = best.sample
        predicted_energy = best.energy

        # a stem is part of the prediction when its variable is set to 1
        stems_found = [stem for j, stem in enumerate(potential[0]) if results[str(j)] == 1]

        connectivity_table(structure_id, potential[2], stems_found, "predicted") # write predicted CT file
        connectivity_table(structure_id, potential[2], actual, "actual")         # write actual CT file

    except Exception as err:
        print("no embedding found, skipping...", err)
        continue

    try:
        print("evaluating model for:", structure_id)
        metrics_1 = [evaluation_1(actual, stems_found)]                  # compute BP metrics
        metrics_2 = [evaluation_2(actual, stems_found)]                  # compute PB metrics

    except Exception as err:
        print("no structure found, skipping...", err)
        continue

    # every stage succeeded: record the structure in all result lists at once
    bprna_id.append(structure_id)
    p_stems.append(potential)
    p_overlaps.append(overlaps)
    p_pseudoknots.append(pseudoknots)
    a_stems.append(actual)
    a_energies.append(actual_energy)
    models.append(qubo)
    problem.append(bqm)
    min_time.append(time_limit)
    prediction.append([stems_found, predicted_energy]) # record predicted stems and structure energy
    evaluation.append((metrics_1, metrics_2))