# RNA Structure Generation

This notebook samples random RNA secondary structures, written in dot-bracket notation, from a weighted context-free grammar (CFG).
We generate several datasets that differ in structure length, stack weight, and unpaired weight.

In [11]:
import random
import sys

# The counting and generating functions below are recursive, and a call for
# length n recurses on length n-1, so the call depth grows linearly with the
# requested structure length. Raise the interpreter's recursion limit
# (typically 1000 by default) to handle long structures and deep nestings.
sys.setrecursionlimit(10000)

def countS(n,cache,weight_unpaired,weight_stack):
    """Return the total weight of all length-``n`` structures derivable from S.

    The recurrence follows the productions sampled by ``generateS``:

    * ``S -> ""``           only for ``n == 0``, weight 1
    * ``S -> "." S``        weight ``weight_unpaired``
    * ``S -> "()" S``       requires ``n > 1``
    * ``S -> "(" T ")" S``  with a non-empty T, requires ``n > 2``

    ``weight_stack`` is not applied here; it is only forwarded to ``countT``.

    Results are memoised in ``cache`` under the key ``("S", n)``. The key does
    not contain the weights, so one cache dict must not be shared between
    different weight settings.
    """
    key = ("S", n)
    if key in cache:
        return cache[key]

    params = (cache, weight_unpaired, weight_stack)
    if n == 0:
        total = 1
    else:
        # Leading unpaired position.
        total = weight_unpaired * countS(n - 1, *params)
        # Leading empty pair "()".
        if n > 1:
            total += countS(n - 2, *params)
        # Leading pair enclosing `inner` positions; the range is empty for n <= 2.
        for inner in range(1, n - 1):
            total += countT(inner, *params) * countS(n - 2 - inner, *params)

    cache[key] = total
    return total
    
def generateS(n,cache,weight_unpaired,weight_stack):
    """Sample one dot-bracket structure of length ``n`` from non-terminal S.

    A production of S is chosen with probability proportional to the weight
    that ``countS`` attributes to it, then the remaining parts are sampled
    recursively.

    Args:
        n: Number of characters in the structure to generate.
        cache: Memoisation dict shared with ``countS``/``countT``.
        weight_unpaired: Weight of the ``S -> "." S`` production.
        weight_stack: Not used by S itself; forwarded to the T functions.

    Returns:
        The sampled structure as a string (``""`` for ``n == 0``), or ``None``
        when no production of length ``n`` has positive weight.

    Raises:
        ValueError: If ``n`` is negative.
    """
    if n == 0:
        return ""
    if n < 0:
        raise ValueError(f"structure length must be non-negative, got {n}")

    params = (cache, weight_unpaired, weight_stack)

    def alternatives():
        # Yielded lazily, in the same order in which countS adds them up.
        yield weight_unpaired * countS(n - 1, *params), "unpaired", 0
        if n > 1:
            yield countS(n - 2, *params), "empty_pair", 0
        for inner in range(1, n - 1):
            yield (countT(inner, *params) * countS(n - 2 - inner, *params),
                   "pair", inner)

    r = random.random() * countS(n, *params)
    choice = None
    for weight, kind, inner in alternatives():
        r -= weight
        if r < 0:
            choice = (kind, inner)
            break
        if weight > 0:
            # Floating-point rounding in the running subtraction can leave r
            # at or above zero after the last alternative. Remember the latest
            # production that is actually possible and fall back to it instead
            # of returning None from the middle of a structure.
            choice = (kind, inner)

    if choice is None:
        # Every alternative has non-positive weight: nothing to sample.
        return None

    kind, inner = choice
    if kind == "unpaired":
        return "." + generateS(n - 1, *params)
    if kind == "empty_pair":
        return "()" + generateS(n - 2, *params)
    return ("(" + generateT(inner, *params) + ")"
            + generateS(n - 2 - inner, *params))

def countT(n,cache,weight_unpaired,weight_stack):
    """Return the total weight of all length-``n`` structures derivable from T.

    T is the non-terminal used for the content enclosed by a base pair. The
    recurrence follows the productions sampled by ``generateT``:

    * nothing is derivable for ``n == 0`` (weight 0)
    * ``T -> "." S``        weight ``weight_unpaired``
    * ``T -> "()"``         only for ``n == 2``, weight ``weight_stack``
    * ``T -> "()" T``       requires ``n > 2``
    * ``T -> "(" T ")"``    requires ``n > 2``, weight ``weight_stack``
    * ``T -> "(" T ")" T``  both T parts non-empty, requires ``n > 3``

    Results are memoised in ``cache`` under the key ``("T", n)``. The key does
    not contain the weights, so one cache dict must not be shared between
    different weight settings.
    """
    key = ("T", n)
    if key in cache:
        return cache[key]

    params = (cache, weight_unpaired, weight_stack)
    if n == 0:
        total = 0
    else:
        # Leading unpaired position followed by an unconstrained S.
        total = weight_unpaired * countS(n - 1, *params)
        if n == 2:
            # The content is exactly one empty pair.
            total += weight_stack
        elif n > 2:
            # Empty pair followed by more content ...
            total += countT(n - 2, *params)
            # ... or a single pair spanning the whole content.
            total += weight_stack * countT(n - 2, *params)
        # Pair enclosing `left` positions followed by a non-empty remainder;
        # the range is empty for n <= 3.
        for left in range(1, n - 2):
            total += countT(left, *params) * countT(n - 2 - left, *params)

    cache[key] = total
    return total
    
def generateT(n,cache,weight_unpaired,weight_stack):
    """Sample one dot-bracket structure of length ``n`` from non-terminal T.

    T describes the content enclosed by a base pair. A production is chosen
    with probability proportional to the weight that ``countT`` attributes to
    it, then the remaining parts are sampled recursively.

    Args:
        n: Number of characters in the structure to generate.
        cache: Memoisation dict shared with ``countS``/``countT``.
        weight_unpaired: Weight of the ``T -> "." S`` production.
        weight_stack: Weight of the productions in which the content is
            exactly one pair (``"()"`` for ``n == 2`` and ``"(" T ")"``).

    Returns:
        The sampled structure as a string, or ``None`` when no production of
        length ``n`` has positive weight. For ``n == 0`` the placeholder
        ``"#"`` is returned; ``countT`` gives length 0 a weight of 0, so the
        productions in this file never request an empty T.

    Raises:
        ValueError: If ``n`` is negative.
    """
    if n == 0:
        return "#"
    if n < 0:
        raise ValueError(f"structure length must be non-negative, got {n}")

    params = (cache, weight_unpaired, weight_stack)

    def alternatives():
        # Yielded lazily, in the same order in which countT adds them up.
        yield weight_unpaired * countS(n - 1, *params), "unpaired", 0
        if n == 2:
            yield weight_stack, "empty_pair", 0
        if n > 2:
            yield countT(n - 2, *params), "empty_pair_then_T", 0
            yield weight_stack * countT(n - 2, *params), "stacked", 0
        for inner in range(1, n - 2):
            yield (countT(inner, *params) * countT(n - 2 - inner, *params),
                   "pair_then_T", inner)

    r = random.random() * countT(n, *params)
    choice = None
    for weight, kind, inner in alternatives():
        r -= weight
        if r < 0:
            choice = (kind, inner)
            break
        if weight > 0:
            # Floating-point rounding in the running subtraction can leave r
            # at or above zero after the last alternative. Remember the latest
            # production that is actually possible and fall back to it instead
            # of returning None from the middle of a structure.
            choice = (kind, inner)

    if choice is None:
        # Every alternative has non-positive weight: nothing to sample.
        return None

    kind, inner = choice
    if kind == "unpaired":
        return "." + generateS(n - 1, *params)
    if kind == "empty_pair":
        return "()"
    if kind == "empty_pair_then_T":
        return "()" + generateT(n - 2, *params)
    if kind == "stacked":
        return "(" + generateT(n - 2, *params) + ")"
    return ("(" + generateT(inner, *params) + ")"
            + generateT(n - 2 - inner, *params))

def generate_and_save(filename, n_structures, length, w_unpaired, w_stack, seed=None):
    """Generate structures with ``generateS`` and write them to a text file.

    Args:
        filename: Path of the output file; it is overwritten if it exists.
        n_structures: Number of structures to sample.
        length: Length of every structure, passed to ``generateS``.
        w_unpaired: Unpaired weight passed to ``generateS``.
        w_stack: Stack weight passed to ``generateS``.
        seed: Optional seed for the global ``random`` generator. When given,
            the generator is re-seeded before sampling so that the same call
            reproduces the same file. ``None`` (the default) leaves the
            generator state untouched.

    The file contains one structure per line. Samples for which ``generateS``
    returns ``None`` are skipped, so the saved count printed at the end can be
    smaller than ``n_structures``.
    """
    if seed is not None:
        random.seed(seed)

    # A fresh cache per call: the memoised counts depend on the weights.
    cache = {}
    structures = []
    print(f"Generating {n_structures} structures (L={length}, Wu={w_unpaired}, Ws={w_stack})...")
    for _ in range(n_structures):
        s = generateS(length, cache, w_unpaired, w_stack)
        # Compare against None explicitly: the empty structure "" (length 0)
        # is a valid sample and must not be dropped.
        if s is not None:
            structures.append(s)

    with open(filename, "w") as f:
        f.writelines(s + "\n" for s in structures)
    print(f"Saved {len(structures)} structures to '{filename}'")

### 1. Standard Set
Length: 100, balanced weights ($w_u=1.0$, $w_s=1.0$).

In [12]:
# Baseline dataset: 20 structures of length 100 with both weights set to 1.0.
generate_and_save(
    filename="structures_standard_L100.txt",
    n_structures=20,
    length=100,
    w_unpaired=1.0,
    w_stack=1.0
)

Generating 20 structures (L=100, Wu=1.0, Ws=1.0)...
Saved 20 structures to 'structures_standard_L100.txt'


### 2. Short & Highly Structured
Length: 50. A high stack weight ($w_s=5.0$) combined with a low unpaired weight ($w_u=0.5$) favors stacked base pairs.

In [13]:
# 20 structures of length 50; stack weight 5.0 and unpaired weight 0.5.
generate_and_save(
    filename="structures_short_structured_L50.txt",
    n_structures=20,
    length=50,
    w_unpaired=0.5,
    w_stack=5.0
)

Generating 20 structures (L=50, Wu=0.5, Ws=5.0)...
Saved 20 structures to 'structures_short_structured_L50.txt'


### 3. Long & Unstructured
Length: 200. A high unpaired weight ($w_u=2.0$) and a low stack weight ($w_s=0.5$) favor loops and unpaired regions.

In [14]:
# 20 structures of length 200; unpaired weight 2.0 and stack weight 0.5.
generate_and_save(
    filename="structures_long_unstructured_L200.txt",
    n_structures=20,
    length=200,
    w_unpaired=2.0,
    w_stack=0.5
)

Generating 20 structures (L=200, Wu=2.0, Ws=0.5)...
Saved 20 structures to 'structures_long_unstructured_L200.txt'


### 4. Unpaired Bias
Length: 100. Very high unpaired weight ($w_u=5.0$).

In [15]:
# 20 structures of length 100; only the unpaired weight is raised (5.0 vs. 1.0).
generate_and_save(
    filename="structures_unpaired_bias_L100.txt",
    n_structures=20,
    length=100,
    w_unpaired=5.0,
    w_stack=1.0
)

Generating 20 structures (L=100, Wu=5.0, Ws=1.0)...
Saved 20 structures to 'structures_unpaired_bias_L100.txt'


### 5. Stack Bias
Length: 100. Very high stack weight ($w_s=5.0$).

In [16]:
# 20 structures of length 100; only the stack weight is raised (5.0 vs. 1.0).
generate_and_save(
    filename="structures_stack_bias_L100.txt",
    n_structures=20,
    length=100,
    w_unpaired=1.0,
    w_stack=5.0
)

Generating 20 structures (L=100, Wu=1.0, Ws=5.0)...
Saved 20 structures to 'structures_stack_bias_L100.txt'
