## **Hotspot Prediction Tool (Trinity)**
##### This hotspot prediction tool is specifically designed to analyze genomic sequences, generating all potential secondary structures within a specified range of 50 to 200 base pairs. It then identifies the structure with the lowest free energy, indicative of a potential hotspot-related RNA-interference (RNAi) precursor. The input sequences typically originate from experimental data, such as small RNA sequencing studies on viruses, aimed at identifying regions of the viral genome targeted by the host RNAi machinery.

0. Install and import all the required packages

In [1]:
pip install biopython

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install viennarna

Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install pillow

Note: you may need to restart the kernel to use updated packages.


1. Read the input FASTA sequence and process it

In [4]:
from Bio import SeqIO
import subprocess
import os
from PIL import Image

def calculate_gibbs_free_energy(sequence):
    """Fold *sequence* with RNAfold at 25 °C and return its energy and structure.

    RNAfold is invoked as ``RNAfold --noPS --noLP --temp=25`` with the sequence
    supplied on standard input. In its default (MFE) mode RNAfold echoes the
    sequence on one line and then prints the dot-bracket structure followed by
    the energy in parentheses, e.g. ``((((....)))) ( -1.20)``; that second
    line is what gets parsed here.

    Args:
        sequence: Nucleotide sequence to fold.

    Returns:
        A ``(energy, structure)`` tuple where ``energy`` is a float and
        ``structure`` is the dot-bracket string. ``(None, None)`` is returned
        when the sequence is empty, RNAfold cannot be started, or its output
        contains no parsable structure line.
    """
    sequence = sequence.strip() if sequence else ""
    if not sequence:
        # Nothing to fold; avoid spawning a process for an empty input.
        return None, None

    try:
        # Pass the sequence on stdin with an argv list instead of building a
        # shell pipeline, so odd characters in the input cannot reach a shell.
        result = subprocess.run(
            ["RNAfold", "--noPS", "--noLP", "--temp=25"],
            input=f"{sequence}\n",
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
        )
    except OSError:
        # RNAfold is not installed or not executable; keep the historical
        # "no result" return value rather than raising.
        return None, None

    for line in result.stdout.splitlines():
        structure, _, energy_field = line.strip().partition(" ")
        # A structure line consists solely of dot-bracket characters; the
        # echoed sequence line (letters) is skipped by this check.
        if not structure or set(structure) - set("()."):
            continue
        try:
            # The energy is printed as "(-1.20)" or "( -1.20)".
            energy = float(energy_field.strip().strip("()"))
        except ValueError:
            continue
        return energy, structure

    return None, None


def calculate_kmers(sequence, k_min, k_max):
    """Enumerate every k-mer of *sequence* for each k in ``k_min..k_max``.

    Args:
        sequence: The sequence to slice into overlapping windows.
        k_min: Smallest window length (inclusive).
        k_max: Largest window length (inclusive).

    Returns:
        A ``(total_kmers, kmer_dict)`` tuple. ``kmer_dict`` maps each window
        length to the list of windows of that length, in order of their start
        position; ``total_kmers`` is the combined number of windows.
    """
    seq_len = len(sequence)

    # One entry per window length; a length longer than the sequence yields
    # an empty list because the start-position range is empty.
    kmer_dict = {
        size: [sequence[start:start + size] for start in range(seq_len - size + 1)]
        for size in range(k_min, k_max + 1)
    }

    total_kmers = sum(len(windows) for windows in kmer_dict.values())
    return total_kmers, kmer_dict

def write_kmers_to_fasta(output_filename, kmer_dict):
    """Write every k-mer in *kmer_dict* to *output_filename* in FASTA format.

    Each k-mer becomes a two-line record whose header is ``>k-<k>_<n>``,
    where ``<k>`` is the window length and ``<n>`` is the 1-based position of
    the k-mer within that length's list.

    Args:
        output_filename: Path of the file to create or overwrite.
        kmer_dict: Mapping of window length to a list of k-mer strings.
    """
    def fasta_lines():
        # Yield the header line and the sequence line of each record in turn.
        for length, fragments in kmer_dict.items():
            for index, fragment in enumerate(fragments, start=1):
                yield f">k-{length}_{index}\n"
                yield f"{fragment}\n"

    with open(output_filename, 'w') as fasta_handle:
        fasta_handle.writelines(fasta_lines())

def _render_structure_png(kmer, structure, index):
    """Draw one folded k-mer with RNAplot and return the PNG path, or None."""
    record_id = f"kmer_{index}"
    output_png = f"{record_id}.png"
    # RNAplot has no PNG writer; it is asked for PostScript here. With a
    # FASTA header on the input it is expected to name its output
    # "<id>_ss.ps" (see the RNAplot manual).
    ps_path = f"{record_id}_ss.ps"

    try:
        # RNAplot needs BOTH the sequence and its structure on stdin.
        subprocess.run(
            ["RNAplot", "--output-format=ps"],
            input=f">{record_id}\n{kmer}\n{structure}\n",
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
        )
    except OSError:
        # RNAplot is not installed; plotting is best-effort.
        return None

    if not os.path.exists(ps_path):
        return None

    try:
        # Pillow rasterises PostScript/EPS through Ghostscript; if that is
        # unavailable an OSError is raised and the plot is skipped.
        with Image.open(ps_path) as drawing:
            drawing.save(output_png)
    except OSError:
        return None
    return output_png


def main(input_fasta, output_fasta, k_min, k_max):
    """Fold every k-mer of the first FASTA record and report the most stable.

    The first record of *input_fasta* is sliced into all k-mers with lengths
    ``k_min..k_max``; each is folded with ``calculate_gibbs_free_energy``. The
    five k-mers with the lowest energy are printed and, where the external
    tools allow, drawn to ``kmer_<rank>.png`` and displayed. Finally all
    k-mers are written to *output_fasta*.

    Args:
        input_fasta: Path to the input FASTA file (only the first record is used).
        output_fasta: Path of the FASTA file that receives every k-mer.
        k_min: Smallest k-mer length (inclusive).
        k_max: Largest k-mer length (inclusive).

    Raises:
        ValueError: If *input_fasta* contains no FASTA record.
    """
    record = next(SeqIO.parse(input_fasta, "fasta"), None)
    if record is None:
        raise ValueError(f"No FASTA records found in {input_fasta!r}")
    sequence = str(record.seq)
    _, kmer_dict = calculate_kmers(sequence, k_min, k_max)

    # Calculate Gibbs free energy and secondary structure for each k-mer.
    # K-mers that could not be folded (energy is None) are left out, because
    # None cannot be ordered against floats when sorting below.
    kmer_info = []
    for kmers in kmer_dict.values():
        for kmer in kmers:
            energy, structure = calculate_gibbs_free_energy(kmer)
            if energy is not None:
                kmer_info.append((kmer, energy, structure))

    # Sort k-mers based on Gibbs free energy in ascending order
    sorted_kmers_info = sorted(kmer_info, key=lambda item: item[1])

    # Print the first 5 k-mers with the lowest Gibbs free energy and their secondary structure
    print("Top 5 k-mers with lowest Gibbs free energy:")
    if not sorted_kmers_info:
        print("No k-mer could be folded; check that RNAfold is installed and on PATH.")
    for i, (kmer, energy, structure) in enumerate(sorted_kmers_info[:5], start=1):
        print(f"{i}. Sequence: {kmer}, Gibbs Free Energy: {energy}, Secondary Structure: {structure}")

        # Generate and display the PNG for the secondary structure (best-effort)
        output_png = _render_structure_png(kmer, structure, i) if structure else None
        if output_png is not None:
            with Image.open(output_png) as img:
                img.show()

    # Write all k-mers to the output FASTA file
    write_kmers_to_fasta(output_fasta, kmer_dict)

if __name__ == "__main__":
    # Edit these two paths to point at your own input FASTA file and the
    # FASTA file that should receive the generated k-mers.
    input_fasta, output_fasta = "Hotspot_1.fasta", "output.fasta"

    # Inclusive range of k-mer lengths to fold.
    # NOTE(review): the notebook text mentions 50-200 bp and 50-250 bp, while
    # this cell uses 300 as the upper bound — confirm the intended range.
    k_min, k_max = 50, 300

    main(input_fasta, output_fasta, k_min=k_min, k_max=k_max)

FileNotFoundError: [Errno 2] No such file or directory: 'Hotspot_1.fasta'

2. Print out all the listed sequences and their IDs

3. For each candidate sequence, estimate the k-mers ranging from 50 to 250 bp, on which the structure prediction will be performed

*Notebook Created By: Christian Mandelli, Oregon State University*