In [None]:
import subprocess
import pandas as pd
import time
from concurrent.futures import ThreadPoolExecutor
import concurrent.futures
import os
import certifi
import pickle
from Bio import Entrez, SeqIO, AlignIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import CompoundLocation, ExactPosition, BeforePosition, AfterPosition
from io import StringIO
import glob
from tqdm.notebook import tqdm
import string
from ete3 import NCBITaxa
from collections import Counter
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from Bio.Data import CodonTable
import json
import copy
import threading
import sys
import tempfile
import numpy as np



First we run Infernal's cmsearch of the group I intron covariance model (obtained from Rfam) against the entire NT database. We use the parameters given in Rfam for cmsearch, but changing the database size in Mbp to the size of NT (1.3 trillion bases).

In [None]:
# Define the variables
num_cpus = 100
output_file_path = "/path/to/output/tbl/out_file.tbl"
cm_model_path = "/path/to/cm/model.cm"
input_fasta_file_path = "/path/to/nt/database/nt"
path_to_cmsearch = "/path/to/cmsearch/bin/cmsearch"

# Define the command as an argument list so that it can be executed without a
# shell: paths containing spaces or shell metacharacters are then passed to
# cmsearch verbatim instead of being re-interpreted.
command_args = [
    path_to_cmsearch,
    "--cpu", str(num_cpus),
    "--verbose",
    "--nohmmonly",
    "-E", "1000",
    "-Z", "1300000",  # database size in Mbp (1.3 trillion bases)
    "--tblout", output_file_path,
    cm_model_path,
    input_fasta_file_path,
]
# Human-readable form of the same command, kept for logging/inspection
command = " ".join(command_args)

# Run the command and capture both output streams (as bytes)
process = subprocess.Popen(command_args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
stdout, stderr = process.communicate()

# Fail loudly: every later cell depends on the --tblout file written by this
# run, so a silent failure here would only surface as a confusing error later.
if process.returncode != 0:
    raise RuntimeError(
        f"cmsearch exited with status {process.returncode}:\n"
        f"{stderr.decode(errors='replace')}"
    )


We then read the output of cmsearch.

In [None]:
def read_infernal_output(file_path):
    """Read an Infernal tabular output file into a pandas DataFrame.

    Lines starting with '#' and blank lines are skipped. Every remaining line
    is split on whitespace into 18 fields; the last field ('description of
    target') keeps its internal spaces but not the trailing newline.

    Parameters
    ----------
    file_path : str
        Path to the tabular output file.

    Returns
    -------
    pandas.DataFrame
        One row per hit, all values as strings. The frame has the 18 columns
        listed below even when the file contains no hits. Note that the
        column name 'accession' occurs twice (target and query accession).
    """
    columns = ['target name', 'accession', 'query name', 'accession', 'mdl', 'mdl from', 'mdl to', 'seq from', 'seq to', 'strand', 'trunc', 'pass', 'gc', 'bias', 'score', 'E-value', 'inc', 'description of target']
    data = []
    with open(file_path, 'r') as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            # maxsplit keeps the free-text description together as one field
            data.append(line.split(maxsplit=len(columns) - 1))
    # Passing the columns to the constructor (rather than assigning them
    # afterwards) also works for a file without any hit lines.
    return pd.DataFrame(data, columns=columns)

# The table to parse is the --tblout file written by the cmsearch run above
infernal_NT_search_path = output_file_path

# One row per cmsearch hit, all fields as strings
infernal_NT_search_all_hits = read_infernal_output(file_path=infernal_NT_search_path)

We then need to fetch the GenBank entries of all records that gave a cmsearch hit.

In [None]:
# Make certifi's CA bundle the certificate file used for SSL connections
os.environ.update(SSL_CERT_FILE=certifi.where())

# Your email (handed to Entrez by the fetch functions below)
email = "your.email@email.com"

def fetch_genbank_records_batch(email, batch_ids, max_tries=100):
    """Fetch GenBank records for a batch of IDs, retrying on failure.

    Parameters
    ----------
    email : str
        Address assigned to ``Entrez.email`` before fetching.
    batch_ids : list of str
        Nucleotide IDs requested in a single ``efetch`` call.
    max_tries : int, optional
        Maximum number of attempts; one second is slept after each failure.

    Returns
    -------
    dict
        Maps ``record.id`` to the parsed record for every record returned.

    Raises
    ------
    RuntimeError
        If all ``max_tries`` attempts failed. The last underlying error is
        chained as the cause.
    """
    Entrez.email = email
    last_error = None
    for _ in range(max_tries):
        handle = None
        try:
            handle = Entrez.efetch(db="nucleotide", id=batch_ids, rettype="gb", retmode="text")
            # Build a fresh dict per attempt so a response that breaks midway
            # through parsing leaves nothing behind from the aborted attempt.
            return {record.id: record for record in SeqIO.parse(handle, "genbank")}
        except Exception as error:
            # `Exception` (not a bare except) so Ctrl-C / kernel interrupts
            # still stop the retry loop.
            last_error = error
            time.sleep(1)  # Wait for a second before retrying
        finally:
            # Close the handle whether parsing succeeded or failed
            if handle is not None:
                handle.close()
    # Returning None here would only fail later, inside the caller's
    # dict.update(); raise a descriptive error instead.
    raise RuntimeError(
        f"Failed to fetch GenBank records after {max_tries} attempts for batch starting with {batch_ids[:1]}"
    ) from last_error

def fetch_genbank_records(email, ids, batch_size=500, max_workers=10):
    """Download GenBank records for ``ids`` in concurrent batches.

    ``ids`` is cut into consecutive slices of ``batch_size`` and each slice is
    handed to ``fetch_genbank_records_batch`` on a thread pool of
    ``max_workers`` threads. After every 50th completed batch the records
    gathered so far are pickled to 'genbank_records.pkl' so that partial
    results can be restored.

    Returns a dict merging the per-batch result dicts.
    """
    collected = {}
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        pending = set()
        for start in range(0, len(ids), batch_size):
            pending.add(pool.submit(fetch_genbank_records_batch, email, ids[start:start + batch_size]))
        completed = 0
        for finished in concurrent.futures.as_completed(pending):
            completed += 1
            collected.update(finished.result())  # merge this batch into the overall result
            if completed % 50 != 0:
                continue
            # Every 50 batches: checkpoint what has been gathered so far
            with open('genbank_records.pkl', 'wb') as checkpoint:
                pickle.dump(collected, checkpoint)
            print(f"Saved results after processing {completed} batches")
    return collected

# A record can carry several cmsearch hits; request each accession only once
ids = list(set(infernal_NT_search_all_hits["target name"]))

infernal_NT_search_genbank_recs = fetch_genbank_records(
    email,
    ids,
    batch_size=50,
    max_workers=10,
)

# Save the final results.
# Note: writing this pickle (and reading it back later) needs a large amount of memory.
with open('genbank_records.pkl', 'wb') as records_pickle:
    pickle.dump(infernal_NT_search_genbank_recs, records_pickle)

In [None]:
import pickle

# Restore the GenBank records saved by the fetching cell.
# Only unpickle files you created yourself: loading a pickle executes code.
with open('genbank_records.pkl', 'rb') as records_pickle:
    infernal_NT_search_genbank_recs = pickle.load(records_pickle)

Some keys will have records with undefined sequences because their sequences are too large and must be explicitly requested as FASTA. We find these records and handle them separately, saving their sequences to separate files because they are very large.

In [None]:
# IDs of all fetched records whose sequence is not defined
genbank_recs_with_undefined_seqs = [key for key, value in infernal_NT_search_genbank_recs.items() if value.seq.defined is False]

batch_size = 100
genbank_ids_to_refetch = genbank_recs_with_undefined_seqs
seqs_of_records_with_undefined_seqs = {}

for i in range(0, len(genbank_ids_to_refetch), batch_size):
    print(f"Processing batch {i+1}-{i+batch_size}")
    batch_ids = genbank_ids_to_refetch[i:i+batch_size]
    handle = Entrez.efetch(db="nucleotide", id=batch_ids, rettype="fasta", retmode="text")
    try:
        records = list(SeqIO.parse(handle, "fasta"))  # Convert iterator to list
    finally:
        # Always release the handle, even if parsing fails
        handle.close()
    # One FASTA file per batch; the restore cell finds these files by name pattern
    with open(f'records_with_previously_undefined_seqs_batch_{i+1}_{i+batch_size}.fasta', 'w') as f:
        SeqIO.write(records, f, "fasta")  # Write records to file
    for record in records:
        seqs_of_records_with_undefined_seqs[record.id] = record.seq
    print(f"Finished processing batch {i+1}-{i+batch_size}")


If required we can restore the full sequences from the fasta files.

In [None]:
merged_records = []

# Collect the records of every per-batch FASTA file written by the re-fetch cell
for file_name in glob.glob("records_with_previously_undefined_seqs_batch_*.fasta"):
    with open(file_name, "r") as handle:
        records = list(SeqIO.parse(handle, "fasta"))
    merged_records += records

# Map record ID -> sequence (a later record with the same ID replaces an earlier one)
seqs_of_records_with_undefined_seqs = {record.id: record.seq for record in merged_records}

len(seqs_of_records_with_undefined_seqs)

We now want to filter the intron hits to keep only those for which we can find reliable boundaries.
In order to do so, we look for features labeled as intron in the corresponding GenBank entries that overlap by a minimum overlap threshold with the cmsearch hit.
If there are no such features, we look for features with a location of type CompoundLocation (those shown as join(...)) and look for a gap between exons that overlaps with the cmsearch hit.
We perform the search in both the direct and the complementary strand.

In [None]:
# Minimum number of positions that an annotated intron (or exon-exon gap) must
# share with a cmsearch hit for its boundaries to be accepted for that hit.
overlap_threshold = 50
# GenBank ID -> None (no reliable intron found), one intron tuple, or a flat
# list of intron tuples. Each tuple is
# (genbank_ID, start, end, sequence, overlap, source).
intron_boundaries_genbank_infernal_NT_search = {}
rows_with_undefined_seqs = []

for index, row in tqdm(infernal_NT_search_all_hits.iterrows(), total=infernal_NT_search_all_hits.shape[0]):
    infernal_intron_start = int(row["seq from"])
    infernal_intron_end = int(row["seq to"])
    infernal_boundaries = (infernal_intron_start, infernal_intron_end)
    infernal_hit_strand = row["strand"]
    genbank_ID = row["target name"]
    if genbank_ID == 'MT229979.1':
        # skip this iteration since this seems to be a strange record that was removed from GenBank
        continue
    genbank_record = infernal_NT_search_genbank_recs[genbank_ID]
    if genbank_record.seq.defined:
        genbank_sequence_raw = genbank_record.seq
    else:
        # Sequence was too large for the GenBank fetch; use the FASTA re-fetch
        genbank_sequence_raw = seqs_of_records_with_undefined_seqs[genbank_ID]
    genbank_sequence = genbank_sequence_raw.replace('U', 'T')
    genbank_features = genbank_record.features
    new_intron_boundaries = None
    source = None
    candidate_reliable_introns = []
    if(infernal_intron_start < infernal_intron_end):
        # Hit reported with increasing coordinates
        infernal_intron_range = range(infernal_intron_start, infernal_intron_end)
        # Built once per hit instead of once per feature
        infernal_positions = set(infernal_intron_range)
        for feature in genbank_features:
            if feature.type == 'intron':
            # Check for 'intron' features
                if isinstance(feature.location.start, ExactPosition) and isinstance(feature.location.end, ExactPosition):
                    candidate_intron_range = range(int(feature.location.start), int(feature.location.end))
                    overlap = len(infernal_positions & set(candidate_intron_range))
                    # NOTE(review): candidate_intron_range[1] is start + 1, so the second
                    # bound check does not test the feature's end; [-1] may have been
                    # intended. Left unchanged because it affects which introns are kept.
                    if overlap >= overlap_threshold and candidate_intron_range[0] >= 1 and candidate_intron_range[1] < (len(genbank_sequence) - 1):
                        source = 'intron'
                        new_intron_start = int(feature.location.start) - 1
                        new_intron_end = int(feature.location.end)
                        # Slice runs from one position before the feature start
                        # through index feature end (inclusive)
                        candidate_intron_sequence = genbank_sequence[new_intron_start:(new_intron_end + 1)]
                        candidate_reliable_introns.append((genbank_ID, new_intron_start, new_intron_end, candidate_intron_sequence, overlap, source))
            elif isinstance(feature.location, CompoundLocation):
            # Check for 'compound location' features
                for i in range(len(feature.location.parts) - 1):
                    # Get the end of the first part and the start of the second part
                    end_first_part = feature.location.parts[i].end
                    start_second_part = feature.location.parts[i+1].start
                    candidate_intron_range = range(int(end_first_part), int(start_second_part))
                    overlap = len(infernal_positions & set(candidate_intron_range))
                    # Gaps of 3000 positions or more between parts are not considered
                    if overlap >= overlap_threshold and start_second_part - end_first_part < 3000:
                        source = 'compoundLocation'
                        new_intron_start = end_first_part - 1
                        new_intron_end = start_second_part
                        candidate_intron_sequence = genbank_sequence[new_intron_start:(new_intron_end + 1)]
                        candidate_reliable_introns.append((genbank_ID, new_intron_start, new_intron_end, candidate_intron_sequence, overlap, source))
    else:
        # Hit reported with decreasing coordinates: only strand -1 features are considered
        infernal_intron_range = range(infernal_intron_start, infernal_intron_end, -1)
        infernal_positions = set(infernal_intron_range)
        for feature in genbank_features:
            if feature.type == 'intron' and feature.location.strand == -1:
            # Check for 'intron' features
                if isinstance(feature.location.start, ExactPosition) and isinstance(feature.location.end, ExactPosition):
                    candidate_intron_range = range(int(feature.location.end), int(feature.location.start), -1)
                    overlap = len(infernal_positions & set(candidate_intron_range))
                    # It seems that ALWAYS int(feature.location.end) > int(feature.location.start) in these cases of features of type intron in strand -1
                    # NOTE(review): [1] is end - 1 here, so the ">= 1" test does not
                    # involve the feature's start; confirm the intended bounds.
                    if overlap >= overlap_threshold and candidate_intron_range[1] >= 1 and candidate_intron_range[0] < (len(genbank_sequence) - 1): #note indexes for accession on the range are reversed
                        source = 'intron_reverseStrand'
                        new_intron_start = int(feature.location.start) -1
                        new_intron_end = int(feature.location.end)
                        candidate_intron_sequence = genbank_sequence[new_intron_start:(new_intron_end + 1)]
                        # Reverse complement the candidate_intron_sequence
                        reverse_complement_sequence = Seq(candidate_intron_sequence).reverse_complement()
                        candidate_reliable_introns.append((genbank_ID, new_intron_start, new_intron_end, reverse_complement_sequence, overlap, source))
            elif isinstance(feature.location, CompoundLocation) and feature.location.strand == -1:
                for i in range(len(feature.location.parts) - 1):
                    start_first_part = feature.location.parts[i].start
                    end_second_part = feature.location.parts[i+1].end
                    candidate_intron_range = range(int(start_first_part), int(end_second_part), -1)
                    overlap = len(infernal_positions & set(candidate_intron_range))
                    if overlap >= overlap_threshold and start_first_part - end_second_part < 3000:
                        source = 'compoundLocation_reverseStrand'
                        # NOTE(review): unlike the other three cases, the slice start is
                        # not shifted by -1 here; confirm this asymmetry is intended.
                        new_intron_start = end_second_part
                        new_intron_end = start_first_part
                        candidate_intron_sequence = genbank_sequence[new_intron_start:(new_intron_end + 1)]
                        reverse_complement_sequence = Seq(candidate_intron_sequence).reverse_complement()
                        candidate_reliable_introns.append((genbank_ID, new_intron_start, new_intron_end, reverse_complement_sequence, overlap, source))
    # Pick one candidate for this hit
    if len(candidate_reliable_introns) >= 1:
        if(len(candidate_reliable_introns) == 1):
            selected_intron = candidate_reliable_introns[0]
        else:
            # Sort the candidate introns by overlap in descending order
            candidate_reliable_introns.sort(key=lambda x: x[4], reverse=True)
            # Get the highest overlap value
            highest_overlap = candidate_reliable_introns[0][4]
            # Filter the candidate introns with the highest overlap value
            highest_overlap_introns = [intron for intron in candidate_reliable_introns if intron[4] == highest_overlap]
            # Check if all the sequences are the same
            sequences = [intron[3] for intron in highest_overlap_introns]
            if len(set(sequences)) == 1:
                selected_intron = highest_overlap_introns[0]
            else:
                # Prefer a candidate whose sequence starts with 'T'
                # (startswith, unlike [0], also copes with an empty sequence)
                introns_with_T = [intron for intron in highest_overlap_introns if intron[3].startswith('T')]
                if len(introns_with_T) > 0:
                    selected_intron = introns_with_T[0]
                else:
                    selected_intron = highest_overlap_introns[0]
    else:
        selected_intron = None
    # Store the result. Several hits can fall on the same record; their introns
    # are gathered in ONE flat list (never nested), because the following cells
    # index every list element as an intron tuple.
    # NOTE(review): two hits that select the same annotated intron are both
    # stored; confirm whether such duplicates should be merged.
    previously_stored = intron_boundaries_genbank_infernal_NT_search.get(genbank_ID)
    if selected_intron is None:
        # Record that this ID was seen without overwriting earlier finds
        intron_boundaries_genbank_infernal_NT_search.setdefault(genbank_ID, None)
    elif previously_stored is None:
        intron_boundaries_genbank_infernal_NT_search[genbank_ID] = selected_intron
    elif isinstance(previously_stored, list):
        previously_stored.append(selected_intron)
    else:
        intron_boundaries_genbank_infernal_NT_search[genbank_ID] = [previously_stored, selected_intron]


# Keep only the records for which at least one intron was found
intron_boundaries_genbank_infernal_NT_search_found = {key: value for key, value in intron_boundaries_genbank_infernal_NT_search.items() if value is not None}

We then select only introns that start with T (keeping in mind that the first nucleotide of each extracted intron is the last nucleotide of the exon), and we add the sequence length to each intron

In [None]:
# Step 1: keep only introns whose extracted sequence starts with 'T'
intron_boundaries_genbank_infernal_NT_search_found_startsWithT = {}
for key, value in intron_boundaries_genbank_infernal_NT_search_found.items():
    if not isinstance(value, list):
        # Single intron: keep it only if it starts with a "T"
        if value[3].startswith('T'):
            intron_boundaries_genbank_infernal_NT_search_found_startsWithT[key] = value
        continue
    # Several introns: drop placeholders and introns that don't start with a "T"
    good_introns = [intron for intron in value if intron is not None and intron[3].startswith('T')]
    if not good_introns:
        continue
    # A single survivor is stored as a bare tuple rather than a one-element list
    intron_boundaries_genbank_infernal_NT_search_found_startsWithT[key] = (
        good_introns[0] if len(good_introns) == 1 else good_introns
    )

# Step 2: append the sequence length as an extra last element of every intron tuple
intron_boundaries_genbank_infernal_NT_search_found_startsWithT_plus_lengths = {}
for key, value in intron_boundaries_genbank_infernal_NT_search_found_startsWithT.items():
    if isinstance(value, list):
        new_value = [intron + (len(intron[3]),) for intron in value]
    else:
        new_value = value + (len(value[3]),)
    intron_boundaries_genbank_infernal_NT_search_found_startsWithT_plus_lengths[key] = new_value

Additionally, there seems to be a single abnormal intron that is far too long, because its compound feature is annotated with a gap of nearly 100,000 nucleotides. We remove this instance by applying a very generous maximum-length threshold.

In [None]:
# Specify the maximum length
max_length = 10000

intron_boundaries_genbank_infernal_NT_search_found_startsWithT_plus_lengths_removeTooLongs = {}
for key, value in intron_boundaries_genbank_infernal_NT_search_found_startsWithT_plus_lengths.items():
    if isinstance(value, list):
        # Filter out introns that are too long
        good_introns = [intron for intron in value if intron[-1] <= max_length]
        if good_introns:
            # If there's only one good intron, store it as a tuple instead of a list
            if len(good_introns) == 1:
                intron_boundaries_genbank_infernal_NT_search_found_startsWithT_plus_lengths_removeTooLongs[key] = good_introns[0]
            else:
                intron_boundaries_genbank_infernal_NT_search_found_startsWithT_plus_lengths_removeTooLongs[key] = good_introns
    else:
        # If the intron is too long, skip it
        if value[-1] > max_length:
            continue
        intron_boundaries_genbank_infernal_NT_search_found_startsWithT_plus_lengths_removeTooLongs[key] = value

At this point, the structure that we have is a dictionary where each key is a GenBank entry that contains at least 1 group I intron with reliable boundaries. If there is a single such intron, the value is a tuple describing the intron. If there are multiple, the value is a list of such tuples. We now want to extend each intron tuple with additional metadata: organism name, nucleic acid molecular type, taxonomy ID and subcellular location (organelle). We can extract these from the source feature of each GenBank entry. 

In [None]:
intron_boundaries_genbank_infernal_NT_search_found_startsWithT_plus_lengths_removeTooLongs_orgExtend = {}
for key, value in intron_boundaries_genbank_infernal_NT_search_found_startsWithT_plus_lengths_removeTooLongs.items():
    # Extract the additional data from the 'source' feature. A record without
    # a 'source' feature yields empty strings instead of raising StopIteration.
    source_feature = next((feature for feature in infernal_NT_search_genbank_recs[key].features if feature.type == 'source'), None)
    qualifiers = source_feature.qualifiers if source_feature is not None else {}
    organism = qualifiers.get('organism', [''])[0]
    mol_type = qualifiers.get('mol_type', [''])[0]
    db_xref = qualifiers.get('db_xref', [''])
    # db_xref might be a list, so we want to take the element that contains the substring taxon.
    # If no element does, fall back to '' (the classification cell skips empty taxIDs).
    if isinstance(db_xref, list):
        db_xref = next((xref for xref in db_xref if 'taxon' in xref), '')
    organelle = qualifiers.get('organelle', [''])[0]
    # The same four metadata fields are appended to every intron of this record
    source_metadata = (organism, mol_type, db_xref, organelle)
    if isinstance(value, list):
        new_value = [intron + source_metadata for intron in value]
    else:
        new_value = value + source_metadata
    intron_boundaries_genbank_infernal_NT_search_found_startsWithT_plus_lengths_removeTooLongs_orgExtend[key] = new_value

We now flatten the dictionary so that each entry has a tuple as value. When required, we append _i to the keys with more than 1 intron.

In [None]:
intron_boundaries_genbank_infernal_NT_search_found_startsWithT_plus_lengths_removeTooLongs_orgExtend_flattened = {}
for key, value in intron_boundaries_genbank_infernal_NT_search_found_startsWithT_plus_lengths_removeTooLongs_orgExtend.items():
    if not isinstance(value, list):
        # Single intron: the GenBank ID itself is the key
        intron_boundaries_genbank_infernal_NT_search_found_startsWithT_plus_lengths_removeTooLongs_orgExtend_flattened[key] = value
        continue
    # Several introns: number them _1, _2, ... in list order
    for position, intron in enumerate(value):
        new_key = f"{key}_{position + 1}"
        intron_boundaries_genbank_infernal_NT_search_found_startsWithT_plus_lengths_removeTooLongs_orgExtend_flattened[new_key] = intron

This gives us the 43913 group I introns with reliable boundaries. We can save the dictionary for future use.

In [None]:
# Persist the flattened intron dictionary so later sessions can start from here
with open('intron_boundaries_genbank_infernal_NT_search_found_startsWithT_plus_lengths_removeTooLongs_orgExtend_flattened.pkl', 'wb') as flattened_pickle:
    pickle.dump(
        intron_boundaries_genbank_infernal_NT_search_found_startsWithT_plus_lengths_removeTooLongs_orgExtend_flattened,
        flattened_pickle,
    )

In [3]:
# Restore the flattened intron dictionary saved above.
# Only unpickle files you created yourself: loading a pickle executes code.
with open('intron_boundaries_genbank_infernal_NT_search_found_startsWithT_plus_lengths_removeTooLongs_orgExtend_flattened.pkl', 'rb') as flattened_pickle:
    intron_boundaries_genbank_infernal_NT_search_found_startsWithT_plus_lengths_removeTooLongs_orgExtend_flattened = pickle.load(flattened_pickle)

Each value in the dictionary of introns is a tuple describing the intron. We convert these to dictionaries to make clearer what each element is.

In [4]:
intron_boundaries_genbank_infernal_NT_search_found_startsWithT_plus_lengths_removeTooLongs_orgExtend_flattened_dict = {}

for key, value in intron_boundaries_genbank_infernal_NT_search_found_startsWithT_plus_lengths_removeTooLongs_orgExtend_flattened.items():
    # Each intron tuple must hold exactly these 11 elements, in this order
    (accession, first_pos, last_pos, intron_seq, hit_overlap, boundary_origin,
     seq_length, organism_name, mol_type, taxon_xref, organelle_name) = value
    intron_boundaries_genbank_infernal_NT_search_found_startsWithT_plus_lengths_removeTooLongs_orgExtend_flattened_dict[key] = dict(
        GenBankID=accession,
        startPosition=first_pos,
        endPosition=last_pos,
        sequence=intron_seq,
        overlapWithInfernalHit=hit_overlap,
        boundariesSource=boundary_origin,
        length=seq_length,
        organism=organism_name,
        moleculeType=mol_type,
        taxID=taxon_xref,
        organelle=organelle_name,
    )

Through manual inspection of entries with strange taxonomy, we found 3 outliers that require manual fixing.

In [5]:
# Firstly, there is a case where the authors seem to have mistakenly assigned the taxid of the
# gastropod genus Olea, whereas it should be the olive genus Olea, as indicated by the organism name.
# We fix it manually; the corrected data are taken from Table S1 of the corresponding paper.
MT560017_1_entry = intron_boundaries_genbank_infernal_NT_search_found_startsWithT_plus_lengths_removeTooLongs_orgExtend_flattened_dict['MT560017.1']
MT560017_1_entry.update(
    organism="Olea sp. POC544315",
    taxID="taxon:2813885",
)

In [6]:
# Next, fix 2 entries that are annotated as artificial DNA because they were deposited as clones in vectors.
# First entry: K03428.1
K03428_1_entry = intron_boundaries_genbank_infernal_NT_search_found_startsWithT_plus_lengths_removeTooLongs_orgExtend_flattened_dict['K03428.1']
K03428_1_entry.update(
    organism="Tetrahymena thermophila",
    taxID="taxon:5911",
)

# Second entry: JN563930.1 (here the organelle is corrected as well)
JN563930_1_entry = intron_boundaries_genbank_infernal_NT_search_found_startsWithT_plus_lengths_removeTooLongs_orgExtend_flattened_dict['JN563930.1']
JN563930_1_entry.update(
    organism="Nicotiana undulata",
    taxID="taxon:118713",
    organelle="plastid:chloroplast",
)

We now want to classify the introns into organism types based on the taxonomy ID. We use a local copy of NCBI's taxonomy database for it

In [None]:
# Handle on the local copy of NCBI's taxonomy database
ncbi = NCBITaxa()
# Update the local database
ncbi.update_taxonomy_database()

In [None]:
def _categorize_lineage(lineage_names):
    """Map the taxon names of one lineage onto this notebook's organism categories.

    `lineage_names` is any container of taxon-name strings. Bacteria and
    viruses are recognised first; eukaryotes are then matched against the
    marker taxa below in order (first match wins). Anything else is "other".
    """
    if "Bacteria" in lineage_names:
        return "bacteria"
    if "Viruses" in lineage_names:
        return "virus"
    if "Eukaryota" not in lineage_names:
        return "other"
    # (category, marker taxa) pairs; the order is significant
    eukaryote_categories = (
        ("plants", ("Viridiplantae", "Diphylleia")),
        ("molluscs", ("Mollusca",)),
        ("fungi", ("Fungi",)),
        ("ciliates", ("Ciliophora",)),
        ("oomycetes", ("Oomycota",)),
        ("amoebae", ("Acanthamoeba", "Amoebidium", "Dictyostelia", "Dictyostelium",
                     "Heterostelium pallidum", "Myxogastria", "Physariida",
                     "Amoebozoa", "Nuclearia")),
        ("diatoms", ("Bacillariophyta",)),
        ("green algae", ("Chlorarachniophyceae",)),
        ("choanoflagellates", ("Choanoflagellata",)),
        ("cryptophytes", ("Cryptophyceae",)),
        ("glaucophytes", ("Cyanophora",)),
        ("euglenids", ("Euglenida",)),
        ("eustigmatophytes", ("Eustigmatophyceae",)),
        ("percolozoa", ("Heterolobosea",)),
        ("cercomonads", ("Heteromitidae",)),
        ("brown algae", ("Phaeophyceae", "Schizocladia")),
        ("plasmodiophores", ("Plasmodiophorida",)),
        ("sponges", ("Porifera",)),
        ("red algae", ("Rhodophyta",)),
        ("yellow-green algae", ("Xanthophyceae",)),
        ("corals", ("Hexacorallia",)),
        ("insects", ("Insecta",)),
        ("vertebrates", ("Vertebrata",)),
        ("cercozoans", ("Cercozoa",)),
        ("centrohelids", ("Centroplasthelida",)),
        ("placozoans", ("Placozoa",)),
    )
    for category, marker_taxa in eukaryote_categories:
        if any(taxon in lineage_names for taxon in marker_taxa):
            return category
        # Amoebae are additionally recognised by any name containing "amoeba"
        if category == "amoebae" and any("amoeba" in name for name in lineage_names):
            return category
    return "other"


def classify_intron(key_value):
    """Assign an organism category and lineage to one intron entry.

    Parameters
    ----------
    key_value : tuple
        ``(key, taxid)`` where ``key`` identifies an entry of the flattened
        intron dictionary and ``taxid`` is a string of the form 'taxon:<id>'.

    Returns
    -------
    tuple
        ``(key, extended_entry)``; ``extended_entry`` is a copy of the intron
        entry with two added keys: 'organismType' (category string) and
        'lineage' (list of taxon names, empty if no lineage could be found).

    The lineage is looked up by taxonomy ID first. If that fails, the lookup
    is retried via the entry's organism name (with "aff. " removed). If both
    fail, the intron is classified as "other". As a side effect,
    ``lineages_dict[key]`` is set to the lineage names, or to None on failure.
    """
    # A separate NCBITaxa object is created on every call
    ncbi = NCBITaxa()
    key, value = key_value
    taxid_number = value.split(":")[1]
    entry_to_update = intron_boundaries_genbank_infernal_NT_search_found_startsWithT_plus_lengths_removeTooLongs_orgExtend_flattened_dict[key]
    recovery_successful = False
    try:
        lineage = ncbi.get_lineage(taxid_number)
        recovery_successful = True
    except Exception:
        # The taxid was not found, so we try to find the organism by name instead.
        organism_name = entry_to_update['organism']
        # if the organism name contains the substring "aff. ", we remove it
        if "aff. " in organism_name:
            organism_name = organism_name.replace("aff. ", "")
        try:
            # Search for the organism name in the NCBI taxonomy database and
            # retry with the taxid found. A name that is not found is handled
            # here as well.
            taxid_number = ncbi.get_name_translator([organism_name])[organism_name][0]
            lineage = ncbi.get_lineage(taxid_number)
            recovery_successful = True
        except Exception as e:
            print(f"No result found for organism name: {organism_name}. Error: {e}")

    if recovery_successful:
        names = ncbi.get_taxid_translator(lineage)
        names_values = names.values()
        lineages_dict[key] = names_values
        category = _categorize_lineage(names_values)
        lineage_names = list(names_values)
    else:
        # this should not happen for any key; classify as "other" rather than failing
        lineages_dict[key] = None
        category = "other"
        lineage_names = []
    # Work on a copy so that the source dictionary is left untouched
    extended_entry = entry_to_update.copy()
    extended_entry["organismType"] = category
    extended_entry["lineage"] = lineage_names
    return key, extended_entry

classified_introns = {}
lineages_dict = {}

# Collect the taxon ID of every intron entry that has a non-empty one
taxon_ids = {}
for key, entry in intron_boundaries_genbank_infernal_NT_search_found_startsWithT_plus_lengths_removeTooLongs_orgExtend_flattened_dict.items():
    if entry['taxID']:
        taxon_ids[key] = entry['taxID']

with ThreadPoolExecutor() as executor:
    # Remember which intron key each submitted job belongs to
    future_to_key = {}
    for item in taxon_ids.items():
        future_to_key[executor.submit(classify_intron, item)] = item[0]
    for future in tqdm(concurrent.futures.as_completed(future_to_key)):
        key = future_to_key[future]
        try:
            result = future.result()
        except Exception as exc:
            # A failed job is reported and its intron is left unclassified
            print('%r generated an exception: %s' % (key, exc))
            continue
        classified_introns[result[0]] = result[1]

We can now save this dictionary with categories of organisms as well. It also contains the entire taxonomic lineages of each organism.

In [19]:
# Persist the classified introns (organism category and lineage included)
with open('classified_introns.pkl', 'wb') as classified_pickle:
    pickle.dump(classified_introns, classified_pickle)

In [3]:
# Restore the classified introns saved above.
# Only unpickle files you created yourself: loading a pickle executes code.
with open('classified_introns.pkl', 'rb') as classified_pickle:
    classified_introns = pickle.load(classified_pickle)

We now make a barplot with the number of introns in each organism group. For this plot, we group all protists other than amoebae under category "protists". We group corals and sponges together also

In [None]:
# Extract the organism types
organism_types = [entry['organismType'] for entry in classified_introns.values()]

# Count the occurrences of each organism type
counts = Counter(organism_types)

# Create a new dictionary where we group the categories for other protists
grouped_counts = {'plants': 0, 'fungi': 0, 'amoebae': 0, 'virus': 0, 'bacteria': 0, 'corals/sponges': 0, 'other protists': 0}
for organism_type, count in counts.items():
    if organism_type in grouped_counts:
        grouped_counts[organism_type] += count
    elif organism_type in ['corals', 'sponges']:
        grouped_counts['corals/sponges'] += count
    else:
        grouped_counts['other protists'] += count

# Sort the dictionary by the frequencies, except 'other protists' which should be last
sorted_counts = {k: v for k, v in sorted(grouped_counts.items(), key=lambda item: (-item[1], item[0] == 'other protists'))}

# Create the barplot

plt.rc('font', family='Arial', size=9)
plt.figure(figsize=(5.2/2.54, 5.2/2.54))

plt.bar(sorted_counts.keys(), sorted_counts.values())
plt.xlabel('Organism Type')
plt.ylabel('Frequency')

# Rotate x-axis labels
plt.xticks(rotation=90)
plt.savefig('Figure1b.png', dpi=300, bbox_inches='tight')

plt.show()

Now let's make a similar barplot but excluding plants and fungi

In [None]:
# Create a new dictionary excluding 'plants' and 'fungi'
filtered_counts = {k: v for k, v in counts.items() if k not in ['plants', 'fungi']}

# Create the barplot
plt.rc('font', family='Arial', size=8)
plt.figure(figsize=(10/2.54, 5.2/2.54))

plt.bar(filtered_counts.keys(), filtered_counts.values())
plt.xlabel('Organism Type')
plt.ylabel('Frequency')

# Rotate x-axis labels
plt.xticks(rotation=90)
plt.savefig('Figure1c.png', dpi=300, bbox_inches='tight')
plt.show()

Now we are going to identify putative homing endonucleases in the introns. We first extract all possible ORFs of a minimum length in all reading frames, taking into consideration that translation to protein should be done with the appropriate genetic code taking into account organism and subcellular location. The correct genetic code also defines with which codons a candidate ORF can start and end

In [None]:
from Bio.Seq import Seq
from Bio.Data import CodonTable

def find_orfs(sequence, frame, genetic_code, min_length):
    """Find candidate ORFs in one of the six reading frames of a sequence.

    Parameters
    ----------
    sequence : str or Bio.Seq.Seq
        Nucleotide sequence to scan (start codons are matched against the
        unambiguous DNA codon table, case-sensitively).
    frame : int
        Reading frame. 0-2 read the sequence as given starting at that
        offset; 3-5 read its reverse complement starting at offset
        ``frame - 3``.
    genetic_code : int
        NCBI translation table id. It defines both the start codons and the
        codons translated as stops.
    min_length : int
        Minimum ORF length in amino acids (the stop codon is not counted).

    Returns
    -------
    list of dict
        One dictionary per ORF, ordered by start position, with the keys
        'sequence' (protein, without the stop), 'startInIntronSeq' and
        'endInIntronSeq' (0-based, end-exclusive nucleotide coordinates of
        the coding part, stop codon excluded), 'geneticCode' and
        'readingFrame'. For frames 3-5 the coordinates refer to the
        reverse-complemented sequence. Only ORFs closed by a stop codon are
        reported, and only the longest ORF per stop codon is kept.
    """
    start_codons = CodonTable.unambiguous_dna_by_id[genetic_code].start_codons
    # Offset of the first codon on the strand being read. Using `frame`
    # itself would shift every reverse-strand coordinate by 3 nt.
    offset = frame % 3
    if frame < 3:
        strand_sequence = str(sequence)
    else:
        strand_sequence = str(Seq(sequence).reverse_complement())
    sequence_frame = strand_sequence[offset:]
    # Drop a trailing partial codon: it cannot be translated and only makes
    # Biopython emit a "partial codon" warning.
    sequence_frame = sequence_frame[:len(sequence_frame) - len(sequence_frame) % 3]
    protein = str(Seq(sequence_frame).translate(table=genetic_code, to_stop=False))

    orfs = []
    # Codon index of the first start codon seen since the previous stop.
    # ORFs starting at later in-frame start codons end at the same stop and
    # are therefore fully contained in this one, so they are never emitted.
    first_start = None
    for codon_index, residue in enumerate(protein):
        if residue == "*":
            if first_start is not None:
                orf = protein[first_start:codon_index]
                if len(orf) >= min_length:
                    start_nt = offset + first_start * 3
                    orfs.append({
                        'sequence': orf,
                        'startInIntronSeq': start_nt,
                        'endInIntronSeq': start_nt + len(orf) * 3,
                        'geneticCode': genetic_code,
                        'readingFrame': frame
                    })
            first_start = None
        elif first_start is None and sequence_frame[codon_index * 3:codon_index * 3 + 3] in start_codons:
            first_start = codon_index
    return orfs

# Minimum ORF length, in amino acids, for an ORF to be kept as a candidate
min_length = 120

# orf_dict[intron key][reading frame 0-5] -> list of ORF dictionaries
orf_dict = {}
for key, value in classified_introns.items():
    orf_dict[key] = {}
    lineage = [tax.lower() for tax in value['lineage']]
    category = value['organismType']
    organelle = value['organelle']
    # Choose the NCBI translation table from the subcellular location and the
    # taxonomic lineage. Within each location the first matching rule wins and
    # the standard code (1) is the fallback.
    # NOTE(review): the tests are exact membership tests on the lower-cased
    # lineage entries, so species epithets such as "cerevisiae" only match if
    # the lineage stores them as separate entries - confirm against the code
    # that builds classified_introns.
    if ("mitochondria" in organelle):
        if (("saccharomyces" in lineage and "cerevisiae" in lineage) or ("candida" in lineage and "glabrata" in lineage) or 
            ("hansenula" in lineage and "saturnus" in lineage) or ("kluyveromyces" in lineage and "thermotolerans" in lineage)):
            genetic_code = 3
        elif (("emericella" in lineage and "nidulans" in lineage) or ("neurospora" in lineage and "crassa" in lineage) or
              ("podospora" in lineage and "anserina" in lineage) or ("acremonium" in lineage) or ("candida" in lineage and "parapsilosis" in lineage) or
              ("trichophyton" in lineage and "rubrum" in lineage) or ("dekkera" in lineage) or ("brettanomyces" in lineage) or 
              ("eeniella" in lineage) or ("ascobolus" in lineage and "immersus" in lineage) or ("aspergillus" in lineage and "amstelodami" in lineage) or
              ("claviceps" in lineage and "purpurea" in lineage) or ("cochliobolus" in lineage and "heterostrophus" in lineage) or 
              ("gigartinales" in lineage) or ("trypanosoma" in lineage and "bruceii" in lineage) or ("leishmania" in lineage and "tarentolae" in lineage) or
              ("paramecium" in lineage and "tetraurelia" in lineage) or ("tetrahymena" in lineage and "pyriformis" in lineage) or
              ("plasmodium" in lineage and "gallinaceum" in lineage) or ("coelenterata" in lineage)):
            genetic_code = 4
        elif (("asterozoa" in lineage) or ("echinozoa" in lineage) or ("rhabditophora" in lineage)):
            genetic_code = 9
        elif (("pyura" in lineage and "stolonifera" in lineage) or ("halocynthia" in lineage and "roretzi" in lineage) or 
              ("ciona" in lineage and "savignyi" in lineage)):
            genetic_code = 13
        elif (("chlorophyceae" in lineage) or ("spizellomyces" in lineage and "punctatus" in lineage)):
            genetic_code = 16
        elif "trematoda" in lineage:
            genetic_code = 21
        elif (("scenedesmus" in lineage and "obliquus" in lineage)):
            genetic_code = 22
        elif (("thraustochytrium" in lineage and "aureum" in lineage)):
            genetic_code = 23
        elif (("rhabdopleuridae" in lineage and "compacta" in lineage)):
            genetic_code = 24
        elif (("vertebrata" in lineage)):
            genetic_code = 2
        elif (("ascaris" in lineage) or ("caenorhabditis" in lineage) or ("bivalvia" in lineage) or ("polyplacophora" in lineage) or
              ("artemia" in lineage) or ("drosophila" in lineage) or ("locusta" in lineage and "migratoria" in lineage) or
              ("apis" in lineage and "mellifera" in lineage)):
            genetic_code = 5
        elif (("platyhelminthes" in lineage) or ("nematoda" in lineage)):
            genetic_code = 14
        else:
            genetic_code = 1
    elif (organelle == ""):
        if (("oxytricha" in lineage) or ("stylonychia" in lineage) or ("paramecium" in lineage) or ("tetrahymena" in lineage) or
            ("oxytrichidae" in lineage) or ("glaucoma" in lineage and "chattoni" in lineage)):
            genetic_code = 6
        elif (("euplotidae" in lineage)):
            genetic_code = 10
        elif category == "bacteria":
            genetic_code = 11
        elif (("cephaloascaceae" in lineage) or ("debaryomycetaceae" in lineage) or ("metschnikowiaceae" in lineage) or ("babjeviella" in lineage) or
              ("ascoideaceae" in lineage) or ("saccharomycopsidaceae" in lineage)):
            genetic_code = 12
        elif (("blepharisma" in lineage) or ("crassvirales" in lineage)):
            genetic_code = 15
        elif (("candidate division sr1" in lineage) or ("gracilibacteria" in lineage)):
            genetic_code = 25
        elif (("pachysolen" in lineage) or ("nakazawaea" in lineage) or ("peterozyma" in lineage)):
            genetic_code = 26
        elif (("parduczia" in lineage)):
            genetic_code = 27
        elif (("condylostoma" in lineage and "magnum" in lineage)):
            genetic_code = 28
        elif (("mesodinium" in lineage) or ("myrionecta" in lineage)):
            genetic_code = 29
        elif (("carchesium" in lineage)):
            genetic_code = 30
        elif (("blastocrithidia" in lineage)):
            genetic_code = 31
        elif (("cephalodiscidae" in lineage)):
            genetic_code = 33
        else:
            genetic_code = 1
    elif ("plastid" in organelle):
        if category == "plants":
            genetic_code = 11
        else:
            genetic_code = 1
    else:
        # Any other organelle annotation: without this branch genetic_code
        # would silently keep the value chosen for the previous intron (or be
        # undefined for the first one). Fall back to the standard code, as
        # the other branches do.
        genetic_code = 1
    for frame in range(6):
        orf_dict[key][frame] = find_orfs(value['sequence'], frame, genetic_code, min_length)

In [None]:
# Add up the ORF lists of every reading frame of every intron
total_orfs = sum(sum(map(len, frame_orfs.values())) for frame_orfs in orf_dict.values())
print(f"Total number of candidate ORFs: {total_orfs}")

# An intron is counted as soon as one of its frames holds a non-empty ORF list
introns_with_orfs = sum(1 for frame_orfs in orf_dict.values() if any(frame_orfs.values()))
print(f"Number of introns with candidate ORFs: {introns_with_orfs}")

In [None]:
# One FASTA record per candidate ORF. The record id encodes intron key,
# reading frame and ORF index as "<key>_frame<frame>_orf<index>".
orf_records = (
    SeqRecord(Seq(orf['sequence']), id=f"{key}_frame{frame}_orf{i}", description="")
    for key, value in orf_dict.items()
    for frame, orfs in value.items()
    for i, orf in enumerate(orfs)
)

with open("orfs.fasta", "w") as output_handle:
    SeqIO.write(orf_records, output_handle, "fasta")

We now make a run_interproscan.sh file that will be executed to run InterProScan on all ORFs. The file should be something like this:
    
```bash
    #!/bin/bash
    /path/to/interproscan.sh -i orfs.fasta -f json -o orfs.json -cpu 128
```

In [None]:
# Run the shell script
command = ["bash", "run_interproscan.sh"]
process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
stdout, stderr = process.communicate()

if process.returncode != 0:
    print(f"InterProScan failed with error message:\n{stderr.decode()}")
else:
    print("InterProScan completed successfully.")

In [91]:
# Parse the InterProScan results into
# interpro_results[intron key][reading frame][ORF index] -> list of matches
interpro_results = {}
with open('orfs.json') as json_file:
    data = json.load(json_file)

for protein in data['results']:
    # InterProScan reports each distinct protein sequence once and lists every
    # input record that has that sequence under 'xref'. Reading only the first
    # cross-reference would drop the hits of all other ORFs with an identical
    # sequence, so every entry is registered.
    for xref in protein['xref']:
        # ORF ids were written as "<key>_frame<frame>_orf<index>"; the key
        # itself may contain underscores, so split from the right-hand side
        id_parts = xref['id'].split('_')
        key = '_'.join(id_parts[:-2])
        frame = int(id_parts[-2][-1])
        orf_index = int(id_parts[-1][3:])
        interpro_results.setdefault(key, {}).setdefault(frame, {})[orf_index] = protein['matches']

Now we want to make a new dictionary where we only keep introns for which an ORF likely to be a homing endonuclease was found. We also add the InterProScan hits for these cases. It is possible that some introns might contain multiple homing endonucleases.

In [92]:
def check_substrings(d):
    """Return True if any string value in a nested dictionary looks nuclease-related.

    A string value matches when it contains "endonuc", "homing" or "nuclease"
    (case-insensitive). String values of ``d`` are examined first, then every
    value that is itself a dictionary is searched recursively. Values of any
    other type (lists, numbers, None, ...) are ignored.
    """
    keywords = ("endonuc", "homing", "nuclease")
    nested_dicts = []
    for entry in d.values():
        if isinstance(entry, str):
            lowered = entry.lower()
            for keyword in keywords:
                if keyword in lowered:
                    return True
        elif isinstance(entry, dict):
            nested_dicts.append(entry)
    # No direct match at this level: descend into the nested dictionaries
    for child in nested_dicts:
        if check_substrings(child):
            return True
    return False

# Split the candidate ORFs using their InterProScan hits. Two independent deep
# copies of orf_dict are edited in place:
#   * homingEndonucleases_dict keeps only ORFs with at least one hit accepted
#     by check_substrings, and only introns that still hold such an ORF.
#   * discarded_ORFs_dict receives the hits of ORFs whose hits were all
#     rejected, and is pruned afterwards to frames holding at least one such ORF.
homingEndonucleases_dict = copy.deepcopy(orf_dict)
discarded_ORFs_dict = copy.deepcopy(orf_dict)

# Iterate over each intron in the copied dictionary
for intron, frames in list(homingEndonucleases_dict.items()):  # Use list to allow modifying the dictionary during iteration
    # For each intron, iterate over each frame
    for frame, orfs in list(frames.items()):  # Use list to allow modifying the dictionary during iteration
        # ORFs of this frame that are retained as putative homing endonucleases
        new_orfs = []
        # i is the ORF's index within its frame, i.e. the index used in the FASTA id
        for i, orf in enumerate(orfs):
            # Check if InterProScan reported anything for this intron / frame / ORF index
            if intron in interpro_results and frame in interpro_results[intron] and i in interpro_results[intron][frame]:
                hits = interpro_results[intron][frame][i]
                # Check if any hit contains the substrings "endonuc", "homing", or "nuclease", ignoring case
                if any(check_substrings(hit) for hit in hits):
                    # Attach the full list of InterProScan hits to the retained ORF
                    orf['interProScanHits'] = hits
                    # Add the ORF to the new list
                    new_orfs.append(orf)
                else:
                    # Record the rejected hits on the matching ORF in discarded_ORFs_dict
                    discarded_ORFs_dict[intron][frame][i]['interProScanHits'] = hits
            else:
                # No InterProScan result for this ORF. It is not added to new_orfs,
                # so it is dropped when the frame's list is replaced below.
                orf['interProScanHits'] = []
        # Replace the old list of ORFs with the new list
        frames[frame] = new_orfs
    # Remove the intron if all its frames are empty
    if all(not orfs for orfs in frames.values()):
        del homingEndonucleases_dict[intron]

# Prune discarded_ORFs_dict down to frames that hold at least one ORF with rejected hits.
# NOTE(review): pruning works on whole frames, so a frame that is kept still lists
# its other ORFs too (those without hits and those retained as endonucleases) -
# confirm this is what downstream use expects.
for intron, frames in list(discarded_ORFs_dict.items()):  # Use list to allow modifying the dictionary during iteration
    # For each intron, iterate over each frame
    for frame, orfs in list(frames.items()):  # Use list to allow modifying the dictionary during iteration
        # Remove the frame if all its ORFs have no InterProScan hits
        if all('interProScanHits' not in orf or not orf['interProScanHits'] for orf in orfs):
            del frames[frame]
    # Remove the intron if no frame is left (or all remaining frames are empty)
    if all(not orfs for orfs in frames.values()):
        del discarded_ORFs_dict[intron]

In [None]:
# Flatten intron -> frame -> [ORF, ...] into intron -> [ORF, ...]. Every ORF
# is deep-copied and tagged with the frame it was stored under ('frame' key).
homingEndonucleases_dict_reshaped = {
    intron: [
        {**copy.deepcopy(orf), 'frame': frame}
        for frame, orfs in frames.items()
        for orf in orfs
    ]
    for intron, frames in homingEndonucleases_dict.items()
}

print(homingEndonucleases_dict_reshaped)

We save this dictionary for future use.

In [97]:
# Serialise the reshaped endonuclease dictionary to disk
with open('homingEndonucleases_dict_reshaped.pkl', 'wb') as pickle_out:
    pickle.dump(homingEndonucleases_dict_reshaped, file=pickle_out)

In [None]:
# Restore the reshaped endonuclease dictionary from disk.
# pickle can execute arbitrary code on load: only read files you created yourself.
with open('homingEndonucleases_dict_reshaped.pkl', 'rb') as pickle_in:
    homingEndonucleases_dict_reshaped = pickle.load(file=pickle_in)

In [None]:
# Each intron key maps to the list of its putative endonucleases, so the
# overall total is the sum of the list lengths
total_homing_endonucleases = sum(map(len, homingEndonucleases_dict_reshaped.values()))

print(f"Total number of putative homing endonucleases: {total_homing_endonucleases}")

print(f"These are located in a total of {len(homingEndonucleases_dict_reshaped)} introns")

We now make a dictionary where each key is a number of endonucleases per intron and each value is a list with the introns that have that number of endonucleases

In [None]:
# How many introns carry 1, 2, 3, ... putative endonucleases
endonuclease_counter = Counter(map(len, homingEndonucleases_dict_reshaped.values()))

print(endonuclease_counter)

# Group the intron keys under a readable label such as "1 endonuclease" or
# "2 endonucleases"; labels appear in order of first occurrence
introns_by_number_endonucleases = {}
for key, value in homingEndonucleases_dict_reshaped.items():
    count = len(value)
    if count == 1:
        label = f"{count} endonuclease"
    else:
        label = f"{count} endonucleases"
    introns_by_number_endonucleases.setdefault(label, []).append(key)

We can extract the key for the intron with 4 detected endonucleases

In [None]:
# Look up the introns filed under the "4 endonucleases" label (raises KeyError
# if no intron has exactly four)
four_endonuclease_introns = introns_by_number_endonucleases["4 endonucleases"]
print(four_endonuclease_introns)

Now we are going to compare the histogram of lengths for all introns and for introns where homing endonucleases were detected

In [None]:
# Length of every classified intron
all_intron_sizes = []
for intron in classified_introns.values():
    all_intron_sizes.append(len(intron["sequence"]))

# Length of every intron with at least one putative homing endonuclease
# (keys missing from classified_introns are skipped)
homing_endonuclease_intron_sizes = []
for intron in homingEndonucleases_dict_reshaped:
    if intron in classified_introns:
        homing_endonuclease_intron_sizes.append(len(classified_introns[intron]["sequence"]))

min_size = min(all_intron_sizes)
max_size = max(all_intron_sizes)

plt.rc('font', family='Arial', size=8)
plt.figure(figsize=(15/2.54, 7.5/2.54))  # figure size given in cm, converted to inches

# Both histograms use the same bins and range and are normalised to densities,
# so the two distributions can be overlaid directly
shared_hist_options = dict(bins=60, alpha=0.5, range=(min_size, max_size), density=True)
plt.hist(all_intron_sizes, color="blue", label='All introns', **shared_hist_options)
plt.hist(homing_endonuclease_intron_sizes, color="red", label='Introns with homing endonucleases', **shared_hist_options)

plt.xlabel('Intron size')
plt.ylabel('Density')
plt.legend()
plt.savefig('Figure1d.png', dpi=300, bbox_inches='tight')
plt.show()

Now we are going to perform secondary structure predictions. First let's set up the arnie.conf file

In [5]:
# Write arnie.conf into the current working directory, one "<name>: <path>"
# entry per line. The first echo uses ">" and so creates or overwrites the
# file; the following ones append with ">>". All paths are placeholders.
!echo "eternafold: /path/to/EternaFold/src" > arnie.conf
!echo "vienna_2: /path/to/ViennaRNA/bin" >> arnie.conf
!echo "nupack: /path/to/nupack3/bin" >> arnie.conf
!echo "contrafold_2: /path/to/contrafold-se/src" >> arnie.conf
!echo "rnastructure: /path/to/RNAstructure/exe" >> arnie.conf
!echo "rnasoft: /path/to/MultiRNAFold" >> arnie.conf
!echo "linearfold: /path/to/LinearFold/bin" >> arnie.conf
!echo "linearpartition: /path/to/LinearPartition/bin" >> arnie.conf
!echo "spotrna: /path/to/SPOT-RNA" >> arnie.conf
!echo "ipknot: /path/to/ipknot/bin" >> arnie.conf
!echo "TMP: /path/to/tmp/folder/" >> arnie.conf

# Both environment variables are set before arnie is imported below.
# NOTE(review): arnie.conf is written to the working directory above while
# ARNIEFILE points at '/path/to/arnie.conf' - make sure both refer to the same
# file once the placeholders are filled in.
os.environ["ARNIEFILE"] = f'/path/to/arnie.conf'
# Presumably the RNAstructure thermodynamic tables location - confirm against the RNAstructure setup
os.environ["DATAPATH"] = f'/path/to/RNAstructure/data_tables'

import arnie

And now we run predictions of secondary structure and base-pairing probabilities for all introns with all available software. We start with minimum free energy (MFE) secondary structure predictions.

In [None]:
from arnie.mfe import mfe

# Replacement applied to every sequence before folding: each IUPAC ambiguity
# code is swapped for one fixed standard base, and T is written as U (RNA).
symbol_mapping = {
    'N': 'A',
    'S': 'C',
    'Y': 'C',
    'B': 'C',
    'R': 'A',
    'D': 'A',
    'W': 'A',
    'K': 'G',
    'V': 'A',
    'H': 'A',
    'M': 'A',
    'T': 'U'
}

def process_intron(key, value):
    """Predict the MFE secondary structure of one intron with four packages.

    Parameters
    ----------
    key : hashable
        Intron identifier; returned unchanged so results can be matched up.
    value : dict
        Intron record; only ``value['sequence']`` is read.

    Returns
    -------
    tuple
        ``(key, eternafold, vienna_2, contrafold_2, rnastructure)``. Each
        entry after the key is whatever ``mfe`` returned for that package,
        or the exception instance it raised (the error is also printed).
    """
    # Coerce to str so a Seq object and a plain string are handled alike
    sequence = str(value['sequence'])
    for non_standard, standard in symbol_mapping.items():
        sequence = sequence.replace(non_standard, standard)

    # (package name, extra keyword arguments for mfe), in return order
    package_calls = (
        ('eternafold', {}),
        ('vienna_2', {'DEBUG': True}),
        ('contrafold_2', {'viterbi': True}),
        ('rnastructure', {'pseudo': False}),
    )

    structures = []
    original_cwd = os.getcwd()
    with tempfile.TemporaryDirectory() as tmpdir:
        # Run inside a scratch directory (as the original code did)
        os.chdir(tmpdir)
        try:
            for package, extra_kwargs in package_calls:
                try:
                    structures.append(mfe(sequence, package=package, **extra_kwargs))
                except Exception as e:
                    print(f"Error processing intron {key} with {package}: {e}")
                    structures.append(e)
        finally:
            # Leave the scratch directory before it is deleted; otherwise the
            # process would be left with a working directory that no longer exists
            os.chdir(original_cwd)
    return (key, *structures)

# One result dictionary per package, keyed by intron
eternafold_MFE_secondary_structures = {}
vienna_MFE_secondary_structures = {}
contrafold_MFE_secondary_structures = {}
rnastructure_MFE_secondary_structures = {}

# Same order as the structures in the tuple returned by process_intron
mfe_result_dicts = (
    eternafold_MFE_secondary_structures,
    vienna_MFE_secondary_structures,
    contrafold_MFE_secondary_structures,
    rnastructure_MFE_secondary_structures,
)

# Fold all introns in parallel worker processes and file each result as soon
# as its task completes
with concurrent.futures.ProcessPoolExecutor() as executor:
    futures = {executor.submit(process_intron, key, value): key for key, value in classified_introns.items()}
    pbar = tqdm(total=len(futures), desc="Processing introns", dynamic_ncols=True)
    for future in concurrent.futures.as_completed(futures):
        key, *structures = future.result()
        for result_dict, structure in zip(mfe_result_dicts, structures):
            result_dict[key] = structure
        pbar.update(1)
    pbar.close()

# Save the four dictionaries together, keyed by package name
with open('results_secondary_structures_MFE.pkl', 'wb') as f:
    pickle.dump({
        'eternafold': eternafold_MFE_secondary_structures,
        'vienna': vienna_MFE_secondary_structures,
        'contrafold': contrafold_MFE_secondary_structures,
        'rnastructure': rnastructure_MFE_secondary_structures
    }, f)

In [100]:
# Reload the saved MFE predictions and unpack them per package.
# pickle can execute arbitrary code on load: only read files you created yourself.
with open('results_secondary_structures_MFE.pkl', 'rb') as f:
    MFE_secondary_structures = pickle.load(f)

(
    eternafold_MFE_secondary_structures,
    vienna_MFE_secondary_structures,
    contrafold_MFE_secondary_structures,
    rnastructure_MFE_secondary_structures,
) = (
    MFE_secondary_structures[package]
    for package in ('eternafold', 'vienna', 'contrafold', 'rnastructure')
)

Next we are going to calculate base pair probability (BPP) matrices for all introns with each package. Note that this will take a long time and require large amounts of RAM.

In [None]:
from arnie.bpps import bpps

# Replacement applied to every sequence before folding: each IUPAC ambiguity
# code is swapped for one fixed standard base, and T is written as U (RNA).
symbol_mapping = {
    'N': 'A',
    'S': 'C',
    'Y': 'C',
    'B': 'C',
    'R': 'A',
    'D': 'A',
    'W': 'A',
    'K': 'G',
    'V': 'A',
    'H': 'A',
    'M': 'A',
    'T': 'U'
}

def process_intron_bpps(key, value):
    """Compute the base-pair probability matrix of one intron with six packages.

    Parameters
    ----------
    key : hashable
        Intron identifier; returned unchanged so results can be matched up.
    value : dict
        Intron record; only ``value['sequence']`` is read.

    Returns
    -------
    tuple
        ``(key, eternafold, vienna_2, contrafold_2, rnastructure, rnasoft,
        nupack)``. Each entry after the key is whatever ``bpps`` returned for
        that package, or the exception instance it raised (the error is also
        printed).
    """
    sequence = str(value['sequence'])
    for non_standard, standard in symbol_mapping.items():
        sequence = sequence.replace(non_standard, standard)

    # Packages in return order
    packages = ('eternafold', 'vienna_2', 'contrafold_2', 'rnastructure', 'rnasoft', 'nupack')

    matrices = []
    original_cwd = os.getcwd()
    with tempfile.TemporaryDirectory() as tmpdir:
        # Run inside a scratch directory (as the original code did)
        os.chdir(tmpdir)
        try:
            for package in packages:
                try:
                    matrices.append(bpps(sequence, package=package))
                except Exception as e:
                    print(f"Error processing intron {key} with {package}: {e}")
                    matrices.append(e)
        finally:
            # Leave the scratch directory before it is deleted; otherwise the
            # process would be left with a working directory that no longer exists
            os.chdir(original_cwd)
    return (key, *matrices)

# NOTE: the matrices themselves are written to disk below; these six
# dictionaries are kept for backward compatibility and are not filled here.
eternalfold_BPPS_matrixes = {}
vienna_BPPS_matrixes = {}
contrafold_BPPS_matrixes = {}
rnastructure_BPPS_matrixes = {}
rnasoft_BPPS_matrixes = {}
nupack_BPPS_matrixes = {}

# Per package: intron key -> exception raised while computing its matrix
eternalfold_BPPS_matrixes_errors = {}
vienna_BPPS_matrixes_errors = {}
contrafold_BPPS_matrixes_errors = {}
rnastructure_BPPS_matrixes_errors = {}
rnasoft_BPPS_matrixes_errors = {}
nupack_BPPS_matrixes_errors = {}

# (output sub-directory / file suffix, error dictionary), in the same order as
# the matrices in the tuple returned by process_intron_bpps
bpps_outputs = [
    ("eternafold", eternalfold_BPPS_matrixes_errors),
    ("vienna", vienna_BPPS_matrixes_errors),
    ("contrafold", contrafold_BPPS_matrixes_errors),
    ("rnastructure", rnastructure_BPPS_matrixes_errors),
    ("rnasoft", rnasoft_BPPS_matrixes_errors),
    ("nupack", nupack_BPPS_matrixes_errors),
]

# Make sure every output directory exists before the (long) computation starts
for package_name, _ in bpps_outputs:
    os.makedirs(os.path.join("BPPS_all", package_name), exist_ok=True)

with concurrent.futures.ProcessPoolExecutor() as executor:
    futures = {executor.submit(process_intron_bpps, key, value): key for key, value in classified_introns.items()}
    # The context manager closes the progress bar even if a task fails
    with tqdm(total=len(futures), desc="Processing introns", dynamic_ncols=True) as pbar:
        for future in concurrent.futures.as_completed(futures):
            key, *matrices = future.result()
            for (package_name, errors), matrix in zip(bpps_outputs, matrices):
                if isinstance(matrix, Exception):
                    errors[key] = matrix
                else:
                    with open(f"BPPS_all/{package_name}/{key}_{package_name}.npy", "wb") as f:
                        np.save(f, matrix)
            # Drop our reference to the finished future so its matrices can be
            # garbage-collected instead of piling up until the whole run ends
            del futures[future]
            pbar.update(1)

BPPs run successfully with all software for all introns except for a set of 106 introns, which fail for RNAsoft.

In [None]:
# Keys of the introns whose rnastructure BPP computation raised an error.
# NOTE(review): the accompanying text says the failures come from RNAsoft, while
# this reads the rnastructure error dictionary - confirm which one is intended.
failed_keys_rnastructure_BPPS_matrixes = list(rnastructure_BPPS_matrixes_errors)

Now let's extract the preceding and following exonic context. Extracted intron sequences already contained the last base of the preceding exon and the first base of the following exon; these are included again in the exonic contexts. Up to 30 bases of each are extracted (if there are not enough bases available, extracted sequences are padded with "-").

In [None]:
# Exonic context of every intron, keyed like classified_introns
classified_introns_preceding_exons = {}
classified_introns_following_exons = {}
# Number of bases to extract on each side
number_bases_preceding_exon = 30
number_bases_following_exon = 30

for key, value in tqdm(classified_introns.items(), total=len(classified_introns)):
    start_position_intron = value['startPosition']
    end_position_intron = value['endPosition'] + 1
    genbankID = value['GenBankID']
    genbank_record = infernal_NT_search_genbank_recs[genbankID]
    # Records without a defined sequence have theirs stored separately
    if genbank_record.seq.defined:
        genbank_sequence_raw = genbank_record.seq
    else:
        genbank_sequence_raw = seqs_of_records_with_undefined_seqs[genbankID]
    # Normalise to the DNA alphabet. The slices below must use this normalised
    # sequence: previously it was computed and then ignored in favour of the
    # raw sequence.
    genbank_sequence = genbank_sequence_raw.replace('U', 'T')
    source = value['boundariesSource']
    reverseStrand = "reverseStrand" in source
    if reverseStrand:
        # On the reverse strand the following exon lies upstream in record
        # coordinates and the preceding exon downstream; both are
        # reverse-complemented to read 5'->3' on the intron's strand
        following_exon_seq = genbank_sequence[max((start_position_intron - number_bases_following_exon + 1), 0):(start_position_intron+1)].reverse_complement()
        preceding_exon_seq = genbank_sequence[(end_position_intron-1):(end_position_intron + number_bases_preceding_exon - 1)].reverse_complement()
    else:
        following_exon_seq = genbank_sequence[(end_position_intron-1):(end_position_intron + number_bases_following_exon - 1)]
        preceding_exon_seq = genbank_sequence[max((start_position_intron - number_bases_preceding_exon + 1), 0):(start_position_intron+1)]
    # Pad with "-" when the record ends before the requested number of bases:
    # on the right for the following exon, on the left for the preceding one
    if len(following_exon_seq) < number_bases_following_exon:
        following_exon_seq = following_exon_seq + "-"*(number_bases_following_exon - len(following_exon_seq))
    if len(preceding_exon_seq) < number_bases_preceding_exon:
        preceding_exon_seq = "-"*(number_bases_preceding_exon - len(preceding_exon_seq)) + preceding_exon_seq
    classified_introns_preceding_exons[key] = preceding_exon_seq
    classified_introns_following_exons[key] = following_exon_seq


In [84]:
# Save both exon-context dictionaries, each to its own pickle file
for pickle_path, exon_contexts in (
    ("classified_introns_preceding_exons.pkl", classified_introns_preceding_exons),
    ("classified_introns_following_exons.pkl", classified_introns_following_exons),
):
    with open(pickle_path, "wb") as f:
        pickle.dump(exon_contexts, f)

Finally we create the database files. First the main dataframe of all introns

In [117]:
# Column layout of the intron database; each row below follows this order
intron_database_columns = [
    'intronID', 'GenBankID', 'startPosition', 'endPosition', 'intronSequence',
    'intronLength', 'strand', 'organism', 'organismType', 'subcellularLocation',
    'organismTaxID', 'precedingExonSequence', 'followingExonSequence',
    'eternafoldSecondaryStructure', 'viennaSecondaryStructure',
    'contrafoldSecondaryStructure', 'rnastructureSecondaryStructure',
]

# Location used when a record carries no organelle annotation; anything not
# listed here is assigned to the nucleus
default_location_by_organism_type = {"bacteria": 'cytoplasm', "virus": 'virion'}

data = []
for key, value in classified_introns.items():
    organismType = value['organismType']
    subcellularLocation = value['organelle']
    if subcellularLocation == '':
        subcellularLocation = default_location_by_organism_type.get(organismType, 'nucleus')
    strand = "reverse" if "reverseStrand" in value['boundariesSource'] else "forward"
    data.append([
        key,
        value['GenBankID'],
        int(value['startPosition']) + 1,   # stored positions are shifted by one for the database
        int(value['endPosition']) + 1,
        str(value['sequence']),
        value['length'],
        strand,
        value['organism'],
        organismType,
        subcellularLocation,
        value['taxID'].split(":")[1],      # keep only the part after the first ":"
        str(classified_introns_preceding_exons[key]),
        str(classified_introns_following_exons[key]),
        eternafold_MFE_secondary_structures[key],
        vienna_MFE_secondary_structures[key],
        contrafold_MFE_secondary_structures[key],
        rnastructure_MFE_secondary_structures[key],
    ])

intronDatabaseDF = pd.DataFrame(data, columns=intron_database_columns)

In [119]:

# Export the intron table without the DataFrame index column
intronDatabaseDF.to_csv(path_or_buf="intronDatabase.csv", index=False)

And now the putative endonucleases dataframe. Note that the column intronID is shared between both database files.

In [135]:
# One row per putative endonuclease ORF. 'startInIntronSeq' is shifted by one
# for the database, and three nucleotides are added to 'endInIntronSeq'.
data = [
    [
        key,
        classified_introns[key]['GenBankID'],
        orf['startInIntronSeq'] + 1,
        orf['endInIntronSeq'] + 3,
        orf['sequence'],
        orf['geneticCode'],
        orf['readingFrame'],
    ]
    for key, value in homingEndonucleases_dict_reshaped.items()
    for orf in value
]

endonuclease_columns = ['intronID', 'GenBankID', 'startPositionInIntron', 'endPositionInIntron', 'putativeSequence', 'geneticCode', 'readingFrame']
homingEndonucleasesDF = pd.DataFrame(data, columns=endonuclease_columns)

In [137]:
# Export the endonuclease table without the DataFrame index column
homingEndonucleasesDF.to_csv(path_or_buf="homingEndonucleasesDF.csv", index=False)