In [None]:
import time
import sys
import os
import glob
import math
import threading
import concurrent.futures as cf
import random
import re

import numpy as np
import pandas as pd
import tensorflow as tf
from keras import Input, Model, layers, metrics, losses, callbacks, optimizers, models, utils
from keras import backend as K
import gc
import keras_tuner as kt
from pyfaidx import Fasta

K.clear_session()
gc.collect()

datasets_path = "../../Datasets/"
models_path = "../../Models/"

2025-03-18 01:15:49.937379: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-03-18 01:15:50.052754: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-03-18 01:15:50.088427: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-18 01:15:50.387467: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


This notebook was used to calculate intron boundaries using the hg38 basic annotations gtf.

In [None]:
def load_gtf_annotations(gtf_file):
    """
    Loads GTF into a pandas DataFrame and converts cstart and cend to zero-based indexing.
    """
    gtf_data = pd.read_csv(
        gtf_file, sep='\t', comment='#', header=None,
        names=['seqname', 'source', 'feature', 'cstart', 'cend', 
               'score', 'strand', 'frame', 'attribute']
    )
    # Convert to zero-based indexing for cstart
    gtf_data['cstart'] = gtf_data['cstart'] - 1
    return gtf_data

In [None]:
annotation_data = load_gtf_annotations(datasets_path + 'basic_annotations.gtf')
# annotation_data = annotation_data[annotation_data["seqname"]!="chrM"]
annotation_data.head()

Unnamed: 0,seqname,source,feature,cstart,cend,score,strand,frame,attribute
0,chr1,HAVANA,gene,11120,24894,.,+,.,"gene_id ""ENSG00000290825.2""; gene_type ""lncRNA..."
1,chr1,HAVANA,transcript,11425,14409,.,+,.,"gene_id ""ENSG00000290825.2""; transcript_id ""EN..."
2,chr1,HAVANA,exon,11425,11671,.,+,.,"gene_id ""ENSG00000290825.2""; transcript_id ""EN..."
3,chr1,HAVANA,exon,12009,12227,.,+,.,"gene_id ""ENSG00000290825.2""; transcript_id ""EN..."
4,chr1,HAVANA,exon,12612,12721,.,+,.,"gene_id ""ENSG00000290825.2""; transcript_id ""EN..."


In [8]:
seqnames = annotation_data["seqname"]
unique_features = seqnames.unique()
print(len(unique_features))
print(unique_features)

25
['chr1' 'chr2' 'chr3' 'chr4' 'chr5' 'chr6' 'chr7' 'chr8' 'chr9' 'chr10'
 'chr11' 'chr12' 'chr13' 'chr14' 'chr15' 'chr16' 'chr17' 'chr18' 'chr19'
 'chr20' 'chr21' 'chr22' 'chrX' 'chrY' 'chrM']


In [9]:
exons = annotation_data[annotation_data["feature"]=="exon"]
genes = annotation_data[annotation_data["feature"]=="gene"]

In [10]:
chr_exons = exons[exons['seqname']!='chrM']

In [6]:
rows, columns = genes.shape
print(rows) 

63049


In [6]:
exons.head()

Unnamed: 0,seqname,source,feature,cstart,cend,score,strand,frame,attribute
2,chr1,HAVANA,exon,11868,12227,.,+,.,"gene_id ""ENSG00000290825.1""; transcript_id ""EN..."
3,chr1,HAVANA,exon,12612,12721,.,+,.,"gene_id ""ENSG00000290825.1""; transcript_id ""EN..."
4,chr1,HAVANA,exon,13220,14409,.,+,.,"gene_id ""ENSG00000290825.1""; transcript_id ""EN..."
7,chr1,HAVANA,exon,12009,12057,.,+,.,"gene_id ""ENSG00000223972.6""; transcript_id ""EN..."
8,chr1,HAVANA,exon,12178,12227,.,+,.,"gene_id ""ENSG00000223972.6""; transcript_id ""EN..."


In [5]:
exons.describe()

Unnamed: 0,cstart,cend
count,1668590.0,1668590.0
mean,76165180.0,76165450.0
std,56260890.0,56260890.0
min,11102.0,11595.0
25%,32540610.0,32541210.0
50%,63821130.0,63821380.0
75%,111967400.0,111967500.0
max,248936600.0,248937000.0


In [7]:
shared_starts = exons.groupby(['seqname', 'feature', 'cstart', 'strand']).size().reset_index(name='count')
shared_ends = exons.groupby(['seqname', 'feature', 'cend', 'strand']).size().reset_index(name='count')
shared_both = exons.groupby(['seqname', 'feature', 'cstart', 'cend', 'strand']).size().reset_index(name='count')

In [8]:
shared_starts.describe()

Unnamed: 0,cstart,count
count,522134.0,522134.0
mean,74855830.0,3.195712
std,55960110.0,5.246876
min,11102.0,1.0
25%,31412900.0,1.0
50%,61860450.0,1.0
75%,110713700.0,3.0
max,248936600.0,229.0


In [9]:
shared_ends.describe()

Unnamed: 0,cend,count
count,521656.0,521656.0
mean,74889120.0,3.19864
std,55932220.0,5.252831
min,11595.0,1.0
25%,31426900.0,1.0
50%,61920270.0,1.0
75%,110727800.0,3.0
max,248937000.0,235.0


In [11]:
shared_both.describe()

Unnamed: 0,cstart,cend,count
count,670382.0,670382.0,670382.0
mean,74685050.0,74685440.0,2.489014
std,55870400.0,55870410.0,4.339533
min,11102.0,11595.0,1.0
25%,31473500.0,31473750.0,1.0
50%,61639450.0,61641100.0,1.0
75%,110431700.0,110432000.0,2.0
max,248936600.0,248937000.0,215.0


In [7]:
rows, columns = exons.shape
print(rows) 

1668590


In [8]:
rows, columns = chr_exons.shape
print(rows) 

1668590


37 genes and thus 37 exons on chrM removed

In [21]:
genes.head()

Unnamed: 0,seqname,source,feature,cstart,cend,score,strand,frame,attribute
0,chr1,HAVANA,gene,11868,14409,.,+,.,"gene_id ""ENSG00000290825.1""; gene_type ""lncRNA..."
5,chr1,HAVANA,gene,12009,13670,.,+,.,"gene_id ""ENSG00000223972.6""; gene_type ""transc..."
13,chr1,HAVANA,gene,14695,24886,.,-,.,"gene_id ""ENSG00000227232.6""; gene_type ""unproc..."
25,chr1,ENSEMBL,gene,17368,17436,.,-,.,"gene_id ""ENSG00000278267.1""; gene_type ""miRNA""..."
28,chr1,HAVANA,gene,29553,31109,.,+,.,"gene_id ""ENSG00000243485.5""; gene_type ""lncRNA..."
...,...,...,...,...,...,...,...,...,...
3467013,chrM,ENSEMBL,gene,576,647,.,+,.,"gene_id ""ENSG00000210049.1""; gene_type ""Mt_tRN..."
3467016,chrM,ENSEMBL,gene,647,1601,.,+,.,"gene_id ""ENSG00000211459.2""; gene_type ""Mt_rRN..."
3467019,chrM,ENSEMBL,gene,1601,1670,.,+,.,"gene_id ""ENSG00000210077.1""; gene_type ""Mt_tRN..."
3467022,chrM,ENSEMBL,gene,1670,3229,.,+,.,"gene_id ""ENSG00000210082.2""; gene_type ""Mt_rRN..."


In [10]:
exons.head()

Unnamed: 0,seqname,source,feature,cstart,cend,score,strand,frame,attribute
2,chr1,HAVANA,exon,11868,12227,.,+,.,"gene_id ""ENSG00000290825.1""; transcript_id ""EN..."
3,chr1,HAVANA,exon,12612,12721,.,+,.,"gene_id ""ENSG00000290825.1""; transcript_id ""EN..."
4,chr1,HAVANA,exon,13220,14409,.,+,.,"gene_id ""ENSG00000290825.1""; transcript_id ""EN..."
7,chr1,HAVANA,exon,12009,12057,.,+,.,"gene_id ""ENSG00000223972.6""; transcript_id ""EN..."
8,chr1,HAVANA,exon,12178,12227,.,+,.,"gene_id ""ENSG00000223972.6""; transcript_id ""EN..."
...,...,...,...,...,...,...,...,...,...
3467015,chrM,ENSEMBL,exon,576,647,.,+,.,"gene_id ""ENSG00000210049.1""; transcript_id ""EN..."
3467018,chrM,ENSEMBL,exon,647,1601,.,+,.,"gene_id ""ENSG00000211459.2""; transcript_id ""EN..."
3467021,chrM,ENSEMBL,exon,1601,1670,.,+,.,"gene_id ""ENSG00000210077.1""; transcript_id ""EN..."
3467024,chrM,ENSEMBL,exon,1670,3229,.,+,.,"gene_id ""ENSG00000210082.2""; transcript_id ""EN..."


In [9]:
annotation_data.describe()

Unnamed: 0,cstart,cend
count,3467013.0,3467013.0
mean,75893030.0,75896860.0
std,56157570.0,56157830.0
min,11102.0,11595.0
25%,32607650.0,32611280.0
50%,63451270.0,63466370.0
75%,111624300.0,111632400.0
max,248936600.0,248937000.0


In [10]:
features = annotation_data["feature"]
unique_features = features.unique()
print(len(unique_features))
print(unique_features)

In [None]:
def calculate_introns(gtf_df):
    """
    Given a pandas DataFrame of GTF records (with columns: 'seqname', 'feature', 'start',
    'end', 'strand', and 'attribute') in which the start (called cstart) has been converted
    to 0-indexed values, calculate the introns for each gene.
    
    This function assumes that the attribute field contains a gene identifier in a form like:
       gene_id "XYZ";
    as it does in the hg38 gtf and groups features by gene_id. For each gene, it collects the exon 
    intervals, merges overlapping exons (forms the union of exonic regions) and then computes each intron 
    as the gap between consecutive merged exons. For plus-strand genes the intron is reported
    as (previous_exon.end, next_exon.start), while for minus-strand genes the order is reversed so
    that the cstart value is higher than the cend.
    
    Returns:
        A new DataFrame with one row per intron, having columns:
          - seqname
          - feature (with value "intron")
          - cstart (the start coordinate in 0-index system; note that for minus strand this is numerically higher)
          - cend   (the end coordinate)
          - strand
    """
    
    # helper to extract gene_id from the attribute string
    def get_gene_id(attr):
        # look for a pattern like: gene_id "XYZ";
        m = re.search(r'gene_id\s+"([^"]+)"', attr)
        if m:
            return m.group(1)
        else:
            return None

    # Add a gene_id column (if not already present)
    if 'gene_id' not in gtf_df.columns:
        gtf_df = gtf_df.copy()  # avoid modifying the original dataframe
        gtf_df['gene_id'] = gtf_df['attribute'].apply(get_gene_id)
    
    intron_records = []
    
    # Group rows by gene_id (each gene should have one gene-level record, and one or more exon records)
    for gene_id, group in gtf_df.groupby('gene_id'):
        # Skip groups with no gene_id (if any)
        if gene_id is None:
            continue
        
        # Identify the gene-level information (if available)
        gene_rows = group[group['feature'] == 'gene']
        if not gene_rows.empty:
            # Use the gene row to get the chromosome and strand.
            seqname = gene_rows.iloc[0]['seqname']
            strand  = gene_rows.iloc[0]['strand']
            gene_start = gene_rows.iloc[0]['cstart']
            gene_end   = gene_rows.iloc[0]['cend']
        else:
            # Fall back on the first exon if no gene record is available.
            seqname = group.iloc[0]['seqname']
            strand  = group.iloc[0]['strand']
            gene_start = None
            gene_end = None
        
        # Get all exon rows for this gene
        exon_rows = group[group['feature'] == 'exon']
        if exon_rows.empty:
            continue
        
        # Build a list of exon intervals (each as a tuple (start, end))
        # Optionally we could filter to exons that fall within the gene boundaries.
        exon_intervals = list(zip(exon_rows['cstart'], exon_rows['cend']))
        
        # Sort by start (genomic order)
        exon_intervals = sorted(exon_intervals, key=lambda x: x[0])
        
        # Merge overlapping or adjacent exons.
        # (For example, if two exons overlap because of alternative splicing, we want the union.)
        merged_exons = []
        for interval in exon_intervals:
            if not merged_exons:
                merged_exons.append(list(interval))
            else:
                last = merged_exons[-1]
                # If the current exon overlaps or touches the previous one, merge them.
                if interval[0] <= last[1]:
                    last[1] = max(last[1], interval[1])
                else:
                    merged_exons.append(list(interval))
        
        # If there is fewer than two merged exons, then there is no intron.
        if len(merged_exons) < 2:
            continue
        
        # For plus-strand genes, process the merged exons in genomic (ascending) order.
        if strand == '+':
            # For each adjacent pair of merged exons, define an intron between them.
            for i in range(len(merged_exons) - 1):
                # The intron is the gap between the end of exon i and the start of exon i+1.
                intron_start = merged_exons[i][1]
                intron_end   = merged_exons[i+1][0]
                # Only add if there is a gap.
                if intron_end > intron_start:
                    intron_records.append({
                        'seqname': seqname,
                        'feature': 'intron',
                        'cstart': intron_start,
                        'cend': intron_end,
                        'strand': strand
                    })
        else:
            # For minus-strand genes, the transcript order is reversed.
            # Sort the merged exons in descending order by their start coordinate.
            merged_exons_desc = sorted(merged_exons, key=lambda x: x[0], reverse=True)
            for i in range(len(merged_exons_desc) - 1):
                # In transcript order the first exon is the one with the higher coordinate.
                # We want to report the intron so that cstart is higher than cend.
                intron_start = merged_exons_desc[i][0]   # start of the upstream exon
                intron_end   = merged_exons_desc[i+1][1]   # end of the downstream exon
                if intron_start > intron_end:
                    intron_records.append({
                        'seqname': seqname,
                        'feature': 'intron',
                        'cstart': intron_start,
                        'cend': intron_end,
                        'strand': strand
                    })
    
    return pd.DataFrame(intron_records)


In [None]:
annotation_data = load_gtf_annotations(datasets_path + 'basic_annotations.gtf')
annotation_data = annotation_data[annotation_data["seqname"]!="chrM"]
introns = calculate_introns(annotation_data)
print(len(introns))

324792


In [8]:
introns.head()

Unnamed: 0,seqname,feature,cstart,cend,strand
0,chrX,intron,100636607,100635746,-
1,chrX,intron,100635557,100635252,-
2,chrX,intron,100635177,100634029,-
3,chrX,intron,100633930,100633539,-
4,chrX,intron,100633404,100632568,-


In [9]:
trimmed_annotation_data = annotation_data[["seqname", "feature", "cstart", "cend", "strand"]]
IntronExonDF = pd.concat([trimmed_annotation_data, introns])

In [None]:
IntronExonDF.to_csv(datasets_path + 'IntronExonDF.csv', index=False)

In [None]:
introns.to_csv(datasets_path + 'BetterIntrons.csv', index=False)

In [None]:
def swap_columns_if_needed(df, col_a, col_b):
    """
    Turns out the (-) strand lists cstart as smaller than cend.  This fixes the output from
    the above function that calculated intron boundaries.
    For each row in the dataframe, if the value in col_a is greater than the value in col_b,
    swap the two values.

    Parameters:
        df (pd.DataFrame): The dataframe to process.
        col_a (str): The name of the first column.
        col_b (str): The name of the second column.

    Returns:
        pd.DataFrame: The dataframe with swapped values where needed.
    """
    # Create a boolean mask where the value in col_a is greater than col_b.
    mask = df[col_a] > df[col_b]
    
    # Swap the values in col_a and col_b for rows where mask is True.
    df.loc[mask, [col_a, col_b]] = df.loc[mask, [col_b, col_a]].values
    
    return df

In [14]:
FixedIntronExonDF = swap_columns_if_needed(IntronExonDF, 'cstart', 'cend')

In [15]:
print(FixedIntronExonDF["feature"].unique())

['gene' 'transcript' 'exon' 'CDS' 'start_codon' 'stop_codon' 'UTR'
 'Selenocysteine' 'intron']


In [17]:
Trimmed_Intron_Exon_DF = FixedIntronExonDF[((FixedIntronExonDF["feature"]=="exon") | (FixedIntronExonDF["feature"]=="intron"))]

In [18]:
print(Trimmed_Intron_Exon_DF["feature"].unique())

['exon' 'intron']


In [19]:
Trimmed_Intron_Exon_DF = Trimmed_Intron_Exon_DF[["seqname", "feature", "cstart", "cend", "strand"]]

In [21]:
print(Trimmed_Intron_Exon_DF.sample(10))

        seqname feature     cstart       cend strand
1627030   chr15    exon   88905756   88905901      -
450582     chr3    exon   98849460   98849598      -
1820139   chr17    exon   63862288   63862531      +
208742     chr1    exon  241098665  241098762      -
739233     chr6    exon   29945233   29945281      +
139817    chr14  intron   88695454   88696846      -
1185731   chr10    exon  112951482  112951607      +
785077     chr6    exon   85536791   85536924      -
270406     chr3  intron  170467402  170474264      +
2194049    chrX    exon  119616711  119616742      -


On human genome UCSC browser, due to indexing 1 on their end and 0 in python, cstart here is the last base to the right of the feature on the browser.
cend looks like the correct spot, but only because python excludes the last base which cancels out the off by 1 issue

Printing line by line swaps back into 1 indexing in a .txt file so locations are accurate as long as line 1 has the first base.

In [None]:
Trimmed_Intron_Exon_DF.to_csv(datasets_path + "FinalIntronExonDF.csv", index=False)

In [None]:
check_exons = load_gtf_annotations(datasets_path + 'basic_annotations.gtf')
check_exons = check_exons[check_exons["feature"]=="exon"]
check_exons.describe()

Unnamed: 0,cstart,cend
count,1007358.0,1007358.0
mean,76234050.0,76234340.0
std,56456800.0,56456800.0
min,576.0,647.0
25%,32205270.0,32205470.0
50%,63817030.0,63817530.0
75%,112412300.0,112412500.0
max,248936700.0,248937100.0


In [None]:
df = pd.read_csv(datasets_path + 'FinalIntronExonDF.csv')
print(len(df[df['feature']=="intron"]))
print(len(df[df['feature']=="exon"]))
print(len(df[df['seqname']=="chrM"]))

324792
1007321
0


0.2438171536 % Introns

0.7561828464 % Exons

3.101434149:1 Exons : Introns

357737

In [13]:
chr_exons.describe()

Unnamed: 0,cstart,cend
count,1007321.0,1007321.0
mean,76236850.0,76237140.0
std,56455940.0,56455950.0
min,10713.0,10892.0
25%,32206910.0,32207030.0
50%,63820080.0,63820170.0
75%,112415200.0,112415900.0
max,248936700.0,248937100.0
