This notebook was used to explore the GTF file.  It was also used to attempt to calculate the intron boundaries but using the search_gtf_by_range function, 
determined there was some garbage in, garbage out going on with the hg38 comprehensive annotation due to the inclusion of every possible variant.  A significant fraction of introns were missing/wrong.
It's only included because the search function is nice.

In [None]:
import time
import sys
import os
import glob
import math
import threading
import concurrent.futures as cf

import numpy as np
import pandas as pd
import tensorflow as tf
from keras import Input, Model, layers, metrics, losses, callbacks, optimizers, models, utils
from keras import backend as K
import gc
import keras_tuner as kt
from pyfaidx import Fasta

K.clear_session()
gc.collect()

datasets_path = "../../Datasets/"
models_path = "../../Models/"

In [2]:
def load_gtf_annotations(gtf_file):
    """
    Loads GTF into a pandas DataFrame and converts cstart and cend to zero-based indexing.
    """
    gtf_data = pd.read_csv(
        gtf_file, sep='\t', comment='#', header=None,
        names=['seqname', 'source', 'feature', 'cstart', 'cend', 
               'score', 'strand', 'frame', 'attribute']
    )
    # Convert to zero-based indexing for cstart
    gtf_data['cstart'] = gtf_data['cstart'] - 1
    return gtf_data

In [3]:
def search_gtf_by_range(gtf_df, seqname, pos_min, pos_max, require_both=False):
    """
    Search a GTF annotations DataFrame for rows matching a given sequence name and 
    having cstart and/or cend values within a specified range.

    Parameters:
      gtf_df (pd.DataFrame): DataFrame containing GTF annotations. Must include at least 
                             the columns 'seqname', 'cstart', and 'cend'.
      seqname (str): The chromosome or scaffold name to filter by (e.g., 'chr1' or '1').
      pos_min (int): The lower bound of the position range (inclusive).
      pos_max (int): The upper bound of the position range (inclusive).
      require_both (bool): 
           - If False (default), returns rows where either 'cstart' OR 'cend' falls within the range.
           - If True, returns only rows where BOTH 'cstart' and 'cend' fall within the range.
    
    Returns:
      pd.DataFrame: A DataFrame containing only the rows that match the criteria.
    """
    # Filter by seqname first.
    df = gtf_df[gtf_df['seqname'] == seqname]
    
    if require_both:
        condition = (
            (df['cstart'] >= pos_min) & (df['cstart'] <= pos_max) &
            (df['cend']   >= pos_min) & (df['cend']   <= pos_max)
        )
    else:
        condition = (
            ((df['cstart'] >= pos_min) & (df['cstart'] <= pos_max)) |
            ((df['cend']   >= pos_min) & (df['cend']   <= pos_max))
        )
        
    return df[condition]

In [3]:
gtf = load_gtf_annotations("chr_annotations.gtf")

In [4]:
search_df = search_gtf_by_range(gtf, "chr1", 42762000, 42763000, require_both=False)

In [7]:
print(search_df[search_df["feature"]=="exon"])

      seqname   source feature    cstart      cend score strand frame  \
86822    chr1  ENSEMBL    exon  42762322  42762475     .      -     .   
86855    chr1   HAVANA    exon  42762322  42762475     .      -     .   
86872    chr1   HAVANA    exon  42762322  42762475     .      -     .   
86907    chr1   HAVANA    exon  42762322  42762475     .      -     .   
86938    chr1   HAVANA    exon  42762322  42762475     .      -     .   
86992    chr1   HAVANA    exon  42762322  42762403     .      -     .   
87002    chr1   HAVANA    exon  42762322  42762346     .      -     .   
87009    chr1   HAVANA    exon  42759985  42762475     .      -     .   

                                               attribute  
86822  gene_id "ENSG00000117385.16"; transcript_id "E...  
86855  gene_id "ENSG00000117385.16"; transcript_id "E...  
86872  gene_id "ENSG00000117385.16"; transcript_id "E...  
86907  gene_id "ENSG00000117385.16"; transcript_id "E...  
86938  gene_id "ENSG00000117385.16"; transcript

In [8]:
search_df = search_gtf_by_range(gtf, "chr1", 42767000, 42768000, require_both=False)

In [9]:
print(search_df[search_df["feature"]=="exon"])

      seqname   source feature    cstart      cend score strand frame  \
86819    chr1  ENSEMBL    exon  42766506  42767084     .      -     .   
86869    chr1   HAVANA    exon  42766506  42767028     .      -     .   
86904    chr1   HAVANA    exon  42766506  42767012     .      -     .   
86937    chr1   HAVANA    exon  42766506  42767022     .      -     .   
86999    chr1   HAVANA    exon  42766506  42767002     .      -     .   
87008    chr1   HAVANA    exon  42766506  42767017     .      -     .   
87012    chr1   HAVANA    exon  42767244  42767390     .      +     .   
87013    chr1   HAVANA    exon  42767508  42767624     .      +     .   
87016    chr1   HAVANA    exon  42767248  42767390     .      +     .   
87019    chr1   HAVANA    exon  42767508  42767624     .      +     .   
87031    chr1   HAVANA    exon  42767275  42767624     .      +     .   
87042    chr1   HAVANA    exon  42767277  42767390     .      +     .   
87047    chr1   HAVANA    exon  42767285  42767390 

In [10]:
intron_df = pd.read_csv("IntronExonDF.csv")

In [26]:
search_df = search_gtf_by_range(intron_df, "chr1", 42747000, 42768000, require_both=False)

In [27]:
print(search_df[search_df["feature"]=="Intron"])

Empty DataFrame
Columns: [seqname, feature, cstart, cend, strand]
Index: []


In [None]:
# This function didn't actually work
def calculate_introns(df):
    """
    Given a GTF annotation dataframe (with columns including seqname, feature, cstart, cend, strand, attribute)
    where cstart has been adjusted to Python’s 0-based indexing, this function returns a dataframe of
    intron intervals. It finds, for each gene, the exons that belong to it, merges overlapping exons,
    and then calculates the introns as the gaps between these merged exons (only when an exon exists on
    both sides).
    
    The returned dataframe has the columns: seqname, feature (always 'intron'), cstart, cend, strand.
    """
    
    # Helper function to extract gene_id from the attribute string.
    def get_gene_id(attr):
        # Assumes gene_id is formatted like: gene_id "XYZ";
        m = re.search(r'gene_id "([^"]+)"', attr)
        return m.group(1) if m else None

    # Work on a copy so as not to modify the original dataframe.
    df = df.copy()
    # Create a new column with gene ids extracted from the attribute field.
    df['gene_id'] = df['attribute'].apply(get_gene_id)
    
    intron_records = []
    # Process each gene (rows where feature == 'gene')
    genes = df[df['feature'] == 'gene']
    for _, gene_row in genes.iterrows():
        gene_id  = gene_row['gene_id']
        seqname  = gene_row['seqname']
        strand   = gene_row['strand']
        gene_cstart = gene_row['cstart']
        gene_cend   = gene_row['cend']
        
        # Get all exons that belong to this gene.
        gene_exons = df[(df['feature'] == 'exon') & (df['gene_id'] == gene_id)]
        if gene_exons.empty:
            continue  # no exons for this gene, so nothing to do
        
        # Sort exons by their start position.
        gene_exons_sorted = gene_exons.sort_values(by='cstart')
        
        # Merge overlapping (or adjacent) exons.
        merged_exons = []
        for _, exon in gene_exons_sorted.iterrows():
            exon_start = exon['cstart']
            exon_end   = exon['cend']
            if not merged_exons:
                merged_exons.append([exon_start, exon_end])
            else:
                # Compare with the last merged exon.
                last = merged_exons[-1]
                if exon_start <= last[1]:
                    # Overlap (or touching); merge by extending the end if needed.
                    last[1] = max(last[1], exon_end)
                else:
                    merged_exons.append([exon_start, exon_end])
        
        # We need at least two merged exons to have an intron.
        if len(merged_exons) < 2:
            continue
        
        # For each gap between consecutive merged exons, define an intron.
        for i in range(len(merged_exons) - 1):
            intron_start = merged_exons[i][1]   # end of previous exon
            intron_end   = merged_exons[i+1][0]   # start of next exon
            # Only include an intron if there is a gap.
            if intron_end > intron_start:
                intron_records.append({
                    'seqname': seqname,
                    'feature': 'intron',
                    'cstart': intron_start,
                    'cend': intron_end,
                    'strand': strand
                })
    
    # Create and return the final intron dataframe.
    introns_df = pd.DataFrame(intron_records, columns=['seqname', 'feature', 'cstart', 'cend', 'strand'])
    return introns_df


In [22]:
gtf = load_gtf_annotations("chr_annotations.gtf")
introns_df = calculate_introns(gtf)
introns_df.to_csv("New_Introns.csv", index=False)

In [25]:
introns_df.describe()

Unnamed: 0,cstart,cend
count,297278.0,297278.0
mean,75386130.0,75392440.0
std,56112000.0,56112450.0
min,11595.0,12178.0
25%,31437410.0,31445520.0
50%,62554540.0,62560320.0
75%,111633500.0,111646700.0
max,248913900.0,248916600.0


In [32]:
search_df = search_gtf_by_range(introns_df, "chr1", 42747000, 42768000, require_both=False)

In [33]:
print(search_df)

     seqname feature    cstart      cend strand
7333    chr1  intron  42747576  42747722      -
7334    chr1  intron  42747798  42748199      -
7335    chr1  intron  42748743  42749900      -
7336    chr1  intron  42750336  42751640      -
7337    chr1  intron  42751834  42752273      -
7338    chr1  intron  42752369  42752536      -
7339    chr1  intron  42752664  42754868      -
7340    chr1  intron  42755217  42755547      -
7341    chr1  intron  42755637  42756400      -
7342    chr1  intron  42756455  42757782      -
7343    chr1  intron  42757945  42758851      -
7344    chr1  intron  42758983  42759200      -
7345    chr1  intron  42759393  42759985      -
7346    chr1  intron  42762475  42766506      -
7347    chr1  intron  42767624  42769677      +
7351    chr1  intron  42767624  42773545      +


In [38]:
search_df = search_gtf_by_range(gtf, "chr1", 42759980, 42762480, require_both=False)

In [39]:
print(search_df[(search_df["feature"]=="exon") & ((search_df["strand"]=="-"))])

      seqname   source feature    cstart      cend score strand frame  \
86822    chr1  ENSEMBL    exon  42762322  42762475     .      -     .   
86855    chr1   HAVANA    exon  42762322  42762475     .      -     .   
86872    chr1   HAVANA    exon  42762322  42762475     .      -     .   
86907    chr1   HAVANA    exon  42762322  42762475     .      -     .   
86938    chr1   HAVANA    exon  42762322  42762475     .      -     .   
86992    chr1   HAVANA    exon  42762322  42762403     .      -     .   
87002    chr1   HAVANA    exon  42762322  42762346     .      -     .   
87009    chr1   HAVANA    exon  42759985  42762475     .      -     .   

                                               attribute  
86822  gene_id "ENSG00000117385.16"; transcript_id "E...  
86855  gene_id "ENSG00000117385.16"; transcript_id "E...  
86872  gene_id "ENSG00000117385.16"; transcript_id "E...  
86907  gene_id "ENSG00000117385.16"; transcript_id "E...  
86938  gene_id "ENSG00000117385.16"; transcript

In [41]:
print(search_df.loc[86907,:])

seqname                                                   chr1
source                                                  HAVANA
feature                                                   exon
cstart                                                42762322
cend                                                  42762475
score                                                        .
strand                                                       -
frame                                                        .
attribute    gene_id "ENSG00000117385.16"; transcript_id "E...
Name: 86907, dtype: object


In [43]:
for _ in search_df.loc[86907,:]:
    print(_)

chr1
HAVANA
exon
42762322
42762475
.
-
.
gene_id "ENSG00000117385.16"; transcript_id "ENST00000236040.8"; gene_type "protein_coding"; gene_name "P3H1"; transcript_type "protein_coding"; transcript_name "P3H1-201"; exon_number 2; exon_id "ENSE00003691539.1"; level 2; protein_id "ENSP00000236040.4"; transcript_support_level "2"; hgnc_id "HGNC:19316"; tag "basic"; tag "GENCODE_Primary"; tag "CCDS"; ccdsid "CCDS57986.1"; havana_gene "OTTHUMG00000007525.8"; havana_transcript "OTTHUMT00000019791.2";


In [45]:
loc_lst = [86822, 86855, 86872, 86907]
for i in range(len(loc_lst)):
    for _ in search_df.loc[loc_lst[i],:]:
        print(_)

chr1
ENSEMBL
exon
42762322
42762475
.
-
.
gene_id "ENSG00000117385.16"; transcript_id "ENST00000397054.7"; gene_type "protein_coding"; gene_name "P3H1"; transcript_type "protein_coding"; transcript_name "P3H1-204"; exon_number 2; exon_id "ENSE00003691539.1"; level 3; protein_id "ENSP00000380245.3"; transcript_support_level "1"; hgnc_id "HGNC:19316"; tag "basic"; tag "GENCODE_Primary"; tag "CCDS"; ccdsid "CCDS53307.1"; havana_gene "OTTHUMG00000007525.8";
chr1
HAVANA
exon
42762322
42762475
.
-
.
gene_id "ENSG00000117385.16"; transcript_id "ENST00000460031.5"; gene_type "protein_coding"; gene_name "P3H1"; transcript_type "retained_intron"; transcript_name "P3H1-207"; exon_number 2; exon_id "ENSE00003521591.1"; level 2; transcript_support_level "5"; hgnc_id "HGNC:19316"; havana_gene "OTTHUMG00000007525.8"; havana_transcript "OTTHUMT00000329581.2";
chr1
HAVANA
exon
42762322
42762475
.
-
.
gene_id "ENSG00000117385.16"; transcript_id "ENST00000296388.10"; gene_type "protein_coding"; gene_name

In [4]:
import pandas as pd
import re

def calculate_introns(df):
    """
    Given a GTF annotation dataframe (with columns including seqname, source, feature, cstart, cend, strand, attribute)
    where cstart has been adjusted to Python’s 0-based indexing, this function returns a dataframe of
    intron intervals using the most common cstart and cend per gene_id and exon number.
    
    For each gene:
      - Extract all exon rows.
      - Group exons by the exon number (extracted from the attribute column).
      - For each exon group, use the mode of cstart and cend as the representative exon boundaries.
        In the event of a tie:
          * For cstart: pick the smallest candidate among those rows with source "HAVANA". If none, use "ENSEMBL".
          * For cend: pick the largest candidate among those rows with source "HAVANA". If none, use "ENSEMBL".
        If neither source is found among the candidates, fallback to the smallest (for cstart) or largest (for cend)
        candidate overall.
      - Sort the exons by exon number (converted to int when possible).
      - Compute each intron as the gap between the representative boundaries of consecutive exons.
    
    The returned dataframe has the columns: seqname, feature (always 'intron'), cstart, cend, strand.
    """
    
    # Helper function to extract gene_id from the attribute string.
    def get_gene_id(attr):
        m = re.search(r'gene_id "([^"]+)"', attr)
        return m.group(1) if m else None

    # Helper function to extract exon number from the attribute string.
    def get_exon_number(attr):
        m = re.search(r'exon_number "([^"]+)"', attr)
        return m.group(1) if m else None

    # Helper function to compute mode with tie-breaking.
    # For cstart (tie_break_direction='min'): in a tie, prefer HAVANA (smallest) then ENSEMBL (smallest) then overall min.
    # For cend (tie_break_direction='max'): in a tie, prefer HAVANA (largest) then ENSEMBL (largest) then overall max.
    def get_mode_with_preferred(series, group, column_name, tie_break_direction='min'):
        modes = series.mode()
        if len(modes) == 1:
            return modes.iloc[0]
        elif len(modes) > 1:
            candidate_values = set(modes)
            # First try HAVANA rows among the tied candidates.
            havana_rows = group[(group[column_name].isin(candidate_values)) & (group['source'] == 'HAVANA')]
            if not havana_rows.empty:
                if tie_break_direction == 'min':
                    return havana_rows[column_name].min()
                else:
                    return havana_rows[column_name].max()
            # Next, try ENSEMBL rows among the tied candidates.
            ensembl_rows = group[(group[column_name].isin(candidate_values)) & (group['source'] == 'ENSEMBL')]
            if not ensembl_rows.empty:
                if tie_break_direction == 'min':
                    return ensembl_rows[column_name].min()
                else:
                    return ensembl_rows[column_name].max()
            # Fallback: return min (for cstart) or max (for cend) from candidate values overall.
            if tie_break_direction == 'min':
                return min(modes)
            else:
                return max(modes)
        else:
            # Fallback if mode() returns an empty Series.
            return series.iloc[0]
    
    # Work on a copy so as not to modify the original dataframe.
    df = df.copy()
    # Extract gene_id and exon_number from the attribute field.
    df['gene_id'] = df['attribute'].apply(get_gene_id)
    df['exon_number'] = df['attribute'].apply(get_exon_number)
    
    intron_records = []
    
    # Process each gene (rows where feature == 'gene')
    genes = df[df['feature'] == 'gene']
    for _, gene_row in genes.iterrows():
        gene_id = gene_row['gene_id']
        seqname = gene_row['seqname']
        strand  = gene_row['strand']
        
        # Get all exons for this gene.
        gene_exons = df[(df['feature'] == 'exon') & (df['gene_id'] == gene_id)]
        if gene_exons.empty:
            continue
        # Keep only exons with a valid exon_number.
        gene_exons = gene_exons[gene_exons['exon_number'].notnull()]
        if gene_exons.empty:
            continue
        
        # For each exon (grouped by exon_number), determine representative boundaries.
        exon_boundaries = []
        for exon_num, group in gene_exons.groupby('exon_number'):
            rep_start = get_mode_with_preferred(group['cstart'], group, 'cstart', tie_break_direction='min')
            rep_end   = get_mode_with_preferred(group['cend'], group, 'cend', tie_break_direction='max')
            exon_boundaries.append({
                'exon_number': exon_num,
                'rep_start': rep_start,
                'rep_end': rep_end
            })
        
        # Sort exons by exon_number (attempt numeric sorting).
        try:
            sorted_exons = sorted(exon_boundaries, key=lambda x: int(x['exon_number']))
        except ValueError:
            sorted_exons = sorted(exon_boundaries, key=lambda x: x['exon_number'])
        
        # Need at least two exons to form an intron.
        if len(sorted_exons) < 2:
            continue
        
        # For each gap between consecutive exons, define an intron.
        for i in range(len(sorted_exons) - 1):
            intron_start = sorted_exons[i]['rep_end']   # End of previous exon.
            intron_end   = sorted_exons[i+1]['rep_start']  # Start of next exon.
            if intron_end > intron_start:
                intron_records.append({
                    'seqname': seqname,
                    'feature': 'intron',
                    'cstart': intron_start,
                    'cend': intron_end,
                    'strand': strand
                })
    
    # Create and return the final intron dataframe.
    introns_df = pd.DataFrame(intron_records, columns=['seqname', 'feature', 'cstart', 'cend', 'strand'])
    return introns_df

In [5]:
gtf = load_gtf_annotations("chr_annotations.gtf")
fixed_introns_df = calculate_introns(gtf)
fixed_introns_df.to_csv("Fixed_Introns.csv", index=False)

In [7]:
fixed_introns_df.head()

Unnamed: 0,seqname,feature,cstart,cend,strand


In [6]:
search_df = search_gtf_by_range(fixed_introns_df, "chr1", 42747000, 42768000, require_both=False)
print(search_df)

Empty DataFrame
Columns: [seqname, feature, cstart, cend, strand]
Index: []


In [None]:
search_df = search_gtf_by_range(fixed_introns_df, "chr1", 42762000, 42762500, require_both=False)
print(search_df)

In [None]:
trimmed_annotation_data = gtf[["seqname", "feature", "cstart", "cend", "strand"]]
FixedIntronExonDF = pd.concat([trimmed_annotation_data, fixed_introns_df])
FixedIntronExonDF = FixedIntronExonDF[FixedIntronExonDF["seqname"]!="chrM"]
FixedIntronExonDF.to_csv('FixedIntronExonDF.csv', index=False)

In [8]:
better_introns = pd.read_csv('BetterIntrons.csv')
search_df = search_gtf_by_range(better_introns, "chr1", 42747000, 42768000, require_both=False)

In [9]:
print(search_df)

       seqname feature    cstart      cend strand
61722     chr1  intron  42766506  42762475      -
61723     chr1  intron  42762322  42759390      -
61724     chr1  intron  42759200  42758983      -
61725     chr1  intron  42758851  42757922      -
61726     chr1  intron  42757782  42755637      -
61727     chr1  intron  42755547  42755217      -
61728     chr1  intron  42755164  42754990      -
61729     chr1  intron  42754868  42752664      -
61730     chr1  intron  42752536  42752369      -
61731     chr1  intron  42752273  42750336      -
61732     chr1  intron  42750185  42748317      -
61733     chr1  intron  42748199  42747798      -
61734     chr1  intron  42747722  42747412      -
135467    chr1  intron  42767390  42767508      +
135468    chr1  intron  42767624  42773562      +
268820    chr1  intron  42767624  42773545      +
