In [1]:
## Title: Get the downstream intron length for every Whippet exon, to be used in script 15
## Input: Whippet output CSV with exon coordinates

In [None]:
import pandas as pd
import gffutils


In [2]:
# Inputs: the GENCODE annotation path and the Whippet exon table.
gtf_file = "gencode.v44.basic.annotation.gtf"
exons_df = pd.read_csv('all_whippet_exons.csv')

In [3]:
# Split 'Coord' ("chr<name>:<start>-<end>") into chr / start / end columns,
# casting the two positions to integers in the same step.
exons_df[['chr', 'start', 'end']] = (
    exons_df['Coord']
    .str.extract(r'(chr[\w]+):(\d+)-(\d+)')
    .astype({1: int, 2: int})  # capture groups 1 and 2 hold start and end
)


In [4]:
# Build an in-memory gffutils database from the GTF annotation.
# Gene and transcript inference are switched off via the disable_infer_* flags.
db = gffutils.create_db(
    gtf_file,
    dbfn=":memory:",
    force=True,
    merge_strategy="merge",
    keep_order=True,
    sort_attribute_values=True,
    disable_infer_genes=True,
    disable_infer_transcripts=True,
)


In [5]:
# Collect one record per "exon" feature in the annotation database.
# gene_id / transcript_id fall back to None when the attribute is absent.
gtf_exons = pd.DataFrame(
    [
        dict(
            chr=exon.seqid,
            start=exon.start,
            end=exon.end,
            strand=exon.strand,
            gene_id=exon.attributes.get("gene_id", [None])[0],
            transcript_id=exon.attributes.get("transcript_id", [None])[0],
        )
        for exon in db.features_of_type("exon")
    ]
)

# Coordinates must be integers for the positional comparisons done later.
gtf_exons[['start', 'end']] = gtf_exons[['start', 'end']].astype(int)




In [6]:
# Function to calculate downstream intron length
def calculate_intron_length(row, gtf_exons):
    """Return the length of the intron immediately downstream of an exon.

    "Downstream" follows the direction of transcription: on the '+' strand it
    is the nearest annotated exon starting after ``row['end']``; on any other
    strand value it is the nearest annotated exon ending before
    ``row['start']``. Candidate exons must share the row's gene, chromosome
    and strand.

    Coordinates are treated as 1-based and inclusive (the GTF convention), so
    the number of intronic bases between an exon ending at E and the next one
    starting at S is ``S - E - 1``.

    Parameters
    ----------
    row : mapping / pandas.Series
        Must provide 'Gene', 'chr', 'Strand', 'start' and 'end'.
    gtf_exons : pandas.DataFrame
        Annotated exons with columns 'gene_id', 'chr', 'strand', 'start'
        and 'end'.

    Returns
    -------
    int or None
        Intron length in bases, or None when no downstream exon exists
        (e.g. the exon is the last one of its gene).
    """
    # Filter shared by both strand branches: same gene, chromosome and strand.
    same_locus = (
        (gtf_exons['gene_id'] == row['Gene'])
        & (gtf_exons['chr'] == row['chr'])
        & (gtf_exons['strand'] == row['Strand'])
    )

    if row['Strand'] == '+':
        # Nearest exon that starts after this exon ends.
        next_starts = gtf_exons.loc[
            same_locus & (gtf_exons['start'] > row['end']), 'start'
        ]
        if next_starts.empty:
            return None
        # min() picks the closest start without sorting every candidate.
        return int(next_starts.min()) - int(row['end']) - 1

    # Non-'+' strand: transcription runs towards lower coordinates, so the
    # downstream exon is the nearest one ending before this exon starts.
    previous_ends = gtf_exons.loc[
        same_locus & (gtf_exons['end'] < row['start']), 'end'
    ]
    if previous_ends.empty:
        return None
    return int(row['start']) - int(previous_ends.max()) - 1


In [7]:
# Compute the downstream intron length row by row and store it as a new column.
exons_df['intron_length'] = exons_df.apply(
    lambda exon_row: calculate_intron_length(exon_row, gtf_exons),
    axis=1,
)

In [9]:
# Keep only exons for which a downstream intron was found
# (rows without one, such as a gene's last exon, are removed).
exons_df = exons_df.dropna(subset=['intron_length'])

In [11]:
# Write the table with intron lengths to CSV, leaving out the DataFrame index.
exons_df.to_csv(
    'all_whippet_detected_intron_length.csv',
    index=False,
)