In [1]:
### what it does: Detremines exon block coordinates and surrounding exon coordinates 
### Input: gtf file and and csv with block exon coords (output from script 3) 
##output: exon-block coordinates csv and csv with upstream and downstream of block exon coordinates (ups/dns)
## Date: 4/28/2025

In [2]:
import pandas as pd

In [3]:
df = pd.read_csv('block_exons_ddPSImax0.2_adjacent.csv')
gtf_file = 'gencode.v44.basic.annotation.gtf'

In [4]:
# ---- STEP 1: FIND COORDINATED EXONS ----
# Group by gene_name
grouped = df.groupby('gene_name')

# Function to find exons with small dPSI differences (<= 0.02)
def find_coords_with_small_deltapsi_diff(group):
    result = []
    for i in range(len(group)):
        for j in range(i + 1, len(group)):
            if abs(group.iloc[i]['dPSI'] - group.iloc[j]['dPSI']) <= 0.02:
                result.append(group.iloc[i]['Coord'])
                result.append(group.iloc[j]['Coord'])
    return result

# Apply function to identify relevant exon coordinates
coords_to_keep = grouped.apply(find_coords_with_small_deltapsi_diff).explode().unique()

# Filter dataset to retain only relevant exons
bl_ex = df[df['Coord'].isin(coords_to_keep)]

# ---- STEP 2: IDENTIFY ADJACENT EXONS ----
# Group by gene and sort by exon_number
grouped = bl_ex.groupby('gene_name').apply(lambda x: x.sort_values('exon_number')).reset_index(drop=True)

# Identify neighbors (previous and next exon with difference of 1)
grouped['next_exon_diff'] = grouped.groupby('gene_name')['exon_number'].shift(-1) - grouped['exon_number']
grouped['prev_exon_diff'] = grouped['exon_number'] - grouped.groupby('gene_name')['exon_number'].shift(1)

# Mark exons that have direct neighbors
grouped['has_neighbor'] = ((grouped['next_exon_diff'] == 1) | (grouped['prev_exon_diff'] == 1)).astype(int)

# Keep only exons with adjacent neighbors
bl_ex_ad = grouped[grouped['has_neighbor'] == 1].reset_index(drop=True)

# ---- STEP 3: GROUP INTO BLOCKS BASED ON dPSI PROXIMITY ----
proximity_threshold = 0.02

# Function to create subgroups based on dPSI proximity within each gene_name group
def create_subgroups(group):
    group = group.sort_values(by='dPSI').reset_index(drop=True)
    subgroups = []
    current_group = [group.iloc[0]]
    for i in range(1, len(group)):
        if abs(group.iloc[i]['dPSI'] - group.iloc[i-1]['dPSI']) <= proximity_threshold:
            current_group.append(group.iloc[i])
        else:
            subgroups.append(current_group)
            current_group = [group.iloc[i]]
    subgroups.append(current_group)
    return subgroups

# Apply subgrouping function
grouped = bl_ex_ad.groupby('gene_name')
all_subgroups = grouped.apply(create_subgroups)

# ---- STEP 4: IDENTIFY BLOCKS ----
# Function to parse Coord and return chromosome, start, end
def parse_coord(coord):
    chrom, positions = coord.split(':')
    start, end = map(int, positions.split('-'))
    return chrom, start, end

# Function to check if exons within the same block are adjacent
def are_exons_adjacent(block):
    exon_numbers = sorted(block['exon_number'])
    return all(exon_numbers[i] + 1 == exon_numbers[i + 1] for i in range(len(exon_numbers) - 1))

# Function to calculate block start and end
def calculate_block_start_end(block):
    chrom = block.iloc[0]['Coord'].split(':')[0]
    strand = block.iloc[0]['Strand']
    start_positions = [parse_coord(entry['Coord'])[1] for _, entry in block.iterrows()]
    end_positions = [parse_coord(entry['Coord'])[2] for _, entry in block.iterrows()]
    
    block_start = min(start_positions)
    block_end = max(end_positions)
    
    # Ensure block_start is always smaller than block_end for BED file format
    if block_start > block_end:
        block_start, block_end = block_end, block_start
    
    return chrom, block_start, block_end, strand

# Create a new DataFrame to store block information
blocks_data = []

for gene_name, gene_subgroups in all_subgroups.items():
    for block in gene_subgroups:
        block_df = pd.DataFrame(block)  # Convert block to DataFrame
        
        if not are_exons_adjacent(block_df):
            continue  # Skip blocks where exons are not adjacent
        
        chrom, block_start, block_end, strand = calculate_block_start_end(block_df)
        number_of_exons = len(block_df)
        block_coord = f"{chrom}:{block_start}-{block_end}"
        
        blocks_data.append({
            'chr': chrom,
            'gene_name': gene_name,
            'block_start': block_start,
            'block_end': block_end,
            'strand': strand,
            'number_of_exons': number_of_exons,
            'block_coord': block_coord
        })

# Create final DataFrame
blocks_df = pd.DataFrame(blocks_data)


  coords_to_keep = grouped.apply(find_coords_with_small_deltapsi_diff).explode().unique()
  grouped = bl_ex.groupby('gene_name').apply(lambda x: x.sort_values('exon_number')).reset_index(drop=True)
  all_subgroups = grouped.apply(create_subgroups)


In [5]:
blocks_df.to_csv('exon_block_coordinates.csv', index= False)

In [6]:
### ----- GET SURROUNDING EXON COORDINATES ----

# ---- STEP 1: PARSE GTF FILE ----
# Function to parse the GTF file and extract relevant exon information
def parse_gtf(gtf_file):
    # Read GTF file
    gtf_df = pd.read_csv(gtf_file, sep='\t', comment='#', header=None, names=[
        'seqname', 'source', 'feature', 'start', 'end', 'score', 'strand', 'frame', 'attribute'])
    
    # Filter for exon entries only
    gtf_df = gtf_df[gtf_df['feature'] == 'exon']
    
    # Extract gene_name, exon_number, and transcript_support_level (tsl)
    gtf_df['exon_number'] = gtf_df['attribute'].str.extract(r'exon_number\s+"?(\d+)"?')
    gtf_df['tsl'] = gtf_df['attribute'].str.extract(r'transcript_support_level\s+"?(\d+)"?')
    gtf_df['gene_name'] = gtf_df['attribute'].str.extract(r'gene_name\s+"([^"]+)"')
    
    # Convert to appropriate types
    gtf_df['exon_number'] = pd.to_numeric(gtf_df['exon_number'], errors='coerce').fillna(-1).astype(int)
    gtf_df['tsl'] = pd.to_numeric(gtf_df['tsl'], errors='coerce').fillna(-1).astype(int)
    gtf_df['start'] = gtf_df['start'].astype(int)
    gtf_df['end'] = gtf_df['end'].astype(int)
    
    return gtf_df[['seqname', 'gene_name', 'exon_number', 'tsl', 'start', 'end', 'strand']]


# ---- STEP 2: IDENTIFY UPSTREAM & DOWNSTREAM EXONS ----

# Function to find upstream and downstream exon coordinates
def find_upstream_downstream(gtf_data, blocks_data):
    results = []
    
    for _, row in blocks_data.iterrows():
        chrom = row['chr']
        gene = row['gene_name']
        block_start = row['block_start']
        block_end = row['block_end']
        
        # Filter GTF data for matching gene and chromosome
        gene_exons = gtf_data[(gtf_data['gene_name'] == gene) & (gtf_data['seqname'] == chrom)]
        
        # Prefer transcripts with tsl == 1
        preferred_exons = gene_exons[gene_exons['tsl'] == 1]
        if preferred_exons.empty:
            preferred_exons = gene_exons  # Fall back to all if no tsl == 1
        
        # Find upstream exon (closest exon before block_start)
        upstream_exon = preferred_exons[preferred_exons['end'] < block_start].sort_values(by='end', ascending=False).head(1)
        start_ups_exon = int(upstream_exon['start'].values[0]) if not upstream_exon.empty else None
        end_ups_exon = int(upstream_exon['end'].values[0]) if not upstream_exon.empty else None

        # Find downstream exon (closest exon after block_end)
        downstream_exon = preferred_exons[preferred_exons['start'] > block_end].sort_values(by='start', ascending=True).head(1)
        start_dns_exon = int(downstream_exon['start'].values[0]) if not downstream_exon.empty else None
        end_dns_exon = int(downstream_exon['end'].values[0]) if not downstream_exon.empty else None

        results.append({
            'chr': chrom,
            'gene_name': gene,
            'block_start': block_start,
            'block_end': block_end,
            'strand': row['strand'],
            'number_of_exons': row['number_of_exons'],
            'block_coord': row['block_coord'],
            'start_ups_exon': start_ups_exon,
            'end_ups_exon': end_ups_exon,
            'start_dns_exon': start_dns_exon,
            'end_dns_exon': end_dns_exon
        })
    
    return pd.DataFrame(results)



In [7]:
# ---- RUN FUNCTIONS ----

# Parse the GTF file
gtf_df = parse_gtf(gtf_file)




In [8]:
# Apply function to find upstream and downstream exon coordinates
final_blocks_df = find_upstream_downstream(gtf_df, blocks_df)

In [9]:
# Remove rows where any NaN value exists
final_blocks_df = final_blocks_df.dropna()

final_blocks_df[['start_ups_exon', 'end_ups_exon', 'start_dns_exon', 'end_dns_exon']] = (
    final_blocks_df[['start_ups_exon', 'end_ups_exon', 'start_dns_exon', 'end_dns_exon']]
    .astype(int)  # Convert to integer type
)

In [10]:
final_blocks_df.to_csv('exon_block_coordinates_ups_dns.csv', index= False)