In [None]:
### Purpose: Calculate recursive splice site strength - Figure S7B
##           use/adjust script 13 to plot the data
### Input: output of script 1, annotation.gtf and genome.fasta


In [14]:
import pandas as pd
import numpy as np
import pybedtools
from Bio.Seq import Seq
import re


In [2]:
# Reference files: gene annotation (GTF) and genome sequence (FASTA)
gtf_file = 'gencode.v44.basic.annotation.gtf'
fasta_file = 'GRCh38.p14.genome.fa'

# Exon table to annotate (per the notebook header, the output of script 1)
df = pd.read_csv(filepath_or_buffer='whippet_CE_gene_name_exon_number.csv')


In [3]:
# Function to parse GTF and extract exon information
def parse_gtf(gtf_file):
    """Read a GTF file and return one row per ``exon`` feature.

    Parameters
    ----------
    gtf_file : str or path-like
        Tab-separated GTF annotation; lines starting with ``#`` are skipped.

    Returns
    -------
    pandas.DataFrame
        Columns ``seqname``, ``start``, ``end``, ``strand``, ``gene_name``,
        ``exon_number`` and ``Coord``. ``exon_number`` is an int parsed from the
        attribute field (``-1`` when the attribute carries no exon number) and
        ``Coord`` is the string ``"<seqname>:<start>-<end>"``. The original row
        index of the GTF table is kept.
    """
    gtf_df = pd.read_csv(
        gtf_file, sep='\t', comment='#', header=None,
        names=['seqname', 'source', 'feature', 'start', 'end', 'score', 'strand', 'frame', 'attribute'],
        # Keep chromosome names as text so the 'Coord' concatenation below (and string
        # comparisons by callers) also work when every seqname is purely numeric.
        dtype={'seqname': str},
    )

    # .copy() detaches the exon subset from the full table, so the column assignments
    # below write to an independent frame instead of a slice (avoids
    # SettingWithCopyWarning / ambiguous chained assignment).
    gtf_df = gtf_df[gtf_df['feature'] == 'exon'].copy()

    # The exon number may be quoted or unquoted in the attribute string; rows without
    # one get the sentinel -1 so the column can stay integer-typed.
    exon_number = gtf_df['attribute'].str.extract(r'exon_number\s+"?(\d+)"?', expand=False)
    gtf_df['exon_number'] = exon_number.astype(float).fillna(-1).astype(int)
    gtf_df['gene_name'] = gtf_df['attribute'].str.extract(r'gene_name\s+"([^"]+)"', expand=False)

    gtf_df['Coord'] = gtf_df['seqname'] + ':' + gtf_df['start'].astype(str) + '-' + gtf_df['end'].astype(str)

    return gtf_df[['seqname', 'start', 'end', 'strand', 'gene_name', 'exon_number', 'Coord']]

# Function to compute upstream exon positions
def compute_upstream_exon_positions(df, gtf):
    """Add the window bounds between each exon and its upstream exon to ``df``.

    For every row of ``df`` the exon given by ``Coord`` (``"<chr>:<start>-<end>"``)
    is looked up in ``gtf`` by seqname, start, end and gene name. When found:

    * ``5_end`` is ``start + 5`` on the ``+`` strand, otherwise ``end - 5``.
    * ``5_start`` is taken from the exon of the same gene and seqname whose
      ``exon_number`` is one lower: its ``end - 2`` on the ``+`` strand, otherwise
      its ``start + 2``. It is NaN when the exon number is not greater than 1 or no
      such exon exists.

    Rows whose exon is not found, or that raise while being processed, get NaN for
    both positions (a message is printed for the latter).

    Parameters
    ----------
    df : pandas.DataFrame
        Needs columns ``Coord``, ``Strand`` and ``gene_name``.
    gtf : pandas.DataFrame
        Needs columns ``seqname``, ``start``, ``end``, ``gene_name`` and
        ``exon_number``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, modified in place with the added columns
        ``5_start``, ``5_end`` and ``chr``.
    """
    # NOTE(review): when several GTF rows match, the first one is used; the lookup
    # does not distinguish transcripts - confirm this is intended for genes whose
    # transcripts number their exons differently.
    five_start = []
    five_end = []
    chromosome = []

    for _, row in df.iterrows():
        # Collect this row's values in locals and append exactly once after the try
        # block, so the three lists always stay the same length as ``df`` even when
        # an exception interrupts processing part-way through.
        chrom = np.nan
        row_five_start = np.nan
        row_five_end = np.nan

        try:
            chrom, coords = row['Coord'].split(':')
            start, end = map(int, coords.split('-'))
            strand = row['Strand']
            gene_name = row['gene_name']

            current_exon_df = gtf[(gtf['seqname'] == chrom) & (gtf['start'] == start) &
                                  (gtf['end'] == end) & (gtf['gene_name'] == gene_name)]

            if not current_exon_df.empty:
                current_exon = current_exon_df.iloc[0]

                # Position 5 nt into the current exon, measured from its 5' edge
                if strand == '+':
                    row_five_end = start + 5
                else:
                    row_five_end = end - 5

                # Find upstream exon (previous exon number within the same gene)
                if pd.notna(current_exon['exon_number']) and current_exon['exon_number'] > 1:
                    upstream_exon_df = gtf[(gtf['seqname'] == chrom) &
                                           (gtf['exon_number'] == current_exon['exon_number'] - 1) &
                                           (gtf['gene_name'] == gene_name)]

                    if not upstream_exon_df.empty:
                        upstream_exon = upstream_exon_df.iloc[0]

                        # Position 2 nt inside the upstream exon, measured from its 3' edge
                        if strand == '+':
                            row_five_start = upstream_exon['end'] - 2
                        else:
                            row_five_start = upstream_exon['start'] + 2

        except Exception as e:
            print(f"Skipping row due to error: {e}")
            # Discard any partially computed positions for the failed row
            row_five_start = np.nan
            row_five_end = np.nan

        chromosome.append(chrom)
        five_start.append(row_five_start)
        five_end.append(row_five_end)

    df['5_start'] = five_start
    df['5_end'] = five_end
    df['chr'] = chromosome
    return df

# Function to create a BED file from exon positions
def create_bed_file(df):
    """Build a BED-style interval table from the positions stored in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs columns ``chr``, ``5_start``, ``5_end``, ``gene_name``, ``dPSI``,
        ``Strand`` and ``Coord``.

    Returns
    -------
    pandas.DataFrame
        An independent frame with columns ``chr``, ``start``, ``end``,
        ``gene_name``, ``dPSI``, ``strand`` and ``Coord``. Rows with a missing value
        in any of these columns are dropped, as are rows whose final ``start`` is
        not smaller than ``end``. The index labels of the surviving rows are those
        of ``df``.
    """
    df_bed = (
        pd.DataFrame({
            'chr': df['chr'],
            'start': df['5_start'],
            'end': df['5_end'],
            'gene_name': df['gene_name'],
            'dPSI': df['dPSI'],
            'strand': df['Strand'],
            'Coord': df['Coord']
        })
        .dropna()
        .astype({'start': int, 'end': int})
    )

    # '+' rows keep their orientation: start - 1 .. end.
    # All other rows have the two positions swapped: end - 1 .. start.
    # Both new columns are computed before either is overwritten.
    is_plus = df_bed['strand'] == '+'
    new_start = df_bed['start'].where(is_plus, df_bed['end']) - 1
    new_end = df_bed['end'].where(is_plus, df_bed['start'])
    df_bed['start'] = new_start
    df_bed['end'] = new_end

    # Return a copy rather than a filtered slice so callers can add columns to the
    # result without triggering SettingWithCopyWarning.
    return df_bed[df_bed['start'] < df_bed['end']].copy()

# Function to extract sequences from a FASTA file
def extract_sequences(df_bed, fasta_file):
    """Fetch each interval's sequence and reduce it to a 9-nt string.

    Each row of ``df_bed`` is turned into a one-line BedTool (chrom, start, end,
    strand) and its sequence is read from ``fasta_file``. From that sequence:

    * ``+`` rows: the first 3 nt followed by the last 6 nt.
    * other rows: the first 6 nt followed by the last 3 nt, reverse-complemented.

    A row that raises at any point gets ``None`` and a printed message.

    Returns the same ``df_bed`` object with a new ``sequence`` column.
    """
    # NOTE(review): the non-'+' branch reverse-complements manually, which presumes the
    # fetched sequence is in forward-strand orientation even though s=True is passed -
    # confirm against bedtools' behaviour for this 4-column layout.
    collected = []

    for _, record in df_bed.iterrows():
        try:
            interval = pd.DataFrame([{
                'chrom': record['chr'],
                'start': record['start'],
                'end': record['end'],
                'strand': record['strand']
            }])
            fetched = pybedtools.BedTool.from_dataframe(interval).sequence(fi=fasta_file, s=True)

            with open(fetched.seqfn) as handle:
                fasta_lines = handle.read().split("\n")

            # Line 0 is the FASTA header; line 1 (if present) holds the bases
            if len(fasta_lines) > 1:
                window = fasta_lines[1]
            else:
                window = ""

            # Keep only the two ends of the window (3 nt on one side, 6 nt on the other)
            if record['strand'] == '+':
                motif = window[:3] + window[-6:]
            else:
                motif = str(Seq(window[:6] + window[-3:]).reverse_complement())

            collected.append(motif)

        except Exception as err:
            collected.append(None)
            print(f"Skipping row due to error: {err}")

    df_bed['sequence'] = collected
    return df_bed

# Function to merge sequences back into the original dataframe
def merge_sequences(df, df_bed):
    """Left-join the extracted sequences onto ``df`` by ``Coord``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to annotate; needs a ``Coord`` column.
    df_bed : pandas.DataFrame
        Needs ``Coord`` and ``sequence`` columns. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with every row of ``df`` plus a ``sequence`` column (NaN where
        no sequence is available for that ``Coord``).
    """
    seq_lookup = (
        df_bed
        .dropna(subset=['sequence'])[['Coord', 'sequence']]
        # Identical (Coord, sequence) pairs would otherwise multiply the matching
        # rows of ``df`` in the join without adding any information.
        .drop_duplicates()
    )
    return df.merge(seq_lookup, on='Coord', how='left')




In [4]:
# Run code

# Step 1: Parse GTF File
gtf_df = parse_gtf(gtf_file)

# Step 2: Compute upstream exon positions
df = compute_upstream_exon_positions(df, gtf_df)

# Step 3: Create BED file
df_bed = create_bed_file(df)

# Step 4: Extract correct sequences
df_bed = extract_sequences(df_bed, fasta_file)

# Step 5: Merge sequences back into df
# Drop any 'sequence' column left over from an earlier run of this cell first; otherwise
# the merge would produce 'sequence_x'/'sequence_y' and later cells that read
# df['sequence'] would fail.
df = merge_sequences(df.drop(columns=['sequence'], errors='ignore'), df_bed)


                 Gene  Node                   Coord Strand Type  psi_ctrl  \
0  ENSG00000142945.13     2  chr1:44740913-44741007      +   CE   0.98965   
1  ENSG00000142945.13     4  chr1:44747384-44747485      +   CE   0.99504   
2  ENSG00000142945.13     5  chr1:44747652-44747700      +   CE   0.99386   
3  ENSG00000142945.13     6  chr1:44750442-44750564      +   CE   0.98215   
4  ENSG00000142945.13     7  chr1:44753132-44753254      +   CE   0.74565   

    psi_KD  DeltaPsi  Probability Complexity  Entropy     dPSI  exon_number  \
0  0.99478 -0.005126        0.661         K1   0.1256  0.00513          2.0   
1  0.99445  0.000586        0.538         K0   0.0000 -0.00059          3.0   
2  0.99324  0.000628        0.540         K0   0.0000 -0.00062          4.0   
3  0.99340 -0.011251        0.707         K2   0.9798  0.01125          5.0   
4  0.70769  0.037959        0.679         K2   0.9798 -0.03796          6.0   

  gene_name     5_start       5_end   chr   sequence  
0     K

In [10]:
output_fasta = "all_whippet_RS5ss.fasta"

# Write one FASTA record per row that has a sequence, using the Coord string as the header.
with open(output_fasta, "w") as f:
    for coord, sequence in zip(df["Coord"], df["sequence"]):
        # Check for missing values BEFORE converting to text: str(None) is "None" and
        # would otherwise be written out as if it were a sequence. Blank sequences are
        # skipped as well.
        if pd.isna(sequence) or not str(sequence).strip():
            continue
        f.write(f">{coord}\n{sequence}\n")


In [15]:
## Paste the FASTA into the MaxEnt score online tool (max entropy model) (http://hollywood.mit.edu/burgelab/maxent/Xmaxentscan_scoreseq.html),
## then save the results as a .fa file using a text editor (copy-paste the values).

# Dictionary to store the coordinates and MaxEnt scores
coord_maxent_dict = {}

# The optional sign is placed outside the alternation so it applies to both decimal and
# integer scores; previously a negative integer such as "-5" did not match at all.
maxent_pattern = re.compile(r"MAXENT:\s*([-+]?(?:\d*\.\d+|\d+))")

with open("all_whip_R5ss.fa", "r") as f:
    current_coord = None
    for line in f:
        if line.startswith(">"):  # Header line
            current_coord = line.strip().lstrip(">")
        else:  # Sequence line with MaxEnt score
            match = maxent_pattern.search(line)
            # Only record a score once a header has been seen for it
            if match and current_coord:
                coord_maxent_dict[current_coord] = float(match.group(1))

# Convert dictionary to DataFrame
maxent_df = pd.DataFrame(list(coord_maxent_dict.items()), columns=['Coord', 'MaxEnt'])

# Print to verify
print(maxent_df.head())


                     Coord  MaxEnt
0  chr1:44740913-44741007\   -4.64
1  chr1:44747384-44747485\   -6.67
2  chr1:44747652-44747700\   -9.27
3  chr1:44750442-44750564\  -13.38
4  chr1:44753132-44753254\  -15.17


In [16]:
# Tidy the 'Coord' keys in maxent_df: trim surrounding whitespace, then drop trailing backslashes
coord_trimmed = maxent_df['Coord'].str.strip()
maxent_df['Coord'] = coord_trimmed.str.rstrip('\\')

In [18]:
## Left-join the MaxEnt scores onto df by 'Coord' and label the score column 'R5ss_MaxEnt'
df_max = (
    df.merge(maxent_df, on='Coord', how='left')
      .rename(columns={'MaxEnt': 'R5ss_MaxEnt'})
)


In [20]:
# Save the scored table as CSV, leaving out the row index
df_max.to_csv(path_or_buf='all_whippet_Recursive_5ss_maxent.csv', index=False)

In [None]:
### PLOTTING SIMILAR TO SCRIPT 13 ####