In [None]:
### Purpose: extract the 5' and 3' splice-site sequences for all exons in the whippet analysis (related to Figure 6)
### Input: output from script 1 (whippet_CE_gene_name_exon_number.csv)
#          and the genome.fa used for mapping.

In [1]:
#packages
import pandas as pd
import re
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
import pybedtools
from tqdm import tqdm
import seaborn as sns
from splicing_utils import *

In [61]:
# Load the exon table (the CSV listed as input in the notebook header).
wh = pd.read_csv(filepath_or_buffer="whippet_CE_gene_name_exon_number.csv")

In [11]:
# Parse "chr<N>:<start>-<end>" strings from wh['Coord'] into chr / start / end.
coord_split = wh['Coord'].str.extract(r'(?P<chr>chr[\dXYM]+):(?P<start>\d+)-(?P<end>\d+)')

# A Coord that does not match the pattern yields an all-NaN row, and the
# integer conversion below would then raise. Drop those rows here and report
# how many were skipped, so the loss is visible rather than fatal.
unparsed = coord_split.isna().any(axis=1)
if unparsed.any():
    print(f"Skipping {int(unparsed.sum())} rows whose Coord could not be parsed")
coord_split = coord_split.loc[~unparsed]

# Build the six BED-style columns in one constructor call. name / strand are
# looked up in wh at the surviving row labels so every column stays aligned.
wh_bed = pd.DataFrame({
    "chr": coord_split["chr"],
    "start": coord_split["start"].astype(int),
    "end": coord_split["end"].astype(int),
    "name": wh.loc[coord_split.index, "Coord"],
    "score": 0,
    "strand": wh.loc[coord_split.index, "Strand"],
})


    chr     start       end                    name  score strand
0  chr1  44740913  44741007  chr1:44740913-44741007      0      +
1  chr1  44747384  44747485  chr1:44747384-44747485      0      +
2  chr1  44747652  44747700  chr1:44747652-44747700      0      +
3  chr1  44750442  44750564  chr1:44750442-44750564      0      +
4  chr1  44753132  44753254  chr1:44753132-44753254      0      +


In [20]:
# Path of the genome FASTA handed to get_sequence_from_bed in a later cell.
genome = "GRCh38.p14.genome.fa"

In [34]:
#Get splice site sequences

# Compute the window boundaries for all rows at once with np.where.
# The previous row loop wrote through wh_bed[col][idx] = ..., which is chained
# assignment: it raises SettingWithCopyWarning and, with pandas Copy-on-Write
# enabled, leaves wh_bed unchanged.
is_plus = wh_bed["strand"] == "+"   # any other strand value takes the second branch
exon_start = wh_bed["start"]
exon_end = wh_bed["end"]

# 5'ss window: 5_end - 5_start is 9 on either branch.
wh_bed["5_start"] = np.where(is_plus, exon_end - 3, exon_start - 7)
wh_bed["5_end"] = np.where(is_plus, exon_end + 6, exon_start + 2)

# 3'ss window: 3_end - 3_start is 23 on either branch.
wh_bed["3_start"] = np.where(is_plus, exon_start - 21, exon_end - 3)
wh_bed["3_end"] = np.where(is_plus, exon_start + 2, exon_end + 20)

# Six-column frames, one per splice site. .copy() makes them independent of
# wh_bed so that adding columns to them later does not warn.
wh_5 = wh_bed[["chr", "5_start", "5_end", "name", "score", "strand"]].copy()
wh_3 = wh_bed[["chr", "3_start", "3_end", "name", "score", "strand"]].copy()



In [35]:
# Run get_sequence_from_bed on the 5'ss frame, then on the 3'ss frame,
# against the same genome file.
wh_seqs_5, wh_seqs_3 = (
    get_sequence_from_bed(bed_frame, genome) for bed_frame in (wh_5, wh_3)
)

In [36]:
# Attach the retrieved sequences as a new column on each frame.
# wh_5 / wh_3 are column subsets of wh_bed, so assigning a column on them in
# place triggers SettingWithCopyWarning. assign() returns a fresh frame, which
# avoids the warning and gives the same result when the cell is re-run.
wh_5 = wh_5.assign(**{'5ss_seq': wh_seqs_5})
wh_3 = wh_3.assign(**{'3ss_seq': wh_seqs_3})

In [38]:
#Write sequences into fasta files, keeping the exon coordinate as name

# zip pairs each name with its sequence and stops at the shorter of the two,
# which is the same cut-off an explicit index bound check gives.
with open("5ss_whippet.fasta", "w") as fasta_5:
    for exon_name, seq in zip(wh_5["name"], wh_seqs_5):
        fasta_5.write(">" + exon_name + "\n" + seq + "\n")

with open("3ss_whippet.fasta", "w") as fasta_3:
    for exon_name, seq in zip(wh_3["name"], wh_seqs_3):
        fasta_3.write(">" + exon_name + "\n" + seq + "\n")


In [51]:
#In the next step you need to get MaxEnt scores (http://hollywood.mit.edu/burgelab/maxent/Xmaxentscan_scoreseq_acc.html),
# uploading the generated fasta files and setting to Maximum Entropy model score. Save the output in a txt file and change name to ".fa".
# This cell reads the results from "5ss.whippet.maxent.fa" and "3ss.whippet.maxent.fa".
# NOTE(review): the URL above ends in "_acc"; confirm the 5'ss file was scored
# with the matching 5' splice-site form rather than this one.

# Score as written after "MAXENT:". The sign is optional and the number may be
# an integer or a decimal. The earlier pattern "[-+]?\d*\.\d+|\d+" only allowed
# a sign on decimals, so a score such as "-5" was silently skipped.
MAXENT_SCORE_RE = re.compile(r"MAXENT:\s*([-+]?(?:\d+\.?\d*|\.\d+))")


def parse_maxent_scores(path):
    """Read a MaxEnt result file and return {header: score}.

    Lines starting with ">" set the current header (text after ">", stripped
    of surrounding whitespace). Any following line containing "MAXENT: <number>"
    stores that number under the current header. Lines without a score are
    ignored, and a repeated header overwrites the earlier score.

    Parameters
    ----------
    path : str
        Path of the result file to read.

    Returns
    -------
    dict
        Header string mapped to its score as a float.
    """
    scores = {}
    current_coord = None
    with open(path, "r") as handle:
        for line in handle:
            if line.startswith(">"):  # Header line
                current_coord = line.strip().lstrip(">")
                continue
            match = MAXENT_SCORE_RE.search(line)
            if match and current_coord:
                scores[current_coord] = float(match.group(1))
    return scores


# Dictionary to store the coordinates and MaxEnt scores 5ss
coord_maxent_dict_wh_5 = parse_maxent_scores("5ss.whippet.maxent.fa")

# Convert dictionary to DataFrame
maxent_wh_5 = pd.DataFrame(list(coord_maxent_dict_wh_5.items()), columns=['Coord', 'MaxEnt'])

# Print to verify
print(maxent_wh_5.head())

# Dictionary to store the coordinates and MaxEnt scores 3ss
coord_maxent_dict_wh_3 = parse_maxent_scores("3ss.whippet.maxent.fa")

# Convert dictionary to DataFrame
maxent_wh_3 = pd.DataFrame(list(coord_maxent_dict_wh_3.items()), columns=['Coord', 'MaxEnt'])


                     Coord  MaxEnt
0  chr1:44740913-44741007\    9.65
1  chr1:44747384-44747485\   10.28
2  chr1:44747652-44747700\    9.72
3  chr1:44750442-44750564\    8.41
4  chr1:44753132-44753254\    6.89
                     Coord  MaxEnt
0  chr1:44740913-44741007\    8.89
1  chr1:44747384-44747485\   10.53
2  chr1:44747652-44747700\    8.00
3  chr1:44750442-44750564\    9.20
4  chr1:44753132-44753254\   11.19


In [52]:
# Clean the 'Coord' key of both score tables in place: trim surrounding
# whitespace, then remove any backslashes left at the end.
for score_table in (maxent_wh_5, maxent_wh_3):
    score_table['Coord'] = score_table['Coord'].str.strip().str.rstrip('\\')

In [53]:
# Left-join each splice-site frame to its score table: 'name' on the left is
# matched against 'Coord' on the right, and rows without a score are kept.
wh_5_me = pd.merge(wh_5, maxent_wh_5, how='left', left_on='name', right_on='Coord')
wh_3_me = pd.merge(wh_3, maxent_wh_3, how='left', left_on='name', right_on='Coord')

In [63]:
# Give the generic MaxEnt column a site-specific name so both can live in wh.
wh_5_me_renamed = wh_5_me.rename(columns={'MaxEnt': 'MaxEnt_5ss'})
wh_3_me_renamed = wh_3_me.rename(columns={'MaxEnt': 'MaxEnt_3ss'})

# Keep one row per exon coordinate.
# 'Coord' in these frames comes from the right-hand side of a left merge, so
# it is NaN for every exon that has no score. De-duplicating on it collapsed
# all of those exons into a single row and their sequences never reached wh.
# 'name' holds the coordinate for every row, so copy it into 'Coord' first.
wh_5_me_unique = (
    wh_5_me_renamed
    .assign(Coord=wh_5_me_renamed['name'])
    .drop_duplicates(subset="Coord")
)
wh_3_me_unique = (
    wh_3_me_renamed
    .assign(Coord=wh_3_me_renamed['name'])
    .drop_duplicates(subset="Coord")
)

# Remove columns left behind by an earlier run of this cell; otherwise a
# re-run would add suffixed _x / _y duplicates instead of refreshing them.
n_rows_before = len(wh)
wh = wh.drop(columns=['5ss_seq', 'MaxEnt_5ss', '3ss_seq', 'MaxEnt_3ss'], errors='ignore')

# Merge wh with the de-duplicated 5ss and 3ss tables. validate= makes pandas
# raise if the right-hand key is ever not unique.
wh = wh.merge(wh_5_me_unique[['Coord', '5ss_seq', 'MaxEnt_5ss']],
              on='Coord', how='left', validate='many_to_one')
wh = wh.merge(wh_3_me_unique[['Coord', '3ss_seq', 'MaxEnt_3ss']],
              on='Coord', how='left', validate='many_to_one')

# A left merge against unique keys must not change the number of rows.
assert len(wh) == n_rows_before, "merge changed the number of rows in wh"



Final wh length: 121888


In [65]:
# Save the annotated table; index=False keeps the row index out of the file.
wh.to_csv(path_or_buf="whippet_CE_gene_name_exon_number_maxent.csv", index=False)

In [None]:
# TODO: every call below is missing its file path. pd.read_csv() with no
# argument raises TypeError, so this cell cannot run as written.
# NOTE(review): the intended input files are not shown anywhere in this
# notebook; fill in the six paths before running.
first_block_skip = pd.read_csv()
middle_block_skip = pd.read_csv()
last_block_skip = pd.read_csv()

first_block_incl = pd.read_csv()
middle_block_incl = pd.read_csv()
last_block_incl = pd.read_csv()