# Analysis of PTMs in ESRP1-expression prostate cancers

This notebook contains all analysis used to generate data for the analysis of ESRP1-mediated splicing in the TCGA prostate cancer dataset (Figure 4/5 of manuscript). To run this notebook, you must have the resulting files from a complete mapping run of the [ExonPTMapper](https://github.com/NaegleLab/ExonPTMapper/tree/main) python package.

## Table of Contents

1. [Load Data](#load-data)
2. [Identifying PTMs impacted by ESRP1-mediated splicing](#esrp1-mediated-splicing-in-prostate-cancer)
    1. [Projecting PTMs onto splicegraph](#projecting-ptms-onto-splicegraph)
    2. [Extract PTMs for specific TCGA datasets](#extract-ptms-for-specific-tcga-datasets)
    3. [Annotate PTMs from Splicegraph with functional information](#annotate-ptms-from-splicegraph)
    4. [Find altered flanking sequences in TCGA data](#find-altered-flanking-sequences)
3. [Analysis of ESRP1-control of PTMs](#analyze)
    1. [Gene Set Enrichment with EnrichR](#gene-set-enrichment-with-enrichrgseapy)
    2. [Exon Ontology Analysis](#exon-ontology)
    3. [PTM-associated Interactions](#interaction-data)
    4. [Flanking Sequence Analysis](#flanking-sequence-analysis)
        1. [Changes in PTM flanking sequences](#changes-in-ptm-flanking-sequences)
        2. [Kinase Library Analysis](#kinase-library-analysis)

### Load Data

In [None]:
import pandas as pd
import os
from ExonPTMapper import mapping, config

from tqdm import tqdm

#PTM-POSE functions
from PTM_POSE import project, annotate

#import custom statistic functions
import stat_utils

#location of figshare data
#defaults to the original author's path; set the PTM_SPLICING_FIGSHARE_DIR environment variable to run on another machine
figshare_dir = os.environ.get('PTM_SPLICING_FIGSHARE_DIR', 'C:/Users/Sam/OneDrive/Documents/GradSchool/Research/Splicing/Paper_Prep/PTM_Splicing_FigShare_Update/')
#all downstream paths are built by string concatenation, so the directory must end with a separator
if not figshare_dir.endswith(('/', '\\')):
    figshare_dir += '/'
#where to find projected PTMs
ptm_data_dir = figshare_dir + 'PTM_Projection_Data/'
#where to find information from different databases
database_dir = figshare_dir + 'External_Data/'
#where analysis data will be saved
analysis_dir = figshare_dir + '/Analysis_For_Paper/'



#load ptm_coordinates (chromosome names are kept as strings so that 'X', 'Y', etc. and numeric names share one dtype)
ptm_coordinates = pd.read_csv(ptm_data_dir + '/processed_data_dir/ptm_coordinates.csv', index_col = 0, dtype = {'Chromosome/scaffold name': str})
#splicegraph information from spliceseq; sep=r'\s+' is the documented replacement for the deprecated delim_whitespace=True
spliceseq = pd.read_csv(database_dir + '/TCGA_SpliceSeq/TCGASpliceData.txt', sep = r'\s+')


## Identifying PTMs impacted by ESRP1-mediated splicing

### Project PTMs onto splicegraph

In [485]:

#PTM-POSE functions
from PTM_POSE import project

#load spliceseq (whitespace-delimited table); sep=r'\s+' replaces the deprecated delim_whitespace=True
spliceseq = pd.read_csv(database_dir + '/TCGA_SpliceSeq/TCGASpliceData.txt', sep = r'\s+') #splicegraph information from spliceseq
#unique exon identifier of the form '<gene symbol>_<exon number>', used as the event id during projection
spliceseq['Exon Label'] = spliceseq['Symbol'] + '_' + spliceseq['Exon'].astype(str)

#project PTMs onto splicing events
spliceseq, spliceseq_ptms = project.project_ptms_onto_splice_events(spliceseq, ptm_coordinates, gene_col = 'Symbol', chromosome_col = 'Chromosome', strand_col = 'Strand', region_start_col = 'Chr_Start', region_end_col = 'Chr_Stop', event_id_col = 'Exon Label', coordinate_type = 'hg19', separate_modification_types = False, PROCESSES = 6)

#set PTM and exon labels to use throughout analysis
#label is '<accession>_<residue><position>' when a canonical position exists (x == x is False only for NaN),
#otherwise fall back to the first entry of 'Source of PTM'
spliceseq_ptms['PTM'] = spliceseq_ptms.apply(
    lambda row: row['UniProtKB Accession'] + '_' + row['Residue'] + str(row['PTM Position in Canonical Isoform'])
    if row['PTM Position in Canonical Isoform'] == row['PTM Position in Canonical Isoform']
    else row['Source of PTM'].split(';')[0],
    axis = 1,
)
#exon number is the second '_'-separated field of the region id
spliceseq_ptms['Exon'] = spliceseq_ptms['Region ID'].apply(lambda x: x.split('_')[1])

#save data
spliceseq_ptms.to_csv(analysis_dir + '/TCGA/splicegraph_ptms.csv', index = False)
spliceseq.to_csv(analysis_dir + '/TCGA/splicegraph_exons.csv', index = False)

  spliceseq = pd.read_csv("./Data/TCGASpliceData.txt", delim_whitespace=True) #SpliceSeq Data
  ptm_coordinates = pd.read_csv('../PTM_POSE/Resource_Files/ptm_coordinates.csv', index_col = 0, dtype = {'Chromosome/scaffold name': str})
  return bound(*args, **kwds)


### Extract PTMs for specific TCGA datasets

In [None]:
import spliceseq_analysis as ss
import numpy as np

#significance thresholds; passed below as alpha, effect_size and min_psi_range when extracting significant exons/PTMs
max_p_value = 0.01
min_effect_size = 0.3
min_psi_range = 0.25

#TCGA cohort to analyze; used to locate the SpliceSeq files and passed to the analysis object and to save_data
tissue = 'PRAD'
spliceseq_dir = database_dir + f'TCGA/{tissue}/TCGA_SpliceSeq/'
#NOTE(review): f'' is an empty string, so odir is simply analysis_dir
data = ss.ESRP1_analysis(odir = analysis_dir + f'', tissue = tissue, spliceseq_dir=spliceseq_dir, include_ME = False)
#presumably Mann-Whitney comparisons of PSI between ESRP1 groups (inferred from the _MW suffix) -- confirm in spliceseq_analysis
data.compare_PSI_for_events_MW(cutoff = 1, min_patients_in_group = 3)
data.compare_PSI_for_exons_MW(min_patients = 3)
data.add_ptms_to_data()
#populate data.sig_exons / data.sig_ptms, which the enrichment and interaction sections below read
data.extract_significant_exons(effect_size = min_effect_size, alpha = max_p_value, min_psi_range = min_psi_range)
data.extract_significant_ptms(effect_size = min_effect_size, alpha = max_p_value, min_psi_range = min_psi_range, duplicate_handling = 'conflicting_any')
#NOTE(review): the f-string has no placeholders; the resulting path is analysis_dir + './TCGA/'
data.save_data(odir = analysis_dir + f'./TCGA/', tissue=tissue)


### Annotate PTMs from Splicegraph

In [None]:
#add expected flanking sequence for each ptm
ptm_info = pd.read_csv(ptm_data_dir + '/processed_data_dir/ptm_info.csv')
#build a '<Protein>_<Residue><position>' key so ptm_info can be joined to ptm_coordinates['Source of PTM']
ptm_info['PTM'] = ptm_info['Protein'] + '_' + ptm_info['Residue'] + ptm_info['PTM Location (AA)'].astype(str)
#add flanking sequence
#drop any existing column first so that re-running this cell does not produce duplicated/suffixed columns
if 'Expected Flanking Sequence' in ptm_coordinates.columns:
    ptm_coordinates = ptm_coordinates.drop('Expected Flanking Sequence', axis = 1)
#NOTE(review): assumes the index of ptm_coordinates is named 'Genomic Coordinates' (it is restored by name after the merge) -- confirm
ptm_coordinates = ptm_coordinates.reset_index().merge(ptm_info[['PTM','Flanking Sequence']], left_on = 'Source of PTM', right_on = 'PTM', how = 'left').set_index('Genomic Coordinates')
#the join key is no longer needed once the flanking sequence has been attached
ptm_coordinates = ptm_coordinates.drop('PTM', axis = 1)
ptm_coordinates = ptm_coordinates.rename(columns={'Flanking Sequence':'Expected Flanking Sequence'})

In [None]:
#if obtained exon ontology data run this block
def process_exont(functions):
    """
    Given functional annotations from ExonOntology, process them into a format that can be appended to annotated ptms data

    Parameters
    ----------
    functions: pandas.DataFrame
        ExonOntology annotation table with at least the columns 'Name', 'Feature category' and 'Feature name'.
        The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per region with columns 'Region ID', 'EXONT:Category' and 'EXONT:Name'. The two annotation
        columns hold the unique (sorted) values for the region joined by ';'. Region IDs that contain no '.'
        get '.0' appended.
    """
    #functions = functions.dropna(subset = 'Feature category')
    #missing values become a literal label so the ';'.join below only ever sees strings
    annotations = functions.replace(np.nan, 'No Annotation')
    #explicit copy: the filtered frame gets a new column below, and assigning onto a slice triggers chained-assignment warnings
    annotations = annotations[annotations['Feature category'] != 'PTM'].copy()

    #region id is the event name with everything from the first '_Low', then the first '_High', removed
    annotations['Region ID'] = annotations['Name'].map(lambda name: name.split('_Low')[0].split('_High')[0])

    #collapse to one row per region, keeping each distinct category/name once
    annotations = annotations.groupby('Region ID')[['Feature category', 'Feature name']].agg(lambda x: ';'.join(np.unique(x)))
    annotations.columns = ['EXONT:Category', 'EXONT:Name']
    #update region ids to be floats (append '.0' when the id has no decimal part)
    annotations = annotations.rename(index = lambda region: region if '.' in region else region + '.0').reset_index()
    return annotations

#NOTE(review): these ExonOntology result files live next to the upload files written in the "Exon Ontology" section further down;
#they appear to exist only after those cells have been run and the results downloaded, so this cell cannot run on a first pass
#annotations for spliced exons (header = 1: the column names are on the second line of the ';'-separated file)
functions = pd.read_csv(analysis_dir + f'/TCGA/ExonLevel/ExonOntology/ExonData_Results/ExonAnnotations.csv', header = 1, sep = ';')
functions = process_exont(functions)
#annotations for exons containing PTMs with altered flanking sequences
functions_flank = pd.read_csv(analysis_dir + f'/TCGA/ExonLevel/ExonOntology/Flank_Data_Results/ExonAnnotations.csv', header = 1, sep = ';')
functions_flank = process_exont(functions_flank)
#stack both sources; the original row indices are kept, so index values can repeat
functions = pd.concat([functions, functions_flank])

In [None]:
#annotated_ptms = data.PSI_ptms.copy() # ptms projected onto splice events
annotated_ptms = data.mapped_ptms[data.mapped_ptms['Gene'].isin(data.PSI_events['symbol'].values)]     # ptms associated with prostate proteins
#attach ExonOntology annotations by region; PTMs in regions without annotations keep NaN in the EXONT columns
annotated_ptms = annotated_ptms.merge(functions, on = 'Region ID', how = 'left')
#keep one row per unique PTM/annotation combination
annotated_ptms = annotated_ptms[['Source of PTM', 'Gene','UniProtKB Accession', 'Residue', 'PTM Position in Canonical Isoform', 'Modification Class', 'EXONT:Category', 'EXONT:Name']].drop_duplicates()
#one row per modification class (classes are ';'-separated in the source column)
annotated_ptms['Modification Class'] = annotated_ptms["Modification Class"].str.split(';')
annotated_ptms = annotated_ptms.explode("Modification Class")

#remove cases in which multiple protein ids exist (rows whose accession contains ';' are dropped, not split)
annotated_ptms = annotated_ptms[~annotated_ptms['UniProtKB Accession'].str.contains(';')]

In [None]:
#positions are cast to float before annotation -- presumably the dtype the PTM_POSE annotate functions match on; confirm
annotated_ptms['PTM Position in Canonical Isoform'] = annotated_ptms["PTM Position in Canonical Isoform"].astype(float)

#annotate ptms
from PTM_POSE import annotate

#each call returns the frame with additional annotation columns (e.g. 'PSP:Kinase', 'RegPhos:Kinase', 'PSP:ON_PROT_INTERACT',
#'PTMcode:Interprotein_Interactions', which later cells read)
annotated_ptms = annotate.add_PSP_regulatory_site_data(annotated_ptms, database_dir + '/PhosphoSitePlus/Regulatory_sites.gz')
annotated_ptms = annotate.add_PSP_disease_association(annotated_ptms, database_dir + '/PhosphoSitePlus/Disease-associated_sites.gz')
annotated_ptms = annotate.add_PSP_kinase_substrate_data(annotated_ptms, database_dir + '/PhosphoSitePlus/Kinase_Substrate_Dataset.gz')
annotated_ptms = annotate.add_ELM_interactions(annotated_ptms, database_dir + '/ELM/elm_interactions.tsv')
#NOTE(review): no file path is passed for PTMint, DEPOD or RegPhos -- presumably these use package defaults or downloads; confirm
annotated_ptms = annotate.add_PTMint_data(annotated_ptms)
annotated_ptms = annotate.add_PTMcode_interprotein(annotated_ptms, fname = database_dir + '/PTMcode/PTMcode2_associations_between_proteins.txt.gz')
annotated_ptms = annotate.add_DEPOD_phosphatase_data(annotated_ptms)
annotated_ptms = annotate.add_RegPhos_data(annotated_ptms)

In [526]:
#RegPhos kinase names (upper-cased) that need to be translated to standard gene names
regphos_conversion = {
    'CK2A1': 'CSNK2A1',
    'PKACA': 'PRKACA',
    'ABL1(ABL)': 'ABL1',
}
def combine_kinases(row):
    """
    Merge the PhosphoSitePlus and RegPhos kinase annotations of a single PTM.

    Parameters
    ----------
    row: mapping (e.g. a pandas Series)
        must provide 'PSP:Kinase' and 'RegPhos:Kinase', each either a ';'-separated string or NaN

    Returns
    -------
    str or float
        unique kinase names, sorted and joined by ';', or NaN if neither source lists a kinase.
        RegPhos names are upper-cased and translated through regphos_conversion; PSP names are used as given.
    """
    def _as_list(value):
        #NaN is the only value that does not compare equal to itself
        return value.split(';') if value == value else []

    psp_kinases = _as_list(row['PSP:Kinase'])
    regphos_kinases = [
        regphos_conversion.get(name.upper(), name.upper())
        for name in _as_list(row['RegPhos:Kinase'])
    ]
    merged = np.unique(psp_kinases + regphos_kinases)
    return ';'.join(merged) if len(merged) > 0 else np.nan

#merge the 'PSP:Kinase' and 'RegPhos:Kinase' annotations of every PTM into a single ';'-separated column
annotated_ptms['Combined:Kinase'] = annotated_ptms.apply(combine_kinases, axis = 1)

In [531]:
#save annotated PTMs (row index is not written)
annotated_ptms.to_csv(analysis_dir + '/TCGA/PRAD_annotated_ptms.csv', index = False)

### Find altered flanking sequences

In addition to differential inclusion of exons, we sought to identify cases in which a PTM site may be conserved in the isoform but have altered flanking residues due to changes in an adjacent exon/exon region, which could lead to different binding interactions. To do so, we projected PTMs onto exons adjacent to splice events and identified cases in which the flanking sequence differed depending on the inclusion of adjacent exons.

In [None]:
#extract changed flanking sequences depending on inclusion of adjacent exons
data.get_changed_flanking_sequences(ptm_coordinates)
#'Matched' is True when the flanking sequence is identical whether the adjacent exon is included or excluded
#NOTE(review): the column names differ in number ('Inclusion Flanking Sequence' vs 'Exclusion Flanking Sequences') --
#confirm both match the columns produced by get_changed_flanking_sequences
data.flanking_sequences['Matched'] =data.flanking_sequences["Inclusion Flanking Sequence"] == data.flanking_sequences['Exclusion Flanking Sequences']
#unlike the other saves in this notebook, the row index is written to the file here
data.flanking_sequences.to_csv(analysis_dir + '/TCGA/changed_flank_sequences.csv')

## Analyze

### Gene Set Enrichment with EnrichR/Gseapy

To assess the general function/properties of spliced genes, as well as those with impacted PTMs, we utilized the gseapy package to interface with EnrichR web services. First we defined the gene lists to run through enrichment analysis, based on which genes were spliced and contained impacted PTMs

In [33]:
import gseapy as gp

#construct background set from the splicegraph, restricting to genes found in prostate splicing data
prad_splicegraph= data.splicegraph[data.splicegraph['Symbol'].isin(data.PSI_events['symbol'].unique())]

#gene information (total and spliced genes)
total_genes = prad_splicegraph['Symbol'].unique()
#NOTE(review): sig_exons is indexed with lowercase 'symbol' here but with 'Symbol' in the Exon Ontology section -- confirm both columns exist
total_genes_with_controlled_exons = data.sig_exons['symbol'].unique()
total_genes_with_controlled_ptms = data.sig_ptms['Gene'].unique()
#genes whose only PTM impact is an altered flanking sequence (i.e. not already counted as having a differentially included PTM)
total_genes_with_altered_ptms_only = set(data.flanking_sequences.loc[~data.flanking_sequences['Matched'], 'Gene'].unique()).difference(total_genes_with_controlled_ptms)

#establish background and foreground sets for gene set enrichment analysis
#NOTE(review): background_ptms and background_exons are not used by the EnrichR calls in the next cell, which pass total_genes as background
background_ptms = list(total_genes_with_controlled_exons)
#genes with any impacted PTM (differential inclusion or altered flank)
foreground_ptms = list(total_genes_with_controlled_ptms) + list(total_genes_with_altered_ptms_only)
background_exons = list(total_genes_with_controlled_exons)
#spliced genes with no impacted PTM
foreground_exons = set(total_genes_with_controlled_exons).difference(foreground_ptms)


We then ran these gene lists through EnrichR

In [36]:
#use gene ontology, reactome, and kegg gene sets
subgene_set_list  = ['KEGG_2021_Human', 'GO_Biological_Process_2023', 'GO_Cellular_Component_2023', 'GO_Molecular_Function_2023','Reactome_2022']


#perform gene set enrichment analysis and save data
#every test uses the same background (all genes in the prostate splicegraph); 'Test Type' records which gene list was tested
#all genes with a significant exon
enr_all = gp.enrichr(list(total_genes_with_controlled_exons), background = total_genes, gene_sets = subgene_set_list, organism='human')
enr_all.results['Test Type'] = 'All' 
#genes with at least one impacted PTM
enr_ptms = gp.enrichr(list(foreground_ptms), background = total_genes, gene_sets = subgene_set_list, organism='human')
enr_ptms.results['Test Type'] = 'PTMs Impacted' 
#spliced genes without an impacted PTM
enr_exons = gp.enrichr(list(foreground_exons), background = total_genes, gene_sets = subgene_set_list, organism='human')
enr_exons.results['Test Type'] = 'PTMs Unaffected'
#stack the three result tables; the original row indices are kept, so index values repeat
results = pd.concat([enr_all.results, enr_exons.results, enr_ptms.results])

In [37]:
#save the combined EnrichR results for all three gene lists (row index is written as the first column)
results.to_csv(analysis_dir + '/TCGA/Enrichr/Combined_results.csv')

### Exon Ontology

To identify potential functions of PTMs, we processed splicing data of PTM-containing spliced exons and exons with PTMs with altered flanking sequences for running through the ExonOntology web server. The produced files here were uploaded to the ExonOntology web server and the results were downloaded.

#### Process spliced PTM-containing exons

In [165]:
tissue = 'PRAD'
#create label for each event: '<gene symbol>_<exon>_<ESRP1 group>'
exon_data_sig = data.sig_exons.copy()
exon_data_sig['label'] = exon_data_sig['Symbol'] + '_' + exon_data_sig['Individual exon'] + '_' + exon_data_sig['ESRP1_MW']



#using information from splicegraph, identify the genomic start and end of each significant exon and add to data
chromosome_list = []
start_list = []
end_list = []
for i, row in exon_data_sig.iterrows():
    #grab coordinates ('Individual exon' is cast to float to match the 'Exon' column of the splicegraph)
    exon = float(row['Individual exon'])
    gene = row['Symbol']
    #a single exon defines both ends of the region, so one lookup is sufficient
    #(.iloc[0] raises IndexError if the exon is missing from the splicegraph)
    exon_row = data.splicegraph.loc[(data.splicegraph['Symbol'] == gene) & (data.splicegraph['Exon'] == exon)].iloc[0]

    #extract all needed information from splicegraph
    chromosome = exon_row['Chromosome']
    strand = exon_row['Strand']
    #for exons not on the '+' strand the two coordinates are swapped
    if strand == '+':
        start = exon_row['Chr_Start']
        end = exon_row['Chr_Stop']
    else:
        start = exon_row['Chr_Stop']
        end = exon_row['Chr_Start']

    #collect coordinates exactly as stored in the splicegraph (no genome-build conversion is performed here)
    chromosome_list.append(chromosome)
    start_list.append(start)
    end_list.append(end)

#store in data
exon_data_sig['CHROMOSOME'] = chromosome_list
exon_data_sig['EXON START'] = start_list
exon_data_sig['EXON END'] = end_list


#grab only columns needed for the ExonOntology upload file (written without header or index)
exon_data_sig = exon_data_sig[['label', 'CHROMOSOME', 'EXON START', 'EXON END']]
exon_data_sig.to_csv(analysis_dir + '/TCGA/ExonLevel/ExonOntology/exon_data_sig_for_upload.csv', header=False, index = False)

#### Process data for Exons with PTMs with altered Flanking Regions

In [136]:
#using information from splicegraph, identify the genomic start and end of each exon region and add to data
chromosome_list = []
start_list = []
end_list = []
#only PTMs whose flanking sequence differs between inclusion and exclusion
flank_data = data.flanking_sequences[~data.flanking_sequences['Matched']].copy()
for i, row in flank_data.iterrows():
    #grab coordinates (.iloc[0] raises IndexError if the region is missing from the splicegraph)
    #NOTE(review): requires data.splicegraph to carry an 'Exon Label' column in the same format as 'Region ID' -- confirm
    start_row = data.splicegraph.loc[data.splicegraph['Exon Label'] == row['Region ID']].iloc[0]

    #extract all needed information from splicegraph
    chromosome = start_row['Chromosome']
    strand = start_row['Strand']
    #for exons not on the '+' strand the two coordinates are swapped
    if strand == '+':
        start = start_row['Chr_Start']
        end = start_row['Chr_Stop']
    else:
        start = start_row['Chr_Stop']
        end = start_row['Chr_Start']

    #collect coordinates exactly as stored in the splicegraph (no genome-build conversion is performed here)
    chromosome_list.append(chromosome)
    start_list.append(start)
    end_list.append(end)

flank_data['CHROMOSOME'] = chromosome_list
flank_data['EXON START'] = start_list
flank_data['EXON END'] = end_list


#grab only columns needed for the ExonOntology upload file (written without header or index)
flank_data = flank_data[['Region ID', 'CHROMOSOME', 'EXON START', 'EXON END']]
flank_data.to_csv(analysis_dir + '/TCGA/ExonLevel/ExonOntology/flank_data_for_upload.csv', header=False, index = False)

### Interaction Data

We sought to identify PTM-associated interactions that may be impacted by ESRP1-splicing control of PTMs, pulling interaction data from PhosphoSitePlus, RegPhos, and PTMcode. From this, we can identify proteins that are most impacted by ESRP1-splicing.

In [3]:
#NOTE(review): this rebinds `config`, which the first cell imported from ExonPTMapper; from here on `config` is POSE_config
import POSE_config as config
import pandas as pd

#using uniprot to gene name dict, construct dict to go the other direction (gene name to uniprot id)
name_to_uniprot = pd.DataFrame(config.uniprot_to_gene, index = ['Gene']).T
name_to_uniprot = name_to_uniprot[name_to_uniprot['Gene'] != '']
#an entry may list several space-separated gene names; give each name its own row
name_to_uniprot['Gene'] = name_to_uniprot['Gene'].apply(lambda x: x.split(' '))
name_to_uniprot = name_to_uniprot.explode('Gene')
name_to_uniprot = name_to_uniprot.reset_index()
name_to_uniprot.columns = ['UniProtKB/Swiss-Prot ID', 'Gene name']
#only gene names that map to exactly one UniProt ID (keep = False drops every copy of a duplicated name)
name_to_uniprot_df = name_to_uniprot.drop_duplicates(subset = 'Gene name', keep = False)
#gene names shared by several UniProt IDs get all IDs joined by a space
name_to_uniprot = name_to_uniprot.groupby('Gene name').agg(' '.join)
name_to_uniprot_dict = name_to_uniprot.to_dict()['UniProtKB/Swiss-Prot ID']

#grab significant PTMs and process
sig_ptms = data.sig_ptms.copy()
#label is '<accession>_<residue><position>' when a canonical position exists (x == x is False only for NaN), otherwise the first 'Source of PTM' entry
sig_ptms['PTM'] = sig_ptms.apply(lambda x: x['UniProtKB Accession'] + '_' + x['Residue']+ str(int(x['PTM Position in Canonical Isoform'])) if x['PTM Position in Canonical Isoform'] == x['PTM Position in Canonical Isoform'] else x['Source of PTM'].split(';')[0], axis = 1)
sig_ptm_list = sig_ptms['PTM'].unique()

#make sure PTM column of annotated PTMs is in the right format
#NOTE(review): reads data.annotated_ptms, whereas the annotation section builds a notebook-level `annotated_ptms` -- confirm how the attribute is populated
data.annotated_ptms['PTM'] = data.annotated_ptms.apply(lambda x: x['UniProtKB Accession'] + '_' + x['Residue']+ str(int(x['PTM Position in Canonical Isoform'])) if x['PTM Position in Canonical Isoform'] == x['PTM Position in Canonical Isoform'] else x['Source of PTM'].split(';')[0], axis = 1)

In [5]:
#grab info on whether ptm is in domain
ptm_info = pd.read_csv(ptm_data_dir + '/processed_data_dir/ptm_info.csv', index_col = 0)
#explicit copy: a new column is added below, and assigning onto a filtered slice triggers chained-assignment warnings
ptm_info = ptm_info[ptm_info['Isoform Type'] == 'Canonical'].copy()
#label is '<accession without isoform suffix>_<residue><position>', matching the PTM labels built for sig_ptms
ptm_info['PTM'] =ptm_info['Protein'].apply(lambda x: x.split('-')[0])+ '_'+ptm_info['Residue']+ptm_info['PTM Location (AA)'].astype(int).astype(str)
#labels of PTMs flagged as lying inside a domain ('inDomain' is used as a boolean mask)
domain_ptms = ptm_info.loc[ptm_info['inDomain'], 'PTM']

#### Combine data

##### PhosphoSitePlus Interactions

We pulled protein interaction data from [PhosphoSitePlus](https://www.phosphosite.org/homeAction.action) and processed it to append the UniProt IDs of the interacting proteins.

In [7]:
#dictionary to convert phosphositeplus names that are not standard gene names to UniProt IDs
#values holding several IDs separate them with ';' (e.g. 'HSP70', 'syntenin')
#NOTE(review): 'P70S6KB' maps to P23443 here but to Q9UBS0 in ks_genes_to_uniprot -- one of the two is wrong; verify against UniProt
#NOTE(review): 'nNOS' -> Q8WY41 looks suspicious (NOS1 is presumably P29475) -- verify against UniProt
psp_name_dict = {'Actinfilin':'Q6TDP4','14-3-3 zeta':'P63104','14-3-3 epsilon':'P62258','14-3-3 sigma':'P31947','P130Cas':'P56945','ENaC-beta':'P51168','ENaC-alpha':'P37088','14-3-3 eta':'Q04917','14-3-3 beta':'P31946', '14-3-3 gamma':'P61981', '14-3-3 theta':'P27348','Securin':'O95997','GPIbA':'P07359','occludin':'Q16625','ER-beta':'Q92731','53BP1': 'Q12888','4E-T':'Q9NRA8','53BP2':'Q13625','AP-2 beta':'Q92481','APAF':'O14727','Bcl-xL':'Q07817','C/EBP-epsilon':'Q15744','CREB':'P16220','Calmodulin':'P0DP23','Cortactin':'Q14247','DNAPK':'P78527', 'Diaphanous-1':'O60610', 'ER-alpha':'P03372', 'Exportin-1':'O14980', 'Ezrin':'P15311', 'H3':'Q6NXT2','HSP70':'P0DMV8;P0DMV9','IKKG':'Q9Y6K9', 'Ig-beta':'P40259','Ku80':'P13010','LC8':'Q96FJ2', 'MRLC2V':'P10916', 'Merlin':'P35240','NFkB-p105':'P19838', 'Rb':'P06400', 'RhoGDI alpha':'P52565', 'Rhodopsin':'P08100', 'SHP-1':'P29350', 'SHP-2':'Q06124','SLP76':'Q13094','SMRT':'Q9Y618','SRC-3':'Q9Y6Q9','STI1':'Q9BPY8','Vinculin':'P18206','beclin 1':'Q14457','claspin':'Q9HAW4', 'gp130':'P40189','leupaxin':'O60711','p14ARF':'Q8N726','rubicon':'Q92622','snRNP A':'P09661','snRNP B1':'P08579','snRNP C':'P09234','syntenin':'O00560;Q9H190','talin 1':'Q9Y490', 'ubiquitin':'P0CG47', '4E-BP1':'Q13541', 'ALK2':'Q04771', 'AMPKA1':'Q13131','AurA':'O14965','AurB':'Q96GD4', 'AurC':'Q9UQB9', 'C/EBP-beta':'P17676', 'CAMK1A':'Q14012', 'CHD-3 iso3':'Q12873', 'CK1A':'P48729', 'CK2B':'P67870', 'DAT':'Q01959', 'DJ-1':'Q99497', 'DOR-1':'P41143', 'DYN1':'Q05193','Desmoplakin':'P15924', 'Exportin-4':'Q9C0E2', 'FBPase':'P09467', 'FBPase 2':'O60825', 'G-alpha':'P63096', 'G-alpha 13':'Q14344', 'G-alpha i1':'P63096', 'G-beta 1':'P62873', 'G-beta 2':'P62879', 'G6PI':'P06744', 'GM130':'Q08379', 'GR':'P04150', 'H4':'P62805', 'HP1 alpha':'P45973', 'IkB-alpha':'P25963', 'IkB-beta':'Q15653', 'PPAR-gamma':'P37231', 'Claudin-1':'O95832', 'Claudin-2':'P57739', 'Cofilin-1':'P23528', 'K14':'P02533', 'K18':'P05783', 'K5':'P13647','K8':'P05787','Ku70':'P12956', 
'Moesin':'P26038','N-WASP':'O00401','Nur77':'P22736','P38A':'Q16539','P38B':'Q15759', 'P70S6KB':'P23443','PGC-1 alpha':'Q9UBK2','PKHF1':'Q96S99','P38G':'P53778','PKCI':'P41743','PKCZ':'Q05513', 'PKG1':'Q13976', 'PTP-PEST':'Q05209','Plectin-1':'Q15149','RFA2':'P15927','SERCA2':'P16615','SH2-B-beta':'Q9NRF2', 'SNAP-alpha':'P54920', 'SPT16':'Q9BXB7', 'SPT6':'Q7KZ85','STEP':'P54829','STLK3':'Q9UEW8', 'Snail1':'O95863', 'Snail2':'O43623', 'Stargazin':'P62955','Survivin':'O15392','TARP':'P09693','TK':'P04183','TOM20':'Q15388','TR-alpha':'P10827','Titin':'Q8WZ42','Vimentin':'P08670','WASP':'P42768','ZAP':'Q7Z2W4',  'Zyxin':'Q15942', 'cIAP1':'Q13490','caveolin-1':'Q03135', 'coronin 2A':'Q92828', 'desmin':'P17661','eIF2-alpha':'Q9BY44', 'eIF2-beta':'P20042', 'eIF3-alpha':'O75822', 'eIF3-eta':'P55884', 'eIF3-zeta':'O15371', 'eNOS':'P29474', 'emerin':'P50402', 'epsin 1':'Q9Y6I3', 'glutaminase':'O94925','hnRNP A1':'P09651', 'hnRNP A2/B1':'P22626', 'hnRNP A3':'P51991','hnRNP D0':'Q14103', 'hnRNP E2':'Q15366','hnRNP P2':'P35637','hnRNP U':'Q00839', 'kindlin-2':'Q96AC1', 'kindlin-3':'Q86UX7','lamin A/C':'P02545', 'mucolipin 1':'Q9GZU1','nNOS':'Q8WY41','p21Cip1':'P38936', 'p27Kip1':'P46527','p47phox':'P14598','p90RSK':'Q15418','palladin':'Q8WX93','polybromo 1':'Q86U86', 'syndecan-4':'P31431', 'tensin 1 iso1':'Q9HBL0', 'utrophin':'P46939','DKFZp686L1814':'Q6MZP7', 'EB1':'Q15691', 'EB2':'Q15555', 'G-alpha i3':'P08754','HSP20':'O14558','HSP40':'P25685', 'Hic-5':'O43294', 'Ig-alpha':'P11912', 'LC3A':'Q9H492', 'LC3B':'Q9GZQ8', 'LC3C':'Q9BXW4','NFkB-p100':'Q00653','NFkB-p65':'Q04206','Pnk1':'Q96T60', 'RPT2':'P62191','EB3':'Q9UPY8'}

In [8]:
#extract PSP data from annotated PTMs, separate cases in which a single PTM has multiple interactions
psp_interact = data.annotated_ptms.dropna(subset = 'PSP:ON_PROT_INTERACT')[['Gene', 'UniProtKB Accession', 'Residue', 'PTM Position in Canonical Isoform', 'Modification Class', 'PTM','PSP:DOMAIN','PSP:ON_PROT_INTERACT']]
psp_interact['PSP:ON_PROT_INTERACT'] = psp_interact['PSP:ON_PROT_INTERACT'].apply(lambda x: x.split(';'))
psp_interact = psp_interact.explode('PSP:ON_PROT_INTERACT')

#extract type of interaction and interacting protein
#entries are parsed as '<protein name>(<type>)'; an entry without '(' raises IndexError on the next line
psp_interact['Type'] = psp_interact['PSP:ON_PROT_INTERACT'].apply(lambda x: x.split('(')[1].split(')')[0])
psp_interact['Interacting Protein'] = psp_interact['PSP:ON_PROT_INTERACT'].apply(lambda x: x.split('(')[0].strip(' '))
#attach the splicing effect; PTMs that are not significant keep NaN in deltaPSI_MW
psp_interact = psp_interact.merge(sig_ptms[['PTM', 'deltaPSI_MW']], on = 'PTM', how = 'left')

#identify if in domain based ProteomeScout or PSP data
#True when the PTM is in domain_ptms or when 'PSP:DOMAIN' is not NaN (x == x is False only for NaN)
psp_interact['In Domain'] = psp_interact.apply(lambda x: (x['PTM'] in domain_ptms.values) | (x['PSP:DOMAIN'] == x['PSP:DOMAIN']), axis = 1)

#convert interacting protein to uniprot id
interacting_id = []
missed_genes = []
for gene in psp_interact['Interacting Protein']:
    #try progressively looser matches against known gene names before falling back to the manual PSP dictionary
    if gene in name_to_uniprot_dict: #if PSP name is gene name found in uniprot
        interacting_id.append(name_to_uniprot_dict[gene])
    elif gene.upper() in name_to_uniprot_dict: #case-insensitive match
        interacting_id.append(name_to_uniprot_dict[gene.upper()])
    elif gene.split(' ')[0].upper() in name_to_uniprot_dict: #first word only
        interacting_id.append(name_to_uniprot_dict[gene.split(' ')[0].upper()])
    elif gene.replace('-', '').upper() in name_to_uniprot_dict: #hyphens removed
        interacting_id.append(name_to_uniprot_dict[gene.replace('-', '').upper()])
    elif gene in psp_name_dict: # if PSP name is not gene name, but is in conversion dictionary
        interacting_id.append(psp_name_dict[gene])
    else: #otherwise note that gene was missed
        interacting_id.append(np.nan)
        missed_genes.append(gene)

#save information
psp_interact['Interacting ID'] = interacting_id
#interactions whose partner could not be mapped are dropped
psp_interact = psp_interact.dropna(subset = 'Interacting ID')
#psp_name_dict values may hold several ';'-separated IDs; give each its own row
#(IDs taken from name_to_uniprot_dict are space-separated and are left joined here)
psp_interact['Interacting ID'] = psp_interact['Interacting ID'].str.split(';')
psp_interact = psp_interact.explode('Interacting ID')
psp_interact['Interaction Source'] = 'PhosphoSitePlus'

In [9]:
#some genes were not captured and were removed for various reasons (often not human in origin)
#display the distinct PSP names that could not be mapped to a UniProt ID
np.unique(missed_genes)

array(['E7', 'LANA', 'NPM-ALK', 'YpkA'], dtype='<U7')

##### Kinase-substrate Data

Kinase-substrate information downloaded from [PhosphoSitePlus](https://www.phosphosite.org/homeAction.action) and [RegPhos](http://140.138.144.141/~RegPhos/index.php)

In [10]:
#manual conversion of kinase names that are not standard gene names to UniProt IDs
#(only consulted after all gene-name based lookups have failed)
#NOTE(review): 'P70S6K' and 'P70S6KB' both map to Q9UBS0, while psp_name_dict maps 'P70S6KB' to P23443 -- verify against UniProt
ks_genes_to_uniprot = {
    'ABL1(ABL)':'P00519', 'ACK':'Q07912', 'AURC':'Q9UQB9',
    'ERK1(MAPK3)':'P27361', 'ERK2(MAPK1)':'P28482', 'ERK5(MAPK7)':'Q13164',
    'JNK1(MAPK8)':'P45983', 'CK1A':'P48729', 'JNK2(MAPK9)':'P45984', 'JNK3(MAPK10)':'P53779',
    'P38A(MAPK14)':'Q16539', 'P38B(MAPK11)':'Q15759', 'P38G(MAPK12)':'P53778',
    'P70S6K':'Q9UBS0', 'PAK':'Q13153', 'PKCZ':'Q05513', 'CK2A':'P19784', 'ABL2':'P42684',
    'AMPKA1':'Q13131',
    #PRKAA2; previously mapped to Q13131, which is the accession already used for AMPKA1 (PRKAA1)
    'AMPKA2':'P54646',
    'AURB':'Q96GD4', 'CAMK1A':'Q14012', 'CDC42BP':'Q9Y5S2', 'CK1D':'P48730', 'CK1E':'P49674',
    'CK2B':'P67870', 'DMPK1':'Q09013', 'DNAPK':'P78527', 'DSDNA KINASE':'P78527',
    'EG3 KINASE':'P49840', 'ERK3(MAPK6)':'Q16659', 'GSK3':'P49840', 'MRCKA':'Q5VT25',
    'P38D(MAPK13)':'O15264', 'P70S6KB':'Q9UBS0', 'PDKC':'P78527', 'PKCH':'P24723',
    'PKCI':'P41743', 'PKCT':'Q04759', 'PKD3':'O94806', 'PKG1':'Q13976', 'PKG2':'Q13237',
    'SMMLCK':'Q15746',
}

In [11]:
#extract combined PSP and RegPhos kinase-substrate data
term =  'Combined:Kinase'
ks_interact = data.annotated_ptms.dropna(subset = term)[['Gene', 'UniProtKB Accession', 'Residue', 'PTM Position in Canonical Isoform', 'Modification Class', 'PTM', 'PSP:DOMAIN',term]]
#every kinase-substrate pair is recorded with the same interaction type
ks_interact['Type'] = 'REGULATES'

#separate PTMs with multiple kinases
ks_interact[term] = ks_interact[term].str.split(';')
ks_interact = ks_interact.explode('Combined:Kinase')

#identify if in domain based ProteomeScout or PSP data
#True when the PTM is in domain_ptms or when 'PSP:DOMAIN' is not NaN (x == x is False only for NaN)
ks_interact['In Domain'] = ks_interact.apply(lambda x: (x['PTM'] in domain_ptms.values) | (x['PSP:DOMAIN'] == x['PSP:DOMAIN']), axis = 1)

#convert interacting protein to uniprot id
interacting_id = []
missed_genes = []
for gene in ks_interact['Combined:Kinase']:

    #same lookup cascade as for the PhosphoSitePlus interactions: exact, upper-case, first word, hyphen-free, then manual dictionary
    if gene in name_to_uniprot_dict:
        interacting_id.append(name_to_uniprot_dict[gene])
    elif gene.upper() in name_to_uniprot_dict:
        interacting_id.append(name_to_uniprot_dict[gene.upper()])
    elif gene.split(' ')[0].upper() in name_to_uniprot_dict:
        interacting_id.append(name_to_uniprot_dict[gene.split(' ')[0].upper()])
    elif gene.replace('-', '').upper() in name_to_uniprot_dict:
        interacting_id.append(name_to_uniprot_dict[gene.replace('-', '').upper()])
    elif gene in ks_genes_to_uniprot:
        interacting_id.append(ks_genes_to_uniprot[gene])
    else:
        #kinase could not be mapped; record it and drop the row below
        interacting_id.append(np.nan)
        missed_genes.append(gene)

#save data
ks_interact['Interacting ID'] = interacting_id
ks_interact = ks_interact.dropna(subset = 'Interacting ID').drop_duplicates()
ks_interact["Interaction Source"] = "PSP/RegPhos"
#attach the splicing effect; PTMs that are not significant keep NaN in deltaPSI_MW
ks_interact = ks_interact.merge(sig_ptms[['PTM', 'deltaPSI_MW']], on = 'PTM', how = 'left')

##### PTM-code Interactions

We pulled interactions from [PTMcode](https://ptmcode.embl.de/), which are largely extracted based on experimental and computational structural information

In [12]:
#extract ptm code interaction information
#NOTE(review): `term` is assigned but the column name is spelled out literally in the statements below
term =  'PTMcode:Interprotein_Interactions'

#separate cases in which a single PTM has multiple interactions
ptmcode_interact = data.annotated_ptms.dropna(subset = 'PTMcode:Interprotein_Interactions')[['Gene', 'UniProtKB Accession', 'Residue', 'PTM Position in Canonical Isoform', 'Modification Class', 'PTM', 'PSP:DOMAIN','PTMcode:Interprotein_Interactions']]
#each ';'-separated entry is reduced to the part before its first '_' (taken as the interacting protein id); duplicates are collapsed
ptmcode_interact['Interacting ID'] = ptmcode_interact['PTMcode:Interprotein_Interactions'].apply(lambda x: ';'.join(np.unique([ptm.split('_')[0] for ptm in x.split(';')])))
ptmcode_interact['Interacting ID'] = ptmcode_interact['Interacting ID'].str.split(';')
ptmcode_interact = ptmcode_interact.explode('Interacting ID')

#add additional information
ptmcode_interact['Interaction Source'] = "PTMcode"
#every PTMcode interaction is recorded with the same interaction type
ptmcode_interact['Type'] = 'INDUCES'
#True when the PTM is in domain_ptms or when 'PSP:DOMAIN' is not NaN (x == x is False only for NaN)
ptmcode_interact['In Domain'] = ptmcode_interact.apply(lambda x: (x['PTM'] in domain_ptms.values) | (x['PSP:DOMAIN'] == x['PSP:DOMAIN']), axis = 1)
#attach the splicing effect; PTMs that are not significant keep NaN in deltaPSI_MW
ptmcode_interact = ptmcode_interact.merge(sig_ptms[['PTM', 'deltaPSI_MW']], on = 'PTM', how = 'left')

##### Combine data sources

In [16]:
#columns shared by every interaction source, in the order used for the combined table
interaction_columns = ['PTM', 'UniProtKB Accession', 'Interacting ID', 'Type', 'deltaPSI_MW', 'In Domain', 'Interaction Source']
psp_interact = psp_interact[interaction_columns]
ptmcode_interact = ptmcode_interact[interaction_columns]
ks_interact = ks_interact[interaction_columns]

#combine all interaction data into one dataframe and combine repeat entries across sources
#(rows identical in everything except the source are merged, with their sources joined by ';';
# dropna = False keeps rows whose deltaPSI_MW is NaN)
all_interactions = pd.concat([psp_interact, ptmcode_interact, ks_interact])
all_interactions = all_interactions.groupby(
    interaction_columns[:-1], dropna = False, as_index = False
)['Interaction Source'].agg(';'.join)

#convert uniprot ids back to gene names for interpretability
#only the first id of a space-separated id list, and the first gene name of each entry, is used
ptm_gene = [
    config.uniprot_to_gene[accession].split(' ')[0]
    for accession in all_interactions['UniProtKB Accession']
]
interacting_gene = [
    config.uniprot_to_gene[interacting_id.split(' ')[0]].split(' ')[0]
    for interacting_id in all_interactions['Interacting ID']
]
all_interactions['Modified Gene'] = ptm_gene
all_interactions["Interacting Gene"] = interacting_gene

In [60]:
#save the de-duplicated interaction table (row index is not written)
all_interactions.drop_duplicates().to_csv(analysis_dir + '/TCGA/PTM_Interactions.csv',index = False)

  values = values.astype(str)


#### Process Interaction Data

In [18]:
import networkx as nx
import stat_utils

In [19]:
#extract interaction data
network_data = all_interactions.copy().drop_duplicates()

#generate network with all possible PTM-associated interactions within prostate cohort
#nodes are gene names; several PTM-level rows between the same two genes collapse into a single edge
full_graph = nx.from_pandas_edgelist(network_data, source = 'Modified Gene', target = 'Interacting Gene')
#calculate various centrality measures
degree_centrality = nx.degree_centrality(full_graph)
eigenvector_centrality = nx.eigenvector_centrality(full_graph)
closeness_centrality = nx.closeness_centrality(full_graph)
betweenness_centrality = nx.betweenness_centrality(full_graph)
#one row per node, one column per measure
full_data = pd.DataFrame({'Degree': dict(full_graph.degree()), 'Degree Centrality':degree_centrality, 'Eigenvector':eigenvector_centrality,'Closeness':closeness_centrality,'Betweenness':betweenness_centrality})

#trim network to only include edges associated proteins impacted by splicing (either directly or through targets)
#rows without a deltaPSI_MW value (PTM not significantly spliced) are removed
network_data = network_data.dropna(subset = 'deltaPSI_MW')
sig_graph = nx.from_pandas_edgelist(network_data, source = 'Modified Gene', target = 'Interacting Gene')

#calculate network centrality measures
#(the centrality variables from the full network are overwritten here; the full-network values remain in full_data)
degree_centrality = nx.degree_centrality(sig_graph)
eigenvector_centrality = nx.eigenvector_centrality(sig_graph)
closeness_centrality = nx.closeness_centrality(sig_graph)
betweenness_centrality = nx.betweenness_centrality(sig_graph)
sig_data = pd.DataFrame({'Degree': dict(sig_graph.degree()), 'Degree Centrality':degree_centrality, 'Eigenvector':eigenvector_centrality,'Closeness':closeness_centrality,'Betweenness':betweenness_centrality})
#share of each node's interactions in the full network that remain in the splicing-impacted network
sig_data['Fraction of Interactions Impacted by Splicing'] = sig_data['Degree']/full_data.loc[sig_data.index, 'Degree']

In [21]:
#hypergeometric test per node: is the number of splicing-impacted interactions larger than
#expected given the node's degree in the full network?
interactions_in_full_network = len(full_graph.edges())
interactions_in_sig_network = len(sig_graph.edges())
#arguments are passed in the order: total edges in full network, node degree in full network,
#total edges in impacted network, node degree in impacted network
overrepresentation_test = [
    stat_utils.hypergeom(
        interactions_in_full_network,
        full_data.loc[node, 'Degree'],
        interactions_in_sig_network,
        node_properties['Degree']
    )
    for node, node_properties in sig_data.iterrows()
]
sig_data['p'] = overrepresentation_test

In [25]:
#save network properties of the splicing-impacted network; the index (node names taken from the
#'Modified Gene'/'Interacting Gene' columns) is written as the first column
sig_data.to_csv(analysis_dir + '/TCGA/PTM_Network_Properties.csv')

### Flanking Sequence Analysis

Here, we compare the inclusion and exclusion flanking sequences for each altered flanking sequence event (i.e., the PTM flanking sequence when the adjacent exon is included versus when it is excluded). This includes calculating:
1. Sequence identity
2. Which residues are changing and at what position
3. Whether an ELM motif is specific to either inclusion or exclusion sequence
4. Whether event appears to have introduced a stop codon (won't consider for main figures of manuscript)
5. Changes to kinase library scores

#### Changes in PTM flanking sequences

In [61]:
from Bio import pairwise2
import re
def getSequenceSimilarity(can_flank, alt_flank):
    """
    Score how similar two flanking sequences are with a Biopython global alignment, normalized to the best achievable score (criteria defined by Pillman et al. BMC Bioinformatics 2011)

    Parameters
    ----------
    can_flank: str
        flanking sequence for PTM in canonical protein isoform (used as the reference for normalization)
    alt_flank: str
        flanking sequence for PTM in alternative protein isoform

    Returns
    -------
    normalized_score: float
        alignment score of can_flank vs. alt_flank divided by the alignment score of can_flank vs. itself
    """
    #gap penalties passed to globalxs (identity scoring: match = 1, mismatch = 0; same gap penalties for both sequences)
    gap_open, gap_extend = -10, -2
    #best possible score: the canonical flank aligned against itself
    control_similarity = pairwise2.align.globalxs(can_flank, can_flank, gap_open, gap_extend, score_only = True)
    #observed score: canonical flank aligned against the alternative flank
    actual_similarity = pairwise2.align.globalxs(can_flank, alt_flank, gap_open, gap_extend, score_only = True)
    return actual_similarity/control_similarity


def findAlteredPositions(seq1, seq2, desired_seq_size = 11):
    """
    Compare two equally sized flanking sequences residue by residue and report where they differ, relative to the central residue

    Parameters
    ----------
    seq1, seq2: str
        sequences to compare. Reported residue changes are written as 'seq1 residue->seq2 residue'
    desired_seq_size: int
        size of the flanking sequence to compare. IMPORTANT, this should not be larger than the available flanking sequence in the ptm_info dataframe

    Returns
    -------
    altered_positions: numpy.ndarray
        positions that have changed, relative to the center of the sequence (negative = before the center, positive = after the center)
    residue_change: list
        residue changes ('A->G') associated with each altered position
    flank_side: str
        side of the flanking sequence on which changes occurred: 'N-term only', 'C-term only', 'Both', or 'Unclear' (no changes, or only the central residue changed)

    If the two sequences differ in length, or are not desired_seq_size long, all three return values are np.nan
    """
    seq_size = len(seq1)
    #only sequences of matching, expected size can be compared position by position
    if seq_size != len(seq2) or seq_size != desired_seq_size:
        return np.nan, np.nan, np.nan

    #number of residues on either side of the central residue
    flank_size = (seq_size - 1)/2

    #collect every index where the two sequences disagree
    mismatches = [
        (idx, res1, res2)
        for idx, (res1, res2) in enumerate(zip(seq1, seq2))
        if res1 != res2
    ]
    #shift indices so that the central residue is position 0
    altered_positions = np.array([idx - flank_size for idx, _, _ in mismatches])
    residue_change = [f'{res1}->{res2}' for _, res1, res2 in mismatches]

    #determine which side(s) of the central residue were altered
    n_term = (altered_positions < 0).any()
    c_term = (altered_positions > 0).any()
    if n_term and c_term:
        flank_side = 'Both'
    elif n_term:
        flank_side = 'N-term only'
    elif c_term:
        flank_side = 'C-term only'
    else:
        flank_side = 'Unclear'
    return altered_positions, residue_change, flank_side



In [64]:
#load ELM motif classes (column names are read from row index 5 of the file)
elm_classes = pd.read_csv(database_dir+'ELM/elm_classes.tsv', sep = '\t', header = 5)
#compile every ELM regular expression once, rather than re-reading the table for every flanking sequence
elm_patterns = [(elm_id, re.compile(regex)) for elm_id, regex in zip(elm_classes['ELMIdentifier'], elm_classes['Regex'])]


def findELMMatches(flank_seq, patterns):
    """
    Find which ELM motif classes match a flanking sequence

    Parameters
    ----------
    flank_seq: str
        flanking sequence to search
    patterns: list of tuples
        (ELM identifier, compiled regular expression) pairs

    Returns
    -------
    matches: list
        ELM identifiers whose regular expression is found anywhere in flank_seq
    """
    return [elm_id for elm_id, pattern in patterns if pattern.search(flank_seq) is not None]


#option where we consider regulatory region for each flanking sequence size
#restrict to events where inclusion and exclusion flanking sequences do not match and both are available
altered_flanks = data.flanking_sequences[~data.flanking_sequences['Matched']].copy()
altered_flanks = altered_flanks.dropna(subset = ['Inclusion Flanking Sequence', 'Exclusion Flanking Sequence'])

#separate events where either flanking sequence contains a stop codon ('*'); these are not compared further
has_stop_codon = (altered_flanks['Inclusion Flanking Sequence'].str.contains(r'\*')) | (altered_flanks['Exclusion Flanking Sequence'].str.contains(r'\*'))
#explicit copies so that new columns can be added below without modifying (or warning about) the parent dataframe
stop_codon_introduced = altered_flanks[has_stop_codon].copy()
altered_flanks = altered_flanks[~has_stop_codon]
#remove events where either flanking sequence contains an 'X' residue
has_x_residue = (altered_flanks['Inclusion Flanking Sequence'].str.contains('X')) | (altered_flanks['Exclusion Flanking Sequence'].str.contains('X'))
altered_flanks = altered_flanks[~has_x_residue].copy()

#iterate through each event and compare the inclusion/exclusion flanking sequences
similarity = []
altered_positions = []
residue_changes = []
flank_side = []
inclusion_match_list = []
exclusion_match_list = []
for i, row in altered_flanks.iterrows():
    #extract flanking sequences
    inclusion_flank = row['Inclusion Flanking Sequence']
    exclusion_flank = row['Exclusion Flanking Sequence']

    #compare sequence identity (normalized to the exclusion flank aligned with itself)
    similarity.append(getSequenceSimilarity(exclusion_flank, inclusion_flank))

    #find positions in flanking sequences that are different
    results = findAlteredPositions(inclusion_flank, exclusion_flank)
    altered_positions.append(results[0])
    residue_changes.append(results[1])
    flank_side.append(results[2])

    #find ELM motif matches in each flanking sequence
    #(the exclusion search must use the exclusion flank, otherwise the inclusion/exclusion-specific motif columns are always empty)
    inclusion_match_list.append(findELMMatches(inclusion_flank, elm_patterns))
    exclusion_match_list.append(findELMMatches(exclusion_flank, elm_patterns))

#add all data to dataframe, combining entries from same flank event with ';'
#findAlteredPositions returns np.nan when sequences could not be compared, which is kept as np.nan here
altered_flanks['Altered_Positions'] = [';'.join([str(int(x)) for x in pos]) if isinstance(pos, np.ndarray) else np.nan for pos in altered_positions]
altered_flanks['Residue Changes'] = [';'.join([str(x) for x in res]) if isinstance(res, np.ndarray) or isinstance(res, list) else np.nan for res in residue_changes]
altered_flanks["Location of Altered Flank"] = flank_side
altered_flanks['Similarity'] = similarity
altered_flanks['Inclusion Motifs'] = inclusion_match_list
altered_flanks['Exclusion Motifs'] = exclusion_match_list
#motif identifiers are sorted before joining so that the saved file is identical between runs
altered_flanks['Motif in Both'] = altered_flanks.apply(lambda x: ';'.join(sorted(set(x['Inclusion Motifs']).intersection(x['Exclusion Motifs']))), axis = 1)
altered_flanks['Motif only in Inclusion'] = altered_flanks.apply(lambda x: ';'.join(sorted(set(x['Inclusion Motifs']).difference(x['Exclusion Motifs']))), axis = 1)
altered_flanks['Motif only in Exclusion'] = altered_flanks.apply(lambda x: ';'.join(sorted(set(x['Exclusion Motifs']).difference(x['Inclusion Motifs']))), axis = 1)
altered_flanks = altered_flanks.drop(columns = ['Inclusion Motifs', 'Exclusion Motifs'])
altered_flanks['Stop Codon Introduced'] = False

#indicate if stop codon introduced; comparison columns are left empty for these events
stop_codon_introduced['Similarity'] = np.nan
stop_codon_introduced['Altered_Positions'] = np.nan
stop_codon_introduced['Residue Changes'] = np.nan
stop_codon_introduced["Location of Altered Flank"] = np.nan
stop_codon_introduced['Motif in Both'] = np.nan
stop_codon_introduced['Motif only in Inclusion'] = np.nan
stop_codon_introduced['Motif only in Exclusion'] = np.nan
stop_codon_introduced['Stop Codon Introduced'] = True

#combine non-stop codon and stop codon events
altered_flanks = pd.concat([altered_flanks, stop_codon_introduced])

#save
altered_flanks.to_csv(analysis_dir + '/TCGA/changed_flank_sequences_PRAD.csv', index = False)

#### Kinase Library Analysis

To determine how altered flanking sequences might alter kinase-substrate interactions, we used Kinase Library to score inclusion and exclusion flanking sequences (depending on whether adjacent exon is included).

##### Process flanking sequences for use with Kinase Library

To use Kinase Library, we needed to convert the flanking sequences into the format it accepts (the modified residue followed by an asterisk) and write them to text files for upload.

In [None]:
def editSequence(seq):
    """
    Convert flanking sequence to version accepted by kinase library (modified residue denoted by asterisk)

    Every lowercase 't', 's', and 'y' in the sequence is followed by an added '*'. Missing sequences (NaN) are returned as np.nan.
    """
    #NaN is the only value that does not equal itself
    if seq != seq:
        return np.nan
    #mark each lowercase phosphorylatable residue with a trailing asterisk
    for residue in ('t', 's', 'y'):
        seq = seq.replace(residue, residue + '*')
    return seq

In [None]:
#restrict to phosphorylation sites whose inclusion and exclusion flanking sequences do not match
flank_data = data.flanking_sequences[~data.flanking_sequences['Matched']]
flank_data = flank_data[flank_data['Modification Class'].str.contains('Phosphorylation')]

#inclusion flanking sequences: one entry per unique PTM/sequence pair, reformatted for Kinase Library
inclusion_sequences = flank_data[['PTM', 'Inclusion Flanking Sequence']].drop_duplicates()
inclusion_sequences['Inclusion Flanking Sequence'] = inclusion_sequences['Inclusion Flanking Sequence'].apply(editSequence)
#missing sequences come back from editSequence as NaN and are not written out
inclusion_sequences = inclusion_sequences.dropna(subset = 'Inclusion Flanking Sequence')
#write one sequence per line
with open(analysis_dir + '/TCGA/Kinase_Library/inclusion_sequences_input.txt', 'w') as f:
    f.writelines(flank_seq + '\n' for flank_seq in inclusion_sequences['Inclusion Flanking Sequence'])

#exclusion flanking sequences, processed the same way
exclusion_sequences = flank_data[['PTM', 'Exclusion Flanking Sequence']].drop_duplicates()
exclusion_sequences['Exclusion Flanking Sequence'] = exclusion_sequences['Exclusion Flanking Sequence'].apply(editSequence)
exclusion_sequences = exclusion_sequences.dropna(subset = 'Exclusion Flanking Sequence')
#write one sequence per line
with open(analysis_dir + '/TCGA/Kinase_Library/exclusion_sequences_input.txt', 'w') as f:
    f.writelines(flank_seq + '\n' for flank_seq in exclusion_sequences['Exclusion Flanking Sequence'])

##### Process Kinase Library Output

After running flanking sequences through kinase library, we then needed to process the output to determine which kinases were most impacted by the flanking sequence changes (largest changes to kinase percentile)

In [None]:
#restrict to phosphorylation sites whose inclusion and exclusion flanking sequences do not match
flank_data = data.flanking_sequences[~data.flanking_sequences['Matched']]
flank_data = flank_data[flank_data['Modification Class'].str.contains('Phosphorylation')]

#grab inclusion sequences and convert them to upper case with spaces replaced by '_' and a trailing '_'
#(presumably the format of the 'sequence' column in the Kinase Library score files - confirm against the downloaded output)
inclusion_sequences = flank_data[['Region ID','PTM', 'Inclusion Flanking Sequence']].drop_duplicates()
inclusion_sequences = inclusion_sequences.dropna(subset = 'Inclusion Flanking Sequence')
#label each entry by the spliced region and the PTM it contains
inclusion_sequences['Label'] = inclusion_sequences['Region ID'] + ';' + inclusion_sequences['PTM']
inclusion_sequences['Inclusion Flanking Sequence'] = inclusion_sequences['Inclusion Flanking Sequence'].apply(lambda x: x.upper().replace(' ', '_')+'_')

#grab exclusion sequences and apply the same formatting
exclusion_sequences = flank_data[['Region ID', 'PTM', 'Exclusion Flanking Sequence']].drop_duplicates()
exclusion_sequences = exclusion_sequences.dropna(subset = 'Exclusion Flanking Sequence')
exclusion_sequences['Exclusion Flanking Sequence'] = exclusion_sequences['Exclusion Flanking Sequence'].apply(lambda x: x.upper().replace(' ','_')+'_')
exclusion_sequences['Label'] = exclusion_sequences['Region ID'] + ';' + exclusion_sequences['PTM']
exclusion_sequences = exclusion_sequences[['Label', 'Exclusion Flanking Sequence']].drop_duplicates()


#add kinase library scores to sequence info (left merge keeps sequences that have no score)
exclusion_scores = pd.read_csv(analysis_dir + '/TCGA/Kinase_Library/exclusion_scores.tsv', sep = '\t')
exclusion_sequences = exclusion_sequences.merge(exclusion_scores, left_on = 'Exclusion Flanking Sequence', right_on = 'sequence', how = 'left')

inclusion_scores = pd.read_csv(analysis_dir + '/TCGA/Kinase_Library/inclusion_scores.tsv', sep = '\t')
inclusion_sequences = inclusion_sequences.merge(inclusion_scores, left_on = 'Inclusion Flanking Sequence', right_on = 'sequence', how = 'left')


#pivot to label x kinase tables of site percentiles, separately for tyrosine and serine/threonine sites
#(site type is identified by '_Y', '_S', or '_T' appearing in the label)
exclusion_sequences_y = exclusion_sequences[exclusion_sequences['Label'].str.contains('_Y')]
exclusion_percentiles_y = exclusion_sequences_y.pivot_table(index = 'Label', columns = 'kinase', values = 'site_percentile')
exclusion_sequences_st = exclusion_sequences[(exclusion_sequences['Label'].str.contains('_S')) | (exclusion_sequences['Label'].str.contains('_T'))]
exclusion_percentiles_st = exclusion_sequences_st.pivot_table(index = 'Label', columns = 'kinase', values = 'site_percentile')

inclusion_sequences_y = inclusion_sequences[inclusion_sequences['Label'].str.contains('_Y')]
inclusion_percentiles_y = inclusion_sequences_y.pivot_table(index = 'Label', columns = 'kinase', values = 'site_percentile')
inclusion_sequences_st = inclusion_sequences[(inclusion_sequences['Label'].str.contains('_S')) | (inclusion_sequences['Label'].str.contains('_T'))]
inclusion_percentiles_st = inclusion_sequences_st.pivot_table(index = 'Label', columns = 'kinase', values = 'site_percentile')

#calculate the difference in percentiles (inclusion - exclusion) for labels scored in both tables
#labels are sorted so that the row order of the saved files is the same on every run
labels = sorted(set(inclusion_percentiles_y.index).intersection(exclusion_percentiles_y.index))
#subtraction aligns on both label (rows) and kinase (columns)
percentiles_diff_y = inclusion_percentiles_y.loc[labels, exclusion_percentiles_y.columns] - exclusion_percentiles_y.loc[labels]

labels = sorted(set(inclusion_percentiles_st.index).intersection(exclusion_percentiles_st.index))
percentiles_diff_st = inclusion_percentiles_st.loc[labels, exclusion_percentiles_st.columns] - exclusion_percentiles_st.loc[labels]

#save results
percentiles_diff_y.to_csv(analysis_dir + '/TCGA/Kinase_Library/percentile_differences_Y.tsv')
percentiles_diff_st.to_csv(analysis_dir + '/TCGA/Kinase_Library/percentile_differences_ST.tsv')