# Generating the null model for comparison to modification data

In this file, we will generate a null model consisting of random residues, rather than modifiable ones, meant to represent the impact splicing would have on residues if they were randomly selected from the proteome. To do so, we will first perform the mapping and projection process for all residues in the proteome, and then randomly select a subset of these residues to define as "modifiable". With this null model, we can then calculate constitutive rates and altered flank rates for each residue, and compare these to the rates observed in the modification data.

## Table of Contents

1. [Load Data](#load-data)
2. [Mapping and Projection of Residues to the Genome](#construct-null-residue-dataframes)
3. [Generate null model distributions of constitutive and altered flank rates](#generate-null-model-distributions)
    1. [Constitutive rates](#generate-null-distribution-for-constitutive-rates)
    2. [Altered flank rates](#get-null-model-distributions-for-altered-flank-rates)



## Load Data

In [1]:
from ExonPTMapper import mapping, config
import os
import pandas as pd
import tqdm 
import numpy as np

# root directory of the figshare download; every data path below is built from it
figshare_dir = './'

# load the mapping data
mapper = mapping.PTM_mapper()
# discard the PTM tables stored on the mapper object
for ptm_attribute in ('ptm_info', 'ptm_coordinates', 'isoform_ptms', 'alternative_ptms'):
    setattr(mapper, ptm_attribute, None)



Downloading Canonical UniProt isoforms
Downloading ID translator file
Loading mapper object from .tsv files
Loading exon-specific data
Loading transcript-specific data
Loading gene-specific info
Loading unique protein isoforms
Loading protein-specific info
Loading information on PTMs on canonical proteins
Loading genomic coordinates of PTMs associated with canonical proteins


  self.ptm_coordinates = pd.read_csv(config.processed_data_dir + 'ptm_coordinates.csv',index_col = 0,


Loading information on PTMs on alternative proteins


  self.alternative_ptms = pd.read_csv(config.processed_data_dir + 'alternative_ptms.csv', dtype = {'Exon ID (Alternative)':str, 'Chromosome/scaffold name': str, 'Ragged':str, 'Genomic Coordinates':str, 'Second Exon': str, 'Alternative Residue': str, 'Protein':str})


## Construct null residue dataframes

In [None]:
import multiprocessing
def find_residues_one_protein(mapper, uniprot_id):
    """
    Build a dataframe with one row per residue of a protein, using the same layout as the PTM info dataframe.

    Parameters
    ----------
    mapper: object
        Object exposing `proteins`, `transcripts` and `genes` dataframes (indexed by UniProt ID, transcript ID
        and gene ID respectively)
    uniprot_id: str
        UniProt ID of the protein of interest (a row label of `mapper.proteins`)

    Returns
    -------
    residue_df: pandas dataframe
        One row per residue of the amino acid sequence of the first matched transcript. The index is the
        UniProt ID + residue + 1-based position (e.g. 'P12345S10'); 'Modification' and 'Modification Class'
        are filled with the placeholder 'Null'.
    """
    protein_row = mapper.proteins.loc[uniprot_id]
    gene_id = protein_row['Gene stable ID']
    transcript_id = protein_row['Associated Matched Transcripts']
    iso_type = protein_row['UniProt Isoform Type']

    # several transcripts/genes can be listed (';'-separated); only the first of each is looked up
    first_transcript = transcript_id.split(';')[0]
    seq = mapper.transcripts.loc[first_transcript]['Amino Acid Sequence']
    gene_name = mapper.genes.loc[gene_id.split(';')[0], 'Gene name']

    # positions are 1-based, matching protein numbering
    numbered_residues = list(enumerate(seq, start = 1))
    residue_df = pd.DataFrame({
        'Gene name': gene_name,
        'Genes': gene_id,
        'Transcripts': transcript_id,
        'Protein': uniprot_id,
        'Isoform Type': iso_type,
        'Residue': [aa for _, aa in numbered_residues],
        'PTM Location (AA)': [position for position, _ in numbered_residues],
        'Modification': 'Null',
        'Modification Class': 'Null',
    })
    residue_df.index = [uniprot_id + aa + str(position) for position, aa in numbered_residues]

    return residue_df
    
def find_residues_list(mapper, uniprot_ids):
    """
    Build the combined residue dataframe for a list of proteins.

    Parameters
    ----------
    mapper: object
        Mapper object passed through to find_residues_one_protein
    uniprot_ids: iterable of str
        UniProt IDs to process

    Returns
    -------
    residue_df: pandas dataframe
        Concatenation of the per-protein residue dataframes, with columns that are entirely NaN removed.
        An empty dataframe is returned when no protein produced any residue (including an empty id list).
    num_ptms: dict
        Maps each protein whose residue dataframe was empty to 0
    """
    num_ptms = {}
    df_list = []
    for prot in uniprot_ids:
        info = find_residues_one_protein(mapper, prot)
        if info.empty:
            num_ptms[prot] = 0
        else:
            df_list.append(info)

    # pd.concat raises on an empty list, which happens when a worker receives an empty chunk
    if not df_list:
        return pd.DataFrame(), num_ptms

    residue_df = pd.concat(df_list).dropna(axis = 1, how = 'all')
    residue_df['PTM Location (AA)'] = residue_df['PTM Location (AA)'].astype(int)
    return residue_df, num_ptms
        


def find_residues(mapper, fname = 'residue_info.csv', PROCESSES = 4):
    """
    Construct the residue dataframe for every protein with a matched transcript, save it, and store it on the mapper.

    Parameters
    ----------
    mapper: object
        Mapper object exposing a `proteins` dataframe; its `ptm_info` attribute is overwritten
    fname: str
        Path of the csv file the residue dataframe is written to
    PROCESSES: int
        Requested number of worker processes. The value is capped at (number of cores - 1) and at the
        number of proteins; when the effective value is 1 or less the work runs in the current process.

    Returns
    -------
    mapper: object
        The same mapper, with `ptm_info` set to a copy of the residue dataframe
    """
    #remove proteins without matched transcripts
    trim_proteins = mapper.proteins.dropna(subset = ['Associated Matched Transcripts']).copy()
    protein_ids = trim_proteins.index.values
    print('Constructing initial residue info dataframe')

    if PROCESSES > 1:
        #leave one core free (to avoid freezing machine), never request more workers than proteins,
        #and never drop below one worker (cpu_count() - 1 is 0 on a single-core machine)
        PROCESSES = max(1, min(PROCESSES, multiprocessing.cpu_count() - 1, len(protein_ids)))

    if PROCESSES <= 1:
        residue_info, _ = find_residues_list(mapper, protein_ids)
    else:
        protein_data_split = np.array_split(protein_ids, PROCESSES)
        #the context manager shuts the pool down once the blocking starmap call has returned
        with multiprocessing.Pool(PROCESSES) as pool:
            results = pool.starmap(find_residues_list, [(mapper, chunk) for chunk in protein_data_split])

        #each result is a (residue dataframe, empty-protein dict) pair; only the dataframes are kept
        residue_info = pd.concat([res[0] for res in results])

    residue_info.to_csv(fname)
    mapper.ptm_info = residue_info.copy()
    return mapper

def mapResidues(mapper, odir = './', PROCESSES = 4):
    """
    Run the mapper's genomic mapping step and save the two resulting tables.

    Parameters
    ----------
    mapper: object
        Mapper object providing mapPTMs_all() and the `ptm_info` / `ptm_coordinates` tables
    odir: str
        Output directory prefix; file names are appended directly, so it should end with a path separator
    PROCESSES: int
        Not used by this function

    Returns
    -------
    mapper: object
        The same mapper object
    """
    print('mapping residues to genomic location')
    mapper.mapPTMs_all()
    saved_tables = (('ptm_info', 'residue_info.csv'), ('ptm_coordinates', 'residue_coordinates.csv'))
    for attribute, file_name in saved_tables:
        getattr(mapper, attribute).to_csv(odir + file_name)
    return mapper

def annotateResidues(mapper, odir = './'):
    """
    Add any missing annotation columns to the mapper's residue table and save it.

    Each annotation is only computed when its column is not already present in `mapper.ptm_info`.

    Parameters
    ----------
    mapper: object
        Mapper object providing `ptm_info` and the annotation methods called below
    odir: str
        Output directory prefix; 'residue_info.csv' is appended directly to it

    Returns
    -------
    mapper: object
        The same mapper object
    """
    print('annotating residues')
    # (column that marks the step as done, progress message, step to run)
    annotation_steps = (
        ('Tryptic Fragment', 'Getting tryptic fragments associated with canonical ptms',
         lambda: mapper.getAllTrypticFragments()),
        ('Flanking Sequence', 'Getting flanking sequences associated with canonical ptms',
         lambda: mapper.getAllFlankingSeqs(flank_size = 10)),
        ('inDomain', 'Identifying PTMs that are in protein domains',
         lambda: mapper.findAllinDomains()),
    )
    for marker_column, message, run_step in annotation_steps:
        if marker_column not in mapper.ptm_info.columns:
            print(message)
            run_step()

    mapper.ptm_info.to_csv(odir + 'residue_info.csv')
    return mapper

def projectResidues(mapper, odir = './', PROCESSES = 4):
    """
    Project residues onto isoform exons and save the resulting alternative table.

    Parameters
    ----------
    mapper: object
        Mapper object providing projectPTMs_toIsoformExons() and `alternative_ptms`
    odir: str
        Output directory prefix; 'alternative_ptms.csv' is appended directly to it
    PROCESSES: int
        Forwarded to projectPTMs_toIsoformExons

    Returns
    -------
    mapper: object
        The same mapper object
    """
    print('project residues')
    projection_options = {'alternative_only': False, 'PROCESSES': PROCESSES}
    mapper.projectPTMs_toIsoformExons(**projection_options)

    output_path = odir + 'alternative_ptms.csv'
    mapper.alternative_ptms.to_csv(output_path, index = False)
    return mapper

In [None]:
# Run the null-model pipeline in order: build the residue table, map it to the genome, annotate it,
# then project it onto isoform exons. Each step writes its csv output and returns the updated mapper.
# NOTE(review): these calls write under '/Analysis_For_Paper/NullModel/', while the cells further down
# read and write under '/Analysis_For_Paper/Null_Model/' -- confirm which directory name is intended.
mapper = find_residues(mapper, fname = figshare_dir + '/Analysis_For_Paper/NullModel/residue_info.csv', PROCESSES = 4)
mapper = mapResidues(mapper, odir = figshare_dir + '/Analysis_For_Paper/NullModel/')
mapper = annotateResidues(mapper,odir = figshare_dir + '/Analysis_For_Paper/NullModel/')
mapper = projectResidues(mapper, odir = figshare_dir + '/Analysis_For_Paper/NullModel/', PROCESSES = 4)

## Generate null model distributions

To generate the null model distributions, we follow the same procedure as the true modification data, performing the following steps for each modification class:
1. Sample a random subset of residues, matching the same number of residues for that modification.
2. Extract the mapping and projection data for those residues, treating them as "modified".
3. Calculate the constitutive rate and altered flank rate as done with true modifications.
4. Save the calculated rates to a list. Repeat a total of 100 times.
5. Save rates to a text file.


### Load Data

In [None]:
def getIsoformSpecificPTMs(alternative_ptms, isoforms, required_length = 20):
    """
    Collapse transcript-level projection results to one row per (protein isoform, source PTM) pair. This avoids
    counting the same protein isoform several times when multiple transcripts code for it.

    Parameters
    ----------
    alternative_ptms : dataframe
        Projection results with at least the columns 'Alternative Transcript' and 'Source of PTM'
        ('Source of PTM' may hold several ';'-separated entries)
    isoforms : dataframe
        Isoform table with the columns 'Isoform ID', 'Isoform Type', 'Transcript stable ID'
        (';'-separated transcript IDs) and 'Isoform Length'
    required_length : int, optional
        Minimum length of protein isoform to be considered. The default is 20 amino acids.

    Returns
    -------
    isoform_ptms : dataframe
        Rows of `alternative_ptms` annotated with 'Isoform ID' and 'Isoform Length', de-duplicated per isoform
        and source PTM, without the 'Alternative Transcript' column. Rows whose transcript matches no
        alternative isoform, or whose isoform is shorter than `required_length`, are removed.
    """
    # one row per (alternative isoform, transcript) pair
    is_alternative = isoforms['Isoform Type'] == 'Alternative'
    transcript_lookup = isoforms.loc[is_alternative, ['Isoform ID', 'Transcript stable ID', 'Isoform Length']].copy()
    transcript_lookup['Transcript stable ID'] = transcript_lookup['Transcript stable ID'].apply(lambda ids: ids.split(';'))
    transcript_lookup = transcript_lookup.explode('Transcript stable ID')
    transcript_lookup = transcript_lookup.rename(columns = {'Transcript stable ID': 'Alternative Transcript'})

    # attach the isoform each alternative transcript codes for
    annotated = alternative_ptms.merge(transcript_lookup, on = 'Alternative Transcript', how = 'left')

    # one row per source PTM (an entry can list several)
    annotated['Source of PTM'] = annotated['Source of PTM'].apply(lambda sources: sources.split(';'))
    annotated = annotated.explode('Source of PTM')

    # keep a single row for each isoform / source PTM pair; the transcript is no longer meaningful afterwards
    unique_rows = annotated.drop_duplicates(subset = ['Isoform ID', 'Source of PTM'])
    unique_rows = unique_rows.drop(columns = 'Alternative Transcript')

    # drop isoforms shorter than the required length (rows without a matched isoform fail this test too)
    long_enough = unique_rows['Isoform Length'] >= required_length
    return unique_rows[long_enough]

In [None]:
#dictionary for converting between single letter and full names of amino acids
residue_dict = {'A': 'Alanine', 'C': 'Cysteine', 'D':'AsparticAcid','E':'GlutamicAcid',
                'F':'Phenylalanine','G':'Glycine','H':'Histidine','I':'Isoleucine', 
                'K':'Lysine', 'L':'Leucine','M':'Methionine', 'N':'Asparagine',
                'P':'Proline','Q':'Glutamine','R':'Arginine','S':'Serine', 'T':'Threonine',
                'V':'Valine','W':'Tryptophan','Y':'Tyrosine'}

#load real ptm data
mapper = mapping.PTM_mapper()
#get isoform specific ptms (one row per protein isoform rather than per transcript)
mapper.isoform_ptms = pd.read_csv(figshare_dir + '/PTM_Projection_Data/processed_data_dir/isoform_ptms.csv', index_col = 0)

#copy ptm info
ptm_info = mapper.ptm_info.copy()

#get file for converting modification types to their grouped classes (phosphotyrosine to phosphorylation, etc.)
#NOTE(review): mod_groups is not used by any code visible in this file, while the cells below filter
#exploded_mods on a 'Mod Class' column -- presumably a merge with mod_groups is expected here; confirm.
mod_groups = pd.read_csv(figshare_dir + '/External_Data/modification_conversion.csv', header = 0)


#separate based on modifications, such that each row is a unique residue/modification type pair
exploded_mods = ptm_info.copy()
exploded_mods["Modification Class"] = exploded_mods['Modification Class'].apply(lambda x: x.split(';'))
exploded_mods = exploded_mods.explode('Modification Class').reset_index()
exploded_mods = exploded_mods.rename({'index':'PTM'}, axis = 1)

#load residue information
#the residue labels were written as the csv index, so they must be read back as the index: without
#index_col = 0 the 'PTM' column below would hold row numbers, which never match 'Source of PTM' labels
residue_info = pd.read_csv(figshare_dir + '/Analysis_For_Paper/Null_Model/residue_info.csv', index_col = 0)
#explicit copy so that adding the 'PTM' column does not write into a filtered view
residue_info = residue_info[residue_info['Isoform Type'] == 'Canonical'].copy()
residue_info['PTM'] = residue_info.index

### Generate Null Distribution for Constitutive Rates

In [None]:
mods_to_test = ['PHOS','UBIQ', 'ACET', 'GLCN', 'METH','SUMO', 'DIMETH', 'NTRY', 'HYDR', 'TRIMETH','SULF', 'PALM', 'CITR', 'GGLU']
#mods_to_test = ['HYDR', 'TRIMETH','SULF', 'PALM', 'GGLU']
#mods_to_test = ['CITR']
#NOTE(review): exploded_mods is built by exploding 'Modification Class'; confirm it also carries a
#'Mod Class' column holding the abbreviations listed in mods_to_test.
mod_column = 'Mod Class'

#columns of the mapped residue files that must be read as strings
mapped_dtypes = {'Exon ID (Alternative)':str, 'Chromosome/scaffold name': str, 'Ragged':str,
                 'Genomic Coordinates':str, 'Exon ID (Canonical)': str, 'Alternative Residue': str,
                 'Protein':str, 'Canonical Protein Position (AA)': str,
                 'Alternative Protein Position (AA)':str,'Mapping Result':str, 'Second Exon': str}

#getIsoformSpecificPTMs needs the isoform table with 'Isoform ID' as a regular column
#NOTE(review): assumes mapper.isoforms also carries 'Isoform Type', 'Transcript stable ID' and 'Isoform Length'
isoform_table = mapper.isoforms if 'Isoform ID' in mapper.isoforms.columns else mapper.isoforms.reset_index()

for mod in mods_to_test:
    print(f'Getting null {mod} data')
    tmp_mods = exploded_mods[exploded_mods[mod_column] == mod].copy()
    sizes = tmp_mods.groupby('Residue').size()

    #load toy mapped residues for every residue type carrying this modification
    mapped_residues = {}
    for res in tmp_mods['Residue'].unique():
        try:
            mapped = pd.read_csv(f'../MockModifications/mapped/{residue_dict[res]}.csv', dtype = mapped_dtypes)
        except FileNotFoundError:
            #only a missing file is skipped; any other failure should surface instead of being hidden
            print(f'{residue_dict[res]} not found. {sizes[res]} with modification.')
            continue
        mapped_residues[res] = getIsoformSpecificPTMs(mapped, isoform_table)

    #number of residues to sample for each residue type that has mapped data
    num_mod = sizes[list(mapped_residues.keys())]
    if num_mod.empty:
        print(f'No mapped residues available for {mod}, skipping.')
        continue

    np.random.seed(100)
    rate_list = []
    for _ in tqdm.tqdm(range(100)):
        #for each residue, randomly sample the number that matches that found within modification population
        sampled_frames = []
        projected_frames = []
        for residue in num_mod.index:
            residue_list = residue_info.loc[residue_info['Residue'] == residue, "PTM"].unique()
            sample = np.random.choice(residue_list, size = num_mod[residue], replace = False)
            sampled_frames.append(residue_info.loc[sample])
            mapped = mapped_residues[residue]
            projected_frames.append(mapped[mapped['Source of PTM'].isin(sample)])
        conservation_data = pd.concat(sampled_frames)
        sampled_results = pd.concat(projected_frames)

        #count, per sampled residue, the alternative isoforms in which it is conserved (mapping succeeded)
        #and those in which it is lost (any other mapping result)
        is_conserved = sampled_results['Mapping Result'] == 'Success'
        num_conserved_transcripts = sampled_results[is_conserved].groupby('Source of PTM').size()
        num_lost_transcripts = sampled_results[~is_conserved].groupby('Source of PTM').size()

        #residues without any projection rows get a count of 0 for both
        num_conserved = num_conserved_transcripts.reindex(conservation_data.index, fill_value = 0).to_numpy()
        num_lost = num_lost_transcripts.reindex(conservation_data.index, fill_value = 0).to_numpy()

        #score is 1 when the residue is never lost (including when it has no alternative isoforms at all),
        #otherwise the fraction of isoforms in which it is conserved
        num_total = np.maximum(num_conserved + num_lost, 1)
        conservation_data['Score'] = np.where(num_lost == 0, 1.0, num_conserved / num_total)
        conservation_data['Constitutive'] = (conservation_data['Score'] == 1)*1
        rate = conservation_data[conservation_data['Constitutive'] == 1].shape[0]/conservation_data.shape[0]
        rate_list.append(rate)

    #write one rate per line; the with-block closes the file
    with open(figshare_dir + f'/Analysis_For_Paper/Null_Model/Null_Constitutive_Rates/{mod}.txt', 'w+') as f:
        f.writelines(f'{rate}\n' for rate in rate_list)

    print("File written successfully")

### Get Null Model Distributions for Altered Flank Rates

In [None]:
import re

def _flank_window(protein_sequence, pos, flank_size):
    """
    Return the flanking sequence around 1-based position `pos`, with the central residue in lower case.

    A window that runs past the N-terminus is padded with leading spaces; one that runs past the
    C-terminus (and is not also short on the N-terminal side) is padded with trailing spaces.
    NaN is returned when `pos` does not fall inside the sequence.
    """
    seq_length = len(protein_sequence)
    if pos < 1 or pos > seq_length:
        return np.nan

    center = protein_sequence[pos-1].lower()
    if pos <= flank_size:
        #not enough N-terminal residues: add spaces to cushion sequence
        spaces = ' ' * (flank_size - pos + 1)
        return spaces + protein_sequence[0:pos-1] + center + protein_sequence[pos:pos+flank_size]
    if seq_length - pos <= flank_size:
        #not enough C-terminal residues: add spaces to cushion sequence
        spaces = ' ' * (flank_size - (seq_length - pos))
        return protein_sequence[pos-flank_size-1:pos-1] + center + protein_sequence[pos:] + spaces
    #full flanking sequence available
    return protein_sequence[pos-flank_size-1:pos-1] + center + protein_sequence[pos:pos+flank_size]


def getFlankingSequences(isoforms, isoform_ptms, flank_size = 5):
    """
    Get the flanking sequence of each projected residue within its alternative isoform.

    Parameters
    ----------
    isoforms: dict-like
        Maps isoform ID to amino acid sequence (accessed with .get)
    isoform_ptms: dataframe
        Needs the columns 'Isoform ID' and 'Alternative Protein Position (AA)' (1-based position, may be
        stored as a string such as '12.0')
    flank_size: int
        Number of residues to include on each side of the residue

    Returns
    -------
    flank_seq: list
        One entry per row: the flanking sequence, or NaN when the isoform has no sequence, the position is
        missing, or the position lies outside the sequence
    """
    flank_seq = []
    for iso_id, pos in zip(isoform_ptms['Isoform ID'], isoform_ptms['Alternative Protein Position (AA)']):
        protein_sequence = isoforms.get(iso_id)
        #.get returns None for unknown isoforms, which a NaN self-comparison would not catch
        if not isinstance(protein_sequence, str) or pd.isna(pos):
            flank_seq.append(np.nan)
        else:
            flank_seq.append(_flank_window(protein_sequence, int(float(pos)), flank_size))
    return flank_seq


def getFlankingSequences_can(transcripts, residue_info, flank_size = 5):
    """
    Get the flanking sequence of each residue within its canonical protein sequence.

    Parameters
    ----------
    transcripts: dataframe
        Indexed by transcript ID, with an 'Amino Acid Sequence' column
    residue_info: dataframe
        Needs the columns 'Transcripts' (';'-separated transcript IDs) and 'PTM Location (AA)' (1-based)
    flank_size: int
        Number of residues to include on each side of the residue

    Returns
    -------
    flank_seq: list
        One entry per row: the flanking sequence, or NaN when no single transcript sequence can be found,
        the position is missing, or the position lies outside the sequence
    """
    flank_seq = []
    for trans_id, pos in zip(residue_info['Transcripts'], residue_info['PTM Location (AA)']):
        if not isinstance(trans_id, str):
            flank_seq.append(np.nan)
            continue

        #if multiple transcripts associated with protein, only use first transcript (should be same seq)
        trans_id = trans_id.split(';')[0]
        if trans_id not in transcripts.index:
            flank_seq.append(np.nan)
            continue

        transcript_row = transcripts.loc[trans_id]
        #a duplicated transcript ID yields a dataframe rather than a single row; treat it as unresolved
        if not isinstance(transcript_row, pd.Series):
            flank_seq.append(np.nan)
            continue

        protein_sequence = transcript_row['Amino Acid Sequence']
        if not isinstance(protein_sequence, str) or pd.isna(pos):
            flank_seq.append(np.nan)
        else:
            flank_seq.append(_flank_window(protein_sequence, int(float(pos)), flank_size))
    return flank_seq

def matchedFlankSeq(seq1, seq2):
    """Return 1 when the two flanking sequences are equal and 0 otherwise."""
    is_match = seq1 == seq2
    return 1 * is_match

def compareAllFlankSeqs(residue_info, res, flank_size = 5, unique_isoforms = True):
    """
    Given the alternative ptms and canonical ptm data, compare flanking sequences and determine if they are identical. 

    Exactly one value is produced per row of `res`. When a row lists several ';'-separated source PTMs,
    the first one whose canonical flanking sequence can be found is used.
    
    Parameters
    ----------
    residue_info: dataframe
        Canonical residue data indexed by residue label, with the columns 'Flanking Sequence' and 'Transcripts'
    res: dataframe
        Projected residues with the columns 'Flanking Sequence', 'Source of PTM' and 'Isoform ID'
    flank_size:
        size of the flanking sequence to compare. Not used by this function: the sequences stored in the
        two dataframes are compared as they are.
    unique_isoforms:
        Not used by this function

    Returns
    -------
    conserved_flank: list
        Per row: 1 if the flanking sequences match, 0 if they differ, NaN if the alternative flanking
        sequence is missing or no canonical flanking sequence could be found
    """
    def _canonical_flank(ptm, iso_id):
        #Look up the canonical flanking sequence of one source PTM; None when it cannot be resolved
        if ptm not in residue_info.index:
            return None
        tmp_info = residue_info.loc[ptm]
        if not isinstance(tmp_info, pd.DataFrame):
            return tmp_info['Flanking Sequence']
        #several rows share this label: keep the one belonging to the isoform's first transcript
        #NOTE(review): relies on a module-level `isoforms` dataframe indexed by isoform ID with a
        #'Transcript stable ID' column; another cell rebinds `isoforms` to a dict -- confirm.
        try:
            trans_id = isoforms.loc[iso_id, 'Transcript stable ID'].split(';')
        except KeyError:
            return None
        tmp_info = tmp_info[tmp_info['Transcripts'] == trans_id[0]]
        if tmp_info.shape[0] == 0:
            return None
        return tmp_info['Flanking Sequence'].values[0]

    conserved_flank = []
    for i in res.index:
        #check if alt flanking seq exists
        alt_flank = res.loc[i, 'Flanking Sequence']
        if alt_flank != alt_flank:
            conserved_flank.append(np.nan)
            continue

        can_flank = None
        for ptm in res.loc[i, 'Source of PTM'].split(';'):
            can_flank = _canonical_flank(ptm, res.loc[i, 'Isoform ID'])
            if can_flank is not None:
                break

        if can_flank is None:
            conserved_flank.append(np.nan)
        else:
            conserved_flank.append(matchedFlankSeq(can_flank, alt_flank))
    return conserved_flank

def compareAllFlankSeqs2(residue_info, res, flank_size = 5, unique_isoforms = True):
    """
    Given the alternative ptms and canonical ptm data, compare flanking sequences and determine if they are identical. 

    Only the first ';'-separated entry of 'Source of PTM' is used to look up the canonical flanking sequence.
    
    Parameters
    ----------
    residue_info: dataframe
        Canonical residue data indexed by residue label, with a 'Flanking Sequence' column
    res: dataframe
        Projected residues with the columns 'Flanking Sequence' and 'Source of PTM'
    flank_size:
        size of the flanking sequence to compare. Not used by this function: the sequences stored in the
        two dataframes are compared as they are.
    unique_isoforms:
        Not used by this function

    Returns
    -------
    conserved_flank: list
        Per row: 1 if the flanking sequences match, 0 if they differ, NaN if the alternative flanking
        sequence is missing or the source PTM is not found in `residue_info`
    """
    conserved_flank = []
    for i in res.index:
        #check if alt flanking seq exists
        alt_flank = res.loc[i, 'Flanking Sequence']
        if alt_flank != alt_flank:
            conserved_flank.append(np.nan)
            continue

        ptm = res.loc[i, 'Source of PTM'].split(';')[0]
        try:
            can_flank = residue_info.loc[ptm, 'Flanking Sequence']
        except KeyError as e:
            #only a failed lookup is reported and skipped; errors in the comparison itself are not hidden
            print(e)
            conserved_flank.append(np.nan)
            continue
        conserved_flank.append(matchedFlankSeq(can_flank, alt_flank))

    return conserved_flank

In [None]:
residue_dict = {'A': 'Alanine', 'C': 'Cysteine', 'D':'AsparticAcid','E':'GlutamicAcid','F':'Phenylalanine','G':'Glycine','H':'Histidine','I':'Isoleucine','L':'Leucine',  'K':'Lysine','M':'Methionine', 'N':'Asparagine','P':'Proline','Q':'Glutamine','R':'Arginine','S':'Serine', 'T':'Threonine','V':'Valine','W':'Tryptophan','Y':'Tyrosine'}

transcripts = pd.read_csv(config.processed_data_dir + 'transcripts.csv', index_col = 0)
residue_info['Flanking Sequence'] = getFlankingSequences_can(transcripts, residue_info, flank_size = 5)

#index the isoform table by isoform ID; guarded so that re-running the cell does not fail
if 'Isoform ID' in mapper.isoforms.columns:
    mapper.isoforms  = mapper.isoforms.set_index('Isoform ID')
#isoform ID -> amino acid sequence, used to extract flanking sequences
isoforms = mapper.isoforms['Amino Acid Sequence'].to_dict()
#getIsoformSpecificPTMs needs the isoform table itself (with 'Isoform ID' as a column), not the sequence dict
isoform_table = mapper.isoforms.reset_index()

#columns of the mapped residue files that must be read as strings
mapped_dtypes = {'Exon ID (Alternative)':str, 'Chromosome/scaffold name': str, 'Ragged':str,
                 'Genomic Coordinates':str, 'Exon ID (Canonical)': str, 'Alternative Residue': str,
                 'Protein':str, 'Canonical Protein Position (AA)': str,
                 'Alternative Protein Position (AA)':str,'Mapping Result':str, 'Second Exon':str}

mods_to_test =  ['Phosphorylation','Ubiquitination', 'Acetylation', 'Glycosylation', 'Methylation','Sumoylation', 'Dimethylation', 'Nitrosylation', 'Hydroxylation', 'Trimethylation','Sulfation', 'Palmitoylation', 'Carboxylation', 'Crotonylation', 'Succinylation']
#NOTE(review): exploded_mods is built by exploding 'Modification Class'; confirm it also carries a
#'Mod Class' column holding the names listed in mods_to_test.
mod_column = 'Mod Class'
for mod in mods_to_test:
    print(f'Getting null {mod} data')
    tmp_mods = exploded_mods[exploded_mods[mod_column] == mod].copy()
    sizes = tmp_mods.groupby('Residue').size()

    #load toy mapped residues for every residue type carrying this modification
    mapped_residues = {}
    for res in tmp_mods['Residue'].unique():
        try:
            mapped = pd.read_csv(f'../MockModifications/mapped/{residue_dict[res]}.csv', dtype = mapped_dtypes)
        except FileNotFoundError:
            #only a missing file is skipped; any other failure should surface instead of being hidden
            print(f'{residue_dict[res]} not found. {sizes[res]} with modification.')
            continue
        mapped_residues[res] = getIsoformSpecificPTMs(mapped, isoform_table)

    #number of residues to sample for each residue type that has mapped data
    num_mod = sizes[list(mapped_residues.keys())]
    if num_mod.empty:
        print(f'No mapped residues available for {mod}, skipping.')
        continue

    np.random.seed(100)
    flank_conservation_list = []
    for _ in tqdm.tqdm(range(100)):
        #for each residue, randomly sample the number that matches that found within modification population
        projected_frames = []
        for residue in num_mod.index:
            residue_list = np.unique(residue_info[residue_info['Residue'] == residue].index.values)
            sample = np.random.choice(residue_list, size = num_mod[residue], replace = False)
            mapped = mapped_residues[residue]
            projected_frames.append(mapped[mapped['Source of PTM'].isin(sample)])
        sampled_results = pd.concat(projected_frames)

        #get flanking sequences of the residues that were successfully projected
        sampled_results = sampled_results[sampled_results['Mapping Result'] == 'Success']
        sampled_results = sampled_results.reset_index()
        sampled_results['Flanking Sequence'] = getFlankingSequences(isoforms, sampled_results)
        sampled_results = sampled_results.dropna(subset = ['Flanking Sequence'])
        conserved_flanks = [f for f in compareAllFlankSeqs2(residue_info, sampled_results) if f == f]

        #calculate fraction of residues with an altered flank; NaN when nothing could be compared
        if conserved_flanks:
            flank_conservation_list.append(1- sum(conserved_flanks)/len(conserved_flanks))
        else:
            flank_conservation_list.append(np.nan)

    #write one rate per line; the with-block closes the file
    with open(figshare_dir + f'/Analysis_For_Paper/Null_Model/Null_Flank_Rates/{mod}.txt', 'w+') as f:
        f.writelines(f'{rate}\n' for rate in flank_conservation_list)

    print("File written successfully")