# Introduction

### Get ProteomeScout Phosphorylation Data
- download the phosphorylation zip file from ProteomeScout (https://proteomescout.wustl.edu/compendia/proteomescout_phosphorylation.zip)
- unzip the file 

### Cross Reference with ProteomeScout Phosphorylation Data
-  For each resource file, cross-reference with the ProteomeScout phosphorylation data
    - this step requires the ProteomeScout API
    - the final formatted file contains only data with confirmed phosphorylation sites
-  Create a binary substrate/kinase map for each of the filtered resource files


# Cross Referencing with ProteomeScout Phosphorylation Data
### Initializing

In [8]:
# IMPORTS

from io import BytesIO
from zipfile import ZipFile
from urllib.request import urlopen
import pandas as pd
import os
import time

import sys
sys.path.append('../../../ProteomeScoutAPI/')
import proteomeScoutAPI
import proteome_scout_phosphorylation as psp
import createSubKinMatrix

In [9]:
# version date of the formatted prediction files; it is embedded in the input and output file names
version = '2020-02-26'
# ProteomeScout version date; it is embedded in the raw-data directory name
pscout_version = '2020-02-07'

# root directory that every data path below is built from
# local (../../), sammas (/Volumes/naegle_lab/Kinase Predictions/)
base = '../../'


In [10]:
# FILE NAME DEFINITIONS

# global kinase map (read for its 'Kinase Name' and 'Type' columns)
KinaseMap = f'{base}Data/Map/globalKinaseMap.csv'

# download address of the ProteomeScout phosphorylation zip archive
data_add = 'https://proteomescout.wustl.edu/compendia/proteomescout_phosphorylation.zip'
# names of the archive members that get extracted
data_file = 'data.tsv'
citation_file = 'citations.tsv'


# destinations of the extracted ProteomeScout tables (all species)
pscout_data = f'{base}Data/Raw/ProteomeScout{pscout_version}/{data_file}'
pscout_citation = f'{base}Data/Raw/ProteomeScout{pscout_version}/{citation_file}'

# preprocessed prediction files, one per predictor
PP_file = f'{base}Data/Formatted/PhosphoPICK/PhosphoPICK_formatted_{version}.csv'  # PhosphoPICK
GPS_file = f'{base}Data/Formatted/GPS/GPS_formatted_{version}.csv'                 # GPS
NW_file = f'{base}Data/Formatted/NetworKIN/NetworKIN_formatted_{version}.csv'      # NetworKIN

# output directory and the per-predictor stems appended to it
out_dir = f'{base}Data/Final/'
pp = f'PhosphoPICK/PhosphoPICK_{version}'
nw = f'NetworKIN/NetworKIN_{version}'
gps = f'GPS/GPS_{version}'


In [3]:
# DEFINE FUNCTION 'get_phosphosites'

def get_phosphosites(pscout, df, filter_setting, substrate_column = 'substrate_id'):
    """
    Collects, for every substrate found in the predictions, the phosphosites
    reported for it by ProteomeScout

    Parameters
    ----------
    pscout: reference ProteomeScout file
    df: dataframe of the predictions
    filter_setting[filter_by, thresh/evidence_code, peptide_length]:
        filter_by:
            'number' = number of evidence
            'type' = source of evidence
        thresh/evidence_code:
            threshold = (more than) 1, 2, or 3 experiments
            evidence_code = see citations.tsv in the ProteomeScout download for experiment ids of interest.
        peptide_length:
            7 for PhosphoPICK and GPS
            5 for NetworKIN
    substrate_column: name of the column of df whose unique values are looked up

    Returns
    -------
    dict keyed by each unique value of df[substrate_column]; the value is a
    dataframe holding the S, T and Y lookup rows whose 'phosphorylation' is 1,
    or an empty dataframe with the columns 'substrate_acc', 'site', 'peptide',
    'phosphorylation' when the lookups returned no rows at all
    """
    residues = ('S', 'T', 'Y')
    empty_columns = ['substrate_acc', 'site', 'peptide', 'phosphorylation']

    sites_by_substrate = {}
    # one lookup per residue type for each unique protein accession
    for accession in df[substrate_column].unique():
        per_residue = [
            psp.phosphorylation_data(pscout, accession, filter_setting, residue)
            for residue in residues
        ]
        combined = pd.concat(per_residue)
        if len(combined) == 0:
            # nothing came back for this substrate: store an empty placeholder table
            sites_by_substrate[accession] = pd.DataFrame(columns = empty_columns)
            continue
        # keep only the rows flagged as phosphorylated
        sites_by_substrate[accession] = combined[combined['phosphorylation'] == 1]

    return sites_by_substrate

In [11]:
# DEFINE FUNCTION 'filter_data'

def filter_data(data, pscout, filter_setting):
    """
    Filters the results file to only include sites with known phosphorylation events
    as determined by ProteomeScout. Filters based on site id and peptide segment

    A row is kept when its 'site' is among the known sites of its substrate, or,
    failing that, when its 'pep' (upper-cased, with '_' removed) is among the
    known peptides of its substrate.

    Parameters
    ----------
    data: dataframe of the predictions; the columns 'substrate_acc', 'site' and 'pep' are read
    pscout: reference ProteomeScout file
    filter_setting: [filter_by, thresh/evidence_code, peptide_length]

    Returns
    -------
    dataframe holding the rows of data that were kept, with their original index labels
    """
    # Builds dictionary of substrates and their phosphosites
    phosphosites = get_phosphosites(pscout, data, filter_setting, substrate_column = 'substrate_acc')

    # Build the membership sets once per substrate. Converting the columns to
    # lists for every prediction row made each lookup a linear scan, repeated
    # for up to a million rows per chunk.
    known_sites = {acc: set(table['site']) for acc, table in phosphosites.items()}
    known_peptides = {acc: set(table['peptide']) for acc, table in phosphosites.items()}

    keep_indexes = []
    # iterate the three needed columns directly instead of building a Series per row
    rows = zip(data.index, data['substrate_acc'], data['site'], data['pep'])
    for index, substrate, site, pep in rows:
        if site in known_sites[substrate]:
            # the site matches in the phosphosites dictionary
            keep_indexes.append(index)
        elif pep.upper().replace('_', '') in known_peptides[substrate]:
            # the site doesn't match, but the pep seq matches in the phosphosites dictionary
            keep_indexes.append(index)

    return data.filter(items = keep_indexes, axis = 0)

In [6]:
# DEFINE FUNCTION 'x_reference_pscout'

def x_reference_pscout(perdictor, pscout, filter_setting, output):
    """
    Cross references a large prediction file with ProteomeScout one chunk at a time

    Every chunk of 1,000,000 rows goes through filter_data and then
    rm_unmatched_kinase_type; the rows that remain are appended to the output file.
    The header row is written only while the output file does not exist yet, so a
    file that is already present is appended to, not replaced.

    Parameters
    ----------
    perdictor: the input file location (comma separated)
    pscout: reference ProteomeScout file
    filter_setting: [filter_by, thresh/evidence_code, peptide_length]
    output: output file location (comma separated)
    """
    print('reading file by chunks:')
    reader = pd.read_csv(perdictor, chunksize = 1000000, sep = ',')
    for chunk in reader:
        # step 1: keep the rows with a known phosphosite
        print('Cross referencing with ProteomeScout:')
        start = time.time()
        kept = filter_data(chunk, pscout, filter_setting)
        end = time.time()
        print(f"chunk time \t{(end-start):.3f}")

        # step 2: drop the rows whose kinase type does not fit the site residue
        print('remove any unmatched phosphosite type:')
        start = time.time()
        kept = rm_unmatched_kinase_type(kept)
        end = time.time()
        print(f"chunk time \t{(end-start):.3f}")

        # only the write that creates the file carries the header row
        needs_header = not os.path.isfile(output)
        kept.to_csv(output, mode='a', index=False, sep=',', header=needs_header)



In [7]:
# DEFINE FUNCTION 'rm_unmatched_kinase_type'

def rm_unmatched_kinase_type(df):
    """
    Drops the predictions whose kinase type does not fit the phosphosite residue

    Kinases of type 'Pkinase_tyr' keep only sites containing 'Y', kinases of type
    'Pkinase' keep only sites containing 'S' or 'T', and kinases of type
    'Pkinase/Pkinase_tyr' keep all of their sites. Kinases with any other type
    (or missing from the globalKinaseMap) are dropped.

    Parameters
    ----------
    df: dataframe with the columns 'substrate_id', 'substrate_acc', 'substrate_name',
        'site', 'pep', 'score' and 'Kinase Name'

    Returns
    -------
    dataframe of the kept rows (tyrosine kinases first, then serine/threonine,
    then dual specificity); reset_index() is called without drop, so the previous
    row labels end up in an 'index' column
    """
    # kinase name -> type table from the globalKinaseMap
    kinase_map = pd.read_csv(KinaseMap, usecols = ['Kinase Name', 'Type'])

    def names_of(kinase_type):
        # names of all kinases listed with the given type
        return kinase_map.loc[kinase_map['Type'] == kinase_type, 'Kinase Name']

    wanted_columns = ['substrate_id', 'substrate_acc','substrate_name', 'site','pep', 'score','Kinase Name']
    predictions = df[wanted_columns]
    kinase_names = predictions['Kinase Name']

    # tyrosine kinases: keep Y sites only
    tyr_rows = predictions[kinase_names.isin(names_of('Pkinase_tyr'))]
    tyr_rows = tyr_rows[tyr_rows['site'].str.contains('Y')]

    # serine/threonine kinases: keep S and T sites only
    ser_thr_rows = predictions[kinase_names.isin(names_of('Pkinase'))]
    ser_thr_rows = ser_thr_rows[ser_thr_rows['site'].str.contains('S|T')]

    # dual specificity kinases: every site is kept
    dual_rows = predictions[kinase_names.isin(names_of('Pkinase/Pkinase_tyr'))]

    return pd.concat([tyr_rows, ser_thr_rows, dual_rows]).reset_index()

### Download, Unzip and Save ProteomeScout Phosphorylation Data

In [6]:
# download and unzip current ProteomeScout data
# The response, the archive and both output files are opened with context
# managers so they are closed even if the download or extraction fails.
# The archive members are copied as raw bytes, so the saved tables do not depend
# on the platform's default text encoding or newline handling.
os.makedirs(os.path.dirname(pscout_data), exist_ok=True)  # both tables go to this directory
with urlopen(data_add) as resp:
    payload = BytesIO(resp.read())
with ZipFile(payload) as archive:
    for member, destination in ((data_file, pscout_data), (citation_file, pscout_citation)):
        with archive.open(member) as src, open(destination, 'wb') as dst:
            for line in src:
                dst.write(line)

In [6]:
# load the extracted ProteomeScout data file using the ProteomeScout API;
# pscout is the reference object passed to x_reference_pscout in the cells below
pscout = proteomeScoutAPI.ProteomeScoutAPI(pscout_data)

## x ProteomeScout Phosphorylation Data: All Human phosphosites

**filter_setting\[filter_by, thresh/evidence_code, peptide_length\]**
- filter_by:
    - 'number' = number of evidence
    - 'type' = source of evidence
- thresh/evidence_code:
    - threshold = (more than) 1, 2, or 3 experiments
    - evidence_code = see citations.tsv in the ProteomeScout download for experiment ids of interest.
        - 1575: dbPTM
        - 1323: phosphoELM
        - 1395: HPRD
        - 1803: Uniprot
- peptide_length:
    - 7 for PhosphoPICK and GPS
    - 5 for NetworKIN


### PhosphoPICK 
- cross-reference PhosphoPICK results with ProteomeScout: all human phosphosites
- keep entries of confirmed phosphorylation sites

In [None]:
# PhosphoPICK x ProteomeScout: filter by number of evidence (1), peptide_length 7
PP_final = f'{out_dir}{pp}_all.csv'
filter_setting = ['number', 1, 7]
x_reference_pscout(PP_file, pscout, filter_setting, PP_final)

In [5]:
# create the final PhosphoPICK prediction data in matrix format and time the conversion
start = time.time()
PP_matrix = f'{out_dir}{pp}_all_matrix.csv'
df_matrix = createSubKinMatrix.createMatrix(PP_final, PP_matrix)
end = time.time()
print(f"Total Time\t{(end-start):.3f}")
df_matrix

reading input file....
time:  514.7537043094635
creating matrix....
time:  698.3762056827545
saving file....
time:  483.03302097320557
Total Time	1696.396


Kinase Name,substrate_id,substrate_name,substrate_acc,site,pep,ABL1,ABL2,AKT1,AKT2,ALK,...,RPS6KB1,SGK1,SRC,STK11,STK3,STK4,SYK,TBK1,TTK,VRK1
0,A0AUZ9_105,KANSL1L,A0AUZ9,S105,QKKLGEPSCNKLKNI,-,-,0.31384,0.493335,-,...,0.306717,0.952709,-,0.267147,0.0686061,0.0108412,-,0.999963,0.25365,0.161356
1,A0AUZ9_205,KANSL1L,A0AUZ9,S205,KKIVPGHSNVPVSSS,-,-,0.262498,0.395382,-,...,0.18495,0.729764,-,0.523432,0.95497,0.117029,-,0.609988,0.823403,0.998463
2,A0AUZ9_462,KANSL1L,A0AUZ9,S462,PEQDFEMSPSSPTLL,-,-,0.299157,0.431462,-,...,0.234139,0.942794,-,0.520239,0.958706,0.0848029,-,0.688605,0.999362,0.857669
3,A0AUZ9_464,KANSL1L,A0AUZ9,S464,QDFEMSPSSPTLLLR,-,-,0.28393,0.505125,-,...,0.288901,0.619724,-,0.326365,0.808062,0.039911,-,0.94385,0.946977,0.57769
4,A0AUZ9_519,KANSL1L,A0AUZ9,S519,NGIYRSASENLDELS,-,-,0.0303407,0.133198,-,...,0.101713,0.26985,-,0.411705,0.75646,0.0369049,-,0.966373,0.934602,0.985039
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
235927,Q9Y6Y9_22,LY96,Q9Y6Y9,Y22,FTEAQKQYWVCNSSD,0.956079,0.959977,-,-,0.415029,...,-,-,0.512277,-,-,-,0.676518,-,-,-
235928,Q9Y6Z7_141,COLEC10,Q9Y6Z7,S141,SIARLKTSMKFVKNV,-,-,0.634687,0.203743,-,...,0.611742,0.887418,-,0.690986,0.224621,0.0623551,-,0.915975,0.992806,0.65835
235929,Q9Y6Z7_155,COLEC10,Q9Y6Z7,T155,VIAGIRETEEKFYYI,-,-,0.809654,0.595772,-,...,0.708029,0.908424,-,0.661208,0.91861,0.127396,-,0.999998,0.824276,0.853006
235930,Q9YNA8_181,ERVK-19,Q9YNA8,T181,SESKPRGTSRLPAGQ,-,-,0.59466,0.273395,-,...,0.205518,0.336644,-,0.446203,0.998199,0.183948,-,0.997309,0.934105,0.983127


## NetworKIN
- cross-reference NetworKIN specified results with ProteomeScout: all human phosphosites
- keep entries of confirmed phosphorylation sites

In [None]:
# NetworKIN x ProteomeScout: filter by number of evidence (1), peptide_length 5
NW_final = f'{out_dir}{nw}_all.csv'
filter_setting = ['number', 1, 5]
x_reference_pscout(NW_file, pscout, filter_setting, NW_final)

In [6]:
# create the final NetworKIN prediction data in matrix format and time the conversion
start = time.time()
NW_matrix = f'{out_dir}{nw}_all_matrix.csv'
df_matrix = createSubKinMatrix.createMatrix(NW_final, NW_matrix)
end = time.time()
print(f"Total Time\t{(end-start):.3f}")
df_matrix

reading input file....
time:  506.28629207611084
creating matrix....
time:  2838.734772205353
saving file....
time:  588.9918270111084
Total Time	3935.263


Kinase Name,substrate_id,substrate_name,substrate_acc,site,pep,ABL1,ABL2,ACVR2A,ACVR2B,AKT1,...,STK4,TEC,TGFBR2,TLK1,TLK2,TNIK,TTK,TXK,TYK2,YES1
0,A0AV02_107,SLC12A8,A0AV02,Y107,GSGGVYSMISS,0.2054,0.1582,-,-,-,...,-,0.0538,-,-,-,-,-,0.0538,0,0.287
1,A0AV02_485,SLC12A8,A0AV02,T485,GEGNRTPESQK,-,-,-,-,0.0081,...,0.1564,-,-,0.0138,0.0138,0.1637,0.0002,-,-,-
2,A0AV02_488,SLC12A8,A0AV02,S488,NRTPESQKRKS,-,-,0.2836,0.2836,0.0071,...,-,-,0.2836,0.0276,0.0276,-,0,-,-,-
3,A0AV02_99,SLC12A8,A0AV02,S99,GVGERSSIGSG,-,-,0.4005,0.4005,0.0107,...,-,-,0.4005,0.1702,0.1702,-,0,-,-,-
4,A0AVF1_167,TTC26,A0AVF1,Y167,YMRSHYQEAID,0.2054,0.1581,-,-,-,...,-,0.0619,-,-,-,-,-,0.0619,0,0.1637
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
231245,Q9Y6Y8_935,SEC23IP,Q9Y6Y8,Y935,SKDEDYLGKVG,0.2054,0.1854,-,-,-,...,-,0.0482,-,-,-,-,-,0.0482,0,0.1801
231246,Q9Y6Y9_131,LY96,Q9Y6Y9,Y131,FSKGKYKCVVE,0.2054,0.1581,-,-,-,...,-,0.1795,-,-,-,-,-,0.1665,0,0.1636
231247,Q9Y6Y9_22,LY96,Q9Y6Y9,Y22,EAQKQYWVCNS,0.2054,0.1581,-,-,-,...,-,0.1795,-,-,-,-,-,0.1665,0,0.1637
231248,Q9Y6Z7_141,COLEC10,Q9Y6Z7,S141,ARLKTSMKFVK,-,-,0.1671,0.1671,0.0107,...,-,-,0.1671,0.0103,0.0103,-,0,-,-,-


## GPS
- cross-reference GPS results with ProteomeScout: all human phosphosites
- keep entries of confirmed phosphorylation sites

In [None]:
# GPS x ProteomeScout: filter by number of evidence (1), peptide_length 7
GPS_final = f'{out_dir}{gps}_all.csv'
filter_setting = ['number', 1, 7]
x_reference_pscout(GPS_file, pscout, filter_setting, GPS_final)

In [10]:
# create the final GPS prediction data in matrix format and time the conversion
start = time.time()
GPS_matrix = f'{out_dir}{gps}_all_matrix.csv'
df_matrix = createSubKinMatrix.createMatrix(GPS_final, GPS_matrix)
end = time.time()
print(f"Total Time\t{(end-start):.3f}")
df_matrix

reading input file....
time:  2405.549310207367
creating matrix....
time:  11164.601485013962
saving file....
time:  882.6065351963043
Total Time	14458.558


Kinase Name,substrate_id,substrate_name,substrate_acc,site,pep,AAK1,ABL1,ABL2,ACVRL1,AKT1,...,ULK2,ULK3,VRK1,VRK2,WNK1,WNK2,WNK3,WNK4,YES1,ZAP70
0,A0AUZ9_105,KANSL1L,A0AUZ9,S105,QKKLGEPSCNKLKNI,-0.796,-,-,-0.802,5.636,...,3.407,0.13,0.16,24.09,-8.76e-05,2.92,0.61,1.465,-,-
1,A0AUZ9_205,KANSL1L,A0AUZ9,S205,KKIVPGHSNVPVSSS,-0.0327,-,-,-0.154,3.997,...,-0.235,1.637,0.159,46.489,0.000366,0.847,0.841,0.691,-,-
2,A0AUZ9_462,KANSL1L,A0AUZ9,S462,PEQDFEMSPSSPTLL,0.002,-,-,-1.26,6.592,...,3.243,-0.38,0.296,13.935,-0.000365,1.903,1.294,-0.415,-,-
3,A0AUZ9_464,KANSL1L,A0AUZ9,S464,QDFEMSPSSPTLLLR,0.077,-,-,-0.426,5.256,...,3.186,-0.73,0.09,33.548,-0.00044,0.576,0.98,0.498,-,-
4,A0AUZ9_519,KANSL1L,A0AUZ9,S519,NGIYRSASENLDELS,0.903,-,-,-0.645,11.737,...,3.167,1.518,0.047,-2.98,-0.000152,-0.938,-0.974,-0.587,-,-
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
246005,Q9Y6Y9_22,LY96,Q9Y6Y9,Y22,FTEAQKQYWVCNSSD,-,-1.4,-3.85,-,-,...,-,-,-,-,-,-,-,-,8.093,0.819
246006,Q9Y6Z7_141,COLEC10,Q9Y6Z7,S141,SIARLKTSMKFVKNV,-2.07,-,-,-0.197,5.14,...,3.187,-0.495,-0.0712,16.753,0.000203,1.255,0.304,2.469,-,-
246007,Q9Y6Z7_155,COLEC10,Q9Y6Z7,T155,VIAGIRETEEKFYYI,0.575,-,-,-0.979,2.315,...,3.305,-0.58,0.174,32.484,0.000387,-0.0423,-0.634,0.175,-,-
246008,Q9YNA8_181,ERVK-19,Q9YNA8,T181,SESKPRGTSRLPAGQ,1.148,-,-,0.59,3.483,...,1.049,-0.289,0.116,4.295,0.000111,2.803,0.689,0.374,-,-


## x ProteomeScout Phosphorylation Data: more than 2 pieces of experimental evidence

### PhosphoPICK 
- cross-reference PhosphoPICK results with ProteomeScout: > 2 exp
- keep entries of phosphorylation sites that are confirmed by more than 2 experiments

In [11]:
# PhosphoPICK, evidence threshold 2: re-filter the already cross-referenced '_all' file
PP_final_2 = f'{out_dir}{pp}_2exp.csv'
filter_setting = ['number', 2, 7]
x_reference_pscout(PP_final, pscout, filter_setting, PP_final_2)

In [12]:
# create the final PhosphoPICK (2 exp) prediction data in matrix format and time the conversion
start = time.time()
PP_matrix_2 = f'{out_dir}{pp}_2exp_matrix.csv'
df_matrix = createSubKinMatrix.createMatrix(PP_final_2, PP_matrix_2)
end = time.time()
print(f"Total Time\t{(end-start):.3f}")
df_matrix

reading input file....
time:  162.90110611915588
creating matrix....
time:  468.56794595718384
saving file....
time:  217.8954999446869
Total Time	849.563


Kinase Name,substrate_id,substrate_name,substrate_acc,site,pep,ABL1,ABL2,AKT1,AKT2,ALK,...,RPS6KB1,SGK1,SRC,STK11,STK3,STK4,SYK,TBK1,TTK,VRK1
0,A0AUZ9_714,KANSL1L,A0AUZ9,S714,GRKKRHLSETALGER,-,-,0.0177951,0.0371243,-,...,0.00649734,0.170851,-,0.677318,0.867988,0.0695949,-,0.972387,0.239441,0.797879
1,A0AVF1_167,TTC26,A0AVF1,Y167,IHYMRSHYQEAIDIY,0.770165,0.321441,-,-,0.81248,...,-,-,0.32199,-,-,-,0.488359,-,-,-
2,A0AVF1_174,TTC26,A0AVF1,Y174,YQEAIDIYKRILLDN,0.472744,0.343645,-,-,0.175019,...,-,-,0.24153,-,-,-,0.603764,-,-,-
3,A0AVK6_102,E2F8,A0AVK6,S102,DCIHEHLSGDEFEKS,-,-,0.368837,0.258889,-,...,0.49183,0.183528,-,0.484654,0.865798,0.100145,-,0.820923,0.997087,0.999137
4,A0AVK6_316,E2F8,A0AVK6,Y316,KTKIRRLYDIANVLS,0.216595,0.756549,-,-,0.0919727,...,-,-,0.36341,-,-,-,0.83055,-,-,-
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100208,Q9Y6Y8_868,SEC23IP,Q9Y6Y8,S868,ESLSRMGSDLKQGFI,-,-,0.145398,0.650732,-,...,0.050197,0.128863,-,0.107274,0.63288,0.437779,-,0.491654,0.79775,0.618392
100209,Q9Y6Y8_894,SEC23IP,Q9Y6Y8,S894,EFARAHTSSTQLQEE,-,-,0.165402,0.140362,-,...,0.0555068,0.0107901,-,0.166863,0.268594,0.766834,-,0.929262,0.957763,0.646376
100210,Q9Y6Y8_926,SEC23IP,Q9Y6Y8,S926,EAEKVVESPDFSKDE,-,-,0.742549,0.890605,-,...,0.167823,0.542968,-,0.158054,0.397929,0.769324,-,0.984595,0.535643,0.846977
100211,Q9Y6Y8_930,SEC23IP,Q9Y6Y8,S930,VVESPDFSKDEDYLG,-,-,0.738674,0.902687,-,...,0.18677,0.423831,-,0.101548,0.714349,0.76956,-,0.985227,0.850026,0.929873


## NetworKIN 
- cross-reference NetworKIN specified results with ProteomeScout: >2 exp
- keep entries of phosphorylation sites that are confirmed by more than 2 experiments

In [11]:
# NetworKIN, evidence threshold 2: re-filter the already cross-referenced '_all' file
NW_final_2 = f'{out_dir}{nw}_2exp.csv'
filter_setting = ['number', 2, 5]
x_reference_pscout(NW_final, pscout, filter_setting, NW_final_2)

reading file by chunks:
Cross referencing with ProteomeScout:
chunk time 	1198.034
remove any unmatched phosphosite type:
chunk time 	0.920
Cross referencing with ProteomeScout:
chunk time 	1119.530
remove any unmatched phosphosite type:
chunk time 	3.978
Cross referencing with ProteomeScout:
chunk time 	202.226
remove any unmatched phosphosite type:
chunk time 	0.822
Cross referencing with ProteomeScout:
chunk time 	193.833
remove any unmatched phosphosite type:
chunk time 	0.822
Cross referencing with ProteomeScout:
chunk time 	206.040
remove any unmatched phosphosite type:
chunk time 	0.792
Cross referencing with ProteomeScout:
chunk time 	199.863
remove any unmatched phosphosite type:
chunk time 	0.809
Cross referencing with ProteomeScout:
chunk time 	198.356
remove any unmatched phosphosite type:
chunk time 	0.724
Cross referencing with ProteomeScout:
chunk time 	183.605
remove any unmatched phosphosite type:
chunk time 	0.791
Cross referencing with ProteomeScout:
chunk time 	181.

In [13]:
# create the final NetworKIN (2 exp) prediction data in matrix format and time the conversion
start = time.time()
NW_matrix_2 = f'{out_dir}{nw}_2exp_matrix.csv'
df_matrix = createSubKinMatrix.createMatrix(NW_final_2, NW_matrix_2)
end = time.time()
print(f"Total Time\t{(end-start):.3f}")
df_matrix

reading input file....
time:  186.29736804962158
creating matrix....
time:  1620.3240761756897
saving file....
time:  190.20430994033813
Total Time	1997.634


Kinase Name,substrate_id,substrate_name,substrate_acc,site,pep,ABL1,ABL2,ACVR2A,ACVR2B,AKT1,...,STK4,TEC,TGFBR2,TLK1,TLK2,TNIK,TTK,TXK,TYK2,YES1
0,A0AVF1_167,TTC26,A0AVF1,Y167,YMRSHYQEAID,0.2054,0.1581,-,-,-,...,-,0.0619,-,-,-,-,-,0.0619,0,0.1637
1,A0AVF1_174,TTC26,A0AVF1,Y174,EAIDIYKRILL,0.2054,0.1581,-,-,-,...,-,0.1636,-,-,-,-,-,0.1636,0.0001,0.2241
2,A0AVI2_1840,FER1L5,A0AVI2,S1840,LELDLSDMPLP,-,-,0.2323,0.2323,0.0066,...,-,-,0.2323,0.0268,0.0268,-,0,-,-,-
3,A0AVK6_102,E2F8,A0AVK6,S102,IHEHLSGDEFE,-,-,0.1642,0.1642,0.0094,...,-,-,0.1642,0.0733,0.0733,-,0.0001,-,-,-
4,A0AVK6_316,E2F8,A0AVK6,Y316,KIRRLYDIANV,0.2054,0.1581,-,-,-,...,-,0.1642,-,-,-,-,-,0.1642,0,0.1639
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99808,Q9Y6Y8_868,SEC23IP,Q9Y6Y8,S868,LSRMGSDLKQG,-,-,0.1671,0.1671,0.0469,...,-,-,0.1686,0.0101,0.0101,-,0,-,-,-
99809,Q9Y6Y8_894,SEC23IP,Q9Y6Y8,S894,ARAHTSSTQLQ,-,-,0.1675,0.1675,0.0249,...,-,-,0.1689,0.0773,0.0773,-,0,-,-,-
99810,Q9Y6Y8_926,SEC23IP,Q9Y6Y8,S926,EKVVESPDFSK,-,-,-,-,0.0154,...,-,-,-,0.0144,0.0144,-,0,-,-,-
99811,Q9Y6Y8_930,SEC23IP,Q9Y6Y8,S930,ESPDFSKDEDY,-,-,0.3492,0.3492,0.0124,...,-,-,0.3521,0.2188,0.2188,-,0,-,-,-


## GPS 
- cross-reference GPS results with ProteomeScout: >2 exp
- keep entries of phosphorylation sites that are confirmed by more than 2 experiments

In [13]:
# GPS, evidence threshold 2: re-filter the already cross-referenced '_all' file
GPS_final_2 = f'{out_dir}{gps}_2exp.csv'
filter_setting = ['number', 2, 7]
x_reference_pscout(GPS_final, pscout, filter_setting, GPS_final_2)

reading file by chunks:
Cross referencing with ProteomeScout:
chunk time 	1019.004
remove any unmatched phosphosite type:
chunk time 	0.773
Cross referencing with ProteomeScout:
chunk time 	1085.658
remove any unmatched phosphosite type:
chunk time 	0.755
Cross referencing with ProteomeScout:
chunk time 	720.381
remove any unmatched phosphosite type:
chunk time 	0.722
Cross referencing with ProteomeScout:
chunk time 	277.821
remove any unmatched phosphosite type:
chunk time 	0.930
Cross referencing with ProteomeScout:
chunk time 	222.889
remove any unmatched phosphosite type:
chunk time 	0.820
Cross referencing with ProteomeScout:
chunk time 	353.611
remove any unmatched phosphosite type:
chunk time 	1.001
Cross referencing with ProteomeScout:
chunk time 	216.387
remove any unmatched phosphosite type:
chunk time 	3.697
Cross referencing with ProteomeScout:
chunk time 	216.951
remove any unmatched phosphosite type:
chunk time 	1.009
Cross referencing with ProteomeScout:
chunk time 	354.

In [14]:
# create the final GPS (2 exp) prediction data in matrix format and time the conversion
start = time.time()
GPS_matrix_2 = f'{out_dir}{gps}_2exp_matrix.csv'
df_matrix = createSubKinMatrix.createMatrix(GPS_final_2, GPS_matrix_2)
end = time.time()
print(f"Total Time\t{(end-start):.3f}")
df_matrix

reading input file....
time:  541.7289910316467
creating matrix....
time:  4299.524427890778
saving file....
time:  354.0694088935852
Total Time	5198.493


Kinase Name,substrate_id,substrate_name,substrate_acc,site,pep,AAK1,ABL1,ABL2,ACVRL1,AKT1,...,ULK2,ULK3,VRK1,VRK2,WNK1,WNK2,WNK3,WNK4,YES1,ZAP70
0,A0AUZ9_714,KANSL1L,A0AUZ9,S714,GRKKRHLSETALGER,2.112,-,-,-1.02,13.524,...,2.735,0.739,-0.0866,17.681,0.000554,1.7,1.339,2.607,-,-
1,A0AVF1_167,TTC26,A0AVF1,Y167,IHYMRSHYQEAIDIY,-,5.133,18.454,-,-,...,-,-,-,-,-,-,-,-,4.062,1.726
2,A0AVF1_174,TTC26,A0AVF1,Y174,YQEAIDIYKRILLDN,-,8.081,10.857,-,-,...,-,-,-,-,-,-,-,-,-1.97,-2.92
3,A0AVI2_1801,FER1L5,A0AVI2,Y1801,CVQSQKDYIWSLDAT,-,1.76,6.075,-,-,...,-,-,-,-,-,-,-,-,5.344,-0.225
4,A0AVI2_1804,FER1L5,A0AVI2,S1804,SQKDYIWSLDATSMK,-0.355,-,-,-0.416,5.486,...,0.976,-0.6,0.135,-7.15,-0.000133,1.709,0.637,0.186,-,-
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103347,Q9Y6Y8_868,SEC23IP,Q9Y6Y8,S868,ESLSRMGSDLKQGFI,-0.721,-,-,-0.102,10.466,...,1.464,0.971,0.2,13.094,0.000108,2.198,1.657,1.866,-,-
103348,Q9Y6Y8_894,SEC23IP,Q9Y6Y8,S894,EFARAHTSSTQLQEE,-1.58,-,-,-0.565,5.589,...,1.598,1.433,0.048,13.049,-3.02e-06,0.313,-0.0935,1.243,-,-
103349,Q9Y6Y8_926,SEC23IP,Q9Y6Y8,S926,EAEKVVESPDFSKDE,-0.244,-,-,-0.0759,3.692,...,1.969,0.849,0.209,16.522,6.97e-05,1.362,0.013,1.788,-,-
103350,Q9Y6Y8_930,SEC23IP,Q9Y6Y8,S930,VVESPDFSKDEDYLG,1.851,-,-,-0.162,5.064,...,1.081,0.206,0.205,-3.2,0.000255,1.759,-0.869,0.584,-,-


## x ProteomeScout Phosphorylation Data: more than 3 pieces of experimental evidence

### PhosphoPICK 
- cross-reference PhosphoPICK results with ProteomeScout: > 3 exp
- keep entries of phosphorylation sites that are confirmed by more than 3 experiments

In [10]:
# PhosphoPICK, evidence threshold 3: re-filter the '_2exp' file
PP_final_3 = f'{out_dir}{pp}_3exp.csv'
filter_setting = ['number', 3, 7]
x_reference_pscout(PP_final_2, pscout, filter_setting, PP_final_3)

reading file by chunks:
Cross referencing with ProteomeScout:
chunk time 	1399.239
remove any unmatched phosphosite type:
chunk time 	1.036
Cross referencing with ProteomeScout:
chunk time 	360.003
remove any unmatched phosphosite type:
chunk time 	1.083
Cross referencing with ProteomeScout:
chunk time 	333.817
remove any unmatched phosphosite type:
chunk time 	4.696
Cross referencing with ProteomeScout:
chunk time 	362.401
remove any unmatched phosphosite type:
chunk time 	1.053
Cross referencing with ProteomeScout:
chunk time 	359.070
remove any unmatched phosphosite type:
chunk time 	0.920
Cross referencing with ProteomeScout:
chunk time 	365.897
remove any unmatched phosphosite type:
chunk time 	1.020
Cross referencing with ProteomeScout:
chunk time 	359.378
remove any unmatched phosphosite type:
chunk time 	1.083
Cross referencing with ProteomeScout:
chunk time 	2032.058
remove any unmatched phosphosite type:
chunk time 	0.909


In [15]:
# create the final PhosphoPICK (3 exp) prediction data in matrix format and time the conversion
start = time.time()
PP_matrix_3 = f'{out_dir}{pp}_3exp_matrix.csv'
df_matrix = createSubKinMatrix.createMatrix(PP_final_3, PP_matrix_3)
end = time.time()
print(f"Total Time\t{(end-start):.3f}")
df_matrix

reading input file....
time:  102.87561082839966
creating matrix....
time:  287.2023937702179
saving file....
time:  132.44462513923645
Total Time	522.606


Kinase Name,substrate_id,substrate_name,substrate_acc,site,pep,ABL1,ABL2,AKT1,AKT2,ALK,...,RPS6KB1,SGK1,SRC,STK11,STK3,STK4,SYK,TBK1,TTK,VRK1
0,A0AUZ9_714,KANSL1L,A0AUZ9,S714,GRKKRHLSETALGER,-,-,0.0177951,0.0371243,-,...,0.00649734,0.170851,-,0.677318,0.867988,0.0695949,-,0.972387,0.239441,0.797879
1,A0AVF1_167,TTC26,A0AVF1,Y167,IHYMRSHYQEAIDIY,0.770165,0.321441,-,-,0.81248,...,-,-,0.32199,-,-,-,0.488359,-,-,-
2,A0AVF1_174,TTC26,A0AVF1,Y174,YQEAIDIYKRILLDN,0.472744,0.343645,-,-,0.175019,...,-,-,0.24153,-,-,-,0.603764,-,-,-
3,A0AVK6_102,E2F8,A0AVK6,S102,DCIHEHLSGDEFEKS,-,-,0.368837,0.258889,-,...,0.49183,0.183528,-,0.484654,0.865798,0.100145,-,0.820923,0.997087,0.999137
4,A0AVK6_355,E2F8,A0AVK6,S355,PEISPNTSGSSPVIH,-,-,0.310041,0.111103,-,...,0.299193,0.0584277,-,0.474796,0.996856,0.181339,-,0.950161,0.875254,0.99058
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62771,Q9Y6Y8_742,SEC23IP,Q9Y6Y8,S742,MASLPSESNEPKRKL,-,-,0.619325,0.876464,-,...,0.0942796,0.347304,-,0.211332,0.439555,0.441494,-,0.425666,0.992769,0.258528
62772,Q9Y6Y8_862,SEC23IP,Q9Y6Y8,S862,LHLELKESLSRMGSD,-,-,0.498193,0.456675,-,...,0.0748745,0.443008,-,0.0907065,0.693489,0.577784,-,0.232276,0.451305,0.942333
62773,Q9Y6Y8_868,SEC23IP,Q9Y6Y8,S868,ESLSRMGSDLKQGFI,-,-,0.145398,0.650732,-,...,0.050197,0.128863,-,0.107274,0.63288,0.437779,-,0.491654,0.79775,0.618392
62774,Q9Y6Y8_894,SEC23IP,Q9Y6Y8,S894,EFARAHTSSTQLQEE,-,-,0.165402,0.140362,-,...,0.0555068,0.0107901,-,0.166863,0.268594,0.766834,-,0.929262,0.957763,0.646376


## NetworKIN 
- cross-reference NetworKIN specified results with ProteomeScout: >3 exp
- keep entries of phosphorylation sites that are confirmed by more than 3 experiments

In [12]:
# NetworKIN, evidence threshold 3: re-filter the '_2exp' file
NW_final_3 = f'{out_dir}{nw}_3exp.csv'
filter_setting = ['number', 3, 5]
x_reference_pscout(NW_final_2, pscout, filter_setting, NW_final_3)

reading file by chunks:
Cross referencing with ProteomeScout:
chunk time 	1334.859
remove any unmatched phosphosite type:
chunk time 	1.031
Cross referencing with ProteomeScout:
chunk time 	276.368
remove any unmatched phosphosite type:
chunk time 	1.088
Cross referencing with ProteomeScout:
chunk time 	282.086
remove any unmatched phosphosite type:
chunk time 	1.150
Cross referencing with ProteomeScout:
chunk time 	252.170
remove any unmatched phosphosite type:
chunk time 	1.061
Cross referencing with ProteomeScout:
chunk time 	270.240
remove any unmatched phosphosite type:
chunk time 	0.961
Cross referencing with ProteomeScout:
chunk time 	278.601
remove any unmatched phosphosite type:
chunk time 	0.961
Cross referencing with ProteomeScout:
chunk time 	290.867
remove any unmatched phosphosite type:
chunk time 	1.166
Cross referencing with ProteomeScout:
chunk time 	279.410
remove any unmatched phosphosite type:
chunk time 	0.936
Cross referencing with ProteomeScout:
chunk time 	265.9

In [16]:
# create the final NetworKIN (3 exp) prediction data in matrix format and time the conversion
start = time.time()
NW_matrix_3 = f'{out_dir}{nw}_3exp_matrix.csv'
df_matrix = createSubKinMatrix.createMatrix(NW_final_3, NW_matrix_3)
end = time.time()
print(f"Total Time\t{(end-start):.3f}")
df_matrix

reading input file....
time:  140.49775505065918
creating matrix....
time:  1001.6730706691742
saving file....
time:  101.22164678573608
Total Time	1244.354


Kinase Name,substrate_id,substrate_name,substrate_acc,site,pep,ABL1,ABL2,ACVR2A,ACVR2B,AKT1,...,STK4,TEC,TGFBR2,TLK1,TLK2,TNIK,TTK,TXK,TYK2,YES1
0,A0AVF1_167,TTC26,A0AVF1,Y167,YMRSHYQEAID,0.2054,0.1581,-,-,-,...,-,0.0619,-,-,-,-,-,0.0619,0,0.1637
1,A0AVF1_174,TTC26,A0AVF1,Y174,EAIDIYKRILL,0.2054,0.1581,-,-,-,...,-,0.1636,-,-,-,-,-,0.1636,0.0001,0.2241
2,A0AVK6_102,E2F8,A0AVK6,S102,IHEHLSGDEFE,-,-,0.1642,0.1642,0.0094,...,-,-,0.1642,0.0733,0.0733,-,0.0001,-,-,-
3,A0AVK6_355,E2F8,A0AVK6,S355,ISPNTSGSSPV,-,-,0.3208,0.3208,0.0166,...,-,-,0.3208,0.1016,0.1016,-,0.0001,-,-,-
4,A0AVK6_357,E2F8,A0AVK6,S357,PNTSGSSPVIH,-,-,0.1671,0.1671,0.0144,...,-,-,0.1671,0.1479,0.1479,-,0.0001,-,-,-
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62662,Q9Y6Y8_742,SEC23IP,Q9Y6Y8,S742,SLPSESNEPKR,-,-,0.3134,0.3134,0.0135,...,-,-,0.3161,0.1681,0.1681,-,0,-,-,-
62663,Q9Y6Y8_862,SEC23IP,Q9Y6Y8,S862,LELKESLSRMG,-,-,0.2941,0.2941,0.0113,...,-,-,0.2967,0.0164,0.0164,-,0,-,-,-
62664,Q9Y6Y8_868,SEC23IP,Q9Y6Y8,S868,LSRMGSDLKQG,-,-,0.1671,0.1671,0.0469,...,-,-,0.1686,0.0101,0.0101,-,0,-,-,-
62665,Q9Y6Y8_894,SEC23IP,Q9Y6Y8,S894,ARAHTSSTQLQ,-,-,0.1675,0.1675,0.0249,...,-,-,0.1689,0.0773,0.0773,-,0,-,-,-


## GPS
- cross-reference GPS results with ProteomeScout: >3 exp
- keep entries of phosphorylation sites that are confirmed by more than 3 experiments

In [14]:
# GPS, evidence threshold 3: re-filter the '_2exp' file
GPS_final_3 = f'{out_dir}{gps}_3exp.csv'
filter_setting = ['number', 3, 7]
x_reference_pscout(GPS_final_2, pscout, filter_setting, GPS_final_3)

reading file by chunks:
Cross referencing with ProteomeScout:
chunk time 	1480.297
remove any unmatched phosphosite type:
chunk time 	14.850
Cross referencing with ProteomeScout:
chunk time 	241.280
remove any unmatched phosphosite type:
chunk time 	21.755
Cross referencing with ProteomeScout:
chunk time 	299.123
remove any unmatched phosphosite type:
chunk time 	1.040
Cross referencing with ProteomeScout:
chunk time 	296.836
remove any unmatched phosphosite type:
chunk time 	11.412
Cross referencing with ProteomeScout:
chunk time 	300.208
remove any unmatched phosphosite type:
chunk time 	4.735
Cross referencing with ProteomeScout:
chunk time 	312.530
remove any unmatched phosphosite type:
chunk time 	34.637
Cross referencing with ProteomeScout:
chunk time 	346.617
remove any unmatched phosphosite type:
chunk time 	23.773
Cross referencing with ProteomeScout:
chunk time 	291.930
remove any unmatched phosphosite type:
chunk time 	12.174
Cross referencing with ProteomeScout:
chunk time 

ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

In [18]:
# create the final GPS (3 exp) prediction data in matrix format and time the conversion
start = time.time()
GPS_matrix_3 = f'{out_dir}{gps}_3exp_matrix.csv'
df_matrix = createSubKinMatrix.createMatrix(GPS_final_3, GPS_matrix_3)
end = time.time()
print(f"Total Time\t{(end-start):.3f}")
df_matrix

reading input file....
time:  2105.880768060684
creating matrix....
time:  5119.049507856369
saving file....
time:  391.0924758911133
Total Time	7618.281


Kinase Name,substrate_id,substrate_name,substrate_acc,site,pep,AAK1,ABL1,ABL2,ACVRL1,AKT1,...,ULK2,ULK3,VRK1,VRK2,WNK1,WNK2,WNK3,WNK4,YES1,ZAP70
0,A0AUZ9_714,KANSL1L,A0AUZ9,S714,GRKKRHLSETALGER,2.112,-,-,-1.02,13.524,...,2.735,0.739,-0.0866,17.681,0.000554,1.7,1.339,2.607,-,-
1,A0AVF1_167,TTC26,A0AVF1,Y167,IHYMRSHYQEAIDIY,-,5.133,18.454,-,-,...,-,-,-,-,-,-,-,-,4.062,1.726
2,A0AVF1_174,TTC26,A0AVF1,Y174,YQEAIDIYKRILLDN,-,8.081,10.857,-,-,...,-,-,-,-,-,-,-,-,-1.97,-2.92
3,A0AVI2_1801,FER1L5,A0AVI2,Y1801,CVQSQKDYIWSLDAT,-,1.76,6.075,-,-,...,-,-,-,-,-,-,-,-,5.344,-0.225
4,A0AVI2_1804,FER1L5,A0AVI2,S1804,SQKDYIWSLDATSMK,-0.355,-,-,-0.416,5.486,...,0.976,-0.6,0.135,-7.15,-0.000133,1.709,0.637,0.186,-,-
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103347,Q9Y6Y8_868,SEC23IP,Q9Y6Y8,S868,ESLSRMGSDLKQGFI,-0.721,-,-,-0.102,10.466,...,1.464,0.971,0.2,13.094,0.000108,2.198,1.657,1.866,-,-
103348,Q9Y6Y8_894,SEC23IP,Q9Y6Y8,S894,EFARAHTSSTQLQEE,-1.58,-,-,-0.565,5.589,...,1.598,1.433,0.048,13.049,-3.02e-06,0.313,-0.0935,1.243,-,-
103349,Q9Y6Y8_926,SEC23IP,Q9Y6Y8,S926,EAEKVVESPDFSKDE,-0.244,-,-,-0.0759,3.692,...,1.969,0.849,0.209,16.522,6.97e-05,1.362,0.013,1.788,-,-
103350,Q9Y6Y8_930,SEC23IP,Q9Y6Y8,S930,VVESPDFSKDEDYLG,1.851,-,-,-0.162,5.064,...,1.081,0.206,0.205,-3.2,0.000255,1.759,-0.869,0.584,-,-
