# Introduction
This notebook creates subsets of the prediction data filtered by different thresholds, in order to explore the effects of these thresholds on overall predictor coverage and size.

In [1]:
import pandas as pd

In [2]:
# Root directory that every data path in this notebook is built from.
# Options: local checkout -> '../../', sammas -> '/Volumes/naegle_lab/Kinase Predictions/'
base = "/Volumes/naegle_lab/Kinase Predictions/"

In [86]:
# ---- Input files ----
# Prediction data type (d_type) selects which subset of phosphosites is processed:
#   'all'  = all human phosphosites
#   '2exp' = with > 2 experimental evidence
#   '3exp' = with > 3 experimental evidence
d_type = '3exp'

# Unfiltered prediction tables, one per predictor
PhosphoPICK = base + 'Data/comparison/Thresh/PhosphoPICK/' + d_type + '/PhosphoPICK.csv'
NetworKIN = base + 'Data/comparison/Thresh/NetworKIN/' + d_type + '/NetworKIN.csv'
GPS = base + 'Data/comparison/Thresh/GPS5.0/' + d_type + '/GPS.csv'

# GPS kinases
GPS_kinase = base + 'Data/Raw/GPS5.0/gps_valid_kinases.csv'
# Manually extracted cutoff scores from GPS5.0.
# NOTE: this path used to be bound to the name `all`, which shadowed Python's builtin
# all() for every later cell; it is now `gps_all`.
gps_all = base + 'Data/Raw/GPS5.0/gps_all.txt'
low = base + 'Data/Raw/GPS5.0/gps_low.txt'
med = base + 'Data/Raw/GPS5.0/gps_med.txt'
hi = base + 'Data/Raw/GPS5.0/gps_hi.txt'

# Kinase map
KinaseMap = base + 'Data/Map/globalKinaseMap.csv'

# ---- Output locations ----
# Directories for the predictions filtered by the different thresholds.
# Each ends with '/' because the filter functions append the file name directly.
pp_dir = base + 'Data/comparison/Thresh/PhosphoPICK/' + d_type + '/'
nw_dir = base + 'Data/comparison/Thresh/NetworKIN/' + d_type + '/'
gps_dir = base + 'Data/comparison/Thresh/GPS5.0/' + d_type + '/'

# GPS kinases together with their low/medium/high cutoff scores
GPS_kinase_threshold = base + 'Data/Raw/GPS5.0/gps_valid_kinases_threshold.csv'


All analyses are done at low, medium, and high thresholds, with separate GPS5.0 cutoffs for Y kinases and S/T kinases:

| Predictor    | Score Type          | Low | Med  | High |
|--------------|---------------------|-----|------|------|
| NetworKIN    | Likelihood Ratio    | 0.3 | 0.5  | 1    |
| PhosphoPICK  | P-value             | 0.1 | 0.06 | 0.02 |
| GPS5.0 (Y)   | False Positive Rate | 10% | 6%   | 2%   |
| GPS5.0 (S/T) | False Positive Rate | 15% | 9%   | 4%   |

### Set and Filter by Thresholds

In [71]:
def setPhosphopickThreshold (filename, threshold, output_dir=None):
    """
    Filter PhosphoPICK predictions by a p-value threshold and write the result to CSV.

    Every column from the sixth onward is treated as a score column. Scores that are
    numerically greater than `threshold` are replaced with '-'. Entries that cannot be
    parsed as numbers (for example an existing '-' or a missing value) are left as is.
    The first five columns are written back unchanged.

    Parameters
    ----------
    filename : str
        Path of the PhosphoPICK prediction CSV ('PhosphoPICK' in the input file cell).
    threshold : float or str
        P-value cutoff, e.g. 0.1, 0.06, 0.02. Its string form is used in the output
        file name: 'PhosphoPICK_<threshold>.csv'.
    output_dir : str, optional
        Directory prefix for the output file; it is concatenated with the file name,
        so it must end with a path separator. Defaults to the notebook-level `pp_dir`.
    """
    df = pd.read_csv(filename)
    if output_dir is None:
        output_dir = pp_dir
    output_file = output_dir + 'PhosphoPICK_' + str(threshold) + '.csv'
    # Compare as numbers: comparing against str(threshold) is lexicographic on text
    # columns (e.g. '1e-05' > '0.1') and raises TypeError on numeric columns.
    cutoff = float(threshold)
    for col in list(df.columns[5:]):
        scores = pd.to_numeric(df[col], errors='coerce')
        # PhosphoPICK scores are p-values, so values above the cutoff are removed.
        # Cast to object first so the '-' placeholder can sit next to numeric values.
        df[col] = df[col].astype(object).mask(scores > cutoff, '-')
    df.to_csv(output_file, index=False)

In [72]:
def setNetworkinThreshold (filename, threshold, output_dir=None):
    """
    Filter NetworKIN predictions by a score threshold and write the result to CSV.

    Every column from the sixth onward is treated as a score column. Scores that are
    numerically smaller than `threshold` are replaced with '-'. Entries that cannot be
    parsed as numbers (for example an existing '-' or a missing value) are left as is.
    The first five columns are written back unchanged.

    Parameters
    ----------
    filename : str
        Path of the NetworKIN prediction CSV ('NetworKIN' in the input file cell).
    threshold : float or str
        Minimum score to keep, e.g. 0.3, 0.5, 1. Its string form is used in the output
        file name: 'NetworKIN_<threshold>.csv'.
    output_dir : str, optional
        Directory prefix for the output file; it is concatenated with the file name,
        so it must end with a path separator. Defaults to the notebook-level `nw_dir`.
    """
    df = pd.read_csv(filename)
    if output_dir is None:
        output_dir = nw_dir
    output_file = output_dir + 'NetworKIN_' + str(threshold) + '.csv'
    # Compare as numbers: comparing against str(threshold) is lexicographic on text
    # columns (e.g. '2e-03' is not < '1') and raises TypeError on numeric columns.
    cutoff = float(threshold)
    for col in list(df.columns[5:]):
        scores = pd.to_numeric(df[col], errors='coerce')
        # Scores below the cutoff are removed by replacing them with '-'.
        # Cast to object first so the '-' placeholder can sit next to numeric values.
        df[col] = df[col].astype(object).mask(scores < cutoff, '-')
    df.to_csv(output_file, index=False)

In [73]:
def setGPSThreshold (filename, threshold, threshold_file=None, output_dir=None):
    """
    Filter GPS predictions by per-kinase cutoff scores and write the result to CSV.

    Every column from the sixth onward is treated as a score column named after a
    kinase. Its cutoff is looked up in the cutoff table by matching the column name
    against 'Preferred Name'. Scores numerically smaller than the cutoff are replaced
    with '-'. Entries that cannot be parsed as numbers are left as is. A 'BUB1' column,
    if present, is dropped before filtering.

    Parameters
    ----------
    filename : str
        Path of the GPS prediction CSV ('GPS' in the input file cell).
    threshold : str
        Name of the cutoff column to use: 'low', 'medium', or 'high'. Also used in the
        output file name: 'GPS_<threshold>.csv'.
    threshold_file : str, optional
        CSV holding a 'Preferred Name' column plus one column per threshold level.
        Defaults to the notebook-level `GPS_kinase_threshold`.
    output_dir : str, optional
        Directory prefix for the output file; it is concatenated with the file name,
        so it must end with a path separator. Defaults to the notebook-level `gps_dir`.

    Raises
    ------
    KeyError
        If `threshold` is not a column of the cutoff table, or a score column has no
        matching 'Preferred Name' entry.
    ValueError
        If the cutoff for a score column is missing (NaN).
    """
    if threshold_file is None:
        threshold_file = GPS_kinase_threshold
    if output_dir is None:
        output_dir = gps_dir
    df_gps_threshold = pd.read_csv(threshold_file)
    df = pd.read_csv(filename)
    # cutoff score for BUB1 is 0.0 for low, med, and high; tolerate inputs without it
    df = df.drop(columns=['BUB1'], errors='ignore')
    output_file = output_dir + 'GPS_' + str(threshold) + '.csv'
    # One cutoff per kinase name; the first entry wins when a name is repeated.
    cutoff_by_kinase = (
        df_gps_threshold
        .drop_duplicates(subset='Preferred Name')
        .set_index('Preferred Name')[threshold]
    )
    for col in list(df.columns[5:]):
        if col not in cutoff_by_kinase.index:
            raise KeyError(
                "no '" + str(threshold) + "' cutoff found for kinase column '" + str(col) + "'"
            )
        thresh = float(cutoff_by_kinase[col])
        if pd.isna(thresh):
            raise ValueError(
                "'" + str(threshold) + "' cutoff for kinase column '" + str(col) + "' is missing"
            )
        # Compare as numbers: comparing against str(thresh) is lexicographic on text
        # columns (e.g. '7.5' is not < '61.41') and raises TypeError on numeric columns.
        scores = pd.to_numeric(df[col], errors='coerce')
        # Scores below the cutoff are removed by replacing them with '-'.
        # Cast to object first so the '-' placeholder can sit next to numeric values.
        df[col] = df[col].astype(object).mask(scores < thresh, '-')
    df.to_csv(output_file, index=False)

**Prepare GPS cutoff score table**
- The actual cutoff scores for low, medium, and high thresholds were extracted from GPS5.0.

In [8]:
# GPS kinase table
df_kinase = pd.read_csv(GPS_kinase)

# GPS predictions of random human proteome sequence, run at the 'low', 'medium', and 'high'
# threshold settings. Only the kinase name and its 'Cutoff' (the actual cutoff score for
# that kinase at the given setting) are kept, with repeated rows collapsed.
df_low, df_med, df_hi = [
    pd.read_csv(cutoff_file, sep='\t', usecols=['Kinase', 'Cutoff']).drop_duplicates()
    for cutoff_file in (low, med, hi)
]

In [9]:
# Attach the low-threshold cutoff to each kinase: left-join df_low on the GPS predictor
# name, discard the redundant join key, and store the cutoff in a 'low' column.
df_kinase = (
    df_kinase
    .merge(df_low, how='left', left_on=['predictor'], right_on=['Kinase'])
    .drop(columns=['Kinase'])
    .rename(columns={'Cutoff': 'low'})
)
# List any kinases that have no cutoff score for the low threshold
df_kinase.loc[df_kinase['low'].isna()]

Unnamed: 0,predictor,kinase,kinase_acc,low


In [10]:
# Attach the medium-threshold cutoff to each kinase: left-join df_med on the GPS predictor
# name, discard the redundant join key, and store the cutoff in a 'medium' column.
df_kinase = (
    df_kinase
    .merge(df_med, how='left', left_on=['predictor'], right_on=['Kinase'])
    .drop(columns=['Kinase'])
    .rename(columns={'Cutoff': 'medium'})
)
# List any kinases that have no cutoff score for the medium threshold
df_kinase.loc[df_kinase['medium'].isna()]

Unnamed: 0,predictor,kinase,kinase_acc,low,medium
321,TK/VEGFR/KDR,KDR,P35968,61.41,


In [11]:
# For kinases with no medium-threshold cutoff, a different sequence was submitted to GPS
# and the resulting cutoff scores are entered manually here (kinase name -> cutoff).
thresholds_med = {'KDR': 66.305}

# Write each manual cutoff into the 'medium' column of the matching kinase row(s).
for kinase_name, cutoff in thresholds_med.items():
    df_kinase.loc[df_kinase['kinase'] == kinase_name, ['medium']] = cutoff

In [12]:
# Attach the high-threshold cutoff to each kinase: left-join df_hi on the GPS predictor
# name, discard the redundant join key, and store the cutoff in a 'high' column.
df_kinase = (
    df_kinase
    .merge(df_hi, how='left', left_on=['predictor'], right_on=['Kinase'])
    .drop(columns=['Kinase'])
    .rename(columns={'Cutoff': 'high'})
)
# List any kinases that have no cutoff score for the high threshold
df_kinase.loc[df_kinase['high'].isna()]

Unnamed: 0,predictor,kinase,kinase_acc,low,medium,high
98,CMGC/CDK/CDC2/CDK1,CDK1,P06493,3.158,3.707,
163,STE/STE20/TAO/TAOK1,TAOK1,Q7L7X3,1.249,1.995,
186,TKL/RAF/RAF/ARAF,ARAF,P10398,2.916,3.24,
189,TKL/RIPK/RIPK2,RIPK2,O43353,3.253,3.617,
190,TKL/RIPK/RIPK3,RIPK3,Q9Y572,23.118,25.588,
231,Other/NEK/NEK6/NEK6,NEK6,Q9HC98,73.015,80.955,
252,Other/WNK/WNK1,WNK1,Q9H4A3,0.0,0.0,
262,TK/Axl/AXL,AXL,P30530,0.002,0.003,
266,TK/DDR/DDR2,DDR2,Q16832,1.127,1.342,
272,TK/Eph/EPHA4,EPHA4,P54764,1.226,1.511,


In [13]:
# For kinases with no high-threshold cutoff, a different sequence was submitted to GPS
# and the resulting cutoff scores are entered manually here (kinase name -> cutoff).
thresholds_hi = {
    'CDK1': 4.755,
    'TAOK1': 3.53,
    'ARAF': 3.686,
    'RIPK2': 4.257,
    'RIPK3': 30.741,
    'NEK6': 93.012,
    'WNK1': 0.001,
    'AXL': 0.004,
    'DDR2': 1.635,
    'EPHA4': 1.918,
    'EPHA8': 2.005,
    'EPHB2': 0.554,
    'TYK2': 28.458,
    'KDR': 72.997,
}

# Write each manual cutoff into the 'high' column of the matching kinase row(s).
for kinase_name, cutoff in thresholds_hi.items():
    df_kinase.loc[df_kinase['kinase'] == kinase_name, ['high']] = cutoff

In [11]:
# BUB1 is excluded: its cutoff score is 0.0 at the low, medium, and high thresholds.
is_bub1 = df_kinase['kinase'] == 'BUB1'
df_kinase = df_kinase[~is_bub1]

In [51]:
# Look up each kinase's preferred name in the kinase map, matching on the UniProt accession.
kinaseMap = pd.read_csv(KinaseMap, usecols=['Preferred Name', 'UniprotID'])
df_kinase = (
    df_kinase
    .merge(kinaseMap, how='left', left_on=['kinase_acc'], right_on=['UniprotID'])
    .drop(columns=['UniprotID'])
)
# Save the cutoff table; setGPSThreshold reads its cutoffs from this file.
df_kinase.to_csv(GPS_kinase_threshold, index=False)
df_kinase

Unnamed: 0,predictor,kinase,kinase_acc,low,medium,high,Preferred Name
0,AGC/Akt/AKT1,AKT1,P31749,7.780,9.526,11.650,AKT1
1,AGC/Akt/AKT2,AKT2,P31751,19.022,20.896,24.834,AKT2
2,AGC/Akt/AKT3,AKT3,Q9Y243,2.840,3.297,4.201,AKT3
3,AGC/DMPK/CRIK/CIT,CIT,O14578,0.226,0.463,0.919,CIT
4,AGC/DMPK/GEK/DMPK,DMPK,Q09013,2.483,2.819,3.509,DMPK
...,...,...,...,...,...,...,...
320,TK/VEGFR/KDR,VEGFR2,P35968,61.410,66.305,72.997,VEGFR2
321,Dual/Atypical/BAZ/BAZ1B,BAZ1B,Q9UIG0,-1.790,-1.700,-1.560,BAZ1B
322,Dual/CMGC/CK2/CSNK2A1,CSNK2A1,P68400,1.401,1.733,2.167,CSNK2A1
323,Dual/Other/WEE/WEE1/WEE1B,WEE1B,P0C1S8,0.521,1.116,2.072,WEE2


In [87]:
# Write one filtered prediction file per predictor and threshold level.

# PhosphoPICK p-value cutoffs, in order: low, medium, high
for p_value in (0.1, 0.06, 0.02):
    setPhosphopickThreshold(PhosphoPICK, p_value)

# NetworKIN likelihood-ratio cutoffs, in order: low, medium, high
for likelihood_ratio in (0.3, 0.5, 1):
    setNetworkinThreshold(NetworKIN, likelihood_ratio)

# GPS levels; the actual per-kinase scores come from GPS_kinase_threshold
for level in ('low', 'medium', 'high'):
    setGPSThreshold(GPS, level)