# Update the Human Proteome Reference File and KinPred Final Data

This notebook shows the steps to upgrade the reference human proteome from the 2019-12-12 version to the 2020-02-26 version, and to update the KinPred final data with the new reference human proteome.

In [2]:
# IMPORTS
import pandas as pd
import os
import sys
sys.path.append('../PreprocessingPredictionData/')
import humanProteomesReference

In [2]:
##################
# File Location  #
##################
# Root of the repository, relative to this notebook (../../)
base = '../../'

#####################
# Defining File Dir #
#####################
# Directory of the Human Proteome fasta files
HP_fasta = f'{base}Data/Raw/HumanProteome/'
# Directory of the human proteome reference csv files
HP_csv = f'{base}Data/Map/'

#########################################################
# Defining the file names for the current (old) version #
#########################################################
# Old version date
# NOTE(review): the notebook introduction mentions 2019-12-12 -- confirm which
# date matches the reference files on disk.
old_version = '2019-12-11'
old_HP_fasta = f'{HP_fasta}humanProteome_{old_version}.fasta'
old_HP_csv = f'{HP_csv}humanProteome_{old_version}.csv'

#########################################################
# Defining the file names for the updated (new) version #
#########################################################
# New version date: uniprot.org (date of last sequence modification)
new_version = '2020-02-26'
new_HP_fasta = f'{HP_fasta}humanProteome_{new_version}.fasta'
new_HP_csv = f'{HP_csv}humanProteome_{new_version}.csv'
# fasta file that will hold only the proteins whose sequence changed
seq_mod_fasta = f'{HP_fasta}UpdatedSeq_{new_version}.fasta'

### Download the updated (new) version of Human Proteome fasta file

In [3]:
# Download the updated (canonical) human proteome from UniProt.org and save it
# in fasta format at the given path; the file name carries the last sequence
# modification date.

humanProteomesReference.downloadHumanProteomes(new_HP_fasta)

In [4]:
# Convert the downloaded fasta file into a dataframe and save it in csv format
# at the given path; the file name carries the last sequence modification date.

humanProteomesReference.fastaToCSV(new_HP_fasta, new_HP_csv)

Unnamed: 0,UniprotID,Gene Name,Entry Name,sequence
0,Q9NQ39,RPS10P5,RS10L_HUMAN,MLMPKKNRIAIHELLFKEGVMVAKKDVHMPKHPELADKNVPNLHVM...
1,Q99470,SDF2,SDF2_HUMAN,MAVVPLLLLGGLWSAVGASSLGVVTCGSVVKLLNTRHNVRLHSHDV...
2,Q8N8B7,TCEANC,TEANC_HUMAN,MSDKNQIAARASLIEQLMSKRNFEDLGNHLTELETIYVTKEHLQET...
3,Q658L1,SAXO2,SAXO2_HUMAN,MGAKSMRSWCLCQICSCGSDYCPYEIVKQPRHVPEEYKPKQGKIDL...
4,O75494,SRSF10,SRS10_HUMAN,MSRYLRPPNTSLFVRNVADDTRSEDLRREFGRYGPIVDVYVPLDFY...
...,...,...,...,...
20360,P53420,COL4A4,CO4A4_HUMAN,MWSLHIVLMRCSFRLTKSLATGPWSLILILFSVQYVYGSGKKYIGP...
20361,Q8TD26,CHD6,CHD6_HUMAN,MKMKIQKKEKQLSNLKVLNHSPMSDASVNFDYKSPSPFDCSTDQEE...
20362,Q9UK99,FBXO3,FBX3_HUMAN,MAAMETETAPLTLESLPTDPLLLILSFLDYRDLINCCYVSRRLSQL...
20363,Q9H6U6,BCAS3,BCAS3_HUMAN,MNEAMATDSPRRPSRCTGGVVVRPQAVTEQSYMESVVTFLQDVVPQ...


### Compare the Current (old) version and the Updated (new) version

- get a list of UniprotIDs from the current (old) human proteome reference file that become obsolete/secondary in the updated (new) human proteome reference file
- get a list of UniprotIDs from the updated (new) human proteome reference file that have a different sequence in the current (old) human proteome reference file
    - predictions with those UniprotIDs (substrate_acc) will be removed from the current prediction data files

In [3]:
# Load only the accession and sequence columns of both reference files
ref_cols = ['UniprotID', 'sequence']
df_old = pd.read_csv(old_HP_csv, sep='\t', usecols=ref_cols)
df_new = pd.read_csv(new_HP_csv, sep='\t', usecols=ref_cols)

# Entries shared by accession only, and entries shared by accession AND sequence
common_acc = pd.merge(df_old, df_new, on=['UniprotID'])
common_seq = pd.merge(df_old, df_new, on=['UniprotID', 'sequence'])

# Old accessions that no longer appear in the new reference
old_id = df_old.loc[~df_old.UniprotID.isin(common_acc.UniprotID), ['UniprotID']]

# New entries whose accession or sequence is absent from the exact
# (accession, sequence) matches between the two references
unmatched_acc = ~df_new.UniprotID.isin(common_seq.UniprotID)
unmatched_seq = ~df_new.sequence.isin(common_seq.sequence)
new_seq = df_new.loc[unmatched_acc | unmatched_seq, ['UniprotID']]

print ('Outdated Protein UniprotIDs: \n', old_id, '\n')
print ('Protein UniprotIDs with Sequence Modifacation:\n', new_seq)

Outdated Protein UniprotIDs: 
       UniprotID
1103     Q9UGB4
16384    Q5JT78 

Protein UniprotIDs with Sequence Modifacation:
       UniprotID
1873     Q96LP2
8355     Q6DHV5
10813    Q9UN66
11600    P19013
17838    P0C617
19253    Q8N4F4
19605    O75123
20283    A6QL64


- download the fasta file of the Protein UniprotIDs with Sequence Modification

In [14]:
# Download the fasta records of the sequence-modified UniprotIDs (new_seq)
# into seq_mod_fasta, to be used as input when re-running the predictors.
humanProteomesReference.downloadFasta (new_seq, seq_mod_fasta)

### Re-run predictions of the Protein UniprotIDs with Sequence Modification in each predictor
Please see the `Get Results` section in 
[FormattingPhosphoPICK.ipynb](https://github.com/NaegleLab/KinPred/blob/master/Code/PreprocessingPredictionData/FormattingPhosphoPICK.ipynb), [FormattingGPS.ipynb](https://github.com/NaegleLab/KinPred/blob/master/Code/PreprocessingPredictionData/FormattingGPS.ipynb), and 
[FormattingNetworKIN.ipynb](https://github.com/NaegleLab/KinPred/blob/master/Code/PreprocessingPredictionData/FormattingNetworKIN.ipynb)
for instructions on how to run predictions with each predictor

### Update the Prediction Data

In [1]:
# IMPORTS
sys.path.append('../PreprocessingPredictionData/')
import gps_convert, phosphoPick_convert, networKin_convert

In [4]:
#####################
# Defining File Dir #
#####################

# Resource Files
# global substrate map (read later for its 'Gene Name' / 'UniprotID' columns)
SubstrateMap = f'{base}Data/Map/globalSubstrateMap.csv'
# global kinase map (read later for its 'Kinase Name' / 'UniprotID' columns)
KinaseMap = f'{base}Data/Map/globalKinaseMap.csv'

# GPS
# current (old) prediction data file
gps_old = f'{base}Data/Formatted/GPS/GPS_formatted_{old_version}.csv'
# updated (new) prediction data file
gps_new = f'{base}Data/Formatted/GPS/GPS_formatted_{new_version}.csv'
# manually prepared GPS valid kinase table
gps_kinase = f'{base}Data/Raw/GPS/gps_valid_kinases.csv'
# dir for the predictions of the updated sequences
gps_update_dir = f'{base}Data/Raw/GPS/updated/updated/'
# temp dirs for processing the predictions of the updated sequences
gps_temp_dir_acc_update = f'{base}Data/Temp/GPS/mappedAcc/updated/updated/'
gps_temp_dir_site_update = f'{base}Data/Temp/GPS/mappedSite/updated/updated/'

# PhosphoPICK
# current (old) prediction data file
pick_old = f'{base}Data/Formatted/PhosphoPICK/PhosphoPICK_formatted_{old_version}.csv'
# updated (new) prediction data file
pick_new = f'{base}Data/Formatted/PhosphoPICK/PhosphoPICK_formatted_{new_version}.csv'
# dir for the predictions of the updated sequences
pick_update_dir = f'{base}Data/Raw/PhosphoPICK/updated/updated/'
# temp dirs for processing the predictions of the updated sequences
pick_temp_dir_acc_update = f'{base}Data/Temp/PhosphoPICK/mappedAcc/updated/updated/'
pick_temp_dir_site_update = f'{base}Data/Temp/PhosphoPICK/mappedSite/updated/updated/'

# NetworKIN
# current (old) prediction data file
kin_old = f'{base}Data/Formatted/NetworKIN/NetworKIN_formatted_{old_version}.csv'
# updated (new) prediction data file
kin_new = f'{base}Data/Formatted/NetworKIN/NetworKIN_formatted_{new_version}.csv'
# dir for the predictions of the updated sequences
kin_update_dir = f'{base}Data/Raw/NetworKIN/updated/updated/'
# temp dirs for processing the predictions of the updated sequences
kin_temp_dir_acc_update = f'{base}Data/Temp/NetworKIN/mappedAcc/updated/updated/'
kin_temp_dir_site_update = f'{base}Data/Temp/NetworKIN/mappedSite/updated/updated/'

**Remove outdated data** of the above outdated Protein UniprotIDs and Protein UniprotIDs with Sequence Modification from the prediction data of each predictor

In [6]:
# Stack the obsolete accessions on top of the sequence-modified ones,
# renumbering the rows from 0
rm_id = pd.concat([old_id, new_seq], ignore_index=True)
rm_id

Unnamed: 0,UniprotID
0,Q9UGB4
1,Q5JT78
2,Q96LP2
3,Q6DHV5
4,Q9UN66
5,P19013
6,P0C617
7,Q8N4F4
8,O75123
9,A6QL64


In [7]:
def rmOutdated (predictor_old, predictor_new, rm_id_df):
    """Copy a prediction file, leaving out the rows of outdated substrates.

    The old file is streamed in chunks so it never has to fit in memory. Rows
    whose ``substrate_acc`` appears in ``rm_id_df.UniprotID`` are dropped and
    the remaining rows are written to ``predictor_new``.

    The first chunk overwrites ``predictor_new`` and writes the header; later
    chunks are appended without a header. Re-running the function therefore
    regenerates the file instead of appending a second, header-less copy of
    the data to the output of a previous run.

    Parameters
    ----------
    predictor_old : str
        Path of the current (old) comma-separated prediction file; it must
        contain a ``substrate_acc`` column.
    predictor_new : str
        Path of the updated (new) prediction file to (re)create.
    rm_id_df : pandas.DataFrame
        Frame with a ``UniprotID`` column listing the substrates to remove.
    """
    # the set of ids does not change between chunks, so build it once
    outdated_ids = set(rm_id_df.UniprotID)

    first_chunk = True
    for chunk in pd.read_csv(predictor_old, chunksize = 1000000):
        kept = chunk[~chunk.substrate_acc.isin(outdated_ids)]
        kept.to_csv(predictor_new,
                    mode='w' if first_chunk else 'a',
                    index=False,
                    sep=',',
                    header=first_chunk)
        first_chunk = False


- **GPS**

In [18]:
# Write the GPS predictions to the new-version file without the substrates listed in rm_id
rmOutdated(gps_old, gps_new, rm_id)

- **PhosphoPICK**

In [10]:
# Write the PhosphoPICK predictions to the new-version file without the substrates listed in rm_id
rmOutdated(pick_old, pick_new, rm_id)

- **NetworKIN**

In [25]:
# Write the NetworKIN predictions to the new-version file without the substrates listed in rm_id
rmOutdated(kin_old, kin_new, rm_id)

**Process the re-run predictions** of each predictor

In [19]:
# Gene Name (substrate) lookup table, taken from globalSubstrateMap.csv
substrate_map_cols = ['Gene Name', 'UniprotID']
df_unique_sub = pd.read_csv(SubstrateMap, usecols=substrate_map_cols)

# Kinase Name lookup table, taken from globalKinaseMap.csv
kinase_map_cols = ['Kinase Name', 'UniprotID']
df_unique_kin = pd.read_csv(KinaseMap, usecols=kinase_map_cols)

In [20]:
def addNameCol (perdictor_df):
    """Add the 'Gene Name' and 'Kinase Name' columns to a prediction frame.

    The frame is left-merged first with the module-level ``df_unique_sub``
    (matching ``substrate_acc`` to ``UniprotID``) and then with
    ``df_unique_kin`` (matching ``kinase_acc`` to ``UniprotID``). After each
    merge the ``UniprotID`` key column brought in by the lookup table is
    dropped again. Because the merges are left joins, rows without a match
    are kept and get a missing name.

    Parameters
    ----------
    perdictor_df : pandas.DataFrame
        Predictions with ``substrate_acc`` and ``kinase_acc`` columns.

    Returns
    -------
    pandas.DataFrame
        The merged frame with the two name columns appended.
    """
    # (lookup table, accession column of the prediction frame), in merge order
    name_sources = (
        (df_unique_sub, 'substrate_acc'),
        (df_unique_kin, 'kinase_acc'),
    )
    for name_map, acc_col in name_sources:
        merged = perdictor_df.merge(name_map, how = 'left',
                                    left_on=[acc_col], right_on=['UniprotID'])
        # the lookup key duplicates the accession column, so drop it
        perdictor_df = merged.drop(columns = 'UniprotID')

    return perdictor_df

In [None]:
# removing unmatched kinase type and phosphosite type

def rm_unmatched_kinase_type(df):
    """Keep only predictions whose site residue fits the kinase group.

    Rows are kept when
      * 'Kinase Name' is listed in ``y_kin`` and 'site' contains 'Y', or
      * 'Kinase Name' is listed in ``st_kin`` and 'site' contains 'S' or 'T', or
      * 'Kinase Name' is listed in ``dual_kin`` (no site filter).
    The three subsets are concatenated in that order.

    NOTE(review): ``y_kin``, ``st_kin`` and ``dual_kin`` are module-level names
    that are not defined in any visible cell of this notebook; presumably they
    are kinase tables with a 'Kinase Name' column -- confirm, and define/load
    them before this function is called.

    NOTE(review): ``reset_index()`` is called without ``drop=True``, so the
    previous row index is kept as a new leading 'index' column in the result.
    Confirm this extra column is wanted by callers that write the frame out
    without a header.
    """
    def _rows_for(kinase_table, site_pattern=None):
        # predictions made for kinases of this group ...
        rows = df[df['Kinase Name'].isin(kinase_table['Kinase Name'])]
        # ... optionally restricted to sites matching the residue pattern
        if site_pattern is not None:
            rows = rows[rows['site'].str.contains(site_pattern)]
        return rows

    subsets = [
        _rows_for(y_kin, 'Y'),
        _rows_for(st_kin, 'S|T'),
        _rows_for(dual_kin),
    ]
    return pd.concat(subsets).reset_index()

- **GPS**

In [21]:
# Step 1 ('acc'): convert substrate_acc and kinase_acc of the raw GPS predictions
# for the updated sequences, using the manually prepared valid-kinase table;
# results go to the mappedAcc temp dir.
convert_type = 'acc'
gps_convert.gps_convert_directory(gps_update_dir, gps_kinase, gps_temp_dir_acc_update, convert_type)
# Step 2 ('site'): map the sites to the updated (new) human proteome reference;
# reads the output of step 1 and writes to the mappedSite temp dir.
convert_type = 'site'
gps_convert.gps_convert_directory(gps_temp_dir_acc_update, new_HP_csv, gps_temp_dir_site_update, convert_type)


Formatting  UpdatedSeq_2020-02-26 ...
Done. Time	2.777
Formatting  UpdatedSeq_2020-02-26.csv ...
Reading input file...
Get unique substrate sites...
Map unique substrate sites...
Done. Time	3.185


In [22]:
# Load the site-mapped GPS predictions of the updated sequences and display them
gps_mapped_csv = f'{gps_temp_dir_site_update}UpdatedSeq_{new_version}_mappedSite.csv'
df_gps_update = pd.read_csv(gps_mapped_csv)
df_gps_update

Unnamed: 0,substrate_acc,site,residue,position,predictor,kinase,kinase_acc,pep,score,threshold,mapped site,substrate_id
0,Q96LP2,T8,T,8,AGC/Akt/AKT1,AKT1,P31749,MQLQFLGTLASSEKR,7.707,,T8,Q96LP2_8
1,Q96LP2,S11,S,11,AGC/Akt/AKT1,AKT1,P31749,QFLGTLASSEKRKKS,6.608,,S11,Q96LP2_11
2,Q96LP2,S12,S,12,AGC/Akt/AKT1,AKT1,P31749,FLGTLASSEKRKKSQ,7.025,,S12,Q96LP2_12
3,Q96LP2,S18,S,18,AGC/Akt/AKT1,AKT1,P31749,SSEKRKKSQRLFFKN,14.455,,S18,Q96LP2_18
4,Q96LP2,S28,S,28,AGC/Akt/AKT1,AKT1,P31749,LFFKNIKSTKNKAGK,4.902,,S28,Q96LP2_28
...,...,...,...,...,...,...,...,...,...,...,...,...
258853,A6QL64,Y1670,Y,1670,Dual/Other/WEE/WEE1/WEE1,WEE1,P30291,VLKTELHYTGEALKE,0.444,,Y1670,A6QL64_1670
258854,A6QL64,Y1703,Y,1703,Dual/Other/WEE/WEE1/WEE1,WEE1,P30291,MKDIEKMYKSGYNTM,0.877,,Y1703,A6QL64_1703
258855,A6QL64,Y1707,Y,1707,Dual/Other/WEE/WEE1/WEE1,WEE1,P30291,EKMYKSGYNTMEKCI,1.158,,Y1707,A6QL64_1707
258856,A6QL64,Y1794,Y,1794,Dual/Other/WEE/WEE1/WEE1,WEE1,P30291,SKEKECQYEKEKAER,0.101,,Y1794,A6QL64_1794


In [23]:
# Drop predictions for entries that are not kinases (list from FormattingGPS.ipynb)
not_kinase = ['PDK2', 'PDK3', 'PDK4', 'MSN', 'GTF2F1', 'MPS1']
df_gps_update = df_gps_update.loc[~df_gps_update['kinase'].isin(not_kinase)]
df_gps_update

Unnamed: 0,substrate_acc,site,residue,position,predictor,kinase,kinase_acc,pep,score,threshold,mapped site,substrate_id
0,Q96LP2,T8,T,8,AGC/Akt/AKT1,AKT1,P31749,MQLQFLGTLASSEKR,7.707,,T8,Q96LP2_8
1,Q96LP2,S11,S,11,AGC/Akt/AKT1,AKT1,P31749,QFLGTLASSEKRKKS,6.608,,S11,Q96LP2_11
2,Q96LP2,S12,S,12,AGC/Akt/AKT1,AKT1,P31749,FLGTLASSEKRKKSQ,7.025,,S12,Q96LP2_12
3,Q96LP2,S18,S,18,AGC/Akt/AKT1,AKT1,P31749,SSEKRKKSQRLFFKN,14.455,,S18,Q96LP2_18
4,Q96LP2,S28,S,28,AGC/Akt/AKT1,AKT1,P31749,LFFKNIKSTKNKAGK,4.902,,S28,Q96LP2_28
...,...,...,...,...,...,...,...,...,...,...,...,...
258853,A6QL64,Y1670,Y,1670,Dual/Other/WEE/WEE1/WEE1,WEE1,P30291,VLKTELHYTGEALKE,0.444,,Y1670,A6QL64_1670
258854,A6QL64,Y1703,Y,1703,Dual/Other/WEE/WEE1/WEE1,WEE1,P30291,MKDIEKMYKSGYNTM,0.877,,Y1703,A6QL64_1703
258855,A6QL64,Y1707,Y,1707,Dual/Other/WEE/WEE1/WEE1,WEE1,P30291,EKMYKSGYNTMEKCI,1.158,,Y1707,A6QL64_1707
258856,A6QL64,Y1794,Y,1794,Dual/Other/WEE/WEE1/WEE1,WEE1,P30291,SKEKECQYEKEKAER,0.101,,Y1794,A6QL64_1794


In [24]:
# Attach the Gene Name (substrate) and Kinase Name columns
df_gps_update = addNameCol(df_gps_update)

# Keep only the final columns, then rename them to the formatted-file names
# ('mapped site' replaces the original 'site' column)
gps_final_cols = ['substrate_id','substrate_acc','Gene Name','mapped site','pep', 'score', 'Kinase Name']
df_gps_update = df_gps_update[gps_final_cols].rename(
    columns={'mapped site' : 'site', 'Gene Name' : 'substrate_name'}
)
df_gps_update

Unnamed: 0,substrate_id,substrate_acc,substrate_name,site,pep,score,Kinase Name
0,Q96LP2_8,Q96LP2,FAM81B,T8,MQLQFLGTLASSEKR,7.707,AKT1
1,Q96LP2_11,Q96LP2,FAM81B,S11,QFLGTLASSEKRKKS,6.608,AKT1
2,Q96LP2_12,Q96LP2,FAM81B,S12,FLGTLASSEKRKKSQ,7.025,AKT1
3,Q96LP2_18,Q96LP2,FAM81B,S18,SSEKRKKSQRLFFKN,14.455,AKT1
4,Q96LP2_28,Q96LP2,FAM81B,S28,LFFKNIKSTKNKAGK,4.902,AKT1
...,...,...,...,...,...,...,...
253075,A6QL64_1670,A6QL64,ANKRD36,Y1670,VLKTELHYTGEALKE,0.444,WEE1
253076,A6QL64_1703,A6QL64,ANKRD36,Y1703,MKDIEKMYKSGYNTM,0.877,WEE1
253077,A6QL64_1707,A6QL64,ANKRD36,Y1707,EKMYKSGYNTMEKCI,1.158,WEE1
253078,A6QL64_1794,A6QL64,ANKRD36,Y1794,SKEKECQYEKEKAER,0.101,WEE1


- **PhosphoPICK**

In [17]:
# Step 1 ('acc'): convert substrate_acc and kinase_acc of the raw PhosphoPICK
# predictions for the updated sequences ('na' is passed in place of a resource
# file); results go to the mappedAcc temp dir.
convert_type = 'acc'
phosphoPick_convert.pick_convert_directory(pick_update_dir, 'na', pick_temp_dir_acc_update, convert_type)
# Step 2 ('site'): map the sites to the updated (new) human proteome reference;
# reads the output of step 1 and writes to the mappedSite temp dir.
convert_type = 'site'
phosphoPick_convert.pick_convert_directory(pick_temp_dir_acc_update, new_HP_csv, pick_temp_dir_site_update, convert_type)


Formatting  UpdatedSeq_2020-02-26.txt ...
getting unique sub
getting sub_acc
merge
done 0.7367167472839355
getting unique kin
getting kin_acc
RPSK6A5 no hit in human
MAP3KB no hit in human
merge
done 143.17519211769104
Done. Time	144.134
Formatting  UpdatedSeq_2020-02-26 ...
Reading input file...
Get unique substrate sites...
Done. Time	0.422


In [5]:
# Load the site-mapped PhosphoPICK predictions of the updated sequences and display them
pick_mapped_csv = f'{pick_temp_dir_site_update}UpdatedSeq_{new_version}_mappedSite.csv'
df_pick_update = pd.read_csv(pick_mapped_csv)
df_pick_update

Unnamed: 0,Uniprot-Acc,blastp-identity,kinase,position,combined-p-value,substrate_acc,kinase_acc,site,pep,substrate_id
0,Q6DHV5,100.0,CDK1,409,0.035717,Q6DHV5,P06493,T409,HGQGFTSTPIKLQVQ,Q6DHV5_409
1,Q6DHV5,100.0,CDK1,951,0.041889,Q6DHV5,P06493,S951,EIKVDFVSPGHDYSF,Q6DHV5_951
2,Q6DHV5,100.0,CDK1,82,0.096195,Q6DHV5,P06493,S82,IHQRSKLSPQTEVSL,Q6DHV5_82
3,Q6DHV5,100.0,CDK1,177,0.106733,Q6DHV5,P06493,S177,NIFVPSSSPVVNQRK,Q6DHV5_177
4,Q6DHV5,100.0,CDK1,240,0.139274,Q6DHV5,P06493,T240,GEIMSLPTPIKQSWN,Q6DHV5_240
...,...,...,...,...,...,...,...,...,...,...
17995,Q6DHV5,100.0,PRKDC,1105,0.802028,Q6DHV5,P78527,T1105,FPNRRIVTTVFNDEG,Q6DHV5_1105
17996,Q6DHV5,100.0,PRKDC,978,0.802709,Q6DHV5,P78527,T978,NIFDEMMTEKHEDHC,Q6DHV5_978
17997,Q6DHV5,100.0,PRKDC,57,0.803943,Q6DHV5,P78527,S57,VREKLKISKINKGEK,Q6DHV5_57
17998,Q6DHV5,100.0,PRKDC,263,0.805551,Q6DHV5,P78527,T263,PLNPLLKTIYRKAVK,Q6DHV5_263


In [19]:
# Kinases the 'acc' conversion reported as "no hit in human":
#   - 'RPSK6A5': fill in its UniprotID by hand
#   - 'MAP3KB' : no record found in human, so its predictions are removed
id_dict = {'RPSK6A5':'O75582'}

for key, acc in id_dict.items():
    df_pick_update.loc[df_pick_update.kinase == key, ["kinase_acc"]] = acc

# whatever still carries the '(no hit in human)' placeholder is dropped
has_human_hit = df_pick_update['kinase_acc'].ne('(no hit in human)')
df_pick_update = df_pick_update[has_human_hit]
df_pick_update

Unnamed: 0,Uniprot-Acc,blastp-identity,kinase,position,combined-p-value,substrate_acc,kinase_acc,site,pep,substrate_id
0,Q6DHV5,100.0,CDK1,409,0.035717,Q6DHV5,P06493,T409,HGQGFTSTPIKLQVQ,Q6DHV5_409
1,Q6DHV5,100.0,CDK1,951,0.041889,Q6DHV5,P06493,S951,EIKVDFVSPGHDYSF,Q6DHV5_951
2,Q6DHV5,100.0,CDK1,82,0.096195,Q6DHV5,P06493,S82,IHQRSKLSPQTEVSL,Q6DHV5_82
3,Q6DHV5,100.0,CDK1,177,0.106733,Q6DHV5,P06493,S177,NIFVPSSSPVVNQRK,Q6DHV5_177
4,Q6DHV5,100.0,CDK1,240,0.139274,Q6DHV5,P06493,T240,GEIMSLPTPIKQSWN,Q6DHV5_240
...,...,...,...,...,...,...,...,...,...,...
17995,Q6DHV5,100.0,PRKDC,1105,0.802028,Q6DHV5,P78527,T1105,FPNRRIVTTVFNDEG,Q6DHV5_1105
17996,Q6DHV5,100.0,PRKDC,978,0.802709,Q6DHV5,P78527,T978,NIFDEMMTEKHEDHC,Q6DHV5_978
17997,Q6DHV5,100.0,PRKDC,57,0.803943,Q6DHV5,P78527,S57,VREKLKISKINKGEK,Q6DHV5_57
17998,Q6DHV5,100.0,PRKDC,263,0.805551,Q6DHV5,P78527,T263,PLNPLLKTIYRKAVK,Q6DHV5_263


In [20]:
# Attach the Gene Name (substrate) and Kinase Name columns
df_pick_update = addNameCol(df_pick_update)

# Keep only the final columns, then rename them to the formatted-file names
pick_final_cols = ['substrate_id','substrate_acc','Gene Name','site','pep', 'combined-p-value', 'Kinase Name']
df_pick_update = df_pick_update[pick_final_cols].rename(
    columns={'combined-p-value' : 'score', 'Gene Name' : 'substrate_name'}
)
df_pick_update

Unnamed: 0,substrate_id,substrate_acc,substrate_name,site,pep,score,Kinase Name
0,Q6DHV5_409,Q6DHV5,CC2D2B,T409,HGQGFTSTPIKLQVQ,0.035717,CDK1
1,Q6DHV5_951,Q6DHV5,CC2D2B,S951,EIKVDFVSPGHDYSF,0.041889,CDK1
2,Q6DHV5_82,Q6DHV5,CC2D2B,S82,IHQRSKLSPQTEVSL,0.096195,CDK1
3,Q6DHV5_177,Q6DHV5,CC2D2B,S177,NIFVPSSSPVVNQRK,0.106733,CDK1
4,Q6DHV5_240,Q6DHV5,CC2D2B,T240,GEIMSLPTPIKQSWN,0.139274,CDK1
...,...,...,...,...,...,...,...
17803,Q6DHV5_1105,Q6DHV5,CC2D2B,T1105,FPNRRIVTTVFNDEG,0.802028,PRKDC
17804,Q6DHV5_978,Q6DHV5,CC2D2B,T978,NIFDEMMTEKHEDHC,0.802709,PRKDC
17805,Q6DHV5_57,Q6DHV5,CC2D2B,S57,VREKLKISKINKGEK,0.803943,PRKDC
17806,Q6DHV5_263,Q6DHV5,CC2D2B,T263,PLNPLLKTIYRKAVK,0.805551,PRKDC


- **NetworKIN**

In [7]:
# Step 1 ('acc'): convert substrate_acc and kinase_acc of the raw NetworKIN
# predictions for the updated sequences ('na' is passed in place of a resource
# file); results go to the mappedAcc temp dir.
convert_type = 'acc'
networKin_convert.kin_convert_directory(kin_update_dir, 'na', kin_temp_dir_acc_update, convert_type)
# Step 2 ('site'): map the sites to the updated (new) human proteome reference;
# reads the output of step 1 and writes to the mappedSite temp dir.
convert_type = 'site'
networKin_convert.kin_convert_directory(kin_temp_dir_acc_update, new_HP_csv, kin_temp_dir_site_update, convert_type)


Formatting  UpdatedSeq_2020-02-26 ...
getting unique sub
getting sub_acc
merge
getting unique kin
getting kin_acc
merge
Done. Time	143.550
Formatting  UpdatedSeq_2020-02-26 ...
Done. Time	1.851


In [8]:
# Load the site-mapped NetworKIN predictions of the updated sequences and display them
kin_mapped_csv = f'{kin_temp_dir_site_update}UpdatedSeq_{new_version}_mappedSite.csv'
df_kin_update = pd.read_csv(kin_mapped_csv)
df_kin_update

Unnamed: 0,Position,score,substrate_name,kinase_name,pep,substrate_acc,kinase_acc,site,substrate_id
0,S1432,0.7361,CC2D2A,PRKCB,WVYLASLVQHQ,Q6DHV5,P05771,S1432,Q6DHV5_1432
1,S1432,0.4708,CC2D2A,PAK1,WVYLASLVQHQ,Q6DHV5,Q13153,S1432,Q6DHV5_1432
2,S1432,0.4522,CC2D2A,PRKCA,WVYLASLVQHQ,Q6DHV5,P17252,S1432,Q6DHV5_1432
3,S1432,0.3624,CC2D2A,PRKCZ,WVYLASLVQHQ,Q6DHV5,Q05513,S1432,Q6DHV5_1432
4,S1432,0.3584,CC2D2A,NEK2,WVYLASLVQHQ,Q6DHV5,P51955,S1432,Q6DHV5_1432
...,...,...,...,...,...,...,...,...,...
67024,T500,0.0000,ZNF623,MAPK9,DFNSTTNVKNN,O75123,P45984,T500,O75123_500
67025,T500,0.0000,ZNF623,ATR,DFNSTTNVKNN,O75123,Q13535,T500,O75123_500
67026,T500,0.0000,ZNF623,ATM,DFNSTTNVKNN,O75123,Q13315,T500,O75123_500
67027,T500,0.0000,ZNF623,PRKD1,DFNSTTNVKNN,O75123,Q15139,T500,O75123_500


In [9]:
# Drop predictions for entries that are not kinases (list from FormattingNetworKIN.ipynb)
not_kinase = ['PDK2','PDK3','PDK4','LCA5']
df_kin_update = df_kin_update.loc[~df_kin_update['kinase_name'].isin(not_kinase)]
df_kin_update

Unnamed: 0,Position,score,substrate_name,kinase_name,pep,substrate_acc,kinase_acc,site,substrate_id
0,S1432,0.7361,CC2D2A,PRKCB,WVYLASLVQHQ,Q6DHV5,P05771,S1432,Q6DHV5_1432
1,S1432,0.4708,CC2D2A,PAK1,WVYLASLVQHQ,Q6DHV5,Q13153,S1432,Q6DHV5_1432
2,S1432,0.4522,CC2D2A,PRKCA,WVYLASLVQHQ,Q6DHV5,P17252,S1432,Q6DHV5_1432
3,S1432,0.3624,CC2D2A,PRKCZ,WVYLASLVQHQ,Q6DHV5,Q05513,S1432,Q6DHV5_1432
4,S1432,0.3584,CC2D2A,NEK2,WVYLASLVQHQ,Q6DHV5,P51955,S1432,Q6DHV5_1432
...,...,...,...,...,...,...,...,...,...
67024,T500,0.0000,ZNF623,MAPK9,DFNSTTNVKNN,O75123,P45984,T500,O75123_500
67025,T500,0.0000,ZNF623,ATR,DFNSTTNVKNN,O75123,Q13535,T500,O75123_500
67026,T500,0.0000,ZNF623,ATM,DFNSTTNVKNN,O75123,Q13315,T500,O75123_500
67027,T500,0.0000,ZNF623,PRKD1,DFNSTTNVKNN,O75123,Q15139,T500,O75123_500


In [14]:
# Attach the Gene Name (substrate) and Kinase Name columns
df_kin_update = addNameCol(df_kin_update)

# Keep only the final columns, then rename them to the formatted-file names
# (the predictor's own 'substrate_name' column is replaced by 'Gene Name')
kin_final_cols = ['substrate_id','substrate_acc','Gene Name','site','pep', 'score', 'Kinase Name']
df_kin_update = df_kin_update[kin_final_cols].rename(
    columns={'Gene Name' : 'substrate_name'}
)
df_kin_update

Unnamed: 0,substrate_id,substrate_acc,substrate_name,site,pep,score,Kinase Name
0,Q6DHV5_1432,Q6DHV5,CC2D2B,S1432,WVYLASLVQHQ,0.7361,PRKCB
1,Q6DHV5_1432,Q6DHV5,CC2D2B,S1432,WVYLASLVQHQ,0.4708,PAK1
2,Q6DHV5_1432,Q6DHV5,CC2D2B,S1432,WVYLASLVQHQ,0.4522,PRKCA
3,Q6DHV5_1432,Q6DHV5,CC2D2B,S1432,WVYLASLVQHQ,0.3624,PRKCZ
4,Q6DHV5_1432,Q6DHV5,CC2D2B,S1432,WVYLASLVQHQ,0.3584,NEK2
...,...,...,...,...,...,...,...
65490,O75123_500,O75123,ZNF623,T500,DFNSTTNVKNN,0.0000,MAPK9
65491,O75123_500,O75123,ZNF623,T500,DFNSTTNVKNN,0.0000,ATR
65492,O75123_500,O75123,ZNF623,T500,DFNSTTNVKNN,0.0000,ATM
65493,O75123_500,O75123,ZNF623,T500,DFNSTTNVKNN,0.0000,PRKD1


**Append the re-run predictions** of each predictor to the prediction files and save them as the updated prediction files with the version date

In [25]:
def appendUpdates(predictor_new, update_df):
    """Append the re-run predictions to an existing prediction file.

    ``update_df`` is first passed through ``rm_unmatched_kinase_type``; the
    result is then appended to ``predictor_new`` without a header row and
    without the DataFrame index, so its columns must line up positionally
    with the columns already present in the file.

    Parameters
    ----------
    predictor_new : str
        Path of the updated (new) prediction file to append to.
    update_df : pandas.DataFrame
        Formatted predictions for the updated sequences.
    """
    matched_df = rm_unmatched_kinase_type(update_df)
    matched_df.to_csv(predictor_new, header=False, index = False, mode='a')

- **GPS**

In [26]:
# Append the processed GPS predictions of the updated sequences to the new-version file
appendUpdates(gps_new, df_gps_update)

- **PhosphoPICK**

In [23]:
# Append the processed PhosphoPICK predictions of the updated sequences to the new-version file
appendUpdates(pick_new, df_pick_update)

- **NetworKIN**

In [None]:
# Append the processed NetworKIN predictions of the updated sequences to the new-version file
appendUpdates(kin_new, df_kin_update)

### Cross Referencing with ProteomeScout Phosphorylation Data

See [CrossReferenceWithProteomeScout.ipynb](https://github.com/NaegleLab/KinPred/blob/master/Code/CrossReferenceWithProteomeScout/CrossReferenceWithProteomeScout.ipynb) for details.

In [None]:
# IMPORTS
sys.path.append('../CrossReferenceWithProteomeScout/')
import XRefProteomeScout
from datetime import date

In [None]:
# version date: today's date, used to name the ProteomeScout data directory
pscout_version = date.today().strftime('%Y-%m-%d')

# file location
# Reference proteome = the updated (new) human proteome fasta. The version
# comes from `new_version`; the previously used `ref_version` is not defined
# anywhere in this notebook and raised a NameError.
ref_proteome = base+"Data/Raw/HumanProteome/humanProteome_"+new_version+".fasta"
pscout_data = base+'Data/Raw/ProteomeScout_'+pscout_version+'/data.tsv'

In [None]:
# download the current ProteomeScout data
XRefProteomeScout.getPScoutData()

In [None]:
# run the cross-referencing of the ProteomeScout data against the reference
# proteome, tagged with the new reference version
XRefProteomeScout.XRefProteomeScout(pscout_data, ref_proteome, new_version)