# RegPhos (Human) Data Formatting
This file takes data regarding human kinase-protein interactions from the RegPhos database and converts the data into the .gmt format. The data was retrieved from the RegPhos database on Wed, Jun 14 2017 15:24:15 as a .txt file and was transferred to the Excel format used on Wed, Jun 14 2017 15:39:54. This data will be added to enhance the KEA2 database and will be suitably formatted for use by ENRICHR and X2K. The citation for the data used in this script is: Lee TY, Bo-Kai Hsu J, Chang WC, Huang HD (2011) "RegPhos: a system to explore the protein kinase-substrate phosphorylation network in humans", Nucleic Acids Research, D777-787. The citation can also be found on the corresponding gh.pages website.

## Import packages necessary for following program

In [35]:
%run /home/maayanlab/Desktop/Projects/KEA3/Scripts/init.ipy
%run /home/maayanlab/Desktop/Projects/KEA3/Scripts/Scripts.py

## Create a dataframe from a file containing Human RegPhos data

In [None]:
# Column labels, matching the headers of the original .txt file
colnames = ['ID', 'AC', 'position', 'decription',
            'catalytic_kinase', 'reference', 'resource', 'code']

# Load the tab-delimited RegPhos export into the dataframe 'regphos_df'
regphos_path = '~/Desktop/Projects/KEA3/RegPhos_Phos_human.txt'
regphos_df = pd.read_table(regphos_path)
regphos_df.columns = colnames

# Discard every row that has a missing value in any column
regphos_df = regphos_df.dropna(axis=0, how='any')

# Preview the first 50 rows
regphos_df.head(50)

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,ID,AC,position,decription,catalytic_kinase,reference,resource,code
0,AXA2L_HUMAN,A6NMY6,24,Phosphotyrosine,SRC,19534553,Swiss-Prot 1010711,Y
1,CLM7_HUMAN,A8K4G0,188,Phosphotyrosine,FYN,16920917;17928527,Swiss-Prot 1010711,Y
2,SGK1_HUMAN,O00141,78,Phosphoserine,Erk5(MAPK7),11254654;18691976;19369195,Swiss-Prot 1010711,S
3,SGK1_HUMAN,O00141,256,Phosphothreonine,PDHK2,10191262,Swiss-Prot 1010711,T
4,SGK1_HUMAN,O00141,256,Phosphothreonine,PDK1,10191262;10357815,PhosphoELM.10011,T
5,SGK1_HUMAN,O00141,369,Phosphothreonine,PKA_group,11096081,Swiss-Prot 1010711,T
6,SGK1_HUMAN,O00141,377,Phosphoserine,NEK6,12023960,HPRD 9.0,S
7,SGK1_HUMAN,O00141,422,Phosphoserine,NEK6,10191262;12023960,HPRD 9.0,S
8,SGK1_HUMAN,O00141,422,Phosphoserine,PDHK2,10191262;12023960,HPRD 9.0,S
9,SGK1_HUMAN,O00141,422,Phosphoserine,PDK1,10191262;12023960,PhosphoELM.10011,S


## Convert UniProt IDs to Gene Symbols

In [None]:
# Map each UniProt accession in 'AC' to a symbol with the
# uniprot_to_symbol helper from Scripts.py
accessions = regphos_df['AC'].tolist()
regphos_df['target_symbol'] = Scripts.uniprot_to_symbol(accessions)

# Turn every 'alternate' cell into NaN, then drop all rows containing a NaN.
# NOTE(review): 'alternate' is presumably the placeholder uniprot_to_symbol
# emits for accessions it cannot resolve -- confirm in Scripts.py
regphos_df = regphos_df.replace('alternate', np.nan).dropna()

# Preview the result
regphos_df.head()

## Format names of kinases
If the kinase name contains parentheses, discard the name outside them and keep only the kinase gene symbol found inside the parentheses.

In [None]:
def _kinase_symbol(name):
    """Return the gene symbol enclosed in parentheses within *name*.

    'Erk5(MAPK7)' becomes 'MAPK7'. When an opening parenthesis is present
    only the text after the first one is considered; the result is that
    text up to its first ')'. A name without a usable ')' is returned
    unchanged.
    """
    candidate = name
    if '(' in candidate:
        candidate = name.split('(', 2)[1]
    if ')' in candidate:
        return candidate.split(')', 1)[0]
    return name


# Rewrite the whole column in one assignment. The previous row-by-row
# chained assignment (regphos_df.catalytic_kinase[index] = ...) relied on
# Series.iteritems(), which newer pandas no longer provides, and on writing
# through an intermediate Series, which is not guaranteed to reach the frame.
regphos_df['catalytic_kinase'] = regphos_df['catalytic_kinase'].map(_kinase_symbol)


## Create a new column combining kinases and organism

In [None]:
# Build 'kinase_organism' by suffixing every kinase name with the organism,
# joined by an underscore
organism = 'Homo sapiens'
regphos_df['kinase_organism'] = regphos_df['catalytic_kinase'].map(
    lambda kinase: str(kinase) + '_' + organism
)

# Preview the result
regphos_df.head()

## Perform preliminary data processing
Select the columns necessary for the .gmt and .sig formats and filter them into a new dataframe, e.g. `df_sig = regphos_df[['target_symbol', 'kinase_organism', 'reference']]`. We must drop duplicates and NaNs, and for the .gmt file format keep only the necessary columns (the target gene symbols and kinase names).

In [None]:
#Select the columns needed for the .gmt and .sig formats into 'df_sig',
#which is kept for the later creation of the .sig file.
#Non in-place calls are used throughout: in-place edits on a column
#selection of 'regphos_df' trigger pandas' SettingWithCopyWarning.
df_sig = regphos_df[['target_symbol', 'kinase_organism', 'reference']]

#Keep one row per (target, kinase) pair, then drop rows with a NaN in
#any of the three selected columns
df_sig = df_sig.drop_duplicates(['target_symbol', 'kinase_organism'])
df_sig = df_sig.dropna(axis=0)

#'df' holds the same rows without the 'reference' column (for the .gmt file)
df = df_sig.drop('reference', axis=1)

#Visualize data
df.head()

## Set Index to 'Kinase_Organism' and Aggregate Kinase Targets

In [None]:
#Group the rows of 'df' by kinase and collect each kinase's targets into a
#tuple; 'kin' is indexed by 'kinase_organism'.
#(The former bare df.set_index('kinase_organism') call was removed: its
#result was discarded, so it never changed 'df'.)
kin = df.groupby('kinase_organism').agg(lambda symbols: tuple(symbols))

#Create a new first column with 'RegPhos' as description of data
kin.insert(0, 'Description', 'RegPhos')

#Visualize Data
kin.head()

# Exploratory Data Analysis

## Calculate Number of Protein Targets for each kinase
Create a new column with the number of substrates related to each kinase, and sort the dataframe by this column

In [None]:
# Count the protein targets of each kinase (length of its target tuple).
# Series.map replaces the Series.iteritems() loop, which newer pandas
# versions no longer support.
kin['kinase_targets_num'] = kin['target_symbol'].map(len)

# Sort kinases from max to min according to number of protein targets each has
kin.sort_values(by=['kinase_targets_num'], ascending=False, inplace=True)

# Visualize data
kin.head()

## Create Histogram to display distribution of number of targets per kinase

In [None]:
# Create histogram displaying the distribution of the number of
# targets per kinase. The column is selected explicitly: in
# DataFrame.plot.hist the 'by' argument is a grouping key, not the
# column to be plotted.
kin['kinase_targets_num'].plot.hist(bins=37)

#Show histogram
plt.show()

## Filter Dataframe by the Number of Substrates for each Kinase
If a kinase has fewer than five targets, it will not be included in the final .gmt file. Its interactions are still recorded in the .sig file, which is built from all kinases regardless of their number of targets.

In [None]:
#Preview 'df_sig', the deduplicated table of targets, kinases and references
df_sig.head()

In [None]:
#Create list of indexes (kinase names) for kinases with fewer than five
#substrates. Series.items() replaces iteritems(), which newer pandas
#versions no longer support.
sig_key = [kinase for kinase, target_count in kin['kinase_targets_num'].items()
           if target_count < 5]

#Index labels of 'kin' for kinases with five or more substrates; a set
#keeps each membership test constant-time
excluded_kinases = set(sig_key)
indices = [kinase for kinase in kin.index if kinase not in excluded_kinases]

#Create dataframe of all kinases: moving 'kinase_organism' into the index
#and back out makes it the first column and renumbers the rows from 0
sig = df_sig.set_index('kinase_organism').reset_index()

#Rename columns of this dataframe to match .sig format
colnames = ['Source', 'Substrate', 'PubMed_ID']
sig.columns = colnames

## Create File of 'sig' for Later Data Aggregation

In [None]:
#Create dictionary mapping each row's index number to its tab-separated
#line. Every cell is converted with str() first, so a non-string value
#(e.g. a numeric PubMed ID) no longer makes the join raise a TypeError.
d = {index: '\t'.join(map(str, rowData)) for index, rowData in sig.iterrows()}

#Transfer tab-separated info into a new txt file, one row per line
with open('Uncondensed_RegPhos_human.txt', 'w') as openfile:
    openfile.writelines(line + '\n' for line in d.values())

In [None]:
#Remove kinases with fewer than five substrates in 'kin'
#('indices' holds the index labels of the kinases that are kept)
kin = kin.loc[indices]

In [None]:
#Pad the table with four placeholder columns (positions 1-4), each filled
#with the text form of NaN, as required by the .sig layout
nan_text = str(np.nan)
for position in range(1, 5):
    sig.insert(position, 'NA-' + str(position), nan_text)

#Sign of the interaction is unknown, so it is recorded as '?'
sig.insert(6, 'Sign', '?')

#Every interaction in this table is a phosphorylation
sig.insert(7, 'Interaction', 'Phosphorylation')

#Preview the result
sig.head()

## Create .Sig File

In [None]:
#Create dictionary 'sigd' mapping each row's index number to its
#tab-separated line. Every cell is converted with str() first, so a
#non-string value (e.g. a numeric PubMed ID) no longer makes the join
#raise a TypeError.
sigd = {index: '\t'.join(map(str, rowData)) for index, rowData in sig.iterrows()}

#Transfer tab-separated info into a new txt file
#Make sure to include in ReadMe corresponding column names
with open('RegPhosHumansig.txt', 'w') as openfile:
    openfile.writelines(line + '\n' for line in sigd.values())

# Creation of Final .GMT File

## Create Dictionary of Tab-Separated Rows of the Dataframe

In [None]:
#Turn the 'kinase_organism' index back into an ordinary first column
kin = kin.reset_index()

#Join each kinase's targets with tabs into 'target_symbol_merged'
kin['target_symbol_merged'] = kin['target_symbol'].map('\t'.join)

#The tuple column and the count column are no longer needed
kin = kin.drop(['target_symbol', 'kinase_targets_num'], axis=1)

#Dictionary 'RegPhos_num': row number -> tab-separated text of that row
RegPhos_num = {row_number: '\t'.join(row) for row_number, row in kin.iterrows()}

## Write Info from Dictionary into a .GMT file

In [None]:
#Write every stored line of 'RegPhos_num' to the .gmt file, one per row
with open('RegPhosHuman.gmt', 'w') as gmt_file:
    gmt_file.writelines(str(line) + '\n' for line in RegPhos_num.values())