# Human Protein Reference Database (Post-Translational Modifications) Data Formatting
This file takes data regarding kinase-protein interactions from the HPRD database and converts the data into the .gmt format. The data was retrieved from the HPRD database on Thu, Jun 15 2017 10:41:12. This data will be added to enhance the KEA2 database and will be suitably formatted for use by ENRICHR and X2K. The citation for this data is "Prasad, T. S. K. et al. (2009) Human Protein Reference Database - 2009 Update. Nucleic Acids Research. 37, D767-72." and can also be found on the corresponding gh.pages website.

## Import packages necessary for following program

In [1]:
%run /home/maayanlab/Desktop/Projects/KEA3/Scripts/init.ipy
%run /home/maayanlab/Desktop/Projects/KEA3/Scripts/Scripts.py

## Create a dataframe from a file containing the HPRD Post-Translational Modification data

In [2]:
#Column names for the HPRD table, in file order (as listed in the
#associated README file); they are supplied here because the file is
#read without a header row
colnames = [
    'substrate_hprd_id',
    'substrate_gene_symbol',
    'substrate_isoform_id',
    'substrate_refseq_id',
    'site',
    'residue',
    'enzyme_name',
    'enzyme_hprd_id',
    'modification_type',
    'experiment_type',
    'reference_id',
]

#Load the tab-delimited PTM file into a dataframe
hprd_df = pd.read_csv(
    '~/Desktop/Projects/KEA3/POST_TRANSLATIONAL_MODIFICATIONS.txt',
    sep='\t',
    header=None,
    names=colnames,
)

#Preview the first 100 rows
hprd_df.head(100)

Unnamed: 0,substrate_hprd_id,substrate_gene_symbol,substrate_isoform_id,substrate_refseq_id,site,residue,enzyme_name,enzyme_hprd_id,modification_type,experiment_type,reference_id
0,1,ALDH1A1,00001_1,NP_000680.2,2,S,-,-,Acetylation,in vitro,6427007
1,1,ALDH1A1,00001_1,NP_000680.2,128,K,-,-,Acetylation,in vivo,19608861
2,1,ALDH1A1,00001_1,NP_000680.2,91,K,-,-,Acetylation,in vivo,19608861
3,1,ALDH1A1,00001_1,NP_000680.2,353,K,-,-,Acetylation,in vivo,19608861
4,1,ALDH1A1,00001_1,NP_000680.2,419,K,-,-,Acetylation,in vivo,19608861
5,1,ALDH1A1,00001_1,NP_000680.2,495,K,-,-,Acetylation,in vivo,19608861
6,1,ALDH1A1,00001_1,NP_000680.2,252,K,-,-,Acetylation,in vivo,19608861
7,1,ALDH1A1,00001_1,NP_000680.2,367,K,-,-,Acetylation,in vivo,19608861
8,1,ALDH1A1,00001_1,NP_000680.2,410,K,-,-,Acetylation,in vivo,19608861
9,1,ALDH1A1,00001_1,NP_000680.2,435,K,-,-,Acetylation,in vivo,19608861


## Filter by columns necessary for .GMT file format

Although not needed for .gmt files, the 'modification_type' column is included so that we can also selectively filter phosphorylations (i.e. protein-kinase interactions)

In [None]:
#Keep the substrate and enzyme columns, plus 'modification_type' so that
#phosphorylation rows can be selected in the next step
df = hprd_df.loc[:, ['substrate_gene_symbol',
                     'enzyme_name',
                     'modification_type']]

#Preview the reduced dataframe
df.head()

Unnamed: 0,substrate_gene_symbol,enzyme_name,modification_type
0,ALDH1A1,-,Acetylation
1,ALDH1A1,-,Acetylation
2,ALDH1A1,-,Acetylation
3,ALDH1A1,-,Acetylation
4,ALDH1A1,-,Acetylation


## Drop all rows without an enzyme name or not pertaining to phosphorylation
Once done, remove column pertaining to 'modification_type'

In [None]:
#Select the rows that pertain to phosphorylation and have an enzyme name
#(rows whose enzyme_name is '-' are excluded). Boolean masks do this in
#one vectorised pass instead of looping over every row with iterrows().
has_enzyme = df['enzyme_name'].astype(str) != '-'
is_phosphorylation = df['modification_type'] == 'Phosphorylation'

#Filter dataframe; .copy() gives an independent frame so the in-place
#operations below do not raise SettingWithCopyWarning
df_filter = df.loc[has_enzyme & is_phosphorylation].copy()

#drop duplicate rows in the dataframe
df_filter.drop_duplicates(inplace=True)

#drop all rows that still contain a missing (NaN) value
df_filter.dropna(axis=0, inplace=True)

#Drop 'modification_type' since no longer needed
df_filter.drop('modification_type', axis=1, inplace=True)

#View dataframe
df_filter.head()

## Specify organism of enzyme as Homo sapiens

In [None]:
#Label every enzyme with its organism as '<enzyme>_Homo sapiens'.
#Vectorised string concatenation is used instead of looping with
#Series.iteritems(), which was removed in pandas 2.0.
df_filter['kinase_organism'] = df_filter['enzyme_name'] + '_Homo sapiens'

#Drop redundant 'enzyme_name' column
df_filter.drop('enzyme_name', axis=1, inplace=True)

#View dataframe
df_filter.head()

## Create File of 'df_filter' for Later Data Aggregation
File will later be used to create .gmt file combining all databases

In [None]:
#Map each row label of 'df_filter' to that row's values joined by tabs
d = {row_label: '\t'.join(row_values)
     for row_label, row_values in df_filter.iterrows()}

#Write one tab-separated line per row into a new txt file
with open('Uncondensed_HPRD.txt', 'w') as out_file:
    out_file.writelines(str(text) + '\n' for text in d.values())

## Group by 'kinase_organism' and Aggregate Kinase Targets

In [None]:
#Group the rows by kinase and collect the remaining column values of
#each group into a tuple. groupby() already makes 'kinase_organism' the
#index of the result, so no set_index() call is needed beforehand.
kin = df_filter.groupby('kinase_organism').agg(lambda x: tuple(x))

#Insert a 'Description' column naming HPRD as the source of the data
kin.insert(0, 'Description', 'HPRD Post-Translational Modification')

#View Dataframe
kin.head()

# Exploratory Data Analysis

## Calculate Number of Protein targets for each kinase

In [None]:
# Create column representing counts of protein targets per kinase.
# Each 'substrate_gene_symbol' cell holds a tuple of targets, so its
# length is the count. Series.map(len) is used instead of looping with
# Series.iteritems(), which was removed in pandas 2.0.
kin['kinase_targets_num'] = kin['substrate_gene_symbol'].map(len)

# Sort kinases from max to min according to number of protein targets each has
kin.sort_values(by=['kinase_targets_num'], ascending=False, inplace=True)

# View dataframe
kin.head()

## Create Histograms to display distribution of number of targets per kinase

In [None]:
# Create histogram displaying the distribution of the number of
# targets per kinase. The column is selected explicitly: in
# DataFrame.plot.hist the 'by' argument groups the data (one subplot per
# distinct value) from pandas 1.4 on, and was silently ignored before
# that, so it never selected the column to plot.
kin['kinase_targets_num'].plot.hist(bins=50)

#Show histogram
plt.show()

# Creation of Final .GMT File

## Create Dictionary of Tab-Separated Rows of the Dataframe

In [None]:
#Turn the 'kinase_organism' index back into an ordinary column
kin.reset_index(inplace=True)

#Add column 'target_symbol_merged': the substrates of each kinase
#joined into a single string by \t symbols
kin['target_symbol_merged'] = kin['substrate_gene_symbol'].map('\t'.join)

#Remove the columns that are not written to the .gmt file
kin.drop(['substrate_gene_symbol', 'kinase_targets_num'],
         axis=1, inplace=True)

#Build dictionary 'HPRD_num': row number -> tab-separated text of the row
HPRD_num = {row_number: '\t'.join(row_values)
            for row_number, row_values in kin.iterrows()}

## Write info from Dictionary into a .GMT file

In [None]:
#Write every tab-separated row stored in 'HPRD_num' to the .gmt file,
#one row per line
with open('HPRD_PTM.gmt', 'w') as gmt_file:
    gmt_file.writelines(str(row_text) + '\n'
                        for row_text in HPRD_num.values())

## Test: Reading in the Newly-Created .GMT File

In [None]:
#Find the widest row first: rows of the .gmt file hold a varying number
#of substrate fields, and read_table needs one column name per field
with open('HPRD_PTM.gmt') as gmt_file:
    n_fields = max(len(row.rstrip('\n').split('\t')) for row in gmt_file)

#Read with unique integer placeholders as column names; a list of
#repeated names (such as ['']*141) is rejected by current pandas
#releases, and a fixed width of 141 only fits one particular file
df2 = pd.read_table('HPRD_PTM.gmt', delimiter='\t', header=None,
                    names=list(range(n_fields)))

#Rows shorter than the widest one are padded with NaN; blank those cells
df2 = df2.fillna('')

#Label the columns: kinase, description, then one column per substrate
df2.columns = ['kinase', 'Description'] + ['Substrate'] * (n_fields - 2)
df2