# ARCHS4 (Kinases)

Author: Moshe Silverstein <br/>
Date: 8-17 <br/>
Data Source: 

In [1]:
import sys, datetime, os
import numpy as np
import pandas as pd
import importlib
import my_functions as mf
from collections import Counter
import json
import re
import scipy
from sklearn.metrics import jaccard_similarity_score
from sklearn.metrics.pairwise import cosine_similarity
import scipy.stats as stats
%matplotlib inline

In [2]:
# Re-import the local helper module so that edits made to my_functions.py
# are picked up without restarting the kernel.
importlib.reload(mf)

<module 'my_functions' from '/Users/moshesilverstein/Documents/Harmonizome/ARCHS4/my_functions.py'>

# Load Data (GMT file generated from coexpression data)

In [3]:
# GMT lines carry a variable number of tab-separated fields, so every line is
# read whole into a single column (column label 0) and split later.
# '$' serves as a separator that is presumably absent from the file -- TODO confirm.
df = pd.read_csv(
    'Input/Kinase/ARCHS4_human_kinase_Coexp.gmt',
    header=None,
    sep='$',
)

In [4]:
df.head()

Unnamed: 0,0
0,AKT1_human_kinase_ARCHS4_coexpression\tGNB2\tI...
1,AKT2_human_kinase_ARCHS4_coexpression\tSUPT5H\...
2,AKT3_human_kinase_ARCHS4_coexpression\tARMC12\...
3,CDC42BPA_human_kinase_ARCHS4_coexpression\tCSN...
4,CDC42BPB_human_kinase_ARCHS4_coexpression\tCEP...


In [5]:
# Expand every GMT line into one (Kinase, Gene) row per listed gene.
#
# Each entry of df[0] is a raw tab-separated line:
#   field 0  -> set name; the kinase symbol is the text before the first '_'
#   field 1  -> skipped (treated as the GMT description column)
#   field 2+ -> gene symbols belonging to the set
#
# NOTE(review): skipping field 1 reproduces the original behaviour, but the
# preview of the raw file appears to show a gene symbol in that position
# (e.g. GNB2 for AKT1) and that gene is absent from the resulting table.
# Confirm the input really has a description column; otherwise the first gene
# of every set is being dropped.
n_lines = len(df.index)
per_kinase_frames = []

for i, index in enumerate(df.index):

    progressPercent = ((i + 1) / n_lines) * 100

    sys.stdout.write("Progeres: %d%%  %d Out of %d   \r" % (progressPercent, (i + 1), n_lines))
    sys.stdout.flush()

    # .loc replaces the removed DataFrame.ix indexer (label-based lookup).
    fields = df.loc[index, 0].split('\t')
    kinase = fields[0].split('_')[0]
    # Slicing (instead of pop) also copes with lines that have no gene fields.
    genes = fields[2:]

    per_kinase_frames.append(
        pd.DataFrame(
            {'Kinase': [kinase] * len(genes), 'Gene': genes},
            columns=['Kinase', 'Gene'],
        )
    )

# Concatenate once at the end; growing a frame inside the loop is quadratic.
# The per-kinase row index (0..n-1) is kept, as in the original result.
if per_kinase_frames:
    df_interactions = pd.concat(per_kinase_frames)
else:
    df_interactions = pd.DataFrame(columns=['Kinase', 'Gene'])

Progeres: 100%  498 Out of 498   

In [6]:
df_interactions.head()

Unnamed: 0,Kinase,Gene
0,AKT1,INPPL1
1,AKT1,CENPB
2,AKT1,TAOK2
3,AKT1,PIEZO1
4,AKT1,SMARCD2


In [7]:
df_interactions.shape

(148902, 2)

# Map Gene Symbols To Up-to-date Approved Gene Symbols

In [8]:
# Move the gene symbols into the index ahead of the symbol-mapping step.
df_interactions = df_interactions.set_index('Gene')

In [9]:
# The return value is discarded, so this helper presumably rewrites the Gene
# index of df_interactions in place with current approved symbols
# -- TODO confirm against my_functions.mapgenesymbols.
mf.mapgenesymbols(df_interactions)

Progeres: 100%  148902 Out of 148902   

# Drop Duplicates

In [10]:
# Turn the Gene index back into an ordinary column so that duplicate
# detection considers the (Gene, Kinase) pair.
df_interactions = df_interactions.reset_index()

In [11]:
# Keep the first occurrence of every identical row.
df_interactions = df_interactions.drop_duplicates()

In [12]:
df_interactions.shape

(140739, 2)

# Create Binary Matrix

In [13]:
# Build the gene-by-kinase matrix from the de-duplicated interaction table.
# Judging by the preview below, rows are genes, columns are kinases and the
# entries are 0/1 -- TODO confirm against my_functions.createBinaryMatix.
binary_matrix = mf.createBinaryMatix(df_interactions)

Progeres: 100%  17201 Out of 17201   

In [14]:
binary_matrix.head()

Unnamed: 0,MKNK2,PRKY,TESK1,MINK1,SBK2,PIK3CA,MAP2K1,MAP3K8,ANKK1,DAPK3,...,CLK4,PSKH2,NUAK1,PXK,STYK1,TIE1,CAMK1D,PEAK1,PKN3,RPS6KA4
ABCC2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CFAP47,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
FAM133B,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
LYPD4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CYP2J2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
binary_matrix.shape

(17201, 498)

# Save Binary Matrix

In [16]:
# Stamp the output file with the current year and month (YYYY_MM).
year_month = datetime.date.today().strftime('%Y_%m')
filename = '~/./Documents/Harmonizome/ARCHS4/Output/archs4_kinase_binary_matrix_%s.tsv.zip' % year_month
# NOTE(review): the name ends in .zip but the content is gzip-compressed;
# readers that infer compression from the extension will fail on this file.
binary_matrix.to_csv(filename, compression='gzip', sep='\t')

# Create Gene Set Library

In [17]:
# Output directory for the gene set library. It is resolved relative to the
# current user's home directory instead of a hard-coded /Users/<name> prefix,
# matching the '~/...' locations used by the save cells of this notebook.
path = os.path.expanduser('~/Documents/Harmonizome/ARCHS4/Output/')

In [18]:
# Label handed to mf.createUpGeneSetLib together with the output path.
name = 'archs4_kinase_gene_set'

In [19]:
# Nothing is returned or captured here; the helper presumably writes the gene
# set library under `path` using `name` -- TODO confirm in my_functions.
mf.createUpGeneSetLib(binary_matrix, path, name)

Progeres: 100%  498 Out of 498   

# Create Attribute Library

In [20]:
# Output directory for the attribute set library. It is resolved relative to
# the current user's home directory instead of a hard-coded /Users/<name>
# prefix, matching the '~/...' locations used by the save cells.
path = os.path.expanduser('~/Documents/Harmonizome/ARCHS4/Output/')

In [21]:
# Label handed to mf.createUpAttributeSetLib together with the output path.
name = 'archs4_kinase_attribute_set'

In [22]:
# Nothing is returned or captured here; the helper presumably writes the
# attribute set library under `path` using `name` -- TODO confirm in my_functions.
mf.createUpAttributeSetLib(binary_matrix, path, name)

Progeres: 100%  17201 Out of 17201   

# Create Gene Similarity Matrix

In [23]:
# Similarity between the rows of binary_matrix using the 'jaccard' option.
# The preview below shows a gene-by-gene table with 1.0 on the diagonal.
# NOTE(review): with 17201 genes the result has 17201 x 17201 entries, so
# expect this cell to be slow and memory-hungry.
gene_similarity_matix = mf.createSimilarityMatrix(binary_matrix, 'jaccard')

In [24]:
gene_similarity_matix.head()

Unnamed: 0,ABCC2,CFAP47,FAM133B,LYPD4,CYP2J2,INS-IGF2,TAS2R1,AP3S2,MORC3,DERL2,...,HTR1B,RPSAP26,RIOX2,RPSAP31,NDN,LIX1,GDF5OS,SPATA20,RTKN2,CMAHP
ABCC2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CFAP47,0.0,1.0,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.142857,0.125
FAM133B,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
LYPD4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CYP2J2,0.0,0.166667,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.0


# Save Gene Similarity Matrix

In [25]:
# Stamp the output file with the current year and month (YYYY_MM).
year_month = datetime.date.today().strftime('%Y_%m')
filename = '~/./Documents/Harmonizome/ARCHS4/Output/archs4_kinase_gene_similarity_matix_%s.tsv.zip' % year_month
# NOTE(review): the name ends in .zip but the content is gzip-compressed;
# readers that infer compression from the extension will fail on this file.
gene_similarity_matix.to_csv(filename, compression='gzip', sep='\t')

# Create Attribute Similarity Matrix

In [26]:
# Same similarity computation on the transposed matrix, so the compared rows
# are the kinases; the preview below shows a kinase-by-kinase table.
attribute_similarity_matix = mf.createSimilarityMatrix(binary_matrix.T, 'jaccard')

In [27]:
attribute_similarity_matix.head()

Unnamed: 0,MKNK2,PRKY,TESK1,MINK1,SBK2,PIK3CA,MAP2K1,MAP3K8,ANKK1,DAPK3,...,CLK4,PSKH2,NUAK1,PXK,STYK1,TIE1,CAMK1D,PEAK1,PKN3,RPS6KA4
MKNK2,1.0,0.007519,0.15528,0.237895,0.0,0.010327,0.132692,0.037168,0.001783,0.106942,...,0.0,0.0,0.001786,0.018018,0.0,0.006944,0.006873,0.0,0.059459,0.229167
PRKY,0.007519,1.0,0.0,0.0,0.002141,0.011299,0.0,0.015152,0.015873,0.0,...,0.070664,0.001984,0.001961,0.015779,0.0,0.003788,0.0,0.038618,0.0,0.0
TESK1,0.15528,0.0,1.0,0.206897,0.289474,0.0,0.126506,0.012704,0.0,0.126253,...,0.0,0.0,0.003766,0.003738,0.0,0.007299,0.012704,0.0,0.025641,0.178197
MINK1,0.237895,0.0,0.206897,1.0,0.0,0.008562,0.32809,0.001704,0.0,0.213115,...,0.0,0.0,0.007156,0.0125,0.001712,0.013937,0.048128,0.0,0.09462,0.351598
SBK2,0.0,0.002141,0.289474,0.0,1.0,0.0,0.0,0.0,0.002028,0.001919,...,0.0,0.0,0.002033,0.002016,0.0,0.0,0.0,0.0,0.0,0.0


# Save Attribute Similarity Matrix

In [28]:
# Stamp the output file with the current year and month (YYYY_MM).
year_month = datetime.date.today().strftime('%Y_%m')
filename = '~/./Documents/Harmonizome/ARCHS4/Output/archs4_kinase_attribute_similarity_matix_%s.tsv.zip' % year_month
# NOTE(review): the name ends in .zip but the content is gzip-compressed;
# readers that infer compression from the extension will fail on this file.
attribute_similarity_matix.to_csv(filename, compression='gzip', sep='\t')

# Create Gene List

In [29]:
# Build the gene table from binary_matrix. The preview below shows one row
# per gene with GeneSym and GeneID columns; how the IDs are looked up lives
# in my_functions.createGeneList (not visible here).
gene_list = mf.createGeneList(binary_matrix)

Progeres: 100%  17201 Out of 17201   

In [30]:
gene_list.head()

Unnamed: 0,GeneSym,GeneID
0,ABCC2,1244
1,CFAP47,286464
2,FAM133B,257415
3,LYPD4,147719
4,CYP2J2,1573


In [31]:
gene_list.shape

(17201, 2)

# Save Gene List

In [32]:
# Stamp the output file with the current year and month (YYYY_MM).
year_month = datetime.date.today().strftime('%Y_%m')
filename = '~/./Documents/Harmonizome/ARCHS4/Output/archs4_kinase_gene_list_%s.tsv.zip' % year_month
# NOTE(review): the name ends in .zip but the content is gzip-compressed;
# readers that infer compression from the extension will fail on this file.
# The positional row index is not written out.
gene_list.to_csv(filename, compression='gzip', index=False, sep='\t')

# Create Attribute List

In [33]:
# Build the attribute table from binary_matrix. The preview below shows a
# single 'Attributes' column listing the kinases (498 rows).
attribute_list = mf.createAttributeList(binary_matrix)

In [34]:
attribute_list.head()

Unnamed: 0,Attributes
0,MKNK2
1,PRKY
2,TESK1
3,MINK1
4,SBK2


In [35]:
attribute_list.shape

(498, 1)

# Save Attribute List

In [36]:
# Stamp the output file with the current year and month (YYYY_MM).
year_month = datetime.date.today().strftime('%Y_%m')
filename = '~/./Documents/Harmonizome/ARCHS4/Output/archs4_kinase_attribute_list_%s.tsv.zip' % year_month
# NOTE(review): the name ends in .zip but the content is gzip-compressed;
# readers that infer compression from the extension will fail on this file.
# The positional row index is not written out.
attribute_list.to_csv(filename, compression='gzip', index=False, sep='\t')

# Create Gene-Attribute Edge List

In [37]:
# Output directory for the gene-attribute edge list. It is resolved relative
# to the current user's home directory instead of a hard-coded /Users/<name>
# prefix, matching the '~/...' locations used by the save cells.
path = os.path.expanduser('~/Documents/Harmonizome/ARCHS4/Output/')

In [38]:
# Label handed to mf.createGeneAttributeEdgeList together with the output path.
name = 'archs4_kinase_gene_attribute_edge_list'

In [39]:
# Nothing is returned or captured here; the helper presumably writes the edge
# list under `path` using `name` -- TODO confirm in my_functions.
# The count it reports below (140739) equals the number of de-duplicated
# rows in df_interactions.
mf.createGeneAttributeEdgeList(binary_matrix, gene_list, path, name)

Progeres: 100%  498 Out of 498   

 The number of statisticaly relevent gene-attribute associations is: 140739
