# ARCHS4 (IDG)

Author: Moshe Silverstein <br/>
Date: 8-17 <br/>
Data Source: 

In [1]:
import sys, datetime, os
import numpy as np
import pandas as pd
import importlib
import untility_functions as uf
from collections import Counter
import json
import re
import scipy
from sklearn.metrics import jaccard_similarity_score
from sklearn.metrics.pairwise import cosine_similarity
import scipy.stats as stats
%matplotlib inline

In [2]:
# Re-import the helper module so edits to its source take effect without restarting the kernel.
importlib.reload(uf)

<module 'untility_functions' from '/Users/moshesilverstein/Documents/Harmonizome/ARCHS4/untility_functions.py'>

# Load Data (GMT file generated from ARCHS4 coexpression data)

In [7]:
# Read the GMT file with '$' as the delimiter and no header row; the preview
# below shows each line landing whole (tabs included) in the single column 0.
# NOTE(review): this relies on '$' never occurring in the file — confirm.
df = pd.read_csv('Input/IDG/ARCHS4_human_IDG_Coexp.gmt', sep='$', header=None)

In [8]:
# Preview the first rows of the raw GMT lines.
df.head()

Unnamed: 0,0
0,PRKAG1_IDG_kinase_ARCHS4_coexpression\tSLC29A4...
1,PRKAB1_IDG_kinase_ARCHS4_coexpression\tCIR1P1\...
2,ADCK2_IDG_kinase_ARCHS4_coexpression\tHOXC9\tN...
3,ADCK5_IDG_kinase_ARCHS4_coexpression\tRP11-402...
4,ALPK3_IDG_kinase_ARCHS4_coexpression\tTNNC1\tM...


In [9]:
# Expand every GMT line into long-format (IDG Term, Gene) rows.
#
# Each line is tab-separated:
#   field 0  -> set label, e.g. 'PRKAG1_IDG_kinase_ARCHS4_coexpression';
#               only the text before the first '_' is kept as the IDG term
#   field 1  -> skipped
#   field 2+ -> one gene symbol per field
# NOTE(review): the raw preview shows what looks like a gene symbol in field 1;
# confirm that field really is a description that should be discarded.
total = len(df.index)
frames = []

for i, line in enumerate(df[0]):

    progressPercent = ((i+1)/total)*100

    sys.stdout.write("Progeres: %d%%  %d Out of %d   \r" % (progressPercent, (i+1), total))
    sys.stdout.flush()

    fields = line.split('\t')
    term = fields[0].split('_')[0]
    genes = fields[2:]

    # Lines that carry no genes contribute nothing (and no longer raise).
    if genes:
        frames.append(pd.DataFrame({'IDG Term': [term]*len(genes), 'Gene': genes},
                                   columns=['IDG Term', 'Gene']))

# Concatenate once instead of growing the frame inside the loop (which is quadratic).
# The per-term 0..n-1 index is kept, matching what the incremental concat produced.
df_interactions = pd.concat(frames) if frames else pd.DataFrame(columns=['IDG Term', 'Gene'])

Progeres: 100%  352 Out of 352   

In [10]:
# Preview the first rows of the (IDG Term, Gene) table.
df_interactions.head()

Unnamed: 0,IDG Term,Gene
0,PRKAG1,PPP1R7
1,PRKAG1,OR4F13P
2,PRKAG1,CHMP2B
3,PRKAG1,BBOX1
4,PRKAG1,TRBV2


In [11]:
# (rows, columns) of the table before symbol mapping and de-duplication.
df_interactions.shape

(105248, 2)

# Map Gene Symbols To Up-to-date Approved Gene Symbols

In [12]:
# Move the 'Gene' column into the index; the symbol-mapping helper is called on this Gene-indexed frame.
df_interactions = df_interactions.set_index('Gene')

In [13]:
# Pass the Gene-indexed table to the helper that maps gene symbols.
# NOTE(review): the return value is discarded, so this presumably modifies
# df_interactions in place — confirm in the helper module.
uf.mapgenesymbols(df_interactions)

Progeres: 99%  105238 Out of 105248   

# Drop Duplicates

In [14]:
# Turn the index back into an ordinary column so duplicate checks see every field.
df_interactions = df_interactions.reset_index()

In [15]:
# Keep one copy of each fully identical row.
df_interactions = df_interactions.drop_duplicates()

In [16]:
# (rows, columns) of the table after de-duplication.
df_interactions.shape

(95958, 2)

# Create Binary Matrix

In [17]:
# Build the matrix from the de-duplicated pairs.
# NOTE(review): the preview below suggests genes as rows, IDG terms as columns
# and 0/1 entries — confirm against the helper module.
binary_matrix = uf.createBinaryMatix(df_interactions)

Progeres: 100%  18020 Out of 18020   

In [18]:
# Preview the first rows of the binary matrix.
binary_matrix.head()

Unnamed: 0,CHRND,SUCNR1,GPR75,GPR63,ADGRG2,HCN3,GPR146,ADCK2,KCNAB3,MRGPRX3,...,KCNIP1,TTYH2,PKD2L2,PIK3C2G,CACNA2D2,GABRG1,ADGRE1,TPRA1,SLC26A1,PRPF4B
CFAP45,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HTR4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
TMEM176B,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CCDC144CP,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
ZNF223,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
# (rows, columns) of the binary matrix.
binary_matrix.shape

(18020, 352)

# Save Binary Matrix

In [21]:
# Write the binary matrix as a tab-separated file stamped with the current
# year and month (YYYY_MM).
# compression='zip' makes the archive format match the '.zip' extension in the
# file name; the data used to be gzip-compressed under a '.zip' name.
filename = '~/./Documents/Harmonizome/ARCHS4/Output/archs4_idg_binary_matrix_%s.tsv.zip' % datetime.date.today().strftime('%Y_%m')
binary_matrix.to_csv(filename, sep='\t', compression='zip')

# Create Gene Set Library

In [25]:
# Output directory for the gene set library, resolved against the current
# user's home directory instead of a hard-coded absolute user path.
path = os.path.expanduser('~/Documents/Harmonizome/ARCHS4/Output/')

In [26]:
# Base name handed to the gene set library helper.
name = 'archs4_idg_gene_set'

In [27]:
# Generate the gene set library from the binary matrix.
# NOTE(review): presumably writes a file named from `name` under `path` —
# confirm in the helper module.
uf.createUpGeneSetLib(binary_matrix, path, name)

Progeres: 100%  352 Out of 352   

# Create Attribute Library

In [28]:
# Output directory for the attribute library, resolved against the current
# user's home directory instead of a hard-coded absolute user path.
path = os.path.expanduser('~/Documents/Harmonizome/ARCHS4/Output/')

In [29]:
# Base name handed to the attribute library helper.
name = 'archs4_idg_attribute_set'

In [30]:
# Generate the attribute library from the binary matrix.
# NOTE(review): presumably writes a file named from `name` under `path` —
# confirm in the helper module.
uf.createUpAttributeSetLib(binary_matrix, path, name)

Progeres: 100%  18020 Out of 18020   

# Create Gene Similarity Matrix

In [31]:
# Compute the similarity matrix from binary_matrix with the 'jaccard' option;
# the preview below is labelled by gene on both axes.
gene_similarity_matix = uf.createSimilarityMatrix(binary_matrix, 'jaccard')

In [32]:
# Preview the first rows of the gene similarity matrix.
gene_similarity_matix.head()

Unnamed: 0,CFAP45,HTR4,TMEM176B,CCDC144CP,ZNF223,TAB2,ZNF346,GOLGA2P11,LSMEM1,NDUFA3,...,RFX4,TNFSF8,HAAO,SLC25A41,NDFIP2,RPL12P17,IQCF3,SVIL,CFLAR,GOLGA5
CFAP45,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.025641,0.0,0.0
HTR4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.111111,0.0,0.0,0.0,0.0,0.0,0.047619,0.0
TMEM176B,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CCDC144CP,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ZNF223,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.083333,0.038462,0.0,0.0


# Save Gene Similarity Matrix

In [33]:
# Write the gene similarity matrix as a tab-separated file stamped with the
# current year and month (YYYY_MM).
# compression='zip' makes the archive format match the '.zip' extension in the
# file name; the data used to be gzip-compressed under a '.zip' name.
filename = '~/./Documents/Harmonizome/ARCHS4/Output/archs4_idg_gene_similarity_matix_%s.tsv.zip' % datetime.date.today().strftime('%Y_%m')
gene_similarity_matix.to_csv(filename, sep='\t', compression='zip')

# Create Attribute Similarity Matrix

In [34]:
# Same helper and 'jaccard' option as for genes, applied to the transposed
# matrix; the preview below is labelled by attribute on both axes.
attribute_similarity_matix = uf.createSimilarityMatrix(binary_matrix.T, 'jaccard')

In [35]:
# Preview the first rows of the attribute similarity matrix.
attribute_similarity_matix.head()

Unnamed: 0,CHRND,SUCNR1,GPR75,GPR63,ADGRG2,HCN3,GPR146,ADCK2,KCNAB3,MRGPRX3,...,KCNIP1,TTYH2,PKD2L2,PIK3C2G,CACNA2D2,GABRG1,ADGRE1,TPRA1,SLC26A1,PRPF4B
CHRND,1.0,0.0,0.005474,0.001767,0.0,0.001754,0.009363,0.001733,0.0,0.001953,...,0.0,0.008666,0.0,0.0,0.00367,0.0,0.0,0.0,0.0,0.0
SUCNR1,0.0,1.0,0.0,0.0,0.0,0.0,0.007449,0.017544,0.0,0.005859,...,0.0,0.001715,0.0,0.0,0.001825,0.0,0.104762,0.154,0.003478,0.003534
GPR75,0.005474,0.0,1.0,0.005505,0.003861,0.003636,0.001927,0.0,0.007921,0.0,...,0.035382,0.048417,0.005952,0.003623,0.00956,0.108696,0.0,0.0,0.001802,0.030132
GPR63,0.001767,0.0,0.005505,1.0,0.044834,0.003534,0.0,0.0,0.031434,0.0,...,0.008818,0.0,0.027505,0.003521,0.007407,0.006981,0.0,0.0,0.0,0.010772
ADGRG2,0.0,0.0,0.003861,0.044834,1.0,0.0,0.0,0.0,0.19759,0.0,...,0.011152,0.0,0.246851,0.013084,0.003891,0.0,0.0,0.0,0.0,0.0


# Save Attribute Similarity Matrix

In [36]:
# Write the attribute similarity matrix as a tab-separated file stamped with
# the current year and month (YYYY_MM).
# compression='zip' makes the archive format match the '.zip' extension in the
# file name; the data used to be gzip-compressed under a '.zip' name.
filename = '~/./Documents/Harmonizome/ARCHS4/Output/archs4_idg_attribute_similarity_matix_%s.tsv.zip' % datetime.date.today().strftime('%Y_%m')
attribute_similarity_matix.to_csv(filename, sep='\t', compression='zip')

# Create Gene List

In [37]:
# Build the gene table from the binary matrix; the preview below shows
# GeneSym and GeneID columns.
gene_list = uf.createGeneList(binary_matrix)

Progeres: 100%  18020 Out of 18020   

In [38]:
# Preview the first rows of the gene list.
gene_list.head()

Unnamed: 0,GeneSym,GeneID
0,CFAP45,25790
1,HTR4,3360
2,TMEM176B,28959
3,CCDC144CP,348254
4,ZNF223,7766


In [39]:
# (rows, columns) of the gene list.
gene_list.shape

(18020, 2)

# Save Gene List

In [40]:
# Write the gene list (without the row index) as a tab-separated file stamped
# with the current year and month (YYYY_MM).
# compression='zip' makes the archive format match the '.zip' extension in the
# file name; the data used to be gzip-compressed under a '.zip' name.
filename = '~/./Documents/Harmonizome/ARCHS4/Output/archs4_idg_gene_list_%s.tsv.zip' % datetime.date.today().strftime('%Y_%m')
gene_list.to_csv(filename, sep='\t', index=False, compression='zip')

# Create Attribute List

In [44]:
# Build the attribute table from the binary matrix; the preview below shows a
# single 'Attributes' column.
attribute_list = uf.createAttributeList(binary_matrix)

In [45]:
# Preview the first rows of the attribute list.
attribute_list.head()

Unnamed: 0,Attributes
0,CHRND
1,SUCNR1
2,GPR75
3,GPR63
4,ADGRG2


In [46]:
# (rows, columns) of the attribute list.
attribute_list.shape

(352, 1)

# Save Attribute List

In [47]:
# Write the attribute list (without the row index) as a tab-separated file
# stamped with the current year and month (YYYY_MM).
# compression='zip' makes the archive format match the '.zip' extension in the
# file name; the data used to be gzip-compressed under a '.zip' name.
filename = '~/./Documents/Harmonizome/ARCHS4/Output/archs4_idg_attribute_list_%s.tsv.zip' % datetime.date.today().strftime('%Y_%m')
attribute_list.to_csv(filename, sep='\t', index=False, compression='zip')

# Create Gene-Attribute Edge List

In [48]:
# Output directory for the gene-attribute edge list, resolved against the
# current user's home directory instead of a hard-coded absolute user path.
path = os.path.expanduser('~/Documents/Harmonizome/ARCHS4/Output/')

In [49]:
# Base name handed to the edge list helper.
name = 'archs4_idg_gene_attribute_edge_list'

In [50]:
# Generate the gene-attribute edge list from the binary matrix and gene list;
# the helper prints the number of associations when it finishes.
# NOTE(review): presumably writes a file named from `name` under `path` —
# confirm in the helper module.
uf.createGeneAttributeEdgeList(binary_matrix, gene_list, path, name)

Progeres: 100%  352 Out of 352   

 The number of statisticaly relevent gene-attribute associations is: 95958
