# TargetScanHuman

Author: Moshe Silverstein <br/>
Date: 11-17 <br/>
Data Source: http://www.targetscan.org/vert_61/

In [1]:
import sys, datetime, os
import numpy as np
import pandas as pd
import importlib
import untility_functions as uf
%matplotlib inline

In [2]:
# Re-import the helper module so that edits made to untility_functions.py
# are picked up without restarting the kernel.
importlib.reload(uf)

<module 'untility_functions' from '/Users/moshesilverstein/Documents/Harmonizome/TargetScanHuman/untility_functions.py'>

# Load Data

In [11]:
# Read the tab-delimited context-score table for conserved target sites.
conserved_path = 'Input/Conserved_Site_Context_Scores.txt'
conserved = pd.read_csv(conserved_path, sep='\t')

In [12]:
conserved.head()

Unnamed: 0,Gene ID,Gene Symbol,Transcript ID,Gene Tax ID,miRNA,Site Type,UTR_start,UTR end,3prime pairing,local AU,position,TA,SPS,context+ score,context+ score percentile
0,29974,A1CF,NM_001198819,8364,xtr-miR-200b,3,167,174,0.003,0.048,-0.101,0.023,0.096,-0.178,79.0
1,29974,A1CF,NM_001198820,8364,xtr-miR-429,3,167,174,0.003,0.048,-0.101,0.023,0.096,-0.178,79.0
2,29974,A1CF,NM_001198818,8364,xtr-miR-429,3,167,174,0.003,0.048,-0.101,0.023,0.096,-0.178,79.0
3,29974,A1CF,NM_001198820,8364,xtr-miR-200b,3,167,174,0.003,0.048,-0.101,0.023,0.096,-0.178,79.0
4,29974,A1CF,NM_001198819,8364,xtr-miR-429,3,167,174,0.003,0.048,-0.101,0.023,0.096,-0.178,79.0


In [13]:
conserved.shape

(4056351, 15)

In [14]:
# Read the tab-delimited context-score table for non-conserved target sites.
nonconserved_path = 'Input/Nonconserved_Site_Context_Scores.txt'
nonconserved = pd.read_csv(nonconserved_path, sep='\t')

In [15]:
nonconserved.head()

Unnamed: 0,Gene ID,Gene Symbol,Transcript ID,Gene Tax ID,miRNA,Site Type,UTR_start,UTR end,3prime pairing,local AU,position,TA,SPS,context+ score,context+ score percentile
0,1,A1BG,NM_130786,9598,ptr-miR-1197,2,28,34,0.012,0.071,-0.058,0.0,-0.037,-0.132,61.0
1,1,A1BG,NM_130786,9598,ptr-miR-1224-5p,2,30,36,0.003,0.043,-0.058,0.001,-0.039,-0.17,79.0
2,1,A1BG,NM_130786,9598,ptr-miR-198,2,51,57,0.03,0.045,-0.055,0.0,-0.039,-0.139,62.0
3,1,A1BG,NM_130786,9598,ptr-miR-455,2,53,59,0.012,0.087,-0.054,-0.002,-0.034,-0.111,47.0
4,1,A1BG,NM_130786,9598,ptr-miR-575,2,56,62,-0.007,0.115,-0.054,0.003,-0.047,-0.11,46.0


In [17]:
nonconserved.shape

(25847058, 15)

In [18]:
# Stack the conserved and non-conserved site tables (same columns) into one frame.
site_tables = [conserved, nonconserved]
df = pd.concat(site_tables)

In [19]:
df.shape

(29903409, 15)

# Get Relevant Data

In [20]:
# Keep only the relevant species, selected by the 'Gene Tax ID' column:
# 9606 for the human rows and 10090 for the mouse rows.
tax_id = df['Gene Tax ID']

human = df.loc[tax_id == 9606].copy()
mouse = df.loc[tax_id == 10090].copy()

# Human rows first, then mouse rows.
df = pd.concat([human, mouse])

In [21]:
df.shape

(13206058, 15)

In [23]:
# Only the gene / miRNA pairing is needed from here on; drop every other column.
edge_columns = ['Gene Symbol', 'miRNA']
df = df[edge_columns]

In [24]:
df.head()

Unnamed: 0,Gene Symbol,miRNA
894,A1CF,hsa-miR-4711-3p
895,A1CF,hsa-miR-4711-3p
896,A1CF,hsa-miR-4711-3p
897,A1CF,hsa-miR-4711-3p
898,A1CF,hsa-miR-4711-3p


In [25]:
# Collapse repeated (gene, miRNA) rows, keeping the first occurrence of each pair.
df = df.drop_duplicates()

In [26]:
df.shape

(5971589, 2)

# Map Gene Symbols To Up-to-date Approved Gene Symbols

In [27]:
# Move the gene symbols into the index before passing the frame to the symbol-mapping helper.
df = df.set_index('Gene Symbol')

In [28]:
# Map the gene symbols in df to approved symbols.
# NOTE(review): the return value is not captured, so this presumably
# rewrites df (its 'Gene Symbol' index) in place — confirm in untility_functions.
uf.mapgenesymbols(df)

Progeres: 99%  5971563 Out of 5971589   

# Drop Duplicates

In [29]:
# Turn the gene-symbol index back into an ordinary column.
df = df.reset_index()

In [30]:
# Remove any (gene, miRNA) rows that are now duplicated, keeping the first of each.
df = df.drop_duplicates()

In [31]:
df.shape

(5898446, 2)

# Create Binary Matrix

In [32]:
# Build the gene x miRNA matrix from the de-duplicated (gene, miRNA) pairs.
# The displayed result has genes as rows, miRNAs as columns and 0/1 entries.
binary_matrix = uf.createBinaryMatix(df)

Progeres: 100%  18028 Out of 18028   

In [33]:
binary_matrix.head()

Unnamed: 0,hsa-miR-4713-5p,hsa-miR-622,hsa-miR-4771,hsa-miR-2861,hsa-miR-326,mmu-miR-467a,mmu-miR-3098-3p,mmu-miR-466c-3p,hsa-miR-4749-5p,hsa-miR-362-5p,...,hsa-miR-96,hsa-miR-4468,mmu-miR-466p-3p,mmu-miR-125a-5p,mmu-miR-615-5p,hsa-miR-508-3p,mmu-miR-674,hsa-miR-378f,hsa-miR-4634,hsa-miR-4436b-5p
NKAP,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
KRTCAP3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
SLC25A4,1,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
ZNF621,1,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,1
GZMK,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [34]:
binary_matrix.shape

(18028, 2318)

# Save Binary Matrix

In [35]:
# Write the binary matrix as a gzip-compressed TSV, stamped with the current year and month (YYYY_MM).
# The suffix is .gz so that it matches the gzip compression actually applied; the former .zip suffix
# mislabelled the format and broke extension-based detection (zip tools, pandas compression='infer').
filename = '~/./Documents/Harmonizome/TargetScanHuman/Output/targetscanhuman_binary_matrix_%s.tsv.gz' % datetime.date.today().strftime('%Y_%m')
binary_matrix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene Set Library

In [39]:
# Output directory for the gene set library. Resolved from the current user's home directory
# rather than a hard-coded /Users/<name> prefix, so the notebook runs under other accounts.
path = os.path.expanduser('~/Documents/Harmonizome/TargetScanHuman/Output/')

In [40]:
# Name passed to uf.createUpGeneSetLib for the gene set library output.
name = 'targetscanhuman_gene_set'

In [41]:
# Generate the gene set library from binary_matrix, using `path` and `name` for the output.
# NOTE(review): presumably writes one gene set per miRNA column into `path` — confirm in untility_functions.
uf.createUpGeneSetLib(binary_matrix, path, name)

Progeres: 100%  2318 Out of 2318   

# Create Attribute Library

In [42]:
# Output directory for the attribute set library. Resolved from the current user's home directory
# rather than a hard-coded /Users/<name> prefix, so the notebook runs under other accounts.
path = os.path.expanduser('~/Documents/Harmonizome/TargetScanHuman/Output/')

In [43]:
# Name passed to uf.createUpAttributeSetLib for the attribute set library output.
name = 'targetscanhuman_attribute_set'

In [44]:
# Generate the attribute set library from binary_matrix, using `path` and `name` for the output.
# NOTE(review): presumably writes one attribute (miRNA) set per gene row into `path` — confirm in untility_functions.
uf.createUpAttributeSetLib(binary_matrix, path, name)

Progeres: 100%  18028 Out of 18028   

# Create Gene Similarity Matrix

In [36]:
# Gene-by-gene similarity computed from the rows of binary_matrix with the 'jaccard' metric.
gene_similarity_matix = uf.createSimilarityMatrix(binary_matrix, 'jaccard')

In [37]:
gene_similarity_matix.head()

Unnamed: 0,NKAP,KRTCAP3,SLC25A4,ZNF621,GZMK,PPP1R1C,ITGA3,LOXL1,SOD3,AGO1,...,CPZ,HTR2A,STXBP2,NID1,CACNG7,FKBPL,CCK,RBM27,TICRR,C4orf51
NKAP,1.0,0.015152,0.026426,0.032151,0.0,0.029703,0.026915,0.0,0.007874,0.037201,...,0.0,0.03038,0.011494,0.037225,0.025287,0.0,0.037736,0.027211,0.030769,0.007299
KRTCAP3,0.015152,1.0,0.034106,0.033696,0.014706,0.00885,0.027888,0.044444,0.026022,0.032062,...,0.014493,0.036408,0.0,0.029268,0.019694,0.0,0.010929,0.029216,0.031977,0.053333
SLC25A4,0.026426,0.034106,1.0,0.303437,0.020633,0.073548,0.164241,0.05741,0.075795,0.271246,...,0.024793,0.131723,0.002817,0.186002,0.146838,0.002937,0.057641,0.188793,0.093501,0.052486
ZNF621,0.032151,0.033696,0.303437,1.0,0.034292,0.059979,0.139757,0.055734,0.060277,0.315685,...,0.031938,0.170172,0.00444,0.224315,0.132735,0.001143,0.051392,0.2657,0.117362,0.042576
GZMK,0.0,0.014706,0.020633,0.034292,1.0,0.019231,0.01833,0.017751,0.003861,0.028897,...,0.097345,0.0275,0.010989,0.051107,0.015801,0.0,0.0,0.03406,0.005935,0.021583


# Save Gene Similarity Matrix

In [38]:
# Write the gene similarity matrix as a gzip-compressed TSV, stamped with the current year and month (YYYY_MM).
# The suffix is .gz so that it matches the gzip compression actually applied; the former .zip suffix
# mislabelled the format and broke extension-based detection (zip tools, pandas compression='infer').
filename = '~/./Documents/Harmonizome/TargetScanHuman/Output/targetscanhuman_gene_similarity_matix_%s.tsv.gz' % datetime.date.today().strftime('%Y_%m')
gene_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Attribute Similarity Matrix

In [45]:
# Attribute-by-attribute (miRNA) similarity: the binary matrix is transposed so that
# miRNAs become the rows being compared, again with the 'jaccard' metric.
attribute_similarity_matix = uf.createSimilarityMatrix(binary_matrix.T, 'jaccard')

In [46]:
attribute_similarity_matix.head()

Unnamed: 0,hsa-miR-4713-5p,hsa-miR-622,hsa-miR-4771,hsa-miR-2861,hsa-miR-326,mmu-miR-467a,mmu-miR-3098-3p,mmu-miR-466c-3p,hsa-miR-4749-5p,hsa-miR-362-5p,...,hsa-miR-96,hsa-miR-4468,mmu-miR-466p-3p,mmu-miR-125a-5p,mmu-miR-615-5p,hsa-miR-508-3p,mmu-miR-674,hsa-miR-378f,hsa-miR-4634,hsa-miR-4436b-5p
hsa-miR-4713-5p,1.0,0.215744,0.165323,0.217311,0.246228,0.138668,0.210659,0.157135,0.098999,0.167989,...,0.198366,0.219328,0.157135,0.208684,0.14496,0.121212,0.205206,0.185346,0.035311,0.204914
hsa-miR-622,0.215744,1.0,0.159639,0.166518,0.210066,0.138323,0.19731,0.154885,0.080306,0.166667,...,0.18407,0.184438,0.154885,0.167719,0.121468,0.129439,0.195737,0.165456,0.036571,0.186002
hsa-miR-4771,0.165323,0.159639,1.0,0.153671,0.168867,0.132589,0.160905,0.130293,0.085878,0.134865,...,0.156945,0.178025,0.130293,0.14868,0.12171,0.105218,0.156908,0.138108,0.036488,0.15949
hsa-miR-2861,0.217311,0.166518,0.153671,1.0,0.264034,0.111813,0.173158,0.107953,0.132638,0.12384,...,0.157267,0.215891,0.107953,0.192811,0.169108,0.090554,0.168632,0.17855,0.037542,0.148373
hsa-miR-326,0.246228,0.210066,0.168867,0.264034,1.0,0.128372,0.211003,0.149983,0.11488,0.150994,...,0.189721,0.240525,0.149983,0.213394,0.159631,0.112832,0.204211,0.191801,0.034735,0.186689


# Save Attribute Similarity Matrix

In [47]:
# Write the attribute similarity matrix as a gzip-compressed TSV, stamped with the current year and month (YYYY_MM).
# The suffix is .gz so that it matches the gzip compression actually applied; the former .zip suffix
# mislabelled the format and broke extension-based detection (zip tools, pandas compression='infer').
filename = '~/./Documents/Harmonizome/TargetScanHuman/Output/targetscanhuman_attribute_similarity_matix_%s.tsv.gz' % datetime.date.today().strftime('%Y_%m')
attribute_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene List

In [48]:
# Build the gene table for the genes in binary_matrix; the displayed result has
# one row per gene with 'GeneSym' and 'GeneID' columns.
gene_list = uf.createGeneList(binary_matrix)

Progeres: 100%  18028 Out of 18028   

In [49]:
gene_list.head()

Unnamed: 0,GeneSym,GeneID
0,NKAP,79576
1,KRTCAP3,200634
2,SLC25A4,291
3,ZNF621,285268
4,GZMK,3003


In [50]:
gene_list.shape

(18028, 2)

# Save Gene List

In [51]:
# Write the gene list (without the row index) as a gzip-compressed TSV, stamped with the current year and month (YYYY_MM).
# The suffix is .gz so that it matches the gzip compression actually applied; the former .zip suffix
# mislabelled the format and broke extension-based detection (zip tools, pandas compression='infer').
filename = '~/./Documents/Harmonizome/TargetScanHuman/Output/targetscanhuman_gene_list_%s.tsv.gz' % datetime.date.today().strftime('%Y_%m')
gene_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Attribute List

In [52]:
# Build the attribute table for binary_matrix; the displayed result has a single
# 'Attributes' column listing the miRNA names.
attribute_list = uf.createAttributeList(binary_matrix)

In [53]:
attribute_list.head()

Unnamed: 0,Attributes
0,hsa-miR-4713-5p
1,hsa-miR-622
2,hsa-miR-4771
3,hsa-miR-2861
4,hsa-miR-326


In [54]:
attribute_list.shape

(2318, 1)

# Save Attribute List

In [55]:
# Write the attribute list (without the row index) as a gzip-compressed TSV, stamped with the current year and month (YYYY_MM).
# The suffix is .gz so that it matches the gzip compression actually applied; the former .zip suffix
# mislabelled the format and broke extension-based detection (zip tools, pandas compression='infer').
filename = '~/./Documents/Harmonizome/TargetScanHuman/Output/targetscanhuman_attribute_list_%s.tsv.gz' % datetime.date.today().strftime('%Y_%m')
attribute_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Gene-Attribute Edge List

In [56]:
# Output directory for the gene-attribute edge list. Resolved from the current user's home directory
# rather than a hard-coded /Users/<name> prefix, so the notebook runs under other accounts.
path = os.path.expanduser('~/Documents/Harmonizome/TargetScanHuman/Output/')

In [57]:
# Name passed to uf.createGeneAttributeEdgeList for the edge list output.
name = 'targetscanhuman_gene_attribute_edge_list'

In [58]:
# Generate the gene-attribute edge list from binary_matrix and gene_list, using `path` and `name`
# for the output. The helper also prints the number of gene-attribute associations it found.
# NOTE(review): presumably one edge per non-zero matrix entry — confirm in untility_functions.
uf.createGeneAttributeEdgeList(binary_matrix, gene_list, path, name)

Progeres: 100%  2318 Out of 2318   

 The number of statisticaly relevent gene-attribute associations is: 5898446
