# TargetScanHuman

Author: John Erol Evangelista <br/>
Adapted from: https://github.com/MaayanLab/HarmonizomePythonScripts/blob/master/TargetScanHuman/TargetScanHuman.ipynb <br/>
Date: 01-19 <br/>
Data Source: http://www.targetscan.org/vert_72/

In [1]:
import sys, datetime, os
import numpy as np
import pandas as pd
import importlib
if "/Users/maayan/sigsets/Harmonizome/HarmonizomePythonScripts" not in sys.path:
    sys.path.append("/Users/maayan/sigsets/Harmonizome/HarmonizomePythonScripts")
import utility_functions as uf
%matplotlib inline

In [2]:
# Re-import utility_functions so that edits made to the module on disk take
# effect without restarting the kernel.
importlib.reload(uf)

<module 'utility_functions' from '/Users/maayan/sigsets/Harmonizome/HarmonizomePythonScripts/utility_functions.py'>

## Load TSV Data

In [3]:
# Tab-separated context++ score table for conserved sites.
conserved_path = '../../Data/Conserved_Site_Context_Scores.txt'
conserved = pd.read_csv(conserved_path, delimiter="\t")

In [4]:
# Preview the first rows of the conserved-site table.
conserved.head()

Unnamed: 0,Gene ID,Gene Symbol,Transcript ID,Gene Tax ID,miRNA,Site Type,UTR_start,UTR end,context++ score,context++ score percentile,weighted context++ score,weighted context++ score percentile
0,ENSG00000121410.7,A1BG,ENST00000263100.3,9544,mml-miR-23a-3p,3,142,149,-0.428,97,-0.388,97
1,ENSG00000121410.7,A1BG,ENST00000263100.3,9544,mml-miR-23b-3p,3,142,149,-0.428,97,-0.388,97
2,ENSG00000121410.7,A1BG,ENST00000263100.3,9598,ptr-miR-23a,3,143,150,-0.419,97,-0.419,98
3,ENSG00000121410.7,A1BG,ENST00000263100.3,9598,ptr-miR-23b,3,143,150,-0.419,97,-0.419,98
4,ENSG00000121410.7,A1BG,ENST00000263100.3,9598,ptr-miR-23c,3,143,150,-0.419,97,-0.419,98


In [5]:
# (rows, columns) of the conserved-site table.
conserved.shape

(1468777, 12)

In [6]:
# Tab-separated context++ score table for non-conserved sites.
nonconserved_path = '../../Data/Nonconserved_Site_Context_Scores.txt'
nonconserved = pd.read_csv(nonconserved_path, delimiter="\t")

In [7]:
# Preview the first rows of the non-conserved-site table.
nonconserved.head()

Unnamed: 0,Gene ID,Gene Symbol,Transcript ID,Gene Tax ID,miRNA,Site Type,UTR_start,UTR end,context++ score,context++ score percentile,weighted context++ score,weighted context++ score percentile
0,ENSG00000121410.7,A1BG,ENST00000263100.3,13616,mdo-miR-129-3p,2,52,58,-0.278,93,-0.06,66
1,ENSG00000121410.7,A1BG,ENST00000263100.3,13616,mdo-miR-140-3p,2,70,76,-0.209,89,-0.046,65
2,ENSG00000121410.7,A1BG,ENST00000263100.3,13616,mdo-miR-193a-5p,1,155,161,-0.218,86,-0.047,52
3,ENSG00000121410.7,A1BG,ENST00000263100.3,13616,mdo-miR-22-3p,2,30,36,-0.125,70,-0.028,49
4,ENSG00000121410.7,A1BG,ENST00000263100.3,13616,mdo-miR-28,2,33,39,-0.11,72,-0.025,43


In [8]:
# (rows, columns) of the non-conserved-site table.
nonconserved.shape

(38497659, 12)

In [9]:
# Stack the conserved and non-conserved site tables into a single frame.
# ignore_index=True renumbers the rows 0..n-1; without it each input keeps its
# own 0-based labels and the combined index contains duplicate labels.
df = pd.concat([conserved, nonconserved], ignore_index=True)

In [11]:
# Sanity check: the row count should equal the sum of the two input tables.
df.shape

(39966436, 12)

## Filter Data

In [15]:
# Keep only rows whose "Gene Tax ID" is 9606 (NCBI taxonomy ID for human)
# or 10090 (mouse).
wanted_tax_ids = [9606, 10090]
df = df.loc[df["Gene Tax ID"].isin(wanted_tax_ids)]

In [16]:
# Only the gene symbol and the miRNA name are needed for the association list.
df = df.loc[:, ['Gene Symbol', 'miRNA']]

In [17]:
# Preview the first ten (gene symbol, miRNA) pairs that survived the filter.
df.head(10)

Unnamed: 0,Gene Symbol,miRNA
5,A1BG,hsa-miR-23c
6,A1BG,hsa-miR-23b-3p
7,A1BG,hsa-miR-23a-3p
8,A1BG,hsa-miR-130a-5p
101,A1CF,hsa-miR-7-5p
102,A1CF,hsa-miR-30c-5p
103,A1CF,hsa-miR-30b-5p
104,A1CF,hsa-miR-30a-5p
105,A1CF,hsa-miR-30e-5p
106,A1CF,hsa-miR-30d-5p


In [18]:
# Collapse repeated (gene symbol, miRNA) pairs.
# df was produced by slicing a larger frame, so rebind the result instead of
# mutating in place; in-place mutation of a slice-derived frame can trigger
# pandas' SettingWithCopyWarning.
df = df.drop_duplicates()

In [23]:
# (rows, columns) of df.
# NOTE(review): this cell's execution count (23) is higher than those of the
# three cells that follow it (20-22), so the saved output was produced after
# they ran and likely reflects the frame after symbol mapping rather than
# directly after the de-duplication above. Re-run top to bottom to confirm.
df.shape

(15275658, 2)

## Standardize Gene Symbols

In [20]:
# Move the gene symbols into the index ahead of the symbol-mapping helper call.
df = df.set_index('Gene Symbol')

In [21]:
# Run the project's gene-symbol mapping helper on df, which is indexed by
# 'Gene Symbol' at this point.
# The return value is not captured, so the helper presumably updates df in
# place - TODO confirm against utility_functions.
uf.mapgenesymbols_updated(df)

Progress: 99%  15513001 Out of 15513272   

## Drop Duplicates

In [22]:
# Turn the index back into an ordinary column so that de-duplication
# considers it together with the miRNA column.
df = df.reset_index()

In [24]:
# De-duplicate again: rows that were distinct before symbol mapping may have
# become identical afterwards.
df = df.drop_duplicates()

In [26]:
# (rows, columns) of the final de-duplicated association list.
df.shape

(15191473, 2)

## Create Binary Matrix

In [28]:
# Build the binary matrix from the association list using the project helper.
# Judging by the saved preview, rows are gene symbols and columns are miRNAs.
binary_matrix = uf.createBinaryMatrix(df)

Progress: 100%  18375 Out of 18375   

In [29]:
# Preview the first rows of the binary matrix.
binary_matrix.head()

Unnamed: 0,hsa-miR-4298,mmu-miR-6953-5p,hsa-miR-611,mmu-miR-3074-2-3p,mmu-miR-1896,hsa-miR-3658,mmu-miR-291b-3p,mmu-miR-6401,hsa-miR-2115-5p,mmu-miR-1247-3p,...,hsa-miR-4295,hsa-miR-488-3p,mmu-miR-3072-5p,mmu-miR-296-5p,hsa-miR-4508,mmu-miR-6959-5p,hsa-miR-33b-5p,hsa-miR-4637,mmu-miR-7093-3p,mmu-miR-7236-3p
FABP2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TRAPPC5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
FBXW9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
MT1H,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
DNAH17,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,...,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [30]:
# (rows, columns) of the binary matrix.
binary_matrix.shape

(18375, 4543)

In [31]:
# Tag the output file with the current year and month (YYYY_MM).
filename = '../../Output/TargetScan/targetscanhuman_binary_matrix_%s.tsv.zip' % datetime.date.today().strftime('%Y_%m')
# Make sure the output directory exists before the first write.
os.makedirs(os.path.dirname(filename), exist_ok=True)
# The compression must match the .zip extension so that readers which infer
# the format from the file name (pandas' default) can open the file.
binary_matrix.to_csv(filename, sep='\t', compression='zip')

## Create Gene Set Library

In [32]:
# Directory that the remaining cells pass to the helpers and use for output files.
outpath = "../../Output/TargetScan"

In [34]:
# Name handed to the gene set library helper in the next cell.
name = 'targetscanhuman_gene_set'

In [36]:
# Build the gene set library from the binary matrix with the project helper.
# Presumably it writes its result under outpath using name - TODO confirm
# against utility_functions.
uf.createUpGeneSetLib(binary_matrix, outpath, name)

Progress: 100%  4543 Out of 4543   

## Create Attribute Library

In [37]:
# Name handed to the attribute set library helper in the next cell.
# Note that this rebinds the same `name` variable used for the gene set library.
name = 'targetscanhuman_attribute_set'

In [38]:
# Build the attribute set library from the binary matrix with the project
# helper. Presumably it writes its result under outpath using name - TODO
# confirm against utility_functions.
uf.createUpAttributeSetLib(binary_matrix, outpath, name)

Progress: 100%  18375 Out of 18375   

## Create Gene Similarity Matrix

In [39]:
# Compute the similarity matrix for the binary matrix with the project helper,
# requesting the 'jaccard' metric. The saved preview shows a gene-by-gene result.
# NOTE(review): 'matix' is a typo for 'matrix'; the name is kept because
# later cells refer to it.
gene_similarity_matix = uf.createSimilarityMatrix(binary_matrix, 'jaccard')

In [40]:
# Preview the first rows of the gene similarity matrix.
gene_similarity_matix.head()

Unnamed: 0,FABP2,TRAPPC5,FBXW9,MT1H,DNAH17,JHY,NSMCE2,LYG2,SFI1,TADA2B,...,FAM167A,STOX2,STX5,MPHOSPH10,MGAT5B,PRKCZ,SLC41A1,OR7A17,PDE3B,CSK
,,,,,,,,,,,,,,,,,,,,,
FABP2,1.0,0.001595,0.015056,0.065097,0.16233,0.045699,0.077934,0.02568,0.098756,0.125633,...,0.113428,0.148023,0.063256,0.014173,0.089385,0.036295,0.101266,0.004862,0.174312,0.026804
TRAPPC5,0.001595,1.0,0.008032,0.0,0.013541,0.0,0.0,0.0,0.010707,0.008485,...,0.008658,0.005654,0.003431,0.0,0.013145,0.0,0.00968,0.0,0.004603,0.006897
FBXW9,0.015056,0.008032,1.0,0.020833,0.061354,0.012626,0.03629,0.023729,0.055347,0.050085,...,0.064005,0.04581,0.04502,0.0,0.076433,0.034404,0.073324,0.0,0.033244,0.045608
MT1H,0.065097,0.0,0.020833,1.0,0.069191,0.014045,0.04578,0.003831,0.067913,0.050029,...,0.049763,0.038279,0.056769,0.004425,0.053456,0.059278,0.053483,0.015,0.050083,0.026596
DNAH17,0.16233,0.013541,0.061354,0.069191,1.0,0.034548,0.133146,0.014417,0.184316,0.250405,...,0.233562,0.279647,0.118399,0.004648,0.218766,0.055348,0.240749,0.009492,0.248721,0.083672


In [41]:
# Tag the output file with the current year and month (YYYY_MM).
# NOTE(review): 'matix' in the file name is a typo for 'matrix'; it is left
# unchanged here in case other tooling expects this exact name.
filename = outpath + '/targetscanhuman_gene_similarity_matix_%s.tsv.zip' % datetime.date.today().strftime('%Y_%m')
# The compression must match the .zip extension so that readers which infer
# the format from the file name (pandas' default) can open the file.
gene_similarity_matix.to_csv(filename, sep='\t', compression='zip')

## Create Attribute Similarity Matrix

In [42]:
# Same similarity helper, applied to the transposed binary matrix, again with
# the 'jaccard' metric. The saved preview shows a miRNA-by-miRNA result.
# NOTE(review): 'matix' is a typo for 'matrix'; the name is kept because
# later cells refer to it.
attribute_similarity_matix = uf.createSimilarityMatrix(binary_matrix.T, 'jaccard')

In [43]:
# Preview the first rows of the attribute similarity matrix.
attribute_similarity_matix.head()

Unnamed: 0,hsa-miR-4298,mmu-miR-6953-5p,hsa-miR-611,mmu-miR-3074-2-3p,mmu-miR-1896,hsa-miR-3658,mmu-miR-291b-3p,mmu-miR-6401,hsa-miR-2115-5p,mmu-miR-1247-3p,...,hsa-miR-4295,hsa-miR-488-3p,mmu-miR-3072-5p,mmu-miR-296-5p,hsa-miR-4508,mmu-miR-6959-5p,hsa-miR-33b-5p,hsa-miR-4637,mmu-miR-7093-3p,mmu-miR-7236-3p
,,,,,,,,,,,,,,,,,,,,,
hsa-miR-4298,1.0,0.17194,0.07141,0.160169,0.198211,0.20643,0.145427,0.141212,0.211495,0.142768,...,0.184932,0.208005,0.158399,0.1378,0.113821,0.193702,0.185986,0.134868,0.175377,0.075914
mmu-miR-6953-5p,0.17194,1.0,0.067615,0.144708,0.187177,0.148363,0.144886,0.121739,0.168223,0.146718,...,0.141246,0.158027,0.15844,0.173222,0.117306,0.179307,0.139625,0.093471,0.164676,0.077135
hsa-miR-611,0.07141,0.067615,1.0,0.052809,0.064824,0.052476,0.052695,0.04494,0.069379,0.058053,...,0.056794,0.058701,0.075596,0.073551,0.07783,0.065951,0.056696,0.044051,0.055376,0.047733
mmu-miR-3074-2-3p,0.160169,0.144708,0.052809,1.0,0.221541,0.245942,0.187062,0.187593,0.186076,0.147437,...,0.185998,0.221582,0.126413,0.100103,0.070282,0.214911,0.208782,0.160187,0.220678,0.083162
mmu-miR-1896,0.198211,0.187177,0.064824,0.221541,1.0,0.262511,0.191776,0.187143,0.2377,0.174196,...,0.200212,0.254023,0.16036,0.152143,0.095481,0.244849,0.227002,0.154418,0.24834,0.088485


In [45]:
# Tag the output file with the current year and month (YYYY_MM).
# NOTE(review): 'matix' in the file name is a typo for 'matrix'; it is left
# unchanged here in case other tooling expects this exact name.
filename = outpath + '/targetscanhuman_attribute_similarity_matix_%s.tsv.zip' % datetime.date.today().strftime('%Y_%m')
# The compression must match the .zip extension so that readers which infer
# the format from the file name (pandas' default) can open the file.
attribute_similarity_matix.to_csv(filename, sep='\t', compression='zip')

## Create Gene List

In [46]:
# Derive the gene list from the binary matrix with the project helper.
# The saved preview shows one row per gene with GeneSym and GeneID columns.
gene_list = uf.createGeneList_updated(binary_matrix)

Progress: 100%  18375 Out of 18375   

In [47]:
# Preview the first rows of the gene list.
gene_list.head()

Unnamed: 0,GeneSym,GeneID
0,FABP2,2169
1,TRAPPC5,126003
2,FBXW9,84261
3,MT1H,4496
4,DNAH17,8632


In [48]:
# (rows, columns) of the gene list.
gene_list.shape

(18375, 2)

In [49]:
# Tag the output file with the current year and month (YYYY_MM).
filename = outpath + '/targetscanhuman_gene_list_%s.tsv.zip' % datetime.date.today().strftime('%Y_%m')
# The compression must match the .zip extension so that readers which infer
# the format from the file name (pandas' default) can open the file.
# The row index is a plain counter here, so it is not written.
gene_list.to_csv(filename, sep='\t', index=False, compression='zip')

## Create Attribute List

In [50]:
# Derive the attribute list from the binary matrix with the project helper.
# The saved outputs below show a frame with zero columns whose index holds
# the miRNA names.
attribute_list = uf.createAttributeList(binary_matrix)

In [51]:
# Preview the first entries of the attribute list.
attribute_list.head()

hsa-miR-4298
mmu-miR-6953-5p
hsa-miR-611
mmu-miR-3074-2-3p
mmu-miR-1896


In [52]:
# (rows, columns) of the attribute list. A column count of 0 means the
# attribute names are stored only in the index.
attribute_list.shape

(4543, 0)

In [53]:
# Tag the output file with the current year and month (YYYY_MM).
filename = outpath + '/targetscanhuman_attribute_list_%s.tsv.zip' % datetime.date.today().strftime('%Y_%m')
# attribute_list has zero columns (shape (4543, 0)): the miRNA names exist
# only in its index. Writing with index=False would therefore drop every
# name, so the index must be written.
# The compression must match the .zip extension so that readers which infer
# the format from the file name (pandas' default) can open the file.
attribute_list.to_csv(filename, sep='\t', index=True, compression='zip')

## Create Gene-Attribute Edge List

In [54]:
# Name handed to the gene-attribute edge list helper in the next cell.
# Note that this rebinds the same `name` variable used by the earlier library cells.
name = 'targetscanhuman_gene_attribute_edge_list'

In [57]:
# Build the gene-attribute edge list with the project helper from the binary
# matrix plus the in-memory attribute and gene lists. Presumably it writes its
# result under outpath using name - TODO confirm against utility_functions.
uf.createGeneAttributeEdgeList(binary_matrix, attribute_list, gene_list, outpath, name)

Progress: 100%  4543 Out of 4543   

 The number of statisticaly relevent gene-attribute associations is: 15191473
