# miRTarBase

Author: Moshe Silverstein <br/>
Date: 11-17 <br/>
Data Source: http://mirtarbase.mbc.nctu.edu.tw/php/index.php

In [1]:
import sys, datetime, os
import numpy as np
import pandas as pd
import importlib
import untility_functions as uf
%matplotlib inline

In [2]:
# Re-import the local helper module so that edits made to it are picked up
# without restarting the kernel.
importlib.reload(uf)

<module 'untility_functions' from '/Users/moshesilverstein/Documents/Harmonizome/miTarBase/untility_functions.py'>

# Load Data

In [4]:
# Load the miRTarBase interaction table from the Excel file kept under
# Input/ (path is relative to the notebook's working directory).
df = pd.read_excel('Input/miRTarBase_MTI.xlsx')

In [6]:
# Preview the first rows to confirm the column layout of the raw table.
df.head()

Unnamed: 0,miRTarBase ID,miRNA,Species (miRNA),Target Gene,Target Gene (Entrez ID),Species (Target Gene),Experiments,Support Type,References (PMID)
0,MIRT002091,ath-miR398c-3p,Arabidopsis thaliana,CSD2,817365,Arabidopsis thaliana,Western blot,Functional MTI,18392778
1,MIRT002091,ath-miR398c-3p,Arabidopsis thaliana,CSD2,817365,Arabidopsis thaliana,"5""RACE//Northern blot",Functional MTI (Weak),20400846
2,MIRT002092,ath-miR398b-3p,Arabidopsis thaliana,CSD2,817365,Arabidopsis thaliana,Western blot,Functional MTI,18392778
3,MIRT002092,ath-miR398b-3p,Arabidopsis thaliana,CSD2,817365,Arabidopsis thaliana,"5""RACE//Northern blot",Functional MTI (Weak),20400846
4,MIRT002093,ath-miR398c-3p,Arabidopsis thaliana,CSD1,837405,Arabidopsis thaliana,Western blot,Functional MTI,18392778


In [7]:
# Size of the raw table (rows, columns) before any filtering.
df.shape

(557182, 9)

# Get Relevant Data

In [8]:
# Keep only the interactions whose miRNA comes from human or mouse.
# Human rows are stacked first, followed by mouse rows.
human, mouse = (
    df.loc[df['Species (miRNA)'].eq(organism)].copy()
    for organism in ('Homo sapiens', 'Mus musculus')
)

df = pd.concat([human, mouse])

In [9]:
# Row count after keeping only human and mouse miRNAs.
df.shape

(552486, 9)

In [10]:
# Keep only the interactions whose target gene belongs to human or mouse.
# Human rows are stacked first, followed by mouse rows.
human, mouse = (
    df.loc[df['Species (Target Gene)'].eq(organism)].copy()
    for organism in ('Homo sapiens', 'Mus musculus')
)

df = pd.concat([human, mouse])

In [11]:
# Row count after also requiring a human or mouse target gene.
df.shape

(552482, 9)

In [12]:
# Only the miRNA name and its target gene symbol are needed downstream;
# every other column is dropped here.
df = df[['miRNA', 'Target Gene']] 

In [13]:
# Preview the reduced two-column table.
df.head()

Unnamed: 0,miRNA,Target Gene
4014,hsa-miR-20a-5p,HIF1A
4015,hsa-miR-20a-5p,HIF1A
4016,hsa-miR-20a-5p,HIF1A
4017,hsa-miR-146a-5p,CXCR4
4018,hsa-miR-146a-5p,CXCR4


In [14]:
# Collapse repeated (miRNA, Target Gene) rows so each pair appears once.
df = df.drop_duplicates()

In [15]:
# Number of unique (miRNA, Target Gene) pairs.
df.shape

(421320, 2)

# Map Gene Symbols To Up-to-date Approved Gene Symbols

In [16]:
# Index the table by target gene symbol ahead of the symbol-mapping step.
df = df.set_index('Target Gene')

In [17]:
# Map the gene symbols in the index to up-to-date approved symbols.
# The return value is not captured, so the helper presumably modifies df
# in place -- TODO confirm against untility_functions.mapgenesymbols.
uf.mapgenesymbols(df)

Progeres: 100%  421320 Out of 421320   

# Drop Duplicates

In [18]:
# Move the gene symbols from the index back into a regular column.
df = df.reset_index()

In [19]:
# Remove rows that are exact duplicates now that the gene symbols have been
# remapped.
df = df.drop_duplicates()

In [20]:
# Pair count after symbol mapping and the second de-duplication.
df.shape

(417884, 2)

# Create Binary Matrix

In [21]:
# Build the binary association matrix from the de-duplicated pairs.
# Judging by the preview that follows, rows are gene symbols and columns are
# miRNAs -- confirm the exact construction in the helper.
binary_matrix = uf.createBinaryMatix(df)

Progeres: 100%  15575 Out of 15575   

In [22]:
# Preview the first rows of the binary matrix.
binary_matrix.head()

Unnamed: 0,hsa-miR-507,hsa-miR-6723-5p,mmu-miR-7222-3p,hsa-miR-3934-3p,hsa-miR-4657,hsa-miR-130b-5p,mmu-miR-1193-5p,hsa-miR-6782-5p,hsa-miR-190b,hsa-miR-1286,...,hsa-miR-6886-3p,hsa-miR-1226-3p,hsa-miR-513a-5p,hsa-miR-6767-5p,mmu-miR-374c-5p,hsa-miR-126-5p,hsa-miR-3944-5p,hsa-miR-548ay-5p,hsa-miR-4322,mmu-miR-935
PABPN1L,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
GJB7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
DEFB105B,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
NME2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CALN1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
# Matrix dimensions (rows, columns).
binary_matrix.shape

(15575, 3551)

# Save Binary Matrix

In [24]:
# Save the binary matrix as a tab-separated table. The file name carries a
# year_month stamp (YYYY_MM) taken from today's date.
# The name ends in '.zip', so the data is written as a real zip archive:
# writing gzip bytes under a '.zip' name breaks any reader that infers the
# compression from the file extension.
filename = '~/./Documents/Harmonizome/miTarBase/Output/mirtarbase_binary_matrix_%s.tsv.zip' % datetime.date.today().strftime('%Y_%m')
binary_matrix.to_csv(filename, sep='\t', compression='zip')

# Create Gene Set Library

In [25]:
# Output directory for the gene set library. It is resolved from the current
# user's home directory instead of a machine-specific absolute path; the
# trailing separator of the original value is preserved.
path = os.path.expanduser('~/Documents/Harmonizome/miTarBase/Output/')

In [26]:
# Name handed to the gene set library writer in the next cell.
name = 'mirtarbase_gene_set'

In [27]:
# Generate the gene set library from the binary matrix. `path` and `name`
# are presumably the output directory and the output file stem -- confirm
# in the helper; nothing is returned to the notebook.
uf.createUpGeneSetLib(binary_matrix, path, name)

Progeres: 100%  3551 Out of 3551   

# Create Attribute Library

In [28]:
# Output directory for the attribute set library. It is resolved from the
# current user's home directory instead of a machine-specific absolute path;
# the trailing separator of the original value is preserved.
path = os.path.expanduser('~/Documents/Harmonizome/miTarBase/Output/')

In [29]:
# Name handed to the attribute set library writer in the next cell.
name = 'mirtarbase_attribute_set'

In [30]:
# Generate the attribute set library from the binary matrix. `path` and
# `name` are presumably the output directory and the output file stem --
# confirm in the helper; nothing is returned to the notebook.
uf.createUpAttributeSetLib(binary_matrix, path, name)

Progeres: 100%  15575 Out of 15575   

# Create Gene Similarity Matrix

In [31]:
# Compute a similarity matrix from the binary matrix using the 'jaccard'
# metric. The preview that follows is gene-by-gene, so the comparison is
# presumably between rows -- confirm in the helper.
gene_similarity_matix = uf.createSimilarityMatrix(binary_matrix, 'jaccard')

In [32]:
# Preview the first rows of the gene similarity matrix.
gene_similarity_matix.head()

Unnamed: 0,PABPN1L,GJB7,DEFB105B,NME2,CALN1,CLDN14,TTC22,CCDC120,MT-ND1,SMG5,...,EIF1,SPRR2C,ZBTB37,TCAM1P,CBLN3,IGLC1,TXNL4A,ECE1,HARS,ADGRF2
PABPN1L,1.0,0.0,0.0,0.0,0.0,0.0,0.076923,0.03125,0.0,0.0,...,0.0,0.0,0.011834,0.0,0.0,0.0,0.02381,0.0,0.027027,0.0
GJB7,0.0,1.0,0.0,0.032258,0.0,0.0,0.0,0.0,0.0,0.0,...,0.034483,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
DEFB105B,0.0,0.0,1.0,0.029851,0.015385,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.005,0.0,0.0,0.0,0.0,0.022727,0.0,0.025316
NME2,0.0,0.032258,0.029851,1.0,0.0,0.0,0.037736,0.045455,0.0,0.0,...,0.026316,0.0,0.010989,0.0,0.0,0.0,0.0,0.053571,0.0,0.015873
CALN1,0.0,0.0,0.015385,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.013514,0.0,0.005556,0.0,0.0,0.0,0.0,0.026786,0.0,0.0


# Save Gene Similarity Matrix

In [33]:
# Save the gene similarity matrix as a tab-separated table. The file name
# carries a year_month stamp (YYYY_MM) taken from today's date.
# The name ends in '.zip', so the data is written as a real zip archive:
# writing gzip bytes under a '.zip' name breaks any reader that infers the
# compression from the file extension.
filename = '~/./Documents/Harmonizome/miTarBase/Output/mirtarbase_gene_similarity_matix_%s.tsv.zip' % datetime.date.today().strftime('%Y_%m')
gene_similarity_matix.to_csv(filename, sep='\t', compression='zip')

# Create Attribute Similarity Matrix

In [34]:
# Same similarity computation on the transposed binary matrix, so the
# result compares the attributes (miRNAs) with each other instead of the
# genes.
attribute_similarity_matix = uf.createSimilarityMatrix(binary_matrix.T, 'jaccard')

In [35]:
# Preview the first rows of the attribute similarity matrix.
attribute_similarity_matix.head()

Unnamed: 0,hsa-miR-507,hsa-miR-6723-5p,mmu-miR-7222-3p,hsa-miR-3934-3p,hsa-miR-4657,hsa-miR-130b-5p,mmu-miR-1193-5p,hsa-miR-6782-5p,hsa-miR-190b,hsa-miR-1286,...,hsa-miR-6886-3p,hsa-miR-1226-3p,hsa-miR-513a-5p,hsa-miR-6767-5p,mmu-miR-374c-5p,hsa-miR-126-5p,hsa-miR-3944-5p,hsa-miR-548ay-5p,hsa-miR-4322,mmu-miR-935
hsa-miR-507,1.0,0.00495,0.0,0.0,0.011673,0.019268,0.0,0.003559,0.004032,0.010453,...,0.0,0.009756,0.007673,0.0,0.0,0.016611,0.0033,0.034853,0.016064,0.0
hsa-miR-6723-5p,0.00495,1.0,0.0,0.009259,0.0,0.002801,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.012346,0.0
mmu-miR-7222-3p,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
hsa-miR-3934-3p,0.0,0.009259,0.0,1.0,0.012195,0.013986,0.0,0.010753,0.019737,0.020833,...,0.006329,0.006289,0.016949,0.0,0.0,0.004739,0.014493,0.010381,0.032468,0.0
hsa-miR-4657,0.011673,0.0,0.0,0.012195,1.0,0.007282,0.0,0.012048,0.015038,0.005714,...,0.0,0.003344,0.01083,0.012987,0.0,0.021277,0.016043,0.014925,0.0,0.0


# Save Attribute Similarity Matrix

In [36]:
# Save the attribute similarity matrix as a tab-separated table. The file
# name carries a year_month stamp (YYYY_MM) taken from today's date.
# The name ends in '.zip', so the data is written as a real zip archive:
# writing gzip bytes under a '.zip' name breaks any reader that infers the
# compression from the file extension.
filename = '~/./Documents/Harmonizome/miTarBase/Output/mirtarbase_attribute_similarity_matix_%s.tsv.zip' % datetime.date.today().strftime('%Y_%m')
attribute_similarity_matix.to_csv(filename, sep='\t', compression='zip')

# Create Gene List

In [37]:
# Build the gene list from the binary matrix. The preview that follows
# shows a GeneSym column and a GeneID column; how the IDs are looked up is
# defined in the helper.
gene_list = uf.createGeneList(binary_matrix)

Progeres: 100%  15575 Out of 15575   

In [38]:
# Preview the first rows of the gene list.
gene_list.head()

Unnamed: 0,GeneSym,GeneID
0,PABPN1L,390748
1,GJB7,375519
2,DEFB105B,504180
3,NME2,4831
4,CALN1,83698


In [39]:
# Size of the gene list (rows, columns).
gene_list.shape

(15575, 2)

# Save Gene List

In [40]:
# Save the gene list as a tab-separated table without the row index. The
# file name carries a year_month stamp (YYYY_MM) taken from today's date.
# The name ends in '.zip', so the data is written as a real zip archive:
# writing gzip bytes under a '.zip' name breaks any reader that infers the
# compression from the file extension.
filename = '~/./Documents/Harmonizome/miTarBase/Output/mirtarbase_gene_list_%s.tsv.zip' % datetime.date.today().strftime('%Y_%m')
gene_list.to_csv(filename, sep='\t', index=False, compression='zip')

# Create Attribute List

In [41]:
# Build the attribute list from the binary matrix. The preview that follows
# shows a single Attributes column holding miRNA names.
attribute_list = uf.createAttributeList(binary_matrix)

In [42]:
# Preview the first rows of the attribute list.
attribute_list.head()

Unnamed: 0,Attributes
0,hsa-miR-507
1,hsa-miR-6723-5p
2,mmu-miR-7222-3p
3,hsa-miR-3934-3p
4,hsa-miR-4657


In [43]:
# Size of the attribute list (rows, columns).
attribute_list.shape

(3551, 1)

# Save Attribute List

In [44]:
# Save the attribute list as a tab-separated table without the row index.
# The file name carries a year_month stamp (YYYY_MM) taken from today's date.
# The name ends in '.zip', so the data is written as a real zip archive:
# writing gzip bytes under a '.zip' name breaks any reader that infers the
# compression from the file extension.
filename = '~/./Documents/Harmonizome/miTarBase/Output/mirtarbase_attribute_list_%s.tsv.zip' % datetime.date.today().strftime('%Y_%m')
attribute_list.to_csv(filename, sep='\t', index=False, compression='zip')

# Create Gene-Attribute Edge List

In [45]:
# Output directory for the gene-attribute edge list. It is resolved from the
# current user's home directory instead of a machine-specific absolute path;
# the trailing separator of the original value is preserved.
path = os.path.expanduser('~/Documents/Harmonizome/miTarBase/Output/')

In [46]:
# Name handed to the edge list writer in the next cell.
name = 'mirtarbase_gene_attribute_edge_list'

In [47]:
# Generate the gene-attribute edge list from the binary matrix and the gene
# list. `path` and `name` are presumably the output directory and the output
# file stem -- confirm in the helper. The helper prints the number of
# associations it found.
uf.createGeneAttributeEdgeList(binary_matrix, gene_list, path, name)

Progeres: 100%  3551 Out of 3551   

 The number of statisticaly relevent gene-attribute associations is: 417884
