# BioGPS (Human Tissue and Cell Type)

Author: Moshe Silverstein <br/>
Date: 8-17 <br/>
Data Source: http://biogps.org/#goto=welcome

In [1]:
import sys, datetime, os
import numpy as np
import pandas as pd
import importlib
import untility_functions as uf
%matplotlib inline

In [2]:
importlib.reload(uf)

<module 'untility_functions' from '/Users/moshesilverstein/Documents/Harmonizome/BioGPS/untility_functions.py'>

# Load Data

In [3]:
matrix = pd.read_csv('/Users/moshesilverstein/Documents/Harmonizome/BioGPS/Human U133A-GNF1H Gene Atlas/U133AGNF1B.gcrma.csv', sep=',')

In [4]:
matrix.head()

Unnamed: 0.1,Unnamed: 0,721_B_lymphoblasts,721_B_lymphoblasts.1,Adipocyte,Adipocyte.1,AdrenalCortex,AdrenalCortex.1,Adrenalgland,Adrenalgland.1,Amygdala,...,pineal_night.1,pineal_night.2,pineal_night.3,pineal_night.4,retina,retina.1,retina.2,retina.3,small_intestine,small_intestine.1
0,1007_s_at,167.8,106.2,31.6,30.6,61.2,45.4,98.0,64.4,401.6,...,543.3,182.5,134.4,117.3,1307.4,1530.6,335.0,349.7,134.2,202.1
1,1053_at,69.7,93.8,7.9,6.8,9.6,8.2,6.8,7.6,6.4,...,7.8,9.4,8.2,7.4,7.0,9.7,11.2,11.7,7.1,9.9
2,117_at,12.6,12.5,10.7,10.5,12.3,15.7,9.5,10.2,6.6,...,11.9,13.2,11.9,10.9,12.7,12.5,13.2,13.5,8.0,11.0
3,121_at,14.9,11.1,13.1,16.0,16.2,12.0,15.5,11.9,10.5,...,17.0,14.9,19.6,14.7,18.4,30.4,17.7,17.5,7.7,9.5
4,1255_g_at,5.5,5.4,4.8,4.2,5.7,4.3,3.8,4.1,3.3,...,545.1,590.7,37.0,158.9,4.3,5.4,2293.6,2155.1,3.7,5.2


In [5]:
matrix.shape

(44775, 177)

# Load Gene Data

In [6]:
matrix['Unnamed: 0'].to_csv('probes.txt', sep='\t', header=False, index=False)

In [7]:
%%capture
%system r -f MapToGenesU133A.R;

In [8]:
map_to_genes_U133A = pd.read_csv('ProbesToGenesU133A.tsv', sep='\t', header=None)

In [9]:
# keep=False discards every row whose probe ID (column 0) occurs more than
# once, so only probe IDs that appear exactly once remain and the index
# built from this column in the following cell is unique.
map_to_genes_U133A.drop_duplicates(subset=[0], keep=False, inplace=True)

In [10]:
map_to_genes_U133A.set_index(0, inplace=True)

In [11]:
map_to_genes_U133A.head()

Unnamed: 0_level_0,1,2,3
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1053_at,RFC2,5982.0,replication factor C subunit 2
117_at,HSPA6,3310.0,heat shock protein family A (Hsp70) member 6
121_at,PAX8,7849.0,paired box 8
1255_g_at,GUCA1A,2978.0,guanylate cyclase activator 1A
1316_at,THRA,7067.0,"thyroid hormone receptor, alpha"


In [12]:
map_to_genes_U133A.shape

(43519, 3)

In [13]:
map_to_genes_GNF1H = pd.read_csv('Chip Annotation/gnf1h.annot2007.tsv', sep='\t')

In [14]:
# keep=False discards every row whose 'ProbesetID' occurs more than once,
# so only probe set IDs that appear exactly once remain and the index built
# from this column in the following cell is unique.
map_to_genes_GNF1H.drop_duplicates(subset=['ProbesetID'], keep=False, inplace=True)

In [15]:
map_to_genes_GNF1H.set_index('ProbesetID', inplace=True)

In [16]:
map_to_genes_GNF1H.head()

Unnamed: 0_level_0,Num_matched_probes,RefSeq,UniGene,RIKEN,EntrezGene,Symbol,Description,Ensembl_representative,Cross_hyb_EntrezGeneIDs
ProbesetID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
gnf1h00001_x_at,,,,,,CD36,"Human CD36 antigen (collagen type I receptor, ...",,
gnf1h00002_x_at,,,,,,ICAM1,Human intercellular adhesion molecule 1 (CD54)...,,
gnf1h00003_x_at,,,,,,CDKN1A,Human cyclin-dependent kinase inhibitor 1A (p2...,,
gnf1h00004_x_at,16.0,NM_000595,Hs.36,,4049.0,LTA,"lymphotoxin alpha (TNF superfamily, member 1)",,
gnf1h00005_x_at,,,,,,ACTA1,"Human actin, alpha 1, skeletal muscle (ACTA1),...",,


In [17]:
map_to_genes_GNF1H.shape

(22558, 9)

# Map Gene Symbol to Probe ID

In [18]:
# Translate each probe ID in the first column into a gene symbol.
#
# Precedence: the U133A annotation (symbol stored in column 1) is consulted
# first; probes it does not resolve (absent, or present with a missing
# symbol) fall back to the 'Symbol' column of the GNF1H annotation. Probes
# resolved by neither table become NaN.
#
# Series.map performs the label lookup for the whole column at once. It
# replaces the row-by-row `.ix` lookups, which no longer exist in pandas
# (`.ix` was deprecated in 0.20 and removed in 1.0). Both annotation tables
# were de-duplicated on probe ID before being indexed, which satisfies the
# unique-index requirement of mapping through a Series.
probe_ids = matrix['Unnamed: 0']
u133a_symbols = probe_ids.map(map_to_genes_U133A[1])
gnf1h_symbols = probe_ids.map(map_to_genes_GNF1H['Symbol'])

# fillna only touches positions where the U133A lookup produced NaN, which
# keeps U133A as the preferred source.
matrix['Unnamed: 0'] = u133a_symbols.fillna(gnf1h_symbols)
        

In [19]:
matrix.head()

Unnamed: 0.1,Unnamed: 0,721_B_lymphoblasts,721_B_lymphoblasts.1,Adipocyte,Adipocyte.1,AdrenalCortex,AdrenalCortex.1,Adrenalgland,Adrenalgland.1,Amygdala,...,pineal_night.1,pineal_night.2,pineal_night.3,pineal_night.4,retina,retina.1,retina.2,retina.3,small_intestine,small_intestine.1
0,,167.8,106.2,31.6,30.6,61.2,45.4,98.0,64.4,401.6,...,543.3,182.5,134.4,117.3,1307.4,1530.6,335.0,349.7,134.2,202.1
1,RFC2,69.7,93.8,7.9,6.8,9.6,8.2,6.8,7.6,6.4,...,7.8,9.4,8.2,7.4,7.0,9.7,11.2,11.7,7.1,9.9
2,HSPA6,12.6,12.5,10.7,10.5,12.3,15.7,9.5,10.2,6.6,...,11.9,13.2,11.9,10.9,12.7,12.5,13.2,13.5,8.0,11.0
3,PAX8,14.9,11.1,13.1,16.0,16.2,12.0,15.5,11.9,10.5,...,17.0,14.9,19.6,14.7,18.4,30.4,17.7,17.5,7.7,9.5
4,GUCA1A,5.5,5.4,4.8,4.2,5.7,4.3,3.8,4.1,3.3,...,545.1,590.7,37.0,158.9,4.3,5.4,2293.6,2155.1,3.7,5.2


In [20]:
matrix.shape

(44775, 177)

In [21]:
matrix.dropna(subset=['Unnamed: 0'], inplace=True)

In [22]:
matrix.head()

Unnamed: 0.1,Unnamed: 0,721_B_lymphoblasts,721_B_lymphoblasts.1,Adipocyte,Adipocyte.1,AdrenalCortex,AdrenalCortex.1,Adrenalgland,Adrenalgland.1,Amygdala,...,pineal_night.1,pineal_night.2,pineal_night.3,pineal_night.4,retina,retina.1,retina.2,retina.3,small_intestine,small_intestine.1
1,RFC2,69.7,93.8,7.9,6.8,9.6,8.2,6.8,7.6,6.4,...,7.8,9.4,8.2,7.4,7.0,9.7,11.2,11.7,7.1,9.9
2,HSPA6,12.6,12.5,10.7,10.5,12.3,15.7,9.5,10.2,6.6,...,11.9,13.2,11.9,10.9,12.7,12.5,13.2,13.5,8.0,11.0
3,PAX8,14.9,11.1,13.1,16.0,16.2,12.0,15.5,11.9,10.5,...,17.0,14.9,19.6,14.7,18.4,30.4,17.7,17.5,7.7,9.5
4,GUCA1A,5.5,5.4,4.8,4.2,5.7,4.3,3.8,4.1,3.3,...,545.1,590.7,37.0,158.9,4.3,5.4,2293.6,2155.1,3.7,5.2
6,THRA,9.2,10.3,9.8,8.5,14.4,11.6,7.8,8.1,6.5,...,12.2,11.6,10.4,9.8,8.8,11.7,12.5,12.5,8.6,10.7


In [23]:
matrix.shape

(32900, 177)

In [24]:
matrix.rename(columns={'Unnamed: 0':'Genes'}, inplace=True)

In [25]:
matrix.set_index('Genes', inplace=True)

In [26]:
matrix.head()

Unnamed: 0_level_0,721_B_lymphoblasts,721_B_lymphoblasts.1,Adipocyte,Adipocyte.1,AdrenalCortex,AdrenalCortex.1,Adrenalgland,Adrenalgland.1,Amygdala,Amygdala.1,...,pineal_night.1,pineal_night.2,pineal_night.3,pineal_night.4,retina,retina.1,retina.2,retina.3,small_intestine,small_intestine.1
Genes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
RFC2,69.7,93.8,7.9,6.8,9.6,8.2,6.8,7.6,6.4,9.6,...,7.8,9.4,8.2,7.4,7.0,9.7,11.2,11.7,7.1,9.9
HSPA6,12.6,12.5,10.7,10.5,12.3,15.7,9.5,10.2,6.6,13.7,...,11.9,13.2,11.9,10.9,12.7,12.5,13.2,13.5,8.0,11.0
PAX8,14.9,11.1,13.1,16.0,16.2,12.0,15.5,11.9,10.5,18.1,...,17.0,14.9,19.6,14.7,18.4,30.4,17.7,17.5,7.7,9.5
GUCA1A,5.5,5.4,4.8,4.2,5.7,4.3,3.8,4.1,3.3,6.4,...,545.1,590.7,37.0,158.9,4.3,5.4,2293.6,2155.1,3.7,5.2
THRA,9.2,10.3,9.8,8.5,14.4,11.6,7.8,8.1,6.5,12.8,...,12.2,11.6,10.4,9.8,8.8,11.7,12.5,12.5,8.6,10.7


# Fix Sample Columns

In [27]:
# Replicate columns arrive with a '.N' suffix (e.g. 'retina.1'); keep only
# the text before the first '.' so replicates share one sample name.
matrix.columns = [label.split('.')[0] for label in matrix.columns]

In [28]:
matrix.head()

Unnamed: 0_level_0,721_B_lymphoblasts,721_B_lymphoblasts,Adipocyte,Adipocyte,AdrenalCortex,AdrenalCortex,Adrenalgland,Adrenalgland,Amygdala,Amygdala,...,pineal_night,pineal_night,pineal_night,pineal_night,retina,retina,retina,retina,small_intestine,small_intestine
Genes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
RFC2,69.7,93.8,7.9,6.8,9.6,8.2,6.8,7.6,6.4,9.6,...,7.8,9.4,8.2,7.4,7.0,9.7,11.2,11.7,7.1,9.9
HSPA6,12.6,12.5,10.7,10.5,12.3,15.7,9.5,10.2,6.6,13.7,...,11.9,13.2,11.9,10.9,12.7,12.5,13.2,13.5,8.0,11.0
PAX8,14.9,11.1,13.1,16.0,16.2,12.0,15.5,11.9,10.5,18.1,...,17.0,14.9,19.6,14.7,18.4,30.4,17.7,17.5,7.7,9.5
GUCA1A,5.5,5.4,4.8,4.2,5.7,4.3,3.8,4.1,3.3,6.4,...,545.1,590.7,37.0,158.9,4.3,5.4,2293.6,2155.1,3.7,5.2
THRA,9.2,10.3,9.8,8.5,14.4,11.6,7.8,8.1,6.5,12.8,...,12.2,11.6,10.4,9.8,8.8,11.7,12.5,12.5,8.6,10.7


# Merge Like Columns (by taking the mean)

In [29]:
matrix = uf.merge(matrix, 'column', 'mean')

In [30]:
matrix.shape

(32900, 84)

# Drop Any Genes That Have Zero Expression In More Than 95% Of The Samples

In [31]:
# A gene (row) is kept only when at least 5% of its sample columns hold a
# non-zero value. Zeros are masked as NaN so that dropna's `thresh` (minimum
# number of non-missing values per row) counts the non-zero entries, and the
# mask is undone afterwards by writing NaN back as 0.
min_nonzero_samples = 0.05 * matrix.shape[1]

matrix = (
    matrix
    .replace(0, np.nan)
    .dropna(thresh=min_nonzero_samples, axis=0)
    .replace(np.nan, 0)
)

In [32]:
matrix.shape

(32900, 84)

# Map Gene Symbols To Up-to-date Approved Gene Symbols

In [33]:
# Called for its side effect: the return value is not assigned, and the row
# count reported by the following cell is lower than before the call, so the
# helper appears to modify `matrix` in place.
# NOTE(review): presumably rows whose symbol has no approved equivalent are
# removed — confirm against untility_functions.mapgenesymbols.
uf.mapgenesymbols(matrix)

Progeres: 100%  32900 Out of 32900   

In [34]:
matrix.shape

(27337, 84)

# Merge Duplicate Genes (by taking the mean)

In [35]:
matrix = uf.merge(matrix, 'row', 'mean')

In [36]:
matrix.shape

(16290, 84)

# Save Unfiltered Matrix To File

In [37]:
# Write the merged, not-yet-normalized matrix as a tab-separated file whose
# name carries the current year and month (YYYY_MM).
# The target name ends in '.zip', so the file is written as a zip archive:
# a gzip stream stored under a '.zip' name cannot be opened by tools that
# choose the decompressor from the file extension.
filename = '~/./Documents/Harmonizome/BioGPS/Output/biogps_tissue_matrix_unfilltered_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
matrix.to_csv(filename, sep='\t', compression='zip')

# Normalize Matrix (Quantile Normalize the matrix for the columns)

In [38]:
normalized_matrix = uf.quantileNormalize(matrix)

Step 2/2 progress: 100%  84 Out of 84   

In [39]:
normalized_matrix.head()

Unnamed: 0_level_0,721_B_lymphoblasts,Adipocyte,AdrenalCortex,Adrenalgland,Amygdala,Appendix,AtrioventricularNode,BDCA4+_DentriticCells,Bonemarrow,BronchialEpithelialCells,...,TrigeminalGanglion,Uterus,UterusCorpus,WholeBlood,Wholebrain,colon,pineal_day,pineal_night,retina,small_intestine
Genes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,3.199901,4.041548,3.906944,4.5212,3.634861,3.806369,4.643323,3.644474,3.991111,4.054127,...,3.895397,4.688155,4.200655,3.681052,4.473393,3.931607,3.280913,3.301865,3.267679,4.102748
A1CF,23.015377,45.264664,119.112252,78.657728,43.001081,172.58881,134.577191,17.407373,109.159048,43.827039,...,176.313988,41.510339,184.067103,46.541845,36.856684,209.563909,25.732613,22.925188,38.271736,215.614018
A2M,4.523333,515.155962,1071.795397,1162.577718,235.745476,1830.685694,1331.946012,5.960268,44.544415,4.470694,...,942.678474,1994.252232,1423.887878,14.031111,178.359702,486.338194,240.90504,298.059673,1856.987609,1022.991583
A2ML1,5.719881,6.556875,6.150506,7.014725,6.159802,6.153735,6.96004,6.188879,6.349653,6.598065,...,6.15494,7.144673,6.604454,6.13296,6.997986,6.429673,5.581862,5.637321,5.562614,6.552163
A4GALT,9.792452,12.34496,9.446047,9.335357,9.341147,7.484554,8.588153,9.308036,9.575996,13.722976,...,9.232199,9.158194,29.440565,9.667222,9.080918,9.473061,8.60997,8.549683,8.950694,9.379573


# Normalize Matrix (z-score the rows)

In [40]:
# Called for its side effect: the return value is not assigned, and the
# values shown by head() in the following cell differ from those shown
# before the call, so the helper appears to modify `normalized_matrix` in
# place. NOTE(review): confirm against untility_functions.zscore.
uf.zscore(normalized_matrix, 'row')

Progress: 100%  16290 Out of 16290   

In [41]:
normalized_matrix.head()

Unnamed: 0_level_0,721_B_lymphoblasts,Adipocyte,AdrenalCortex,Adrenalgland,Amygdala,Appendix,AtrioventricularNode,BDCA4+_DentriticCells,Bonemarrow,BronchialEpithelialCells,...,TrigeminalGanglion,Uterus,UterusCorpus,WholeBlood,Wholebrain,colon,pineal_day,pineal_night,retina,small_intestine
Genes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,-0.115044,-0.11357,-0.113805,-0.112729,-0.114282,-0.113982,-0.112515,-0.114265,-0.113658,-0.113547,...,-0.113826,-0.112437,-0.113291,-0.114201,-0.112813,-0.113762,-0.114902,-0.114866,-0.114925,-0.113462
A1CF,-0.740754,-0.474764,0.408085,-0.07555,-0.501825,1.047397,0.592968,-0.807798,0.289094,-0.491951,...,1.091932,-0.519647,1.18462,-0.459495,-0.575281,1.489435,-0.70827,-0.741832,-0.558364,1.561764
A2M,-0.774974,-0.171022,0.487344,0.594717,-0.501495,1.384923,0.795037,-0.773274,-0.727639,-0.775036,...,0.334631,1.578381,0.903782,-0.763728,-0.569369,-0.205107,-0.495393,-0.427793,1.416031,0.429621
A2ML1,-0.854018,0.058569,-0.384501,0.55777,-0.374366,-0.380981,0.498145,-0.342663,-0.167369,0.10348,...,-0.379666,0.699454,0.110445,-0.403632,0.539519,-0.080122,-1.004503,-0.944034,-1.025489,0.053431
A4GALT,-0.101142,0.775994,-0.220179,-0.258216,-0.256227,-0.894221,-0.514983,-0.267605,-0.175524,1.249531,...,-0.293665,-0.319096,6.650675,-0.144175,-0.345651,-0.210897,-0.507486,-0.528203,-0.390401,-0.243022


# Save Filtered Matrix

In [42]:
# Write the normalized matrix as a tab-separated file whose name carries the
# current year and month (YYYY_MM).
# The target name ends in '.zip', so the file is written as a zip archive:
# a gzip stream stored under a '.zip' name cannot be opened by tools that
# choose the decompressor from the file extension.
filename = '~/./Documents/Harmonizome/BioGPS/Output/biogps_tissue_matrix_filltered_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
normalized_matrix.to_csv(filename, sep='\t', compression='zip')

# Create Tertiary Matrix

In [43]:
tertiary_matrix = uf.createTertiaryMarix(normalized_matrix)

Progeres: 100%  84 Out of 84   

In [44]:
tertiary_matrix.head()

Unnamed: 0_level_0,721_B_lymphoblasts,Adipocyte,AdrenalCortex,Adrenalgland,Amygdala,Appendix,AtrioventricularNode,BDCA4+_DentriticCells,Bonemarrow,BronchialEpithelialCells,...,TrigeminalGanglion,Uterus,UterusCorpus,WholeBlood,Wholebrain,colon,pineal_day,pineal_night,retina,small_intestine
Genes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A1CF,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
A2M,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,-1.0,-1.0,...,0.0,1.0,0.0,-1.0,0.0,0.0,0.0,0.0,1.0,1.0
A2ML1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A4GALT,0.0,1.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Save Tertiary Matrix

In [45]:
# Write the tertiary (-1 / 0 / 1) matrix as a tab-separated file whose name
# carries the current year and month (YYYY_MM).
# The target name ends in '.zip', so the file is written as a zip archive:
# a gzip stream stored under a '.zip' name cannot be opened by tools that
# choose the decompressor from the file extension.
filename = '~/./Documents/Harmonizome/BioGPS/Output/biogps_tissue_tertiary_matrix_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
tertiary_matrix.to_csv(filename, sep='\t', compression='zip')

## Path to output files

In [46]:
path = '/Users/moshesilverstein/Documents/Harmonizome/BioGPS/Output/'

# Create Up Gene Set Library

In [47]:
name = 'biogps_tissue_gene_up_set'

In [48]:
uf.createUpGeneSetLib(tertiary_matrix, path, name)

Progeres: 100%  84 Out of 84   

# Create Down Gene Set Library

In [49]:
name = 'biogps_tissue_gene_down_set'

In [50]:
uf.createDownGeneSetLib(tertiary_matrix, path, name)

Progeres: 100%  84 Out of 84   

# Create Up Attribute Library

In [51]:
name = 'biogps_tissue_attribute_up_set'

In [52]:
uf.createUpAttributeSetLib(tertiary_matrix, path, name)

Progeres: 100%  16290 Out of 16290   

# Create Down Attribute Library

In [53]:
name = 'biogps_tissue_attribute_down_set'

In [54]:
uf.createDownAttributeSetLib(tertiary_matrix, path, name)

Progeres: 100%  16290 Out of 16290   

# Create Gene Similarity Matrix

In [55]:
gene_similarity_matix = uf.createSimilarityMatrix(normalized_matrix, 'cosine')

In [56]:
gene_similarity_matix.head()

Genes,A1BG,A1CF,A2M,A2ML1,A4GALT,A4GNT,AAAS,AACS,AADAC,AADACL2,...,ZW10,ZWILCH,ZWINT,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3
Genes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,1.0,0.322959,0.110656,-0.134977,0.09306,0.252333,0.041273,-0.069542,0.13184,-0.186413,...,0.033096,-0.036431,-0.040592,0.084693,0.054873,-0.186413,-0.116763,-0.007497,-0.085189,-0.033879
A1CF,0.322959,1.0,0.402491,0.080385,0.024362,-0.131374,0.308008,-0.140971,0.356973,0.073664,...,0.026062,-0.169989,-0.110484,0.117417,-0.263662,0.073664,0.241705,-0.075082,-0.118614,-0.255403
A2M,0.110656,0.402491,1.0,0.132584,-0.040558,-0.199738,0.081111,-0.161729,0.184152,0.239683,...,-0.236796,-0.174343,-0.207875,-0.001864,-0.237624,0.239683,0.285857,0.151805,0.238288,-0.210511
A2ML1,-0.134977,0.080385,0.132584,1.0,-0.051676,-0.651447,0.079173,0.016533,-0.004324,0.652901,...,-0.236542,-0.041155,-0.008923,-0.046431,-0.157336,0.652901,0.625789,-0.012733,0.152922,-0.23664
A4GALT,0.09306,0.024362,-0.040558,-0.051676,1.0,0.004739,0.024308,-0.088234,-0.024872,-0.040255,...,-0.053479,0.028592,0.237689,-0.047273,0.001343,-0.040255,0.025602,0.19755,-0.011837,-0.059276


# Save Gene Similarity Matrix 

In [57]:
# Write the gene-by-gene similarity matrix as a tab-separated file whose
# name carries the current year and month (YYYY_MM).
# The target name ends in '.zip', so the file is written as a zip archive:
# a gzip stream stored under a '.zip' name cannot be opened by tools that
# choose the decompressor from the file extension.
filename = '~/./Documents/Harmonizome/BioGPS/Output/biogps_tissue_gene_similarity_matix_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
gene_similarity_matix.to_csv(filename, sep='\t', compression='zip')

# Create Attribute Similarity matrix

In [58]:
attribute_similarity_matix = uf.createSimilarityMatrix(normalized_matrix.T, 'cosine')

In [59]:
attribute_similarity_matix.head()

Unnamed: 0,721_B_lymphoblasts,Adipocyte,AdrenalCortex,Adrenalgland,Amygdala,Appendix,AtrioventricularNode,BDCA4+_DentriticCells,Bonemarrow,BronchialEpithelialCells,...,TrigeminalGanglion,Uterus,UterusCorpus,WholeBlood,Wholebrain,colon,pineal_day,pineal_night,retina,small_intestine
721_B_lymphoblasts,1.0,-0.094872,-0.100289,-0.303127,0.001224,-0.157276,-0.341758,0.332251,-0.136569,0.148001,...,-0.173913,-0.194198,-0.273877,0.101298,-0.156576,0.036652,0.178185,0.159789,0.16776,-0.001557
Adipocyte,-0.094872,1.0,-0.000761,0.15522,-0.07432,-0.029929,-0.002661,-0.07262,-0.001283,0.12418,...,-0.073781,0.151581,0.087753,-0.036637,-0.002973,0.057661,-0.110901,-0.101052,-0.014133,0.088108
AdrenalCortex,-0.100289,-0.000761,1.0,0.252652,-0.13242,0.280405,0.213322,-0.150851,0.114985,-0.125041,...,0.333139,-0.181191,0.155313,-0.077131,-0.269412,-0.083766,-0.00533,-0.007318,0.026936,-0.119775
Adrenalgland,-0.303127,0.15522,0.252652,1.0,-0.18709,-0.064232,0.216149,-0.213493,0.098472,0.018975,...,-0.059864,0.21629,0.177941,-0.146454,0.175232,-0.021892,-0.305815,-0.286969,-0.290552,0.083955
Amygdala,0.001224,-0.07432,-0.13242,-0.18709,1.0,-0.160093,-0.285712,-0.007874,-0.187175,-0.052741,...,-0.187734,-0.029465,-0.228677,0.000774,0.437693,0.050403,0.254513,0.262621,0.217881,0.010457


# Save Attribute Similarity Matrix

In [60]:
# Write the attribute-by-attribute similarity matrix as a tab-separated file
# whose name carries the current year and month (YYYY_MM).
# The target name ends in '.zip', so the file is written as a zip archive:
# a gzip stream stored under a '.zip' name cannot be opened by tools that
# choose the decompressor from the file extension.
filename = '~/./Documents/Harmonizome/BioGPS/Output/biogps_tissue_attribute_similarity_matix_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
attribute_similarity_matix.to_csv(filename, sep='\t', compression='zip')

# Create Gene List

In [61]:
gene_list = uf.createGeneList(normalized_matrix)

Progeres: 100%  16290 Out of 16290   

In [62]:
gene_list.head()

Unnamed: 0,GeneSym,GeneID
0,A1BG,1
1,A1CF,29974
2,A2M,2
3,A2ML1,144568
4,A4GALT,53947


In [63]:
gene_list.shape

(16290, 2)

# Save Gene List

In [64]:
# Write the gene list (GeneSym, GeneID) as a tab-separated file, without the
# row index, under a name that carries the current year and month (YYYY_MM).
# The target name ends in '.zip', so the file is written as a zip archive:
# a gzip stream stored under a '.zip' name cannot be opened by tools that
# choose the decompressor from the file extension.
filename = '~/./Documents/Harmonizome/BioGPS/Output/biogps_tissue_gene_list_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
gene_list.to_csv(filename, sep='\t', index=False, compression='zip')

# Create Attribute List 

In [65]:
attribute_list = uf.createAttributeList(normalized_matrix)

In [66]:
attribute_list.head()

Unnamed: 0,Attributes
0,721_B_lymphoblasts
1,Adipocyte
2,AdrenalCortex
3,Adrenalgland
4,Amygdala


In [67]:
attribute_list.shape

(84, 1)

# Save Attribute List

In [68]:
# Write the attribute list as a tab-separated file, without the row index,
# under a name that carries the current year and month (YYYY_MM).
# The target name ends in '.zip', so the file is written as a zip archive:
# a gzip stream stored under a '.zip' name cannot be opened by tools that
# choose the decompressor from the file extension.
filename = '~/./Documents/Harmonizome/BioGPS/Output/biogps_tissue_attribute_list_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
attribute_list.to_csv(filename, sep='\t', index=False, compression='zip')

# Create Gene-Attribute Edge List

In [69]:
path = '/Users/moshesilverstein/Documents/Harmonizome/BioGPS/Output/'

In [70]:
name = 'biogps_tissue_gene_attribute_edge_list'

In [71]:
uf.createGeneAttributeEdgeList(tertiary_matrix, gene_list, path, name)

Progeres: 100%  84 Out of 84   

 The number of statisticaly relevent gene-attribute associations is: 274099
