# BioGPS (Human Cell Line)

Author: Moshe Silverstein <br/>
Date: 8-17 <br/>
Data Source: http://biogps.org/#goto=welcome

In [1]:
import sys, datetime, os
import numpy as np
import pandas as pd
import importlib
import untility_functions as uf
%matplotlib inline

In [2]:
importlib.reload(uf)

<module 'untility_functions' from '/Users/moshesilverstein/Documents/Harmonizome/BioGPS/untility_functions.py'>

# Load Data

In [3]:
matrix = pd.read_csv('/Users/moshesilverstein/Documents/Harmonizome/BioGPS/Human NCI60 Cell Lines/NCI60_U133A_20070815.raw.csv', sep=',')

In [4]:
matrix.head()

Unnamed: 0.1,Unnamed: 0,huh-7,huh-7.1,HEK293,HEK293.1,HL60,HL60.1,HEK 293T,HEK 293T.1,DU145,...,NCI H226.1,U87,U87.1,U118,U118.1,U138,U138.1,HEK 293 T-rex,HEK 293 T-rex .1,HT29
0,1007_s_at,31.727,55.551,360.731,334.639,33.483,49.691,50.638,55.918,346.27,...,184.953,211.477,210.273,242.274,302.585,320.841,274.376,164.18,162.016,258.504
1,1053_at,402.633,441.622,557.782,869.346,336.006,114.442,662.357,435.929,593.534,...,554.368,280.474,303.285,298.215,373.033,474.686,447.062,744.322,679.515,624.654
2,117_at,9.968,10.178,9.565,8.118,13.902,17.116,10.039,8.978,7.896,...,10.875,8.442,8.697,8.862,8.054,7.47,8.098,18.473,11.431,9.929
3,121_at,117.051,115.552,117.618,111.873,118.448,124.955,112.094,109.924,107.586,...,111.738,111.995,110.149,114.154,110.82,213.564,229.69,124.694,118.34,116.772
4,1255_g_at,4.341,4.278,4.32,4.062,4.525,4.38,4.262,4.124,4.034,...,4.17,4.115,3.959,4.116,4.031,3.851,3.911,4.254,4.501,4.498


In [5]:
matrix.shape

(22283, 109)

# Load Gene Data

In [6]:
matrix['Unnamed: 0'].to_csv('probes.txt', sep='\t', header=False, index=False)

In [7]:
%%capture
%system r -f MapToGenesU133A.R;

In [8]:
map_to_genes_U133A = pd.read_csv('ProbesToGenesU133A.tsv', sep='\t', header=None)

In [9]:
map_to_genes_U133A.drop_duplicates(subset=[0], keep=False, inplace=True)

In [10]:
map_to_genes_U133A.set_index(0, inplace=True)

In [11]:
map_to_genes_U133A.head()

Unnamed: 0_level_0,1,2,3
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1053_at,RFC2,5982.0,replication factor C subunit 2
117_at,HSPA6,3310.0,heat shock protein family A (Hsp70) member 6
121_at,PAX8,7849.0,paired box 8
1255_g_at,GUCA1A,2978.0,guanylate cyclase activator 1A
1316_at,THRA,7067.0,"thyroid hormone receptor, alpha"


In [12]:
map_to_genes_U133A.shape

(21027, 3)

# Map Gene Symbol to Probe ID

In [13]:
lst = []

for index in matrix.index:
    if matrix.ix[index, 'Unnamed: 0'] in map_to_genes_U133A.index and type(map_to_genes_U133A.ix[matrix.ix[index, 'Unnamed: 0'], 1]) != float:
        lst.append(map_to_genes_U133A.ix[matrix.ix[index, 'Unnamed: 0'], 1])
    else:
        lst.append(np.nan)

matrix['Unnamed: 0'] = lst    

In [14]:
matrix.head()

Unnamed: 0.1,Unnamed: 0,huh-7,huh-7.1,HEK293,HEK293.1,HL60,HL60.1,HEK 293T,HEK 293T.1,DU145,...,NCI H226.1,U87,U87.1,U118,U118.1,U138,U138.1,HEK 293 T-rex,HEK 293 T-rex .1,HT29
0,,31.727,55.551,360.731,334.639,33.483,49.691,50.638,55.918,346.27,...,184.953,211.477,210.273,242.274,302.585,320.841,274.376,164.18,162.016,258.504
1,RFC2,402.633,441.622,557.782,869.346,336.006,114.442,662.357,435.929,593.534,...,554.368,280.474,303.285,298.215,373.033,474.686,447.062,744.322,679.515,624.654
2,HSPA6,9.968,10.178,9.565,8.118,13.902,17.116,10.039,8.978,7.896,...,10.875,8.442,8.697,8.862,8.054,7.47,8.098,18.473,11.431,9.929
3,PAX8,117.051,115.552,117.618,111.873,118.448,124.955,112.094,109.924,107.586,...,111.738,111.995,110.149,114.154,110.82,213.564,229.69,124.694,118.34,116.772
4,GUCA1A,4.341,4.278,4.32,4.062,4.525,4.38,4.262,4.124,4.034,...,4.17,4.115,3.959,4.116,4.031,3.851,3.911,4.254,4.501,4.498


In [15]:
matrix.shape

(22283, 109)

In [16]:
matrix.dropna(subset=['Unnamed: 0'], inplace=True)

In [17]:
matrix.head()

Unnamed: 0.1,Unnamed: 0,huh-7,huh-7.1,HEK293,HEK293.1,HL60,HL60.1,HEK 293T,HEK 293T.1,DU145,...,NCI H226.1,U87,U87.1,U118,U118.1,U138,U138.1,HEK 293 T-rex,HEK 293 T-rex .1,HT29
1,RFC2,402.633,441.622,557.782,869.346,336.006,114.442,662.357,435.929,593.534,...,554.368,280.474,303.285,298.215,373.033,474.686,447.062,744.322,679.515,624.654
2,HSPA6,9.968,10.178,9.565,8.118,13.902,17.116,10.039,8.978,7.896,...,10.875,8.442,8.697,8.862,8.054,7.47,8.098,18.473,11.431,9.929
3,PAX8,117.051,115.552,117.618,111.873,118.448,124.955,112.094,109.924,107.586,...,111.738,111.995,110.149,114.154,110.82,213.564,229.69,124.694,118.34,116.772
4,GUCA1A,4.341,4.278,4.32,4.062,4.525,4.38,4.262,4.124,4.034,...,4.17,4.115,3.959,4.116,4.031,3.851,3.911,4.254,4.501,4.498
6,THRA,9.632,9.38,9.315,8.204,10.234,9.846,9.3,8.48,8.746,...,9.085,8.264,7.802,8.351,8.205,7.85,8.033,9.867,9.975,10.837


In [18]:
matrix.shape

(19862, 109)

In [19]:
matrix.rename(columns={'Unnamed: 0':'Genes'}, inplace=True)

In [20]:
matrix.set_index('Genes', inplace=True)

In [21]:
matrix.head()

Unnamed: 0_level_0,huh-7,huh-7.1,HEK293,HEK293.1,HL60,HL60.1,HEK 293T,HEK 293T.1,DU145,ACHN,...,NCI H226.1,U87,U87.1,U118,U118.1,U138,U138.1,HEK 293 T-rex,HEK 293 T-rex .1,HT29
Genes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
RFC2,402.633,441.622,557.782,869.346,336.006,114.442,662.357,435.929,593.534,556.449,...,554.368,280.474,303.285,298.215,373.033,474.686,447.062,744.322,679.515,624.654
HSPA6,9.968,10.178,9.565,8.118,13.902,17.116,10.039,8.978,7.896,7.811,...,10.875,8.442,8.697,8.862,8.054,7.47,8.098,18.473,11.431,9.929
PAX8,117.051,115.552,117.618,111.873,118.448,124.955,112.094,109.924,107.586,14698.486,...,111.738,111.995,110.149,114.154,110.82,213.564,229.69,124.694,118.34,116.772
GUCA1A,4.341,4.278,4.32,4.062,4.525,4.38,4.262,4.124,4.034,3.857,...,4.17,4.115,3.959,4.116,4.031,3.851,3.911,4.254,4.501,4.498
THRA,9.632,9.38,9.315,8.204,10.234,9.846,9.3,8.48,8.746,7.978,...,9.085,8.264,7.802,8.351,8.205,7.85,8.033,9.867,9.975,10.837


# Fix Sample Columns

In [22]:
lst = []

for col in matrix.columns:
    lst.append(col.split('.')[0])
    
matrix.columns = lst 

In [23]:
matrix.head()

Unnamed: 0_level_0,huh-7,huh-7,HEK293,HEK293,HL60,HL60,HEK 293T,HEK 293T,DU145,ACHN,...,NCI H226,U87,U87,U118,U118,U138,U138,HEK 293 T-rex,HEK 293 T-rex,HT29
Genes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
RFC2,402.633,441.622,557.782,869.346,336.006,114.442,662.357,435.929,593.534,556.449,...,554.368,280.474,303.285,298.215,373.033,474.686,447.062,744.322,679.515,624.654
HSPA6,9.968,10.178,9.565,8.118,13.902,17.116,10.039,8.978,7.896,7.811,...,10.875,8.442,8.697,8.862,8.054,7.47,8.098,18.473,11.431,9.929
PAX8,117.051,115.552,117.618,111.873,118.448,124.955,112.094,109.924,107.586,14698.486,...,111.738,111.995,110.149,114.154,110.82,213.564,229.69,124.694,118.34,116.772
GUCA1A,4.341,4.278,4.32,4.062,4.525,4.38,4.262,4.124,4.034,3.857,...,4.17,4.115,3.959,4.116,4.031,3.851,3.911,4.254,4.501,4.498
THRA,9.632,9.38,9.315,8.204,10.234,9.846,9.3,8.48,8.746,7.978,...,9.085,8.264,7.802,8.351,8.205,7.85,8.033,9.867,9.975,10.837


# Merge Like Column (by taking the mean)

In [24]:
matrix = uf.merge(matrix, 'column', 'mean')

In [25]:
matrix.shape

(19862, 93)

# Drop Any Genes That Have Zero Expression Across 95% Of The Samples

In [26]:
matrix.replace(0, np.nan, inplace=True)

matrix.dropna(thresh=(0.05*matrix.shape[1]), axis=0, inplace=True)

matrix.replace(np.nan, 0, inplace=True)

In [27]:
matrix.shape

(19862, 93)

# Map Gene Symbols To Up-to-date Approved Gene Symbols

In [28]:
uf.mapgenesymbols(matrix)

Progeres: 98%  19659 Out of 19862   

In [29]:
matrix.shape

(19736, 93)

# Merge Duplicate genes (by taking the mean)

In [30]:
matrix = uf.merge(matrix, 'row', 'mean')

In [31]:
matrix.shape

(12296, 93)

# Save Unfiltered Matrix To File

In [32]:
filename = '~/./Documents/Harmonizome/BioGPS/Output/biogps_celline_matrix_unfilltered_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
matrix.to_csv(filename, sep='\t', compression='gzip')

# Normalize Matrix (Quantile Normalize the matrix for the columns)

In [33]:
normalized_matrix = uf.quantileNormalize(matrix)

Step 2/2 progress: 100%  93 Out of 93   

In [34]:
normalized_matrix.head()

Unnamed: 0_level_0,786,A172,A204,A361,A498,A549,ACC3,ACHN,ALVA31,CAKI1,...,U138,U20S,U251,U87,UACC257,UACC62,UO31,ZR75_1,astrocytes,huh-7
Genes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1CF,9.030562,8.673039,8.826278,8.711599,9.118827,8.853977,12.894987,8.946287,8.301336,10.148112,...,8.965899,23.979935,8.988879,8.923787,8.843534,8.666947,8.309525,8.344695,35.999118,860.691333
A2M,4.682693,4.706318,4.649568,15.98168,4.701961,4.689463,4.579769,4.870681,4.638478,4.653149,...,4.756517,4.687373,4.701961,4.631064,3345.817603,259.99421,4.591836,4.590537,4.653827,2462.519255
A4GALT,5.100708,5.086537,5.023925,5.057906,5.112048,5.065972,4.935492,5.034096,5.087227,4.998526,...,5.080843,4.972978,5.126039,5.174119,5.098963,4.997068,5.18246,5.019695,4.997822,5.02472
A4GNT,4.27688,4.182743,4.256261,4.254953,4.274593,4.2676,4.185514,4.242747,4.182,4.234151,...,4.201745,4.300164,4.297818,4.209505,4.274593,4.287378,4.279103,4.269121,4.285208,4.238018
AAAS,89.913332,42.988196,383.629343,181.791491,134.539286,151.576725,293.501461,132.797981,158.045589,117.260254,...,87.478925,252.347874,122.354213,36.88278,156.810452,189.227613,124.425902,224.230616,129.035628,127.718687


# Nomalize Matrix (z-score the rows)

In [35]:
uf.zscore(normalized_matrix, 'row')

Progress: 100%  12296 Out of 12296   

In [36]:
normalized_matrix.head()

Unnamed: 0_level_0,786,A172,A204,A361,A498,A549,ACC3,ACHN,ALVA31,CAKI1,...,U138,U20S,U251,U87,UACC257,UACC62,UO31,ZR75_1,astrocytes,huh-7
Genes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1CF,-0.163308,-0.165014,-0.164283,-0.16483,-0.162887,-0.164151,-0.144871,-0.16371,-0.166788,-0.157976,...,-0.163617,-0.091983,-0.163507,-0.163818,-0.164201,-0.165043,-0.166749,-0.166581,-0.034639,3.900054
A2M,-0.28314,-0.283127,-0.283158,-0.276954,-0.283129,-0.283136,-0.283196,-0.283037,-0.283164,-0.283156,...,-0.2831,-0.283137,-0.283129,-0.283168,1.54609,-0.14336,-0.28319,-0.28319,-0.283156,1.062495
A4GALT,-0.087577,-0.102782,-0.169962,-0.133501,-0.075409,-0.124848,-0.264847,-0.159049,-0.102041,-0.197214,...,-0.108891,-0.224626,-0.060397,-0.008809,-0.089449,-0.198779,0.00014,-0.174501,-0.19797,-0.169109
A4GNT,0.101692,-1.137341,-0.1697,-0.186917,0.071593,-0.020446,-1.100862,-0.347562,-1.147112,-0.460713,...,-0.887233,0.408156,0.37728,-0.785097,0.071593,0.239869,0.130952,-0.000432,0.211304,-0.40981
AAAS,-0.871391,-1.331086,2.005954,0.02868,-0.434219,-0.267315,1.123029,-0.451278,-0.203944,-0.603491,...,-0.895239,0.719875,-0.553589,-1.390897,-0.216043,0.101526,-0.533294,0.444428,-0.488135,-0.501036


# Save Filtered Matrix

In [37]:
filename = '~/./Documents/Harmonizome/BioGPS/Output/biogps_celline_matrix_filltered_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
normalized_matrix.to_csv(filename, sep='\t', compression='gzip')

# Create Tertiary Matrix

In [38]:
tertiary_matrix = uf.createTertiaryMarix(normalized_matrix)

Progeres: 100%  93 Out of 93   

In [39]:
tertiary_matrix.head()

Unnamed: 0_level_0,786,A172,A204,A361,A498,A549,ACC3,ACHN,ALVA31,CAKI1,...,U138,U20S,U251,U87,UACC257,UACC62,UO31,ZR75_1,astrocytes,huh-7
Genes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1CF,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
A2M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
A4GALT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A4GNT,0.0,-1.0,0.0,0.0,0.0,0.0,-1.0,0.0,-1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AAAS,-1.0,-1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0


# Save Teriary Matrix

In [40]:
filename = '~/./Documents/Harmonizome/BioGPS/Output/biogps_celline_tertiary_matrix_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
tertiary_matrix.to_csv(filename, sep='\t', compression='gzip')

## Path to output files

In [41]:
path = '/Users/moshesilverstein/Documents/Harmonizome/BioGPS/Output/'

# Create Up Gene Set Library

In [42]:
name = 'biogps_celline_gene_up_set'

In [43]:
uf.createUpGeneSetLib(tertiary_matrix, path, name)

Progeres: 100%  93 Out of 93   

# Create Down Gene Set Library

In [44]:
name = 'biogps_celline_gene_down_set'

In [45]:
uf.createDownGeneSetLib(tertiary_matrix, path, name)

Progeres: 100%  93 Out of 93   

# Create Up Attribute Library

In [46]:
name = 'biogps_celline_attribute_up_set'

In [47]:
uf.createUpAttributeSetLib(tertiary_matrix, path, name)

Progeres: 100%  12296 Out of 12296   

# Create Down Attribute Library

In [48]:
name = 'biogps_tissue_attribute_down_set'

In [49]:
uf.createDownAttributeSetLib(tertiary_matrix, path, name)

Progeres: 100%  12296 Out of 12296   

# Create Gene Similarity Matrix

In [50]:
gene_similarity_matix = uf.createSimilarityMatrix(normalized_matrix, 'cosine')

In [51]:
gene_similarity_matix.head()

Genes,A1CF,A2M,A4GALT,A4GNT,AAAS,AACS,AADAC,AAGAB,AAK1,AAMDC,...,ZSWIM1,ZSWIM8,ZW10,ZWILCH,ZWINT,ZXDB,ZXDC,ZYX,ZZEF1,ZZZ3
Genes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1CF,1.0,0.349482,-0.01164,-0.038098,0.111974,0.15471,0.353846,-0.054673,0.163281,-0.023838,...,-0.045186,0.007094,0.015812,-0.050293,0.171455,-0.039292,0.338792,-0.040618,-0.091918,-0.104068
A2M,0.349482,1.0,-0.039664,-0.137902,-0.040096,-0.062617,0.08105,-0.215253,0.129757,-0.141014,...,-0.101278,0.010064,-0.179585,-0.2308,-0.181112,-0.062468,0.034253,-0.06992,-0.12587,0.239089
A4GALT,-0.01164,-0.039664,1.0,0.078319,0.211662,-0.09162,-0.022903,-0.016431,-0.089542,-0.11928,...,0.661186,0.200741,-0.064096,-0.183731,-0.096633,-0.053158,-0.079646,-0.087568,0.059166,-0.122521
A4GNT,-0.038098,-0.137902,0.078319,1.0,0.131649,-0.013222,-0.038887,0.007008,0.001814,-0.058061,...,-0.019681,-0.210875,0.141194,0.101721,0.055369,-0.171831,0.065789,0.036849,0.090574,-0.132059
AAAS,0.111974,-0.040096,0.211662,0.131649,1.0,-0.036658,-0.048788,0.014603,-0.190564,-0.124335,...,0.181602,0.495289,-0.011923,0.248281,0.32016,0.219163,0.120166,-0.104526,0.205333,-0.485404


# Save Gene Similarity Matrix 

In [52]:
filename = '~/./Documents/Harmonizome/BioGPS/Output/biogps_celline_gene_similarity_matix_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
gene_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Attribute Similarity matrix

In [53]:
attribute_similarity_matix = uf.createSimilarityMatrix(normalized_matrix.T, 'cosine')

In [54]:
attribute_similarity_matix.head()

Unnamed: 0,786,A172,A204,A361,A498,A549,ACC3,ACHN,ALVA31,CAKI1,...,U138,U20S,U251,U87,UACC257,UACC62,UO31,ZR75_1,astrocytes,huh-7
786,1.0,-0.136167,-0.110364,-0.073712,0.262267,0.17693,-0.101618,-0.019178,0.10022,0.275175,...,-0.062546,-0.088654,0.040393,0.00871,-0.137433,-0.089766,0.233834,-0.094403,-0.09471,-0.016141
A172,-0.136167,1.0,-0.034111,0.003763,0.103223,-0.007913,-0.060479,0.156034,-0.185432,-0.004094,...,0.256474,0.021621,0.021778,0.298232,0.062759,0.034749,-0.012675,-0.064519,0.176081,-0.049924
A204,-0.110364,-0.034111,1.0,-0.010593,-0.05642,0.009572,0.052724,-0.045299,-0.019114,-0.068758,...,-0.011088,-0.033322,-0.034654,-0.068259,-0.038057,-0.000931,-0.101301,0.071975,0.007059,-0.029536
A361,-0.073712,0.003763,-0.010593,1.0,0.034394,-0.020949,-0.043275,0.009113,-0.066597,-0.029386,...,-0.000368,-0.04505,-0.006899,0.017027,0.272316,0.245206,-0.037218,-0.038677,-0.027006,-0.047109
A498,0.262267,0.103223,-0.05642,0.034394,1.0,0.181834,-0.061225,0.130878,-0.038455,0.218491,...,0.000334,-0.041593,0.114002,0.057076,-0.07557,-0.015797,0.124008,-0.079305,-0.025236,-0.069481


# Save Attribute Similarity Matrix

In [55]:
filename = '~/./Documents/Harmonizome/BioGPS/Output/biogps_celline_attribute_similarity_matix_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
attribute_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene List

In [59]:
gene_list = uf.createGeneList(normalized_matrix)

Progeres: 100%  12296 Out of 12296   

In [60]:
gene_list.head()

Unnamed: 0,GeneSym,GeneID
0,A1CF,29974
1,A2M,2
2,A4GALT,53947
3,A4GNT,51146
4,AAAS,8086


In [61]:
gene_list.shape

(12296, 2)

# Save Gene List

In [62]:
filename = '~/./Documents/Harmonizome/BioGPS/Output/biogps_celline_gene_list_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
gene_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Attribute List 

In [63]:
attribute_list = uf.createAttributeList(normalized_matrix)

In [64]:
attribute_list.head()

Unnamed: 0,Attributes
0,786
1,A172
2,A204
3,A361
4,A498


In [65]:
attribute_list.shape

(93, 1)

# Save Attribute List

In [66]:
filename = '~/./Documents/Harmonizome/BioGPS/Output/biogps_celline_attribute_list_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
attribute_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Gene-Attribute Edge List

In [67]:
path = '/Users/moshesilverstein/Documents/Harmonizome/BioGPS/Output/'

In [68]:
name = 'biogps_celline_gene_attribute_edge_list'

In [69]:
uf.createGeneAttributeEdgeList(tertiary_matrix, gene_list, path, name)

Progeres: 100%  93 Out of 93   

 The number of statisticaly relevent gene-attribute associations is: 228687
