# GTEx

Author: Moshe Silverstein <br/>
Date: 7-17 <br/>
Data Source: https://www.gtexportal.org/home/

In [1]:
import sys, datetime
import numpy as np
import pandas as pd
import importlib
import my_functions as mf
%matplotlib inline

In [2]:
# Re-import my_functions so that edits made to the helper module are picked up
# without restarting the kernel.
importlib.reload(mf)

<module 'my_functions' from '/Users/moshesilverstein/Documents/Harmonizome/GTEx_Python_Script/my_functions.py'>

# Load Data

In [3]:
# Read the tab-separated expression file, skipping its first two lines
# (presumably the GCT version and dimension lines — TODO confirm).
file = 'Input/GTEx_Analysis_v6p_RNA-seq_RNA-SeQCv1.1.8_gene_rpkm.gct'
matrix = pd.read_csv(file, sep='\t' ,skiprows=2)

In [4]:
matrix.head()

Unnamed: 0,Name,Description,GTEX-111CU-1826-SM-5GZYN,GTEX-111FC-0226-SM-5N9B8,GTEX-111VG-2326-SM-5N9BK,GTEX-111YS-2426-SM-5GZZQ,GTEX-1122O-2026-SM-5NQ91,GTEX-1128S-2126-SM-5H12U,GTEX-113IC-0226-SM-5HL5C,GTEX-117YX-2226-SM-5EGJJ,...,GTEX-ZVE2-0006-SM-51MRW,GTEX-ZVP2-0005-SM-51MRK,GTEX-ZVT2-0005-SM-57WBW,GTEX-ZVT3-0006-SM-51MT9,GTEX-ZVT4-0006-SM-57WB8,GTEX-ZVTK-0006-SM-57WBK,GTEX-ZVZP-0006-SM-51MSW,GTEX-ZVZQ-0006-SM-51MR8,GTEX-ZXES-0005-SM-57WCB,GTEX-ZXG5-0005-SM-57WCN
0,ENSG00000223972.4,DDX11L1,0.0,0.0,0.0,0.0,0.0,0.0,0.023956,0.0,...,0.019434,0.061915,0.0,0.013643,0.339341,0.287188,0.046711,0.278409,0.120472,0.0405
1,ENSG00000227232.4,WASH7P,6.50896,10.745692,6.670499,6.384469,6.83363,7.115656,7.954349,6.863994,...,2.884658,8.876904,4.701957,5.904234,8.391052,6.43569,8.869359,12.766889,7.886225,9.202537
2,ENSG00000243485.2,MIR1302-11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,ENSG00000237613.2,FAM138A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ENSG00000268020.2,OR4G4P,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
matrix.shape

(56238, 8557)

# Load Sample Metadata 

In [6]:
# Read the tab-separated sample attribute table; the first column becomes the index.
sample_meta = pd.read_csv('Input/GTEx_Data_V6_Annotations_SampleAttributesDS.txt', sep='\t', index_col=0)

In [7]:
sample_meta.head()

Unnamed: 0_level_0,SMATSSCR,SMCENTER,SMPTHNTS,SMRIN,SMTS,SMTSD,SMUBRID,SMTSISCH,SMTSPAX,SMTSTPTREF,...,SME1ANTI,SMSPLTRD,SMBSMMRT,SME1SNSE,SME1PCTS,SMRRNART,SME1MPRT,SMNUM5CD,SMDPMPRT,SME2PCTS
SAMPID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GTEX-1117F-0003-SM-58Q7G,,B1,,,Blood,Whole Blood,13756,,,Actual Death,...,,,,,,,,,,
GTEX-1117F-0003-SM-5DWSB,,B1,,,Blood,Whole Blood,13756,,,Actual Death,...,,,,,,,,,,
GTEX-1117F-0226-SM-5GZZ7,0.0,B1,"2 pieces, ~15% vessel stroma, rep delineated",6.8,Adipose Tissue,Adipose - Subcutaneous,2190,1214.0,1125.0,Actual Death,...,,,,,,,,,,
GTEX-1117F-0426-SM-5EGHI,0.0,B1,"2 pieces, !5% fibrous connective tissue, delin...",7.1,Muscle,Muscle - Skeletal,11907,1220.0,1119.0,Actual Death,...,12207544.0,10849322.0,0.002745,12393839.0,50.378628,0.008671,0.94627,837.0,0.276804,50.254814
GTEX-1117F-0526-SM-5EGHJ,0.0,B1,"2 pieces, clean, Monckebeg medial sclerosis, r...",8.0,Blood Vessel,Artery - Tibial,7610,1221.0,1120.0,Actual Death,...,,,,,,,,,,


In [8]:
sample_meta.shape

(11983, 63)

# Load Subject Metadata

In [9]:
# Read the tab-separated subject phenotype table; the first column becomes the index.
subject_meta = pd.read_csv('Input/GTEx_Data_V6_Annotations_SubjectPhenotypesDS.txt', sep='\t', index_col=0)

#### Change gender codes from 1, 2 to M, F

In [10]:
def geneder(s):
    """Translate a numeric gender code into a letter.

    Returns 'M' when ``s`` equals 1 and 'F' when ``s`` equals 2. Any other
    value falls through and yields ``None`` implicitly.
    """
    for code, letter in ((1, 'M'), (2, 'F')):
        if s == code:
            return(letter)

In [11]:
# Replace the numeric GENDER codes with 'M'/'F' via geneder().
# NOTE: not idempotent — geneder() returns None for anything other than 1 or 2,
# so running this cell a second time turns the 'M'/'F' values into missing values.
subject_meta['GENDER'] = subject_meta['GENDER'].map(geneder)

In [12]:
subject_meta.head()

Unnamed: 0_level_0,GENDER,AGE,DTHHRDY
SUBJID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
GTEX-N7MS,M,60-69,2.0
GTEX-NFK9,M,40-49,0.0
GTEX-NL3G,F,60-69,2.0
GTEX-NL4W,M,50-59,4.0
GTEX-NPJ7,F,60-69,4.0


In [13]:
subject_meta.shape

(570, 3)

# Set Matrix to Show Gene Symbols

In [14]:
# Rename the 'Description' column to 'Gene Symbol' (modifies matrix in place).
matrix.rename(columns={"Description": "Gene Symbol"}, inplace=True)

In [15]:
# Use the gene symbols as the row index (modifies matrix in place).
matrix.set_index('Gene Symbol', inplace=True)

In [16]:
# Drop the 'Name' column so that only the sample columns remain
# (modifies matrix in place).
matrix.drop('Name', axis=1, inplace=True)

In [17]:
matrix.head()

Unnamed: 0_level_0,GTEX-111CU-1826-SM-5GZYN,GTEX-111FC-0226-SM-5N9B8,GTEX-111VG-2326-SM-5N9BK,GTEX-111YS-2426-SM-5GZZQ,GTEX-1122O-2026-SM-5NQ91,GTEX-1128S-2126-SM-5H12U,GTEX-113IC-0226-SM-5HL5C,GTEX-117YX-2226-SM-5EGJJ,GTEX-11DXW-0326-SM-5H11W,GTEX-11DXX-2326-SM-5Q5A2,...,GTEX-ZVE2-0006-SM-51MRW,GTEX-ZVP2-0005-SM-51MRK,GTEX-ZVT2-0005-SM-57WBW,GTEX-ZVT3-0006-SM-51MT9,GTEX-ZVT4-0006-SM-57WB8,GTEX-ZVTK-0006-SM-57WBK,GTEX-ZVZP-0006-SM-51MSW,GTEX-ZVZQ-0006-SM-51MR8,GTEX-ZXES-0005-SM-57WCB,GTEX-ZXG5-0005-SM-57WCN
Gene Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
DDX11L1,0.0,0.0,0.0,0.0,0.0,0.0,0.023956,0.0,0.0,0.0,...,0.019434,0.061915,0.0,0.013643,0.339341,0.287188,0.046711,0.278409,0.120472,0.0405
WASH7P,6.50896,10.745692,6.670499,6.384469,6.83363,7.115656,7.954349,6.863994,6.805336,4.403805,...,2.884658,8.876904,4.701957,5.904234,8.391052,6.43569,8.869359,12.766889,7.886225,9.202537
MIR1302-11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
FAM138A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
OR4G4P,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Map Gene Symbols To Up-to-date Approved Gene Symbols

In [18]:
# mf.mapgenesymbols is defined in my_functions (not shown here). Its return
# value is discarded and the row count reported by the next cell is smaller
# than at load time, so it presumably edits `matrix` in place — TODO confirm.
mf.mapgenesymbols(matrix)

Progeres: 100%  56238 Out of 56238   

In [19]:
matrix.shape

(33133, 8555)

## Merge Duplicate Genes By Rows

In [20]:
matrix.shape

(33133, 8555)

In [21]:
# Collapse rows via the helper with the 'row' / 'mean' options; per the section
# heading this presumably averages rows sharing a gene symbol
# (helper defined in my_functions — TODO confirm).
matrix = mf.merge(matrix, 'row', 'mean')

In [22]:
matrix.shape

(32379, 8555)

## Map Sample Metadata to Sample ID

In [23]:
# Relabel every sample column with a (donor, gender, age, tissue) tuple built
# from the subject and sample metadata tables.
lst = []
n_samples = len(matrix.columns)  # loop-invariant, computed once

for i, col in enumerate(matrix.columns):

    progressPercent = ((i+1)/n_samples)*100

    # '\r' rewinds to the start of the line so the progress text overwrites itself.
    sys.stdout.write("Progress: %d%%  %d Out of %d   \r" % (progressPercent, (i+1), n_samples))
    sys.stdout.flush()

    # The first two dash-separated fields of the sample ID form the subject ID
    # that is looked up in subject_meta's index.
    subject_id = '-'.join(col.split('-')[0:2])
    donor = 'Donor:' + subject_id
    # Label-based lookups use .loc: the .ix indexer used previously was
    # deprecated in pandas 0.20 and removed in pandas 1.0.
    gender = 'Gender:' + subject_meta.loc[subject_id, 'GENDER']
    age = 'Age:' + subject_meta.loc[subject_id, 'AGE']
    tissue = 'Tissue:' + sample_meta.loc[col, 'SMTSD']
    lst.append((donor, gender, age, tissue))

matrix.columns = lst

Progress: 100%  8555 Out of 8555   

In [24]:
matrix.head()

Unnamed: 0_level_0,"(Donor:GTEX-111CU, Gender:M, Age:50-59, Tissue:Adipose - Subcutaneous)","(Donor:GTEX-111FC, Gender:M, Age:60-69, Tissue:Adipose - Subcutaneous)","(Donor:GTEX-111VG, Gender:M, Age:60-69, Tissue:Adipose - Subcutaneous)","(Donor:GTEX-111YS, Gender:M, Age:60-69, Tissue:Adipose - Subcutaneous)","(Donor:GTEX-1122O, Gender:F, Age:60-69, Tissue:Adipose - Subcutaneous)","(Donor:GTEX-1128S, Gender:F, Age:60-69, Tissue:Adipose - Subcutaneous)","(Donor:GTEX-113IC, Gender:M, Age:60-69, Tissue:Adipose - Subcutaneous)","(Donor:GTEX-117YX, Gender:M, Age:50-59, Tissue:Adipose - Subcutaneous)","(Donor:GTEX-11DXW, Gender:M, Age:40-49, Tissue:Adipose - Subcutaneous)","(Donor:GTEX-11DXX, Gender:F, Age:60-69, Tissue:Adipose - Subcutaneous)",...,"(Donor:GTEX-ZVE2, Gender:M, Age:50-59, Tissue:Whole Blood)","(Donor:GTEX-ZVP2, Gender:M, Age:50-59, Tissue:Whole Blood)","(Donor:GTEX-ZVT2, Gender:F, Age:50-59, Tissue:Whole Blood)","(Donor:GTEX-ZVT3, Gender:F, Age:60-69, Tissue:Whole Blood)","(Donor:GTEX-ZVT4, Gender:F, Age:50-59, Tissue:Whole Blood)","(Donor:GTEX-ZVTK, Gender:M, Age:20-29, Tissue:Whole Blood)","(Donor:GTEX-ZVZP, Gender:M, Age:50-59, Tissue:Whole Blood)","(Donor:GTEX-ZVZQ, Gender:F, Age:60-69, Tissue:Whole Blood)","(Donor:GTEX-ZXES, Gender:F, Age:30-39, Tissue:Whole Blood)","(Donor:GTEX-ZXG5, Gender:M, Age:60-69, Tissue:Whole Blood)"
Gene Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,1.45904,1.505384,2.04869,0.595234,0.96475,0.77931,2.074153,0.587864,1.095912,0.665989,...,0.481572,1.994508,0.637642,1.057956,1.037017,0.385123,0.857898,0.836602,0.825334,0.596233
A1BG-AS1,0.856568,0.962106,0.974077,0.401879,0.885118,0.604135,1.10064,0.256644,0.946916,0.243068,...,0.152733,1.188405,0.475428,0.263927,0.381991,0.381954,0.720083,0.750913,0.582638,0.342768
A1CF,0.0,0.0,0.005385,0.010977,0.0,0.00462,0.002215,0.003925,0.020054,0.0,...,0.003594,0.002863,0.0,0.035323,0.004328,0.005311,0.0,0.174263,0.00557,0.014979
A2M,212.201385,309.729248,358.070068,253.837097,226.042175,227.005432,252.30043,272.957306,349.170715,228.335312,...,0.499567,0.440703,0.299877,0.581543,0.639656,0.518631,0.334381,0.888483,0.3038,0.220731
A2M-AS1,2.25861,2.95156,2.239747,2.14502,1.667437,1.536585,3.170306,1.764703,3.62862,2.147331,...,0.0,0.203534,0.093819,0.370364,0.019853,0.133997,0.069347,0.190764,0.127751,0.111661


## Drop Any Genes That Have Zero Expression In More Than 95% Of The Samples

In [25]:
# Temporarily mark zero expression values as NaN so that dropna() below can
# count non-zero entries per row (modifies matrix in place).
matrix.replace(0, np.nan, inplace=True)

In [26]:
matrix.shape

(32379, 8555)

In [27]:
# Keep only rows that have at least 5% of the columns non-NaN (i.e. non-zero
# after the previous step); `thresh` is the minimum number of non-NA values a
# row needs in order to be kept. Modifies matrix in place.
matrix.dropna(thresh=(0.05*matrix.shape[1]), axis=0, inplace=True)

In [28]:
matrix.shape

(25563, 8555)

In [29]:
# Turn the NaN placeholders back into zeros (modifies matrix in place).
# NOTE: any NaN that was present before the zero -> NaN step also becomes 0 here.
matrix.replace(np.nan, 0, inplace=True)

## Save Unfiltered Sample Matrix To File

In [102]:
# Write the current `matrix` as a gzip-compressed TSV whose file name carries
# the current year and month (YYYY_MM).
# NOTE(review): the name ends in ".zip" while the content is gzip — confirm
# that no reader infers the compression from the extension.
filename = (
    '~/./Documents/Harmonizome/GTEx_Python_Script/Output/gtex_sample_matrix_unfilltered_%s.tsv.zip'
    % datetime.date.today().strftime('%Y_%m')
)
matrix.to_csv(filename, compression='gzip', sep='\t')

# Filtered Samples Matrix

## Normalize Matrix (Quantile Normalize the matrix for the columns)

In [30]:
# Work on a copy so that `matrix` keeps the un-normalized values.
normalized_matrix = matrix.copy()

In [31]:
# Quantile-normalize via the helper from my_functions (not shown); per the
# section heading this is applied across the columns — TODO confirm.
normalized_matrix = mf.quantileNormalize(normalized_matrix)

Step 2/2 progress: 100%  8555 Out of 8555   

In [32]:
normalized_matrix.head()

Unnamed: 0_level_0,"(Donor:GTEX-111CU, Gender:M, Age:50-59, Tissue:Adipose - Subcutaneous)","(Donor:GTEX-111FC, Gender:M, Age:60-69, Tissue:Adipose - Subcutaneous)","(Donor:GTEX-111VG, Gender:M, Age:60-69, Tissue:Adipose - Subcutaneous)","(Donor:GTEX-111YS, Gender:M, Age:60-69, Tissue:Adipose - Subcutaneous)","(Donor:GTEX-1122O, Gender:F, Age:60-69, Tissue:Adipose - Subcutaneous)","(Donor:GTEX-1128S, Gender:F, Age:60-69, Tissue:Adipose - Subcutaneous)","(Donor:GTEX-113IC, Gender:M, Age:60-69, Tissue:Adipose - Subcutaneous)","(Donor:GTEX-117YX, Gender:M, Age:50-59, Tissue:Adipose - Subcutaneous)","(Donor:GTEX-11DXW, Gender:M, Age:40-49, Tissue:Adipose - Subcutaneous)","(Donor:GTEX-11DXX, Gender:F, Age:60-69, Tissue:Adipose - Subcutaneous)",...,"(Donor:GTEX-ZVE2, Gender:M, Age:50-59, Tissue:Whole Blood)","(Donor:GTEX-ZVP2, Gender:M, Age:50-59, Tissue:Whole Blood)","(Donor:GTEX-ZVT2, Gender:F, Age:50-59, Tissue:Whole Blood)","(Donor:GTEX-ZVT3, Gender:F, Age:60-69, Tissue:Whole Blood)","(Donor:GTEX-ZVT4, Gender:F, Age:50-59, Tissue:Whole Blood)","(Donor:GTEX-ZVTK, Gender:M, Age:20-29, Tissue:Whole Blood)","(Donor:GTEX-ZVZP, Gender:M, Age:50-59, Tissue:Whole Blood)","(Donor:GTEX-ZVZQ, Gender:F, Age:60-69, Tissue:Whole Blood)","(Donor:GTEX-ZXES, Gender:F, Age:30-39, Tissue:Whole Blood)","(Donor:GTEX-ZXG5, Gender:M, Age:60-69, Tissue:Whole Blood)"
Gene Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,1.39099,1.206402,1.693003,0.713117,1.091747,0.911487,1.586214,0.704666,0.998262,0.878721,...,2.820968,3.99431,2.668434,3.227352,3.149834,1.154417,3.339131,2.718687,2.487828,1.77041
A1BG-AS1,0.895398,0.824654,0.825017,0.501564,1.010577,0.738648,0.85961,0.348926,0.865315,0.377619,...,1.047756,2.854908,2.168857,1.043826,1.368661,1.142232,2.987514,2.473603,1.920849,1.153951
A1CF,0.0,0.0,0.00458,0.021648,0.0,0.021404,0.009244,0.01295,0.019805,0.0,...,0.062856,0.032312,0.0,0.164209,0.042926,0.048027,0.0,0.69919,0.045365,0.076032
A2M,174.182655,263.007498,291.139614,273.117925,235.425136,204.978218,209.622322,253.644612,329.950476,195.20923,...,2.893217,1.444436,1.508096,2.00942,2.168857,1.504192,1.766084,2.857321,1.123246,0.817735
A2M-AS1,2.024696,2.252271,1.846706,2.150835,1.686406,1.603056,2.446727,1.813962,3.183666,2.288575,...,0.0,0.814103,0.569307,1.394174,0.081387,0.434952,0.470978,0.774585,0.53287,0.437752


## Normalize Matrix (z-score the rows)

In [33]:
# Z-score with the 'row' option. The return value is discarded and the next
# cell shows changed values, so the helper presumably edits normalized_matrix
# in place — TODO confirm against my_functions.
mf.zscore(normalized_matrix, 'row')

Progress: 100%  25563 Out of 25563   

In [34]:
normalized_matrix.head()

Unnamed: 0_level_0,"(Donor:GTEX-111CU, Gender:M, Age:50-59, Tissue:Adipose - Subcutaneous)","(Donor:GTEX-111FC, Gender:M, Age:60-69, Tissue:Adipose - Subcutaneous)","(Donor:GTEX-111VG, Gender:M, Age:60-69, Tissue:Adipose - Subcutaneous)","(Donor:GTEX-111YS, Gender:M, Age:60-69, Tissue:Adipose - Subcutaneous)","(Donor:GTEX-1122O, Gender:F, Age:60-69, Tissue:Adipose - Subcutaneous)","(Donor:GTEX-1128S, Gender:F, Age:60-69, Tissue:Adipose - Subcutaneous)","(Donor:GTEX-113IC, Gender:M, Age:60-69, Tissue:Adipose - Subcutaneous)","(Donor:GTEX-117YX, Gender:M, Age:50-59, Tissue:Adipose - Subcutaneous)","(Donor:GTEX-11DXW, Gender:M, Age:40-49, Tissue:Adipose - Subcutaneous)","(Donor:GTEX-11DXX, Gender:F, Age:60-69, Tissue:Adipose - Subcutaneous)",...,"(Donor:GTEX-ZVE2, Gender:M, Age:50-59, Tissue:Whole Blood)","(Donor:GTEX-ZVP2, Gender:M, Age:50-59, Tissue:Whole Blood)","(Donor:GTEX-ZVT2, Gender:F, Age:50-59, Tissue:Whole Blood)","(Donor:GTEX-ZVT3, Gender:F, Age:60-69, Tissue:Whole Blood)","(Donor:GTEX-ZVT4, Gender:F, Age:50-59, Tissue:Whole Blood)","(Donor:GTEX-ZVTK, Gender:M, Age:20-29, Tissue:Whole Blood)","(Donor:GTEX-ZVZP, Gender:M, Age:50-59, Tissue:Whole Blood)","(Donor:GTEX-ZVZQ, Gender:F, Age:60-69, Tissue:Whole Blood)","(Donor:GTEX-ZXES, Gender:F, Age:30-39, Tissue:Whole Blood)","(Donor:GTEX-ZXG5, Gender:M, Age:60-69, Tissue:Whole Blood)"
Gene Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,-0.118566,-0.124971,-0.108086,-0.142088,-0.12895,-0.135205,-0.111791,-0.142382,-0.132194,-0.136342,...,-0.068944,-0.028228,-0.074237,-0.054842,-0.057532,-0.126775,-0.050963,-0.072493,-0.080504,-0.105399
A1BG-AS1,-0.09815,-0.186434,-0.185981,-0.589629,0.045587,-0.293764,-0.14281,-0.780112,-0.135692,-0.744305,...,0.091984,2.347194,1.491047,0.087079,0.492452,0.209884,2.512677,1.871349,1.181548,0.224508
A1CF,-0.174084,-0.174084,-0.17275,-0.167781,-0.174084,-0.167852,-0.171392,-0.170313,-0.168317,-0.174084,...,-0.155782,-0.164676,-0.174084,-0.126271,-0.161585,-0.1601,-0.174084,0.029502,-0.160875,-0.151946
A2M,-0.055525,0.095257,0.143011,0.112419,0.048435,-0.003249,0.004635,0.079363,0.208893,-0.019832,...,-0.34629,-0.34875,-0.348642,-0.347791,-0.34752,-0.348648,-0.348204,-0.346351,-0.349295,-0.349813
A2M-AS1,0.239284,0.40247,0.111653,0.329734,-0.003292,-0.06306,0.541908,0.088173,1.070342,0.428502,...,-1.212557,-0.628791,-0.804327,-0.212842,-1.154197,-0.900668,-0.874835,-0.657129,-0.830454,-0.89866


## Save Filtered Sample Matrix To File

In [108]:
# Write the normalized sample matrix as a gzip-compressed TSV whose file name
# carries the current year and month (YYYY_MM).
# NOTE(review): the name ends in ".zip" while the content is gzip — confirm
# that no reader infers the compression from the extension.
filename = (
    '~/./Documents/Harmonizome/GTEx_Python_Script/Output/gtex_sample_matrix_filltered_%s.tsv.zip'
    % datetime.date.today().strftime('%Y_%m')
)
normalized_matrix.to_csv(filename, compression='gzip', sep='\t')

# Create Tertiary Matrix

In [35]:
# Derive the tertiary matrix from the normalized matrix using the helper in
# my_functions (not shown); the preview in the next cell contains the values
# -1, 0 and 1.
tertiary_matrix = mf.createTertiaryMarix(normalized_matrix)

Progeres: 100%  8555 Out of 8555   

In [36]:
tertiary_matrix.head()

Unnamed: 0_level_0,"(Donor:GTEX-111CU, Gender:M, Age:50-59, Tissue:Adipose - Subcutaneous)","(Donor:GTEX-111FC, Gender:M, Age:60-69, Tissue:Adipose - Subcutaneous)","(Donor:GTEX-111VG, Gender:M, Age:60-69, Tissue:Adipose - Subcutaneous)","(Donor:GTEX-111YS, Gender:M, Age:60-69, Tissue:Adipose - Subcutaneous)","(Donor:GTEX-1122O, Gender:F, Age:60-69, Tissue:Adipose - Subcutaneous)","(Donor:GTEX-1128S, Gender:F, Age:60-69, Tissue:Adipose - Subcutaneous)","(Donor:GTEX-113IC, Gender:M, Age:60-69, Tissue:Adipose - Subcutaneous)","(Donor:GTEX-117YX, Gender:M, Age:50-59, Tissue:Adipose - Subcutaneous)","(Donor:GTEX-11DXW, Gender:M, Age:40-49, Tissue:Adipose - Subcutaneous)","(Donor:GTEX-11DXX, Gender:F, Age:60-69, Tissue:Adipose - Subcutaneous)",...,"(Donor:GTEX-ZVE2, Gender:M, Age:50-59, Tissue:Whole Blood)","(Donor:GTEX-ZVP2, Gender:M, Age:50-59, Tissue:Whole Blood)","(Donor:GTEX-ZVT2, Gender:F, Age:50-59, Tissue:Whole Blood)","(Donor:GTEX-ZVT3, Gender:F, Age:60-69, Tissue:Whole Blood)","(Donor:GTEX-ZVT4, Gender:F, Age:50-59, Tissue:Whole Blood)","(Donor:GTEX-ZVTK, Gender:M, Age:20-29, Tissue:Whole Blood)","(Donor:GTEX-ZVZP, Gender:M, Age:50-59, Tissue:Whole Blood)","(Donor:GTEX-ZVZQ, Gender:F, Age:60-69, Tissue:Whole Blood)","(Donor:GTEX-ZXES, Gender:F, Age:30-39, Tissue:Whole Blood)","(Donor:GTEX-ZXG5, Gender:M, Age:60-69, Tissue:Whole Blood)"
Gene Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A1BG-AS1,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,-1.0,0.0,-1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
A1CF,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A2M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A2M-AS1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,-1.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,-1.0


## Save Tertiary Matrix

In [111]:
# Write the tertiary matrix as a gzip-compressed TSV whose file name carries
# the current year and month (YYYY_MM).
# NOTE(review): the name ends in ".zip" while the content is gzip — confirm
# that no reader infers the compression from the extension.
filename = (
    '~/./Documents/Harmonizome/GTEx_Python_Script/Output/gtex_sample_tertiary_matrix_%s.tsv.zip'
    % datetime.date.today().strftime('%Y_%m')
)
tertiary_matrix.to_csv(filename, compression='gzip', sep='\t')

#### Create matrix with simplified column names for libraries

In [37]:
# Re-read only the header row of the expression file (nrows=0) to recover the
# original sample-ID column labels.
file = 'Input/GTEx_Analysis_v6p_RNA-seq_RNA-SeQCv1.1.8_gene_rpkm.gct'
col = pd.read_csv(file, sep='\t' ,skiprows=2, nrows=0)

# Copy the tertiary matrix and relabel its columns positionally with the header
# labels after the first two (per the load preview those are 'Name' and
# 'Description'). This relies on the column order being unchanged since
# loading — NOTE(review): confirm the mf helpers preserve column order.
tertiary_matrix_simple_col = tertiary_matrix.copy()
tertiary_matrix_simple_col.columns = col.columns[2:]

In [38]:
tertiary_matrix_simple_col.head()

Unnamed: 0_level_0,GTEX-111CU-1826-SM-5GZYN,GTEX-111FC-0226-SM-5N9B8,GTEX-111VG-2326-SM-5N9BK,GTEX-111YS-2426-SM-5GZZQ,GTEX-1122O-2026-SM-5NQ91,GTEX-1128S-2126-SM-5H12U,GTEX-113IC-0226-SM-5HL5C,GTEX-117YX-2226-SM-5EGJJ,GTEX-11DXW-0326-SM-5H11W,GTEX-11DXX-2326-SM-5Q5A2,...,GTEX-ZVE2-0006-SM-51MRW,GTEX-ZVP2-0005-SM-51MRK,GTEX-ZVT2-0005-SM-57WBW,GTEX-ZVT3-0006-SM-51MT9,GTEX-ZVT4-0006-SM-57WB8,GTEX-ZVTK-0006-SM-57WBK,GTEX-ZVZP-0006-SM-51MSW,GTEX-ZVZQ-0006-SM-51MR8,GTEX-ZXES-0005-SM-57WCB,GTEX-ZXG5-0005-SM-57WCN
Gene Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A1BG-AS1,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,-1.0,0.0,-1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
A1CF,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A2M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A2M-AS1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,-1.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,-1.0


#### Path to output files

In [39]:
# Output directory handed to the mf.create*SetLib helpers below.
# NOTE(review): this is an absolute, user-specific path, whereas the to_csv
# cells use a '~'-relative path (presumably the same Output folder) — the
# notebook will not run unchanged on another machine.
path = '/Users/moshesilverstein/Documents/Harmonizome/GTEx_Python_Script/Output/'

# Create Up Gene Set Library

In [152]:
# Base name passed as the `name` argument to mf.createUpGeneSetLib in the next cell.
name = 'gtex_sample_gene_up_set'

In [155]:
# Passes the sample-ID-labelled tertiary matrix, the output directory and the
# base name; `details` carries the descriptive column labels of tertiary_matrix.
# What the helper writes is defined in my_functions (not shown).
mf.createUpGeneSetLib(tertiary_matrix_simple_col, path, name, details=tertiary_matrix.columns.tolist())

Progeres: 100%  8555 Out of 8555   

# Create Down Gene Set Library

In [41]:
# Base name passed as the `name` argument to mf.createDownGeneSetLib in the next cell.
name = 'gtex_sample_gene_down_set'

In [42]:
# Passes the sample-ID-labelled tertiary matrix, the output directory and the
# base name; `details` carries the descriptive column labels of tertiary_matrix.
# What the helper writes is defined in my_functions (not shown).
mf.createDownGeneSetLib(tertiary_matrix_simple_col, path, name, details=tertiary_matrix.columns.tolist())

Progeres: 100%  8555 Out of 8555   

# Create Up Attribute Library

In [160]:
# Base name passed as the `name` argument to mf.createUpAttributeSetLib in the next cell.
name = 'gtex_sample_attribute_up_set'

In [161]:
# Passes the sample-ID-labelled tertiary matrix, the output directory and the
# base name. What the helper writes is defined in my_functions (not shown).
mf.createUpAttributeSetLib(tertiary_matrix_simple_col, path, name)

Progeres: 100%  25563 Out of 25563   

# Create Down Attribute Library

In [43]:
# Base name passed as the `name` argument to mf.createDownAttributeSetLib in the next cell.
name = 'gtex_sample_attribute_down_set'

In [44]:
# Passes the sample-ID-labelled tertiary matrix, the output directory and the
# base name. What the helper writes is defined in my_functions (not shown).
mf.createDownAttributeSetLib(tertiary_matrix_simple_col, path, name)

Progeres: 100%  25563 Out of 25563   

# Create Gene Similarity Matrix

In [45]:
# Build a similarity matrix with the 'cosine' option from the normalized
# matrix; the preview in the next cell is indexed by gene symbol on both axes.
# Helper defined in my_functions (not shown).
gene_similarity_matix = mf.createSimilarityMatrix(normalized_matrix, 'cosine')

In [46]:
gene_similarity_matix.head()

Gene Symbol,A1BG,A1BG-AS1,A1CF,A2M,A2M-AS1,A2ML1,A2ML1-AS1,A2MP1,A3GALT2,A4GALT,...,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3
Gene Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,1.0,0.365238,0.882954,0.003886,0.179384,-0.033727,-0.031597,0.050353,-0.016965,-0.123022,...,-0.02973,-0.023906,-0.049821,0.369333,0.183995,0.206839,0.017458,-0.086731,-0.078704,0.057218
A1BG-AS1,0.365238,1.0,0.305397,0.001667,-0.026308,-0.12468,-0.078224,-0.074271,0.281922,-0.15871,...,0.067529,0.163941,0.045196,0.164588,0.0209,0.198019,-0.098823,0.259465,-0.108671,0.115304
A1CF,0.882954,0.305397,1.0,-0.010103,0.110563,-0.041088,-0.03937,0.017114,-0.03523,-0.165035,...,-0.032612,-0.005864,-0.058956,0.350361,0.220884,0.176226,0.001266,-0.117599,0.092365,0.038317
A2M,0.003886,0.001667,-0.010103,1.0,0.271196,-0.060227,-0.048228,0.019431,-0.039331,0.115488,...,-0.072909,-0.083038,-0.076248,-0.047639,-0.071266,-0.075654,-0.091549,0.158334,-0.018932,-0.062636
A2M-AS1,0.179384,-0.026308,0.110563,0.271196,1.0,-0.190845,-0.117178,0.241012,-0.102034,0.403801,...,-0.223525,-0.294977,0.199693,0.115869,-0.076744,-0.186943,0.039717,0.189163,-0.038878,0.100883


## Save Gene Similarity Matrix 

In [47]:
# Write the gene similarity matrix as a gzip-compressed TSV whose file name
# carries the current year and month (YYYY_MM).
# NOTE(review): the name ends in ".zip" while the content is gzip — confirm
# that no reader infers the compression from the extension.
filename = (
    '~/./Documents/Harmonizome/GTEx_Python_Script/Output/gtex_sample_gene_similarity_matix_%s.tsv.zip'
    % datetime.date.today().strftime('%Y_%m')
)
gene_similarity_matix.to_csv(filename, compression='gzip', sep='\t')

# Create Attribute Similarity matrix

#### Create matrix with simplified column names for attribute

In [49]:
# Re-read only the header row of the expression file (nrows=0) to recover the
# original sample-ID column labels.
file = 'Input/GTEx_Analysis_v6p_RNA-seq_RNA-SeQCv1.1.8_gene_rpkm.gct'
col = pd.read_csv(file, sep='\t' ,skiprows=2, nrows=0)

# Copy the normalized matrix and relabel its columns positionally with the
# header labels after the first two (per the load preview those are 'Name' and
# 'Description'). This relies on the column order being unchanged since
# loading — NOTE(review): confirm the mf helpers preserve column order.
normalized_matrix_simple_col = normalized_matrix.copy()
normalized_matrix_simple_col.columns = col.columns[2:]

In [50]:
# Same helper as for the gene similarity matrix, but fed the transposed matrix
# so that samples are the rows; the preview in the next cell is indexed by
# sample ID on both axes.
attribute_similarity_matix = mf.createSimilarityMatrix(normalized_matrix_simple_col.T, 'cosine')

In [51]:
attribute_similarity_matix.head()

Unnamed: 0,GTEX-111CU-1826-SM-5GZYN,GTEX-111FC-0226-SM-5N9B8,GTEX-111VG-2326-SM-5N9BK,GTEX-111YS-2426-SM-5GZZQ,GTEX-1122O-2026-SM-5NQ91,GTEX-1128S-2126-SM-5H12U,GTEX-113IC-0226-SM-5HL5C,GTEX-117YX-2226-SM-5EGJJ,GTEX-11DXW-0326-SM-5H11W,GTEX-11DXX-2326-SM-5Q5A2,...,GTEX-ZVE2-0006-SM-51MRW,GTEX-ZVP2-0005-SM-51MRK,GTEX-ZVT2-0005-SM-57WBW,GTEX-ZVT3-0006-SM-51MT9,GTEX-ZVT4-0006-SM-57WB8,GTEX-ZVTK-0006-SM-57WBK,GTEX-ZVZP-0006-SM-51MSW,GTEX-ZVZQ-0006-SM-51MR8,GTEX-ZXES-0005-SM-57WCB,GTEX-ZXG5-0005-SM-57WCN
GTEX-111CU-1826-SM-5GZYN,1.0,0.401957,0.364976,0.502307,0.487537,0.495569,0.430883,0.416425,0.351929,0.54439,...,0.012688,0.078528,0.028819,0.001348,0.019734,0.017371,0.025781,-0.001843,0.02134,0.022801
GTEX-111FC-0226-SM-5N9B8,0.401957,1.0,0.306808,0.382541,0.313275,0.285828,0.589088,0.238485,0.549767,0.195671,...,-0.083439,-0.051779,-0.061273,-0.064501,-0.085032,-0.072966,-0.069495,-0.084648,-0.073214,-0.064986
GTEX-111VG-2326-SM-5N9BK,0.364976,0.306808,1.0,0.243179,0.212508,0.291648,0.423739,0.139178,0.311516,0.195656,...,0.114536,0.140076,0.075707,0.148009,0.14461,0.145369,0.07039,0.105556,0.074454,0.133563
GTEX-111YS-2426-SM-5GZZQ,0.502307,0.382541,0.243179,1.0,0.680274,0.591048,0.352426,0.554932,0.39407,0.525094,...,-0.021032,-0.018367,-0.004344,-0.06322,-0.019875,-0.019581,-0.02719,-0.083941,-0.032772,-0.008329
GTEX-1122O-2026-SM-5NQ91,0.487537,0.313275,0.212508,0.680274,1.0,0.67044,0.30569,0.536968,0.31016,0.580017,...,-0.015503,0.003498,0.008937,-0.058096,-0.029755,-0.047346,-0.009299,-0.085781,-0.012281,-0.021405


## Save Attribute Similarity Matrix

In [52]:
# Write the attribute similarity matrix as a gzip-compressed TSV whose file
# name carries the current year and month (YYYY_MM).
# NOTE(review): the name ends in ".zip" while the content is gzip — confirm
# that no reader infers the compression from the extension.
filename = (
    '~/./Documents/Harmonizome/GTEx_Python_Script/Output/gtex_sample_attribute_similarity_matix_%s.tsv.zip'
    % datetime.date.today().strftime('%Y_%m')
)
attribute_similarity_matix.to_csv(filename, compression='gzip', sep='\t')

# Create Gene List

In [53]:
# Build the gene list from the normalized matrix via the helper in
# my_functions (not shown); the preview in the next cell has the columns
# GeneSym and GeneID.
gene_list = mf.createGeneList(normalized_matrix)

Progeres: 100%  25563 Out of 25563   

In [54]:
gene_list.head()

Unnamed: 0,GeneSym,GeneID
0,A1BG,1
1,A1BG-AS1,503538
2,A1CF,29974
3,A2M,2
4,A2M-AS1,144571


In [55]:
gene_list.shape

(25563, 2)

### Save Gene List

In [56]:
# Write the gene list (without its index) as a gzip-compressed TSV whose file
# name carries the current year and month (YYYY_MM).
# NOTE(review): the name ends in ".zip" while the content is gzip — confirm
# that no reader infers the compression from the extension.
filename = (
    '~/./Documents/Harmonizome/GTEx_Python_Script/Output/gtex_sample_gene_list_%s.tsv.zip'
    % datetime.date.today().strftime('%Y_%m')
)
gene_list.to_csv(filename, index=False, compression='gzip', sep='\t')

# Create Attribute List 

In [57]:
# Build the attribute list from the sample-ID-labelled normalized matrix via
# the helper in my_functions (not shown); the preview in the next cell has a
# single 'Attributes' column of sample IDs.
attribute_list = mf.createAttributeList(normalized_matrix_simple_col)

In [58]:
attribute_list.head()

Unnamed: 0,Attributes
0,GTEX-111CU-1826-SM-5GZYN
1,GTEX-111FC-0226-SM-5N9B8
2,GTEX-111VG-2326-SM-5N9BK
3,GTEX-111YS-2426-SM-5GZZQ
4,GTEX-1122O-2026-SM-5NQ91


In [59]:
attribute_list.shape

(8555, 1)

### Save Attribute List

In [60]:
# Write the attribute list (without its index) as a gzip-compressed TSV whose
# file name carries the current year and month (YYYY_MM).
# NOTE(review): the name ends in ".zip" while the content is gzip — confirm
# that no reader infers the compression from the extension.
filename = (
    '~/./Documents/Harmonizome/GTEx_Python_Script/Output/gtex_sample_attribute_list_%s.tsv.zip'
    % datetime.date.today().strftime('%Y_%m')
)
attribute_list.to_csv(filename, index=False, compression='gzip', sep='\t')

# Create Gene-Attribute Edge List

In [61]:
# Build the gene-attribute edge list from the sample-ID-labelled tertiary
# matrix and the gene list via the helper in my_functions (not shown); the
# preview in the next cell has the columns Attribute, Gene, GeneID and Weight.
gene_attribute_edge_list = mf.createGeneAttributeEdgeList(tertiary_matrix_simple_col, gene_list)

Progeres: 100%  8555 Out of 8555   

In [62]:
gene_attribute_edge_list.head()

Unnamed: 0,Attribute,Gene,GeneID,Weight
0,GTEX-111CU-1826-SM-5GZYN,A1BG,1,0.0
1,GTEX-111CU-1826-SM-5GZYN,A1BG-AS1,503538,0.0
2,GTEX-111CU-1826-SM-5GZYN,A1CF,29974,0.0
3,GTEX-111CU-1826-SM-5GZYN,A2M,2,0.0
4,GTEX-111CU-1826-SM-5GZYN,A2M-AS1,144571,0.0


In [63]:
gene_attribute_edge_list.shape

(218691465, 4)

### Get Number of (Statistically Relevant) Gene-Attribute Associations

In [64]:
# Keep only the edges whose Weight is non-zero; the first element of the
# displayed shape is the number of such edges.
gene_attribute_edge_list[gene_attribute_edge_list['Weight'] != 0].shape

(43741715, 4)

### Save Gene-Attribute Edge List

In [65]:
# Write the full gene-attribute edge list (without its index) as a
# gzip-compressed TSV whose file name carries the current year and month
# (YYYY_MM).
# NOTE(review): the name ends in ".zip" while the content is gzip — confirm
# that no reader infers the compression from the extension.
filename = (
    '~/./Documents/Harmonizome/GTEx_Python_Script/Output/gtex_sample_gene_attribute_edge_list_%s.tsv.zip'
    % datetime.date.today().strftime('%Y_%m')
)
gene_attribute_edge_list.to_csv(filename, index=False, compression='gzip', sep='\t')

# Tissue Data

In [67]:
# Start the tissue-level analysis from a copy of `matrix` (normalization above
# was applied to a separate copy), so that relabeling the columns below leaves
# `matrix` untouched.
tissue_matrix = matrix.copy()

In [68]:
tissue_matrix.head()

Unnamed: 0_level_0,"(Donor:GTEX-111CU, Gender:M, Age:50-59, Tissue:Adipose - Subcutaneous)","(Donor:GTEX-111FC, Gender:M, Age:60-69, Tissue:Adipose - Subcutaneous)","(Donor:GTEX-111VG, Gender:M, Age:60-69, Tissue:Adipose - Subcutaneous)","(Donor:GTEX-111YS, Gender:M, Age:60-69, Tissue:Adipose - Subcutaneous)","(Donor:GTEX-1122O, Gender:F, Age:60-69, Tissue:Adipose - Subcutaneous)","(Donor:GTEX-1128S, Gender:F, Age:60-69, Tissue:Adipose - Subcutaneous)","(Donor:GTEX-113IC, Gender:M, Age:60-69, Tissue:Adipose - Subcutaneous)","(Donor:GTEX-117YX, Gender:M, Age:50-59, Tissue:Adipose - Subcutaneous)","(Donor:GTEX-11DXW, Gender:M, Age:40-49, Tissue:Adipose - Subcutaneous)","(Donor:GTEX-11DXX, Gender:F, Age:60-69, Tissue:Adipose - Subcutaneous)",...,"(Donor:GTEX-ZVE2, Gender:M, Age:50-59, Tissue:Whole Blood)","(Donor:GTEX-ZVP2, Gender:M, Age:50-59, Tissue:Whole Blood)","(Donor:GTEX-ZVT2, Gender:F, Age:50-59, Tissue:Whole Blood)","(Donor:GTEX-ZVT3, Gender:F, Age:60-69, Tissue:Whole Blood)","(Donor:GTEX-ZVT4, Gender:F, Age:50-59, Tissue:Whole Blood)","(Donor:GTEX-ZVTK, Gender:M, Age:20-29, Tissue:Whole Blood)","(Donor:GTEX-ZVZP, Gender:M, Age:50-59, Tissue:Whole Blood)","(Donor:GTEX-ZVZQ, Gender:F, Age:60-69, Tissue:Whole Blood)","(Donor:GTEX-ZXES, Gender:F, Age:30-39, Tissue:Whole Blood)","(Donor:GTEX-ZXG5, Gender:M, Age:60-69, Tissue:Whole Blood)"
Gene Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,1.45904,1.505384,2.04869,0.595234,0.96475,0.77931,2.074153,0.587864,1.095912,0.665989,...,0.481572,1.994508,0.637642,1.057956,1.037017,0.385123,0.857898,0.836602,0.825334,0.596233
A1BG-AS1,0.856568,0.962106,0.974077,0.401879,0.885118,0.604135,1.10064,0.256644,0.946916,0.243068,...,0.152733,1.188405,0.475428,0.263927,0.381991,0.381954,0.720083,0.750913,0.582638,0.342768
A1CF,0.0,0.0,0.005385,0.010977,0.0,0.00462,0.002215,0.003925,0.020054,0.0,...,0.003594,0.002863,0.0,0.035323,0.004328,0.005311,0.0,0.174263,0.00557,0.014979
A2M,212.201385,309.729248,358.070068,253.837097,226.042175,227.005432,252.30043,272.957306,349.170715,228.335312,...,0.499567,0.440703,0.299877,0.581543,0.639656,0.518631,0.334381,0.888483,0.3038,0.220731
A2M-AS1,2.25861,2.95156,2.239747,2.14502,1.667437,1.536585,3.170306,1.764703,3.62862,2.147331,...,0.0,0.203534,0.093819,0.370364,0.019853,0.133997,0.069347,0.190764,0.127751,0.111661


In [69]:
tissue_matrix.shape

(25563, 8555)

## Change Columns To Show Only Tissue Information

In [72]:
# Every column label is a tuple whose element at index 3 is the 'Tissue:<name>'
# entry; keep only the text between the first and second ':' as the new label.
lst = [label[3].split(':')[1] for label in tissue_matrix.columns]

tissue_matrix.columns = lst

In [73]:
tissue_matrix.head()

Unnamed: 0_level_0,Adipose - Subcutaneous,Adipose - Subcutaneous,Adipose - Subcutaneous,Adipose - Subcutaneous,Adipose - Subcutaneous,Adipose - Subcutaneous,Adipose - Subcutaneous,Adipose - Subcutaneous,Adipose - Subcutaneous,Adipose - Subcutaneous,...,Whole Blood,Whole Blood,Whole Blood,Whole Blood,Whole Blood,Whole Blood,Whole Blood,Whole Blood,Whole Blood,Whole Blood
Gene Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,1.45904,1.505384,2.04869,0.595234,0.96475,0.77931,2.074153,0.587864,1.095912,0.665989,...,0.481572,1.994508,0.637642,1.057956,1.037017,0.385123,0.857898,0.836602,0.825334,0.596233
A1BG-AS1,0.856568,0.962106,0.974077,0.401879,0.885118,0.604135,1.10064,0.256644,0.946916,0.243068,...,0.152733,1.188405,0.475428,0.263927,0.381991,0.381954,0.720083,0.750913,0.582638,0.342768
A1CF,0.0,0.0,0.005385,0.010977,0.0,0.00462,0.002215,0.003925,0.020054,0.0,...,0.003594,0.002863,0.0,0.035323,0.004328,0.005311,0.0,0.174263,0.00557,0.014979
A2M,212.201385,309.729248,358.070068,253.837097,226.042175,227.005432,252.30043,272.957306,349.170715,228.335312,...,0.499567,0.440703,0.299877,0.581543,0.639656,0.518631,0.334381,0.888483,0.3038,0.220731
A2M-AS1,2.25861,2.95156,2.239747,2.14502,1.667437,1.536585,3.170306,1.764703,3.62862,2.147331,...,0.0,0.203534,0.093819,0.370364,0.019853,0.133997,0.069347,0.190764,0.127751,0.111661


## Merge Like Columns (by taking the mean)

In [74]:
# Collapse the columns that share a tissue label into a single column per tissue.
# NOTE(review): 'column' and 'mean' presumably select the axis and the
# aggregation; mf.merge is defined in my_functions - confirm there.
# tissue_matrix is rebound here, so the per-sample matrix is replaced by the merged one.
tissue_matrix = mf.merge(tissue_matrix, 'column', 'mean')

In [75]:
# Dimensions after merging like columns.
tissue_matrix.shape

(25563, 53)

## Save Unfiltered Tissue Matrix To File

In [76]:
# Write the merged tissue matrix (before normalization) as a gzip-compressed TSV.
# The suffix is .gz because the payload is gzip, not zip: with a .zip name,
# tools that infer the compression from the extension cannot open the file.
date_tag = datetime.date.today().strftime('%Y_%m')  # year and month, e.g. '2017_07'
filename = '~/./Documents/Harmonizome/GTEx_Python_Script/Output/gtex_tissue_matrix_unfilltered_%s.tsv.gz' % date_tag
tissue_matrix.to_csv(filename, sep='\t', compression='gzip')

# Filtered Tissue Matrix

## Normalize Matrix (Quantile Normalize the matrix for the columns)

In [77]:
# Work on a copy so tissue_matrix keeps the un-normalized values.
normalized_tissue_matrix = tissue_matrix.copy()

In [78]:
# Quantile-normalize the matrix (over the columns, per the section heading).
# NOTE(review): the implementation is my_functions.quantileNormalize - confirm the axis there.
normalized_tissue_matrix = mf.quantileNormalize(normalized_tissue_matrix)

Step 2/2 progress: 100%  53 Out of 53   

In [79]:
# Preview the quantile-normalized values.
normalized_tissue_matrix.head()

Unnamed: 0_level_0,Adipose - Subcutaneous,Adipose - Visceral (Omentum),Adrenal Gland,Artery - Aorta,Artery - Coronary,Artery - Tibial,Bladder,Brain - Amygdala,Brain - Anterior cingulate cortex (BA24),Brain - Caudate (basal ganglia),...,Skin - Not Sun Exposed (Suprapubic),Skin - Sun Exposed (Lower leg),Small Intestine - Terminal Ileum,Spleen,Stomach,Testis,Thyroid,Uterus,Vagina,Whole Blood
Gene Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,1.208744,1.415101,0.849489,1.390583,2.518175,2.406906,0.503637,2.477348,1.77819,2.233247,...,0.907719,0.798094,0.905191,2.278792,0.59835,0.128034,1.200575,2.227686,1.701714,2.456508
A1BG-AS1,0.651623,0.755501,0.63117,0.909387,1.670071,1.515457,0.401378,0.663335,1.256861,0.781954,...,0.579429,0.483133,0.793571,1.887458,0.426379,0.040569,0.9411,1.676179,0.947237,2.12157
A1CF,0.005316,0.013918,0.016709,0.010085,0.008234,0.014051,0.010062,0.002443,0.003464,0.004387,...,0.006483,0.006316,4.301813,0.005873,0.147932,0.002221,0.006737,0.006094,0.005214,0.205095
A2M,228.42977,246.997952,83.234194,509.969867,391.400156,378.613934,360.134746,40.897361,30.19457,50.37548,...,36.399245,49.177524,125.364453,120.609981,117.355696,24.34325,178.149124,216.839346,108.157764,3.331791
A2M-AS1,2.192827,1.823415,2.781015,3.254069,2.74218,2.922649,2.048982,0.881582,0.826593,0.848087,...,0.465272,0.586984,0.807712,1.889373,1.172446,0.717759,1.072474,4.054502,1.646128,0.66007


## Normalize Matrix (z-score the rows)

In [80]:
# Z-score the rows of the quantile-normalized matrix.
# The return value is not assigned; the preview in the next cell shows changed
# values, so mf.zscore evidently modifies normalized_tissue_matrix in place.
mf.zscore(normalized_tissue_matrix, 'row')

Progress: 100%  25563 Out of 25563   

In [81]:
# Preview after row-wise z-scoring (same variable as before the z-score step).
normalized_tissue_matrix.head()

Unnamed: 0_level_0,Adipose - Subcutaneous,Adipose - Visceral (Omentum),Adrenal Gland,Artery - Aorta,Artery - Coronary,Artery - Tibial,Bladder,Brain - Amygdala,Brain - Anterior cingulate cortex (BA24),Brain - Caudate (basal ganglia),...,Skin - Not Sun Exposed (Suprapubic),Skin - Sun Exposed (Lower leg),Small Intestine - Terminal Ileum,Spleen,Stomach,Testis,Thyroid,Uterus,Vagina,Whole Blood
Gene Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,-0.154891,-0.148422,-0.166152,-0.149191,-0.113844,-0.117332,-0.176994,-0.115124,-0.13704,-0.122776,...,-0.164327,-0.167764,-0.164406,-0.121348,-0.174025,-0.188768,-0.155147,-0.12295,-0.139438,-0.115777
A1BG-AS1,-0.612641,-0.462196,-0.642262,-0.239324,0.862365,0.638439,-0.975067,-0.595678,0.263918,-0.423884,...,-0.717198,-0.856663,-0.40706,1.177203,-0.938858,-1.497623,-0.193395,0.871211,-0.184506,1.516264
A1CF,-0.199906,-0.197603,-0.196856,-0.198629,-0.199125,-0.197568,-0.198636,-0.200676,-0.200402,-0.200155,...,-0.199594,-0.199639,0.950583,-0.199757,-0.161718,-0.200735,-0.199526,-0.199698,-0.199934,-0.146411
A2M,0.394586,0.488829,-0.342358,1.82355,1.221746,1.156849,1.063058,-0.557239,-0.611562,-0.509133,...,-0.58007,-0.515213,-0.128525,-0.152656,-0.169173,-0.64126,0.139385,0.335759,-0.215858,-0.747904
A2M-AS1,0.390815,0.037761,0.952956,1.405062,0.915841,1.088319,0.25334,-0.862365,-0.914919,-0.894376,...,-1.26024,-1.143917,-0.932963,0.100799,-0.584381,-1.018933,-0.679926,2.17005,-0.131674,-1.074068


## Save Filtered Tissue Matrix To File

In [82]:
# Write the normalized (quantile-normalized, then z-scored) tissue matrix as a
# gzip-compressed TSV. The suffix is .gz because the payload is gzip, not zip:
# with a .zip name, tools that infer compression from the extension cannot open it.
date_tag = datetime.date.today().strftime('%Y_%m')  # year and month, e.g. '2017_07'
filename = '~/./Documents/Harmonizome/GTEx_Python_Script/Output/gtex_tissue_matrix_filltered_%s.tsv.gz' % date_tag
normalized_tissue_matrix.to_csv(filename, sep='\t', compression='gzip')

# Create Tertiary Matrix

In [83]:
# Derive the tertiary matrix from the z-scored matrix; the preview that follows
# shows entries of -1.0, 0.0 and 1.0.
# NOTE(review): the rule that assigns those values lives in
# my_functions.createTertiaryMarix - confirm there.
tertiary_tissue_matrix = mf.createTertiaryMarix(normalized_tissue_matrix)

Progeres: 100%  53 Out of 53   

In [84]:
# Preview the tertiary matrix.
tertiary_tissue_matrix.head()

Unnamed: 0_level_0,Adipose - Subcutaneous,Adipose - Visceral (Omentum),Adrenal Gland,Artery - Aorta,Artery - Coronary,Artery - Tibial,Bladder,Brain - Amygdala,Brain - Anterior cingulate cortex (BA24),Brain - Caudate (basal ganglia),...,Skin - Not Sun Exposed (Suprapubic),Skin - Sun Exposed (Lower leg),Small Intestine - Terminal Ileum,Spleen,Stomach,Testis,Thyroid,Uterus,Vagina,Whole Blood
Gene Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A1BG-AS1,0.0,0.0,0.0,0.0,1.0,0.0,-1.0,0.0,0.0,0.0,...,0.0,-1.0,0.0,1.0,-1.0,-1.0,0.0,0.0,0.0,0.0
A1CF,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A2M,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A2M-AS1,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,-1.0,-1.0,-1.0,0.0,0.0,0.0,-1.0,1.0,0.0,0.0


## Save Tertiary Matrix

In [85]:
# Write the tertiary matrix as a gzip-compressed TSV.
# The suffix is .gz because the payload is gzip, not zip: with a .zip name,
# tools that infer the compression from the extension cannot open the file.
date_tag = datetime.date.today().strftime('%Y_%m')  # year and month, e.g. '2017_07'
filename = '~/./Documents/Harmonizome/GTEx_Python_Script/Output/gtex_tissue_tertiary_matrix_%s.tsv.gz' % date_tag
tertiary_tissue_matrix.to_csv(filename, sep='\t', compression='gzip')

#### Path to output files

In [87]:
# Output directory passed to the mf.create*SetLib calls in the following cells.
# Built from the current user's home directory instead of a hard-coded
# /Users/<name>/ prefix, so the notebook is not tied to one machine; on the
# original machine the resulting string is unchanged.
import os
path = os.path.expanduser('~/Documents/Harmonizome/GTEx_Python_Script/Output/')

# Create Up Gene Set Library

In [88]:
# Name passed to mf.createUpGeneSetLib in the next cell. `name` is reassigned
# before each later library call, so these cells must be run in order.
name = 'gtex_tissue_gene_up_set'

In [89]:
# Build the "up" gene-set library from the tertiary matrix.
# NOTE(review): nothing is returned to the notebook, so this presumably writes
# a file under `path` using `name` - confirm in my_functions.
mf.createUpGeneSetLib(tertiary_tissue_matrix, path, name)

Progeres: 100%  53 Out of 53   

# Create Down Gene Set Library

In [90]:
# Name passed to mf.createDownGeneSetLib in the next cell (replaces the previous value of `name`).
name = 'gtex_tissue_gene_down_set'

In [91]:
# Build the "down" gene-set library from the tertiary matrix.
# NOTE(review): nothing is returned to the notebook, so this presumably writes
# a file under `path` using `name` - confirm in my_functions.
mf.createDownGeneSetLib(tertiary_tissue_matrix, path, name)

Progeres: 100%  53 Out of 53   

# Create Up Attribute Library

In [93]:
# Name passed to mf.createUpAttributeSetLib in the next cell (replaces the previous value of `name`).
name = 'gtex_tissue_attribute_up_set'

In [94]:
# Build the "up" attribute-set library from the tertiary matrix.
# NOTE(review): nothing is returned to the notebook, so this presumably writes
# a file under `path` using `name` - confirm in my_functions.
mf.createUpAttributeSetLib(tertiary_tissue_matrix, path, name)

Progeres: 100%  25563 Out of 25563   

# Create Down Attribute Library

In [95]:
# Name passed to mf.createDownAttributeSetLib in the next cell (replaces the previous value of `name`).
name = 'gtex_tissue_attribute_down_set'

In [96]:
# Build the "down" attribute-set library from the tertiary matrix.
# NOTE(review): nothing is returned to the notebook, so this presumably writes
# a file under `path` using `name` - confirm in my_functions.
mf.createDownAttributeSetLib(tertiary_tissue_matrix, path, name)

Progeres: 100%  25563 Out of 25563   

# Create Gene Similarity Matrix

In [98]:
# Gene-by-gene similarity computed from the z-scored matrix with the 'cosine'
# option; the preview that follows is indexed by gene symbol on both axes.
gene_tissue_similarity_matix = mf.createSimilarityMatrix(normalized_tissue_matrix, 'cosine')

In [99]:
# Preview the gene similarity matrix.
gene_tissue_similarity_matix.head()

Gene Symbol,A1BG,A1BG-AS1,A1CF,A2M,A2M-AS1,A2ML1,A2ML1-AS1,A2MP1,A3GALT2,A4GALT,...,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3
Gene Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,1.0,0.489179,0.96999,0.072316,0.307547,-0.044048,-0.076856,0.094917,-0.030419,-0.158467,...,-0.029714,-0.01738,-0.058227,0.6067,0.316085,0.26132,0.056219,-0.122183,-0.08523,0.111985
A1BG-AS1,0.489179,1.0,0.421178,-0.022467,0.113174,-0.134407,-0.091872,-0.065978,0.266725,-0.185902,...,0.092768,0.247134,0.233098,0.470485,0.068187,0.350925,-0.008085,0.117278,-0.11169,0.263607
A1CF,0.96999,0.421178,1.0,0.065284,0.253487,-0.05727,-0.101408,0.048739,-0.045723,-0.17046,...,-0.039265,-0.006949,-0.074792,0.584022,0.37217,0.2372,0.01035,-0.131107,0.081371,0.074555
A2M,0.072316,-0.022467,0.065284,1.0,0.443471,-0.098798,-0.159357,0.057757,-0.052376,0.359037,...,-0.153694,-0.170615,-0.025017,0.01919,0.08484,-0.111746,-0.19363,0.35294,0.076582,-0.010967
A2M-AS1,0.307547,0.113174,0.253487,0.443471,1.0,-0.204516,-0.208569,0.387755,-0.035539,0.524137,...,-0.263733,-0.384142,0.339639,0.355063,0.212319,-0.222619,0.099239,0.443253,0.087369,0.258583


## Save Gene Similarity Matrix 

In [100]:
# Write the gene similarity matrix as a gzip-compressed TSV.
# The suffix is .gz because the payload is gzip, not zip: with a .zip name,
# tools that infer the compression from the extension cannot open the file.
date_tag = datetime.date.today().strftime('%Y_%m')  # year and month, e.g. '2017_07'
filename = '~/./Documents/Harmonizome/GTEx_Python_Script/Output/gtex_tissue_gene_similarity_matix_%s.tsv.gz' % date_tag
gene_tissue_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Attribute Similarity Matrix

In [102]:
# Same similarity routine applied to the transposed matrix, which makes the
# tissues the rows; the preview that follows is tissue-by-tissue.
attribute_tissue_similarity_matix = mf.createSimilarityMatrix(normalized_tissue_matrix.T, 'cosine')

In [103]:
# Preview the attribute (tissue) similarity matrix.
attribute_tissue_similarity_matix.head()

Unnamed: 0,Adipose - Subcutaneous,Adipose - Visceral (Omentum),Adrenal Gland,Artery - Aorta,Artery - Coronary,Artery - Tibial,Bladder,Brain - Amygdala,Brain - Anterior cingulate cortex (BA24),Brain - Caudate (basal ganglia),...,Skin - Not Sun Exposed (Suprapubic),Skin - Sun Exposed (Lower leg),Small Intestine - Terminal Ileum,Spleen,Stomach,Testis,Thyroid,Uterus,Vagina,Whole Blood
Adipose - Subcutaneous,1.0,0.725279,-0.017274,0.226766,0.398617,0.320223,0.126051,-0.313858,-0.347208,-0.327486,...,0.060968,0.10093,0.019224,0.047948,0.007818,-0.134753,0.05404,0.193362,0.153919,0.019345
Adipose - Visceral (Omentum),0.725279,1.0,0.003012,0.165445,0.383384,0.127813,0.07839,-0.253833,-0.294685,-0.267867,...,0.02847,0.014813,0.073091,0.090722,0.092253,-0.131996,0.04699,0.089363,0.18685,0.03283
Adrenal Gland,-0.017274,0.003012,1.0,-0.036711,-0.027731,-0.057023,-0.012423,-0.127956,-0.116723,-0.113745,...,-0.068419,-0.069665,-0.038594,0.009952,0.039421,-0.033669,0.043771,-0.049055,-0.114963,-0.027814
Artery - Aorta,0.226766,0.165445,-0.036711,1.0,0.769123,0.698868,0.105269,-0.241587,-0.24191,-0.244375,...,-0.062577,-0.05363,-0.105207,-0.089896,-0.024687,-0.12895,0.040528,0.268895,0.091254,-0.024696
Artery - Coronary,0.398617,0.383384,-0.027731,0.769123,1.0,0.681045,0.142582,-0.254034,-0.286044,-0.270421,...,-0.082065,-0.070008,-0.025464,-0.013802,0.045759,-0.159656,0.039656,0.239062,0.154836,-0.050375


## Save Attribute Similarity Matrix

In [104]:
# Write the attribute (tissue) similarity matrix as a gzip-compressed TSV.
# The suffix is .gz because the payload is gzip, not zip: with a .zip name,
# tools that infer the compression from the extension cannot open the file.
date_tag = datetime.date.today().strftime('%Y_%m')  # year and month, e.g. '2017_07'
filename = '~/./Documents/Harmonizome/GTEx_Python_Script/Output/gtex_tissue_attribute_similarity_matix_%s.tsv.gz' % date_tag
attribute_tissue_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene List

In [106]:
# Build the gene list from the normalized matrix; the preview that follows has
# GeneSym and GeneID columns.
# NOTE(review): where the GeneID values come from is not visible here - see
# my_functions.createGeneList.
gene_tissue_list = mf.createGeneList(normalized_tissue_matrix)

Progeres: 100%  25563 Out of 25563   

In [107]:
# Preview the gene list.
gene_tissue_list.head()

Unnamed: 0,GeneSym,GeneID
0,A1BG,1
1,A1BG-AS1,503538
2,A1CF,29974
3,A2M,2
4,A2M-AS1,144571


In [108]:
# Dimensions of the gene list.
gene_tissue_list.shape

(25563, 2)

### Save Gene List

In [109]:
# Write the gene list (without the DataFrame index) as a gzip-compressed TSV.
# The suffix is .gz because the payload is gzip, not zip: with a .zip name,
# tools that infer the compression from the extension cannot open the file.
date_tag = datetime.date.today().strftime('%Y_%m')  # year and month, e.g. '2017_07'
filename = '~/./Documents/Harmonizome/GTEx_Python_Script/Output/gtex_tissue_gene_list_%s.tsv.gz' % date_tag
gene_tissue_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Attribute List 

In [111]:
# Build the attribute list from the normalized matrix; the preview that follows
# has a single Attributes column holding tissue names.
attribute_tissue_list = mf.createAttributeList(normalized_tissue_matrix)

In [112]:
# Preview the attribute list.
attribute_tissue_list.head()

Unnamed: 0,Attributes
0,Adipose - Subcutaneous
1,Adipose - Visceral (Omentum)
2,Adrenal Gland
3,Artery - Aorta
4,Artery - Coronary


In [113]:
# Dimensions of the attribute list.
attribute_tissue_list.shape

(53, 1)

### Save Attribute List

In [114]:
# Write the attribute list (without the DataFrame index) as a gzip-compressed TSV.
# The suffix is .gz because the payload is gzip, not zip: with a .zip name,
# tools that infer the compression from the extension cannot open the file.
date_tag = datetime.date.today().strftime('%Y_%m')  # year and month, e.g. '2017_07'
filename = '~/./Documents/Harmonizome/GTEx_Python_Script/Output/gtex_tissue_attribute_list_%s.tsv.gz' % date_tag
attribute_tissue_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Gene-Attribute Edge List

In [116]:
# Build the gene-attribute edge list from the tertiary matrix and the gene list.
# The preview that follows has Attribute, Gene, GeneID and Weight columns, and
# rows with Weight 0.0 are included.
gene_tissue_attribute_edge_list = mf.createGeneAttributeEdgeList(tertiary_tissue_matrix, gene_tissue_list)

Progeres: 100%  53 Out of 53   

In [117]:
# Preview the edge list.
gene_tissue_attribute_edge_list.head()

Unnamed: 0,Attribute,Gene,GeneID,Weight
0,Adipose - Subcutaneous,A1BG,1,0.0
1,Adipose - Subcutaneous,A1BG-AS1,503538,0.0
2,Adipose - Subcutaneous,A1CF,29974,0.0
3,Adipose - Subcutaneous,A2M,2,0.0
4,Adipose - Subcutaneous,A2M-AS1,144571,0.0


In [118]:
# Dimensions of the full edge list, zero-weight rows included.
gene_tissue_attribute_edge_list.shape

(1354839, 4)

### Get Number of (Statistically Relevant) Gene-Attribute Associations

In [119]:
# Keep only the rows whose Weight is non-zero and show the resulting shape;
# its first element is the number of such gene-attribute pairs.
has_weight = gene_tissue_attribute_edge_list['Weight'].ne(0)
gene_tissue_attribute_edge_list.loc[has_weight].shape

(270989, 4)

### Save Gene-Attribute Edge List

In [120]:
# Write the gene-attribute edge list (without the DataFrame index) as a
# gzip-compressed TSV. The suffix is .gz because the payload is gzip, not zip:
# with a .zip name, tools that infer compression from the extension cannot open it.
date_tag = datetime.date.today().strftime('%Y_%m')  # year and month, e.g. '2017_07'
filename = '~/./Documents/Harmonizome/GTEx_Python_Script/Output/gtex_tissue_gene_attribute_edge_list_%s.tsv.gz' % date_tag
gene_tissue_attribute_edge_list.to_csv(filename, sep='\t', index=False, compression='gzip')