# The Cancer Genome Atlas (TCGA)

Author: Moshe Silverstein <br/>
Date: 7-17 <br/>
Data Source: https://cancergenome.nih.gov/

In [1]:
import sys, datetime, os
import numpy as np
import pandas as pd
import importlib
import my_functions as mf
%matplotlib inline

# Functions

In [2]:
def getGenes(inputDF):
    """Relabel the rows of ``inputDF`` with one field of their '|'-delimited labels.

    Every row label is split on '|' and replaced by the field at position 5
    (the sixth field). The frame is modified in place and nothing is returned.
    The resulting index carries no name.

    Parameters
    ----------
    inputDF : pandas.DataFrame
        Frame whose row labels are strings with at least six '|'-separated
        fields.
        # NOTE(review): the sixth field is presumably the gene symbol (see the
        # calling cell's comment) - confirm against the input file format.

    Raises
    ------
    IndexError
        If a row label has fewer than six '|'-separated fields.
    """
    # Rewrite the index directly. The previous reset_index/rename/set_index
    # round trip had no lasting effect (the index was overwritten right after)
    # and raised KeyError whenever the index had a name other than 'index'.
    inputDF.index = [label.split('|')[5] for label in inputDF.index]

In [3]:
def _sampleLabel(prefix, value):
    """Return ``prefix + value`` for a string value, else ``prefix + 'NA'`` (e.g. for NaN)."""
    return prefix + value if isinstance(value, str) else prefix + 'NA'


def getSampleData(inputDF, metaDF, aliquotDF=None):
    """Replace each column label of ``inputDF`` with a descriptive 5-tuple.

    For every column the aliquot barcode is looked up in ``aliquotDF``, and its
    first three '-'-separated fields form the donor ID (for example
    'TCGA-CS-6188' from 'TCGA-CS-6188-01A-11R-1896-07'). The new column label is

        ('Donor:<ID>', 'Gender:<..>', 'Ethnicity:<..>', 'Tissue:<..>', 'Tumor:<..>')

    Gender, ethnicity and tissue come from ``metaDF``; any of them that is
    missing (non-string, e.g. NaN), or all of them when the donor is not in
    ``metaDF.index``, is reported as 'NA'. The frame is modified in place and
    nothing is returned. Progress is written to stdout.

    Parameters
    ----------
    inputDF : pandas.DataFrame
        Frame whose column labels are keys into ``aliquotDF.index``.
    metaDF : pandas.DataFrame
        Per-donor table indexed by donor ID, with the columns 'gender',
        'ethnicity' and 'tumor_tissue_site'.
    aliquotDF : pandas.DataFrame, optional
        Per-sample table with the columns 'a_AliquotBarcode' and 'b_disease'.
        When omitted, the notebook-level ``meta`` frame is used, which is what
        this function always read before the parameter existed.
    """
    if aliquotDF is None:
        # Backward compatibility: existing callers rely on the global `meta`.
        aliquotDF = meta

    total = len(inputDF.columns)
    labels = []

    for i, col in enumerate(inputDF.columns):

        progressPercent = ((i+1)/total)*100

        sys.stdout.write("Progress: %d%%  %d Out of %d   \r" % (progressPercent, (i+1), total))
        sys.stdout.flush()

        # .loc replaces the removed .ix indexer; both lookups here are label based.
        ID = '-'.join(aliquotDF.loc[col, 'a_AliquotBarcode'].split('-')[0:3])

        donor = 'Donor:'+ID

        if ID in metaDF.index:
            gender = _sampleLabel('Gender:', metaDF.loc[ID, 'gender'])
            ethnicity = _sampleLabel('Ethnicity:', metaDF.loc[ID, 'ethnicity'])
            # A missing tissue used to overwrite `ethnicity` with 'Tissue:NA'
            # and leave `tissue` unset; each field is now resolved on its own.
            tissue = _sampleLabel('Tissue:', metaDF.loc[ID, 'tumor_tissue_site'])
        else:
            gender = 'Gender:NA'
            ethnicity = 'Ethnicity:NA'
            tissue = 'Tissue:NA'
        tumor = 'Tumor:'+aliquotDF.loc[col, 'b_disease']

        labels.append((donor, gender, ethnicity, tissue, tumor))

    inputDF.columns = labels

In [4]:
def dropZeroExpressGenes(inputDF, minFraction=0.05):
    """Drop rows that are non-zero in fewer than ``minFraction`` of the columns.

    Zeros are temporarily turned into NaN so that ``dropna`` can count the
    non-zero entries per row; the surviving rows get their NaNs turned back
    into 0. The frame is modified in place and nothing is returned.

    Note that NaN values already present in ``inputDF`` are counted like zeros
    and come back as 0.

    Parameters
    ----------
    inputDF : pandas.DataFrame
        Numeric frame, one row per gene and one column per sample.
    minFraction : float, optional
        Minimum fraction of columns in which a row must be non-zero to be kept.
        The default of 0.05 reproduces the previous hard-coded behavior.
    """
    # dropna documents `thresh` as an int. Because the per-row count is an
    # integer, rounding the cut-off up keeps exactly the same rows as the
    # fractional value would.
    minExpressed = int(np.ceil(minFraction * inputDF.shape[1]))

    inputDF.replace(0, np.nan, inplace=True)

    inputDF.dropna(thresh=minExpressed, axis=0, inplace=True)

    inputDF.replace(np.nan, 0, inplace=True)

# Load MetaData

In [5]:
# Per-sample metadata table. Column 9 of the CSV becomes the row index; the
# head() output shows that column is a_CGHubAnalysisID. getSampleData later
# looks up 'a_AliquotBarcode' and 'b_disease' in this frame by that ID.
meta = pd.read_csv('Input/TCGA_Metadata.csv', index_col=9)

In [6]:
meta.head()

Unnamed: 0_level_0,a_AliquotBarcode,b_disease,a_objectSizeBytes,b_uploaded,b_state,b_library_type,b_center,b_assembly,a_GCSobject
a_CGHubAnalysisID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
3a8e6a74-137f-468f-8987-fa0acdde2836,TCGA-CS-6188-01A-11R-1896-07,LGG,6354301047,8/14/2013,Live,RNA-Seq,UNC-LCCC,unaligned,gs://5aa919de-0aa0-43ec-9ec3-288481102b6d/tcga...
c63d44f1-c9ad-4391-b1aa-0feb3713441b,TCGA-EO-A22Y-01A-11R-A180-07,UCEC,5497637066,8/29/2013,Live,RNA-Seq,UNC-LCCC,unaligned,gs://5aa919de-0aa0-43ec-9ec3-288481102b6d/tcga...
21912089-1e42-4bcc-9ad9-fe9a9b88fb09,TCGA-94-A5I4-01A-11R-A26W-07,LUSC,4600317850,8/29/2013,Live,RNA-Seq,UNC-LCCC,unaligned,gs://5aa919de-0aa0-43ec-9ec3-288481102b6d/tcga...
6b86e544-0372-434a-9f28-6fa4075dd228,TCGA-N5-A4RV-01A-21R-A28V-07,UCS,4964387768,8/30/2013,Live,RNA-Seq,UNC-LCCC,unaligned,gs://5aa919de-0aa0-43ec-9ec3-288481102b6d/tcga...
30518eb4-2783-4988-bcd4-36a1a8bb4dfa,TCGA-BK-A56F-01A-32R-A27V-07,UCEC,5668623460,8/31/2013,Live,RNA-Seq,UNC-LCCC,unaligned,gs://5aa919de-0aa0-43ec-9ec3-288481102b6d/tcga...


In [7]:
meta.shape

(11373, 9)

# Load Data (mapping to approved gene symbols and merging duplicate genes)

In [8]:
# List the expression files once so the progress total and the loop always agree.
tsvFiles = [filename for filename in os.listdir('Input') if '.tsv' in filename]
count = len(tsvFiles)

# Per-file frames are collected here and joined in a single concat at the end;
# concatenating inside the loop re-copies the growing matrix on every file.
frames = []

for i, filename in enumerate(tsvFiles, start=1):
    progressPercent = ((i)/count)*100

    sys.stdout.write("Progress: %d%%  %d Out of %d   \r" % (progressPercent, (i), count))
    sys.stdout.flush()

    temp = pd.read_csv('Input/'+filename, sep='\t', index_col=0)

    # Set Index To Only Show Gene Symbols
    getGenes(temp)

    # Map Gene Symbols To Up-to-date Approved Gene Symbols
    mf.mapgenesymbols(temp)

    # Merge Duplicate Genes By Rows
    temp = mf.merge(temp, 'row', 'mean')

    frames.append(temp)

# Column-wise join of all files; an empty frame is kept when no file matched,
# because pd.concat raises on an empty list.
matrix = pd.concat(frames, axis=1) if frames else pd.DataFrame()
        

Progeres: 99%  198691 Out of 199169    

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.


Progeres: 99%  199074 Out of 199169    

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.


Progeres: 99%  198710 Out of 199169    

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.


In [9]:
matrix.head()

Unnamed: 0_level_0,ca6d9c83-f6bc-474a-a87a-04a8d64f6752,e6f11961-dfda-4c6e-b051-54b27ad9d0de,25de321c-8ad7-4882-8c04-056649e9b4e5,80653994-a361-4e45-b044-502ce874a9d8,8c0b0a9b-6066-4bc3-8f09-1e0454e8b8a3,ec870197-fd7c-405b-bcdf-03c309ed4274,f33c9ee1-d32c-4606-b264-99bcd01e2059,0009f27d-a053-49b0-a030-ba9c1469224f,9b75d36c-9d68-45c1-8a47-bde800bf287e,8e4b078b-e6f7-4424-95fd-51a550203835,...,590edf46-250f-45e5-9080-a95b0d85b47f,5189ff30-67eb-407e-b481-9426eca96908,50cc07f8-cd48-4d6c-9a0f-5c777cf32366,7fd637fc-6d56-4de1-84b4-dd0460c95edd,cfea40ed-d73b-4052-84a5-cf228f9af854,4b1d4b71-b78f-4c33-8dc9-7120894f5b9e,6b894228-ed7f-4907-ad92-a95cb3293a26,fceac077-5a7b-4bdf-a7fa-f937342c85d6,b863da66-25c4-43f7-bd2a-4737095c8fc9,dba72b72-e7ab-4971-8e01-3b130d235924
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,0.514213,1.307944,1.30208,0.38,0.866746,4.858427,11.294831,29.502162,39.405359,3.334072,...,5.456438,2.607938,6.164272,9.467584,4.93395,2.28126,3.737174,7.301709,9.044614,5.810413
A1BG-AS1,0.081734,0.098419,0.090536,0.064848,0.105425,0.590316,0.56671,2.382888,2.075218,0.579593,...,0.865216,0.353622,0.467968,0.596158,0.665443,0.13177,0.94774,0.913018,1.15243,0.813726
A1CF,0.002684,0.001379,0.001749,0.007381,0.002376,0.012596,0.000682,0.002707,0.00028,0.070824,...,0.001318,0.000529,0.000238,0.001055,0.000797,0.0,0.000952,0.0,0.002568,0.000529
A2M,5.012012,7.384641,3.538513,15.682749,6.464579,11.328956,8.117481,5.609811,4.62459,313.991266,...,75.680612,52.973476,21.599688,23.991228,72.393899,36.401848,93.16846,13.819997,104.625704,134.418843
A2M-AS1,1.51488,0.996223,1.65215,1.24562,1.67223,0.238829,0.230361,0.213854,0.217506,3.77877,...,2.42148,5.12718,4.45664,2.0373,1.75616,1.36797,2.37243,2.40566,2.72236,1.57268


In [10]:
matrix.shape

(35402, 11373)

# Load Sample MetaData

In [11]:
# List the per-study metadata files once so the progress total and the loop agree.
metadataFiles = [filename for filename in os.listdir('Input') if 'metadata_' in filename]
count = len(metadataFiles)

# Per-file frames are collected here and stacked in a single concat at the end;
# concatenating inside the loop re-copies the growing table on every file.
frames = []

for i, filename in enumerate(metadataFiles, start=1):
    progressPercent = ((i)/count)*100

    sys.stdout.write("Progress: %d%%  %d Out of %d   \r" % (progressPercent, (i), count))
    sys.stdout.flush()

    temp = pd.read_csv('Input/'+filename, sep='\t', index_col=0)

    frames.append(temp)

# Row-wise stack of all files; an empty frame is kept when no file matched,
# because pd.concat raises on an empty list.
sample_meta = pd.concat(frames) if frames else pd.DataFrame()

Progress: 100%  33 Out of 33   

In [12]:
sample_meta.set_index('ParticipantBarcode', inplace=True)

In [13]:
sample_meta.head()

Unnamed: 0_level_0,Study,Project,ParticipantUUID,TSSCode,age_at_initial_pathologic_diagnosis,anatomic_neoplasm_subdivision,batch_number,bcr,clinical_M,clinical_N,...,BMI,age_began_smoking_in_years,h_pylori_infection,other_dx,other_malignancy_anatomic_site,other_malignancy_histological_type,other_malignancy_malignancy_type,stopped_smoking_year,venous_invasion,year_of_tobacco_smoking_onset
ParticipantBarcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TCGA-OR-A5K3,ACC,TCGA,C72B973B-4719-46AB-AC99-05D14088D3BC,OR,53.0,,304.0,Nationwide Children's Hospital,M0,,...,,,,No,,,,,,
TCGA-OR-A5KV,ACC,TCGA,202F7A77-9E5A-4F12-8FFE-78AE5527A746,OR,17.0,,304.0,Nationwide Children's Hospital,M0,,...,,,,No,,,,,,
TCGA-OR-A5L5,ACC,TCGA,2F04E760-8A64-49F2-A1D6-3D390604745C,OR,77.0,,304.0,Nationwide Children's Hospital,M0,,...,,,,"Yes, History of Synchronous/Bilateral Malignancy",Ovary,"Other, specify",Synchronous Malignancy,,,
TCGA-OR-A5KX,ACC,TCGA,33C02ECE-911B-4F45-B410-9DF07E9B189E,OR,25.0,,304.0,Nationwide Children's Hospital,M0,,...,,,,No,,,,,,
TCGA-OR-A5JB,ACC,TCGA,D63F2F1E-1970-445F-907B-23CF4D9EFD83,OR,52.0,,304.0,Nationwide Children's Hospital,M1,,...,,,,No,,,,,,


In [14]:
sample_meta.shape

(10960, 69)

## Map Sample Metadata to Sample ID

In [15]:
getSampleData(matrix, sample_meta)

Progress: 100%  11373 Out of 11373   

In [16]:
matrix.head()

Unnamed: 0_level_0,"(Donor:TCGA-OR-A5JG, Gender:MALE, Ethnicity:NA, Tissue:Adrenal, Tumor:ACC)","(Donor:TCGA-OR-A5LG, Gender:MALE, Ethnicity:NOT HISPANIC OR LATINO, Tissue:Adrenal, Tumor:ACC)","(Donor:TCGA-OR-A5JD, Gender:FEMALE, Ethnicity:NA, Tissue:Adrenal, Tumor:ACC)","(Donor:TCGA-OR-A5LH, Gender:FEMALE, Ethnicity:NOT HISPANIC OR LATINO, Tissue:Adrenal, Tumor:ACC)","(Donor:TCGA-OR-A5KY, Gender:FEMALE, Ethnicity:NOT HISPANIC OR LATINO, Tissue:Adrenal, Tumor:ACC)","(Donor:TCGA-PK-A5HB, Gender:MALE, Ethnicity:NA, Tissue:Adrenal, Tumor:ACC)","(Donor:TCGA-PK-A5HA, Gender:MALE, Ethnicity:NA, Tissue:Adrenal, Tumor:ACC)","(Donor:TCGA-OR-A5KX, Gender:FEMALE, Ethnicity:NOT HISPANIC OR LATINO, Tissue:Adrenal, Tumor:ACC)","(Donor:TCGA-PK-A5H8, Gender:MALE, Ethnicity:NOT HISPANIC OR LATINO, Tissue:Adrenal, Tumor:ACC)","(Donor:TCGA-PK-A5H9, Gender:FEMALE, Ethnicity:NOT HISPANIC OR LATINO, Tissue:Adrenal, Tumor:ACC)",...,"(Donor:TCGA-V4-A9EX, Gender:FEMALE, Ethnicity:NOT HISPANIC OR LATINO, Tissue:Choroid, Tumor:UVM)","(Donor:TCGA-V4-A9EO, Gender:MALE, Ethnicity:NOT HISPANIC OR LATINO, Tissue:Choroid, Tumor:UVM)","(Donor:TCGA-VD-A8KO, Gender:MALE, Ethnicity:NA, Tissue:Choroid, Tumor:UVM)","(Donor:TCGA-YZ-A983, Gender:FEMALE, Ethnicity:NOT HISPANIC OR LATINO, Tissue:Choroid, Tumor:UVM)","(Donor:TCGA-V4-A9EW, Gender:FEMALE, Ethnicity:NOT HISPANIC OR LATINO, Tissue:Choroid, Tumor:UVM)","(Donor:TCGA-V4-A9EQ, Gender:MALE, Ethnicity:NOT HISPANIC OR LATINO, Tissue:Choroid, Tumor:UVM)","(Donor:TCGA-V4-A9EV, Gender:FEMALE, Ethnicity:NOT HISPANIC OR LATINO, Tissue:Choroid, Tumor:UVM)","(Donor:TCGA-V4-A9EY, Gender:FEMALE, Ethnicity:NOT HISPANIC OR LATINO, Tissue:Choroid, Tumor:UVM)","(Donor:TCGA-V4-A9EU, Gender:FEMALE, Ethnicity:NOT HISPANIC OR LATINO, Tissue:Choroid, Tumor:UVM)","(Donor:TCGA-V4-A9EM, Gender:MALE, Ethnicity:NOT HISPANIC OR LATINO, Tissue:Choroid, Tumor:UVM)"
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,0.514213,1.307944,1.30208,0.38,0.866746,4.858427,11.294831,29.502162,39.405359,3.334072,...,5.456438,2.607938,6.164272,9.467584,4.93395,2.28126,3.737174,7.301709,9.044614,5.810413
A1BG-AS1,0.081734,0.098419,0.090536,0.064848,0.105425,0.590316,0.56671,2.382888,2.075218,0.579593,...,0.865216,0.353622,0.467968,0.596158,0.665443,0.13177,0.94774,0.913018,1.15243,0.813726
A1CF,0.002684,0.001379,0.001749,0.007381,0.002376,0.012596,0.000682,0.002707,0.00028,0.070824,...,0.001318,0.000529,0.000238,0.001055,0.000797,0.0,0.000952,0.0,0.002568,0.000529
A2M,5.012012,7.384641,3.538513,15.682749,6.464579,11.328956,8.117481,5.609811,4.62459,313.991266,...,75.680612,52.973476,21.599688,23.991228,72.393899,36.401848,93.16846,13.819997,104.625704,134.418843
A2M-AS1,1.51488,0.996223,1.65215,1.24562,1.67223,0.238829,0.230361,0.213854,0.217506,3.77877,...,2.42148,5.12718,4.45664,2.0373,1.75616,1.36797,2.37243,2.40566,2.72236,1.57268


## Merge Like Columns (by taking the mean)

In [17]:
matrix = mf.merge(matrix, 'column', 'mean')

In [18]:
matrix.shape

(35402, 10420)

## Drop Any Genes That Have Zero Expression In More Than 95% Of The Samples

In [19]:
dropZeroExpressGenes(matrix)

In [20]:
matrix.shape

(31116, 10420)

# Merge Duplicate genes (by taking the mean)

In [21]:
matrix = mf.merge(matrix, 'row', 'mean')

In [22]:
matrix.shape

(31116, 10420)

## Save Unfiltered Matrix To File

In [23]:
# Write the current `matrix` as a tab-separated file, stamped with the current
# year and month (YYYY_MM, taken from the first 7 characters of today's ISO date).
# NOTE(review): the name ends in '.zip' but the content is gzip-compressed;
# readers that infer compression from the extension will misread it - confirm
# what downstream consumers expect before changing either side.
filename = '~/./Documents/Harmonizome/TCGA/Output/tcga_matrix_unfilltered_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
matrix.to_csv(filename, sep='\t', compression='gzip')

# Normalize Matrix (Quantile Normalize the matrix for the columns)

In [24]:
normalized_matrix = mf.quantileNormalize(matrix)

Step 2/2 progress: 100%  10420 Out of 10420   

In [25]:
normalized_matrix.head()

Unnamed: 0_level_0,"(Donor:TCGA-02-0047, Gender:MALE, Ethnicity:NOT HISPANIC OR LATINO, Tissue:Brain, Tumor:GBM)","(Donor:TCGA-02-0055, Gender:FEMALE, Ethnicity:NOT HISPANIC OR LATINO, Tissue:Brain, Tumor:GBM)","(Donor:TCGA-02-2483, Gender:MALE, Ethnicity:NOT HISPANIC OR LATINO, Tissue:Brain, Tumor:GBM)","(Donor:TCGA-02-2485, Gender:MALE, Ethnicity:NOT HISPANIC OR LATINO, Tissue:Brain, Tumor:GBM)","(Donor:TCGA-02-2486, Gender:MALE, Ethnicity:NOT HISPANIC OR LATINO, Tissue:Brain, Tumor:GBM)","(Donor:TCGA-04-1331, Gender:FEMALE, Ethnicity:NOT HISPANIC OR LATINO, Tissue:Ovary, Tumor:OV)","(Donor:TCGA-04-1332, Gender:FEMALE, Ethnicity:NOT HISPANIC OR LATINO, Tissue:Ovary, Tumor:OV)","(Donor:TCGA-04-1337, Gender:FEMALE, Ethnicity:NOT HISPANIC OR LATINO, Tissue:Ovary, Tumor:OV)","(Donor:TCGA-04-1338, Gender:FEMALE, Ethnicity:NOT HISPANIC OR LATINO, Tissue:Ovary, Tumor:OV)","(Donor:TCGA-04-1341, Gender:FEMALE, Ethnicity:NOT HISPANIC OR LATINO, Tissue:Ovary, Tumor:OV)",...,"(Donor:TCGA-ZP-A9D4, Gender:FEMALE, Ethnicity:NOT HISPANIC OR LATINO, Tissue:Liver, Tumor:LIHC)","(Donor:TCGA-ZQ-A9CR, Gender:FEMALE, Ethnicity:NOT HISPANIC OR LATINO, Tissue:Stomach, Tumor:STAD)","(Donor:TCGA-ZR-A9CJ, Gender:MALE, Ethnicity:NOT HISPANIC OR LATINO, Tissue:Esophagus, Tumor:ESCA)","(Donor:TCGA-ZS-A9CD, Gender:MALE, Ethnicity:NOT HISPANIC OR LATINO, Tissue:Liver, Tumor:LIHC)","(Donor:TCGA-ZS-A9CE, Gender:FEMALE, Ethnicity:NOT HISPANIC OR LATINO, Tissue:Liver, Tumor:LIHC)","(Donor:TCGA-ZS-A9CF, Gender:MALE, Ethnicity:NOT HISPANIC OR LATINO, Tissue:Liver, Tumor:LIHC)","(Donor:TCGA-ZS-A9CG, Gender:MALE, Ethnicity:NOT HISPANIC OR LATINO, Tissue:Liver, Tumor:LIHC)","(Donor:TCGA-ZT-A8OM, Gender:FEMALE, Ethnicity:NOT HISPANIC OR LATINO, Tissue:Anterior Mediastinum, Tumor:THYM)","(Donor:TCGA-ZU-A8S4, Gender:MALE, Ethnicity:NOT HISPANIC OR LATINO, Tissue:Bile duct, Tumor:CHOL)","(Donor:TCGA-ZX-AA5X, Gender:FEMALE, Ethnicity:NOT HISPANIC OR LATINO, Tissue:Cervical, Tumor:CESC)"
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,3.500122,8.551152,7.605301,1.233906,2.54207,4.755987,1.311879,2.923073,3.120143,7.731534,...,266.051601,1.664587,1.545757,367.978589,519.380496,62.137046,95.02966,36.998568,190.403966,1.848744
A1BG-AS1,0.448939,0.744276,0.647442,0.198097,0.439716,0.776454,0.079355,0.516571,0.399217,0.447006,...,0.469845,0.108267,0.313376,0.464896,0.769626,0.13914,0.287323,1.612295,0.107904,0.390266
A1CF,0.007353,0.008785,0.01161,0.0,0.014865,0.003085,0.001159,0.001148,0.010473,0.008235,...,4.586558,0.134291,0.113927,3.874013,5.838722,5.366094,6.666369,0.0,2.034771,0.015038
A2M,42.577243,35.13452,20.053129,7.621996,33.999556,9.8724,12.457361,5.661991,4.915035,2.731184,...,6.772024,22.397889,25.474237,50.460718,14.484341,23.892089,8.559427,29.810896,24.612049,8.723992
A2M-AS1,3.325177,1.961738,0.538191,0.303942,1.327171,0.979255,2.27235,0.626972,1.20156,1.603376,...,1.828706,1.424396,0.495803,2.168734,1.351711,1.257526,1.583657,1.077171,1.133465,0.407444


# Normalize Matrix (z-score the rows)

In [26]:
mf.zscore(normalized_matrix, 'row')

Progress: 100%  31116 Out of 31116   

In [27]:
normalized_matrix.head()

Unnamed: 0_level_0,"(Donor:TCGA-02-0047, Gender:MALE, Ethnicity:NOT HISPANIC OR LATINO, Tissue:Brain, Tumor:GBM)","(Donor:TCGA-02-0055, Gender:FEMALE, Ethnicity:NOT HISPANIC OR LATINO, Tissue:Brain, Tumor:GBM)","(Donor:TCGA-02-2483, Gender:MALE, Ethnicity:NOT HISPANIC OR LATINO, Tissue:Brain, Tumor:GBM)","(Donor:TCGA-02-2485, Gender:MALE, Ethnicity:NOT HISPANIC OR LATINO, Tissue:Brain, Tumor:GBM)","(Donor:TCGA-02-2486, Gender:MALE, Ethnicity:NOT HISPANIC OR LATINO, Tissue:Brain, Tumor:GBM)","(Donor:TCGA-04-1331, Gender:FEMALE, Ethnicity:NOT HISPANIC OR LATINO, Tissue:Ovary, Tumor:OV)","(Donor:TCGA-04-1332, Gender:FEMALE, Ethnicity:NOT HISPANIC OR LATINO, Tissue:Ovary, Tumor:OV)","(Donor:TCGA-04-1337, Gender:FEMALE, Ethnicity:NOT HISPANIC OR LATINO, Tissue:Ovary, Tumor:OV)","(Donor:TCGA-04-1338, Gender:FEMALE, Ethnicity:NOT HISPANIC OR LATINO, Tissue:Ovary, Tumor:OV)","(Donor:TCGA-04-1341, Gender:FEMALE, Ethnicity:NOT HISPANIC OR LATINO, Tissue:Ovary, Tumor:OV)",...,"(Donor:TCGA-ZP-A9D4, Gender:FEMALE, Ethnicity:NOT HISPANIC OR LATINO, Tissue:Liver, Tumor:LIHC)","(Donor:TCGA-ZQ-A9CR, Gender:FEMALE, Ethnicity:NOT HISPANIC OR LATINO, Tissue:Stomach, Tumor:STAD)","(Donor:TCGA-ZR-A9CJ, Gender:MALE, Ethnicity:NOT HISPANIC OR LATINO, Tissue:Esophagus, Tumor:ESCA)","(Donor:TCGA-ZS-A9CD, Gender:MALE, Ethnicity:NOT HISPANIC OR LATINO, Tissue:Liver, Tumor:LIHC)","(Donor:TCGA-ZS-A9CE, Gender:FEMALE, Ethnicity:NOT HISPANIC OR LATINO, Tissue:Liver, Tumor:LIHC)","(Donor:TCGA-ZS-A9CF, Gender:MALE, Ethnicity:NOT HISPANIC OR LATINO, Tissue:Liver, Tumor:LIHC)","(Donor:TCGA-ZS-A9CG, Gender:MALE, Ethnicity:NOT HISPANIC OR LATINO, Tissue:Liver, Tumor:LIHC)","(Donor:TCGA-ZT-A8OM, Gender:FEMALE, Ethnicity:NOT HISPANIC OR LATINO, Tissue:Anterior Mediastinum, Tumor:THYM)","(Donor:TCGA-ZU-A8S4, Gender:MALE, Ethnicity:NOT HISPANIC OR LATINO, Tissue:Bile duct, Tumor:CHOL)","(Donor:TCGA-ZX-AA5X, Gender:FEMALE, Ethnicity:NOT HISPANIC OR LATINO, Tissue:Cervical, Tumor:CESC)"
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,-0.1477,-0.059286,-0.075842,-0.187369,-0.16447,-0.125717,-0.186004,-0.157801,-0.154351,-0.073632,...,4.448064,-0.17983,-0.18191,6.232219,8.882395,0.878695,1.454455,0.438665,3.12391,-0.176606
A1BG-AS1,-0.01504,0.6949,0.462127,-0.618019,-0.037209,0.77225,-0.903456,0.147536,-0.134561,-0.019686,...,0.035215,-0.833956,-0.340908,0.023318,0.755837,-0.759742,-0.403536,2.781467,-0.834828,-0.156078
A1CF,-0.272337,-0.270865,-0.267962,-0.279893,-0.264617,-0.276723,-0.278702,-0.278713,-0.26913,-0.27143,...,4.433518,-0.141888,-0.162816,3.701266,5.720314,5.234614,6.570852,-0.279893,1.811154,-0.264439
A2M,0.861379,0.605322,0.086467,-0.34121,0.566275,-0.263788,-0.174856,-0.408641,-0.434339,-0.509472,...,-0.370452,0.167135,0.272973,1.132599,-0.10512,0.218541,-0.308959,0.42217,0.24331,-0.303297
A2M-AS1,1.696066,0.622286,-0.498832,-0.683316,0.122531,-0.151471,0.866909,-0.428913,0.023606,0.340057,...,0.517516,0.199101,-0.532215,0.785306,0.141857,0.067682,0.324527,-0.074357,-0.030023,-0.601803


## Save Filtered Matrix

In [28]:
# Write `normalized_matrix` as a tab-separated file, stamped with the current
# year and month (YYYY_MM, taken from the first 7 characters of today's ISO date).
# NOTE(review): the name ends in '.zip' but the content is gzip-compressed;
# readers that infer compression from the extension will misread it - confirm
# what downstream consumers expect before changing either side.
filename = '~/./Documents/Harmonizome/TCGA/Output/tcga_matrix_filltered_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
normalized_matrix.to_csv(filename, sep='\t', compression='gzip')

# Create Tertiary Matrix

In [29]:
tertiary_matrix = mf.createTertiaryMarix(normalized_matrix)

Progeres: 100%  10420 Out of 10420   

In [30]:
tertiary_matrix.head()

Unnamed: 0_level_0,"(Donor:TCGA-02-0047, Gender:MALE, Ethnicity:NOT HISPANIC OR LATINO, Tissue:Brain, Tumor:GBM)","(Donor:TCGA-02-0055, Gender:FEMALE, Ethnicity:NOT HISPANIC OR LATINO, Tissue:Brain, Tumor:GBM)","(Donor:TCGA-02-2483, Gender:MALE, Ethnicity:NOT HISPANIC OR LATINO, Tissue:Brain, Tumor:GBM)","(Donor:TCGA-02-2485, Gender:MALE, Ethnicity:NOT HISPANIC OR LATINO, Tissue:Brain, Tumor:GBM)","(Donor:TCGA-02-2486, Gender:MALE, Ethnicity:NOT HISPANIC OR LATINO, Tissue:Brain, Tumor:GBM)","(Donor:TCGA-04-1331, Gender:FEMALE, Ethnicity:NOT HISPANIC OR LATINO, Tissue:Ovary, Tumor:OV)","(Donor:TCGA-04-1332, Gender:FEMALE, Ethnicity:NOT HISPANIC OR LATINO, Tissue:Ovary, Tumor:OV)","(Donor:TCGA-04-1337, Gender:FEMALE, Ethnicity:NOT HISPANIC OR LATINO, Tissue:Ovary, Tumor:OV)","(Donor:TCGA-04-1338, Gender:FEMALE, Ethnicity:NOT HISPANIC OR LATINO, Tissue:Ovary, Tumor:OV)","(Donor:TCGA-04-1341, Gender:FEMALE, Ethnicity:NOT HISPANIC OR LATINO, Tissue:Ovary, Tumor:OV)",...,"(Donor:TCGA-ZP-A9D4, Gender:FEMALE, Ethnicity:NOT HISPANIC OR LATINO, Tissue:Liver, Tumor:LIHC)","(Donor:TCGA-ZQ-A9CR, Gender:FEMALE, Ethnicity:NOT HISPANIC OR LATINO, Tissue:Stomach, Tumor:STAD)","(Donor:TCGA-ZR-A9CJ, Gender:MALE, Ethnicity:NOT HISPANIC OR LATINO, Tissue:Esophagus, Tumor:ESCA)","(Donor:TCGA-ZS-A9CD, Gender:MALE, Ethnicity:NOT HISPANIC OR LATINO, Tissue:Liver, Tumor:LIHC)","(Donor:TCGA-ZS-A9CE, Gender:FEMALE, Ethnicity:NOT HISPANIC OR LATINO, Tissue:Liver, Tumor:LIHC)","(Donor:TCGA-ZS-A9CF, Gender:MALE, Ethnicity:NOT HISPANIC OR LATINO, Tissue:Liver, Tumor:LIHC)","(Donor:TCGA-ZS-A9CG, Gender:MALE, Ethnicity:NOT HISPANIC OR LATINO, Tissue:Liver, Tumor:LIHC)","(Donor:TCGA-ZT-A8OM, Gender:FEMALE, Ethnicity:NOT HISPANIC OR LATINO, Tissue:Anterior Mediastinum, Tumor:THYM)","(Donor:TCGA-ZU-A8S4, Gender:MALE, Ethnicity:NOT HISPANIC OR LATINO, Tissue:Bile duct, Tumor:CHOL)","(Donor:TCGA-ZX-AA5X, Gender:FEMALE, Ethnicity:NOT HISPANIC OR LATINO, Tissue:Cervical, Tumor:CESC)"
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0
A1BG-AS1,0.0,1.0,0.0,-1.0,0.0,0.0,-1.0,0.0,0.0,0.0,...,0.0,-1.0,0.0,0.0,0.0,-1.0,0.0,1.0,-1.0,0.0
A1CF,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0
A2M,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
A2M-AS1,1.0,1.0,0.0,-1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,-1.0


## Save Tertiary Matrix

In [31]:
# Write `tertiary_matrix` as a tab-separated file, stamped with the current
# year and month (YYYY_MM, taken from the first 7 characters of today's ISO date).
# NOTE(review): the name ends in '.zip' but the content is gzip-compressed;
# readers that infer compression from the extension will misread it - confirm
# what downstream consumers expect before changing either side.
filename = '~/./Documents/Harmonizome/TCGA/Output/tcga_tertiary_matrix_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
tertiary_matrix.to_csv(filename, sep='\t', compression='gzip')

#### Create matrix with simplified column names for libraries

In [32]:
# Collapse each (donor, gender, ethnicity, tissue, tumor) column tuple into a
# flat "<donor>_<tissue>_<tumor>" label, dropping the "Key:" prefix of each field.
lst = [
    '_'.join(field.split(':')[1] for field in (col[0], col[3], col[4]))
    for col in tertiary_matrix.columns
]

# Relabel a copy so tertiary_matrix keeps its full tuple labels.
tertiary_matrix_simple_col = tertiary_matrix.copy()
tertiary_matrix_simple_col.columns = lst

In [33]:
tertiary_matrix_simple_col.head()

Unnamed: 0_level_0,TCGA-02-0047_Brain_GBM,TCGA-02-0055_Brain_GBM,TCGA-02-2483_Brain_GBM,TCGA-02-2485_Brain_GBM,TCGA-02-2486_Brain_GBM,TCGA-04-1331_Ovary_OV,TCGA-04-1332_Ovary_OV,TCGA-04-1337_Ovary_OV,TCGA-04-1338_Ovary_OV,TCGA-04-1341_Ovary_OV,...,TCGA-ZP-A9D4_Liver_LIHC,TCGA-ZQ-A9CR_Stomach_STAD,TCGA-ZR-A9CJ_Esophagus_ESCA,TCGA-ZS-A9CD_Liver_LIHC,TCGA-ZS-A9CE_Liver_LIHC,TCGA-ZS-A9CF_Liver_LIHC,TCGA-ZS-A9CG_Liver_LIHC,TCGA-ZT-A8OM_Anterior Mediastinum_THYM,TCGA-ZU-A8S4_Bile duct_CHOL,TCGA-ZX-AA5X_Cervical_CESC
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0
A1BG-AS1,0.0,1.0,0.0,-1.0,0.0,0.0,-1.0,0.0,0.0,0.0,...,0.0,-1.0,0.0,0.0,0.0,-1.0,0.0,1.0,-1.0,0.0
A1CF,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0
A2M,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
A2M-AS1,1.0,1.0,0.0,-1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,-1.0


#### Path to output files

In [34]:
# Output directory passed to the mf.create*SetLib calls in the following cells.
# NOTE(review): this is an absolute, user-specific path, while the save cells
# write to '~/./Documents/Harmonizome/TCGA/Output/'; presumably the same
# directory - consider a single configurable output location.
path = '/Users/moshesilverstein/Documents/Harmonizome/TCGA/Output/'

# Create Up Gene Set Library

In [35]:
name = 'tcga_gene_up_set'

In [36]:
mf.createUpGeneSetLib(tertiary_matrix_simple_col, path, name, details=tertiary_matrix.columns.tolist())

Progeres: 100%  10420 Out of 10420   

# Create Down Gene Set Library

In [37]:
name = 'tcga_gene_down_set'

In [38]:
mf.createDownGeneSetLib(tertiary_matrix_simple_col, path, name, details=tertiary_matrix.columns.tolist())

Progeres: 100%  10420 Out of 10420   

# Create Up Attribute Library

In [39]:
name = 'tcga_attribute_up_set'

In [40]:
mf.createUpAttributeSetLib(tertiary_matrix_simple_col, path, name)

Progeres: 100%  31116 Out of 31116   

# Create Down Attribute Library

In [41]:
name = 'tcga_attribute_down_set'

In [42]:
mf.createDownAttributeSetLib(tertiary_matrix_simple_col, path, name)

Progeres: 100%  31116 Out of 31116   

# Create Gene Similarity Matrix

In [43]:
gene_similarity_matix = mf.createSimilarityMatrix(normalized_matrix, 'cosine')

In [44]:
gene_similarity_matix.head()

index,A1BG,A1BG-AS1,A1CF,A2M,A2M-AS1,A2ML1,A2ML1-AS1,A2ML1-AS2,A2MP1,A3GALT2,...,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11AP1,ZYG11B,ZYX,ZZEF1,ZZZ3
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,1.0,0.116493,0.694504,0.275436,0.1785,-0.038911,-0.029849,-0.004237,0.074557,-0.040985,...,-0.073043,-0.090796,0.144568,0.010248,0.014914,-0.007047,0.014514,-0.073304,-0.088321,-0.030026
A1BG-AS1,0.116493,1.0,-0.05515,0.126002,0.144157,-0.08179,-0.054484,-0.03347,0.019838,0.01981,...,-0.023675,-0.069902,-0.042921,-0.118888,0.003921,0.166333,-0.045374,0.01705,-0.001413,0.061001
A1CF,0.694504,-0.05515,1.0,0.242718,0.110688,-0.056437,-0.041388,-0.005185,0.056913,-0.068005,...,-0.047225,-0.029196,0.19441,0.035226,-0.0047,-0.027154,0.032193,-0.098956,-0.061513,-0.035481
A2M,0.275436,0.126002,0.242718,1.0,0.367169,-0.075521,-0.03635,-0.026936,0.068859,-0.02308,...,-0.169839,0.074228,0.030915,-0.070017,-0.060819,0.078346,0.100449,0.140198,-0.02319,-0.007753
A2M-AS1,0.1785,0.144157,0.110688,0.367169,1.0,-0.103743,-0.046309,0.018906,0.047931,0.018741,...,-0.172022,0.053002,-0.003202,-0.039905,-0.08375,0.107086,0.039901,0.110037,-0.033705,0.046674


## Save Gene Similarity Matrix 

In [45]:
# Write `gene_similarity_matix` as a tab-separated file, stamped with the current
# year and month (YYYY_MM, taken from the first 7 characters of today's ISO date).
# NOTE(review): the name ends in '.zip' but the content is gzip-compressed;
# readers that infer compression from the extension will misread it - confirm
# what downstream consumers expect before changing either side.
filename = '~/./Documents/Harmonizome/TCGA/Output/tcga_gene_similarity_matix_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
gene_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Attribute Similarity Matrix

#### Create matrix with simplified column names for attribute

In [46]:
# Build flat "<donor>_<tissue>_<tumor>" labels from the
# (donor, gender, ethnicity, tissue, tumor) column tuples. The labels are
# derived from normalized_matrix itself, the frame being relabelled, so they
# cannot drift out of step with its column order (they used to be read from
# tertiary_matrix).
lst = [
    col[0].split(':')[1]+'_'+col[3].split(':')[1]+'_'+col[4].split(':')[1]
    for col in normalized_matrix.columns
]

# Relabel a copy so normalized_matrix keeps its full tuple labels.
normalized_matrix_simple_col = normalized_matrix.copy()
normalized_matrix_simple_col.columns = lst

In [47]:
attribute_similarity_matix = mf.createSimilarityMatrix(normalized_matrix_simple_col.T, 'cosine')

In [48]:
attribute_similarity_matix.head()

Unnamed: 0,TCGA-02-0047_Brain_GBM,TCGA-02-0055_Brain_GBM,TCGA-02-2483_Brain_GBM,TCGA-02-2485_Brain_GBM,TCGA-02-2486_Brain_GBM,TCGA-04-1331_Ovary_OV,TCGA-04-1332_Ovary_OV,TCGA-04-1337_Ovary_OV,TCGA-04-1338_Ovary_OV,TCGA-04-1341_Ovary_OV,...,TCGA-ZP-A9D4_Liver_LIHC,TCGA-ZQ-A9CR_Stomach_STAD,TCGA-ZR-A9CJ_Esophagus_ESCA,TCGA-ZS-A9CD_Liver_LIHC,TCGA-ZS-A9CE_Liver_LIHC,TCGA-ZS-A9CF_Liver_LIHC,TCGA-ZS-A9CG_Liver_LIHC,TCGA-ZT-A8OM_Anterior Mediastinum_THYM,TCGA-ZU-A8S4_Bile duct_CHOL,TCGA-ZX-AA5X_Cervical_CESC
TCGA-02-0047_Brain_GBM,1.0,0.202787,0.330217,0.278746,0.287948,-0.023859,0.006932,0.013133,-0.049065,-0.054028,...,-0.039395,-0.060445,-0.063892,-0.071719,-0.065871,-0.035974,-0.033098,-0.10196,0.004168,-0.071647
TCGA-02-0055_Brain_GBM,0.202787,1.0,0.1914,0.126908,0.379421,-0.092156,-0.057064,-0.004409,0.004893,-0.035387,...,-0.064123,0.078561,0.039641,0.001396,-0.053643,-0.046953,-0.010863,-0.063226,0.153696,0.005395
TCGA-02-2483_Brain_GBM,0.330217,0.1914,1.0,0.32088,0.208332,0.078093,-0.014081,-0.004762,-0.0121,0.042621,...,-0.074094,-0.066737,-0.085566,-0.06423,-0.071307,-0.062375,-0.053615,0.058715,-0.03652,-0.091451
TCGA-02-2485_Brain_GBM,0.278746,0.126908,0.32088,1.0,0.217863,0.049109,-7.9e-05,-0.016325,-0.024446,-0.013045,...,-0.037563,-0.035253,-0.063266,-0.067455,-0.043763,-0.029661,-0.037368,-0.015934,-0.040071,-0.057882
TCGA-02-2486_Brain_GBM,0.287948,0.379421,0.208332,0.217863,1.0,-0.018279,-0.050648,0.011709,-0.020894,0.024314,...,-0.020858,-0.045553,-0.046858,-0.012286,-0.010172,-0.039937,0.006492,-0.010584,0.038967,-0.062181


## Save Attribute Similarity Matrix

In [49]:
# Write `attribute_similarity_matix` as a tab-separated file, stamped with the current
# year and month (YYYY_MM, taken from the first 7 characters of today's ISO date).
# NOTE(review): the name ends in '.zip' but the content is gzip-compressed;
# readers that infer compression from the extension will misread it - confirm
# what downstream consumers expect before changing either side.
filename = '~/./Documents/Harmonizome/TCGA/Output/tcga_attribute_similarity_matix_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
attribute_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene List

In [50]:
gene_list = mf.createGeneList(normalized_matrix)

Progeres: 100%  31116 Out of 31116   

In [51]:
gene_list.head()

Unnamed: 0,GeneSym,GeneID
0,A1BG,1
1,A1BG-AS1,503538
2,A1CF,29974
3,A2M,2
4,A2M-AS1,144571


In [52]:
gene_list.shape

(31116, 2)

### Save Gene List

In [53]:
# Write `gene_list` (without its row index) as a tab-separated file, stamped with
# the current year and month (YYYY_MM, from the first 7 characters of today's ISO date).
# NOTE(review): the name ends in '.zip' but the content is gzip-compressed;
# readers that infer compression from the extension will misread it - confirm
# what downstream consumers expect before changing either side.
filename = '~/./Documents/Harmonizome/TCGA/Output/tcga_gene_list_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
gene_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Attribute List 

In [54]:
attribute_list = mf.createAttributeList(normalized_matrix_simple_col)

In [55]:
attribute_list.head()

Unnamed: 0,Attributes
0,TCGA-02-0047_Brain_GBM
1,TCGA-02-0055_Brain_GBM
2,TCGA-02-2483_Brain_GBM
3,TCGA-02-2485_Brain_GBM
4,TCGA-02-2486_Brain_GBM


In [56]:
attribute_list.shape

(10420, 1)

### Save Attribute List

In [57]:
# Write `attribute_list` (without its row index) as a tab-separated file, stamped with
# the current year and month (YYYY_MM, from the first 7 characters of today's ISO date).
# NOTE(review): the name ends in '.zip' but the content is gzip-compressed;
# readers that infer compression from the extension will misread it - confirm
# what downstream consumers expect before changing either side.
filename = '~/./Documents/Harmonizome/TCGA/Output/tcga_attribute_list_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
attribute_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Gene-Attribute Edge List

In [58]:
# Output directory passed to mf.createGeneAttributeEdgeList below. This repeats
# the value already assigned to `path` earlier in the notebook.
# NOTE(review): absolute, user-specific path - consider a single configurable
# output location.
path = '/Users/moshesilverstein/Documents/Harmonizome/TCGA/Output/'

In [59]:
name = 'tcga_gene_attribute_edge_list'

In [60]:
mf.createGeneAttributeEdgeList(tertiary_matrix_simple_col, gene_list, path, name)

Progeres: 100%  10420 Out of 10420   

 The number of statisticaly relevent gene-attribute associations is: 64843697
