# Cancer Cell Line Encyclopedia (CCLE)

Author: Moshe Silverstein <br/>
Date: 7-17 <br/>
Data Source: http://software.broadinstitute.org/software/cprg/?q=node/11

In [1]:
import sys, datetime
import numpy as np
import pandas as pd
import importlib
import my_functions as mf
%matplotlib inline

In [78]:
importlib.reload(mf)

<module 'my_functions' from '/Users/moshesilverstein/Documents/Harmonizome/CCLE/my_functions.py'>

# Functions

## Load Data

In [42]:
matrix = pd.read_csv('Input/CCLE_Expression_Entrez_2012-09-29.gct', sep='\t', skiprows=2)

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
matrix.head()

Unnamed: 0,Name,Description,LN18_CENTRAL_NERVOUS_SYSTEM,769P_KIDNEY,786O_KIDNEY,CAOV3_OVARY,HEPG2_LIVER,MOLT4_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,NCIH524_LUNG,NCIH209_LUNG,...,SLR21_KIDNEY,LNZ308_CENTRAL_NERVOUS_SYSTEM,LN340_CENTRAL_NERVOUS_SYSTEM,HCC827GR5_LUNG,SLR20_KIDNEY,HK2_KIDNEY,EW8_BONE,UOK101_KIDNEY,JHESOAD1_OESOPHAGUS,CH157MN_CENTRAL_NERVOUS_SYSTEM
0,100009676_at,LOC100009676,5.987545,5.444892,5.838828,6.074743,5.7886,5.459675,5.75556,7.190493,...,5.473156,5.517208,5.858379,5.196033,5.831437,5.362021,5.799747,5.865606,5.463812,5.720593
1,10000_at,AKT3,6.230233,7.544216,7.32845,4.27072,4.478293,6.212102,7.562398,8.642669,...,6.375324,6.119814,6.561409,4.521773,6.830904,7.03169,4.881235,6.91464,5.313795,5.757825
2,10001_at,MED6,9.36355,8.715909,8.410834,9.845271,9.761157,10.53282,10.39396,9.478429,...,8.849773,8.767192,8.521635,8.224544,9.325785,8.362727,8.990524,8.958629,9.7481,9.758431
3,10002_at,NR2E3,3.803069,4.173643,3.776557,3.934091,3.822202,3.949198,3.807546,3.930186,...,3.717506,3.977377,3.659459,3.933996,4.515748,4.434658,4.127832,3.942736,4.062648,4.074257
4,10003_at,NAALAD2,3.58643,3.663081,4.047007,3.81725,6.444302,4.081071,5.462774,4.252446,...,3.520843,4.036661,4.168351,3.535915,4.445632,3.622032,5.43658,3.666404,3.556565,3.728828


## Set Matrix to Show Gene Symbols

In [44]:
matrix.rename(columns={'Description':'Gene Symbol'}, inplace=True)

In [45]:
matrix.set_index('Gene Symbol', inplace=True)

In [46]:
matrix.drop('Name', axis=1, inplace=True)

In [47]:
matrix.head()

Unnamed: 0_level_0,LN18_CENTRAL_NERVOUS_SYSTEM,769P_KIDNEY,786O_KIDNEY,CAOV3_OVARY,HEPG2_LIVER,MOLT4_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,NCIH524_LUNG,NCIH209_LUNG,MIAPACA2_PANCREAS,MCAS_OVARY,...,SLR21_KIDNEY,LNZ308_CENTRAL_NERVOUS_SYSTEM,LN340_CENTRAL_NERVOUS_SYSTEM,HCC827GR5_LUNG,SLR20_KIDNEY,HK2_KIDNEY,EW8_BONE,UOK101_KIDNEY,JHESOAD1_OESOPHAGUS,CH157MN_CENTRAL_NERVOUS_SYSTEM
Gene Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
LOC100009676,5.987545,5.444892,5.838828,6.074743,5.7886,5.459675,5.75556,7.190493,5.449818,5.80182,...,5.473156,5.517208,5.858379,5.196033,5.831437,5.362021,5.799747,5.865606,5.463812,5.720593
AKT3,6.230233,7.544216,7.32845,4.27072,4.478293,6.212102,7.562398,8.642669,5.556191,6.808673,...,6.375324,6.119814,6.561409,4.521773,6.830904,7.03169,4.881235,6.91464,5.313795,5.757825
MED6,9.36355,8.715909,8.410834,9.845271,9.761157,10.53282,10.39396,9.478429,9.112954,9.815614,...,8.849773,8.767192,8.521635,8.224544,9.325785,8.362727,8.990524,8.958629,9.7481,9.758431
NR2E3,3.803069,4.173643,3.776557,3.934091,3.822202,3.949198,3.807546,3.930186,4.161937,4.028581,...,3.717506,3.977377,3.659459,3.933996,4.515748,4.434658,4.127832,3.942736,4.062648,4.074257
NAALAD2,3.58643,3.663081,4.047007,3.81725,6.444302,4.081071,5.462774,4.252446,3.932451,3.835827,...,3.520843,4.036661,4.168351,3.535915,4.445632,3.622032,5.43658,3.666404,3.556565,3.728828


## Map Gene Symbols To Up-to-date Approved Gene Symbols

In [9]:
mf.mapgenesymbols(matrix)

Progeres: 100%  18988 Out of 18988   

In [10]:
matrix.shape

(17514, 1037)

In [11]:
matrix.head()

Unnamed: 0_level_0,LN18_CENTRAL_NERVOUS_SYSTEM,769P_KIDNEY,786O_KIDNEY,CAOV3_OVARY,HEPG2_LIVER,MOLT4_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,NCIH524_LUNG,NCIH209_LUNG,MIAPACA2_PANCREAS,MCAS_OVARY,...,SLR21_KIDNEY,LNZ308_CENTRAL_NERVOUS_SYSTEM,LN340_CENTRAL_NERVOUS_SYSTEM,HCC827GR5_LUNG,SLR20_KIDNEY,HK2_KIDNEY,EW8_BONE,UOK101_KIDNEY,JHESOAD1_OESOPHAGUS,CH157MN_CENTRAL_NERVOUS_SYSTEM
Gene Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AKT3,6.230233,7.544216,7.32845,4.27072,4.478293,6.212102,7.562398,8.642669,5.556191,6.808673,...,6.375324,6.119814,6.561409,4.521773,6.830904,7.03169,4.881235,6.91464,5.313795,5.757825
MED6,9.36355,8.715909,8.410834,9.845271,9.761157,10.53282,10.39396,9.478429,9.112954,9.815614,...,8.849773,8.767192,8.521635,8.224544,9.325785,8.362727,8.990524,8.958629,9.7481,9.758431
NR2E3,3.803069,4.173643,3.776557,3.934091,3.822202,3.949198,3.807546,3.930186,4.161937,4.028581,...,3.717506,3.977377,3.659459,3.933996,4.515748,4.434658,4.127832,3.942736,4.062648,4.074257
NAALAD2,3.58643,3.663081,4.047007,3.81725,6.444302,4.081071,5.462774,4.252446,3.932451,3.835827,...,3.520843,4.036661,4.168351,3.535915,4.445632,3.622032,5.43658,3.666404,3.556565,3.728828
CDKN2B-AS1,3.824073,4.069507,4.41817,4.110248,3.960018,4.662574,3.975327,4.444996,3.789725,4.035644,...,4.046835,3.973535,3.819885,4.262679,3.815356,4.004599,3.675165,3.884458,3.886335,4.015238


# Merge Duplicate Genes By Rows

In [12]:
matrix = mf.merge(matrix, 'row', 'mean')

In [13]:
matrix.shape

(17337, 1037)

In [14]:
matrix.head()

Unnamed: 0_level_0,LN18_CENTRAL_NERVOUS_SYSTEM,769P_KIDNEY,786O_KIDNEY,CAOV3_OVARY,HEPG2_LIVER,MOLT4_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,NCIH524_LUNG,NCIH209_LUNG,MIAPACA2_PANCREAS,MCAS_OVARY,...,SLR21_KIDNEY,LNZ308_CENTRAL_NERVOUS_SYSTEM,LN340_CENTRAL_NERVOUS_SYSTEM,HCC827GR5_LUNG,SLR20_KIDNEY,HK2_KIDNEY,EW8_BONE,UOK101_KIDNEY,JHESOAD1_OESOPHAGUS,CH157MN_CENTRAL_NERVOUS_SYSTEM
Gene Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,6.965083,4.833232,4.790854,7.606735,7.424735,7.903102,6.556581,7.439617,4.467285,4.840129,...,5.282991,6.746258,7.737832,5.63905,5.1951,8.018287,7.032084,5.275265,4.980379,6.999456
A1BG-AS1,5.112451,4.226488,3.904246,4.868372,4.115945,4.514233,4.674061,5.078262,4.218495,4.624676,...,4.319376,4.932691,4.937516,4.846121,4.299712,4.641312,4.937402,4.43422,4.237546,5.034802
A1CF,3.576425,4.834175,3.599883,3.670599,9.999267,3.617272,3.603085,3.274729,3.64703,3.953115,...,3.522273,3.474908,3.631742,3.742338,3.605626,3.460358,3.583378,3.58395,3.981553,3.581215
A2M,4.200074,3.743379,3.54026,4.848288,14.09086,3.889207,3.75085,3.727195,3.900514,3.823264,...,3.653691,3.909273,5.147755,4.003809,3.608467,3.678678,7.916182,3.675804,3.774968,3.707107
A2ML1,3.526345,3.60913,3.473109,3.598152,3.551845,3.540173,3.253036,3.422079,3.656358,3.884837,...,3.377822,3.532808,3.638774,3.492496,3.426634,3.566901,3.701338,3.577564,3.549599,3.665536


## Load Sample Metadata 

In [15]:
sample_meta = pd.read_csv('Input/CCLE_Expression.Arrays.sif_2012-10-18.txt', sep='\t', index_col=1)

In [16]:
sample_meta.head()

Unnamed: 0_level_0,ID,Cell line primary name,Gender,Batch,Site Primary,Histology,Hist Subtype1
CCLE name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
LN18_CENTRAL_NERVOUS_SYSTEM,METIS_p_NCLE_RNA1_Human_U133_Plus_2.0_A02_240852,LN-18,M,1,central_nervous_system,glioma,astrocytoma_Grade_IV
769P_KIDNEY,METIS_p_NCLE_RNA1_Human_U133_Plus_2.0_A05_240858,769-P,F,1,kidney,carcinoma,clear_cell_renal_cell_carcinoma
786O_KIDNEY,METIS_p_NCLE_RNA1_Human_U133_Plus_2.0_A07_240862,786-O,M,1,kidney,carcinoma,clear_cell_renal_cell_carcinoma
CAOV3_OVARY,METIS_p_NCLE_RNA1_Human_U133_Plus_2.0_A08_240864,Caov-3,F,1,ovary,carcinoma,adenocarcinoma
HEPG2_LIVER,METIS_p_NCLE_RNA1_Human_U133_Plus_2.0_A09_240866,Hep G2,M,1,liver,carcinoma,hepatocellular_carcinoma


# Normalize For Batch Effects

In [17]:
normalized_matrix = matrix.copy()
for i,batch in enumerate(sample_meta['Batch'].unique()):
    
    progressPercent = ((i+1)/len(sample_meta['Batch'].unique()))*100
    
    cols = sample_meta[sample_meta['Batch'] == batch].index.values.tolist()
    
    normalized_matrix.ix[:, cols] = mf.quantileNormalize(normalized_matrix.ix[:,cols])
    
    for j,index in enumerate(normalized_matrix.ix[:, cols].index):

        progressPercentinner = ((j+1)/len(normalized_matrix.ix[:, cols].index))*100

        sys.stdout.write("Outter: %d%%  %d Out of %d Inner:  %d%%  %d Out of %d \r" % (progressPercent, (i+1), len(sample_meta['Batch'].unique()), progressPercentinner, (j+1), len(normalized_matrix.ix[:, cols].index)))
        sys.stdout.flush()

        mean = normalized_matrix.ix[index, cols].mean()
        std = normalized_matrix.ix[index, cols].std()
        temp = normalized_matrix.ix[index, cols].copy()
        temp = temp.apply(lambda x: ((x-mean)/std))
        normalized_matrix.ix[index, cols] = temp

Outter: 100%  14 Out of 14 Inner:  100%  17337 Out of 17337 

In [18]:
normalized_matrix.head()

Unnamed: 0_level_0,LN18_CENTRAL_NERVOUS_SYSTEM,769P_KIDNEY,786O_KIDNEY,CAOV3_OVARY,HEPG2_LIVER,MOLT4_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,NCIH524_LUNG,NCIH209_LUNG,MIAPACA2_PANCREAS,MCAS_OVARY,...,SLR21_KIDNEY,LNZ308_CENTRAL_NERVOUS_SYSTEM,LN340_CENTRAL_NERVOUS_SYSTEM,HCC827GR5_LUNG,SLR20_KIDNEY,HK2_KIDNEY,EW8_BONE,UOK101_KIDNEY,JHESOAD1_OESOPHAGUS,CH157MN_CENTRAL_NERVOUS_SYSTEM
Gene Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,0.502695,-0.97262,-1.006189,1.020175,0.837669,1.168023,0.230597,0.83652,-1.232227,-1.01545,...,-0.733904,0.610321,1.445425,-0.431498,-0.749541,1.813942,0.898835,-0.68014,-1.006266,0.898407
A1BG-AS1,1.244843,-0.723821,-1.488741,0.377839,-0.940552,0.005174,0.214648,1.117908,-0.600372,0.075751,...,-1.012036,0.348292,0.380517,-0.080611,-0.910432,-0.458806,0.10372,-0.673,-1.16149,0.478357
A1CF,-0.307085,0.708578,-0.350131,-0.238931,4.661455,-0.325196,-0.249422,-0.542779,-0.258301,-0.080076,...,-0.429413,-0.624122,-0.095026,0.114054,-0.151785,-0.674296,-0.385131,-0.307983,0.842803,-0.386687
A2M,-0.214048,-0.446497,-0.542891,-0.018156,4.07029,-0.388704,-0.399565,-0.412471,-0.352169,-0.422669,...,-0.409386,-0.243157,0.590895,-0.201081,-0.423245,-0.406003,2.397774,-0.399119,-0.326927,-0.403062
A2ML1,0.099417,0.231727,-0.476481,0.393972,0.172222,-0.121695,-1.005227,-0.248635,0.650954,1.306716,...,-0.359839,-0.161895,0.032715,-0.244966,-0.311123,-0.122912,0.041057,-0.0984,-0.129961,-0.015836


# Map Sample Metadata to Sample ID

In [48]:
lst = []

for i,col in enumerate(matrix.columns):
    
    col = col.split('.')[0]

    progressPercent = ((i+1)/len(matrix.columns))*100

    sys.stdout.write("Progress: %d%%  %d Out of %d   \r" % (progressPercent, (i+1), len(matrix.columns)))
    sys.stdout.flush()

    
    tissue = 'Tissue:'+(' ').join(col.split('_')[1:])
    cellline = 'Cell-Line:'+col.split('_')[0]
    gender = 'Gender:'+str(sample_meta.ix[col, 'Gender'])
    histology = 'Histology:'+sample_meta.ix[col, 'Histology'].upper()

    lst.append(tuple((tissue, cellline, gender, histology)))


matrix.columns = lst

Progress: 100%  1037 Out of 1037   

In [59]:
matrix.head()

Unnamed: 0_level_0,"(Tissue:CENTRAL NERVOUS SYSTEM, Cell-Line:LN18, Gender:M, Histology:GLIOMA)","(Tissue:KIDNEY, Cell-Line:769P, Gender:F, Histology:CARCINOMA)","(Tissue:KIDNEY, Cell-Line:786O, Gender:M, Histology:CARCINOMA)","(Tissue:OVARY, Cell-Line:CAOV3, Gender:F, Histology:CARCINOMA)","(Tissue:LIVER, Cell-Line:HEPG2, Gender:M, Histology:CARCINOMA)","(Tissue:HAEMATOPOIETIC AND LYMPHOID TISSUE, Cell-Line:MOLT4, Gender:M, Histology:LYMPHOID_NEOPLASM)","(Tissue:LUNG, Cell-Line:NCIH524, Gender:M, Histology:CARCINOMA)","(Tissue:LUNG, Cell-Line:NCIH209, Gender:M, Histology:CARCINOMA)","(Tissue:PANCREAS, Cell-Line:MIAPACA2, Gender:M, Histology:CARCINOMA)","(Tissue:OVARY, Cell-Line:MCAS, Gender:F, Histology:CARCINOMA)",...,"(Tissue:KIDNEY, Cell-Line:SLR21, Gender:nan, Histology:CARCINOMA)","(Tissue:CENTRAL NERVOUS SYSTEM, Cell-Line:LNZ308, Gender:nan, Histology:GLIOMA)","(Tissue:CENTRAL NERVOUS SYSTEM, Cell-Line:LN340, Gender:nan, Histology:GLIOMA)","(Tissue:LUNG, Cell-Line:HCC827GR5, Gender:nan, Histology:CARCINOMA)","(Tissue:KIDNEY, Cell-Line:SLR20, Gender:nan, Histology:CARCINOMA)","(Tissue:KIDNEY, Cell-Line:HK2, Gender:nan, Histology:OTHER)","(Tissue:BONE, Cell-Line:EW8, Gender:nan, Histology:EWINGS_SARCOMA-PERIPHERAL_PRIMITIVE_NEUROECTODERMAL_TUMOUR)","(Tissue:KIDNEY, Cell-Line:UOK101, Gender:nan, Histology:CARCINOMA)","(Tissue:OESOPHAGUS, Cell-Line:JHESOAD1, Gender:nan, Histology:CARCINOMA)","(Tissue:CENTRAL NERVOUS SYSTEM, Cell-Line:CH157MN, Gender:nan, Histology:MENINGIOMA)"
Gene Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
LOC100009676,5.987545,5.444892,5.838828,6.074743,5.7886,5.459675,5.75556,7.190493,5.449818,5.80182,...,5.473156,5.517208,5.858379,5.196033,5.831437,5.362021,5.799747,5.865606,5.463812,5.720593
AKT3,6.230233,7.544216,7.32845,4.27072,4.478293,6.212102,7.562398,8.642669,5.556191,6.808673,...,6.375324,6.119814,6.561409,4.521773,6.830904,7.03169,4.881235,6.91464,5.313795,5.757825
MED6,9.36355,8.715909,8.410834,9.845271,9.761157,10.53282,10.39396,9.478429,9.112954,9.815614,...,8.849773,8.767192,8.521635,8.224544,9.325785,8.362727,8.990524,8.958629,9.7481,9.758431
NR2E3,3.803069,4.173643,3.776557,3.934091,3.822202,3.949198,3.807546,3.930186,4.161937,4.028581,...,3.717506,3.977377,3.659459,3.933996,4.515748,4.434658,4.127832,3.942736,4.062648,4.074257
NAALAD2,3.58643,3.663081,4.047007,3.81725,6.444302,4.081071,5.462774,4.252446,3.932451,3.835827,...,3.520843,4.036661,4.168351,3.535915,4.445632,3.622032,5.43658,3.666404,3.556565,3.728828


# Merge Like Column (by taking the mean)

In [21]:
matrix = mf.merge(matrix, 'column', 'mean')

In [22]:
matrix.shape

(17337, 1036)

## Save Unfiltered Sample Matrix To File

In [23]:
filename = '~/./Documents/Harmonizome/CCLE/Output/ccle_matrix_unfilltered_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
matrix.to_csv(filename, sep='\t', compression='gzip')

# Filtered Samples Matrix

# Map Sample Metadata to Sample ID

In [24]:
lst = []

for i,col in enumerate(normalized_matrix.columns):
    
    col = col.split('.')[0]

    progressPercent = ((i+1)/len(normalized_matrix.columns))*100

    sys.stdout.write("Progress: %d%%  %d Out of %d   \r" % (progressPercent, (i+1), len(normalized_matrix.columns)))
    sys.stdout.flush()

    
    tissue = 'Tissue:'+(' ').join(col.split('_')[1:])
    cellline = 'Cell-Line:'+col.split('_')[0]
    gender = 'Gender:'+str(sample_meta.ix[col, 'Gender'])
    histology = 'Histology:'+sample_meta.ix[col, 'Histology'].upper()

    lst.append(tuple((tissue, cellline, gender, histology)))


normalized_matrix.columns = lst

Progress: 100%  1037 Out of 1037   

In [25]:
normalized_matrix.head()

Unnamed: 0_level_0,"(Tissue:CENTRAL NERVOUS SYSTEM, Cell-Line:LN18, Gender:M, Histology:GLIOMA)","(Tissue:KIDNEY, Cell-Line:769P, Gender:F, Histology:CARCINOMA)","(Tissue:KIDNEY, Cell-Line:786O, Gender:M, Histology:CARCINOMA)","(Tissue:OVARY, Cell-Line:CAOV3, Gender:F, Histology:CARCINOMA)","(Tissue:LIVER, Cell-Line:HEPG2, Gender:M, Histology:CARCINOMA)","(Tissue:HAEMATOPOIETIC AND LYMPHOID TISSUE, Cell-Line:MOLT4, Gender:M, Histology:LYMPHOID_NEOPLASM)","(Tissue:LUNG, Cell-Line:NCIH524, Gender:M, Histology:CARCINOMA)","(Tissue:LUNG, Cell-Line:NCIH209, Gender:M, Histology:CARCINOMA)","(Tissue:PANCREAS, Cell-Line:MIAPACA2, Gender:M, Histology:CARCINOMA)","(Tissue:OVARY, Cell-Line:MCAS, Gender:F, Histology:CARCINOMA)",...,"(Tissue:KIDNEY, Cell-Line:SLR21, Gender:nan, Histology:CARCINOMA)","(Tissue:CENTRAL NERVOUS SYSTEM, Cell-Line:LNZ308, Gender:nan, Histology:GLIOMA)","(Tissue:CENTRAL NERVOUS SYSTEM, Cell-Line:LN340, Gender:nan, Histology:GLIOMA)","(Tissue:LUNG, Cell-Line:HCC827GR5, Gender:nan, Histology:CARCINOMA)","(Tissue:KIDNEY, Cell-Line:SLR20, Gender:nan, Histology:CARCINOMA)","(Tissue:KIDNEY, Cell-Line:HK2, Gender:nan, Histology:OTHER)","(Tissue:BONE, Cell-Line:EW8, Gender:nan, Histology:EWINGS_SARCOMA-PERIPHERAL_PRIMITIVE_NEUROECTODERMAL_TUMOUR)","(Tissue:KIDNEY, Cell-Line:UOK101, Gender:nan, Histology:CARCINOMA)","(Tissue:OESOPHAGUS, Cell-Line:JHESOAD1, Gender:nan, Histology:CARCINOMA)","(Tissue:CENTRAL NERVOUS SYSTEM, Cell-Line:CH157MN, Gender:nan, Histology:MENINGIOMA)"
Gene Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,0.502695,-0.97262,-1.006189,1.020175,0.837669,1.168023,0.230597,0.83652,-1.232227,-1.01545,...,-0.733904,0.610321,1.445425,-0.431498,-0.749541,1.813942,0.898835,-0.68014,-1.006266,0.898407
A1BG-AS1,1.244843,-0.723821,-1.488741,0.377839,-0.940552,0.005174,0.214648,1.117908,-0.600372,0.075751,...,-1.012036,0.348292,0.380517,-0.080611,-0.910432,-0.458806,0.10372,-0.673,-1.16149,0.478357
A1CF,-0.307085,0.708578,-0.350131,-0.238931,4.661455,-0.325196,-0.249422,-0.542779,-0.258301,-0.080076,...,-0.429413,-0.624122,-0.095026,0.114054,-0.151785,-0.674296,-0.385131,-0.307983,0.842803,-0.386687
A2M,-0.214048,-0.446497,-0.542891,-0.018156,4.07029,-0.388704,-0.399565,-0.412471,-0.352169,-0.422669,...,-0.409386,-0.243157,0.590895,-0.201081,-0.423245,-0.406003,2.397774,-0.399119,-0.326927,-0.403062
A2ML1,0.099417,0.231727,-0.476481,0.393972,0.172222,-0.121695,-1.005227,-0.248635,0.650954,1.306716,...,-0.359839,-0.161895,0.032715,-0.244966,-0.311123,-0.122912,0.041057,-0.0984,-0.129961,-0.015836


# Merge Like Column (by taking the mean)

In [26]:
normalized_matrix = mf.merge(normalized_matrix, 'column', 'mean')

In [27]:
normalized_matrix.shape

(17337, 1036)

# Normalize Matrix (Quantile Normalize the matrix for the columns)

In [28]:
normalized_matrix = matrix.copy()

# Nomalize Matrix (z-score the rows)

In [29]:
mf.zscore(normalized_matrix, 'row')

Progress: 100%  17337 Out of 17337   

## Save Filtered Sample Matrix To File

In [31]:
filename = '~/./Documents/Harmonizome/CCLE/Output/ccle_matrix_filltered_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
normalized_matrix.to_csv(filename, sep='\t', compression='gzip')

# Create Tertiary Matrix

In [32]:
tertiary_matrix = mf.createTertiaryMarix(normalized_matrix)

Progeres: 100%  1036 Out of 1036   

In [33]:
tertiary_matrix.head()

Unnamed: 0_level_0,"(Tissue:AUTONOMIC GANGLIA, Cell-Line:CHP126, Gender:F, Histology:NEUROBLASTOMA)","(Tissue:AUTONOMIC GANGLIA, Cell-Line:CHP212, Gender:nan, Histology:NEUROBLASTOMA)","(Tissue:AUTONOMIC GANGLIA, Cell-Line:IMR32, Gender:M, Histology:NEUROBLASTOMA)","(Tissue:AUTONOMIC GANGLIA, Cell-Line:KELLY, Gender:nan, Histology:NEUROBLASTOMA)","(Tissue:AUTONOMIC GANGLIA, Cell-Line:KPNRTBM1, Gender:nan, Histology:NEUROBLASTOMA)","(Tissue:AUTONOMIC GANGLIA, Cell-Line:KPNSI9S, Gender:M, Histology:NEUROBLASTOMA)","(Tissue:AUTONOMIC GANGLIA, Cell-Line:KPNYN, Gender:nan, Histology:NEUROBLASTOMA)","(Tissue:AUTONOMIC GANGLIA, Cell-Line:MHHNB11, Gender:M, Histology:NEUROBLASTOMA)","(Tissue:AUTONOMIC GANGLIA, Cell-Line:NB1, Gender:M, Histology:NEUROBLASTOMA)","(Tissue:AUTONOMIC GANGLIA, Cell-Line:NH6, Gender:nan, Histology:NEUROBLASTOMA)",...,"(Tissue:URINARY TRACT, Cell-Line:RT4, Gender:M, Histology:CARCINOMA)","(Tissue:URINARY TRACT, Cell-Line:SCABER, Gender:M, Histology:CARCINOMA)","(Tissue:URINARY TRACT, Cell-Line:SW1710, Gender:F, Histology:CARCINOMA)","(Tissue:URINARY TRACT, Cell-Line:SW780, Gender:F, Histology:CARCINOMA)","(Tissue:URINARY TRACT, Cell-Line:T24, Gender:F, Histology:CARCINOMA)","(Tissue:URINARY TRACT, Cell-Line:TCCSUP, Gender:F, Histology:CARCINOMA)","(Tissue:URINARY TRACT, Cell-Line:UBLC1, Gender:F, Histology:CARCINOMA)","(Tissue:URINARY TRACT, Cell-Line:UMUC1, Gender:M, Histology:CARCINOMA)","(Tissue:URINARY TRACT, Cell-Line:UMUC3, Gender:M, Histology:CARCINOMA)","(Tissue:URINARY TRACT, Cell-Line:VMCUB1, Gender:M, Histology:CARCINOMA)"
Gene Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,-1.0,0.0,0.0,-1.0,-1.0,0.0,0.0,0.0,0.0,-1.0
A1BG-AS1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.0,0.0,0.0,0.0,-1.0,0.0,-1.0,0.0,0.0,0.0
A1CF,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A2M,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
A2ML1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Save Teriary Matrix

In [34]:
filename = '~/./Documents/Harmonizome/CCLE/Output/ccle_tertiary_matrix_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
tertiary_matrix.to_csv(filename, sep='\t', compression='gzip')

#### Create matrix with simplified column names for libraries

In [72]:
file = 'Input/CCLE_Expression_Entrez_2012-09-29.gct'
col = pd.read_csv(file, sep='\t' ,skiprows=2, nrows=0)
col.drop('NCIH292_LUNG.1', axis=1,inplace=True)

tertiary_matrix_simple_col = tertiary_matrix.copy()
tertiary_matrix_simple_col.columns = col.columns[2:]

In [73]:
tertiary_matrix_simple_col.head()

Unnamed: 0_level_0,LN18_CENTRAL_NERVOUS_SYSTEM,769P_KIDNEY,786O_KIDNEY,CAOV3_OVARY,HEPG2_LIVER,MOLT4_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,NCIH524_LUNG,NCIH209_LUNG,MIAPACA2_PANCREAS,MCAS_OVARY,...,SLR21_KIDNEY,LNZ308_CENTRAL_NERVOUS_SYSTEM,LN340_CENTRAL_NERVOUS_SYSTEM,HCC827GR5_LUNG,SLR20_KIDNEY,HK2_KIDNEY,EW8_BONE,UOK101_KIDNEY,JHESOAD1_OESOPHAGUS,CH157MN_CENTRAL_NERVOUS_SYSTEM
Gene Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,-1.0,0.0,0.0,-1.0,-1.0,0.0,0.0,0.0,0.0,-1.0
A1BG-AS1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.0,0.0,0.0,0.0,-1.0,0.0,-1.0,0.0,0.0,0.0
A1CF,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A2M,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
A2ML1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Path to output files

In [75]:
path = '/Users/moshesilverstein/Documents/Harmonizome/CCLE/Output/'

# Create Up Gene Set Library

In [79]:
name = 'ccle_gene_up_set'

In [80]:
mf.createUpGeneSetLib(tertiary_matrix_simple_col, path, name, details=tertiary_matrix.columns.tolist())

Progeres: 100%  1036 Out of 1036   

# Create Down Gene Set Library

In [81]:
name = 'ccle_gene_down_set'

In [82]:
mf.createDownGeneSetLib(tertiary_matrix_simple_col, path, name, details=tertiary_matrix.columns.tolist())

Progeres: 100%  1036 Out of 1036   

# Create Up Attribute Library

In [83]:
name = 'ccle_attribute_up_set'

In [84]:
mf.createUpAttributeSetLib(tertiary_matrix_simple_col, path, name)

Progeres: 100%  17337 Out of 17337   

# Create Down Attribute Library

In [85]:
name = 'ccle_attribute_down_set'

In [86]:
mf.createDownAttributeSetLib(tertiary_matrix_simple_col, path, name)

Progeres: 100%  17337 Out of 17337   

# Create Gene Similaruty Matrix

In [88]:
gene_similarity_matix = mf.createSimilarityMatrix(normalized_matrix, 'cosine')

In [89]:
gene_similarity_matix.head()

Gene Symbol,A1BG,A1BG-AS1,A1CF,A2M,A2ML1,A4GALT,A4GNT,AAAS,AACS,AACSP1,...,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3
Gene Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,1.0,0.581781,-0.204272,0.180168,-0.140021,-0.111689,-0.024551,0.009723,-0.141812,0.022497,...,-0.011351,-0.031293,0.058236,-0.036765,0.061038,0.116823,0.272626,-0.06591,0.115096,-0.015456
A1BG-AS1,0.581781,1.0,-0.154121,0.147679,-0.042668,0.100474,-0.002285,-0.089154,-0.069732,-0.002152,...,-0.088309,-0.034213,-0.036712,-0.099816,-0.093956,0.12272,0.126242,-0.035312,0.059931,-0.022589
A1CF,-0.204272,-0.154121,1.0,0.13602,-0.038572,-0.059265,0.010876,0.11262,0.205584,0.0199,...,-0.025258,-0.019952,-0.100981,0.033195,-0.009513,-0.071766,-0.172035,-0.031281,-0.079953,-0.083759
A2M,0.180168,0.147679,0.13602,1.0,-0.070217,-0.04928,-0.044527,-0.05438,-0.032112,-0.007224,...,-0.049221,-0.123989,-0.090117,-0.086814,-0.141054,-0.095446,0.105209,0.159027,-0.016643,-0.047812
A2ML1,-0.140021,-0.042668,-0.038572,-0.070217,1.0,0.271285,0.170413,-0.030366,0.04319,0.001712,...,-0.087753,-0.08056,-0.061888,-0.012794,-0.043014,-0.048686,-0.110196,-0.007525,-0.061093,-0.071122


## Save Gene Similarity Matrix

In [90]:
filename = '~/./Documents/Harmonizome/CCLE/Output/ccle_gene_similarity_matix_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
gene_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Attribute Similarity matrix

#### Create matrix with simplified column names for attribute

In [91]:
file = 'Input/CCLE_Expression_Entrez_2012-09-29.gct'
col = pd.read_csv(file, sep='\t' ,skiprows=2, nrows=0)
col.drop('NCIH292_LUNG.1', axis=1,inplace=True)

normalized_matrix_simple_col = normalized_matrix.copy()
normalized_matrix_simple_col.columns = col.columns[2:]

In [92]:
attribute_similarity_matix = mf.createSimilarityMatrix(normalized_matrix_simple_col.T, 'cosine')

In [93]:
attribute_similarity_matix.head()

Unnamed: 0,LN18_CENTRAL_NERVOUS_SYSTEM,769P_KIDNEY,786O_KIDNEY,CAOV3_OVARY,HEPG2_LIVER,MOLT4_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,NCIH524_LUNG,NCIH209_LUNG,MIAPACA2_PANCREAS,MCAS_OVARY,...,SLR21_KIDNEY,LNZ308_CENTRAL_NERVOUS_SYSTEM,LN340_CENTRAL_NERVOUS_SYSTEM,HCC827GR5_LUNG,SLR20_KIDNEY,HK2_KIDNEY,EW8_BONE,UOK101_KIDNEY,JHESOAD1_OESOPHAGUS,CH157MN_CENTRAL_NERVOUS_SYSTEM
LN18_CENTRAL_NERVOUS_SYSTEM,1.0,0.333072,0.614016,0.570349,0.701247,0.219113,0.622899,0.488729,0.471024,0.628029,...,-0.086955,-0.242894,0.000206,-0.103931,-0.12634,-0.035069,-0.033711,-0.141684,-0.032579,-0.104463
769P_KIDNEY,0.333072,1.0,0.347688,0.329929,0.391703,0.384087,0.32429,0.157626,0.33736,0.372374,...,-0.003066,-0.138294,0.017154,-0.059558,-0.01475,0.024167,-0.011656,-0.104698,0.018616,-0.090337
786O_KIDNEY,0.614016,0.347688,1.0,0.509293,0.598352,0.184065,0.578453,0.3653,0.485619,0.543956,...,-0.065897,-0.218219,-0.069313,-0.103156,-0.057819,-0.061086,-0.038001,-0.221654,0.008967,-0.133641
CAOV3_OVARY,0.570349,0.329929,0.509293,1.0,0.550507,0.208762,0.561256,0.426057,0.442166,0.5292,...,-0.026534,-0.124389,-0.104393,-0.126835,-0.102234,-0.088918,-0.069045,-0.198599,0.051339,-0.14981
HEPG2_LIVER,0.701247,0.391703,0.598352,0.550507,1.0,0.206281,0.718255,0.482868,0.489108,0.7386,...,-0.075941,-0.253552,0.006405,-0.053318,-0.081622,-0.017659,-0.035428,-0.164919,0.000659,-0.105589


## Save Attribute Similarity Matrix

In [94]:
filename = '~/./Documents/Harmonizome/CCLE/Output/ccle_attribute_similarity_matix_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
attribute_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene List

In [95]:
gene_list = mf.createGeneList(normalized_matrix)

Progeres: 100%  17337 Out of 17337   

In [96]:
gene_list.head()

Unnamed: 0,GeneSym,GeneID
0,A1BG,1
1,A1BG-AS1,503538
2,A1CF,29974
3,A2M,2
4,A2ML1,144568


In [97]:
gene_list.shape

(17337, 2)

### Save Gene List

In [98]:
filename = '~/./Documents/Harmonizome/CCLE/Output/ccle_gene_list_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
gene_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Attribute List 

In [99]:
attribute_list = mf.createAttributeList(normalized_matrix_simple_col)

In [100]:
attribute_list.head()

Unnamed: 0,Attributes
0,LN18_CENTRAL_NERVOUS_SYSTEM
1,769P_KIDNEY
2,786O_KIDNEY
3,CAOV3_OVARY
4,HEPG2_LIVER


In [101]:
attribute_list.shape

(1036, 1)

### Save Attribute List

In [102]:
filename = '~/./Documents/Harmonizome/CCLE/Output/ccle_attribute_list_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
attribute_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Gene-Attribute Edge List

In [103]:
gene_attribute_edge_list = mf.createGeneAttributeEdgeList(tertiary_matrix_simple_col, gene_list)

Progeres: 100%  1036 Out of 1036   

In [104]:
gene_attribute_edge_list.head()

Unnamed: 0,Attribute,Gene,GeneID,Weight
0,LN18_CENTRAL_NERVOUS_SYSTEM,A1BG,1,0.0
1,LN18_CENTRAL_NERVOUS_SYSTEM,A1BG-AS1,503538,0.0
2,LN18_CENTRAL_NERVOUS_SYSTEM,A1CF,29974,0.0
3,LN18_CENTRAL_NERVOUS_SYSTEM,A2M,2,0.0
4,LN18_CENTRAL_NERVOUS_SYSTEM,A2ML1,144568,0.0


In [105]:
gene_attribute_edge_list.shape

(17961132, 4)

### Get Number of (Statistically Relevant) Gene-Attribute Associations

In [106]:
gene_attribute_edge_list[gene_attribute_edge_list['Weight'] != 0].shape

(3591812, 4)

### Save Gene-Attribute Edge List

In [107]:
filename = '~/./Documents/Harmonizome/CCLE/Output/ccle_gene_attribute_edge_list_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
gene_attribute_edge_list.to_csv(filename, sep='\t', index=False, compression='gzip')