# GDSC (Genomics of Drug Sensitivity in Cancer)

Author: Moshe Silverstein <br/>
Date: 8-17 <br/>
Data Source: http://www.cancerrxgene.org/

In [1]:
import sys, datetime, os
import numpy as np
import pandas as pd
import importlib
import untility_functions as uf
%matplotlib inline

In [2]:
importlib.reload(uf)

<module 'untility_functions' from '/Users/moshesilverstein/Documents/Harmonizome/GDSC/untility_functions.py'>

# Load Data

In [3]:
matrix = pd.read_csv('Input/expU133A.txt.zip', sep='\t')

In [4]:
matrix.head()

Unnamed: 0,PROBE,BxPC-3,KMOE-2,ufM-223,NUGC-3,OC-314,COLO-741,KARPAS-45,JAR,DU-4475,...,SF268,TK10,A2780,OVCAR-4.1,A375,KP-N-S19s,BC-3,PANC-08-13.1,EKVX.1,DMS-114.1
0,1007_s_at,639.944183,16.373449,318.155778,450.998247,239.469002,154.087362,46.001733,203.582799,171.863044,...,84.766663,235.219129,46.343851,263.266154,230.686727,101.217233,17.466248,506.256421,305.627689,89.021586
1,1053_at,51.349139,135.185672,68.408031,65.130261,126.891209,146.355079,163.08733,103.647606,204.069724,...,78.853615,55.688551,76.178632,23.357161,35.83589,75.858236,199.226068,63.581571,63.613736,91.283199
2,117_at,5.900338,53.393754,3.67953,0.835631,0.624551,3.906931,28.396109,10.808254,165.628891,...,0.769494,2.808876,3.210985,0.670012,13.643147,7.977687,2.249444,1.784688,2.224977,3.879441
3,121_at,29.984657,18.760248,22.707359,37.10888,347.899885,21.480952,40.950497,31.54549,31.226368,...,22.735391,553.756598,16.79305,638.726055,38.4079,32.86139,53.520014,46.280554,21.681985,23.478692
4,1255_g_at,0.329891,1.891916,0.371882,1.384804,0.516666,0.461409,0.503592,0.550652,1.15615,...,0.667385,1.072061,0.3726,6.173136,1.860528,18.326121,3.964986,0.492764,0.611497,13.048996


In [5]:
matrix.shape

(22277, 790)

# Load Gene Meta Data

In [6]:
gene_meta = pd.read_csv('Input/ProbesToGenesU133A.tsv', sep='\t', header=None, index_col=0)

In [7]:
gene_meta.head()

Unnamed: 0_level_0,1,2,3
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1007_s_at,DDR1,780.0,discoidin domain receptor tyrosine kinase 1
1007_s_at,MIR4640,100616237.0,microRNA 4640
1053_at,RFC2,5982.0,replication factor C subunit 2
117_at,HSPA6,3310.0,heat shock protein family A (Hsp70) member 6
121_at,PAX8,7849.0,paired box 8


# Map Gene ID to Symbol 

In [8]:
lst = []

for i,index in enumerate(matrix.index):
    
    progressPercent = ((i+1)/len(matrix.index))*100

    sys.stdout.write("Progress: %d%%  %d Out of %d   \r" % (progressPercent, (i+1), len(matrix.index)))
    sys.stdout.flush()
    
    probe = matrix.ix[index, 'PROBE']
    if probe in gene_meta.index and type(gene_meta.ix[probe, 1]) == str:
        lst.append(gene_meta.ix[probe, 1])
    else:
        lst.append(np.nan)
        
matrix['PROBE'] = lst
matrix.dropna(how='any', inplace=True)

Progress: 100%  22277 Out of 22277   

In [9]:
matrix.set_index('PROBE', inplace=True)

In [10]:
matrix.head()

Unnamed: 0_level_0,BxPC-3,KMOE-2,ufM-223,NUGC-3,OC-314,COLO-741,KARPAS-45,JAR,DU-4475,MONO-MAC-6,...,SF268,TK10,A2780,OVCAR-4.1,A375,KP-N-S19s,BC-3,PANC-08-13.1,EKVX.1,DMS-114.1
PROBE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
RFC2,51.349139,135.185672,68.408031,65.130261,126.891209,146.355079,163.08733,103.647606,204.069724,113.892333,...,78.853615,55.688551,76.178632,23.357161,35.83589,75.858236,199.226068,63.581571,63.613736,91.283199
HSPA6,5.900338,53.393754,3.67953,0.835631,0.624551,3.906931,28.396109,10.808254,165.628891,3.575531,...,0.769494,2.808876,3.210985,0.670012,13.643147,7.977687,2.249444,1.784688,2.224977,3.879441
PAX8,29.984657,18.760248,22.707359,37.10888,347.899885,21.480952,40.950497,31.54549,31.226368,18.41448,...,22.735391,553.756598,16.79305,638.726055,38.4079,32.86139,53.520014,46.280554,21.681985,23.478692
GUCA1A,0.329891,1.891916,0.371882,1.384804,0.516666,0.461409,0.503592,0.550652,1.15615,0.565504,...,0.667385,1.072061,0.3726,6.173136,1.860528,18.326121,3.964986,0.492764,0.611497,13.048996
THRA,8.927264,22.617141,22.832327,9.482732,12.532744,20.129683,23.326317,7.619551,42.62994,20.252146,...,13.12384,10.745271,7.717689,16.380232,18.574513,19.247007,49.339992,12.359365,20.152065,30.058033


In [11]:
matrix.shape

(19862, 789)

# Save Unfiltered Matrix To File

In [12]:
filename = '~/./Documents/Harmonizome/GDSC/Output/gdsc_matrix_unfilltered_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
matrix.to_csv(filename, sep='\t', compression='gzip')

# Drop Any Genes That Have Zero Expression Across 95% Of The Samples

In [13]:
normalized_matrix = matrix.copy()

In [14]:
 normalized_matrix.replace(0, np.nan, inplace=True)

In [15]:
normalized_matrix.dropna(thresh=(0.05*normalized_matrix.shape[1]), axis=0, inplace=True)

In [16]:
normalized_matrix.replace(np.nan, 0, inplace=True)

In [17]:
normalized_matrix.shape

(19862, 789)

# Normalize Matrix (Quantile Normalize the matrix for the columns)

In [18]:
normalized_matrix = uf.quantileNormalize(normalized_matrix)

Step 2/2 progress: 100%  789 Out of 789   

In [19]:
normalized_matrix.head()

Unnamed: 0_level_0,BxPC-3,KMOE-2,ufM-223,NUGC-3,OC-314,COLO-741,KARPAS-45,JAR,DU-4475,MONO-MAC-6,...,SF268,TK10,A2780,OVCAR-4.1,A375,KP-N-S19s,BC-3,PANC-08-13.1,EKVX.1,DMS-114.1
PROBE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
RFC2,49.912741,131.919818,68.849847,60.170185,113.173296,140.442908,158.87918,104.863396,208.419544,111.458477,...,74.955031,50.965606,74.218338,22.769558,29.971693,64.374866,205.414014,61.439664,61.120339,84.524163
HSPA6,6.432464,57.822184,5.386406,1.454813,1.059324,5.386406,32.118009,15.690193,168.134055,5.318793,...,0.849222,3.045805,4.215177,0.572874,11.035236,5.729556,1.656562,2.519119,2.396972,3.52492
PAX8,29.200466,24.173254,26.446721,36.610704,339.543847,24.964202,44.647413,37.369442,31.011522,23.094729,...,22.305402,549.481252,18.747931,633.144859,32.174341,24.366186,49.16426,45.575087,20.193253,19.480752
GUCA1A,0.438586,3.109434,0.715395,2.051112,0.91763,0.832674,0.578145,0.903686,1.250702,0.931887,...,0.733659,1.086581,0.379426,5.878178,1.695669,13.114842,3.852042,0.641114,0.793042,10.230372
THRA,9.885777,28.196345,26.601635,12.932332,14.440789,23.711994,27.345918,11.095612,42.731143,24.956751,...,12.887476,9.794829,8.963791,15.533873,14.962072,13.679505,45.262149,13.782679,18.78143,25.802687


# Nomalize Matrix (z-score the rows)

In [20]:
uf.zscore(normalized_matrix, 'row')

Progress: 100%  19862 Out of 19862   

In [21]:
normalized_matrix.head()

Unnamed: 0_level_0,BxPC-3,KMOE-2,ufM-223,NUGC-3,OC-314,COLO-741,KARPAS-45,JAR,DU-4475,MONO-MAC-6,...,SF268,TK10,A2780,OVCAR-4.1,A375,KP-N-S19s,BC-3,PANC-08-13.1,EKVX.1,DMS-114.1
PROBE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
RFC2,-0.711609,1.131753,-0.285939,-0.481041,0.710367,1.323336,1.737748,0.523577,2.851321,0.671822,...,-0.148706,-0.687942,-0.165265,-1.321736,-1.159845,-0.386528,2.783762,-0.452506,-0.459683,0.06639
HSPA6,-0.18527,0.598926,-0.201232,-0.261228,-0.267263,-0.201232,0.206686,-0.043999,2.282262,-0.202264,...,-0.270469,-0.236949,-0.219105,-0.274686,-0.115032,-0.195996,-0.258149,-0.244986,-0.24685,-0.229638
PAX8,-0.252337,-0.272261,-0.263251,-0.222968,0.977647,-0.269127,-0.191116,-0.219961,-0.245159,-0.276536,...,-0.279664,1.809692,-0.293764,2.141276,-0.240551,-0.271497,-0.173215,-0.18744,-0.288035,-0.290859
GUCA1A,-0.089372,-0.058817,-0.086205,-0.070924,-0.083892,-0.084863,-0.087775,-0.084051,-0.080081,-0.083728,...,-0.085996,-0.081959,-0.090049,-0.027142,-0.074991,0.055647,-0.050321,-0.087055,-0.085317,0.022648
THRA,-1.068731,0.398442,0.270662,-0.82462,-0.703751,0.039124,0.330299,-0.971791,1.563073,0.138862,...,-0.828214,-1.076018,-1.142607,-0.616166,-0.661982,-0.764751,1.765875,-0.756484,-0.355948,0.206645


# Merge Duplicate Samples By Columns (by taking the mean)

In [22]:
lst = []

for col in normalized_matrix.columns:
    lst.append(col.split('.')[0])
    
normalized_matrix.columns = lst

In [23]:
normalized_matrix = uf.merge(normalized_matrix, 'column', 'mean')

In [24]:
normalized_matrix.shape

(19862, 727)

# Map Gene Symbols To Up-to-date Approved Gene Symbols

In [25]:
uf.mapgenesymbols(normalized_matrix)

Progeres: 100%  19862 Out of 19862   

In [26]:
matrix.shape

(19862, 789)

# Merge Duplicate Genes By Rows

In [27]:
normalized_matrix = uf.merge(normalized_matrix, 'row', 'mean')

In [28]:
normalized_matrix.shape

(12296, 727)

# Save Filtered Matrix

In [29]:
filename = '~/./Documents/Harmonizome/GDSC/Output/gdsc_matrix_filltered_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
normalized_matrix.to_csv(filename, sep='\t', compression='gzip')

# Create Tertiary Matrix

In [30]:
tertiary_matrix = uf.createTertiaryMarix(normalized_matrix)

Progeres: 100%  727 Out of 727   

In [31]:
tertiary_matrix.head()

Unnamed: 0_level_0,22RV1,23132-87,380,5637,639-V,647-V,697,769-P,786-0,8-MG-BA,...,VMRC-RCZ,WERI-Rb-1,WM-115,WSU-NHL,YAPC,YH-13,YKG-1,ZR-75-30,no-10,no-11
PROBE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1CF,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A2M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A4GALT,-1.0,0.0,-1.0,0.0,0.0,-1.0,-1.0,0.0,-1.0,0.0,...,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,1.0
A4GNT,-1.0,0.0,-1.0,-1.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,-1.0,0.0,-1.0,0.0,0.0,0.0,-1.0,0.0,0.0
AAAS,0.0,0.0,-1.0,-1.0,0.0,-1.0,0.0,1.0,-1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,-1.0,0.0


# Save Teriary Matrix

In [32]:
filename = '~/./Documents/Harmonizome/GDSC/Output/gdsc_tertiary_matrix_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
tertiary_matrix.to_csv(filename, sep='\t', compression='gzip')

### Path to output files

In [42]:
path = '/Users/moshesilverstein/Documents/Harmonizome/GDSC/Output/'

# Create Up Gene Set Library

In [43]:
name = 'gdsc_gene_up_set'

In [44]:
uf.createUpGeneSetLib(tertiary_matrix, path, name)

Progeres: 100%  727 Out of 727   

# Create Down Gene Set Library

In [45]:
name = 'gdsc_gene_down_set'

In [46]:
uf.createDownGeneSetLib(tertiary_matrix, path, name)

Progeres: 100%  727 Out of 727   

# Create Up Attribute Library

In [47]:
name = 'gdsc_attribute_up_set'

In [48]:
uf.createUpAttributeSetLib(tertiary_matrix, path, name)

Progeres: 100%  12296 Out of 12296   

# Create Down Attribute Library

In [49]:
name = 'gdsc_attribute_down_set'

In [50]:
uf.createDownAttributeSetLib(tertiary_matrix, path, name)

Progeres: 100%  12296 Out of 12296   

# Create Gene Similarity Matrix

In [51]:
gene_similarity_matix = uf.createSimilarityMatrix(normalized_matrix, 'cosine')

In [52]:
gene_similarity_matix.head()

PROBE,A1CF,A2M,A4GALT,A4GNT,AAAS,AACS,AADAC,AAGAB,AAK1,AAMDC,...,ZSWIM1,ZSWIM8,ZW10,ZWILCH,ZWINT,ZXDB,ZXDC,ZYX,ZZEF1,ZZZ3
PROBE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1CF,1.0,0.255595,-0.033207,0.049937,0.08212,0.147871,0.015613,0.004685,-0.055131,0.064878,...,0.145505,0.008739,-0.04984,-0.010036,0.035482,0.012953,0.008065,-0.034493,-0.031332,-0.084004
A2M,0.255595,1.0,-0.065254,-0.040466,-0.023013,-0.042714,-0.022066,0.018307,-0.040571,0.035798,...,0.01306,0.018648,-0.034983,-0.009465,-0.117534,0.011927,-0.074811,0.026035,-0.012152,0.003269
A4GALT,-0.033207,-0.065254,1.0,0.021044,0.009858,-0.002789,-0.014219,-0.051756,0.02685,-0.004466,...,-0.063633,0.087421,-0.046712,-0.034705,-0.062013,-0.002014,0.01982,-0.019787,-0.065415,-0.0197
A4GNT,0.049937,-0.040466,0.021044,1.0,0.042046,0.014076,0.044933,-0.00448,-0.002613,-0.032182,...,0.070155,-0.029523,0.034823,-0.003093,-0.01111,0.063289,-0.008444,0.004539,0.029043,0.049417
AAAS,0.08212,-0.023013,0.009858,0.042046,1.0,0.05606,-0.049126,0.114366,0.204521,0.106213,...,0.098341,0.170105,-0.116632,0.060668,0.267183,0.0719,0.077259,0.043894,0.229175,-0.121875


# Save Gene Similarity Matrix

In [53]:
filename = 'Output/gdsc_gene_similarity_matix_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
gene_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Attribute Similarity matrix

In [54]:
attribute_similarity_matix = uf.createSimilarityMatrix(normalized_matrix.T, 'cosine')

In [55]:
attribute_similarity_matix.head()

Unnamed: 0,22RV1,23132-87,380,5637,639-V,647-V,697,769-P,786-0,8-MG-BA,...,VMRC-RCZ,WERI-Rb-1,WM-115,WSU-NHL,YAPC,YH-13,YKG-1,ZR-75-30,no-10,no-11
22RV1,1.0,0.09372,0.015854,-0.041533,-0.032991,-0.052188,0.02371,0.025385,-0.058713,-0.009366,...,-0.030108,0.044262,-0.052859,0.044498,-0.063227,-0.006082,0.025773,0.068811,-0.049298,-0.020784
23132-87,0.09372,1.0,-0.153124,-0.082618,-0.042215,-0.0581,-0.12319,0.17617,-0.089834,0.119579,...,0.116601,-0.036058,-0.127101,0.077109,0.139792,0.010487,-0.011674,0.026777,0.002449,0.111138
380,0.015854,-0.153124,1.0,-0.026266,-0.029638,0.002096,0.42155,-0.142021,-0.044042,-0.140735,...,-0.141415,0.055105,-0.016475,0.151626,-0.083199,-0.141315,-0.084273,-0.011065,-0.147236,-0.141819
5637,-0.041533,-0.082618,-0.026266,1.0,0.08781,0.141352,-0.005326,-0.117565,0.147874,-0.060139,...,-0.103327,-0.069679,0.03147,-0.163151,0.059555,-0.093072,-0.071513,-0.014852,-0.030563,-0.096335
639-V,-0.032991,-0.042215,-0.029638,0.08781,1.0,0.052778,0.01146,-0.10032,0.175232,-0.071871,...,-0.109451,-0.04176,-0.02576,-0.151005,-0.057722,-0.128256,-0.015021,-0.043742,0.035568,-0.009198


# Save Attribute Similarity Matrix

In [56]:
filename = 'Output/gdsc_attribute_similarity_matix_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
attribute_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene List

In [57]:
gene_list = uf.createGeneList(normalized_matrix)

Progeres: 100%  12296 Out of 12296   

In [58]:
gene_list.head()

Unnamed: 0,GeneSym,GeneID
0,A1CF,29974
1,A2M,2
2,A4GALT,53947
3,A4GNT,51146
4,AAAS,8086


In [59]:
gene_list.shape

(12296, 2)

# Save Gene List

In [60]:
filename = 'Output/gdsc_gene_list_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
gene_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Attribute List

In [61]:
attribute_list = uf.createAttributeList(normalized_matrix)

In [62]:
attribute_list.head()

Unnamed: 0,Attributes
0,22RV1
1,23132-87
2,380
3,5637
4,639-V


In [63]:
attribute_list.shape

(727, 1)

# Save Attribute List

In [64]:
filename = 'Output/gdsc_attribute_list_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
attribute_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Gene-Attribute Edge List

In [65]:
path = '/Users/moshesilverstein/Documents/Harmonizome/GDSC/Output/'

In [66]:
name = 'gdsc_gene_attribute_edge_list'

In [67]:
uf.createGeneAttributeEdgeList(tertiary_matrix, gene_list, path, name)

Progeres: 100%  727 Out of 727   

 The number of statisticaly relevent gene-attribute associations is: 1787693
