# Catalogue of Somatic Mutations In Cancer (COSMIC) Copy Number Variants (CNV)

Author: Moshe Silverstein <br/>
Date: 8-17 <br/>
Data Source: https://cancer.sanger.ac.uk/cosmic/download

In [1]:
import sys, datetime, os
import numpy as np
import pandas as pd
import importlib
import untility_functions as uf
%matplotlib inline

In [2]:
importlib.reload(uf)

<module 'untility_functions' from '/Users/moshesilverstein/Documents/Harmonizome/COSMIC/CNV/untility_functions.py'>

# Load Data

In [3]:
# Load the COSMIC cell-line copy-number export (tab-separated; gzip-compressed per its extension).
df = pd.read_csv('Input/CosmicCLP_CompleteCNA.tsv.gz', sep='\t')

In [4]:
df.head()

Unnamed: 0,CNV_ID,ID_GENE,gene_name,ID_SAMPLE,ID_TUMOUR,Primary site,Site subtype 1,Site subtype 2,Site subtype 3,Primary histology,Histology subtype 1,Histology subtype 2,Histology subtype 3,SAMPLE_NAME,TOTAL_CN,MINOR_ALLELE,MUT_TYPE,ID_STUDY,GRCh,Chromosome:G_Start..G_Stop
0,6334435,101525,GYG2P1,683665,611825,haematopoietic_and_lymphoid_tissue,NS,NS,NS,lymphoid_neoplasm,plasma_cell_myeloma,NS,NS,MC-CAR,5,0,gain,619,38,Y:12363966..12512040
1,6583900,69786,CDY1B,683665,611825,haematopoietic_and_lymphoid_tissue,NS,NS,NS,lymphoid_neoplasm,plasma_cell_myeloma,NS,NS,MC-CAR,7,0,gain,619,38,Y:22477961..24258193
2,6335779,68758,CTDSPL,683665,611825,haematopoietic_and_lymphoid_tissue,NS,NS,NS,lymphoid_neoplasm,plasma_cell_myeloma,NS,NS,MC-CAR,0,0,loss,619,38,3:37940617..37945438
3,6352848,55218,LCE3C,683665,611825,haematopoietic_and_lymphoid_tissue,NS,NS,NS,lymphoid_neoplasm,plasma_cell_myeloma,NS,NS,MC-CAR,0,0,loss,619,38,1:152583052..152613763
4,6583900,106281,DAZ1_ENST00000382510,683665,611825,haematopoietic_and_lymphoid_tissue,NS,NS,NS,lymphoid_neoplasm,plasma_cell_myeloma,NS,NS,MC-CAR,7,0,gain,619,38,Y:22477961..24258193


In [5]:
df.shape

(176254, 20)

# Get Relevant Data

In [6]:
# Keep only the gene, sample and total copy-number columns.
# .copy() makes this an independent frame, so the later in-place operations
# (drop_duplicates, set_index) do not act on a slice of the raw table and
# cannot trigger SettingWithCopyWarning.
df = df[['gene_name', 'SAMPLE_NAME', 'TOTAL_CN']].copy()

In [7]:
df.head()

Unnamed: 0,gene_name,SAMPLE_NAME,TOTAL_CN
0,GYG2P1,MC-CAR,5
1,CDY1B,MC-CAR,7
2,CTDSPL,MC-CAR,0
3,LCE3C,MC-CAR,0
4,DAZ1_ENST00000382510,MC-CAR,7


# Drop Duplicates

In [8]:
# Remove rows that are identical in every column.
# Rebinding to the returned frame (instead of inplace=True) avoids mutating a
# column slice of the raw table and keeps the cell safe to re-run.
df = df.drop_duplicates()

In [9]:
df.shape

(173794, 3)

# Create Matrix

In [10]:
# Distinct gene labels (future matrix rows) and sample names (future matrix columns).
genes = pd.unique(df['gene_name']).tolist()
samples = pd.unique(df['SAMPLE_NAME']).tolist()

In [11]:
# Index the long table by sample name so that all rows for one sample can be
# selected with df.loc[<sample>] when the matrix is filled column by column.
df.set_index('SAMPLE_NAME', inplace=True)

In [12]:
# Gene x sample matrix initialised to 0 (no copy-number value recorded).
# Building it with a fill value directly avoids creating an object-dtype
# all-NaN frame and then rewriting every cell with replace().
matrix = pd.DataFrame(0, index=genes, columns=samples)

In [13]:
# Fill the gene x sample matrix one sample (column) at a time from the long table.
n_samples = len(matrix.columns)

for i, col in enumerate(matrix.columns):

    progress = ((i+1)/n_samples)*100

    sys.stdout.write("Progress: %d Out of %d which is %d%%   \r" % ((i+1), n_samples, progress))
    sys.stdout.flush()

    # Selecting with a list label ([col]) always returns a DataFrame, even when
    # the sample has a single row. The previous scalar lookup returned a plain
    # string in that case, so the `.shape` test raised AttributeError.
    sample_rows = df.loc[[col], ['gene_name', 'TOTAL_CN']]

    # .loc replaces the removed .ix indexer; both sides are label-based here.
    matrix.loc[sample_rows['gene_name'].values.tolist(), col] = sample_rows['TOTAL_CN'].values.tolist()

Progress: 1012 Out of 1012 which is 100%   

In [14]:
matrix.head()

Unnamed: 0,MC-CAR,PFSK-1,A673,ES3,ES5,ES7,EW-11,SK-ES-1,NCI-H1395,COLO-829,...,SNU-1040,SNU-175,SNU-283,SNU-407,SNU-61,SNU-81,SNU-C5,DIFI,LIM1215,GEO
GYG2P1,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CDY1B,7,14,0,0,13,0,0,0,0,0,...,14,0,0,7,0,0,0,0,8,0
CTDSPL,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
LCE3C,0,9,0,0,0,0,0,7,9,0,...,0,0,0,0,0,0,0,0,0,0
DAZ1_ENST00000382510,7,14,0,0,13,0,0,0,0,0,...,14,0,0,7,0,0,0,0,6,0


In [15]:
matrix.shape

(25106, 1012)

# Get Only Gene Symbols in Index

In [16]:
# Reduce every row label to the text before its first underscore,
# e.g. 'DAZ1_ENST00000382510' -> 'DAZ1'.
symbols = []
total = len(matrix.index)

for position, label in enumerate(matrix.index, start=1):

    percent = (position/total)*100

    sys.stdout.write("Progress: %d Out of %d which is %d%%   \r" % (position, total, percent))
    sys.stdout.flush()

    symbols.append(label.split('_')[0])

matrix.index = symbols

Progress: 25106 Out of 25106 which is 100%   

# Map Gene Symbols To Up-to-date Approved Gene Symbols

In [17]:
# Map row labels to approved gene symbols. The helper modifies `matrix` in place:
# nothing is assigned here, yet the row count changes in the next cell.
# NOTE(review): presumably rows with no approved symbol are dropped — confirm in untility_functions.
uf.mapgenesymbols(matrix)

Progeres: 100%  25106 Out of 25106   

In [18]:
matrix.shape

(23705, 1012)

# Merge Duplicate Genes By Rows

In [19]:
# Collapse rows that share the same gene symbol into one row per gene.
# NOTE(review): 'mean' presumably selects averaging of the duplicates — confirm in untility_functions.
matrix = uf.merge(matrix, 'row', 'mean')

In [20]:
matrix.shape

(16080, 1012)

# Drop Genes That Have Zero Value Across All Samples

In [21]:
# Temporarily mark zeros as missing so all-zero rows/columns can be removed with dropna.
matrix.replace(to_replace=0.0, value=np.nan, inplace=True)

In [22]:
# Remove genes (rows) that have no value in any sample.
matrix.dropna(axis='index', how='all', inplace=True)

In [23]:
matrix.shape

(15685, 1012)

# Drop Samples with Zero Value Across all Genes

In [24]:
# Remove samples (columns) that have no value for any gene.
matrix.dropna(axis='columns', how='all', inplace=True)

In [25]:
matrix.shape

(15685, 798)

In [26]:
# Turn the remaining missing markers back into 0.0 now that empty genes and samples are gone.
matrix.replace(to_replace=np.nan, value=0.0, inplace=True)

# Save Unfiltered Matrix To File

In [27]:
# Write the unnormalised matrix, tagged with the current year and month (YYYY_MM).
# The extension is .gz because the file is gzip-compressed; the previous .zip
# name produced a file that zip tools and extension-based readers cannot open.
filename = 'Output/cosmic_cnv_matrix_unfilltered_%s.tsv.gz' % datetime.date.today().strftime('%Y_%m')
matrix.to_csv(filename, sep='\t', compression='gzip')

# Normalize Matrix

In [28]:
normalized_matrix = matrix.copy()

# Quantile Normalize the matrix for the columns

In [29]:
# Quantile-normalise the matrix; the helper returns the result, which replaces the working copy.
# NOTE(review): normalisation is presumably applied across the sample columns — confirm in untility_functions.
normalized_matrix = uf.quantileNormalize(normalized_matrix)

Step 2/2 progress: 100%  798 Out of 798   

In [30]:
normalized_matrix.head()

Unnamed: 0_level_0,MC-CAR,PFSK-1,A673,ES3,ES5,SK-ES-1,NCI-H1395,5637,RT4,TCCSUP,...,OCI-LY7,Set2,SNU-1040,SNU-283,SNU-407,SNU-61,SNU-81,DIFI,LIM1215,GEO
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A1CF,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A2M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A2ML1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A3GALT2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Normalize Matrix (z-score the rows)

In [31]:
# Z-score along rows. The helper works in place: nothing is assigned here, yet
# the values shown by the next head() differ from those displayed before the call.
uf.zscore(normalized_matrix, 'row')

Progress: 100%  15685 Out of 15685   

In [32]:
normalized_matrix.head()

Unnamed: 0_level_0,MC-CAR,PFSK-1,A673,ES3,ES5,SK-ES-1,NCI-H1395,5637,RT4,TCCSUP,...,OCI-LY7,Set2,SNU-1040,SNU-283,SNU-407,SNU-61,SNU-81,DIFI,LIM1215,GEO
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,-0.044967,-0.044967,-0.044967,-0.044967,-0.044967,-0.044967,-0.044967,-0.044967,-0.044967,-0.044967,...,-0.044967,-0.044967,-0.044967,-0.044967,-0.044967,-0.044967,-0.044967,-0.044967,-0.044967,-0.044967
A1CF,-0.040725,-0.040725,-0.040725,-0.040725,-0.040725,-0.040725,-0.040725,-0.040725,-0.040725,-0.040725,...,-0.040725,-0.040725,-0.040725,-0.040725,-0.040725,-0.040725,-0.040725,-0.040725,-0.040725,-0.040725
A2M,-0.072711,-0.072711,-0.072711,-0.072711,-0.072711,-0.072711,-0.072711,-0.072711,-0.072711,-0.072711,...,-0.072711,-0.072711,-0.072711,-0.072711,-0.072711,-0.072711,-0.072711,-0.072711,-0.072711,-0.072711
A2ML1,-0.072711,-0.072711,-0.072711,-0.072711,-0.072711,-0.072711,-0.072711,-0.072711,-0.072711,-0.072711,...,-0.072711,-0.072711,-0.072711,-0.072711,-0.072711,-0.072711,-0.072711,-0.072711,-0.072711,-0.072711
A3GALT2,-0.037107,-0.037107,-0.037107,-0.037107,-0.037107,-0.037107,-0.037107,-0.037107,-0.037107,-0.037107,...,-0.037107,-0.037107,-0.037107,-0.037107,-0.037107,-0.037107,-0.037107,-0.037107,-0.037107,-0.037107


# Save Filtered Matrix

In [33]:
# Write the normalised (quantile + z-score) matrix, tagged with the current year and month (YYYY_MM).
# The extension is .gz because the file is gzip-compressed; the previous .zip
# name produced a file that zip tools and extension-based readers cannot open.
filename = 'Output/cosmic_cnv_matrix_filltered_%s.tsv.gz' % datetime.date.today().strftime('%Y_%m')
normalized_matrix.to_csv(filename, sep='\t', compression='gzip')

# Create Tertiary Matrix

In [34]:
# Derive the tertiary matrix from the normalised matrix.
# NOTE(review): presumably a three-level (-1 / 0 / +1) call per gene-sample pair;
# confirm the thresholding rule in untility_functions.
tertiary_matrix = uf.createTertiaryMarix(normalized_matrix)

Progeres: 100%  798 Out of 798   

In [35]:
tertiary_matrix.head()

Unnamed: 0_level_0,MC-CAR,PFSK-1,A673,ES3,ES5,SK-ES-1,NCI-H1395,5637,RT4,TCCSUP,...,OCI-LY7,Set2,SNU-1040,SNU-283,SNU-407,SNU-61,SNU-81,DIFI,LIM1215,GEO
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A1CF,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A2M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A2ML1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A3GALT2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Save Tertiary Matrix

In [36]:
# Write the tertiary matrix, tagged with the current year and month (YYYY_MM).
# The extension is .gz because the file is gzip-compressed; the previous .zip
# name produced a file that zip tools and extension-based readers cannot open.
filename = 'Output/cosmic_cnv_tertiary_matrix_%s.tsv.gz' % datetime.date.today().strftime('%Y_%m')
tertiary_matrix.to_csv(filename, sep='\t', compression='gzip')

#### Path to output files

In [37]:
# Absolute path of the Output directory, derived from the notebook's working
# directory instead of a user-specific hard-coded location, so the notebook runs
# on any machine. The trailing separator is kept because the previous value
# ended with one (the helpers presumably concatenate path + name — confirm).
path = os.path.join(os.getcwd(), 'Output') + os.sep

# Create Up Gene Set Library

In [38]:
name = 'cosmic_cnv_gene_up_set'

In [39]:
uf.createUpGeneSetLib(tertiary_matrix, path, name)

Progeres: 100%  798 Out of 798   

# Create Down Gene Set Library

In [40]:
name = 'cosmic_cnv_gene_down_set'

In [41]:
uf.createDownGeneSetLib(tertiary_matrix, path, name)

Progeres: 100%  798 Out of 798   

# Create Up Attribute Library

In [42]:
name = 'cosmic_cnv_attribute_up_set'

In [43]:
uf.createUpAttributeSetLib(tertiary_matrix, path, name)

Progeres: 100%  15685 Out of 15685   

# Create Down Attribute Library

In [44]:
name = 'cosmic_cnv_attribute_down_set'

In [45]:
uf.createDownAttributeSetLib(tertiary_matrix, path, name)

Progeres: 100%  15685 Out of 15685   

# Create Gene Similarity Matrix

In [46]:
# Pairwise cosine similarity between the rows (genes) of the normalised matrix.
gene_similarity_matix = uf.createSimilarityMatrix(normalized_matrix, 'cosine')

In [47]:
gene_similarity_matix.head()

index,A1BG,A1CF,A2M,A2ML1,A3GALT2,A4GALT,A4GNT,AAAS,AACS,AADAC,...,ZW10,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZZ3
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,1.0,-0.001834,-0.003274,-0.003274,-0.001671,-0.001594,-0.002579,-0.001798,-0.002539,-0.003477,...,-0.002347,-0.001915,-0.003057,-0.002853,-0.003173,-0.001997,-0.002191,-0.002191,-0.001945,-0.003083
A1CF,-0.001834,1.0,-0.002965,-0.002965,-0.001513,-0.001443,-0.002335,-0.001628,-0.0023,-0.003149,...,-0.002125,-0.001735,0.013999,-0.002584,-0.002874,-0.001809,-0.001984,-0.001984,-0.001762,-0.002793
A2M,-0.003274,-0.002965,1.0,1.0,-0.002701,-0.002577,-0.00417,0.651807,-0.004106,-0.005622,...,-0.003795,-0.003097,-0.004943,-0.004614,-0.005131,-0.003229,-0.002218,-0.002218,-0.003145,-0.004986
A2ML1,-0.003274,-0.002965,1.0,1.0,-0.002701,-0.002577,-0.00417,0.651807,-0.004106,-0.005622,...,-0.003795,-0.003097,-0.004943,-0.004614,-0.005131,-0.003229,-0.002218,-0.002218,-0.003145,-0.004986
A3GALT2,-0.001671,-0.001513,-0.002701,-0.002701,1.0,-0.001315,-0.002128,-0.001484,-0.002095,-0.002869,...,-0.001937,-0.001581,-0.002522,-0.002355,-0.002619,-0.001648,-0.001808,-0.001808,-0.001605,-0.001839


# Save Gene Similarity Matrix 

In [48]:
# Write the gene-gene similarity matrix, tagged with the current year and month (YYYY_MM).
# The extension is .gz because the file is gzip-compressed; the previous .zip
# name produced a file that zip tools and extension-based readers cannot open.
filename = 'Output/cosmic_cnv_gene_similarity_matix_%s.tsv.gz' % datetime.date.today().strftime('%Y_%m')
gene_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Attribute Similarity matrix

In [49]:
# Same similarity computation on the transposed matrix, so that samples
# (attributes) are the rows being compared instead of genes.
attribute_similarity_matix = uf.createSimilarityMatrix(normalized_matrix.T, 'cosine')

In [50]:
attribute_similarity_matix.head()

Unnamed: 0,MC-CAR,PFSK-1,A673,ES3,ES5,SK-ES-1,NCI-H1395,5637,RT4,TCCSUP,...,OCI-LY7,Set2,SNU-1040,SNU-283,SNU-407,SNU-61,SNU-81,DIFI,LIM1215,GEO
MC-CAR,1.0,0.745922,0.129055,0.548445,0.814658,0.543994,-0.079743,-0.007306,0.075204,0.106928,...,0.288925,-0.051375,-0.013607,0.539814,0.711066,-0.062808,0.121661,0.027512,0.810815,0.121859
PFSK-1,0.745922,1.0,0.130721,0.534245,0.891933,0.717846,-0.078802,-0.004904,0.07569,0.108389,...,0.260546,-0.051333,-0.013844,0.55147,0.656659,-0.062551,0.123219,0.028085,0.748119,0.123424
A673,0.129055,0.130721,1.0,0.245553,0.140225,0.157079,-0.027892,0.000608,0.006591,0.065678,...,0.039496,-0.017561,-0.009446,0.23434,0.099881,-0.020157,0.07276,0.020604,0.123526,0.072957
ES3,0.548445,0.534245,0.245553,1.0,0.567951,0.662214,-0.09364,0.000126,0.031246,0.207879,...,0.175424,-0.059139,-0.027936,0.741483,0.441411,-0.068803,0.230794,0.064274,0.503573,0.231398
ES5,0.814658,0.891933,0.140225,0.567951,1.0,0.544796,-0.084578,-0.002633,0.069518,0.116341,...,0.282332,-0.054447,-0.013141,0.576739,0.712139,-0.066383,0.132167,0.030318,0.821248,0.13239


# Save Attribute Similarity Matrix

In [51]:
# Write the sample-sample similarity matrix, tagged with the current year and month (YYYY_MM).
# The extension is .gz because the file is gzip-compressed; the previous .zip
# name produced a file that zip tools and extension-based readers cannot open.
filename = 'Output/cosmic_cnv_attribute_similarity_matix_%s.tsv.gz' % datetime.date.today().strftime('%Y_%m')
attribute_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene List

In [52]:
gene_list = uf.createGeneList(normalized_matrix)

Progeres: 100%  15685 Out of 15685   

In [53]:
gene_list.head()

Unnamed: 0,GeneSym,GeneID
0,A1BG,1
1,A1CF,29974
2,A2M,2
3,A2ML1,144568
4,A3GALT2,127550


In [54]:
gene_list.shape

(15685, 2)

# Save Gene List

In [55]:
# Write the gene list (without the row index), tagged with the current year and month (YYYY_MM).
# The extension is .gz because the file is gzip-compressed; the previous .zip
# name produced a file that zip tools and extension-based readers cannot open.
filename = 'Output/cosmic_cnv_gene_list_%s.tsv.gz' % datetime.date.today().strftime('%Y_%m')
gene_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Attribute List 

In [56]:
attribute_list = uf.createAttributeList(normalized_matrix)

In [57]:
attribute_list.head()

Unnamed: 0,Attributes
0,MC-CAR
1,PFSK-1
2,A673
3,ES3
4,ES5


In [58]:
attribute_list.shape

(798, 1)

# Save Attribute List

In [59]:
# Write the attribute (sample) list without the row index, tagged with the current year and month (YYYY_MM).
# The extension is .gz because the file is gzip-compressed; the previous .zip
# name produced a file that zip tools and extension-based readers cannot open.
filename = 'Output/cosmic_cnv_attribute_list_%s.tsv.gz' % datetime.date.today().strftime('%Y_%m')
attribute_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Gene-Attribute Edge List

In [60]:
# Absolute path of the Output directory, derived from the notebook's working
# directory instead of a user-specific hard-coded location, so the notebook runs
# on any machine. The trailing separator is kept because the previous value
# ended with one (the helper presumably concatenates path + name — confirm).
path = os.path.join(os.getcwd(), 'Output') + os.sep

In [61]:
# Base name for the edge-list output. Uses the same 'cosmic_cnv_' prefix as every
# other artefact written by this notebook; the previous 'achilles_' prefix was a
# leftover from a different dataset and mislabelled the file.
name = 'cosmic_cnv_gene_attribute_edge_list'

In [62]:
# Build the gene-attribute edge list from the tertiary matrix and the gene list.
# The return value is not captured; the helper receives `path` and `name`, so it
# presumably writes the result to disk itself — confirm in untility_functions.
uf.createGeneAttributeEdgeList(tertiary_matrix, gene_list, path, name)

Progeres: 100%  798 Out of 798   

 The number of statisticaly relevent gene-attribute associations is: 2529012
