# Catalogue of Somatic Mutations In Cancer (COSMIC) Cell Line Gene Mutation

Author: Moshe Silverstein <br/>
Date: 8-17 <br/>
Data Source: https://cancer.sanger.ac.uk/cosmic/download

In [2]:
import sys, datetime, os
import numpy as np
import pandas as pd
import importlib
import untility_functions as uf
%matplotlib inline

In [3]:
# Re-execute the helper module so that edits made to untility_functions.py
# take effect without restarting the kernel.
importlib.reload(uf)

<module 'untility_functions' from '/Users/moshesilverstein/Documents/Harmonizome/CSMC/Mutations/untility_functions.py'>

# Load Data

In [40]:
# Load the COSMIC cell-line mutation export (tab-separated; pandas infers the
# gzip compression from the .gz suffix).
df = pd.read_csv('Input/CosmicCLP_MutantExport.tsv.gz', sep='\t')

In [41]:
df.head()

Unnamed: 0,Gene name,Accession Number,Gene CDS length,HGNC ID,Sample name,ID_sample,ID_tumour,Primary site,Site subtype 1,Site subtype 2,...,Mutation somatic status,Mutation verification status,Pubmed_PMID,ID_STUDY,Institute,Institute Address,Catalogue Number,Sample source,Tumour origin,Age
0,KRAS,ENST00000311936,567,6407.0,PL-21,1330991,1241467,haematopoietic_and_lymphoid_tissue,NS,NS,...,Reported in another cancer sample as somatic,Verified,,619,German Collection of Microorganisms and Cell C...,"Braunschweig, Germany",ACC 536,cell-line,NS,
1,P2RY2,ENST00000393596,1134,8541.0,A375,906793,824317,skin,NS,NS,...,Reported in another cancer sample as somatic,Unverified,,619,American Type Culture Collection (ATCC),"P.O. Box 1549, Manassas, VA 20108, USA",CRL-1619,cell-line,primary,54.0
2,SALL4,ENST00000217086,3162,15924.0,MCC26,1298234,1209288,skin,NS,NS,...,Variant of unknown origin,Unverified,,619,UNKNOWN,UNKNOWN,,cell-line,NS,
3,SLC35F2,ENST00000525815,1125,23615.0,LS-411N,907794,825306,large_intestine,caecum,NS,...,Variant of unknown origin,Unverified,,619,American Type Culture Collection (ATCC),"P.O. Box 1549, Manassas, VA 20108, USA",CRL-2159,cell-line,primary,32.0
4,COL14A1,ENST00000297848,5391,2191.0,RH-1,971773,887870,soft_tissue,striated_muscle,NS,...,Reported in another cancer sample as somatic,Unverified,,619,St Jude Children's Research Hospital,"332 North Lauderdale St., Memphis, TN 38105-27...",,cell-line,metastasis,


In [44]:
df.shape

(1085496, 38)

# Get Relevant Data

In [45]:
# Keep only the gene and sample (cell line) columns; the other export columns
# are not used in the rest of this notebook.
df = df[['Gene name', 'Sample name']]

In [46]:
df.head()

Unnamed: 0,Gene name,Sample name
0,KRAS,PL-21
1,P2RY2,A375
2,SALL4,MCC26
3,SLC35F2,LS-411N
4,COL14A1,RH-1


# Drop Duplicates

In [47]:
# Collapse repeated (gene, sample) pairs so that several mutation records for the
# same gene in the same cell line count as a single association.
# Reassign instead of using inplace=True: df was produced by a column selection,
# and in-place edits on such a frame can raise pandas' SettingWithCopyWarning.
df = df.drop_duplicates()

In [48]:
df.shape

(923349, 2)

# Create Binary Matrix

In [49]:
# Turn the de-duplicated (gene, sample) pairs into a matrix via the project helper.
# The preview in the next cell shows genes as rows, cell lines as columns and
# 0/1 entries.
matrix = uf.createBinaryMatix(df)

Progeres: 100%  26815 Out of 26815   

In [50]:
matrix.head()

Unnamed: 0,TE-441-T,LB831-BLC,NCI-H187,CCF-STTG1,HCC2157,MDA-MB-415,KP-N-YS,ufE-296,PL4,D-263MG,...,SW780,VM-CUB-1,NCI-H64,HA7-RCC,SJRH30,SKM-1,MLMA,RCC-FG2,KARPAS-45,PL18
TPSAB1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
SEC16A,0,0,0,0,0,0,1,1,0,0,...,0,0,1,0,0,0,0,0,1,0
NPHP3,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,1,0
RFESD,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
FXYD3_ENST00000435734,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [51]:
matrix.shape

(26815, 1020)

# Get Only Gene Symbol in Index 

In [52]:
# Some row labels carry a suffix after an underscore (e.g. FXYD3_ENST00000435734);
# keep only the text before the first underscore, i.e. the gene symbol.
total = len(matrix.index)
symbols = []

for position, label in enumerate(matrix.index, start=1):

    percent = (position/total)*100

    # The trailing carriage return makes each write overwrite the same status line.
    sys.stdout.write("Progress: %d Out of %d which is %d%%   \r" % (position, total, percent))
    sys.stdout.flush()

    symbols.append(label.partition('_')[0])

matrix.index = symbols

Progress: 26815 Out of 26815 which is 100%   

# Map Gene Symbols To Up-to-date Approved Gene Symbols

In [53]:
# Map row labels to up-to-date approved gene symbols via the project helper.
# The return value is not used and the row count reported in the next cell drops
# (26815 -> 25856), so the helper evidently modifies `matrix` in place and removes
# rows — presumably symbols it cannot map; confirm in untility_functions.
uf.mapgenesymbols(matrix)

Progeres: 100%  26815 Out of 26815   

In [54]:
matrix.shape

(25856, 1020)

# Merge Duplicate Genes By Rows

In [55]:
# Merge rows that share the same gene symbol; the 'mean' argument presumably
# selects averaging of the duplicated rows (confirm in untility_functions).
# NOTE(review): a plain mean over 0/1 rows can yield fractional entries — verify
# that the result is still binary, since it is saved below as a binary matrix.
matrix = uf.merge(matrix, 'row', 'mean')

In [56]:
matrix.shape

(18302, 1020)

# Save Binary Matrix

In [59]:
# Stamp the file name with the current year and month, formatted YYYY_MM.
date_stamp = datetime.date.today().strftime('%Y_%m')
# The file is written with gzip compression, so it gets a .gz suffix. The previous
# .zip suffix mislabelled the content, and readers that infer the format from the
# file name (including pandas.read_csv) would fail to open it.
filename = 'Output/cosmic_celline_mutation_binary_matrix_%s.tsv.gz' % date_stamp
matrix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene Set Library

In [60]:
# Resolve the output directory against the current working directory instead of a
# user-specific absolute path, so the notebook also runs on other machines.
# The trailing separator is kept because the previous value ended with '/'
# (the helper may build file names as path + name — confirm in untility_functions).
path = os.path.join(os.getcwd(), 'Output', '')

In [61]:
# Base name handed to uf.createUpGeneSetLib for the gene set library output.
name = 'cosmic_celline_mutation_gene_set'

In [62]:
# Write the gene set library using `path` and `name`. The progress output counts
# 1020 items, matching the number of matrix columns, so this presumably emits one
# gene set per cell line — confirm in untility_functions.
uf.createUpGeneSetLib(matrix, path, name)

Progeres: 100%  1020 Out of 1020   

# Create Attribute Library

In [63]:
# Resolve the output directory against the current working directory instead of a
# user-specific absolute path, so the notebook also runs on other machines.
# The trailing separator is kept because the previous value ended with '/'
# (the helper may build file names as path + name — confirm in untility_functions).
path = os.path.join(os.getcwd(), 'Output', '')

In [64]:
# Base name handed to uf.createUpAttributeSetLib for the attribute library output.
name = 'cosmic_celline_mutation_attribute_set'

In [65]:
# Write the attribute library using `path` and `name`. The progress output counts
# 18302 items, matching the number of matrix rows, so this presumably emits one
# attribute (cell line) set per gene — confirm in untility_functions.
uf.createUpAttributeSetLib(matrix, path, name)

Progeres: 100%  18302 Out of 18302   

# Create Gene Similarity Matrix

In [67]:
# Similarity between matrix rows (genes) with the 'jaccard' option; the preview in
# the next cell shows a gene-by-gene table with 1.0 on the diagonal.
gene_similarity_matix = uf.createSimilarityMatrix(matrix, 'jaccard')

In [68]:
gene_similarity_matix.head()

index,A1BG,A1CF,A2M,A2ML1,A4GALT,A4GNT,AAAS,AACS,AADAC,AADACL2,...,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,1.0,0.060241,0.061947,0.062016,0.068182,0.018868,0.041096,0.036585,0.040816,0.061538,...,0.076923,0.0,0.037037,0.037037,0.05,0.125,0.051724,0.111111,0.079137,0.041667
A1CF,0.060241,1.0,0.06015,0.067568,0.029851,0.041667,0.053763,0.07,0.042857,0.011236,...,0.045977,0.014493,0.054054,0.012987,0.024096,0.119048,0.050633,0.025,0.117647,0.043478
A2M,0.061947,0.06015,1.0,0.136905,0.030612,0.058824,0.092437,0.069231,0.029412,0.06087,...,0.068966,0.030303,0.028037,0.057692,0.073394,0.086207,0.074766,0.096154,0.108696,0.075
A2ML1,0.062016,0.067568,0.136905,1.0,0.008547,0.059322,0.065217,0.04698,0.02521,0.085271,...,0.084615,0.026316,0.058333,0.058333,0.073171,0.067164,0.056,0.056452,0.139175,0.073529
A4GALT,0.068182,0.029851,0.030612,0.008547,1.0,0.0,0.017857,0.047619,0.0,0.020408,...,0.040816,0.0,0.0,0.0,0.0,0.039216,0.02439,0.051282,0.031496,0.037037


# Save Gene Similarity Matrix 

In [70]:
# Stamp the file name with the current year and month, formatted YYYY_MM.
date_stamp = datetime.date.today().strftime('%Y_%m')
# The file is written with gzip compression, so it gets a .gz suffix. The previous
# .zip suffix mislabelled the content, and readers that infer the format from the
# file name (including pandas.read_csv) would fail to open it.
filename = 'Output/cosmic_celline_mutation_gene_similarity_matix_%s.tsv.gz' % date_stamp
gene_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Attribute Similarity Matrix

In [71]:
# Same similarity helper applied to the transposed matrix, so cell lines are
# compared instead of genes; the preview in the next cell shows a
# cell-line-by-cell-line table.
attribute_similarity_matix = uf.createSimilarityMatrix(matrix.T, 'jaccard')

In [72]:
attribute_similarity_matix.head()

Unnamed: 0,TE-441-T,LB831-BLC,NCI-H187,CCF-STTG1,HCC2157,MDA-MB-415,KP-N-YS,ufE-296,PL4,D-263MG,...,SW780,VM-CUB-1,NCI-H64,HA7-RCC,SJRH30,SKM-1,MLMA,RCC-FG2,KARPAS-45,PL18
TE-441-T,1.0,0.010989,0.031865,0.011647,0.014414,0.021769,0.019659,0.025167,0.012411,0.016791,...,0.020513,0.021277,0.028184,0.004386,0.021337,0.017766,0.013382,0.020385,0.021414,0.026415
LB831-BLC,0.010989,1.0,0.016563,0.008264,0.013761,0.012903,0.010753,0.017685,0.013483,0.01199,...,0.019313,0.016774,0.022406,0.002985,0.006757,0.011905,0.014225,0.011643,0.016145,0.014641
NCI-H187,0.031865,0.016563,1.0,0.015656,0.023736,0.032428,0.028986,0.050955,0.013198,0.013542,...,0.013875,0.030023,0.041912,0.013683,0.030521,0.021559,0.02269,0.025483,0.048493,0.025599
CCF-STTG1,0.011647,0.008264,0.015656,1.0,0.01636,0.022388,0.017143,0.016988,0.015968,0.016949,...,0.007561,0.013189,0.030067,0.0,0.013975,0.017956,0.011827,0.014528,0.016853,0.013495
HCC2157,0.014414,0.013761,0.023736,0.01636,1.0,0.019139,0.010638,0.018385,0.015453,0.011682,...,0.006211,0.011407,0.01626,0.008746,0.015075,0.013177,0.012605,0.011553,0.015452,0.02924


# Save Attribute Similarity Matrix

In [73]:
# Stamp the file name with the current year and month, formatted YYYY_MM.
date_stamp = datetime.date.today().strftime('%Y_%m')
# The file is written with gzip compression, so it gets a .gz suffix. The previous
# .zip suffix mislabelled the content, and readers that infer the format from the
# file name (including pandas.read_csv) would fail to open it.
filename = 'Output/cosmic_celline_mutation_attribute_similarity_matix_%s.tsv.gz' % date_stamp
attribute_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene List

In [74]:
# Build the gene table from the matrix rows. The preview below shows the columns
# GeneSym and GeneID (presumably Entrez gene identifiers — confirm in
# untility_functions).
gene_list = uf.createGeneList(matrix)

Progeres: 100%  18302 Out of 18302   

In [75]:
gene_list.head()

Unnamed: 0,GeneSym,GeneID
0,A1BG,1
1,A1CF,29974
2,A2M,2
3,A2ML1,144568
4,A4GALT,53947


In [76]:
gene_list.shape

(18302, 2)

# Save Gene List

In [77]:
# Stamp the file name with the current year and month, formatted YYYY_MM.
date_stamp = datetime.date.today().strftime('%Y_%m')
# The file is written with gzip compression, so it gets a .gz suffix. The previous
# .zip suffix mislabelled the content, and readers that infer the format from the
# file name (including pandas.read_csv) would fail to open it.
filename = 'Output/cosmic_celline_mutation_gene_list_%s.tsv.gz' % date_stamp
# index=False: the positional row index carries no information for this table.
gene_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Attribute List 

In [78]:
# Build the attribute table from the matrix; the preview below shows a single
# 'Attributes' column holding the cell line names.
attribute_list = uf.createAttributeList(matrix)

In [79]:
attribute_list.head()

Unnamed: 0,Attributes
0,TE-441-T
1,LB831-BLC
2,NCI-H187
3,CCF-STTG1
4,HCC2157


In [80]:
attribute_list.shape

(1020, 1)

# Save Attribute List

In [81]:
# Stamp the file name with the current year and month, formatted YYYY_MM.
date_stamp = datetime.date.today().strftime('%Y_%m')
# The file is written with gzip compression, so it gets a .gz suffix. The previous
# .zip suffix mislabelled the content, and readers that infer the format from the
# file name (including pandas.read_csv) would fail to open it.
filename = 'Output/cosmic_celline_mutation_attribute_list_%s.tsv.gz' % date_stamp
# index=False: the positional row index carries no information for this table.
attribute_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Gene-Attribute Edge List

In [85]:
# Resolve the output directory against the current working directory instead of a
# user-specific absolute path, so the notebook also runs on other machines.
# The trailing separator is kept because the previous value ended with '/'
# (the helper may build file names as path + name — confirm in untility_functions).
path = os.path.join(os.getcwd(), 'Output', '')

In [86]:
# Base name handed to uf.createGeneAttributeEdgeList for the edge list output.
name = 'cosmic_celline_mutation_gene_attribute_edge_list'

In [87]:
# Write the gene-attribute edge list using `path` and `name`; gene_list is passed
# alongside the matrix (presumably to attach gene IDs — confirm in
# untility_functions). The helper prints the number of associations it found.
uf.createGeneAttributeEdgeList(matrix, gene_list, path, name)

Progeres: 100%  1020 Out of 1020   

 The number of statisticaly relevent gene-attribute associations is: 643824
