In [3]:
import pandas as pd
from os import listdir
from statistics import mean
import numpy as np

In [4]:
listdir('data')

['.DS_Store',
 '.remove_duplicates.py.swo',
 'BLCA-US',
 'BRCA-US',
 'COAD-US',
 'full_cancer_data',
 'GBM-US',
 'KIRC-US',
 'LGG-US',
 'list_data_dirs.py',
 'LUSC-US',
 'organize_data.py',
 'OV-US',
 'PRAD-US',
 'SKCM-US',
 'THCA-US',
 'UCEC-US']

In [None]:
def mutation_counts(df):
    """Count how often each mutation id occurs in one mutation table.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``project_code``, ``icgc_donor_id`` and
        ``icgc_mutation_id``. The project code and donor id of the FIRST row
        are applied to every output row (assumes one donor per table --
        TODO confirm against the data files).

    Returns
    -------
    pandas.DataFrame
        Columns ``CancerType``, ``DonorId``, ``MutationId``, ``Count``;
        one row per unique mutation id, in sorted id order.

    Raises
    ------
    IndexError
        If ``df`` has no rows.
    """
    # Positional access: a label lookup (`[0]`) fails whenever the index
    # does not contain the label 0, e.g. on a filtered frame.
    cancer_type = df['project_code'].iloc[0]
    donor_id = df['icgc_donor_id'].iloc[0] + cancer_type

    unique_ids, counts = np.unique(df['icgc_mutation_id'].to_numpy(),
                                   return_counts=True)
    return pd.DataFrame({"CancerType": cancer_type,
                         "DonorId": donor_id,
                         "MutationId": unique_ids,
                         "Count": counts})

In [None]:
def get_mutation_frames(cancer_code):
    """Return the per-donor mutation counts for every file of one cancer type.

    Reads each tab-separated file in ``data/<cancer_code>/``, passes it to
    ``mutation_counts`` and stacks the results.

    Parameters
    ----------
    cancer_code : str
        Name of the sub-directory of ``data/`` to read.

    Returns
    -------
    pandas.DataFrame or None
        The concatenated count frames (original row indices are kept), or
        ``None`` when the directory contains no data files.
    """
    DATA_PATH = 'data/'
    PATH = DATA_PATH + cancer_code + '/'

    # Hidden entries such as '.DS_Store' are not data files; skip them.
    frames = [
        mutation_counts(pd.read_csv(PATH + file, sep='\t'))
        for file in listdir(PATH)
        if not file.startswith('.')
    ]

    if not frames:
        return None

    # One concat at the end instead of growing the frame inside the loop.
    return pd.concat(frames)

In [None]:
# Build one long dataframe covering every cancer type:
# collect the per-cancer mutation frames and concatenate them once.

# Project folders are named '<CODE>-US'; match the suffix so that unrelated
# entries that merely contain 'US' are not picked up.
folders = [file for file in listdir('data') if file.endswith('-US')]

all_cancer_df = pd.concat(
    [get_mutation_frames(cancer) for cancer in folders],
    ignore_index=True,
)

In [None]:
all_cancer_df.head()

In [None]:
# Number of distinct mutation ids across all donors.
# This cell used to read `mutation_gene_df`, which is only created much
# further down the notebook, so it failed on a fresh top-to-bottom run.
len(np.unique(all_cancer_df['MutationId']))

In [None]:
unique_mutations, counts = np.unique(np.array(all_cancer_df.MutationId), return_counts=True)

In [None]:
mutation_df = pd.DataFrame( {'MutationID': unique_mutations, 
                             'Counts': counts} )

In [None]:
mutation_df_filtered = mutation_df[mutation_df.Counts > 4]

In [None]:
# Re-bind instead of sorting in place: `mutation_df_filtered` is a filtered
# slice of `mutation_df`, and mutating a slice in place triggers
# SettingWithCopyWarning.
mutation_df_filtered = mutation_df_filtered.sort_values(by=['Counts'])
mutation_df_filtered.head()

In [None]:
mutation_ids = mutation_df_filtered.MutationID

In [None]:
master_mutations = np.array(mutation_ids)

In [None]:
# Column layout of the binary encoding: two identifier columns followed by
# one column per retained mutation id.
columns = ['DonorIDs', 'CancerType'] + list(master_mutations)
df = pd.DataFrame(columns=columns)

In [None]:
def helper(df):
    """Extract the identifiers and the distinct mutation ids of one table.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``project_code``, ``icgc_donor_id`` and
        ``icgc_mutation_id``. Only the first row's project code and donor id
        are used (assumes one donor per table -- TODO confirm).

    Returns
    -------
    tuple
        ``(cancer_type, donor_id, unique_ids)`` where ``donor_id`` is the
        donor id with the cancer type appended and ``unique_ids`` is a sorted
        NumPy array of the distinct mutation ids.

    Raises
    ------
    IndexError
        If ``df`` has no rows.
    """
    # Positional access so the lookup does not depend on index label 0.
    cancer_type = df['project_code'].iloc[0]
    donor_id = df['icgc_donor_id'].iloc[0] + cancer_type
    unique_ids = np.unique(df['icgc_mutation_id'].to_numpy())
    return cancer_type, donor_id, unique_ids

In [None]:
def katrinas_function(df, cancer_code, master_mutations):
    """Append one binary mutation-presence row per file of a cancer type.

    For every file in ``data/<cancer_code>/`` a row is built whose first two
    entries are the donor id and cancer type (as returned by ``helper``) and
    whose remaining entries are 1.0 / 0.0 flags telling whether each id in
    ``master_mutations`` occurs in that file.

    Parameters
    ----------
    df : pandas.DataFrame or None
        Existing encoding rows to extend; the new rows are appended after it.
    cancer_code : str
        Name of the sub-directory of ``data/`` to read.
    master_mutations : array-like of str
        Mutation ids that define the encoding columns, in column order.

    Returns
    -------
    pandas.DataFrame or None
        ``df`` followed by the new rows, re-indexed from 0. When the
        directory holds no data files, ``df`` is returned unchanged.
    """
    DATA_PATH = 'data/'
    PATH = DATA_PATH + cancer_code + '/'

    master = np.asarray(master_mutations)
    columns = ['DonorIDs', 'CancerType'] + list(master)

    rows = []
    for file in listdir(PATH):
        # Hidden entries such as '.DS_Store' are not data files.
        if file.startswith('.'):
            continue

        donor_df = pd.read_csv(PATH + file, sep='\t')
        cancer_type, donor_id, unique_ids = helper(donor_df)

        # Vectorised membership test. The previous per-id np.where loop
        # guarded on len(<tuple>) > 0, which is always true.
        binary = np.isin(master, unique_ids).astype(float)
        rows.append([donor_id, cancer_type] + list(binary))

    if not rows:
        return df

    # Build all new rows at once; concatenating inside the loop copied the
    # whole (very wide) frame for every file.
    new_rows = pd.DataFrame(rows, columns=columns)
    return pd.concat([df, new_rows], ignore_index=True)



In [None]:
folders = [file for file in listdir('data') if 'US' in file]

for cancer in folders:
    print(cancer)

In [None]:
# One binary-encoding frame per cancer-type folder ('<CODE>-US').
folders = [file for file in listdir('data') if file.endswith('-US')]
encodings_array = []

for cancer in folders:
    print(cancer)
    encoding = katrinas_function(pd.DataFrame(columns=columns),
                                 cancer,
                                 master_mutations)
    encoding = encoding.reset_index(drop=True)
    encodings_array.append(encoding)

In [None]:
len(encodings_array)

In [None]:
# Stack every per-cancer encoding; no hard-coded indices, so this keeps
# working when the number of cancer types changes.
binary_encodings = pd.concat(encodings_array)

In [None]:
binary_encodings.reset_index(drop=True, inplace=True)
print(binary_encodings.shape)
binary_encodings.head()

In [None]:
uniq_binary_enc = binary_encodings.drop_duplicates()

In [None]:
print(uniq_binary_enc.shape)
uniq_binary_enc.head()

In [None]:
uniq_binary_enc.to_csv('binary_encodings.csv')

In [None]:
unique_donors = np.unique(np.array(all_cancer_df.DonorId))

In [None]:
len(unique_donors)

In [None]:
# NOTE(review): this indexes with the raw `Count` values rather than a boolean
# mask, so pandas will interpret them as column labels. A comparison such as
# `all_cancer_df.Count > N` appears to be missing -- confirm the intended
# threshold before relying on `all_cancer_df_filtered`.
all_cancer_df_filtered = all_cancer_df[all_cancer_df.Count]

In [None]:
def look(cancer_code):
    """Print summary statistics for the files of one cancer type.

    Reads every tab-separated file in ``data/<cancer_code>/`` and prints the
    average number of columns ("features") per file, the average number of
    distinct mutation ids per file, the summed number of distinct donor ids,
    and the set of chromosome labels seen.

    Parameters
    ----------
    cancer_code : str
        Name of the sub-directory of ``data/`` to inspect.

    Returns
    -------
    None
        Everything is reported via ``print``.
    """
    DATA_PATH = 'data/'
    PATH = DATA_PATH + cancer_code + '/'

    feat = []
    ids = []
    donors = []
    chromosomes = set()

    for file in listdir(PATH):
        # Hidden entries such as '.DS_Store' are not data files.
        if file.startswith('.'):
            continue

        df = pd.read_csv(PATH + file, sep='\t')
        feat.append(len(df.columns))
        ids.append(len(set(df['icgc_mutation_id'])))
        donors.append(len(set(df['icgc_donor_id'])))

        # Union the chromosome labels directly into a set instead of
        # accumulating every row in a list first.
        chromosomes.update(df['chromosome'].apply(str))

    print('*' * 40)
    print('Project code: ', cancer_code)

    # mean() raises on empty input; report that case explicitly. The old
    # bare `except` printed the undefined name NULL and raised NameError.
    if not feat:
        print('No data files found in {}'.format(PATH))
        return

    print('Average number of features: {}'.format(mean(feat)))
    print('Average number of mutations: {}'.format(mean(ids)))
    print('Number of donors: {}'.format(sum(donors)))
    print('Chromosomes: {}'.format(chromosomes))

In [None]:
folders = [file for file in listdir('data') if 'US' in file]
for cancer in folders:
    look(cancer)

### Reading in each file
Each file represents one individual. We could probably use the project_code feature as our target/output.

In [None]:
df = pd.read_csv('data/BRCA-US/simple_somatic_mutation.open-2020-03-02T154602.752.tsv', sep='\t')

In [None]:
print('Number of features: {}'.format(len(df.columns)))
print(df.columns)

Notice how for one donor there are duplicate icgc_mutation_id values. Not sure what this means.

The number of mutations and the number of genes affected do not match :(

In [None]:
genes, counts = np.unique(df.gene_affected.to_numpy(), return_counts=True)
genes.shape

In [None]:
# Mutation ID MU23549
df[df['icgc_mutation_id'] == 'MU23549']

For the files I looked at (which was only a few so far) there are 42 features.

# Some EDA

In [None]:
df.columns

# Choices of features
- Chromosome start and end may not be helpful though there could exist a relationship between which chromosome and the position of mutation. 
- Do not know what CDS mutation or AA mutation is. Many NaNs here.
- Do not know how the gene_affected feature is encoded.
- Project_code can be the target output.
- Possibly replace the start and end features with the length of the mutation. A deletion could then be represented by a negative length.

In [None]:
# Choice of features here is due to quick look at the columns. 
# Change as you see fit.

features = ['icgc_mutation_id', 
            'icgc_donor_id',
            'icgc_sample_id', 
            'matched_icgc_sample_id', 
            'submitted_sample_id',
            'submitted_matched_sample_id',
            'chromosome', 
            'chromosome_start',
            'chromosome_end',
            'chromosome_strand',
            'reference_genome_allele',
            'mutated_from_allele',
            'mutated_to_allele',
            'consequence_type',
            'gene_affected',
            'total_read_count',
            'project_code']

np.unique(df[features].chromosome_strand)
df[features][['chromosome', 
              'chromosome_start', 
              'reference_genome_allele', 
              'mutated_from_allele', 
              'mutated_to_allele']]

# why would donor have two submitted samples or more?


# Mutation IDs
- There are many duplicates. There are only 213 unique mutation IDs for this donor.

In [None]:
print('There are {} unique mutation IDs'.format(len(set(df[features]['icgc_mutation_id']))))

In [None]:
print('There is only {} donor in this file'.format(len(set(df[features]['icgc_donor_id']))))

In [None]:
data = df[features]

In [None]:
data.head()

## Not all mutations are 1bp
Some mutations involve longer sequences!

In [None]:
df[df['mutation_type'] == 'deletion of <=200bp'][['mutation_type', 'reference_genome_allele', 'mutated_from_allele', 'mutated_to_allele']].head(5)

In [None]:
set(df['consequence_type'])

# Encoding Take 2
### Use gene affected instead of mutation ID

In [29]:
def mutation_gene_aff(df):
    """Return the distinct (donor, mutation, gene) rows of one mutation table.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``project_code``, ``icgc_donor_id``,
        ``icgc_mutation_id`` and ``gene_affected``. The project code of the
        FIRST row is used for every output row (assumes one project per
        table -- TODO confirm).

    Returns
    -------
    pandas.DataFrame
        Columns ``CancerType``, ``DonorId`` (donor id with the cancer type
        appended), ``MutationId`` and ``GeneAffected``. Rows keep the index
        labels of their first occurrence in ``df``.

    Raises
    ------
    IndexError
        If ``df`` has no rows.
    """
    # Positional access so the lookup does not depend on index label 0.
    cancer_type = df['project_code'].iloc[0]

    unique_rows = df[['icgc_donor_id',
                      'icgc_mutation_id',
                      'gene_affected']].drop_duplicates()

    return pd.DataFrame({"CancerType": cancer_type,
                         "DonorId": unique_rows['icgc_donor_id'] + cancer_type,
                         "MutationId": unique_rows['icgc_mutation_id'],
                         "GeneAffected": unique_rows['gene_affected']})

In [34]:
def get_mutation_gene_full():
    """Load every file in ``data/full_cancer_data/`` into one long frame.

    Each tab-separated file is reduced with ``mutation_gene_aff`` and the
    results are stacked.

    Returns
    -------
    pandas.DataFrame or None
        The concatenated frames (row indices are not reset), or ``None``
        when the directory contains no data files.
    """
    PATH = 'data/full_cancer_data/'

    # Hidden entries such as '.DS_Store' are not data files; skip them.
    frames = [
        mutation_gene_aff(pd.read_csv(PATH + file, sep='\t'))
        for file in listdir(PATH)
        if not file.startswith('.')
    ]

    if not frames:
        return None

    # One concat at the end instead of re-copying the growing frame per file.
    return pd.concat(frames)

In [35]:
test_df = get_mutation_gene_full()
print(test_df.shape)
print('unique donors:')
print(len(np.unique(test_df.DonorId)))
test_df.head()

(3653094, 4)
unique donors:
5987


Unnamed: 0,CancerType,DonorId,MutationId,GeneAffected
0,BLCA-US,DO51951BLCA-US,MU129042621,ENSG00000075340
12,BLCA-US,DO51951BLCA-US,MU131874999,ENSG00000247077
16,BLCA-US,DO51951BLCA-US,MU131874999,ENSG00000256632
17,BLCA-US,DO51951BLCA-US,MU131874999,ENSG00000176894
19,BLCA-US,DO51951BLCA-US,MU28596490,ENSG00000083093


In [41]:
all_cancer_gene_df = test_df

In [37]:
# Placeholders for the twelve per-cancer frames.
# NOTE: `cancer_names` stores the *values* (None) of these variables, not
# references to the names. Assigning to `cancer_names[i]` later only fills the
# list slot; BLCA_df, BRCA_df, ... themselves stay None.
BLCA_df = None
BRCA_df = None
COAD_df = None
GBM_df = None
KIRC_df = None
LGG_df = None
LUSC_df = None
OV_df = None
PRAD_df = None
SKCM_df = None
THCA_df = None
UCEC_df = None
cancer_names = [BLCA_df, BRCA_df, COAD_df, GBM_df, KIRC_df, LGG_df, LUSC_df, OV_df, PRAD_df, SKCM_df, THCA_df, UCEC_df]

In [36]:
def get_mutation_gene_ind(cancer):
    """Load a single file from ``data/full_cancer_data/``.

    ``cancer`` is the file name inside that directory; the file is read as
    tab-separated text and reduced with ``mutation_gene_aff``.
    """
    source_file = 'data/full_cancer_data/' + cancer
    return mutation_gene_aff(pd.read_csv(source_file, sep='\t'))

#### Stuff for Kaitlyn's viz

In [38]:
PATH = 'data/full_cancer_data/'

# Sort the names so the i-th frame lines up with the alphabetically ordered
# `file_names` list used when writing the CSVs; listdir order is arbitrary.
# Hidden entries such as '.DS_Store' are skipped.
all_files = sorted(file for file in listdir(PATH) if not file.startswith('.'))

# Rebuild the list from scratch so re-running the cell does not append the
# same data a second time.
cancer_names = []
for cancer in all_files:
    print(cancer)
    cancer_names.append(get_mutation_gene_ind(cancer).reset_index(drop=True))

BLCA_ssm.tsv
BRCA_ssm.tsv
COAD_ssm.tsv
GBM_ssm.tsv
KIRC_ssm.tsv
LGG_ssm.tsv
LUSC_ssm.tsv
OV_ssm.tsv
PRAD_ssm.tsv
SKCM_ssm.tsv
THCA_ssm.tsv
UCEC_ssm.tsv


In [39]:
file_names = ["BLCA_df.csv", "BRCA_df.csv", "COAD_df.csv", "GBM_df.csv", "KIRC_df.csv", "LGG_df.csv", "LUSC_df.csv", "OV_df.csv", "PRAD_df.csv", "SKCM_df.csv", "THCA_df.csv", "UCEC_df.csv"]

In [40]:
# Write one de-duplicated CSV per cancer type.
# Pair frames with file names directly instead of keeping a manual counter,
# and use a dedicated loop variable so the notebook-level `df` is not
# overwritten. The unused `mutation_gene_df` assignment was dropped.
for cancer_df, file_name in zip(cancer_names, file_names):
    print_for_Kaitlyn = cancer_df.reset_index(drop=True).drop_duplicates()
    print(print_for_Kaitlyn.shape)
    print_for_Kaitlyn.to_csv(file_name)

(232272, 4)
(185661, 4)
(382412, 4)
(109262, 4)
(42843, 4)
(61621, 4)
(288951, 4)
(81417, 4)
(48050, 4)
(816531, 4)
(18031, 4)
(1386043, 4)


#### Create Master Gene List

In [42]:
# Chain the reset instead of mutating the column slice in place, which
# triggers SettingWithCopyWarning.
mutation_gene_df = (
    all_cancer_gene_df[['MutationId', 'GeneAffected']]
    .reset_index(drop=True)
)
mutation_gene_df.head(10)

Unnamed: 0,MutationId,GeneAffected
0,MU129042621,ENSG00000075340
1,MU131874999,ENSG00000247077
2,MU131874999,ENSG00000256632
3,MU131874999,ENSG00000176894
4,MU28596490,ENSG00000083093
5,MU11253975,ENSG00000100987
6,MU132005596,ENSG00000205038
7,MU131862503,ENSG00000196693
8,MU131862503,ENSG00000272373
9,MU132085217,ENSG00000198670


In [46]:
# Cast to str so missing gene ids become the literal string 'nan' and can be
# sorted together with the real ids.
genes = mutation_gene_df['GeneAffected'].to_numpy().astype('str')

# Compute the unique genes once and reuse the result for the count below.
uniq_genes, counts = np.unique(genes, return_counts=True)
print(len(genes))
print(len(uniq_genes))

uniq_gene_df = pd.DataFrame({'gene': uniq_genes,
                             'Count': counts})
print(uniq_gene_df.shape)
uniq_gene_df.head()

3653094
34442
(34442, 2)


Unnamed: 0,gene,Count
0,ENSG00000000003,71
1,ENSG00000000005,85
2,ENSG00000000419,156
3,ENSG00000000457,166
4,ENSG00000000460,556


In [47]:
uniq_gene_df.tail(10)

Unnamed: 0,gene,Count
34432,ENSG00000273474,5
34433,ENSG00000273476,46
34434,ENSG00000273477,13
34435,ENSG00000273478,33
34436,ENSG00000273481,6
34437,ENSG00000273483,92
34438,ENSG00000273485,106
34439,ENSG00000273488,13
34440,ENSG00000273489,23
34441,,576


In [49]:
uniq_gene_df_filtered = uniq_gene_df[uniq_gene_df.Count > 9]
uniq_gene_df_filtered.shape

(30399, 2)

In [52]:
master_genes = np.array(uniq_gene_df_filtered.gene)
# Drop the 'nan' placeholder that stands for missing gene ids. Removing it by
# value rather than by a hard-coded position (30398) keeps this correct when
# the data or the count threshold changes.
master_genes = master_genes[master_genes != 'nan']
master_genes

array(['ENSG00000000003', 'ENSG00000000005', 'ENSG00000000419', ...,
       'ENSG00000273485', 'ENSG00000273488', 'ENSG00000273489'],
      dtype=object)

In [54]:
gene_df = pd.DataFrame({'GeneAffected': master_genes})
gene_df.to_csv('all_genes_affected.csv')

### Create Encoding

In [55]:
def gene_helper(df):
    """Count, per gene, the distinct mutations affecting it in one donor frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows for a single donor with the columns ``CancerType``, ``DonorId``,
        ``MutationId`` and ``GeneAffected``. May be a filtered slice with an
        arbitrary index.

    Returns
    -------
    tuple
        ``(cancer_type, donor_id, unique_genes, counts)``: the first row's
        cancer type and donor id, the sorted distinct gene ids as strings
        (missing genes appear as ``'nan'``), and for each gene the number of
        distinct ``(MutationId, GeneAffected)`` pairs.

    Raises
    ------
    IndexError
        If ``df`` has no rows.
    """
    # Positional access: `df` is normally a filtered slice, so index label 0
    # is usually absent.
    cancer_type = df['CancerType'].iloc[0]
    donor_id = df['DonorId'].iloc[0]

    # Non-mutating drop_duplicates avoids SettingWithCopyWarning on the slice.
    pairs = df[['MutationId', 'GeneAffected']].drop_duplicates()

    # The column is named 'GeneAffected' here; 'gene_affected' only exists in
    # the raw input files.
    genes_only = pairs['GeneAffected'].to_numpy().astype('str')
    unique_genes, counts = np.unique(genes_only, return_counts=True)
    return cancer_type, donor_id, unique_genes, counts

In [56]:
def katrinas_gene_function(df, master_genes, unique_ids):
    """Build a per-donor gene-frequency encoding.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format frame with the columns ``CancerType``, ``DonorId``,
        ``MutationId`` and ``GeneAffected``.
    master_genes : array-like of str
        Gene ids that define the encoding columns, in column order.
    unique_ids : iterable
        Donor ids (values of ``df.DonorId``) to encode, in output row order.

    Returns
    -------
    pandas.DataFrame or None
        One row per encoded donor with the columns ``DonorIDs``,
        ``CancerType`` and one float column per gene in ``master_genes``
        holding the count returned by ``gene_helper`` (0.0 when the donor has
        no mutation in that gene). Donors with no rows in ``df`` are skipped.
        ``None`` is returned when no donor was encoded.
    """
    gene_columns = list(master_genes)

    # Map every gene to its column position(s) once, replacing a full
    # np.where scan of the master list for each gene of each donor.
    gene_positions = {}
    for position, gene in enumerate(gene_columns):
        gene_positions.setdefault(gene, []).append(position)

    donor_ids = []
    cancer_types = []
    freq_rows = []

    for donor in unique_ids:
        temp_df = df[df['DonorId'] == donor]
        if temp_df.empty:
            # Nothing to encode for this donor.
            continue

        cancer_type, donor_id, unique_genes, counts = gene_helper(temp_df)

        freq = np.zeros(len(gene_columns))
        for gene, count in zip(unique_genes, counts):
            # Genes outside the master list are ignored.
            for position in gene_positions.get(gene, ()):
                freq[position] = count

        donor_ids.append(donor_id)
        cancer_types.append(cancer_type)
        freq_rows.append(freq)

        if len(donor_ids) % 50 == 0:
            print(str(len(donor_ids)) + ' donors processed')

    if not donor_ids:
        return None

    # Assemble the frame once; concatenating a one-row frame per donor copied
    # the whole (very wide) frame on every iteration.
    id_part = pd.DataFrame({'DonorIDs': donor_ids, 'CancerType': cancer_types})
    gene_part = pd.DataFrame(np.vstack(freq_rows), columns=gene_columns)
    return pd.concat([id_part, gene_part], axis=1)



In [None]:
columns = ['DonorIDs', 'CancerType'] + list(master_genes)
df2 = pd.DataFrame(columns = columns)
df2

In [None]:
# Smoke test on a single cancer type.
# katrinas_gene_function takes (long-format frame, master gene list, donor
# ids); the previous call passed arguments in the order of the older
# file-based encoder (frame, cancer code, master list).
brca_gene_df = all_cancer_gene_df[all_cancer_gene_df.CancerType == 'BRCA-US']
encoding = katrinas_gene_function(brca_gene_df,
                                  master_genes,
                                  np.unique(brca_gene_df.DonorId))
print(encoding.shape)
encoding.head()

In [None]:
# One gene-frequency encoding per cancer type.
# katrinas_gene_function takes (long-format frame, master gene list, donor
# ids), so each cancer type is selected from `all_cancer_gene_df` instead of
# passing a folder name.
encodings_array = []

for cancer in np.unique(all_cancer_gene_df.CancerType):
    print(cancer)
    cancer_gene_df = all_cancer_gene_df[all_cancer_gene_df.CancerType == cancer]
    encoding = katrinas_gene_function(cancer_gene_df,
                                      master_genes,
                                      np.unique(cancer_gene_df.DonorId))
    encodings_array.append(encoding)

In [None]:
len(encodings_array)

In [None]:
# Stack every per-cancer encoding; no hard-coded indices, so this keeps
# working when the number of cancer types changes.
freq_encodings = pd.concat(encodings_array)

In [None]:
freq_encodings.reset_index(drop=True, inplace=True)
print(freq_encodings.shape)
freq_encodings.head()

In [None]:
np.unique(freq_encodings.CancerType)

In [None]:
uniq_freq_enc = freq_encodings.drop_duplicates()

In [None]:
print(uniq_freq_enc.shape)
uniq_freq_enc.head()

In [None]:
uniq_freq_enc.to_csv('Gene_freq_encoding.csv')

In [None]:
# Total gene hits per donor. Only the gene-count columns are summed; the two
# identifier columns hold strings and must not take part in the row sum.
this_sum = uniq_freq_enc.drop(columns=['DonorIDs', 'CancerType']).sum(axis=1)
this_sum

In [None]:
sum(this_sum)/len(this_sum)