In [1]:
import pandas as pd
from os import listdir
from statistics import mean
import numpy as np

In [2]:
listdir('data')

['.DS_Store',
 '.remove_duplicates.py.swo',
 'BLCA-US',
 'BRCA-US',
 'COAD-US',
 'GBM-US',
 'KIRC-US',
 'LGG-US',
 'list_data_dirs.py',
 'LUSC-US',
 'organize_data.py',
 'OV-US',
 'PRAD-US',
 'SKCM-US',
 'THCA-US',
 'UCEC-US']

In [3]:
# Gets unique mutation ids with number of occurrences
# Returns new dataframe with cancer type, donor id, mutation ids and corresponding counts
def mutation_counts(df):
    """Count how often each mutation id occurs in one donor table.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with at least the columns ``project_code``, ``icgc_donor_id``
        and ``icgc_mutation_id`` and at least one row.

    Returns
    -------
    pandas.DataFrame
        One row per distinct mutation id with the columns ``CancerType``,
        ``DonorId`` (donor id with the cancer type appended), ``MutationId``
        and ``Count``.
    """
    # Positional access: the first row is wanted no matter how the frame is
    # indexed. A label lookup (``[0]``) fails when the index has no label 0.
    cancer_type = df['project_code'].iloc[0]
    donor_id = df['icgc_donor_id'].iloc[0] + cancer_type

    unique_ids, counts = np.unique(df['icgc_mutation_id'].to_numpy(),
                                   return_counts=True)
    return pd.DataFrame({"CancerType": cancer_type,
                         "DonorId": donor_id,
                         "MutationId": unique_ids,
                         "Count": counts})

In [4]:
# Returns the mutation counts of every donor file for one cancer type
def get_mutation_frames(cancer_code):
    """Build the mutation-count table for every file of one cancer project.

    Each file in ``data/<cancer_code>/`` is read as a tab-separated table and
    passed to ``mutation_counts``; the per-file results are stacked.

    Parameters
    ----------
    cancer_code : str
        Name of the sub-folder of ``data/`` to read.

    Returns
    -------
    pandas.DataFrame or None
        The concatenated per-file tables (row labels are not reset), or
        ``None`` when the folder contains no files.
    """
    DATA_PATH = 'data/'
    PATH = DATA_PATH + cancer_code + '/'

    # Collect first, concatenate once: calling pd.concat inside the loop
    # re-copies all previously accumulated rows on every iteration.
    frames = [mutation_counts(pd.read_csv(PATH + file, sep='\t'))
              for file in listdir(PATH)]

    if not frames:
        return None
    return pd.concat(frames)

In [5]:
# Go through all cancer types and get mutation dataframes
# Concatenate all dataframes
# This results in one dataframe for all cancer types

folders = [file for file in listdir('data') if 'US' in file]

# Gather one frame per cancer type and concatenate a single time;
# ignore_index=True yields the same fresh 0..n-1 index that a separate
# reset_index(drop=True) would, without mutating the frame in place.
cancer_frames = []
for cancer in folders:
    cancer_frames.append(get_mutation_frames(cancer))

all_cancer_df = pd.concat(cancer_frames, ignore_index=True)

  if (yield from self.run_code(code, result)):


In [6]:
all_cancer_df.head()

Unnamed: 0,CancerType,DonorId,MutationId,Count
0,BLCA-US,DO48566BLCA-US,MU100073,7
1,BLCA-US,DO48566BLCA-US,MU101899,12
2,BLCA-US,DO48566BLCA-US,MU102378,12
3,BLCA-US,DO48566BLCA-US,MU102443,9
4,BLCA-US,DO48566BLCA-US,MU103120,3


In [85]:
# NOTE(review): mutation_gene_df is only assigned in cells that appear further
# down in this notebook, so this cell fails on a fresh top-to-bottom run.
len(np.unique(mutation_gene_df['MutationId']))

1710852

In [8]:
unique_mutations, counts = np.unique(np.array(all_cancer_df.MutationId), return_counts=True)

In [9]:
mutation_df = pd.DataFrame( {'MutationID': unique_mutations, 
                             'Counts': counts} )

In [10]:
mutation_df_filtered = mutation_df[mutation_df.Counts > 4]

In [11]:
# sort_values returns a new frame; rebinding the name avoids sorting a
# filtered slice of mutation_df in place, which raises SettingWithCopyWarning.
mutation_df_filtered = mutation_df_filtered.sort_values(by=['Counts'])
mutation_df_filtered.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,MutationID,Counts
1186458,MU1899169,5
1236057,MU1957569,5
1236120,MU1957631,5
1236182,MU1957694,5
1236245,MU1957756,5


In [12]:
mutation_ids = mutation_df_filtered.MutationID

In [13]:
master_mutations = np.array(mutation_ids)

In [14]:
# Column layout of the encoding table: two id columns followed by one column
# per retained mutation id. `columns` is reused by later cells.
columns = ['DonorIDs', 'CancerType'] + list(master_mutations)
# Empty frame with that layout.
df = pd.DataFrame(columns = columns)
# Scratch check: positions in master_mutations equal to the literal 'IS675'.
temp = np.where(master_mutations == 'IS675')
# Displays an all-zero vector with one entry per retained mutation id.
np.zeros((len(master_mutations)))

array([0., 0., 0., ..., 0., 0., 0.])

In [15]:
def helper(df):
    """Extract the cancer type, donor key and distinct mutation ids of a table.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with at least the columns ``project_code``, ``icgc_donor_id``
        and ``icgc_mutation_id`` and at least one row.

    Returns
    -------
    tuple
        ``(cancer_type, donor_id, unique_ids)`` where ``donor_id`` is the
        first donor id with the cancer type appended and ``unique_ids`` is a
        sorted NumPy array of the distinct mutation ids.
    """
    # Positional access so that frames whose index lacks the label 0 work too.
    cancer_type = df['project_code'].iloc[0]
    donor_id = df['icgc_donor_id'].iloc[0] + cancer_type
    unique_ids = np.unique(df['icgc_mutation_id'].to_numpy())
    return cancer_type, donor_id, unique_ids

In [16]:
def katrinas_function(df, cancer_code, master_mutations):
    """Append one binary mutation-presence row per file of a cancer project.

    Every file in ``data/<cancer_code>/`` is read as a tab-separated table and
    summarised by ``helper``. The resulting row holds the donor key, the
    cancer type and, for each entry of ``master_mutations``, ``1.0`` if that
    mutation id occurs in the file and ``0.0`` otherwise.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame the new rows are appended to.
    cancer_code : str
        Name of the sub-folder of ``data/`` to read.
    master_mutations : numpy.ndarray
        Mutation ids that define the encoding columns, in column order.

    Returns
    -------
    pandas.DataFrame
        ``df`` followed by the new rows, with a fresh integer index. When the
        folder contains no files, ``df`` itself is returned.
    """
    DATA_PATH = 'data/'
    PATH = DATA_PATH + cancer_code + '/'

    donor_ids = []
    cancer_types = []
    vectors = []

    for file in listdir(PATH):
        donor_df = pd.read_csv(PATH + file, sep='\t')
        cancer_type, donor_id, unique_ids = helper(donor_df)
        # One vectorised membership test instead of an np.where scan per id.
        vectors.append(np.isin(master_mutations, unique_ids).astype(float))
        donor_ids.append(donor_id)
        cancer_types.append(cancer_type)

    if not vectors:
        return df

    # Build all new rows at once; concatenating inside the loop re-copies the
    # accumulated (very wide) frame for every file.
    encoded = pd.DataFrame(np.vstack(vectors), columns=list(master_mutations))
    encoded.insert(0, 'CancerType', cancer_types)
    encoded.insert(0, 'DonorIDs', donor_ids)

    return pd.concat([df, encoded], ignore_index=True)



In [18]:
folders = [file for file in listdir('data') if 'US' in file]

for cancer in folders:
    print(cancer)

BLCA-US
BRCA-US
COAD-US
GBM-US
KIRC-US
LGG-US
LUSC-US
OV-US
PRAD-US
SKCM-US
THCA-US
UCEC-US


In [24]:
# Build one binary-encoding frame per cancer folder and keep them in a list.
folders = [file for file in listdir('data') if 'US' in file]
encodings_array = []
# NOTE(review): i is assigned but not used anywhere in this cell.
i=0

for cancer in folders:
    print(cancer)
    # `columns` and `master_mutations` come from earlier cells.
    encoding = pd.DataFrame(columns = columns)
    encoding = katrinas_function(encoding, cancer, master_mutations)
    encoding.reset_index(drop=True, inplace=True)
    encodings_array.append(encoding)

BLCA-US
BRCA-US
COAD-US


  if (yield from self.run_code(code, result)):


GBM-US
KIRC-US
LGG-US
LUSC-US
OV-US
PRAD-US
SKCM-US
THCA-US
UCEC-US


In [25]:
len(encodings_array)

12

In [27]:
# Concatenate every per-cancer encoding; passing the list itself avoids
# hard-coding the number of cancer types.
binary_encodings = pd.concat(encodings_array)

In [28]:
binary_encodings.reset_index(drop=True, inplace=True)
print(binary_encodings.shape)
binary_encodings.head()

(4581, 20266)


Unnamed: 0,DonorIDs,CancerType,MU1899169,MU1957569,MU1957631,MU1957694,MU1957756,MU1957895,MU1957974,MU1958009,...,MU130696800,MU122201,MU129795540,MU129540995,MU4885648,MU4468,MU866,MU62030,MU131898417,MU131867962
0,DO48566BLCA-US,BLCA-US,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,DO223588BLCA-US,BLCA-US,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,DO51948BLCA-US,BLCA-US,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,DO514BLCA-US,BLCA-US,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,DO474BLCA-US,BLCA-US,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
uniq_binary_enc = binary_encodings.drop_duplicates()

In [32]:
print(uniq_binary_enc.shape)
uniq_binary_enc.head()

(3364, 20266)


Unnamed: 0,DonorIDs,CancerType,MU1899169,MU1957569,MU1957631,MU1957694,MU1957756,MU1957895,MU1957974,MU1958009,...,MU130696800,MU122201,MU129795540,MU129540995,MU4885648,MU4468,MU866,MU62030,MU131898417,MU131867962
0,DO48566BLCA-US,BLCA-US,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,DO223588BLCA-US,BLCA-US,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,DO51948BLCA-US,BLCA-US,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,DO514BLCA-US,BLCA-US,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,DO474BLCA-US,BLCA-US,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [33]:
uniq_binary_enc.to_csv('binary_encodings.csv')

In [17]:
unique_donors = np.unique(np.array(all_cancer_df.DonorId))

In [18]:
len(unique_donors)

3364

In [19]:
# NOTE(review): this raises the KeyError shown below. Indexing with the Count
# column itself makes pandas look for columns named 7, 12, ...; a boolean
# condition such as `all_cancer_df.Count > <threshold>` was presumably
# intended -- confirm the threshold before fixing.
all_cancer_df_filtered = all_cancer_df[all_cancer_df.Count]

KeyError: "None of [Int64Index([ 7, 12, 12,  9,  3, 16,  4,  6,  6,  5,\n            ...\n             1,  5,  4,  2,  3,  2,  3, 13, 11,  4],\n           dtype='int64', length=2534373)] are in the [columns]"

In [7]:
"""
This function looks at each class and computes the number of donors, 
average number of mutations and features.
"""
def look(cancer_code):
    """Print summary statistics for the files of one cancer project.

    Every file in ``data/<cancer_code>/`` is read as a tab-separated table.
    The function prints the mean column count per file, the mean number of
    distinct ``icgc_mutation_id`` values per file, the summed number of
    distinct ``icgc_donor_id`` values and the set of chromosome labels seen.

    Parameters
    ----------
    cancer_code : str
        Name of the sub-folder of ``data/`` to inspect.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    DATA_PATH = 'data/'
    PATH = DATA_PATH + cancer_code + '/'

    feat = []
    ids = []
    donors = []
    # A set keeps only the distinct labels instead of one entry per row.
    chromosomes = set()

    for file in listdir(PATH):
        df = pd.read_csv(PATH + file, sep='\t')
        feat.append(len(df.columns))
        ids.append(len(set(df['icgc_mutation_id'])))
        donors.append(len(set(df['icgc_donor_id'])))
        chromosomes.update(df['chromosome'].apply(str))

    print('*' * 40)
    print('Project code: ', cancer_code)

    if not feat:
        # statistics.mean raises on an empty list; report the empty folder
        # explicitly instead of hiding the error behind a bare except.
        print('No files found in {}'.format(PATH))
        return

    print('Average number of features: {}'.format(mean(feat)))
    print('Average number of mutations: {}'.format(mean(ids)))
    print('Number of donors: {}'.format(sum(donors)))
    print('Chromosomes: {}'.format(chromosomes))

In [8]:
folders = [file for file in listdir('data') if 'US' in file]
for cancer in folders:
    look(cancer)

****************************************
Project code:  BLCA-US
Average number of features: 42
Average number of mutations: 55.48048780487805
Number of donors: 410
Chromosomes: {'4', '9', '7', '10', 'Y', '8', '16', '11', '18', '17', '5', '14', '1', '2', '22', 'X', '20', '3', '21', '15', '13', '19', '6', '12'}
****************************************
Project code:  BRCA-US
Average number of features: 42
Average number of mutations: 130.88801571709234
Number of donors: 1018
Chromosomes: {'4', '9', '7', '10', 'Y', '8', '16', '11', '18', '17', '5', '14', '1', '2', '22', 'X', '20', '3', '21', '15', '13', '19', '6', '12'}
****************************************
Project code:  COAD-US
Average number of features: 42
Average number of mutations: 1949.2
Number of donors: 280
Chromosomes: {'4', '9', '7', '10', 'Y', '8', '16', '18', '11', '5', '17', '14', '1', '2', '22', 'X', '20', '3', '21', '15', '13', '19', '6', '12'}
****************************************
Project code:  GBM-US
Average numbe

### Reading in each file
Each file represents one individual. We could probably use the project_code feature as our target/output.

In [9]:
df = pd.read_csv('data/BRCA-US/simple_somatic_mutation.open-2020-03-02T154602.752.tsv', sep='\t')

In [10]:
print('Number of features: {}'.format(len(df.columns)))
print(df.columns)

Number of features: 42
Index(['icgc_mutation_id', 'icgc_donor_id', 'project_code', 'icgc_specimen_id',
       'icgc_sample_id', 'matched_icgc_sample_id', 'submitted_sample_id',
       'submitted_matched_sample_id', 'chromosome', 'chromosome_start',
       'chromosome_end', 'chromosome_strand', 'assembly_version',
       'mutation_type', 'reference_genome_allele', 'mutated_from_allele',
       'mutated_to_allele', 'quality_score', 'probability', 'total_read_count',
       'mutant_allele_read_count', 'verification_status',
       'verification_platform', 'biological_validation_status',
       'biological_validation_platform', 'consequence_type', 'aa_mutation',
       'cds_mutation', 'gene_affected', 'transcript_affected',
       'gene_build_version', 'platform', 'experimental_protocol',
       'sequencing_strategy', 'base_calling_algorithm', 'alignment_algorithm',
       'variation_calling_algorithm', 'other_analysis_algorithm',
       'seq_coverage', 'raw_data_repository', 'raw_data_acc

Notice how for one donor there are duplicate icgc_mutation_id values. Not sure what this means.

The number of mutations and the number of genes affected do not match :(

In [None]:
genes, counts = np.unique(df.gene_affected.to_numpy(), return_counts=True)
genes.shape

In [None]:
# Mutation ID MU23549
df[df['icgc_mutation_id'] == 'MU23549']

For the files I looked at (which was only a few so far) there are 42 features.

# Some EDA

In [None]:
df.columns

# Choices of features
- Chromosome start and end may not be helpful though there could exist a relationship between which chromosome and the position of mutation. 
- Do not know what CDS mutation or AA mutation is. Many NaNs here.
- Do not know how the gene_affected feature is encoded.
- Project_code can be the target output.
- Possibly replace the start and end features with the length of the mutation. A deletion could then be represented by a negative number.

In [None]:
# Choice of features here is due to quick look at the columns. 
# Change as you see fit.

features = ['icgc_mutation_id', 
            'icgc_donor_id',
            'icgc_sample_id', 
            'matched_icgc_sample_id', 
            'submitted_sample_id',
            'submitted_matched_sample_id',
            'chromosome', 
            'chromosome_start',
            'chromosome_end',
            'chromosome_strand',
            'reference_genome_allele',
            'mutated_from_allele',
            'mutated_to_allele',
            'consequence_type',
            'gene_affected',
            'total_read_count',
            'project_code']

np.unique(df[features].chromosome_strand)
df[features][['chromosome', 
              'chromosome_start', 
              'reference_genome_allele', 
              'mutated_from_allele', 
              'mutated_to_allele']]

# why would donor have two submitted samples or more?


# Mutation IDs
- There are many duplicates. There are only 213 unique mutation IDs for this donor.

In [None]:
print('There are {} unique mutation IDs'.format(len(set(df[features]['icgc_mutation_id']))))

In [None]:
print('There is only {} donor in this file'.format(len(set(df[features]['icgc_donor_id']))))

In [None]:
data = df[features]

In [None]:
data.head()

## Not all mutations are 1bp
Some mutations involve longer sequences!

In [None]:
df[df['mutation_type'] == 'deletion of <=200bp'][['mutation_type', 'reference_genome_allele', 'mutated_from_allele', 'mutated_to_allele']].head(5)

In [None]:
set(df['consequence_type'])

# Encoding Take 2
### Use gene affected instead of mutation ID

In [5]:
def mutation_gene_aff(df):
    """List the distinct (mutation id, affected gene) pairs of one donor table.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with at least the columns ``project_code``, ``icgc_donor_id``,
        ``icgc_mutation_id`` and ``gene_affected`` and at least one row.

    Returns
    -------
    pandas.DataFrame
        One row per distinct pair with the columns ``CancerType``,
        ``DonorId`` (donor id with the cancer type appended), ``MutationId``
        and ``GeneAffected``. Row labels are those of the first occurrence of
        each pair in ``df``.
    """
    # Positional access so that frames whose index lacks the label 0 work too.
    cancer_type = df['project_code'].iloc[0]
    donor_id = df['icgc_donor_id'].iloc[0] + cancer_type

    unique_pairs = df[['icgc_mutation_id', 'gene_affected']].drop_duplicates()

    return pd.DataFrame({"CancerType": cancer_type,
                         "DonorId": donor_id,
                         "MutationId": unique_pairs['icgc_mutation_id'],
                         "GeneAffected": unique_pairs['gene_affected']})

In [6]:
def get_mutation_gene_frames(cancer_code):
    """Build the mutation/gene table for every file of one cancer project.

    Each file in ``data/<cancer_code>/`` is read as a tab-separated table and
    passed to ``mutation_gene_aff``; the per-file results are stacked.

    Parameters
    ----------
    cancer_code : str
        Name of the sub-folder of ``data/`` to read.

    Returns
    -------
    pandas.DataFrame or None
        The concatenated per-file tables (row labels are not reset), or
        ``None`` when the folder contains no files.
    """
    DATA_PATH = 'data/'
    PATH = DATA_PATH + cancer_code + '/'

    # Collect first, concatenate once: calling pd.concat inside the loop
    # re-copies all previously accumulated rows on every iteration.
    frames = [mutation_gene_aff(pd.read_csv(PATH + file, sep='\t'))
              for file in listdir(PATH)]

    if not frames:
        return None
    return pd.concat(frames)

In [9]:
# One placeholder per cancer type.
BLCA_df = None
BRCA_df = None
COAD_df = None
GBM_df = None
KIRC_df = None
LGG_df = None
LUSC_df = None
OV_df = None
PRAD_df = None
SKCM_df = None
THCA_df = None
UCEC_df = None
# NOTE: this list holds twelve None values, not references to the names above.
# Assigning to cancer_names[i] later replaces the list slot only; BLCA_df and
# the other variables stay None.
cancer_names = [BLCA_df, BRCA_df, COAD_df, GBM_df, KIRC_df, LGG_df, LUSC_df, OV_df, PRAD_df, SKCM_df, THCA_df, UCEC_df]

In [10]:
# Fill cancer_names slot by slot with the mutation/gene table of each folder.
# The i-th folder returned by listdir is stored in the i-th slot, so the result
# depends on the listing order and on cancer_names having enough slots.
folders = [file for file in listdir('data') if 'US' in file]

i=0
for cancer in folders:
    print(cancer)
    # The slot starts as None, which pd.concat skips.
    cancer_names[i] = pd.concat([cancer_names[i], 
                               get_mutation_gene_frames(cancer)])
    cancer_names[i].reset_index(drop=True, inplace=True)
    i = i + 1

BLCA-US
BRCA-US
COAD-US


  if (yield from self.run_code(code, result)):


GBM-US
KIRC-US
LGG-US
LUSC-US
OV-US
PRAD-US
SKCM-US
THCA-US
UCEC-US


In [11]:
file_names = ["BLCA_df.csv", "BRCA_df.csv", "COAD_df.csv", "GBM_df.csv", "KIRC_df.csv", "LGG_df.csv", "LUSC_df.csv", "OV_df.csv", "PRAD_df.csv", "SKCM_df.csv", "THCA_df.csv", "UCEC_df.csv"]

In [14]:
i = 0
for df in cancer_names:
    mutation_gene_df = df[['MutationId', 'GeneAffected']]
    df.reset_index(drop=True, inplace=True)
    print_for_Kaitlyn = df.drop_duplicates()
    print(print_for_Kaitlyn.shape)
    print_for_Kaitlyn.to_csv(file_names[i])
    i = i + 1

(33891, 4)
(178476, 4)
(343184, 4)
(103051, 4)
(32021, 4)
(51435, 4)
(69, 4)
(64614, 4)
(5453, 4)
(751309, 4)
(14806, 4)
(1250562, 4)


In [7]:
folders = [file for file in listdir('data') if 'US' in file]

# Gather one frame per cancer type and concatenate a single time;
# ignore_index=True yields the same fresh 0..n-1 index that a separate
# reset_index(drop=True) would, without mutating the frame in place.
cancer_gene_frames = []
for cancer in folders:
    print(cancer)
    cancer_gene_frames.append(get_mutation_gene_frames(cancer))

all_cancer_gene_df = pd.concat(cancer_gene_frames, ignore_index=True)

BLCA-US
BRCA-US


KeyboardInterrupt: 

In [10]:
all_cancer_gene_df.head()

Unnamed: 0,CancerType,DonorId,MutationId,GeneAffected
0,BLCA-US,DO48566BLCA-US,MU131916556,ENSG00000141698
1,BLCA-US,DO48566BLCA-US,MU131916556,ENSG00000141756
2,BLCA-US,DO48566BLCA-US,MU63670359,ENSG00000002746
3,BLCA-US,DO48566BLCA-US,MU131988623,ENSG00000204252
4,BLCA-US,DO48566BLCA-US,MU4472758,ENSG00000144218


In [11]:
# reset_index returns a new frame, so nothing is modified in place on a
# column slice of all_cancer_gene_df.
mutation_gene_df = (all_cancer_gene_df[['MutationId', 'GeneAffected']]
                    .reset_index(drop=True))
mutation_gene_df.head(10)

Unnamed: 0,MutationId,GeneAffected
0,MU131916556,ENSG00000141698
1,MU131916556,ENSG00000141756
2,MU63670359,ENSG00000002746
3,MU131988623,ENSG00000204252
4,MU4472758,ENSG00000144218
5,MU131965260,ENSG00000145244
6,MU129831223,ENSG00000164309
7,MU131868936,ENSG00000241322
8,MU131868936,ENSG00000251537
9,MU131904393,ENSG00000196832


In [118]:
mutations_for_Kaitlyn = mutation_gene_df.drop_duplicates()
mutations_for_Kaitlyn.to_csv('mutation_gene_list.csv')

In [12]:
genes = mutation_gene_df['GeneAffected'].to_numpy().astype('str')

In [13]:
print(len(genes))
print(len(np.unique(genes)))

3847745
34304


In [14]:
uniq_genes, counts = np.unique(genes, return_counts=True)
uniq_gene_df = pd.DataFrame({'gene': uniq_genes,
             'Count': counts})

In [15]:
print(uniq_gene_df.shape)
uniq_gene_df.head()

(34304, 2)


Unnamed: 0,gene,Count
0,ENSG00000000003,74
1,ENSG00000000005,93
2,ENSG00000000419,164
3,ENSG00000000457,191
4,ENSG00000000460,623


In [None]:
uniq_gene_df.tail(10)

In [None]:
import matplotlib.pyplot as plt
data = uniq_gene_df['Count']
plt.hist(data, bins=50)

In [16]:
uniq_gene_df_filtered = uniq_gene_df[uniq_gene_df.Count > 9]
uniq_gene_df_filtered.shape

(30544, 2)

In [17]:
# Gene ids that passed the count filter, as a NumPy array.
master_genes = np.array(uniq_gene_df_filtered.gene)
# NOTE(review): hard-coded position 30543 is the last of the 30544 filtered
# entries. Presumably this drops the 'nan' string produced by astype('str')
# for missing gene values -- confirm, and prefer removing it by value so the
# cell keeps working when the data changes.
master_genes = np.delete(master_genes, 30543)
master_genes

array(['ENSG00000000003', 'ENSG00000000005', 'ENSG00000000419', ...,
       'ENSG00000273485', 'ENSG00000273488', 'ENSG00000273489'],
      dtype=object)

In [158]:
# Save the retained gene ids as a one-column table.
gene_df = pd.DataFrame({'GeneAffected': master_genes})
gene_df.to_csv('all_genes_affected.csv')

In [25]:
def gene_helper(df):
    """Count, per affected gene, the distinct mutations in one donor table.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with at least the columns ``project_code``, ``icgc_donor_id``,
        ``icgc_mutation_id`` and ``gene_affected`` and at least one row.
        The frame is not modified.

    Returns
    -------
    tuple
        ``(cancer_type, donor_id, unique_genes, counts)``. ``donor_id`` is the
        first donor id with the cancer type appended; ``unique_genes`` is a
        sorted string array of gene values (missing values become the string
        ``'nan'``) and ``counts`` gives, for each gene, the number of distinct
        (mutation id, gene) pairs.
    """
    # Positional access so that frames whose index lacks the label 0 work too.
    cancer_type = df['project_code'].iloc[0]
    donor_id = df['icgc_donor_id'].iloc[0] + cancer_type

    # drop_duplicates returns a new frame; de-duplicating a column slice in
    # place is what triggered SettingWithCopyWarning.
    unique_pairs = df[['icgc_mutation_id', 'gene_affected']].drop_duplicates()

    genes_only = unique_pairs['gene_affected'].to_numpy().astype('str')
    unique_genes, counts = np.unique(genes_only, return_counts=True)
    return cancer_type, donor_id, unique_genes, counts

In [34]:
def katrinas_gene_function(df, cancer_code, master_genes):
    """Append one gene-frequency row per file of a cancer project.

    Every file in ``data/<cancer_code>/`` is read as a tab-separated table and
    summarised by ``gene_helper``. The resulting row holds the donor key, the
    cancer type and, for each entry of ``master_genes``, the count reported by
    ``gene_helper`` for that gene (``0.0`` when the gene does not occur).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame the new rows are appended to.
    cancer_code : str
        Name of the sub-folder of ``data/`` to read.
    master_genes : numpy.ndarray
        Gene ids that define the encoding columns, in column order.

    Returns
    -------
    pandas.DataFrame
        ``df`` followed by the new rows, with a fresh integer index. When the
        folder contains no files, ``df`` itself is returned.
    """
    DATA_PATH = 'data/'
    PATH = DATA_PATH + cancer_code + '/'

    gene_columns = list(master_genes)
    donor_ids = []
    cancer_types = []
    vectors = []

    for n_done, file in enumerate(listdir(PATH), start=1):
        temp_df = pd.read_csv(PATH + file, sep='\t')
        cancer_type, donor_id, unique_genes, counts = gene_helper(temp_df)

        # Align the per-donor counts to the master gene order in one step;
        # genes absent from this donor get 0, genes outside the master list
        # are dropped.
        freq = (pd.Series(counts, index=unique_genes)
                  .reindex(gene_columns, fill_value=0)
                  .to_numpy(dtype=float))

        vectors.append(freq)
        donor_ids.append(donor_id)
        cancer_types.append(cancer_type)

        if n_done % 50 == 0:
            print(str(n_done) + ' donors processed')

    if not vectors:
        return df

    # Build all new rows at once; concatenating inside the loop re-copies the
    # accumulated (very wide) frame for every file.
    encoded = pd.DataFrame(np.vstack(vectors), columns=gene_columns)
    encoded.insert(0, 'CancerType', cancer_types)
    encoded.insert(0, 'DonorIDs', donor_ids)

    return pd.concat([df, encoded], ignore_index=True)



In [26]:
columns = ['DonorIDs', 'CancerType'] + list(master_genes)
df2 = pd.DataFrame(columns = columns)
df2

Unnamed: 0,DonorIDs,CancerType,ENSG00000000003,ENSG00000000005,ENSG00000000419,ENSG00000000457,ENSG00000000460,ENSG00000000938,ENSG00000000971,ENSG00000001036,...,ENSG00000273456,ENSG00000273466,ENSG00000273471,ENSG00000273476,ENSG00000273477,ENSG00000273478,ENSG00000273483,ENSG00000273485,ENSG00000273488,ENSG00000273489


In [30]:
x = 1
str(x)

'1'

In [None]:
#test
encoding = pd.DataFrame(columns = columns)
encoding = katrinas_gene_function(encoding, 'BRCA-US', master_genes)
encoding.reset_index(drop=True, inplace=True)
print(encoding.shape)
encoding

In [35]:
# Build one gene-frequency frame per cancer folder and keep them in a list.
folders = [file for file in listdir('data') if 'US' in file]
encodings_array = []
# NOTE(review): i is assigned but not used anywhere in this cell.
i=0

for cancer in folders:
    print(cancer)
    # `columns` and `master_genes` come from earlier cells.
    encoding = pd.DataFrame(columns = columns)
    encoding = katrinas_gene_function(encoding, cancer, master_genes)
    encoding.reset_index(drop=True, inplace=True)
    encodings_array.append(encoding)

BLCA-US


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


50 donors processed
100 donors processed
150 donors processed
200 donors processed
250 donors processed
300 donors processed
350 donors processed
400 donors processed
BRCA-US
50 donors processed
100 donors processed
150 donors processed
200 donors processed
250 donors processed
300 donors processed
350 donors processed
400 donors processed
450 donors processed
500 donors processed
550 donors processed
600 donors processed
650 donors processed
700 donors processed
750 donors processed
800 donors processed
850 donors processed
900 donors processed
950 donors processed
1000 donors processed
COAD-US


  if (yield from self.run_code(code, result)):


50 donors processed
100 donors processed
150 donors processed
200 donors processed
250 donors processed
GBM-US
50 donors processed
100 donors processed
150 donors processed
200 donors processed
250 donors processed
300 donors processed
350 donors processed
KIRC-US
50 donors processed
100 donors processed
150 donors processed
200 donors processed
LGG-US
50 donors processed
100 donors processed
150 donors processed
200 donors processed
250 donors processed
300 donors processed
LUSC-US
50 donors processed
100 donors processed
150 donors processed
200 donors processed
250 donors processed
300 donors processed
350 donors processed
400 donors processed
450 donors processed
OV-US
50 donors processed
100 donors processed
150 donors processed
200 donors processed
250 donors processed
300 donors processed
350 donors processed
400 donors processed
PRAD-US
50 donors processed
100 donors processed
SKCM-US
50 donors processed
100 donors processed
150 donors processed
200 donors processed
250 donors 

In [36]:
len(encodings_array)

12

In [37]:
# Concatenate every per-cancer encoding; passing the list itself avoids
# hard-coding the number of cancer types.
freq_encodings = pd.concat(encodings_array)

In [38]:
freq_encodings.reset_index(drop=True, inplace=True)
print(freq_encodings.shape)
freq_encodings.head()

(4581, 30545)


Unnamed: 0,DonorIDs,CancerType,ENSG00000000003,ENSG00000000005,ENSG00000000419,ENSG00000000457,ENSG00000000460,ENSG00000000938,ENSG00000000971,ENSG00000001036,...,ENSG00000273456,ENSG00000273466,ENSG00000273471,ENSG00000273476,ENSG00000273477,ENSG00000273478,ENSG00000273483,ENSG00000273485,ENSG00000273488,ENSG00000273489
0,DO48566BLCA-US,BLCA-US,0.0,0.0,0.0,2.0,3.0,0.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,DO223588BLCA-US,BLCA-US,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,DO51948BLCA-US,BLCA-US,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,DO514BLCA-US,BLCA-US,0.0,0.0,0.0,0.0,1.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,DO474BLCA-US,BLCA-US,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [39]:
np.unique(freq_encodings.CancerType)

array(['BLCA-US', 'BRCA-US', 'COAD-US', 'GBM-US', 'KIRC-US', 'LGG-US',
       'LUSC-US', 'OV-US', 'PRAD-US', 'SKCM-US', 'THCA-US', 'UCEC-US'],
      dtype=object)

In [40]:
uniq_freq_enc = freq_encodings.drop_duplicates()

In [41]:
print(uniq_freq_enc.shape)
uniq_freq_enc.head()

(3364, 30545)


Unnamed: 0,DonorIDs,CancerType,ENSG00000000003,ENSG00000000005,ENSG00000000419,ENSG00000000457,ENSG00000000460,ENSG00000000938,ENSG00000000971,ENSG00000001036,...,ENSG00000273456,ENSG00000273466,ENSG00000273471,ENSG00000273476,ENSG00000273477,ENSG00000273478,ENSG00000273483,ENSG00000273485,ENSG00000273488,ENSG00000273489
0,DO48566BLCA-US,BLCA-US,0.0,0.0,0.0,2.0,3.0,0.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,DO223588BLCA-US,BLCA-US,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,DO51948BLCA-US,BLCA-US,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,DO514BLCA-US,BLCA-US,0.0,0.0,0.0,0.0,1.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,DO474BLCA-US,BLCA-US,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [42]:
uniq_freq_enc.to_csv('Gene_freq_encoding.csv')

In [44]:
# Per-donor total over the gene columns only. The two string id columns are
# dropped explicitly so the row sum never tries to add text to numbers.
this_sum = (uniq_freq_enc
            .drop(columns=['DonorIDs', 'CancerType'])
            .astype(float)
            .sum(axis=1))
this_sum

0        7470.0
1        5104.0
2        3644.0
3        3376.0
4        3074.0
5        2597.0
6        2238.0
7        2047.0
8        2122.0
9        2008.0
10          3.0
410     10873.0
411      2007.0
412       298.0
413      1916.0
414      1788.0
415      1741.0
416      1725.0
417      1610.0
418      1518.0
419      1361.0
420      1267.0
421      1174.0
423      1098.0
424      1041.0
425      1008.0
426       927.0
427       826.0
428       759.0
429       712.0
         ...   
4551      163.0
4552      172.0
4553      163.0
4554      161.0
4555      167.0
4556      160.0
4557      167.0
4558      162.0
4559      161.0
4560      158.0
4561      151.0
4562      153.0
4563      156.0
4564      153.0
4565      155.0
4566      153.0
4567      153.0
4568      151.0
4569      149.0
4570      144.0
4571      151.0
4572      145.0
4573      149.0
4574      146.0
4575      145.0
4576      149.0
4577      151.0
4578      146.0
4579      148.0
4580      144.0
Length: 3364, dtype: flo

In [46]:
sum(this_sum)/len(this_sum)

836.7693222354341