In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer

In [2]:
abundance = pd.read_csv("Kraken-TCGA-Voom-SNM-All-Putative-Contaminants-Removed-Data.csv", index_col=0) # Load raw data
abundance = abundance[abundance.sum(axis = 1) > 0] # remove columns that all values are 0
print(abundance.shape)

(17625, 1411)


In [3]:
# Impute missing values
feature_names = abundance.columns
imputer = KNNImputer(n_neighbors=5, weights="uniform")
abundance_imputed = imputer.fit_transform(abundance.T)
abundance_imputed = pd.DataFrame(abundance_imputed.T, columns = feature_names)
abundance_imputed.iloc[0] = abundance.iloc[0]
abundance_imputed

Unnamed: 0,k__Viruses.f__Phycodnaviridae.g__Prasinovirus,k__Viruses.o__Caudovirales.f__Siphoviridae.g__Sfi1unalikevirus,k__Viruses.o__Herpesvirales.f__Herpesviridae.g__Simplexvirus,k__Viruses.f__Poxviridae.g__Parapoxvirus,k__Viruses.f__Bicaudaviridae.g__Bicaudavirus,k__Viruses.o__Picornavirales.f__Picornaviridae.g__Aquamavirus,k__Viruses.o__Herpesvirales.f__Herpesviridae.g__Mardivirus,k__Viruses.f__Baculoviridae.g__Deltabaculovirus,k__Viruses.f__Papillomaviridae.g__Taupapillomavirus,k__Viruses.o__Caudovirales.f__Myoviridae.g__I3likevirus,...,k__Bacteria.p__Firmicutes.c__Clostridia.o__Clostridiales.f__Clostridiales_Family_XIII._Incertae_Sedis.g__Casaltella,k__Bacteria.p__Proteobacteria.c__Betaproteobacteria.o__Neisseriales.f__Chromobacteriaceae.g__Deefgea,k__Bacteria.p__Proteobacteria.c__Alphaproteobacteria.o__Rhizobiales.f__Phyllobacteriaceae.g__Aquamicrobium,k__Bacteria.p__Deferribacteres.c__Deferribacteres.o__Deferribacterales.f__Deferribacteraceae.g__Geovibrio,k__Bacteria.p__Firmicutes.c__Clostridia.o__Clostridiales.f__Ruminococcaceae.g__Subdoligranulum,contaminant1Harvard,contaminant2HarvardCanadaBaylorWashU,contaminant3AllSeqCenters,contaminant4RandomSpikesHarvard,contaminant5RandomSpikes1000
0,-0.944325,1.378696,4.257198,2.263261,0.478455,3.283590,6.175448,0.554111,1.707182,-2.701144,...,2.952431,1.053012,1.641701,1.597915,1.615110,1.613142,12.579647,12.579647,1.579534,2.482244
1,2.799427,2.095355,5.282909,0.625737,1.262373,2.598059,5.032088,-0.978710,-0.066234,0.042982,...,3.623247,2.048703,2.357091,2.339657,2.378702,2.338944,13.305450,13.305450,2.332050,2.855626
2,-0.350754,1.969313,4.789957,0.645980,1.072432,2.454999,6.929492,-1.045204,4.299620,0.037454,...,3.531484,1.879986,2.273343,2.176359,2.268394,2.254124,13.220630,13.220630,2.211553,3.193130
3,2.163915,1.915785,4.830879,0.699582,1.110310,0.812846,5.856873,1.314613,2.308325,1.252654,...,3.649382,1.827052,2.385817,2.223181,2.313300,2.361125,13.327630,13.327630,2.164692,3.090736
4,0.053629,2.127427,3.664010,0.911225,1.321953,3.560542,6.068515,-0.795673,0.198039,-1.705629,...,3.861024,2.038694,2.597460,2.434824,2.524942,2.572767,13.539273,13.539273,2.376335,3.302378
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17620,0.284636,2.194480,4.703390,0.814588,1.239996,1.451831,7.643768,-0.498119,3.479699,-1.214270,...,3.606118,2.043599,2.325969,2.347134,2.320139,2.292332,13.258838,13.258838,2.259281,3.443482
17621,0.459102,2.421001,4.873527,1.102422,1.471217,1.771189,3.612937,-0.324706,1.182912,-1.272753,...,3.855399,2.293863,2.546157,2.490664,2.572189,2.545829,13.512334,13.512334,2.537966,3.876017
17622,2.354685,2.152569,3.585112,0.904770,1.340790,1.417062,7.595169,-0.405086,0.796029,-1.078900,...,3.627826,2.068357,2.344308,2.365683,2.353494,2.348888,13.315394,13.315394,2.232305,3.625496
17623,0.352523,2.472335,4.215219,1.224536,1.660556,1.736829,7.021851,-0.085319,1.115795,-0.759133,...,3.947592,2.388124,2.664075,2.685449,2.673261,2.668655,13.635160,13.635160,2.552071,3.945263


In [4]:
# Define a list of invalid names: no information or virus are invalid
invalid_names = ['f__.g__.s__','g__.s__','virus']

for col in abundance_imputed.columns:
    if any(invalid_name in col for invalid_name in invalid_names):
        abundance_imputed.drop(col, axis=1, inplace=True)

# Extract family, genera, and species from existing column names
def extract_taxonomy(column):
    return '; '.join([t for t in column.split('.') if t.startswith('f__') or t.startswith('g__') or t.startswith('s__')])

new_columns = [extract_taxonomy(column) for column in abundance_imputed.columns]

# Rename columns
abundance_imputed.columns = new_columns

# Add a column containing original patient id for later "merge" step
pid = abundance.index.astype('str')
abundance_imputed.insert(0,'sample_id1',pid)

abundance_imputed

Unnamed: 0,sample_id1,f__Halomonadaceae; g__Cobetia,f__Thermoanaerobacteraceae; g__Ammonifex,f__Cryomorphaceae; g__Owenweeksia,f__Haloplasmataceae; g__Haloplasma,f__Desulfobacteraceae; g__Desulfosarcina,f__Enterobacteriaceae; g__Siccibacter,f__Acetobacteraceae; g__Asaia,f__Rhodobiaceae; g__Parvibaculum,g__Neosynechococcus,...,f__Clostridiales_Family_XIII; g__Casaltella,f__Chromobacteriaceae; g__Deefgea,f__Phyllobacteriaceae; g__Aquamicrobium,f__Deferribacteraceae; g__Geovibrio,f__Ruminococcaceae; g__Subdoligranulum,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21
0,s17489,0.750758,1.698080,1.211822,2.098064,1.901532,0.746481,-0.731171,0.069450,1.183112,...,2.952431,1.053012,1.641701,1.597915,1.615110,1.613142,12.579647,12.579647,1.579534,2.482244
1,s17512,2.957832,2.437834,1.979203,0.569559,1.184304,1.657705,0.251591,2.690162,2.091016,...,3.623247,2.048703,2.357091,2.339657,2.378702,2.338944,13.305450,13.305450,2.332050,2.855626
2,s17498,2.865233,2.223742,1.849716,0.408622,2.003357,1.461779,1.840733,0.527662,1.917301,...,3.531484,1.879986,2.273343,2.176359,2.268394,2.254124,13.220630,13.220630,2.211553,3.193130
3,s17528,1.331853,2.394651,1.813642,0.501072,0.985425,3.488439,4.899925,2.730293,1.865934,...,3.649382,1.827052,2.385817,2.223181,2.313300,2.361125,13.327630,13.327630,2.164692,3.090736
4,s17535,-0.041467,2.606293,2.025285,2.297677,-0.041719,1.378154,2.874528,0.620008,2.077577,...,3.861024,2.038694,2.597460,2.434824,2.524942,2.572767,13.539273,13.539273,2.376335,3.302378
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17620,s23,1.492351,2.495612,2.118032,0.773989,0.951887,2.135941,3.213407,1.754071,2.045186,...,3.606118,2.043599,2.325969,2.347134,2.320139,2.292332,13.258838,13.258838,2.259281,3.443482
17621,s20,1.642807,2.709255,2.253808,1.035752,2.675408,2.310780,1.919476,1.929309,2.437095,...,3.855399,2.293863,2.546157,2.490664,2.572189,2.545829,13.512334,13.512334,2.537966,3.876017
17622,s27,1.422109,2.568446,2.202434,0.955879,1.782572,2.043960,1.740752,1.762615,2.213836,...,3.627826,2.068357,2.344308,2.365683,2.353494,2.348888,13.315394,13.315394,2.232305,3.625496
17623,s26,1.741875,2.888213,2.522201,1.275646,1.571824,2.363727,2.060519,2.082382,2.533603,...,3.947592,2.388124,2.664075,2.685449,2.673261,2.668655,13.635160,13.635160,2.552071,3.945263


In [5]:
metadata = pd.read_csv('Metadata-TCGA-All-18116-Samples.csv', index_col=0)
metadata = metadata[metadata['investigation'] == 'TCGA-COAD'] # Extract only colon cancer
metadata = pd.DataFrame(metadata)
pid_meta = metadata.index.astype('str')
metadata.insert(0,'sample_id1',pid_meta)
metadata.head

<bound method NDFrame.head of        sample_id1                         gdc_file_uuid  \
s13008     s13008  5DCDB663-BC72-4928-970A-2C50103CF337   
s12994     s12994  5220B7B2-6F81-4C7F-BFB4-B523C2C30DDF   
s13005     s13005  95F7BD48-5E28-4892-BC8D-A6DEC3B02D70   
s12987     s12987  78F6E4E8-1349-43CB-AAA7-C56FD833DEDD   
s12895     s12895  D04CC01A-3CCB-4FC1-BAD8-2EDE20790454   
...           ...                                   ...   
s13862     s13862  36293F9B-A284-467E-AB73-AE62FC5EAB6E   
s13849     s13849  86E4599E-AB45-4118-B105-32782288A0D3   
s13857     s13857  CCC44CA2-2E42-4083-9252-597D398BE401   
s13839     s13839  0DD44906-041D-4AF2-8F42-A1BED185E1C6   
s13842     s13842  7942EAB9-D80A-496E-B3CB-50B42BD23334   

                                                 filename  age_at_diagnosis  \
s13008  UNCID_340264.TCGA-AZ-4615-01A-01R-1410-07.1103...              84.0   
s12994  UNCID_269004.TCGA-AA-A02R-01A-01R-A00A-07.1008...              84.0   
s13005  UNCID_339870.TCG

In [6]:
import re
regex = re.compile(r"\[|\]|<", re.IGNORECASE) # In case there are invalid string in feature names

SG_CRC_BA = pd.merge(abundance_imputed, metadata, on='sample_id1', how='inner')
SG_CRC_BA.columns = [regex.sub("_",col) if any(x in str(col) for x in set(('[', ']', '<'))) else col for col in SG_CRC_BA.columns.values]
SG_CRC_BA
#SG_CRC.to_csv('merged_CRC', index = False)


Unnamed: 0,sample_id1,f__Halomonadaceae; g__Cobetia,f__Thermoanaerobacteraceae; g__Ammonifex,f__Cryomorphaceae; g__Owenweeksia,f__Haloplasmataceae; g__Haloplasma,f__Desulfobacteraceae; g__Desulfosarcina,f__Enterobacteriaceae; g__Siccibacter,f__Acetobacteraceae; g__Asaia,f__Rhodobiaceae; g__Parvibaculum,g__Neosynechococcus,...,portion_weight,aliquot_concentration,analyte_A260A280Ratio,analyte_amount,analyte_type_label,radiation_therapy_code_label,radiation_therapy_site_label,radiation_therapy_type_label,year_of_diagnosis,vital_status_label
0,s13008,0.549996,3.627426,1.243335,0.200708,-0.019726,3.566605,1.143794,3.529965,1.626574,...,36.0,0.14,1.80,147.30,RNA,,,,2009.0,Alive
1,s12994,1.036992,2.472233,1.813183,0.705863,-0.388531,1.471217,1.448203,1.290475,2.069139,...,110.0,0.15,1.72,132.38,RNA,,,,2006.0,Dead
2,s13005,0.391081,4.148250,3.489199,0.059951,-1.034442,0.825306,2.387254,3.451919,3.745155,...,137.0,0.17,1.80,583.30,RNA,,,,2002.0,Dead
3,s12987,0.307795,1.743036,2.668948,-0.023334,-1.117728,0.742020,2.303968,0.561278,1.339942,...,158.0,0.14,1.80,438.30,RNA,,,,2007.0,Alive
4,s12895,1.091898,2.527139,1.868089,2.345731,1.251338,1.526123,1.503109,1.345381,2.124045,...,114.0,0.13,1.80,533.30,RNA,,,,2009.0,Alive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1001,s13862,0.955064,2.390305,1.731255,0.623935,1.114504,1.389289,3.688203,1.208548,1.987211,...,110.0,0.15,1.73,239.76,RNA,,,,2009.0,Alive
1002,s13849,2.017887,1.819532,0.994857,4.463031,-1.705226,0.701323,0.955917,0.587535,1.494217,...,129.0,0.15,1.80,877.30,RNA,,,,2009.0,Alive
1003,s13857,3.343320,2.743249,2.314112,1.020499,3.378589,3.974225,1.981555,1.969840,2.411123,...,36.0,0.14,1.70,19.80,RNA,951909FD-4B2B-4D3F-918A-34269D0A70FC,Primary Tumor Field,EXTERNAL BEAM,2008.0,Alive
1004,s13839,1.289116,2.429583,3.706441,2.381846,1.259359,2.226014,1.631685,1.822473,2.084668,...,30.0,0.14,2.03,80.04,DNA,,,,2004.0,Alive


In [8]:
# Export final pre-processed meta-transcriptomics data
SG_CRC_BA.to_csv('Poore_COAD', index=False)