In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer

In [15]:
abundance = pd.read_csv("Kraken-TCGA-Voom-SNM-All-Putative-Contaminants-Removed-Data.csv", index_col=0) # Load raw data
abundance = abundance.T
abundance.index.name = 'taxonomy'
abundance = abundance[abundance.index.str.startswith('k__Bacteria')] # Select bacteria kingdom
abundance = abundance.T
print(abundance.shape)

(17625, 1206)


In [16]:
# Define a list of invalid names: no information or virus are invalid
invalid_names = ['f__; g__; s__','g__; s__','Afipia','Abiotrophia','Acidovorax','Acinetobacter','Aeromicrobium','Aquabacterium','Arthrobacter','Asticcacaulis','Aurantimonas','Azoarcus','Azospira','Bacillus','Beijerinckia','Beutenbergia','Bosea','Bradyrhizobium','Brevibacillus','Brevundimonas','Brochothrix','Burkholderia','Caulobacter','Chryseobacterium','Corynebacterium','Craurococcus','Curtobacterium','Deinococcus','Devosia','Dietzia','Dyadobacter','Enhydrobacter','Enterobacter','Escherichia','Facklamia','Flavobacterium','Geodermatophilus','Hoeflea','Hydrotalea','Janibacter','Kingella','Kocuria','Leptothrix','Limnobacter','Massilia','Mesorhizobium','Methylobacterium','Methylophilus','Methyloversatilis','Microbacterium','Micrococcus','Microlunatus','Nevskia','Niastella','Novosphingobium','Ochrobactrum','Olivibacter','Oxalobacter','Paenibacillus','Paracoccus','Patulibacter','Pedobacter','Pedomicrobium','Pelomonas','Phyllobacterium','Polaromonas','Propionibacterium','Pseudomonas','Pseudoxanthomonas','Psychrobacter','Ralstonia','Rhizobium','Rhodococcus','Roseomonas','Schlegelella','Sphingobium','Sphingomonas','Sphingopyxis','Stenotrophomonas','Streptococcus','Sulfuritalea','Tsukamurella','Undibacterium','Variovorax','Wautersiella','Xanthomonas']

for col in abundance.columns:
    if any(invalid_name in col for invalid_name in invalid_names):
        abundance.drop(col, axis=1, inplace=True)

# Extract only genera from existing column names
def extract_taxonomy(column):
    return '; '.join([t for t in column.split('.') if t.startswith('g__')])

new_columns = [extract_taxonomy(column) for column in abundance.columns]

abundance.columns = new_columns

# Add a column containing original patient id for later "merge" step
pid = abundance.index.astype('str')
abundance.insert(0,'sample_id1',pid)

abundance

Unnamed: 0,sample_id1,g__Cobetia,g__Ammonifex,g__Owenweeksia,g__Haloplasma,g__Desulfosarcina,g__Asaia,g__Parvibaculum,g__Neosynechococcus,g__Zymomonas,...,g__Sorangium,g__Hydrogenobacter,g__Cloacibacillus,g__Sellimonas,g__Gracilimonas,g__Casaltella,g__Deefgea,g__Aquamicrobium,g__Geovibrio,g__Subdoligranulum
s17489,s17489,0.750758,1.698080,1.211822,2.098064,1.901532,-0.731171,0.069450,1.183112,0.950821,...,8.198331,1.474081,1.593105,1.600950,1.471551,2.952431,1.053012,1.641701,1.597915,1.615110
s17512,s17512,2.957832,2.437834,1.979203,0.569559,1.184304,0.251591,2.690162,2.091016,-0.634765,...,10.000061,2.253314,2.322985,2.320565,2.294672,3.623247,2.048703,2.357091,2.339657,2.378702
s17498,s17498,2.865233,2.223742,1.849716,0.408622,2.003357,1.840733,0.527662,1.917301,1.279785,...,11.319977,2.106385,2.229016,2.189063,2.134658,3.531484,1.879986,2.273343,2.176359,2.268394
s17528,s17528,1.331853,2.394651,1.813642,0.501072,0.985425,4.899925,2.730293,1.865934,1.824556,...,13.229603,2.214463,2.318367,2.309160,2.196471,3.649382,1.827052,2.385817,2.223181,2.313300
s17535,s17535,-0.041467,2.606293,2.025285,2.297677,-0.041719,2.874528,0.620008,2.077577,-0.771157,...,11.135500,2.426106,2.530010,2.520803,2.408113,3.861024,2.038694,2.597460,2.434824,2.524942
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
s23,s23,1.492351,2.495612,2.118032,0.773989,0.951887,3.213407,1.754071,2.045186,-0.399423,...,10.581427,2.304599,2.269756,2.323190,2.265803,3.606118,2.043599,2.325969,2.347134,2.320139
s20,s20,1.642807,2.709255,2.253808,1.035752,2.675408,1.919476,1.929309,2.437095,-0.077439,...,9.769648,2.519072,2.524853,2.577375,2.557251,3.855399,2.293863,2.546157,2.490664,2.572189
s27,s27,1.422109,2.568446,2.202434,0.955879,1.782572,1.740752,1.762615,2.213836,-0.036487,...,8.761304,2.327410,2.335390,2.387210,2.402852,3.627826,2.068357,2.344308,2.365683,2.353494
s26,s26,1.741875,2.888213,2.522201,1.275646,1.571824,2.060519,2.082382,2.533603,0.283279,...,9.261643,2.647177,2.655157,2.706976,2.722619,3.947592,2.388124,2.664075,2.685449,2.673261


In [17]:
metadata = pd.read_csv('Metadata-TCGA-All-18116-Samples.csv', index_col=0)
metadata = metadata[metadata['investigation'] == 'TCGA-COAD'] # Extract only colon cancer
metadata = pd.DataFrame(metadata)

# Later for merge
pid_meta = metadata.index.astype('str')
metadata.insert(0,'sample_id1',pid_meta)
metadata.head

<bound method NDFrame.head of        sample_id1                         gdc_file_uuid  \
s13008     s13008  5DCDB663-BC72-4928-970A-2C50103CF337   
s12994     s12994  5220B7B2-6F81-4C7F-BFB4-B523C2C30DDF   
s13005     s13005  95F7BD48-5E28-4892-BC8D-A6DEC3B02D70   
s12987     s12987  78F6E4E8-1349-43CB-AAA7-C56FD833DEDD   
s12895     s12895  D04CC01A-3CCB-4FC1-BAD8-2EDE20790454   
...           ...                                   ...   
s13862     s13862  36293F9B-A284-467E-AB73-AE62FC5EAB6E   
s13849     s13849  86E4599E-AB45-4118-B105-32782288A0D3   
s13857     s13857  CCC44CA2-2E42-4083-9252-597D398BE401   
s13839     s13839  0DD44906-041D-4AF2-8F42-A1BED185E1C6   
s13842     s13842  7942EAB9-D80A-496E-B3CB-50B42BD23334   

                                                 filename  age_at_diagnosis  \
s13008  UNCID_340264.TCGA-AZ-4615-01A-01R-1410-07.1103...              84.0   
s12994  UNCID_269004.TCGA-AA-A02R-01A-01R-A00A-07.1008...              84.0   
s13005  UNCID_339870.TCG

In [18]:
import re
regex = re.compile(r"\[|\]|<", re.IGNORECASE) # In case there are invalid string in feature names

Poore_COAD = pd.merge(abundance, metadata, on='sample_id1', how='inner')
Poore_COAD.columns = [regex.sub("_",col) if any(x in str(col) for x in set(('[', ']', '<'))) else col for col in Poore_COAD.columns.values]
Poore_COAD


Unnamed: 0,sample_id1,g__Cobetia,g__Ammonifex,g__Owenweeksia,g__Haloplasma,g__Desulfosarcina,g__Asaia,g__Parvibaculum,g__Neosynechococcus,g__Zymomonas,...,portion_weight,aliquot_concentration,analyte_A260A280Ratio,analyte_amount,analyte_type_label,radiation_therapy_code_label,radiation_therapy_site_label,radiation_therapy_type_label,year_of_diagnosis,vital_status_label
0,s13008,0.549996,3.627426,1.243335,0.200708,-0.019726,1.143794,3.529965,1.626574,-0.920077,...,36.0,0.14,1.80,147.30,RNA,,,,2009.0,Alive
1,s12994,1.036992,2.472233,1.813183,0.705863,-0.388531,1.448203,1.290475,2.069139,-0.342958,...,110.0,0.15,1.72,132.38,RNA,,,,2006.0,Dead
2,s13005,0.391081,4.148250,3.489199,0.059951,-1.034442,2.387254,3.451919,3.745155,0.596093,...,137.0,0.17,1.80,583.30,RNA,,,,2002.0,Dead
3,s12987,0.307795,1.743036,2.668948,-0.023334,-1.117728,2.303968,0.561278,1.339942,-1.072154,...,158.0,0.14,1.80,438.30,RNA,,,,2007.0,Alive
4,s12895,1.091898,2.527139,1.868089,2.345731,1.251338,1.503109,1.345381,2.124045,-0.288051,...,114.0,0.13,1.80,533.30,RNA,,,,2009.0,Alive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1001,s13862,0.955064,2.390305,1.731255,0.623935,1.114504,3.688203,1.208548,1.987211,-0.424885,...,110.0,0.15,1.73,239.76,RNA,,,,2009.0,Alive
1002,s13849,2.017887,1.819532,0.994857,4.463031,-1.705226,0.955917,0.587535,1.494217,-1.100823,...,129.0,0.15,1.80,877.30,RNA,,,,2009.0,Alive
1003,s13857,3.343320,2.743249,2.314112,1.020499,3.378589,1.981555,1.969840,2.411123,0.229642,...,36.0,0.14,1.70,19.80,RNA,951909FD-4B2B-4D3F-918A-34269D0A70FC,Primary Tumor Field,EXTERNAL BEAM,2008.0,Alive
1004,s13839,1.289116,2.429583,3.706441,2.381846,1.259359,1.631685,1.822473,2.084668,2.162532,...,30.0,0.14,2.03,80.04,DNA,,,,2004.0,Alive


In [19]:
# Export final pre-processed meta-transcriptomics data
Poore_COAD.to_csv('Poore_COAD', index=False)