In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer

In [2]:
abundance = pd.read_csv("Kraken-TCGA-Voom-SNM-All-Putative-Contaminants-Removed-Data.csv", index_col=0) # Load raw data
abundance = abundance[abundance.sum(axis = 1) > 0] # remove columns that all values are 0
print(abundance.shape)

(17625, 1411)


In [3]:
# Impute missing values
feature_names = abundance.columns
imputer = KNNImputer(n_neighbors=5, weights="uniform")
abundance_imputed = imputer.fit_transform(abundance.T)
abundance_imputed = pd.DataFrame(abundance_imputed.T, columns = feature_names)
abundance_imputed.iloc[0] = abundance.iloc[0]
abundance_imputed

Unnamed: 0,k__Viruses.f__Phycodnaviridae.g__Prasinovirus,k__Viruses.o__Caudovirales.f__Siphoviridae.g__Sfi1unalikevirus,k__Viruses.o__Herpesvirales.f__Herpesviridae.g__Simplexvirus,k__Viruses.f__Poxviridae.g__Parapoxvirus,k__Viruses.f__Bicaudaviridae.g__Bicaudavirus,k__Viruses.o__Picornavirales.f__Picornaviridae.g__Aquamavirus,k__Viruses.o__Herpesvirales.f__Herpesviridae.g__Mardivirus,k__Viruses.f__Baculoviridae.g__Deltabaculovirus,k__Viruses.f__Papillomaviridae.g__Taupapillomavirus,k__Viruses.o__Caudovirales.f__Myoviridae.g__I3likevirus,...,k__Bacteria.p__Firmicutes.c__Clostridia.o__Clostridiales.f__Clostridiales_Family_XIII._Incertae_Sedis.g__Casaltella,k__Bacteria.p__Proteobacteria.c__Betaproteobacteria.o__Neisseriales.f__Chromobacteriaceae.g__Deefgea,k__Bacteria.p__Proteobacteria.c__Alphaproteobacteria.o__Rhizobiales.f__Phyllobacteriaceae.g__Aquamicrobium,k__Bacteria.p__Deferribacteres.c__Deferribacteres.o__Deferribacterales.f__Deferribacteraceae.g__Geovibrio,k__Bacteria.p__Firmicutes.c__Clostridia.o__Clostridiales.f__Ruminococcaceae.g__Subdoligranulum,contaminant1Harvard,contaminant2HarvardCanadaBaylorWashU,contaminant3AllSeqCenters,contaminant4RandomSpikesHarvard,contaminant5RandomSpikes1000
0,-0.944325,1.378696,4.257198,2.263261,0.478455,3.283590,6.175448,0.554111,1.707182,-2.701144,...,2.952431,1.053012,1.641701,1.597915,1.615110,1.613142,12.579647,12.579647,1.579534,2.482244
1,2.799427,2.095355,5.282909,0.625737,1.262373,2.598059,5.032088,-0.978710,-0.066234,0.042982,...,3.623247,2.048703,2.357091,2.339657,2.378702,2.338944,13.305450,13.305450,2.332050,2.855626
2,-0.350754,1.969313,4.789957,0.645980,1.072432,2.454999,6.929492,-1.045204,4.299620,0.037454,...,3.531484,1.879986,2.273343,2.176359,2.268394,2.254124,13.220630,13.220630,2.211553,3.193130
3,2.163915,1.915785,4.830879,0.699582,1.110310,0.812846,5.856873,1.314613,2.308325,1.252654,...,3.649382,1.827052,2.385817,2.223181,2.313300,2.361125,13.327630,13.327630,2.164692,3.090736
4,0.053629,2.127427,3.664010,0.911225,1.321953,3.560542,6.068515,-0.795673,0.198039,-1.705629,...,3.861024,2.038694,2.597460,2.434824,2.524942,2.572767,13.539273,13.539273,2.376335,3.302378
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17620,0.284636,2.194480,4.703390,0.814588,1.239996,1.451831,7.643768,-0.498119,3.479699,-1.214270,...,3.606118,2.043599,2.325969,2.347134,2.320139,2.292332,13.258838,13.258838,2.259281,3.443482
17621,0.459102,2.421001,4.873527,1.102422,1.471217,1.771189,3.612937,-0.324706,1.182912,-1.272753,...,3.855399,2.293863,2.546157,2.490664,2.572189,2.545829,13.512334,13.512334,2.537966,3.876017
17622,2.354685,2.152569,3.585112,0.904770,1.340790,1.417062,7.595169,-0.405086,0.796029,-1.078900,...,3.627826,2.068357,2.344308,2.365683,2.353494,2.348888,13.315394,13.315394,2.232305,3.625496
17623,0.352523,2.472335,4.215219,1.224536,1.660556,1.736829,7.021851,-0.085319,1.115795,-0.759133,...,3.947592,2.388124,2.664075,2.685449,2.673261,2.668655,13.635160,13.635160,2.552071,3.945263


In [4]:
# Define a list of invalid names: no information or virus are invalid
invalid_names = ['f__.g__.s__','g__.s__','virus']

for col in abundance_imputed.columns:
    if any(invalid_name in col for invalid_name in invalid_names):
        abundance_imputed.drop(col, axis=1, inplace=True)

# Extract family, genera, and species from existing column names
def extract_taxonomy(column):
    return '; '.join([t for t in column.split('.') if t.startswith('f__') or t.startswith('g__') or t.startswith('s__')])

new_columns = [extract_taxonomy(column) for column in abundance_imputed.columns]

# Rename columns
abundance_imputed.columns = new_columns

# Add a column containing original patient id for later "merge" step
pid = abundance.index.astype('str')
abundance_imputed.insert(0,'sample_id1',pid)

abundance_imputed

Unnamed: 0,sample_id1,f__Halomonadaceae; g__Cobetia,f__Thermoanaerobacteraceae; g__Ammonifex,f__Cryomorphaceae; g__Owenweeksia,f__Haloplasmataceae; g__Haloplasma,f__Desulfobacteraceae; g__Desulfosarcina,f__Enterobacteriaceae; g__Siccibacter,f__Acetobacteraceae; g__Asaia,f__Rhodobiaceae; g__Parvibaculum,g__Neosynechococcus,...,f__Clostridiales_Family_XIII; g__Casaltella,f__Chromobacteriaceae; g__Deefgea,f__Phyllobacteriaceae; g__Aquamicrobium,f__Deferribacteraceae; g__Geovibrio,f__Ruminococcaceae; g__Subdoligranulum,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21
0,s17489,0.750758,1.698080,1.211822,2.098064,1.901532,0.746481,-0.731171,0.069450,1.183112,...,2.952431,1.053012,1.641701,1.597915,1.615110,1.613142,12.579647,12.579647,1.579534,2.482244
1,s17512,2.957832,2.437834,1.979203,0.569559,1.184304,1.657705,0.251591,2.690162,2.091016,...,3.623247,2.048703,2.357091,2.339657,2.378702,2.338944,13.305450,13.305450,2.332050,2.855626
2,s17498,2.865233,2.223742,1.849716,0.408622,2.003357,1.461779,1.840733,0.527662,1.917301,...,3.531484,1.879986,2.273343,2.176359,2.268394,2.254124,13.220630,13.220630,2.211553,3.193130
3,s17528,1.331853,2.394651,1.813642,0.501072,0.985425,3.488439,4.899925,2.730293,1.865934,...,3.649382,1.827052,2.385817,2.223181,2.313300,2.361125,13.327630,13.327630,2.164692,3.090736
4,s17535,-0.041467,2.606293,2.025285,2.297677,-0.041719,1.378154,2.874528,0.620008,2.077577,...,3.861024,2.038694,2.597460,2.434824,2.524942,2.572767,13.539273,13.539273,2.376335,3.302378
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17620,s23,1.492351,2.495612,2.118032,0.773989,0.951887,2.135941,3.213407,1.754071,2.045186,...,3.606118,2.043599,2.325969,2.347134,2.320139,2.292332,13.258838,13.258838,2.259281,3.443482
17621,s20,1.642807,2.709255,2.253808,1.035752,2.675408,2.310780,1.919476,1.929309,2.437095,...,3.855399,2.293863,2.546157,2.490664,2.572189,2.545829,13.512334,13.512334,2.537966,3.876017
17622,s27,1.422109,2.568446,2.202434,0.955879,1.782572,2.043960,1.740752,1.762615,2.213836,...,3.627826,2.068357,2.344308,2.365683,2.353494,2.348888,13.315394,13.315394,2.232305,3.625496
17623,s26,1.741875,2.888213,2.522201,1.275646,1.571824,2.363727,2.060519,2.082382,2.533603,...,3.947592,2.388124,2.664075,2.685449,2.673261,2.668655,13.635160,13.635160,2.552071,3.945263


In [5]:
metadata = pd.read_csv('Metadata-TCGA-All-18116-Samples.csv', index_col=0)
metadata = metadata[metadata['investigation'] == 'TCGA-GBM'] # Extract only brain cancer
metadata = pd.DataFrame(metadata)
pid_meta = metadata.index.astype('str')
metadata.insert(0,'sample_id1',pid_meta)
metadata.head

<bound method NDFrame.head of        sample_id1                         gdc_file_uuid  \
s14192     s14192  BDA63807-A377-4773-9FC6-0B72D546E55A   
s14168     s14168  06F9237F-6DC2-4D16-A8CD-34A74BBCEFC5   
s14197     s14197  320AA0A3-CE24-4B14-92DC-563F3BD9A196   
s14161     s14161  6F07A8B9-FBDF-4979-9A92-349556710D96   
s14165     s14165  D26469D0-68F5-4E32-A435-C70DA432B270   
...           ...                                   ...   
s14113     s14113  50B29BBA-DE4A-43F4-AD1E-48CB1B97BA78   
s14107     s14107  33B20567-1D6A-43C5-AB2B-22D629C85979   
s14080     s14080  A07D57FD-65CA-4E02-945E-5DA2957DF576   
s14087     s14087  DEBDB819-4888-48C7-A65A-FAC873B539CF   
s14103     s14103  4580948D-4987-44CB-B488-4A9B0410A65F   

                                                 filename  age_at_diagnosis  \
s14192  UNCID_1544138.17c98d8c-8bd4-4b16-a384-0497d730...              30.0   
s14168                   G2147.TCGA-06-0648-10A-01D.9.bam              77.0   
s14197  UNCID_1544755.01

In [6]:
import re
regex = re.compile(r"\[|\]|<", re.IGNORECASE) # In case there are invalid string in feature names

SG_CRC_BA = pd.merge(abundance_imputed, metadata, on='sample_id1', how='inner')
SG_CRC_BA.columns = [regex.sub("_",col) if any(x in str(col) for x in set(('[', ']', '<'))) else col for col in SG_CRC_BA.columns.values]
SG_CRC_BA
#SG_CRC.to_csv('merged_CRC', index = False)


Unnamed: 0,sample_id1,f__Halomonadaceae; g__Cobetia,f__Thermoanaerobacteraceae; g__Ammonifex,f__Cryomorphaceae; g__Owenweeksia,f__Haloplasmataceae; g__Haloplasma,f__Desulfobacteraceae; g__Desulfosarcina,f__Enterobacteriaceae; g__Siccibacter,f__Acetobacteraceae; g__Asaia,f__Rhodobiaceae; g__Parvibaculum,g__Neosynechococcus,...,portion_weight,aliquot_concentration,analyte_A260A280Ratio,analyte_amount,analyte_type_label,radiation_therapy_code_label,radiation_therapy_site_label,radiation_therapy_type_label,year_of_diagnosis,vital_status_label
0,s14192,0.893485,2.203858,1.674120,0.424344,1.372460,2.022902,1.464566,1.517857,1.734031,...,112.0,0.13,1.85,121.9,RNA,E843157C-27A8-4F36-8F07-96A24E8C9771,Primary Tumor Field,EXTERNAL BEAM,2002.0,Dead
1,s14168,0.180814,0.135872,-0.954348,0.682776,2.751091,-0.914354,1.512538,-1.407964,1.084384,...,,0.16,1.90,20.0,DNA,,,,2007.0,Dead
2,s14197,1.543645,2.854018,2.324280,1.074504,1.842048,2.673062,2.114726,2.168017,2.384191,...,40.0,0.13,,46.5,RNA,3A8D2E10-2AB1-40BA-A501-B0A56DADC35C,Primary Tumor Field,EXTERNAL BEAM,2002.0,Dead
3,s14161,0.997963,2.170523,1.664342,0.570449,-0.332872,2.024489,1.506755,1.549414,1.798506,...,122.0,0.15,1.80,219.7,RNA,E8425728-844D-46E5-9835-EAA9CB3A2EB1,Primary Tumor Field,EXTERNAL BEAM,2009.0,Alive
4,s14165,1.021629,2.180318,1.635810,0.419256,1.868306,1.907644,1.526159,1.553676,3.252892,...,34.0,0.15,1.80,52.6,RNA,45027352-CD80-448B-AC0F-4FD3173529DA,Primary Tumor Field,EXTERNAL BEAM,2010.0,Alive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
484,s14113,1.125039,2.549325,2.202670,0.560846,1.890649,2.471937,2.116887,1.787414,1.931288,...,70.0,0.15,1.90,98.4,RNA,8190E5D0-A3CA-4451-97DA-1619FC5C20FB,Primary Tumor Field,EXTERNAL BEAM,2007.0,Dead
485,s14107,2.359550,2.084960,1.555222,0.305446,-2.833901,1.904004,1.345668,1.398959,1.615133,...,,0.17,1.81,123.6,RNA,7450C1EA-8F99-4CF1-9471-F3A344E73358,Primary Tumor Field,EXTERNAL BEAM,2008.0,Alive
486,s14080,0.657392,2.110302,1.562616,0.397289,2.176143,1.778996,1.338868,1.185277,1.730294,...,,0.55,1.79,35.0,RNA,8E82EE4F-FE89-45DC-96EE-E68378F12A75,Primary Tumor Field,EXTERNAL BEAM,2005.0,Dead
487,s14087,3.851544,2.354561,1.824823,0.575047,1.523163,2.173606,1.615270,1.668561,5.791625,...,135.0,0.16,1.80,330.4,RNA,E6CB395F-9DF5-4C99-996A-98F2FDA6AEB9,Primary Tumor Field,EXTERNAL BEAM,2009.0,Alive


In [7]:
class_counts = SG_CRC_BA['sample_type'].value_counts()
print(class_counts)

sample_type
Primary Tumor           382
Blood Derived Normal     71
Recurrent Tumor          36
Name: count, dtype: int64


In [8]:
# Export final pre-processed meta-transcriptomics data
SG_CRC_BA.to_csv('Poore_GBM', index=False)