In [1]:
import pandas as pd
import numpy as np


In [2]:
# Read the tab-separated Kraken2 table. The file's first column is consumed as
# the index and immediately discarded; the last column is then promoted to the
# row index (the rows are filtered on a 'k__Bacteria' prefix right below).
SG_BA = (
    pd.read_csv("kraken2_all_CRC_samples_biom.tsv", sep='\t', index_col=0)
    .reset_index(drop=True)
)
SG_BA = SG_BA.set_index(SG_BA.columns[-1])

# Keep only rows whose label starts with the bacteria kingdom prefix, then
# flip the table so the former columns become rows and the labels become
# columns. Whitespace around the labels is stripped after the filter.
is_bacteria = SG_BA.index.str.startswith('k__Bacteria')
SG_BA = SG_BA.loc[is_bacteria].transpose()
SG_BA.columns = SG_BA.columns.str.strip()

# Scale every row by its own total so each row holds fractions of that total.
row_sums = SG_BA.sum(axis=1)
SG_BA = SG_BA.divide(row_sums, axis=0)

# Prevalence filter: keep a column only if it is strictly positive in more
# than half of the rows.
non_zero_counts = SG_BA.gt(0).sum(axis=0)
half_samples = len(SG_BA) / 2
SG_BA = SG_BA.loc[:, non_zero_counts > half_samples]

SG_BA


taxonomy,k__Bacteria; p__Bacillota; c__Clostridia; o__Lachnospirales; f__; g__; s__,k__Bacteria; p__Bacillota; c__Clostridia; o__Lachnospirales; f__Lachnospiraceae; g__; s__,k__Bacteria; p__Bacillota; c__Clostridia; o__Lachnospirales; f__Lachnospiraceae; g__Roseburia; s__,k__Bacteria; p__Bacillota; c__Clostridia; o__Lachnospirales; f__Lachnospiraceae; g__Roseburia; s__sp. 831b,k__Bacteria; p__Bacillota; c__Clostridia; o__Lachnospirales; f__Lachnospiraceae; g__Roseburia; s__sp. 499,k__Bacteria; p__Bacillota; c__Clostridia; o__Lachnospirales; f__Lachnospiraceae; g__Roseburia; s__rectibacter,k__Bacteria; p__Bacillota; c__Clostridia; o__Lachnospirales; f__Lachnospiraceae; g__Roseburia; s__hominis,k__Bacteria; p__Bacillota; c__Clostridia; o__Lachnospirales; f__Lachnospiraceae; g__Roseburia; s__intestinalis,k__Bacteria; p__Bacillota; c__Clostridia; o__Lachnospirales; f__Lachnospiraceae; g__Enterocloster; s__,k__Bacteria; p__Bacillota; c__Clostridia; o__Lachnospirales; f__Lachnospiraceae; g__Enterocloster; s__bolteae,...,k__Bacteria; p__Bacteroidota; c__Bacteroidia; o__Bacteroidales; f__Rikenellaceae; g__Alistipes; s__finegoldii,k__Bacteria; p__Bacteroidota; c__Bacteroidia; o__Bacteroidales; f__Rikenellaceae; g__Alistipes; s__ihumii,k__Bacteria; p__Bacteroidota; c__Bacteroidia; o__Bacteroidales; f__Rikenellaceae; g__Acetobacteroides; s__uncultured Acetobacteroides sp.,k__Bacteria; p__Bacteroidota; c__Bacteroidia; o__Bacteroidales; f__Odoribacteraceae; g__Butyricimonas; s__faecihominis,k__Bacteria; p__Bacteroidota; c__Bacteroidia; o__Bacteroidales; f__Odoribacteraceae; g__Butyricimonas; s__paravirosa,k__Bacteria; p__Bacteroidota; c__Bacteroidia; o__Bacteroidales; f__Odoribacteraceae; g__Butyricimonas; s__virosa,k__Bacteria; p__Bacteroidota; c__Bacteroidia; o__Bacteroidales; f__Barnesiellaceae; g__Coprobacter; s__secundus,k__Bacteria; p__Bacteroidota; c__Bacteroidia; o__Bacteroidales; f__Dysgonomonadaceae; g__Petrimonas; s__mucosa,k__Bacteria; p__Bacteroidota; 
c__Cytophagia; o__Cytophagales; f__Hymenobacteraceae; g__Hymenobacter; s__,k__Bacteria; p__Bacteroidota; c__Sphingobacteriia; o__Sphingobacteriales; f__Sphingobacteriaceae; g__; s__
1030.kraken2.report,0.001200,0.101359,0.022759,0.011716,0.000576,0.000624,0.000240,0.000144,0.016133,0.008066,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000e+00
1037.kraken2.report,0.025683,0.023230,0.000000,0.000199,0.000000,0.000000,0.000000,0.000000,0.000166,0.000033,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000e+00
1053.kraken2.report,0.001901,0.054522,0.001872,0.000777,0.000267,0.000062,0.000395,0.000064,0.012216,0.000847,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000e+00
1054.kraken2.report,0.003083,0.035217,0.000681,0.000569,0.000008,0.000052,0.000070,0.000102,0.007793,0.003352,...,0.000018,0.000012,0.000002,0.000197,0.000112,0.000007,0.000059,0.000118,0.000028,1.423687e-07
1066.kraken2.report,0.001843,0.047534,0.005289,0.001140,0.000197,0.000406,0.004901,0.000131,0.000869,0.000314,...,0.000668,0.000310,0.000009,0.000210,0.000149,0.000035,0.000122,0.000009,0.000004,4.367690e-06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1662.kraken2.report,0.001306,0.022051,0.023173,0.000380,0.000014,0.000131,0.005200,0.009850,0.000478,0.000140,...,0.001534,0.000642,0.000087,0.000349,0.000089,0.000003,0.000252,0.000002,0.000003,1.362094e-05
1664.kraken2.report,0.003092,0.008010,0.002108,0.001265,0.000000,0.000000,0.000281,0.000000,0.001967,0.000000,...,0.000281,0.000141,0.000000,0.000422,0.001124,0.001405,0.000000,0.000000,0.000141,0.000000e+00
1665.kraken2.report,0.000741,0.021554,0.000533,0.000045,0.000009,0.000000,0.000000,0.000018,0.000063,0.000253,...,0.000000,0.000000,0.000018,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000e+00
1672.kraken2.report,0.004128,0.024237,0.000336,0.000125,0.000125,0.000000,0.000227,0.000000,0.001212,0.000383,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000016,0.000000,0.000008,7.818242e-06


In [3]:
# Substrings that mark a taxonomy column for removal: lineages with no
# genus-level assignment ('g__; s__', which also covers 'f__; g__; s__'),
# followed by a list of genus names to exclude (presumably known contaminant
# genera -- TODO confirm the source of this list).
# NOTE(review): matching below is a plain substring test over the whole
# lineage string, so a name such as 'Burkholderia' also matches
# 'o__Burkholderiales' and removes every taxon under that order, not just the
# genus itself -- confirm this is intended.
invalid_names = ['f__; g__; s__','g__; s__','Afipia','Abiotrophia','Acidovorax','Acinetobacter','Aeromicrobium','Aquabacterium','Arthrobacter','Asticcacaulis','Aurantimonas','Azoarcus','Azospira','Bacillus','Beijerinckia','Beutenbergia','Bosea','Bradyrhizobium','Brevibacillus','Brevundimonas','Brochothrix','Burkholderia','Caulobacter','Chryseobacterium','Corynebacterium','Craurococcus','Curtobacterium','Deinococcus','Devosia','Dietzia','Dyadobacter','Enhydrobacter','Facklamia','Flavobacterium','Geodermatophilus','Hoeflea','Hydrotalea','Janibacter','Kingella','Kocuria','Leptothrix','Limnobacter','Massilia','Mesorhizobium','Methylobacterium','Methylophilus','Methyloversatilis','Microbacterium','Micrococcus','Microlunatus','Nevskia','Niastella','Novosphingobium','Ochrobactrum','Olivibacter','Oxalobacter','Paenibacillus','Paracoccus','Patulibacter','Pedobacter','Pedomicrobium','Pelomonas','Phyllobacterium','Polaromonas','Propionibacterium','Pseudoxanthomonas','Psychrobacter','Ralstonia','Rhizobium','Rhodococcus','Roseomonas','Schlegelella','Sphingobium','Sphingomonas','Sphingopyxis','Stenotrophomonas','Sulfuritalea','Tsukamurella','Undibacterium','Variovorax','Wautersiella','Xanthomonas']

# Remove columns with invalid names. The columns to keep are selected in one
# step rather than calling drop(..., inplace=True) once per column while
# looping over the frame's own columns: that pattern rebuilds the frame on
# every hit and raises KeyError if a label occurs more than once.
is_invalid = [
    any(invalid_name in col for invalid_name in invalid_names)
    for col in SG_BA.columns
]
SG_BA = SG_BA.loc[:, [not flagged for flagged in is_invalid]]

# Reduce a full lineage label to its genus-level part
def extract_taxonomy(column):
    """Return the genus-level portion of a '; '-separated lineage string.

    The string is split on '; ' and only the pieces that start with 'g__'
    are kept, joined back together with '; '. When no piece starts with
    'g__', the input string is returned unchanged.
    """
    genus_parts = []
    for level in column.split('; '):
        if level.startswith('g__'):
            genus_parts.append(level)
    if not genus_parts:
        return column
    return '; '.join(genus_parts)

# Relabel every column with its genus-level name.
new_columns = [extract_taxonomy(column) for column in SG_BA.columns]
SG_BA.columns = new_columns

# Sum the columns that now share the same genus label. The grouping is done on
# the transposed frame because DataFrame.groupby(..., axis=1) is deprecated in
# pandas and emits a FutureWarning; transposing back restores the original
# row/column orientation. Result columns are the sorted unique labels, as before.
SG_BA = SG_BA.T.groupby(level=0).sum().T

# Helper that turns an index label into an 'X'-prefixed identifier
def rename_index(index_name):
    """Return 'X' followed by the part of ``index_name`` before its first '.'.

    For example '1030.kraken2.report' becomes 'X1030'. A label without any
    '.' is kept whole and simply gets the 'X' prefix.
    """
    prefix, _, _ = index_name.partition('.')
    return 'X' + prefix

# Rewrite every row label with rename_index and name the index 'patient_id'
SG_BA.index = SG_BA.index.map(rename_index).rename('patient_id')
SG_BA

  SG_BA = SG_BA.groupby(SG_BA.columns, axis=1).sum()


Unnamed: 0_level_0,g__Acetivibrio,g__Acetobacterium,g__Acetobacteroides,g__Acidaminococcus,g__Actinobacillus,g__Actinomyces,g__Acutalibacter,g__Adlercreutzia,g__Aeromonas,g__Agathobacter,...,g__Tyzzerella,g__Veillonella,g__Vescimonas,g__Vibrio,g__Vogesella,g__Vulcanimicrobium,g__Wansuia,g__Xenorhabdus,g__Yersinia,g__Zobellia
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
X1030,0.000096,0.000000,0.000000,0.000048,0.000000,0.000000,0.000192,0.000000,0.000144,0.011379,...,0.000000,3.860374e-02,0.000816,0.000384,0.000000e+00,0.000096,0.001632,0.000144,1.920584e-04,0.000000
X1037,0.000066,0.000000,0.000000,0.000000,0.000000,0.000133,0.000000,0.000000,0.000265,0.000033,...,0.002485,2.733961e-02,0.000033,0.001094,1.656946e-04,0.000000,0.000099,0.000066,1.988335e-04,0.000000
X1053,0.000052,0.000004,0.000000,0.000008,0.000014,0.000041,0.000031,0.000064,0.021685,0.000477,...,0.000019,3.037008e-04,0.007378,0.000157,4.131984e-06,0.000004,0.000041,0.000027,5.578178e-05,0.000014
X1054,0.000041,0.000005,0.000002,0.000130,0.000001,0.000002,0.000152,0.000006,0.000021,0.000210,...,0.000027,9.965812e-07,0.001147,0.000014,5.694749e-07,0.000007,0.000037,0.000000,2.847375e-07,0.000006
X1066,0.000061,0.000083,0.000009,0.000515,0.000013,0.000079,0.000459,0.000044,0.000585,0.041445,...,0.000017,8.765953e-03,0.009666,0.000144,4.367690e-06,0.000013,0.000218,0.000022,6.551534e-05,0.000009
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
X1662,0.000044,0.000049,0.000087,0.000049,0.000000,0.000005,0.000073,0.000014,0.000063,0.018181,...,0.000143,1.225885e-04,0.003509,0.000024,0.000000e+00,0.000003,0.000078,0.000000,6.810470e-06,0.000019
X1664,0.000000,0.000000,0.000000,0.000281,0.000000,0.000000,0.000000,0.000000,0.000000,0.000843,...,0.000281,7.026419e-04,0.002811,0.000000,0.000000e+00,0.000141,0.000000,0.000000,1.405284e-04,0.000000
X1665,0.000009,0.000000,0.000018,0.000000,0.000018,0.000000,0.000000,0.000000,0.000090,0.008428,...,0.000009,1.111131e-03,0.000000,0.000117,2.710076e-05,0.000018,0.000163,0.000009,1.264702e-04,0.000000
X1672,0.000016,0.000016,0.000000,0.000242,0.000000,0.000000,0.000000,0.000008,0.000047,0.000063,...,0.000516,3.909121e-05,0.001071,0.000023,0.000000e+00,0.000016,0.000031,0.000000,0.000000e+00,0.000016


In [5]:
# Load the metadata table (the first CSV column becomes the row index) and
# drop every row that has a missing value in any column.
metadata = pd.read_csv("SG_metadata.csv", sep=",", index_col=0).dropna()
metadata

Unnamed: 0,patient_id,TMB,KRAS,BRAF,NRAS,TP53,APC,PIK3CA,PIK3R1,SMAD4,...,Age.at.Diagnosis,Site.of.Primary.Colorectal.tumour,Side,Grade,TNM,Stage,iCMS,CMS,group3,group5
3,X106,1028,wt,wt,wt,wt,wt,wt,wt,wt,...,72.0,Transverse colon,Right,3,T4aN2(4/29)M0,IIIC,iCMS3,CMS1,iCMS3_MSI,iCMS3_MSI
4,X153,80,mut,wt,wt,mut,mut,mut,wt,wt,...,66.0,Sigmoid colon,Left,2,T3N1(1/18)M0,IIIB,iCMS2,CMS4,iCMS2_MSS,iCMS2_fibrotic
5,X326,1512,mut,wt,wt,wt,wt,wt,wt,wt,...,55.0,Rectosigmoid junction,Left,2,Unknown,II,iCMS3,CMS1,iCMS3_MSI,iCMS3_MSI
7,X420,1976,mut,wt,wt,mut,mut,wt,wt,wt,...,58.0,Sigmoid colon,Left,2,T3N1(1/23)M0,IIIB,iCMS3,CMS1,iCMS3_MSI,iCMS3_MSI
8,X470,739,wt,wt,wt,wt,wt,wt,mut,mut,...,53.0,Ascending colon,Right,2,T3N1(1/11)M0,IIIB,iCMS3,CMS1,iCMS3_MSI,iCMS3_MSI
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142,X1658,82,mut,wt,wt,mut,mut,wt,wt,wt,...,59.0,Rectum,Left,2,T3N2b(8/22)M1,IV,iCMS2,CMS2,iCMS2_MSS,iCMS2_MSS
143,X1662,74,mut,wt,wt,mut,wt,wt,wt,wt,...,67.0,Rectum,Left,2,T3N1a(1/36)M0,IIIB,iCMS3,CMS3,iCMS3_MSS,iCMS3_MSS
144,X1664,46,mut,wt,wt,mut,wt,wt,wt,wt,...,62.0,Sigmoid colon,Left,2,T3N2a(6/29)M0,IIIC,iCMS2,CMS4,iCMS2_MSS,iCMS2_fibrotic
145,X1665,93,mut,wt,wt,mut,mut,wt,wt,wt,...,78.0,Sigmoid colon,Left,2,T3N1c(0/15)M0,IIIB,iCMS2,CMS2,iCMS2_MSS,iCMS2_MSS


In [6]:
# Inner-join the abundance table with the metadata on 'patient_id', so only
# patients present in both tables remain.
SG_CRC_BA = SG_BA.merge(metadata, how='inner', on='patient_id')
SG_CRC_BA

Unnamed: 0,patient_id,g__Acetivibrio,g__Acetobacterium,g__Acetobacteroides,g__Acidaminococcus,g__Actinobacillus,g__Actinomyces,g__Acutalibacter,g__Adlercreutzia,g__Aeromonas,...,Age.at.Diagnosis,Site.of.Primary.Colorectal.tumour,Side,Grade,TNM,Stage,iCMS,CMS,group3,group5
0,X1030,0.000096,0.000000,0.000000,0.000048,0.000000,0.000000,0.000192,0.000000,0.000144,...,72.0,Sigmoid colon,Left,2,pT3N0(0/20)M0,IIA,iCMS2,CMS2,iCMS2_MSS,iCMS2_MSS
1,X106,0.000045,0.000000,0.000000,0.000119,0.000000,0.000015,0.000030,0.000030,0.000015,...,72.0,Transverse colon,Right,3,T4aN2(4/29)M0,IIIC,iCMS3,CMS1,iCMS3_MSI,iCMS3_MSI
2,X1076,0.000000,0.000000,0.000000,0.000085,0.000000,0.000170,0.000000,0.000000,0.001190,...,45.0,Cecum,Right,2,pT3N2(6/12)M1,IV,iCMS2,CMS2,iCMS2_MSS,iCMS2_MSS
3,X1079,0.000018,0.000027,0.000002,0.006718,0.000006,0.000010,0.000005,0.000022,0.000166,...,65.0,Splenic flexure,Left,2,pT4aN0(0/6)M1,IV,iCMS3,CMS4,iCMS3_MSS,iCMS3_fibrotic
4,X1117,0.000000,0.000000,0.000000,0.000519,0.000000,0.000000,0.000000,0.000000,0.000000,...,45.0,Rectosigmoid junction,Left,2,pT3N0(0/12)M1,IV,iCMS2,CMS2,iCMS2_MSS,iCMS2_MSS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101,X1658,0.000025,0.000025,0.000025,0.000738,0.000000,0.000000,0.000000,0.000051,0.000025,...,59.0,Rectum,Left,2,T3N2b(8/22)M1,IV,iCMS2,CMS2,iCMS2_MSS,iCMS2_MSS
102,X1662,0.000044,0.000049,0.000087,0.000049,0.000000,0.000005,0.000073,0.000014,0.000063,...,67.0,Rectum,Left,2,T3N1a(1/36)M0,IIIB,iCMS3,CMS3,iCMS3_MSS,iCMS3_MSS
103,X1664,0.000000,0.000000,0.000000,0.000281,0.000000,0.000000,0.000000,0.000000,0.000000,...,62.0,Sigmoid colon,Left,2,T3N2a(6/29)M0,IIIC,iCMS2,CMS4,iCMS2_MSS,iCMS2_fibrotic
104,X1665,0.000009,0.000000,0.000018,0.000000,0.000018,0.000000,0.000000,0.000000,0.000090,...,78.0,Sigmoid colon,Left,2,T3N1c(0/15)M0,IIIB,iCMS2,CMS2,iCMS2_MSS,iCMS2_MSS


In [7]:
# Write the merged table to CSV without the row index.
SG_CRC_BA.to_csv(path_or_buf='SG_CRC_BA.csv', index=False)