In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import StandardScaler

In [2]:
abundance = pd.read_csv("SingaporeCRC_data/kr2_classification_all_CRC_samples.csv", index_col=0,delimiter=",") # Load raw data
abundance = abundance[abundance.sum(axis = 1) > 0] # remove columns that all values are 0
abundance = pd.DataFrame(abundance)
print(abundance.shape)

(162, 11196)


In [3]:
# Impute for the missing values
feature_names = abundance.columns
imputer = KNNImputer(n_neighbors=5, weights="uniform")
abundance_imputed = imputer.fit_transform(abundance.T)
abundance_imputed = pd.DataFrame(abundance_imputed.T, columns = feature_names)
abundance_imputed.iloc[0] = abundance.iloc[0]

In [4]:
random_seed = 42
# Define a list of invalid names: no information or virus are invalid
invalid_names = ['f__; g__; s__','g__; s__','virus','Afipia','Abiotrophia','Acidovorax','Acinetobacter','Aeromicrobium','Aquabacterium','Arthrobacter','Asticcacaulis','Aurantimonas','Azoarcus','Azospira','Bacillus','Beijerinckia','Beutenbergia','Bosea','Bradyrhizobium','Brevibacillus','Brevundimonas','Brochothrix','Burkholderia','Caulobacter','Chryseobacterium','Corynebacterium','Craurococcus','Curtobacterium','Deinococcus','Devosia','Dietzia','Dyadobacter','Enhydrobacter','Enterobacter','Escherichia','Facklamia','Flavobacterium','Geodermatophilus','Hoeflea','Hydrotalea','Janibacter','Kingella','Kocuria','Leptothrix','Limnobacter','Massilia','Mesorhizobium','Methylobacterium','Methylophilus','Methyloversatilis','Microbacterium','Micrococcus','Microlunatus','Nevskia','Niastella','Novosphingobium','Ochrobactrum','Olivibacter','Oxalobacter','Paenibacillus','Paracoccus','Patulibacter','Pedobacter','Pedomicrobium','Pelomonas','Phyllobacterium','Polaromonas','Propionibacterium','Pseudomonas','Pseudoxanthomonas','Psychrobacter','Ralstonia','Rhizobium','Rhodococcus','Roseomonas','Schlegelella','Sphingobium','Sphingomonas','Sphingopyxis','Stenotrophomonas','Streptococcus','Sulfuritalea','Tsukamurella','Undibacterium','Variovorax','Wautersiella','Xanthomonas']

for col in abundance_imputed.columns:
    if any(invalid_name in col for invalid_name in invalid_names):
        abundance_imputed.drop(col, axis=1, inplace=True)

# Extract only genera from existing column names
def extract_taxonomy(column):
    return '; '.join([t for t in column.split('; ') if t.startswith('g__')])

new_columns = [extract_taxonomy(column) for column in abundance_imputed.columns]

# Rename columns
abundance_imputed.columns = new_columns

abundance_imputed

Unnamed: 0,g__Chromobacterium,g__Chromobacterium.1,g__Chromobacterium.2,g__Chromobacterium.3,g__Vogesella,g__Paludibacterium,g__Paludibacterium.1,g__Paludibacterium.2,g__Aquitalea,g__Neisseria,...,g__Marivirga,g__Spiroplasma,g__Thiomicrorhabdus,g__Ehrlichia,g__Helicobacter,g__Methanocella,g__Acidianus,g__Kangiella,g__Haloterrigena,g__Virgibacillus
0,4.0,2.0,2.0,1.0,5.0,2.0,1.0,1.0,4.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,1.0,0.0,0.0,0.0,2.0,0.0,0.0,9.0,24.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,3.0,20.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2.0,1.0,0.0,1.0,2.0,4.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2.0,0.0,0.0,0.0,2.0,2.0,0.0,2.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
157,5.0,0.0,0.0,0.0,1.0,3.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
158,6.0,0.0,1.0,0.0,8.0,2.0,0.0,0.0,6.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
159,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,5.0,35.0,...,1.0,1.0,1.0,1.0,1.0,1.0,3.0,0.0,0.0,0.0
160,5.0,1.0,0.0,0.0,1.0,2.0,0.0,2.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0


In [5]:
# Identify duplicate columns
duplicate_columns = abundance_imputed.columns[abundance_imputed.columns.duplicated()].tolist()

# Function to aggregate duplicate columns
def aggregate_duplicate_columns(abundance_imputed, aggfunc='sum'):
    # Group by column names and aggregate
    return abundance_imputed.groupby(abundance_imputed.columns, axis=1).agg(aggfunc)

# Aggregate duplicate columns by summing them
abundance_cleaned = aggregate_duplicate_columns(abundance_imputed, aggfunc='sum')

# Add a column containing original patient id for later "merge" step
pid = abundance.index.astype('float')
abundance_cleaned.insert(0,'patient_id1',pid)

print("DataFrame after aggregating duplicate columns:")
abundance_cleaned

DataFrame after aggregating duplicate columns:


  return abundance_imputed.groupby(abundance_imputed.columns, axis=1).agg(aggfunc)


Unnamed: 0,patient_id1,g__Abyssalbus,g__Abyssibius,g__Abyssicoccus,g__Acaryochloris,g__Aceticella,g__Acetilactobacillus,g__Acetivibrio,g__Acetoanaerobium,g__Acetobacter,...,g__Zhaonella,g__Zhihengliuella,g__Zhongshania,g__Zhouia,g__Zobellella,g__Zobellia,g__Zophobihabitans,g__Zunongwangia,g__Zymobacter,g__Zymomonas
0,1312.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,3.0,0.0
1,1643.0,0.0,3.0,0.0,0.0,0.0,0.0,9.0,0.0,5.0,...,1.0,0.0,0.0,0.0,1.0,7.0,0.0,0.0,5.0,0.0
2,1592.0,0.0,0.0,0.0,0.0,2.0,1.0,74.0,0.0,4.0,...,1.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,3.0,0.0
3,1586.0,0.0,1.0,0.0,0.0,5.0,0.0,66.0,0.0,1.0,...,3.0,0.0,0.0,0.0,0.0,11.0,0.0,0.0,3.0,0.0
4,1138.0,0.0,2.0,0.0,0.0,0.0,1.0,19.0,5.0,1.0,...,0.0,1.0,0.0,0.0,2.0,3.0,0.0,14.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
157,1237.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,2.0,2.0,1.0,0.0,2.0,0.0
158,1253.0,0.0,2.0,0.0,0.0,0.0,5.0,36.0,8.0,0.0,...,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
159,1662.0,3.0,1.0,0.0,1.0,0.0,5.0,111.0,7.0,1.0,...,2.0,0.0,0.0,0.0,0.0,14.0,0.0,99.0,0.0,0.0
160,1664.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0


In [6]:
metadata = pd.read_csv('SingaporeCRC_data/SG-BULK_patient_clinical_information.csv', index_col=0,delimiter=",")
metadata = metadata.dropna()
metadata = pd.DataFrame(metadata)
pid_meta = metadata.index.astype('float')
metadata.insert(0,'patient_id1',pid_meta)
metadata

Unnamed: 0_level_0,patient_id1,TMB,KRAS,BRAF,NRAS,TP53,APC,PIK3CA,PIK3R1,SMAD4,...,Age.at.Diagnosis,Site.of.Primary.Colorectal.tumour,Side,Grade,TNM,Stage,iCMS,CMS,group3,group5
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
106,106.0,1028,wt,wt,wt,wt,wt,wt,wt,wt,...,72.0,Transverse colon,Right,3,T4aN2(4/29)M0,IIIC,iCMS3,CMS1,iCMS3_MSI,iCMS3_MSI
153,153.0,80,mut,wt,wt,mut,mut,mut,wt,wt,...,66.0,Sigmoid colon,Left,2,T3N1(1/18)M0,IIIB,iCMS2,CMS4,iCMS2_MSS,iCMS2_fibrotic
269,269.0,10,wt,wt,wt,wt,wt,wt,wt,wt,...,54.0,Rectum,Left,2,T1N1c(0/11)M0,IIIB,iCMS3,CMS4,iCMS3_MSS,iCMS3_fibrotic
326,326.0,1512,mut,wt,wt,wt,wt,wt,wt,wt,...,55.0,Rectosigmoid junction,Left,2,Unknown,II,iCMS3,CMS1,iCMS3_MSI,iCMS3_MSI
420,420.0,1976,mut,wt,wt,mut,mut,wt,wt,wt,...,58.0,Sigmoid colon,Left,2,T3N1(1/23)M0,IIIB,iCMS3,CMS1,iCMS3_MSI,iCMS3_MSI
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1658,1658.0,82,mut,wt,wt,mut,mut,wt,wt,wt,...,59.0,Rectum,Left,2,T3N2b(8/22)M1,IV,iCMS2,CMS2,iCMS2_MSS,iCMS2_MSS
1662,1662.0,74,mut,wt,wt,mut,wt,wt,wt,wt,...,67.0,Rectum,Left,2,T3N1a(1/36)M0,IIIB,iCMS3,CMS3,iCMS3_MSS,iCMS3_MSS
1664,1664.0,46,mut,wt,wt,mut,wt,wt,wt,wt,...,62.0,Sigmoid colon,Left,2,T3N2a(6/29)M0,IIIC,iCMS2,CMS4,iCMS2_MSS,iCMS2_fibrotic
1665,1665.0,93,mut,wt,wt,mut,mut,wt,wt,wt,...,78.0,Sigmoid colon,Left,2,T3N1c(0/15)M0,IIIB,iCMS2,CMS2,iCMS2_MSS,iCMS2_MSS


In [7]:
import re
regex = re.compile(r"\[|\]|<", re.IGNORECASE) # In case there are invalid string in feature names

SG_CRC_BA = pd.merge(abundance_cleaned, metadata, on='patient_id1', how='inner')
SG_CRC_BA.columns = [regex.sub("_",col) if any(x in str(col) for x in set(('[', ']', '<'))) else col for col in SG_CRC_BA.columns.values]
SG_CRC_BA
#SG_CRC.to_csv('merged_CRC', index = False)


Unnamed: 0,patient_id1,g__Abyssalbus,g__Abyssibius,g__Abyssicoccus,g__Acaryochloris,g__Aceticella,g__Acetilactobacillus,g__Acetivibrio,g__Acetoanaerobium,g__Acetobacter,...,Age.at.Diagnosis,Site.of.Primary.Colorectal.tumour,Side,Grade,TNM,Stage,iCMS,CMS,group3,group5
0,1312.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,80.0,Descending colon,Left,2,T2N0(0/18)M0,I,iCMS2,CMS2,iCMS2_MSS,iCMS2_MSS
1,1643.0,0.0,3.0,0.0,0.0,0.0,0.0,9.0,0.0,5.0,...,65.0,Rectum,Left,2,T2N0(0/20)M0,I,iCMS2,CMS2,iCMS2_MSS,iCMS2_MSS
2,1586.0,0.0,1.0,0.0,0.0,5.0,0.0,66.0,0.0,1.0,...,63.0,Sigmoid colon,Left,2,T2N0(0/9)M0,I,iCMS3,CMS3,iCMS3_MSS,iCMS3_MSS
3,1138.0,0.0,2.0,0.0,0.0,0.0,1.0,19.0,5.0,1.0,...,51.0,Sigmoid colon,Left,2,pT3N2(7/29)M1,IV,iCMS2,CMS2,iCMS2_MSS,iCMS2_MSS
4,609.0,0.0,1.0,0.0,0.0,1.0,0.0,12.0,0.0,0.0,...,84.0,Ascending colon,Right,2,T3N0(0/12)M0,IIA,iCMS3,CMS3,iCMS3_MSI,iCMS3_MSI
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108,1237.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,...,65.0,Cecum,Right,2,T3N0(0/25)M0,IIA,iCMS2,CMS2,iCMS2_MSS,iCMS2_MSS
109,1253.0,0.0,2.0,0.0,0.0,0.0,5.0,36.0,8.0,0.0,...,39.0,Sigmoid colon,Left,2,T3N0(0/18)M0,IIB,iCMS3,CMS1,iCMS3_MSI,iCMS3_MSI
110,1662.0,3.0,1.0,0.0,1.0,0.0,5.0,111.0,7.0,1.0,...,67.0,Rectum,Left,2,T3N1a(1/36)M0,IIIB,iCMS3,CMS3,iCMS3_MSS,iCMS3_MSS
111,1664.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,62.0,Sigmoid colon,Left,2,T3N2a(6/29)M0,IIIC,iCMS2,CMS4,iCMS2_MSS,iCMS2_fibrotic


In [8]:
# Export final pre-processed meta-transcriptomics data
SG_CRC_BA.to_csv('SingaporeCRC_data/SG_CRC_BA', index=False)