In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer

In [8]:
# Load the raw classification table; the first CSV column becomes the row index.
abundance = pd.read_csv(
    "SingaporeCRC_data/kr2_classification_all_CRC_samples.csv",
    index_col=0,
    delimiter=",",
)

# Keep only the rows whose values add up to more than 0
# (axis=1 sums across each row, so this filters rows, not columns).
row_totals = abundance.sum(axis=1)
abundance = abundance.loc[row_totals > 0]
print(abundance.shape)

(162, 11196)


In [9]:
# Fill in missing abundance values with k-nearest-neighbour imputation.
feature_names = abundance.columns
imputer = KNNImputer(n_neighbors=5, weights="uniform")

# The imputer is fitted on the transposed table (the original columns become
# its rows); the result is transposed back so rows/columns line up with
# `abundance` again. The new frame gets a default 0..n-1 row index.
abundance_imputed = pd.DataFrame(
    imputer.fit_transform(abundance.T).T,
    columns=feature_names,
)

# NOTE(review): this puts the original (un-imputed) first row back, so any
# missing values in that row are restored as well - confirm this is intended.
abundance_imputed.iloc[0] = abundance.iloc[0]

In [10]:
# Column labels containing any of these substrings are discarded: labels with
# no family/genus/species information, and labels containing 'virus'.
# The match is a plain, case-sensitive substring test.
invalid_names = ['f__; g__; s__','g__; s__','virus']

# Compute one keep/drop flag per column and select the survivors in a single
# pass. Dropping column-by-column with inplace=True rebuilds the frame on
# every drop and raises KeyError if a label occurs more than once (the first
# drop removes every column carrying that label).
keep_column = [
    not any(invalid_name in col for invalid_name in invalid_names)
    for col in abundance_imputed.columns
]
# .copy() gives an independent frame that later cells can safely modify.
abundance_imputed = abundance_imputed.loc[:, keep_column].copy()

# Reduce a full taxonomy label to its family / genus / species parts
def extract_taxonomy(column):
    """Return only the family, genus and species ranks of a taxonomy label.

    The label is split on '; ' and every piece that starts with 'f__',
    'g__' or 's__' is kept, in its original order. The kept pieces are
    joined back together with '; '. If no piece matches, an empty string
    is returned.
    """
    wanted_prefixes = ('f__', 'g__', 's__')
    kept_ranks = []
    for rank in column.split('; '):
        if rank.startswith(wanted_prefixes):
            kept_ranks.append(rank)
    return '; '.join(kept_ranks)

# Replace every column label with its reduced family/genus/species form.
new_columns = list(map(extract_taxonomy, abundance_imputed.columns))
abundance_imputed.columns = new_columns

# Prepend the patient ids as the first column, 'patient_id1', for the later
# "merge" step. The ids come from the row index of the raw table, cast to
# float, and are assigned by position.
# insert() raises if 'patient_id1' already exists, so this cell cannot be
# re-run on its own without rebuilding `abundance_imputed` first.
pid = abundance.index.astype('float')
abundance_imputed.insert(loc=0, column='patient_id1', value=pid)

abundance_imputed

Unnamed: 0,patient_id1,f__Burkholderiaceae; g__Cupriavidus; s__,f__Burkholderiaceae; g__Cupriavidus; s__taiwanensis,f__Burkholderiaceae; g__Cupriavidus; s__metallidurans,f__Burkholderiaceae; g__Cupriavidus; s__neocaledonicus,f__Burkholderiaceae; g__Cupriavidus; s__sp. WKF15,f__Burkholderiaceae; g__Cupriavidus; s__sp. P-10,f__Burkholderiaceae; g__Cupriavidus; s__sp. EM10,f__Burkholderiaceae; g__Cupriavidus; s__basilensis,f__Burkholderiaceae; g__Cupriavidus; s__pauculus,...,f__Morganellaceae; g__Proteus; s__sp. NMG38-2,f__Piscirickettsiaceae; g__Thiomicrorhabdus; s__immobilis,f__Anaplasmataceae; g__Ehrlichia; s__japonica,f__Helicobacteraceae; g__Helicobacter; s__sp. NHP19-012,f__Methanocellaceae; g__Methanocella; s__conradii,f__Sulfolobaceae; g__Acidianus; s__,f__Kangiellaceae; g__Kangiella; s__aquimarina,f__Methylobacteriaceae; g__Methylobacterium; s__sp. OT2,f__Natrialbaceae; g__Haloterrigena; s__,f__Bacillaceae; g__Virgibacillus; s__sp. SK37
0,1312.0,8.0,1196.0,22.0,11.0,4.0,3.0,1.0,5.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1643.0,5.0,2331.0,21.0,17.0,0.0,1.0,0.0,6.0,6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1592.0,6.0,1655.0,11.0,10.0,0.0,0.0,0.0,4.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1586.0,626.0,1834.0,15.0,22.0,0.0,0.0,1.0,1.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1138.0,24.0,760.0,17.0,5.0,1.0,0.0,0.0,5.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
157,1237.0,9.0,621.0,14.0,5.0,0.0,0.0,1.0,7.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
158,1253.0,9.0,673.0,22.0,7.0,0.0,1.0,1.0,6.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
159,1662.0,5.0,711.0,13.0,9.0,0.0,0.0,0.0,3.0,2.0,...,1.0,1.0,1.0,1.0,1.0,3.0,0.0,0.0,0.0,0.0
160,1664.0,9.0,1234.0,18.0,5.0,0.0,0.0,1.0,4.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0


In [11]:
# Load the clinical information table (first CSV column becomes the index)
# and keep only the rows that have no missing value in any column.
metadata = pd.read_csv(
    'SingaporeCRC_data/SG-BULK_patient_clinical_information.csv',
    index_col=0,
    delimiter=",",
).dropna()

# Expose the index as a float column named 'patient_id1' in first position,
# the same column name that is added to the abundance table.
pid_meta = metadata.index.astype('float')
metadata.insert(loc=0, column='patient_id1', value=pid_meta)

metadata

Unnamed: 0_level_0,patient_id1,TMB,KRAS,BRAF,NRAS,TP53,APC,PIK3CA,PIK3R1,SMAD4,...,Age.at.Diagnosis,Site.of.Primary.Colorectal.tumour,Side,Grade,TNM,Stage,iCMS,CMS,group3,group5
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
106,106.0,1028,wt,wt,wt,wt,wt,wt,wt,wt,...,72.0,Transverse colon,Right,3,T4aN2(4/29)M0,IIIC,iCMS3,CMS1,iCMS3_MSI,iCMS3_MSI
153,153.0,80,mut,wt,wt,mut,mut,mut,wt,wt,...,66.0,Sigmoid colon,Left,2,T3N1(1/18)M0,IIIB,iCMS2,CMS4,iCMS2_MSS,iCMS2_fibrotic
269,269.0,10,wt,wt,wt,wt,wt,wt,wt,wt,...,54.0,Rectum,Left,2,T1N1c(0/11)M0,IIIB,iCMS3,CMS4,iCMS3_MSS,iCMS3_fibrotic
326,326.0,1512,mut,wt,wt,wt,wt,wt,wt,wt,...,55.0,Rectosigmoid junction,Left,2,Unknown,II,iCMS3,CMS1,iCMS3_MSI,iCMS3_MSI
420,420.0,1976,mut,wt,wt,mut,mut,wt,wt,wt,...,58.0,Sigmoid colon,Left,2,T3N1(1/23)M0,IIIB,iCMS3,CMS1,iCMS3_MSI,iCMS3_MSI
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1658,1658.0,82,mut,wt,wt,mut,mut,wt,wt,wt,...,59.0,Rectum,Left,2,T3N2b(8/22)M1,IV,iCMS2,CMS2,iCMS2_MSS,iCMS2_MSS
1662,1662.0,74,mut,wt,wt,mut,wt,wt,wt,wt,...,67.0,Rectum,Left,2,T3N1a(1/36)M0,IIIB,iCMS3,CMS3,iCMS3_MSS,iCMS3_MSS
1664,1664.0,46,mut,wt,wt,mut,wt,wt,wt,wt,...,62.0,Sigmoid colon,Left,2,T3N2a(6/29)M0,IIIC,iCMS2,CMS4,iCMS2_MSS,iCMS2_fibrotic
1665,1665.0,93,mut,wt,wt,mut,mut,wt,wt,wt,...,78.0,Sigmoid colon,Left,2,T3N1c(0/15)M0,IIIB,iCMS2,CMS2,iCMS2_MSS,iCMS2_MSS


In [15]:
import re

# Matches the characters '[', ']' and '<', which are replaced in column labels below.
regex = re.compile(r"\[|\]|<", re.IGNORECASE)

# Inner join: only patients present in both tables are kept.
SG_CRC_BA = abundance_imputed.merge(metadata, on='patient_id1', how='inner')

# Swap each offending character for '_' in any label that contains one;
# every other label is passed through untouched.
sanitized_labels = []
for col in SG_CRC_BA.columns.values:
    if any(x in str(col) for x in ('[', ']', '<')):
        sanitized_labels.append(regex.sub("_", col))
    else:
        sanitized_labels.append(col)
SG_CRC_BA.columns = sanitized_labels

SG_CRC_BA


Unnamed: 0,patient_id1,f__Burkholderiaceae; g__Cupriavidus; s__,f__Burkholderiaceae; g__Cupriavidus; s__taiwanensis,f__Burkholderiaceae; g__Cupriavidus; s__metallidurans,f__Burkholderiaceae; g__Cupriavidus; s__neocaledonicus,f__Burkholderiaceae; g__Cupriavidus; s__sp. WKF15,f__Burkholderiaceae; g__Cupriavidus; s__sp. P-10,f__Burkholderiaceae; g__Cupriavidus; s__sp. EM10,f__Burkholderiaceae; g__Cupriavidus; s__basilensis,f__Burkholderiaceae; g__Cupriavidus; s__pauculus,...,Age.at.Diagnosis,Site.of.Primary.Colorectal.tumour,Side,Grade,TNM,Stage,iCMS,CMS,group3,group5
0,1312.0,8.0,1196.0,22.0,11.0,4.0,3.0,1.0,5.0,5.0,...,80.0,Descending colon,Left,2,T2N0(0/18)M0,I,iCMS2,CMS2,iCMS2_MSS,iCMS2_MSS
1,1643.0,5.0,2331.0,21.0,17.0,0.0,1.0,0.0,6.0,6.0,...,65.0,Rectum,Left,2,T2N0(0/20)M0,I,iCMS2,CMS2,iCMS2_MSS,iCMS2_MSS
2,1586.0,626.0,1834.0,15.0,22.0,0.0,0.0,1.0,1.0,5.0,...,63.0,Sigmoid colon,Left,2,T2N0(0/9)M0,I,iCMS3,CMS3,iCMS3_MSS,iCMS3_MSS
3,1138.0,24.0,760.0,17.0,5.0,1.0,0.0,0.0,5.0,1.0,...,51.0,Sigmoid colon,Left,2,pT3N2(7/29)M1,IV,iCMS2,CMS2,iCMS2_MSS,iCMS2_MSS
4,609.0,4.0,1041.0,12.0,8.0,0.0,0.0,2.0,2.0,6.0,...,84.0,Ascending colon,Right,2,T3N0(0/12)M0,IIA,iCMS3,CMS3,iCMS3_MSI,iCMS3_MSI
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108,1237.0,9.0,621.0,14.0,5.0,0.0,0.0,1.0,7.0,2.0,...,65.0,Cecum,Right,2,T3N0(0/25)M0,IIA,iCMS2,CMS2,iCMS2_MSS,iCMS2_MSS
109,1253.0,9.0,673.0,22.0,7.0,0.0,1.0,1.0,6.0,3.0,...,39.0,Sigmoid colon,Left,2,T3N0(0/18)M0,IIB,iCMS3,CMS1,iCMS3_MSI,iCMS3_MSI
110,1662.0,5.0,711.0,13.0,9.0,0.0,0.0,0.0,3.0,2.0,...,67.0,Rectum,Left,2,T3N1a(1/36)M0,IIIB,iCMS3,CMS3,iCMS3_MSS,iCMS3_MSS
111,1664.0,9.0,1234.0,18.0,5.0,0.0,0.0,1.0,4.0,3.0,...,62.0,Sigmoid colon,Left,2,T3N2a(6/29)M0,IIIC,iCMS2,CMS4,iCMS2_MSS,iCMS2_fibrotic


In [18]:
# Write the final merged, pre-processed table to disk as CSV, without the row index.
SG_CRC_BA.to_csv(path_or_buf='SingaporeCRC_data/SG_CRC_BA', index=False)