<a href="https://colab.research.google.com/github/Hkd225/crc-gut-microbiome-ml-classification/blob/main/crc_gut_microbiome_ml_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import os
import kagglehub
import matplotlib.pyplot as plt
import re

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

# --- Locate the dataset -------------------------------------------------
# kagglehub returns the local directory that holds the downloaded files.
path = kagglehub.dataset_download("aramelheni/crc-gut-microbiome-ml-data")
path_metadata, path_microbiome, path_species = (
    os.path.join(path, nama_berkas)
    for nama_berkas in ('metadata.csv', 'seqtab_nochim_export.xlsx', 'taxa_species_export.xlsx')
)

# --- Read the three tables ----------------------------------------------
print("Sedang membaca data...")
df_meta = pd.read_csv(path_metadata, sep=';')
df_micro = pd.read_excel(path_microbiome)
df_taxa = pd.read_excel(path_species)

# --- Join metadata with the abundance table -----------------------------
# The first column of the abundance table is used as the sample identifier.
nama_kolom_id_micro = df_micro.columns[0]
df_micro = df_micro.rename(columns={nama_kolom_id_micro: 'SampleID_micro'})
# NOTE(review): the join key on the metadata side is `host_disease`; confirm
# that this column really carries the sample IDs in metadata.csv.
df_merged = df_meta.merge(
    df_micro, left_on='host_disease', right_on='SampleID_micro', how='inner'
)

# --- Build the feature matrix -------------------------------------------
# Identifier columns and the target are removed; `Sex` becomes 0/1 and every
# remaining missing value (including unmapped `Sex` entries) is set to 0.
kolom_dihapus = ['SampleID', 'SampleID_micro', 'host_disease', 'SampleName', 'DiseaseStatus']
X = (
    df_merged
    .drop(columns=kolom_dihapus, errors='ignore')
    .assign(Sex=lambda tabel: tabel['Sex'].map({'female': 0, 'male': 1}))
    .fillna(0)
)
# Collapse brackets, angle brackets and whitespace in column names into "_"
# (presumably because XGBoost rejects such feature names - confirm).
pola_nama_kolom = re.compile(r'[\[\]<>\s]+')
X.columns = [pola_nama_kolom.sub('_', col) for col in X.columns]

# --- Encode the target and split ----------------------------------------
le_target = LabelEncoder()
y = df_merged['DiseaseStatus']
y_encoded = le_target.fit_transform(y)

# Stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, stratify=y_encoded, test_size=0.2, random_state=42
)

# --- Rank features with a random forest fitted on the training split only ---
print("\nMengekstrak Top 200 fitur menggunakan Random Forest...")
rf_selector = RandomForestClassifier(n_estimators=100, random_state=42)
rf_selector.fit(X_train, y_train)

# Importance per column; the 200 highest-scoring columns form the reduced set.
importances = pd.Series(data=rf_selector.feature_importances_, index=X_train.columns)
top_200_cols = importances.nlargest(200).index

X_train_200 = X_train[top_200_cols]
X_test_200 = X_test[top_200_cols]

# --- Standardise: each scaler is fitted on training rows, then applied to both splits ---
scaler_all = StandardScaler().fit(X_train)
X_train_scaled = scaler_all.transform(X_train)
X_test_scaled = scaler_all.transform(X_test)

scaler_200 = StandardScaler().fit(X_train_200)
X_train_200_scaled = scaler_200.transform(X_train_200)
X_test_200_scaled = scaler_200.transform(X_test_200)

print("\nMelatih berbagai model...\n")

# --- Tree-based models: trained on the unscaled feature matrix -----------
model_rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# `use_label_encoder` is intentionally not passed: the installed XGBoost does
# not consume it and only emits
#   Parameters: { "use_label_encoder" } are not used.
# for every model. The labels are already integer-encoded by LabelEncoder, so
# no label handling is needed on the XGBoost side.
param_xgb_dasar = dict(n_estimators=100, random_state=42, eval_metric='mlogloss')

model_xgb = XGBClassifier(**param_xgb_dasar).fit(X_train, y_train)

# Same baseline plus explicit L1 (reg_alpha) and L2 (reg_lambda) penalties.
model_xgb_reg = XGBClassifier(
    **param_xgb_dasar,
    reg_alpha=1.5,
    reg_lambda=2.0
).fit(X_train, y_train)

# --- Linear models: trained on standardised features ----------------------
# L1 needs the `saga` solver; `lbfgs` (used for L2 below) does not support it.
logreg_l1_all = LogisticRegression(
    penalty='l1',
    solver='saga',
    max_iter=5000,
    random_state=42
)
logreg_l1_all.fit(X_train_scaled, y_train)

# Both L2 models share one configuration and differ only in their input
# features (all columns vs. the top-200 subset).
param_logreg_l2 = dict(penalty='l2', solver='lbfgs', max_iter=5000, random_state=42)

logreg_l2_all = LogisticRegression(**param_logreg_l2)
logreg_l2_all.fit(X_train_scaled, y_train)

logreg_l2_200 = LogisticRegression(**param_logreg_l2)
logreg_l2_200.fit(X_train_200_scaled, y_train)

def _akurasi_persen(model, fitur_uji):
    """Return the accuracy of `model` on `fitur_uji` against `y_test`, in percent."""
    return accuracy_score(y_test, model.predict(fitur_uji)) * 100


# Test-set accuracy of every trained model, grouped by model family.
print("=== PERBANDINGAN AKURASI KESELURUHAN ===")
print("--- Kelompok Berbasis Pohon (Tree-based) ---")
print(f"1. Random Forest (Baseline)        : {_akurasi_persen(model_rf, X_test):.2f}%")
print(f"2. XGBoost (Standar)               : {_akurasi_persen(model_xgb, X_test):.2f}%")
print(f"3. XGBoost (Regularisasi L1 & L2)  : {_akurasi_persen(model_xgb_reg, X_test):.2f}%\n")

# The linear models are scored on the matching standardised test matrix.
print("--- Kelompok Model Linear (Regularisasi L1 & L2) ---")
print(f"4. LogReg L1 Lasso (Semua Fitur)   : {_akurasi_persen(logreg_l1_all, X_test_scaled):.2f}%")
print(f"5. LogReg L2 Ridge (Semua Fitur)   : {_akurasi_persen(logreg_l2_all, X_test_scaled):.2f}%")
print(f"6. LogReg L2 Ridge (Top 200 Fitur) : {_akurasi_persen(logreg_l2_200, X_test_200_scaled):.2f}%")

Downloading from https://www.kaggle.com/api/v1/datasets/download/aramelheni/crc-gut-microbiome-ml-data?dataset_version_number=1...


100%|██████████| 1.18M/1.18M [00:00<00:00, 28.1MB/s]

Extracting files...
Sedang membaca data...






Mengekstrak Top 200 fitur menggunakan Random Forest...

Melatih berbagai model...



Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


=== PERBANDINGAN AKURASI KESELURUHAN ===
--- Kelompok Berbasis Pohon (Tree-based) ---
1. Random Forest (Baseline)        : 66.67%
2. XGBoost (Standar)               : 58.33%
3. XGBoost (Regularisasi L1 & L2)  : 50.00%

--- Kelompok Model Linear (Regularisasi L1 & L2) ---
4. LogReg L1 Lasso (Semua Fitur)   : 50.00%
5. LogReg L2 Ridge (Semua Fitur)   : 75.00%
6. LogReg L2 Ridge (Top 200 Fitur) : 58.33%


In [2]:


print("\n=== MEMBUAT DATA PASIEN BARU YANG KOMPLEKS (SYNTHETIC PATIENTS) ===")

# Dedicated seeded generator so the synthetic patients (and therefore the
# printed diagnoses) are identical on every Restart & Run All. The previous
# version drew from the unseeded global `np.random` state.
rng_sintetis = np.random.default_rng(42)


def _statistik_kelas(nama_kelas):
    """Return (rows, per-feature mean, per-feature std) of X for one class label."""
    kode_kelas = list(le_target.classes_).index(nama_kelas)
    baris_kelas = X[y_encoded == kode_kelas]
    # std() is NaN when a class has a single sample; use 0 (no spread) so the
    # draw below never produces NaN features.
    return baris_kelas, baris_kelas.mean(), baris_kelas.std().fillna(0)


def _profil_sintetis(komponen):
    """Draw one feature vector as a weighted sum of per-class normal draws.

    komponen: iterable of (mean, std, weight) triples. Each triple contributes
    one independent draw from N(mean, std) scaled by `weight`.
    Returns a 1-D float array clipped at 0.
    """
    profil = np.zeros(len(X.columns), dtype=float)
    for rata_rata, simpangan, bobot in komponen:
        profil += rng_sintetis.normal(
            rata_rata.to_numpy(dtype=float), simpangan.to_numpy(dtype=float)
        ) * bobot
    return np.maximum(0, profil)


X_cancer, mean_cancer, std_cancer = _statistik_kelas('Colorectal cancer')
X_polyp, mean_polyp, std_polyp = _statistik_kelas('Adenomatous Polyps')
X_healthy, mean_healthy, std_healthy = _statistik_kelas('Healthy')

# (row label, mixture components, Age, Sex) for every synthetic patient.
# Age and Sex are fixed by hand and overwrite whatever was drawn for them.
spesifikasi_pasien = [
    ('Pasien 1 (Sintetis Kanker)',
     [(mean_cancer, std_cancer, 1.0)], 72, 1),
    ('Pasien 2 (Sintetis Polip)',
     [(mean_polyp, std_polyp, 1.0)], 58, 0),
    # Borderline case: 80% healthy profile blended with 20% polyp profile.
    ('Pasien 3 (Sintetis Borderline: Sehat-Polip)',
     [(mean_healthy, std_healthy, 0.8), (mean_polyp, std_polyp, 0.2)], 45, 1),
]

data_kompleks = pd.DataFrame(
    np.vstack([_profil_sintetis(komponen) for _, komponen, _, _ in spesifikasi_pasien]),
    index=[nama for nama, _, _, _ in spesifikasi_pasien],
    columns=X.columns,
    dtype=float,
)
for nama, _, usia, jenis_kelamin in spesifikasi_pasien:
    data_kompleks.loc[nama, 'Age'] = usia
    data_kompleks.loc[nama, 'Sex'] = jenis_kelamin

# NOTE: the class statistics come from the full X, which includes the rows
# the model was trained on, so this is a demonstration of the pipeline and
# not an independent validation.
data_kompleks_scaled = scaler_all.transform(data_kompleks)

prediksi_angka = logreg_l2_all.predict(data_kompleks_scaled)
probabilitas = logreg_l2_all.predict_proba(data_kompleks_scaled) * 100
hasil_diagnosis = le_target.inverse_transform(prediksi_angka)

print("\n=== HASIL DIAGNOSIS PASIEN KOMPLEKS ===")
for i, nama_pasien in enumerate(data_kompleks.index):
    print(f"\n[{nama_pasien}]")
    print(f"Prediksi Diagnosis  : >> {hasil_diagnosis[i].upper()} <<")
    print("Probabilitas/Keyakinan:")
    for label, prob in zip(le_target.classes_, probabilitas[i]):
        print(f"  - {label:<20}: {prob:.2f}%")


=== MEMBUAT DATA PASIEN BARU YANG KOMPLEKS (SYNTHETIC PATIENTS) ===

=== HASIL DIAGNOSIS PASIEN KOMPLEKS ===

[Pasien 1 (Sintetis Kanker)]
Prediksi Diagnosis  : >> COLORECTAL CANCER <<
Probabilitas/Keyakinan:
  - Adenomatous Polyps  : 0.00%
  - Colorectal cancer   : 100.00%
  - Healthy             : 0.00%

[Pasien 2 (Sintetis Polip)]
Prediksi Diagnosis  : >> ADENOMATOUS POLYPS <<
Probabilitas/Keyakinan:
  - Adenomatous Polyps  : 100.00%
  - Colorectal cancer   : 0.00%
  - Healthy             : 0.00%

[Pasien 3 (Sintetis Borderline: Sehat-Polip)]
Prediksi Diagnosis  : >> HEALTHY <<
Probabilitas/Keyakinan:
  - Adenomatous Polyps  : 0.09%
  - Colorectal cancer   : 0.01%
  - Healthy             : 99.90%


In [3]:
import joblib

print("=== 1. EKSTRAKSI BIOMARKER KANKER KOLOREKTAL ===")

# Coefficients of the 'Colorectal cancer' class in the L2 logistic model;
# the 15 largest (most positive) weights are inspected, highest first.
idx_cancer = list(le_target.classes_).index('Colorectal cancer')
bobot_kanker = logreg_l2_all.coef_[idx_cancer]
top_15_idx = np.argsort(bobot_kanker)[::-1][:15]

# The first column of the taxonomy table is matched against feature names.
nama_kolom_dna_taxa = df_taxa.columns[0]
tingkat_taksonomi = ('Family', 'Genus', 'Species')
daftar_biomarker = []

for rank, idx in enumerate(top_15_idx, start=1):
    nama_fitur = X.columns[idx]

    # Age and Sex are not taxa, so they are left out of the report. Their rank
    # is still consumed, so the table can have gaps and fewer than 15 rows.
    if nama_fitur in ('Age', 'Sex'):
        continue

    match = df_taxa.loc[df_taxa[nama_kolom_dna_taxa] == nama_fitur]

    if match.empty:
        # Feature has no entry in the taxonomy table.
        taksonomi = dict.fromkeys(tingkat_taksonomi, 'Tak_Ditemukan')
    else:
        # First matching row wins; missing taxonomy levels become 'Unknown'.
        taksonomi = {}
        for tingkat in tingkat_taksonomi:
            nilai_tingkat = match[tingkat].values[0]
            taksonomi[tingkat] = nilai_tingkat if pd.notna(nilai_tingkat) else 'Unknown'

    daftar_biomarker.append({
        'Peringkat': rank,
        **taksonomi,
        'Bobot_Pemicu': round(bobot_kanker[idx], 4),
        'Potongan_DNA': nama_fitur[:15] + "..."
    })

df_biomarker = pd.DataFrame(daftar_biomarker)
display(df_biomarker)

df_biomarker.to_excel('Laporan_Biomarker_Kanker.xlsx', index=False)
print("\n[INFO] Daftar biomarker telah diexport ke 'Laporan_Biomarker_Kanker.xlsx'")

print("\n=======================================================")
print("=== 2. MENYIMPAN MODEL & SCALER (EXPORT KE PICKLE) ===")
print("=======================================================")

# Persist everything needed to score new samples: the model, the scaler it
# was trained with, the label encoder, and the ordered feature-name template.
artefak = (
    (logreg_l2_all, 'model_logreg_l2_crc.pkl'),
    (scaler_all, 'scaler_crc.pkl'),
    (le_target, 'label_encoder_crc.pkl'),
    (list(X.columns), 'fitur_template_crc.pkl'),
)
for objek, nama_berkas in artefak:
    joblib.dump(objek, nama_berkas)

print("✅ BERHASIL! 4 File (.pkl) berikut telah tersimpan di direktori Anda:")
print("  1. model_logreg_l2_crc.pkl")
print("  2. scaler_crc.pkl")
print("  3. label_encoder_crc.pkl")
print("  4. fitur_template_crc.pkl")
print("\nFile-file ini siap diunggah ke GitHub/Server untuk pembuatan Aplikasi Web Medis!")

=== 1. EKSTRAKSI BIOMARKER KANKER KOLOREKTAL ===


Unnamed: 0,Peringkat,Family,Genus,Species,Bobot_Pemicu,Potongan_DNA
0,1,Lachnospiraceae,GCA-900066575,Unknown,0.0152,CCTACGGGTGGCTGC...
1,2,Lachnospiraceae,Lachnospiraceae NK4A136 group,Unknown,0.0151,CCTACGGGGGGCTGC...
2,3,Oscillospiraceae,UCG-002,Unknown,0.0144,CCTACGGGAGGCTGC...
3,4,Fusobacteriaceae,Fusobacterium,Unknown,0.0136,CCTACGGGAGGCTGC...
4,5,Oscillospiraceae,UCG-002,Unknown,0.0132,CCTACGGGAGGCTGC...
5,6,Lachnospiraceae,Blautia,Unknown,0.0131,CCTACGGGAGGCAGC...
6,7,Lachnospiraceae,Blautia,Unknown,0.0131,CCTACGGGAGGCAGC...
7,8,Ruminococcaceae,Subdoligranulum,Unknown,0.0131,CCTACGGGAGGCTGC...
8,9,Lachnospiraceae,Dorea,Unknown,0.0129,CCTACGGGAGGCTGC...
9,10,Lachnospiraceae,Roseburia,Unknown,0.0122,CCTACGGGAGGCAGC...



[INFO] Daftar biomarker telah diexport ke 'Laporan_Biomarker_Kanker.xlsx'

=== 2. MENYIMPAN MODEL & SCALER (EXPORT KE PICKLE) ===
✅ BERHASIL! 4 File (.pkl) berikut telah tersimpan di direktori Anda:
  1. model_logreg_l2_crc.pkl
  2. scaler_crc.pkl
  3. label_encoder_crc.pkl
  4. fitur_template_crc.pkl

File-file ini siap diunggah ke GitHub/Server untuk pembuatan Aplikasi Web Medis!


In [None]:
import joblib
# Reload the persisted artifacts from the current working directory. These
# are the files written by the export step ('model_logreg_l2_crc.pkl' holds
# logreg_l2_all, 'scaler_crc.pkl' holds scaler_all).
# NOTE: joblib.load unpickles arbitrary Python objects; only load files that
# were produced by this notebook or come from a trusted source.
model = joblib.load('model_logreg_l2_crc.pkl')
scaler = joblib.load('scaler_crc.pkl')