In [4]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score,cross_val_predict
from sklearn.metrics import roc_curve, auc
import matplotlib
matplotlib.rcParams.update({'font.size': 20})
import pickle
import platform
import time
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import classification_report, confusion_matrix
from numpy import interp
from sklearn.utils.multiclass import type_of_target
from sklearn.model_selection import GridSearchCV
import os
import argparse

In [9]:

def read_csv(csv_file, nrows=None):
    """Load a CSV into a DataFrame and print a three-line size report.

    Parameters
    ----------
    csv_file : str or path-like
        Anything ``pandas.read_csv`` accepts as its first argument.
    nrows : int or None, optional
        Forwarded to ``pandas.read_csv``; ``None`` reads every row.

    Returns
    -------
    pandas.DataFrame
        The parsed table, unmodified.
    """
    frame = pd.read_csv(csv_file, nrows=nrows)

    n_rows, n_cols = frame.shape
    # Default (shallow) memory_usage(): per-column byte counts plus the index.
    size_gb = frame.memory_usage().sum() / 1024**3

    print(f"File = {csv_file}")
    print(f"Shape = {n_rows:,} rows, {n_cols:,} columns")
    print(f"Memory usage = {size_gb:.2f}GB")
    return frame

def preprocess(df, excluded_cup_ids=(240230,)):
    """Strip metadata columns and split the table into three cohorts.

    The first two columns are removed by position, then a fixed list of
    clinical/metadata columns is dropped by name (a missing name raises
    ``KeyError``, as ``DataFrame.drop`` does by default).  Rows are then
    partitioned on the ``P_grouping`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table; must contain ``P_grouping``, ``ID`` and every dropped column.
    excluded_cup_ids : iterable, optional
        ``ID`` values removed from the CUP cohort.  Defaults to the single ID
        the analysis has always excluded.

    Returns
    -------
    (df_model, df_meta, df_cup) : tuple of pandas.DataFrame
        ``df_model`` holds every row whose ``P_grouping`` is neither
        "NEN liver CUP" nor "NEN liver metastasis"; ``df_meta`` and ``df_cup``
        hold those two groups.  Each frame is an independent copy, so callers
        may assign columns on it without touching ``df`` and without
        triggering ``SettingWithCopyWarning``.
    """
    drop_cols = ['Age.at.diagnosis', "Sentrix_ID_new", 'Ki67', 'index', 'Localization',
                 'Primary', 'G.phase', 'Gender']
    # Positional slice first: the two leading columns go regardless of their names.
    trimmed = df.iloc[:, 2:].drop(columns=drop_cols)

    grouping = trimmed['P_grouping']
    is_cup = grouping == "NEN liver CUP"
    is_meta = grouping == "NEN liver metastasis"

    # Apply the ID exclusion inside the same mask so that .copy() is the last
    # operation; filtering again after the copy would hand back a derived
    # frame that warns on later column assignment.
    keep_cup = is_cup & ~trimmed['ID'].isin(list(excluded_cup_ids))

    df_cup = trimmed[keep_cup].copy()
    df_meta = trimmed[is_meta].copy()
    df_model = trimmed[~(is_cup | is_meta)].copy()

    return df_model, df_meta, df_cup

def encode_labels(dfs):
    """Integer-encode low-cardinality text columns of every frame, in place.

    For each frame in ``dfs``, every column whose dtype is ``object`` and that
    holds at most 20 distinct values is overwritten with the codes produced by
    ``LabelEncoder.fit_transform``.  The encoder is refitted for each column
    of each frame, so codes are not shared between frames.

    Parameters
    ----------
    dfs : iterable of pandas.DataFrame
        Frames to encode; they are modified directly.

    Returns
    -------
    list
        The ``classes_`` array of every fit, appended in visiting order
        (frame by frame, columns left to right).
    """
    ref_encoding = []
    for frame in dfs:
        encoder = LabelEncoder()
        for column in frame.columns:
            values = frame[column]
            # Skip anything that is not a small-cardinality object column.
            if values.dtype != 'object' or values.nunique() > 20:
                continue
            frame[column] = encoder.fit_transform(values)
            ref_encoding.append(encoder.classes_)
    return ref_encoding

def split_features(df):
    """Separate the target column from the feature columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``ID``, ``NEN.type`` and ``P_grouping``.

    Returns
    -------
    (X, y) : tuple
        ``X`` is ``df`` without the three columns above; ``y`` is
        ``P_grouping`` passed through ``pandas.to_numeric``.
    """
    non_feature_cols = ['ID', 'NEN.type', 'P_grouping']
    target = pd.to_numeric(df['P_grouping'])
    features = df.drop(columns=non_feature_cols)
    return features, target

def run_model(model, X, y, ref_classes, label="Model"):
    """Cross-validate ``model`` and return out-of-fold predictions plus a confusion matrix.

    A 3-fold ``StratifiedKFold`` (no shuffling) is used.  The model is
    cross-validated once: the out-of-fold predictions feed both the per-fold
    accuracy report and the confusion matrix, so the printed scores always
    describe the same fits as the returned predictions and the estimator is
    not refitted a second time.

    Parameters
    ----------
    model : scikit-learn estimator
        Cloned and fitted per fold by ``cross_val_predict``.
    X, y : array-like
        Features and integer-encoded target.
    ref_classes : sequence
        Class names used as both row and column labels of the confusion
        matrix; its length must equal the number of distinct labels found in
        ``y`` and the predictions.
    label : str, optional
        Prefix for the printed score lines.

    Returns
    -------
    (y_pred, conf_df) : tuple
        ``y_pred`` is the array of out-of-fold predictions; ``conf_df`` is the
        confusion matrix as a DataFrame (rows = true class, columns =
        predicted class, following ``sklearn.metrics.confusion_matrix``).
    """
    kf = StratifiedKFold(n_splits=3)

    # Single cross-validation pass.
    y_pred = cross_val_predict(model, X, y, cv=kf)

    # Per-fold accuracy from the same predictions.  The splitter does not
    # shuffle, so split() yields exactly the folds used above.
    y_true = np.asarray(y)
    scores = np.array([
        np.mean(y_pred[test_idx] == y_true[test_idx])
        for _, test_idx in kf.split(X, y)
    ])
    print(f"{label} Scores per fold: {scores}")
    print(f"{label} Average score: {scores.mean():.2f}")

    conf_mat = confusion_matrix(y, y_pred)
    conf_df = pd.DataFrame(conf_mat, columns=ref_classes, index=ref_classes)
    return y_pred, conf_df

# Input tables. Both go through the same sequence below:
# read_csv -> preprocess -> encode_labels -> split_features -> run_model.
rf_path = ".../data/NEN_ml_5k_res_test_update.csv"
xgb_path = ".../data/NEN_ml_xgboosting_update2.csv"

# --- First table (reported under the "Random Forest" label) ---
print("----- RANDOM FOREST -----")
df_rf = read_csv(rf_path)
df_model_rf, df_meta_rf, df_cup_rf = preprocess(df_rf)
# encode_labels mutates the three frames in place and returns one classes_
# array per encoded column, in visiting order.
ref_enc_rf = encode_labels([df_model_rf, df_meta_rf, df_cup_rf])
X_rf, y_rf = split_features(df_model_rf)
model_rf = RandomForestClassifier(n_estimators=2000, random_state=3, max_features="sqrt",
                                  criterion="gini", oob_score=True, n_jobs=10, max_depth=12)
# ref_enc_rf[1] is the second array returned by encode_labels, i.e. the second
# encoded column of df_model_rf — presumably P_grouping; verify against the
# column order of the CSV, since the pick is purely positional.
y_pred_rf, conf_rf = run_model(model_rf, X_rf, y_rf, ref_enc_rf[1], label="Random Forest")

# --- Second table (reported under the "XGBoost" label) ---
# NOTE(review): the estimator built here is a RandomForestClassifier with the
# same hyperparameters as model_rf; no XGBoost model is constructed in this
# cell. Only the input CSV differs. Confirm that the "XGBoost" label refers to
# the input table rather than to the classifier.
print("----- XGBOOST -----")
df_xgb = read_csv(xgb_path)
df_model_xgb, df_meta_xgb, df_cup_xgb = preprocess(df_xgb)
ref_enc_xgb = encode_labels([df_model_xgb, df_meta_xgb, df_cup_xgb])
X_xgb, y_xgb = split_features(df_model_xgb)
model_xgb = RandomForestClassifier(n_estimators=2000, random_state=3, max_features="sqrt",
                                  criterion="gini", oob_score=True, n_jobs=10, max_depth=12)
# Same positional assumption as above for ref_enc_xgb[1].
y_pred_xgb, conf_xgb = run_model(model_xgb, X_xgb, y_xgb, ref_enc_xgb[1], label="XGBoost")

# Write both true-vs-predicted confusion matrices to the working directory.
conf_rf.to_csv("confusion_matrix_rf.csv")
conf_xgb.to_csv("confusion_matrix_xgb.csv")


In [None]:
# Out-of-fold predictions of both models on their respective training tables,
# using the same unshuffled 3-fold stratified splitter as run_model.
kf = StratifiedKFold(n_splits=3)

rf_preds = cross_val_predict(model_rf, X_rf, y_rf, cv=kf)
xgb_preds = cross_val_predict(model_xgb, X_xgb, y_xgb, cv=kf)

# --- Agreement table: RF predictions (rows) vs XGB predictions (columns) ---
# Rows are paired by position, so both tables must list the same samples in
# the same order; crosstab raises if the two arrays differ in length.
conf_rf_vs_xgb = pd.crosstab(rf_preds, xgb_preds,
                             rownames=['RF Prediction'],
                             colnames=['XGB Prediction'])

# Replace integer codes with class names.  Each axis uses the encoder of the
# model it belongs to (the same ref_enc_*[1] entries passed to run_model).
# rename() keeps the axis names set above, which assigning a bare array to
# .index / .columns would discard, and leaves any unmapped code unchanged.
if len(ref_enc_rf) > 1 and len(ref_enc_xgb) > 1:
    conf_rf_vs_xgb = conf_rf_vs_xgb.rename(
        index=dict(enumerate(ref_enc_rf[1])),
        columns=dict(enumerate(ref_enc_xgb[1])),
    )



In [14]:
# Echo the RF-vs-XGB agreement table, then persist it as CSV.
rf_vs_xgb_csv_path = ".../data/confusion_matrix_rf_vs_xgb.csv"
print(conf_rf_vs_xgb)
conf_rf_vs_xgb.to_csv(rf_vs_xgb_csv_path)

In [26]:
import seaborn as sns
import matplotlib.pyplot as plt

# Canvas first, then the seaborn theme, then the axes: the axes are created
# after the theme is set, so they pick up the whitegrid style and font scale.
fig = plt.figure(figsize=(12, 9))
sns.set(font_scale=1.4)
sns.set_style("whitegrid")
ax = fig.add_subplot()

# Shared look of the agreement heatmaps: integer annotations on a blue ramp.
heatmap_style = dict(
    annot=True,
    fmt='d',
    cmap='Blues',
    linecolor='white',
    square=True,
    cbar_kws={"shrink": 0.8, "label": ""},
    annot_kws={"fontsize": 13, "weight": "normal"},
)
sns.heatmap(conf_rf_vs_xgb, ax=ax, **heatmap_style)

# Title and axis captions.
ax.set_title("Confusion Matrix: RF vs XGB Predictions (Training Set Only)", fontsize=12, pad=12)
ax.set_xlabel("XGBoost Prediction", fontsize=16, labelpad=15)
ax.set_ylabel("Random Forest Prediction", fontsize=16, labelpad=15)

# Slant the column labels so long class names do not overlap.
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right", fontsize=13)
ax.set_yticklabels(ax.get_yticklabels(), rotation=0, fontsize=13)

fig.tight_layout()

# Write the figure to disk before showing it.
fig.savefig(".../data/confusion_matrix_RF_vs_XGB_training_only_matched.pdf", dpi=300)

plt.show()


In [28]:
import pandas as pd

# Prediction tables for the liver-metastasis samples, one file per model.
rf_path, xgb_path = (
    ".../data/rf_res_meta_predictions_probability.csv",
    ".../data/NEN_metastasis_predictions_xgb.csv",
)

# Read in the same order as the paths are listed (RF first, then XGB).
rf_df, xgb_df = (pd.read_csv(csv_path) for csv_path in (rf_path, xgb_path))

# Peek at the first rows of each table to check the column layout.
(rf_df.head(), xgb_df.head())

In [34]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Predicted class per sample from each model's output table.
# NOTE(review): the column is "prediction" in the RF file and "Prediction" in
# the XGB file — confirm both spellings against the CSV headers.
rf_preds_meta = rf_df["prediction"]
xgb_preds_meta = xgb_df["Prediction"]

# crosstab pairs the two Series by index label and silently drops rows that
# appear in only one of them, so insist on identical indexes up front.
if not rf_preds_meta.index.equals(xgb_preds_meta.index):
    raise ValueError(
        "RF and XGB prediction tables are not row-aligned: "
        f"{len(rf_preds_meta)} vs {len(xgb_preds_meta)} rows"
    )

# Agreement table: RF predictions (rows) vs XGB predictions (columns).
conf_rf_vs_xgb_meta = pd.crosstab(rf_preds_meta, xgb_preds_meta,
                                  rownames=['RF Prediction'],
                                  colnames=['XGB Prediction'])

# Canvas, then theme, then axes, so the axes are created under the whitegrid
# style and the chosen font scale.
fig = plt.figure(figsize=(12, 9))
sns.set(font_scale=1.4)
sns.set_style("whitegrid")
ax = fig.add_subplot()

sns.heatmap(
    conf_rf_vs_xgb_meta,
    annot=True,
    fmt='d',
    cmap='Blues',
    linecolor='white',
    square=True,
    cbar_kws={"shrink": 0.8, "label": ""},
    annot_kws={"fontsize": 13, "weight": "normal"},
    ax=ax,
)

ax.set_title("Confusion Matrix: RF vs XGB Predictions (Liver Metastasis Only)", fontsize=12, pad=12)
ax.set_xlabel("XGBoost Prediction", fontsize=16, labelpad=15)
ax.set_ylabel("Random Forest Prediction", fontsize=16, labelpad=15)

# Slant the column labels so long class names do not overlap.
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right", fontsize=13)
ax.set_yticklabels(ax.get_yticklabels(), rotation=0, fontsize=13)

fig.tight_layout()
output_final_path = ".../data/confusion_matrix_RF_vs_XGB_liver_metastasis_only_matched.pdf"
fig.savefig(output_final_path, dpi=300)
plt.show()

# Display where the figure was written.
output_final_path


In [35]:
import pandas as pd

# Prediction tables for the CUP samples, one file per model.
# These assignments replace the rf_path / xgb_path / rf_df / xgb_df values
# set by earlier cells.
rf_path, xgb_path = (
    ".../data/res_cup_predictions probability.csv",
    ".../data/res_cup_predictions probability_xgb.csv",
)

# Read in the same order as the paths are listed (RF first, then XGB).
rf_df, xgb_df = (pd.read_csv(csv_path) for csv_path in (rf_path, xgb_path))

# Peek at the first rows of each table to check the column layout.
(rf_df.head(), xgb_df.head())

In [37]:
# Predicted class per sample from each model's CUP output table.
# NOTE(review): the *_meta names below are reused from the liver-metastasis
# cell even though rf_df / xgb_df now hold the CUP prediction files, so running
# this cell overwrites the metastasis results. The names are kept in case later
# cells read them — consider dedicated *_cup names.
rf_preds_meta = rf_df["prediction"]
xgb_preds_meta = xgb_df["prediction"]

# crosstab pairs the two Series by index label and silently drops rows that
# appear in only one of them, so insist on identical indexes up front.
if not rf_preds_meta.index.equals(xgb_preds_meta.index):
    raise ValueError(
        "RF and XGB prediction tables are not row-aligned: "
        f"{len(rf_preds_meta)} vs {len(xgb_preds_meta)} rows"
    )

# Agreement table: RF predictions (rows) vs XGB predictions (columns).
conf_rf_vs_xgb_meta = pd.crosstab(rf_preds_meta, xgb_preds_meta,
                                  rownames=['RF Prediction'],
                                  colnames=['XGB Prediction'])

# Hepatic NEN (CUP) agreement heatmap, same styling as the other two figures.
# Canvas, then theme, then axes, so the axes are created under the whitegrid
# style and the chosen font scale.
fig = plt.figure(figsize=(12, 9))
sns.set(font_scale=1.4)
sns.set_style("whitegrid")
ax = fig.add_subplot()

sns.heatmap(
    conf_rf_vs_xgb_meta,
    annot=True,
    fmt='d',
    cmap='Blues',
    linecolor='white',
    square=True,
    cbar_kws={"shrink": 0.8, "label": ""},
    annot_kws={"fontsize": 13, "weight": "normal"},
    ax=ax,
)

ax.set_title("Confusion Matrix: RF vs XGB Predictions (hepatic NEN Only)", fontsize=12, pad=12)
ax.set_xlabel("XGBoost Prediction", fontsize=16, labelpad=15)
ax.set_ylabel("Random Forest Prediction", fontsize=16, labelpad=15)

# Slant the column labels so long class names do not overlap.
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right", fontsize=13)
ax.set_yticklabels(ax.get_yticklabels(), rotation=0, fontsize=13)

fig.tight_layout()
output_final_path = ".../data/confusion_matrix_RF_vs_XGB_hepatic NEN_matched.pdf"
fig.savefig(output_final_path, dpi=300)
plt.show()

# Display where the figure was written.
output_final_path