In [1]:
import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri
from config import SEED, TOP_N_GENES, PCA_VARIANCE_THRESHOLD
from helpers import map_clinical_categories, print_model_report, clean_labels
from preprocessing import preprocess_clinical, drop_constant_columns
from pca import apply_pca
from train import split_data, train_models
from external_validation import run_external_validation
from imputation import knn_impute
import numpy as np
import pandas as pd

In [2]:
pandas2ri.activate()
readRDS = robjects.r['readRDS']

In [3]:
r_obj = readRDS('data/UROMOL_TaLG.teachingcohort.rds')

# Drop 'exprs' column as before
colnames = list(r_obj.names)
clinical_cols = [name for name in colnames if name != 'exprs']
r_clinical = r_obj.rx(True, robjects.StrVector(clinical_cols))

# Convert to pandas
clinical_df = pandas2ri.rpy2py(r_clinical)

exprs_df = pd.read_csv('data/expr.csv', index_col=0)

In [None]:
# External Validation Dataset
external_val = readRDS('data/knowles_matched_TaLG_final.rds')

# Drop 'exprs' column as before
colnames = list(external_val.names)
clinical_cols = [name for name in colnames if name != 'exprs']
external_val = external_val.rx(True, robjects.StrVector(clinical_cols))

# Convert to pandas
external_clinical_df = pandas2ri.rpy2py(external_val)

external_expr = pd.read_csv('data/external_expr.csv', index_col=0)

In [None]:
cat_map = {
        'Sex': {'M': 0, 'F': 1},
        'Smoking': {'Never': 0, 'Former': 1, 'Current': 2},
        'Concomitant.CIS': {'No': 0, 'Yes': 1},
        'Incident.tumor': {'No': 0, 'Yes': 1},
        'Tumor.size': {'< 3 cm': 0, '>= 3 cm': 1},
        'EAU.risk': {'Low': 0, 'Intermediate': 1, 'High': 2},
        'UROMOL2021.classification': {'Class 1': 1, 'Class 2a': 4, 'Class 2b': 3, 'Class 3': 2}
    }

In [None]:
clinical_df = clean_labels(clinical_df)
clinical_df = map_clinical_categories(clinical_df, cat_map)
num_cols = ['Age', 'FUtime_days.']
cat_cols = [col for col in cat_map if col in clinical_df.columns]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[label_col] = df[label_col].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].map(mapping)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].map(mapping)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_ind

In [None]:
clinical_df, median_vals, mode_vals = preprocess_clinical(clinical_df, num_cols, cat_cols)
clinical_df = drop_constant_columns(clinical_df)

In [None]:
print("Missing values per column in clinical_df:")
print(clinical_df.isnull().sum()[clinical_df.isnull().sum() > 0])

Missing values per column in clinical_df:
PFS_time.    1
dtype: int64


In [None]:
feature_col = ['Age', 'Sex', 'Smoking', 'Concomitant.CIS', 'Tumor.size', 'Incident.tumor', 'EAU.risk', 'BCG', 'UROMOL2021.classification']
label_col = ['Recurrence']

In [None]:
exprs_top_df = exprs_df.loc[clinical_df.index]
TOP_N_GENES = 250
PCA_VARIANCE_THRESHOLD = 0.75
pca_df, pca_model, scaler, top_genes = apply_pca(exprs_top_df, top_n=TOP_N_GENES, variance_threshold=PCA_VARIANCE_THRESHOLD)

full_df = clinical_df[feature_col + label_col].join(pca_df)
full_df.columns = full_df.columns.astype(str)

X = full_df.drop(columns='Recurrence')
y = full_df['Recurrence']

# X = clinical_df[feature_col]
# y = clinical_df['Recurrence']

# X = pca_df
# y = clinical_df['Recurrence']


X_train, X_test, y_train, y_test = split_data(X, y, test_size=0.2)

models = train_models(X_train, y_train, n_estimators=60, max_depth=3)

for name, model in models.items():
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]
    print_model_report(f"{name} (Internal)", y_test, y_pred, y_proba)


📊 LogisticRegression (Internal) Evaluation
              precision    recall  f1-score   support

           0       0.53      0.53      0.53        17
           1       0.79      0.79      0.79        39

    accuracy                           0.71        56
   macro avg       0.66      0.66      0.66        56
weighted avg       0.71      0.71      0.71        56

AUC: 0.717948717948718

📊 RandomForest (Internal) Evaluation
              precision    recall  f1-score   support

           0       0.80      0.24      0.36        17
           1       0.75      0.97      0.84        39

    accuracy                           0.75        56
   macro avg       0.77      0.60      0.60        56
weighted avg       0.76      0.75      0.70        56

AUC: 0.7330316742081449

📊 XGBoost (Internal) Evaluation
              precision    recall  f1-score   support

           0       0.88      0.41      0.56        17
           1       0.79      0.97      0.87        39

    accuracy        

In [None]:
cat_map = {
    'Sex': {'M': 0, 'F': 1},
    'Smoking': {'Never': 0, 'Former': 1, 'Current': 2},
    'Concomitant.CIS': {'No': 0, 'Yes': 1},
    'Incident.tumor': {'No': 0, 'Yes': 1},
    'Tumor.size': {'< 3 cm': 0, '>= 3 cm': 1},
    'EAU.risk': {'Low': 0, 'Intermediate': 1, 'High': 2},
    'UROMOL2021.classification': {'Class_1': 1, 'Class_2a': 4, 'Class_2b': 3, 'Class_3': 2}
}

external_clinical_df = map_clinical_categories(external_clinical_df, cat_map)


clincal_col_feature = [i for i in external_clinical_df.columns if i in feature_col]
expr_columns_top_n = [i for i in external_expr.columns if i in top_genes]

comb_feature = clincal_col_feature + expr_columns_top_n


external_clinical_df, external_expr = knn_impute(external_clinical_df, external_expr, clinical_df, exprs_df, comb_feature, feature_col, top_genes, cat_cols)

num_cols = [i for i in num_cols if i in comb_feature]
cat_cols = [i for i in cat_cols if i in comb_feature]

external_clinical_df, _, _ = preprocess_clinical(external_clinical_df, num_cols, cat_cols, median_vals, mode_vals)

external_expr = external_expr[top_genes]
external_clinical_df = external_clinical_df[feature_col + label_col]


# Replace invalid value with NaN
external_clinical_df['Recurrence'] = external_clinical_df['Recurrence'].replace(-2147483648, np.nan)

# Drop rows with missing labels
external_clinical_df = external_clinical_df.dropna(subset=['Recurrence'])

# Convert to int (just in case)
external_clinical_df['Recurrence'] = external_clinical_df['Recurrence'].astype(int)



In [None]:
print("Missing values per column in clinical_df:")
print(external_expr.isnull().sum()[external_expr.isnull().sum() > 0])

Missing values per column in clinical_df:
Series([], dtype: int64)


In [None]:
external_pca_df = pd.DataFrame(pca_model.transform(external_expr), index=external_expr.index,
                                columns=[f'PC{i+1}' for i in range(pca_model.n_components_)])

external_df_final = external_clinical_df[X.columns.intersection(external_clinical_df.columns)].join(external_pca_df)
y_true_ext = external_clinical_df.loc[external_df_final.index, 'Recurrence']

external_clinical_df.columns = external_clinical_df.columns.astype(str)
external_df_final.columns = external_df_final.columns.astype(str)

run_external_validation(models, external_df_final, y_true_ext)
# run_external_validation(models, external_clinical_df[feature_col], external_clinical_df['Recurrence'])
# run_external_validation(models, external_pca_df.loc[external_df_final.index], y_true_ext)


📊 LogisticRegression (External) Evaluation
              precision    recall  f1-score   support

           0       0.75      0.07      0.12        44
           1       0.41      0.97      0.57        29

    accuracy                           0.42        73
   macro avg       0.58      0.52      0.35        73
weighted avg       0.61      0.42      0.30        73

AUC: 0.6434169278996865

📊 RandomForest (External) Evaluation
              precision    recall  f1-score   support

           0       0.56      0.61      0.59        44
           1       0.32      0.28      0.30        29

    accuracy                           0.48        73
   macro avg       0.44      0.44      0.44        73
weighted avg       0.47      0.48      0.47        73

AUC: 0.4373040752351097

📊 XGBoost (External) Evaluation
              precision    recall  f1-score   support

           0       0.63      0.91      0.75        44
           1       0.60      0.21      0.31        29

    accuracy       

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
