In [28]:
import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri
from config import SEED, TOP_N_GENES, PCA_VARIANCE_THRESHOLD
from helpers import map_clinical_categories, print_model_report, clean_labels
from preprocessing import preprocess_clinical, drop_constant_columns
from pca import apply_pca
from train import split_data, train_models
from external_validation import run_external_validation
from imputation import knn_impute
import numpy as np
import pandas as pd

In [29]:
pandas2ri.activate()
readRDS = robjects.r['readRDS']

In [30]:
r_obj = readRDS('data/UROMOL_TaLG.teachingcohort.rds')

# Drop 'exprs' column as before
colnames = list(r_obj.names)
clinical_cols = [name for name in colnames if name != 'exprs']
r_clinical = r_obj.rx(True, robjects.StrVector(clinical_cols))

# Convert to pandas
clinical_df = pandas2ri.rpy2py(r_clinical)

exprs_df = pd.read_csv('data/expr.csv', index_col=0)

In [31]:
# External Validation Dataset
external_val = readRDS('data/knowles_matched_TaLG_final.rds')

# Drop 'exprs' column as before
colnames = list(external_val.names)
clinical_cols = [name for name in colnames if name != 'exprs']
external_val = external_val.rx(True, robjects.StrVector(clinical_cols))

# Convert to pandas
external_clinical_df = pandas2ri.rpy2py(external_val)

external_expr = pd.read_csv('data/external_expr.csv', index_col=0)

In [32]:
cat_map = {
        'Sex': {'M': 0, 'F': 1},
        'Smoking': {'Never': 0, 'Former': 1, 'Current': 2},
        'Concomitant.CIS': {'No': 0, 'Yes': 1},
        'Incident.tumor': {'No': 0, 'Yes': 1},
        'Tumor.size': {'< 3 cm': 0, '>= 3 cm': 1},
        'EAU.risk': {'Low': 0, 'Intermediate': 1, 'High': 2},
        'UROMOL2021.classification': {'Class 1': 1, 'Class 2a': 4, 'Class 2b': 3, 'Class 3': 2}
    }

In [33]:
clinical_df = clean_labels(clinical_df)
clinical_df = map_clinical_categories(clinical_df, cat_map)
num_cols = ['Age', 'FUtime_days.']
cat_cols = [col for col in cat_map if col in clinical_df.columns]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[label_col] = df[label_col].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].map(mapping)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].map(mapping)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_ind

In [34]:
clinical_df, median_vals, mode_vals = preprocess_clinical(clinical_df, num_cols, cat_cols)
clinical_df = drop_constant_columns(clinical_df)

In [35]:
print("Missing values per column in clinical_df:")
print(clinical_df.isnull().sum()[clinical_df.isnull().sum() > 0])

Missing values per column in clinical_df:
PFS_time.    1
dtype: int64


In [36]:
feature_col = ['Age', 'Sex', 'Smoking', 'Concomitant.CIS', 'Tumor.size', 'Incident.tumor', 'EAU.risk', 'BCG', 'UROMOL2021.classification']
label_col = ['Recurrence']

In [37]:
clinical_df[feature_col]

Unnamed: 0,Age,Sex,Smoking,Concomitant.CIS,Tumor.size,Incident.tumor,EAU.risk,BCG,UROMOL2021.classification
U0001,86,0,0.0,0,0.0,0,1.0,0,1
U0002,73,0,1.0,0,0.0,0,1.0,0,3
U0007,66,0,2.0,1,0.0,0,2.0,1,1
U0010,76,0,2.0,0,0.0,0,1.0,0,3
U0012,83,1,0.0,0,0.0,0,1.0,0,4
...,...,...,...,...,...,...,...,...,...
U2116,59,1,2.0,0,0.0,0,1.0,1,2
U2117,61,0,2.0,0,0.0,1,1.0,1,4
U2118,60,1,2.0,0,0.0,0,1.0,1,3
U2120,52,1,2.0,0,0.0,0,1.0,1,4


In [38]:
exprs_top_df = exprs_df.loc[clinical_df.index]
TOP_N_GENES = 400
PCA_VARIANCE_THRESHOLD = 0.75
pca_df, pca_model, scaler, top_genes = apply_pca(exprs_top_df, top_n=TOP_N_GENES, variance_threshold=PCA_VARIANCE_THRESHOLD)

full_df = clinical_df[feature_col + label_col].join(pca_df)
full_df.columns = full_df.columns.astype(str)

X = full_df.drop(columns='Recurrence')
y = full_df['Recurrence']


X_train, X_test, y_train, y_test = split_data(X, y)

models = train_models(X_train, y_train)

for name, model in models.items():
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]
    print_model_report(f"{name} (Internal)", y_test, y_pred, y_proba)


📊 LogisticRegression (Internal) Evaluation
              precision    recall  f1-score   support

           0       0.57      0.47      0.52        17
           1       0.79      0.85      0.81        39

    accuracy                           0.73        56
   macro avg       0.68      0.66      0.67        56
weighted avg       0.72      0.73      0.72        56

AUC: 0.7405731523378583

📊 RandomForest (Internal) Evaluation
              precision    recall  f1-score   support

           0       0.38      0.18      0.24        17
           1       0.71      0.87      0.78        39

    accuracy                           0.66        56
   macro avg       0.54      0.52      0.51        56
weighted avg       0.61      0.66      0.62        56

AUC: 0.4766214177978884

📊 XGBoost (Internal) Evaluation
              precision    recall  f1-score   support

           0       0.40      0.24      0.30        17
           1       0.72      0.85      0.78        39

    accuracy       

In [39]:
feature_col

['Age',
 'Sex',
 'Smoking',
 'Concomitant.CIS',
 'Tumor.size',
 'Incident.tumor',
 'EAU.risk',
 'BCG',
 'UROMOL2021.classification']

In [40]:
cat_map = {
    'Sex': {'M': 0, 'F': 1},
    'Smoking': {'Never': 0, 'Former': 1, 'Current': 2},
    'Concomitant.CIS': {'No': 0, 'Yes': 1},
    'Incident.tumor': {'No': 0, 'Yes': 1},
    'Tumor.size': {'< 3 cm': 0, '>= 3 cm': 1},
    'EAU.risk': {'Low': 0, 'Intermediate': 1, 'High': 2},
    'UROMOL2021.classification': {'Class_1': 1, 'Class_2a': 4, 'Class_2b': 3, 'Class_3': 2}
}

In [41]:
external_clinical_df['UROMOL2021.classification'].value_counts()

Class_1     67
Class_3      8
Class_2b     2
Name: UROMOL2021.classification, dtype: int64

In [42]:
external_clinical_df

Unnamed: 0,Progression,PFS_time.,Recurrence,RFS_time,FUtime_days.,Age,Sex,Tumor.stage,Tumor.grade,Concomitant.CIS,BCG,UROMOL2021.classification,knowles_ID
MK66,0,,1,5.0,4110,64,F,Ta,Low,No,0,Class_1,MK66
MK103,0,,1,14.0,840,33,F,Ta,Low,No,0,Class_1,MK103
MK239,0,,0,,1830,59,M,Ta,Low,No,0,Class_1,MK239
MK390,0,,0,,3660,28,M,Ta,Low,No,0,Class_1,MK390
MK419,0,,1,11.0,2010,70,M,Ta,Low,No,0,Class_2b,MK419
...,...,...,...,...,...,...,...,...,...,...,...,...,...
MK2357,0,,0,,2100,84,M,Ta,Low,No,0,Class_1,MK2357
MK2425,0,,0,,960,67,F,Ta,Low,No,0,Class_1,MK2425
MK2452,0,,0,,870,60,F,Ta,Low,No,0,Class_1,MK2452
MK2472,0,,0,,1020,68,F,Ta,Low,No,0,Class_1,MK2472


In [43]:
external_clinical_df = map_clinical_categories(external_clinical_df, cat_map)

In [44]:
external_clinical_df

Unnamed: 0,Progression,PFS_time.,Recurrence,RFS_time,FUtime_days.,Age,Sex,Tumor.stage,Tumor.grade,Concomitant.CIS,BCG,UROMOL2021.classification,knowles_ID
MK66,0,,1,5.0,4110,64,1,Ta,Low,0,0,1,MK66
MK103,0,,1,14.0,840,33,1,Ta,Low,0,0,1,MK103
MK239,0,,0,,1830,59,0,Ta,Low,0,0,1,MK239
MK390,0,,0,,3660,28,0,Ta,Low,0,0,1,MK390
MK419,0,,1,11.0,2010,70,0,Ta,Low,0,0,3,MK419
...,...,...,...,...,...,...,...,...,...,...,...,...,...
MK2357,0,,0,,2100,84,0,Ta,Low,0,0,1,MK2357
MK2425,0,,0,,960,67,1,Ta,Low,0,0,1,MK2425
MK2452,0,,0,,870,60,1,Ta,Low,0,0,1,MK2452
MK2472,0,,0,,1020,68,1,Ta,Low,0,0,1,MK2472


In [45]:
external_clinical_df.columns,external_expr.columns

(Index(['Progression', 'PFS_time.', 'Recurrence', 'RFS_time', 'FUtime_days.',
        'Age', 'Sex', 'Tumor.stage', 'Tumor.grade', 'Concomitant.CIS', 'BCG',
        'UROMOL2021.classification', 'knowles_ID'],
       dtype='object'),
 Index(['TSPAN6', 'TNMD', 'DPM1', 'SCYL3', 'FIRRM', 'FGR', 'CFH', 'FUCA2',
        'GCLC', 'NFYA',
        ...
        'F8A1', 'H2AC18', 'H2AC19', 'PPIAL4C', 'PPIAL4D', 'TARP',
        'GUCA1ANB.GUCA1A', 'HOMEZ', 'SOD2', 'PRAMEF22'],
       dtype='object', length=20280))

In [46]:
clincal_col_feature = [i for i in external_clinical_df.columns if i in feature_col]
expr_columns_top_n = [i for i in external_expr.columns if i in top_genes]

In [47]:
comb_feature = clincal_col_feature + expr_columns_top_n

In [48]:
clinical_df[feature_col]

Unnamed: 0,Age,Sex,Smoking,Concomitant.CIS,Tumor.size,Incident.tumor,EAU.risk,BCG,UROMOL2021.classification
U0001,86,0,0.0,0,0.0,0,1.0,0,1
U0002,73,0,1.0,0,0.0,0,1.0,0,3
U0007,66,0,2.0,1,0.0,0,2.0,1,1
U0010,76,0,2.0,0,0.0,0,1.0,0,3
U0012,83,1,0.0,0,0.0,0,1.0,0,4
...,...,...,...,...,...,...,...,...,...
U2116,59,1,2.0,0,0.0,0,1.0,1,2
U2117,61,0,2.0,0,0.0,1,1.0,1,4
U2118,60,1,2.0,0,0.0,0,1.0,1,3
U2120,52,1,2.0,0,0.0,0,1.0,1,4


In [49]:
external_clinical_df, _, _ = preprocess_clinical(external_clinical_df, num_cols, cat_cols, median_vals, mode_vals)

In [50]:
external_clinical_df, external_expr = knn_impute(external_clinical_df, external_expr, clinical_df, exprs_df, comb_feature, feature_col, top_genes, cat_cols)

  target_out_df.at[i, col] = fill_val
  target_out_df.at[i, col] = fill_val
  target_out_df.at[i, col] = fill_val
  target_out_df.at[i, col] = fill_val
  target_out_df.at[i, col] = fill_val
  target_out_df.at[i, col] = fill_val
  target_out_df.at[i, col] = fill_val
  target_out_df.at[i, col] = fill_val
  target_out_df.at[i, col] = fill_val
  target_out_df.at[i, col] = fill_val
  target_out_df.at[i, col] = fill_val
  target_out_df.at[i, col] = fill_val
  target_out_df.at[i, col] = fill_val
  target_out_df.at[i, col] = fill_val
  target_out_df.at[i, col] = fill_val
  target_out_df.at[i, col] = fill_val
  target_out_df.at[i, col] = fill_val
  target_out_df.at[i, col] = fill_val
  target_out_df.at[i, col] = fill_val
  target_out_df.at[i, col] = fill_val
  target_out_df.at[i, col] = fill_val
  target_out_df.at[i, col] = fill_val
  target_out_df.at[i, col] = fill_val
  target_out_df.at[i, col] = fill_val
  target_out_df.at[i, col] = fill_val
  target_out_df.at[i, col] = fill_val
  target_out

In [51]:
external_expr = external_expr[top_genes]
external_clinical_df = external_clinical_df[feature_col + label_col]

In [52]:
external_expr

Unnamed: 0,RN7SK,RN7SL1,LncRNA5978_ENSG00000280800,LncRNA6003_ENSG00000281181,LncRNA5967_ENSG00000280614,LncRNA6013_ENSG00000281383,XIST,GSTM1,LncRNA1620_ENSG00000240801,HSPA1A,...,HIST2H2AA4,KRT16P6,F5,DSG3,HLA.E,PLEKHS1,SDR16C5,TEPP,GRIN2D,ATP7A
MK66,12.870962,7.821870,10.400390,10.400525,10.400393,4.897733,7.714373,13.178860,3.261883,10.728666,...,4.709677,0.250603,4.562016,4.559483,12.325323,4.511209,5.203619,2.550790,4.651881,8.941772
MK103,12.870962,7.821870,10.400390,10.400525,10.400393,4.897733,7.714373,13.341340,3.261883,11.558671,...,4.709677,0.250603,4.723045,4.634116,13.266673,5.751118,4.270999,2.550790,4.544121,8.374304
MK239,13.544259,4.974818,9.214611,9.214742,9.214611,3.394364,6.260654,9.302856,3.261883,11.229165,...,5.320135,-0.441089,11.808340,4.471249,11.853296,5.407393,5.152960,0.972987,4.300750,9.247746
MK390,13.016067,8.497137,8.382140,8.382144,8.382142,2.640335,4.049221,6.674160,1.559029,11.337702,...,5.285472,-0.886513,5.104369,4.474458,12.128286,5.077584,5.069198,2.138777,4.506620,8.642154
MK419,13.764138,0.155335,10.412718,10.412849,10.412719,4.826256,7.720590,6.383024,2.438618,9.448853,...,4.483570,-0.252655,5.600309,4.541472,14.381657,4.430404,5.748945,0.726356,4.601668,8.238297
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MK2357,13.992218,3.429023,7.059733,7.059732,7.059733,0.367407,3.272114,6.104260,0.104059,11.310214,...,4.815904,-0.137032,4.637630,4.513762,13.393627,7.853032,4.908701,-1.566815,4.989958,8.696990
MK2425,13.674056,6.985609,6.851406,6.851405,6.851406,0.890151,5.275400,12.902710,3.257597,11.437647,...,5.777514,-0.535376,4.499330,4.732277,12.874150,4.398595,5.441872,0.316159,4.605361,8.964309
MK2452,12.870962,7.821870,10.400390,10.400525,10.400393,4.897733,7.714373,6.736081,3.261883,11.067221,...,4.709677,0.250603,5.108353,4.492287,12.152093,4.881509,4.273776,2.550790,4.598988,9.605453
MK2472,13.544259,4.974818,9.214611,9.214742,9.214611,3.394364,6.260654,9.574966,3.261883,9.805158,...,5.320135,-0.441089,4.895480,4.639116,12.686610,4.630273,5.474574,0.972987,4.819488,8.343604


In [55]:
# Replace invalid value with NaN
external_clinical_df['Recurrence'] = external_clinical_df['Recurrence'].replace(-2147483648, np.nan)

# Drop rows with missing labels
external_clinical_df = external_clinical_df.dropna(subset=['Recurrence'])

# Convert to int (just in case)
external_clinical_df['Recurrence'] = external_clinical_df['Recurrence'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  external_clinical_df['Recurrence'] = external_clinical_df['Recurrence'].replace(-2147483648, np.nan)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  external_clinical_df['Recurrence'] = external_clinical_df['Recurrence'].astype(int)


In [56]:
external_pca_df = pd.DataFrame(pca_model.transform(external_expr), index=external_expr.index,
                                columns=[f'PC{i+1}' for i in range(pca_model.n_components_)])

external_df_final = external_clinical_df[X.columns.intersection(external_clinical_df.columns)].join(external_pca_df)
y_true_ext = external_clinical_df.loc[external_df_final.index, 'Recurrence']

external_df_final.columns = external_df_final.columns.astype(str)

run_external_validation(models, external_df_final, y_true_ext)


📊 LogisticRegression (External) Evaluation
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        44
           1       0.40      1.00      0.57        29

    accuracy                           0.40        73
   macro avg       0.20      0.50      0.28        73
weighted avg       0.16      0.40      0.23        73

AUC: 0.5556426332288401

📊 RandomForest (External) Evaluation
              precision    recall  f1-score   support

           0       0.65      0.34      0.45        44
           1       0.42      0.72      0.53        29

    accuracy                           0.49        73
   macro avg       0.54      0.53      0.49        73
weighted avg       0.56      0.49      0.48        73

AUC: 0.47962382445141066

📊 XGBoost (External) Evaluation
              precision    recall  f1-score   support

           0       0.64      0.86      0.74        44
           1       0.57      0.28      0.37        29

    accuracy      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
