In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import RobustScaler, MinMaxScaler, MaxAbsScaler, PowerTransformer, StandardScaler, LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve, auc, make_scorer, average_precision_score, recall_score, precision_score, f1_score, ConfusionMatrixDisplay, balanced_accuracy_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.utils import resample
from sklearn.pipeline import Pipeline
import xgboost as xgb
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_classif, SelectFromModel
import torch
from transformers import AutoTokenizer, AutoModel
import json
import os

In [None]:
# MedBERT checkpoint: the tokenizer and encoder are notebook-level globals that
# get_embedding_MB reads.
model_name_MB = "Charangan/MedBERT"
tokenizer_MB = AutoTokenizer.from_pretrained(model_name_MB)
model_MB = AutoModel.from_pretrained(model_name_MB)
# Put the encoder in eval mode; in this notebook it is only called under torch.no_grad().
model_MB.eval()

def get_embedding_MB(text, feature_value):
    """Embed ``text`` with MedBERT and append ``feature_value`` as one extra entry.

    The text part is the last-layer hidden state at sequence position 0.
    Input is truncated to at most 512 tokens.

    Returns:
        1-D float32 array: the text vector followed by ``feature_value``.
    """
    encoded = tokenizer_MB(text, return_tensors="pt", padding=True, truncation=True, max_length=512)

    with torch.no_grad():
        model_output = model_MB(**encoded)

    first_token_vector = model_output.last_hidden_state[:, 0, :].detach().cpu().numpy().squeeze()
    scalar_part = np.array([feature_value], dtype=np.float32)
    return np.concatenate([first_token_vector, scalar_part]).astype(np.float32)


# ClinicalBERT checkpoint: the tokenizer and encoder are notebook-level globals that
# get_embedding_CB reads.
model_name_CB = "medicalai/ClinicalBERT"
tokenizer_CB = AutoTokenizer.from_pretrained(model_name_CB)
model_CB = AutoModel.from_pretrained(model_name_CB)

# Put the encoder in eval mode; in this notebook it is only called under torch.no_grad().
model_CB.eval()

def get_embedding_CB(text, feature_value):
    """Embed ``text`` with ClinicalBERT and append ``feature_value`` as one extra entry.

    The text part is the last-layer hidden state at sequence position 0.
    Input is truncated to at most 512 tokens.

    Returns:
        1-D float32 array: the text vector followed by ``feature_value``.
    """
    encoded = tokenizer_CB(text, return_tensors="pt", padding=True, truncation=True, max_length=512)

    with torch.no_grad():
        model_output = model_CB(**encoded)

    first_token_vector = model_output.last_hidden_state[:, 0, :].detach().cpu().numpy().squeeze()
    scalar_part = np.array([feature_value], dtype=np.float32)
    return np.concatenate([first_token_vector, scalar_part]).astype(np.float32)

# General-domain BERT checkpoint: the tokenizer and encoder are notebook-level globals
# that get_embedding_Bert reads.
Model_name_Bert = "google-bert/bert-base-uncased"
tokenizer_Bert = AutoTokenizer.from_pretrained(Model_name_Bert)
model_Bert = AutoModel.from_pretrained(Model_name_Bert)


# Put the encoder in eval mode; in this notebook it is only called under torch.no_grad().
model_Bert.eval()

def get_embedding_Bert(text, feature_value):
    """Embed ``text`` with bert-base-uncased and append ``feature_value`` as one extra entry.

    The text part is the last-layer hidden state at sequence position 0.
    Input is truncated to at most 512 tokens.

    Returns:
        1-D float32 array: the text vector followed by ``feature_value``.
    """
    encoded = tokenizer_Bert(text, return_tensors="pt", padding=True, truncation=True, max_length=512)

    with torch.no_grad():
        model_output = model_Bert(**encoded)

    first_token_vector = model_output.last_hidden_state[:, 0, :].detach().cpu().numpy().squeeze()
    scalar_part = np.array([feature_value], dtype=np.float32)
    return np.concatenate([first_token_vector, scalar_part]).astype(np.float32)

# BioBERT checkpoint: the tokenizer and encoder are notebook-level globals read by
# get_embedding, embedd_text and get_word_level_embedding.
Model_name_BB = "dmis-lab/biobert-v1.1"
tokenizer_BB = AutoTokenizer.from_pretrained(Model_name_BB)
model_BB = AutoModel.from_pretrained(Model_name_BB)

# Put the encoder in eval mode; in this notebook it is only called under torch.no_grad().
model_BB.eval()

def get_embedding(text, feature_value, method="cls"):
    """Embed ``text`` with BioBERT and append ``feature_value`` as one extra entry.

    Args:
        text: Text to encode; truncated to at most 512 tokens.
        feature_value: Scalar appended after the text vector. Values that
            ``float()`` rejects (ValueError/TypeError) are replaced by 0.0.
        method: ``"cls"`` takes the hidden state at sequence position 0;
            ``"mean"`` averages the hidden states weighted by the attention mask.

    Returns:
        1-D float32 array: the pooled text vector followed by the scalar.

    Raises:
        ValueError: If ``method`` is neither ``"cls"`` nor ``"mean"``.
    """
    encoded = tokenizer_BB(text, return_tensors="pt", truncation=True, padding=True, max_length=512)

    with torch.no_grad():
        last_hidden = model_BB(**encoded).last_hidden_state

    if method == "cls":
        pooled = last_hidden[:, 0, :]
    elif method == "mean":
        # Broadcast the attention mask over the hidden dimension so padded
        # positions contribute nothing to the sum.
        mask = encoded['attention_mask'].unsqueeze(-1).expand(last_hidden.size()).float()
        token_sum = torch.sum(last_hidden * mask, 1)
        token_count = torch.clamp(mask.sum(1), min=1e-9)  # guard against division by zero
        pooled = token_sum / token_count
    else:
        raise ValueError(f"Unknown method: {method}")

    text_vector = pooled.squeeze(0).cpu().numpy()

    # Best-effort coercion of the scalar feature; unparsable values become 0.0.
    try:
        scalar = float(feature_value)
    except (ValueError, TypeError):
        scalar = 0.0

    scalar_part = np.array([scalar], dtype=np.float32)
    return np.concatenate([text_vector, scalar_part]).astype(np.float32)

def embedd_text(text):
    """Return the BioBERT embedding of ``text`` taken at sequence position 0.

    The input is truncated to 512 tokens, matching the other embedding helpers
    in this notebook; without truncation, texts longer than the encoder's
    position limit make the forward pass fail.

    Args:
        text: Text to encode.

    Returns:
        NumPy array holding the position-0 hidden state of the last layer
        (1-D for a single input string).
    """
    inputs = tokenizer_BB(text, return_tensors="pt", padding=True, truncation=True, max_length=512)

    with torch.no_grad():
        outputs = model_BB(**inputs)

    # Only the first position is kept, not the per-token embeddings.
    first_token_embedding = outputs.last_hidden_state[:, 0, :]  # shape: (batch, hidden_dim)

    return first_token_embedding.cpu().numpy().squeeze()

def get_word_level_embedding(text, feature_value):
    """Mean-pool BioBERT token embeddings of ``text`` and append ``feature_value``.

    The first and last attended positions (usually the [CLS] and [SEP] special
    tokens) are excluded from the average. If nothing remains after that
    exclusion (e.g. ``text`` is empty), a zero vector is used for the text part
    instead of the NaN vector a mean over zero tokens would produce.

    Args:
        text: A single text to encode; truncated to at most 512 tokens.
        feature_value: Scalar appended after the text vector.

    Returns:
        1-D float32 array: the mean token vector followed by ``feature_value``.
    """
    inputs = tokenizer_BB(text, return_tensors="pt", padding=True, truncation=True, max_length=512)

    with torch.no_grad():
        outputs = model_BB(**inputs)

    # Drop the batch dimension: (seq_len, hidden_dim) for a single text.
    token_embeddings = outputs.last_hidden_state.squeeze(0)

    # Keep attended positions only, then strip the first and last of them.
    attention_mask = inputs['attention_mask'].squeeze(0)
    content_embeddings = token_embeddings[attention_mask == 1][1:-1]

    if content_embeddings.shape[0] == 0:
        # No content tokens: averaging would yield NaN and silently poison
        # every downstream aggregate, so fall back to a neutral zero vector.
        mean_token_embedding = np.zeros(token_embeddings.shape[-1], dtype=np.float32)
    else:
        mean_token_embedding = content_embeddings.mean(dim=0).cpu().numpy()

    # Append the scalar feature
    feature_value_array = np.array([feature_value], dtype=np.float32)
    combined_embedding = np.concatenate([mean_token_embedding, feature_value_array])
    return combined_embedding.astype(np.float32)


def stack_embeddings(group):
    """Average the vectors stored in ``group['embedding']`` element-wise.

    Args:
        group: DataFrame whose ``embedding`` column holds equally shaped arrays.

    Returns:
        The mean over all rows of the stacked vectors.
    """
    vectors = group['embedding'].tolist()
    stacked = np.stack(vectors)
    return stacked.mean(axis=0)

def save_embeddings(df, embeddings, output_path):
    """Average row embeddings per patient and write them to ``output_path`` as CSV.

    Each row of ``df`` is paired positionally with one entry of ``embeddings``.
    Rows are grouped by ``patient_id`` (groups sorted by key), their vectors are
    averaged element-wise, and each mean vector is stored as a JSON list in the
    ``combined_embedding`` column next to ``patient_id``.

    The caller's ``df`` is not modified.

    Args:
        df: DataFrame with a ``patient_id`` column.
        embeddings: Iterable of array-likes, one per row of ``df``.
        output_path: Destination CSV path; its parent directory is created if missing.

    Raises:
        ValueError: If the number of embeddings differs from the number of rows.
    """
    vectors = [np.asarray(e, dtype=np.float32) for e in embeddings]
    if len(vectors) != len(df):
        raise ValueError(
            f"Got {len(vectors)} embeddings for {len(df)} rows; they must match one-to-one."
        )

    output_dir = os.path.dirname(output_path)
    if output_dir and not os.path.isdir(output_dir):
        # exist_ok avoids a crash if the directory appears between check and creation.
        os.makedirs(output_dir, exist_ok=True)
        print(f"Created directory: {output_dir}")

    # Work on a private frame so the caller's DataFrame does not gain a column.
    frame = df[["patient_id"]].copy()
    frame["embedding"] = vectors

    patient_ids, serialized, lengths = [], [], []
    for patient_id, group in frame.groupby("patient_id"):
        mean_vector = np.mean(np.stack(group["embedding"].tolist()), axis=0)
        patient_ids.append(patient_id)
        serialized.append(json.dumps(mean_vector.tolist()))
        lengths.append(len(mean_vector))

    grouped_df = pd.DataFrame({"patient_id": patient_ids, "combined_embedding": serialized})

    # Sanity check before saving: all patients should share one embedding length.
    length_counts = pd.Series(lengths, name="combined_embedding").value_counts()
    print("Embedding length distribution:", length_counts)
    grouped_df.to_csv(output_path, index=False)
    print(f"Embeddings saved to {output_path}")


def fit_logistic_regression(X, y):
    """Fit a class-weight-balanced logistic regression (liblinear solver) on ``X``/``y``.

    Returns:
        The fitted ``LogisticRegression`` estimator.
    """
    classifier = LogisticRegression(
        solver='liblinear',
        max_iter=5000,
        class_weight='balanced',
    )
    fitted = classifier.fit(X, y)
    return fitted


def find_best_threshold(y_true, y_pred_probs):
    """Pick the probability cut-off that maximises balanced accuracy.

    Candidates come from ``np.arange(0.1, 0.9, 0.01)``. On ties the lowest
    candidate wins, because only a strictly better score replaces the current best.

    Args:
        y_true: True binary labels.
        y_pred_probs: Predicted positive-class probabilities (supports ``>=`` and ``.astype``).

    Returns:
        The winning threshold (0.5 if no candidate ever beats the initial score of -1).
    """
    winner, top_score = 0.5, -1

    for candidate in np.arange(0.1, 0.9, 0.01):
        predicted = (y_pred_probs >= candidate).astype(int)
        score = balanced_accuracy_score(y_true, predicted)
        if score > top_score:
            top_score, winner = score, candidate

    return winner

def metrics(y_test, y_pred_probs, threshold):
    """Score thresholded predictions against ``y_test``.

    Args:
        y_test: True binary labels.
        y_pred_probs: Predicted positive-class probabilities.
        threshold: Probabilities ``>= threshold`` are counted as positive.

    Returns:
        Tuple ``(balanced accuracy, ROC AUC, F1, sensitivity, specificity)``.
        AUC is computed from the probabilities, the rest from the hard labels.
        Specificity is 0 when there are no negatives.
    """
    hard_labels = (y_pred_probs >= threshold).astype(int)

    balanced = balanced_accuracy_score(y_test, hard_labels)
    roc_auc = roc_auc_score(y_test, y_pred_probs)
    f1_value = f1_score(y_test, hard_labels)
    recall = recall_score(y_test, hard_labels)

    tn, fp, fn, tp = confusion_matrix(y_test, hard_labels).ravel()
    negatives = tn + fp
    true_negative_rate = tn / negatives if negatives > 0 else 0

    return balanced, roc_auc, f1_value, recall, true_negative_rate

def bootstrap_metrics(X_train, X_test, y_train, y_test, threshold, n_bootstrap=1000):
    """Fit the classifier once and bootstrap its test-set metrics.

    The model is trained on ``X_train``/``y_train`` and scored once on
    ``X_test``. The test set is then resampled ``n_bootstrap`` times *with
    replacement*, separately within each class (stratified bootstrap), so every
    replicate keeps the original class counts. Metrics are recomputed on each
    replicate and summarised by their mean and 2.5/97.5 percentiles.

    The previous implementation drew leave-5-out subsamples without replacement
    via StratifiedShuffleSplit; such near-complete subsamples barely vary, so
    the reported intervals were far narrower than real bootstrap intervals, and
    it failed outright for test sets of five samples or fewer.

    Args:
        X_train, y_train: Training features and labels.
        X_test, y_test: Held-out features and labels (array-like or pandas).
        threshold: Decision threshold on the positive-class probability. If
            ``None``, it is tuned with ``find_best_threshold`` on the test set
            itself and printed — note that this makes the metrics optimistic.
        n_bootstrap: Number of bootstrap replicates (>= 1).

    Returns:
        Dict with the fitted ``"model"``, ``"Bootstrapped Metrics"`` mapping each
        metric to ``(mean, [2.5th, 97.5th percentile])``, and
        ``"Arrays with bootstrapping"`` holding the raw per-replicate values.

    Raises:
        ValueError: If ``n_bootstrap`` is smaller than 1.
    """
    if n_bootstrap < 1:
        raise ValueError("n_bootstrap must be at least 1")

    # Fit the model once; only the evaluation data is resampled.
    model = fit_logistic_regression(X_train, y_train)

    y_true = np.asarray(y_test)
    # Predictions are deterministic, so compute them once and index per replicate.
    y_pred_probs_full = model.predict_proba(X_test)[:, 1]

    if threshold is None:
        threshold = find_best_threshold(y_true, y_pred_probs_full)
        print(threshold)

    # Positions of each class, so resampling preserves the label distribution
    # and every replicate contains all classes (keeps AUC well defined).
    class_positions = [np.flatnonzero(y_true == label) for label in np.unique(y_true)]
    rng = np.random.default_rng(42)

    boot_auc, boot_bal_acc, boot_sens, boot_spec, boot_f1 = [], [], [], [], []

    for _ in range(n_bootstrap):
        sample_index = np.concatenate(
            [rng.choice(positions, size=len(positions), replace=True) for positions in class_positions]
        )

        bal_acc, auc_value, f1, sens, spec = metrics(
            y_true[sample_index], y_pred_probs_full[sample_index], threshold
        )

        boot_auc.append(auc_value)
        boot_bal_acc.append(bal_acc)
        boot_sens.append(sens)
        boot_spec.append(spec)
        boot_f1.append(f1)

    # Mean and 95% percentile interval of a list of replicate values.
    def ci(data):
        return np.mean(data), np.percentile(data, [2.5, 97.5])

    return {
        "model": model,
        "Bootstrapped Metrics": {
            "AUC": ci(boot_auc),
            "Balanced Accuracy": ci(boot_bal_acc),
            "Sensitivity": ci(boot_sens),
            "Specificity": ci(boot_spec),
            "F1-score": ci(boot_f1),
        },
        "Arrays with bootstrapping":{
            "AUC": boot_auc,
            "Balanced Accuracy": boot_bal_acc,
            "Sensitivity": boot_sens,
            "Specificity": boot_spec,
            "F1-score": boot_f1,
        }
    }



def ready_for_test(df_path):
    """Load saved patient embeddings, attach labels and split into train/test.

    Reads the embeddings CSV at ``df_path`` and the marksheet CSV, joins them on
    ``patient_id``, drops rows with any missing value, decodes the JSON
    ``combined_embedding`` column into a 2-D array and makes a stratified
    70/30 split with ``random_state=42``.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` where ``X_test`` is a DataFrame and
        ``y_test`` a Series; the training parts stay NumPy arrays.
    """
    embeddings_table = pd.read_csv(df_path)
    marksheet = pd.read_csv("Data/Preprocessed_marksheet_all.csv")

    columns_to_discard = [
        "study_id", "mri_date", "patient_age", "psa", "psad", "prostate_volume",
        "histopath_type", "lesion_GS", "lesion_ISUP", "case_ISUP", "center",
    ]
    labels_table = marksheet.drop(columns=columns_to_discard)

    # Left join + dropna removes patients that lack an embedding row.
    joined = labels_table.merge(embeddings_table, on="patient_id", how="left").dropna()

    y = joined["case_csPCa"].values
    feature_table = joined.drop(columns=["patient_id", "case_csPCa"])
    decoded = feature_table["combined_embedding"].apply(lambda x: np.array(json.loads(x)))
    X = np.vstack(decoded.values)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
    return X_train, pd.DataFrame(X_test), y_train, pd.Series(y_test)



def summarize_results(model_name, embeddings, context, feat_name, feat_expl, bootstrap_results):
    """Build one results-table row from the output of ``bootstrap_metrics``.

    Each metric in ``bootstrap_results["Bootstrapped Metrics"]`` is expected as
    ``(mean, (lower, upper))`` and is rendered as ``"mean (lower-upper)"`` with
    three decimals.

    Returns:
        Dict with the five descriptive columns followed by the five metric columns.
    """
    summary = bootstrap_results["Bootstrapped Metrics"]

    def render(metric_key):
        entry = summary[metric_key]
        return f"{entry[0]:.3f} ({entry[1][0]:.3f}-{entry[1][1]:.3f})"

    row = {
        "Model": model_name,
        "Embedding": embeddings,
        "Context": context,
        "Feature Name": feat_name,
        "Feature Explanation": feat_expl,
    }

    # Table column -> key in the bootstrap summary, in output order.
    metric_columns = {
        "AUC (95% CI)": "AUC",
        "Balanced Accuracy (95% CI)": "Balanced Accuracy",
        "F1 (95% CI)": "F1-score",
        "Sensitivity (95% CI)": "Sensitivity",
        "Specificity (95% CI)": "Specificity",
    }
    for column, metric_key in metric_columns.items():
        row[column] = render(metric_key)

    return row


def add_to_table(model_name, embeddings, context, feat_name, feat_expl, bootstrap_results,
                 table_path="Data/Results_table3.csv"):
    """Append one summary row to the results CSV, creating the file if needed.

    The row is built by ``summarize_results`` from the given labels and the
    bootstrap output. If ``table_path`` does not exist yet (or is empty), the
    table is started from this row instead of failing on the read.

    Args:
        model_name, embeddings, context, feat_name, feat_expl: Labels stored in
            the ``Model``, ``Embedding``, ``Context``, ``Feature Name`` and
            ``Feature Explanation`` columns.
        bootstrap_results: Return value of ``bootstrap_metrics``.
        table_path: CSV file that accumulates the results.
    """
    summary = summarize_results(model_name, embeddings, context, feat_name, feat_expl, bootstrap_results)
    df_summary = pd.DataFrame([summary])

    if os.path.exists(table_path) and os.path.getsize(table_path) > 0:
        df_table = pd.concat([pd.read_csv(table_path), df_summary], ignore_index=True)
    else:
        # First run: make sure the target directory exists and start a new table.
        table_dir = os.path.dirname(table_path)
        if table_dir:
            os.makedirs(table_dir, exist_ok=True)
        df_table = df_summary

    df_table.to_csv(table_path, index=False)

def process(model, embeddings, guideline, feature_name, feature_context,
            threshold=0.535, n_bootstrap=1000):
    """Embed the guideline/feature table with one encoder, evaluate, and log the result.

    For every row of the guidelines CSV, the text columns switched on by the
    three ``"Yes"``/``"No"`` flags are joined with single spaces (an empty string
    if all are ``"No"``) and embedded together with ``feature_value``. The row
    embeddings are saved per patient, a logistic regression is evaluated on
    them, and a summary row is appended to the results table.

    Args:
        model: One of ``"BioBERT"``, ``"ClinicalBERT"``, ``"BERT"``, ``"MedBERT"``.
        embeddings: Label stored in the results table's ``Embedding`` column; it
            does not influence the computation.
        guideline, feature_name, feature_context: ``"Yes"`` to include the
            column of the same name in the embedded text, ``"No"`` to leave it out.
        threshold: Decision threshold passed to ``bootstrap_metrics``.
        n_bootstrap: Number of resamples passed to ``bootstrap_metrics``.

    Raises:
        ValueError: If ``model`` is unknown or a flag is not ``"Yes"``/``"No"``.
            (Previously these cases fell through to an UnboundLocalError.)
    """
    embedders = {
        "BioBERT": get_embedding,
        "ClinicalBERT": get_embedding_CB,
        "BERT": get_embedding_Bert,
        "MedBERT": get_embedding_MB,
    }
    if model not in embedders:
        raise ValueError(f"Unknown model: {model!r}; expected one of {sorted(embedders)}")

    # Column name -> flag, in the order the texts are concatenated.
    flags = {
        "guideline": guideline,
        "feature_name": feature_name,
        "feature_context": feature_context,
    }
    if any(flag not in ("Yes", "No") for flag in flags.values()):
        raise ValueError("Invalid combination of guideline, feature_name, and feature_context.")

    text_columns = [column for column, flag in flags.items() if flag == "Yes"]
    embed = embedders[model]

    df = pd.read_csv("Data/Guidelines/guidelines_with_features_resampled2_bin5.csv")
    embedding = df.apply(
        lambda x: embed(" ".join(x[column] for column in text_columns), x["feature_value"]),
        axis=1,
    )

    output_path = f"Data/embeddings/{model}_4/{model}_{guideline}_{feature_name}_{feature_context}.csv"
    save_embeddings(df, embedding, output_path)
    X_train, X_test, y_train, y_test = ready_for_test(output_path)
    results = bootstrap_metrics(X_train, X_test, y_train, y_test, threshold=threshold, n_bootstrap=n_bootstrap)
    add_to_table(model, embeddings, guideline, feature_name, feature_context, results)


In [None]:
# Baseline run: logistic regression on the tabular marksheet columns alone (no text
# embeddings), evaluated with bootstrap_metrics and appended to the results table.
df_marksheet = pd.read_csv("Data/Preprocessed_marksheet_all.csv")

# Target label.
y = df_marksheet["case_csPCa"].values
# Every column that is not dropped here becomes a model feature.
# NOTE(review): presumably that leaves patient_age, psa, psad and prostate_volume (the
# columns the other cells drop in addition to these) — confirm against the CSV header.
X = df_marksheet.drop(columns=["study_id", "mri_date", "histopath_type", "lesion_GS", "lesion_ISUP", "case_ISUP", "center", "patient_id", "case_csPCa"]).values
# Rebuilding the frame from raw values discards the column names (columns become 0..n-1).
X = pd.DataFrame(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
# The evaluation code receives the test split as pandas objects.
X_test = pd.DataFrame(X_test)
y_test = pd.Series(y_test)
bootstrap_results_model = bootstrap_metrics(X_train, X_test, y_train, y_test, threshold=0.535, n_bootstrap=1000)
# The positional labels fill the table columns Model, Embedding, Context, Feature Name
# and Feature Explanation, in that order (see summarize_results).
add_to_table("Clinical features", "No", "No", "No", "No", bootstrap_results_model)

In [None]:
# Baseline run: logistic regression on the radiomic feature table alone (no text
# embeddings), evaluated with bootstrap_metrics and appended to the results table.
df_features = pd.read_csv("Data/Radiomic_features/Radiomic_features.csv")
df_marksheet = pd.read_csv("Data/Preprocessed_marksheet_all.csv")

# Remove clinical and metadata columns from the marksheet before joining.
# NOTE(review): this presumably leaves only patient_id and case_csPCa — confirm
# against the CSV header.
df_csPCa = df_marksheet.drop(columns=["study_id", "mri_date", "patient_age", "psa", "psad", "prostate_volume", "histopath_type", "lesion_GS", "lesion_ISUP", "case_ISUP", "center"])
# NOTE(review): "Unnamed: 0" looks like a row index written by an earlier to_csv — verify.
df_features.drop(columns=["Unnamed: 0"], inplace=True)

# Left join on patient_id followed by dropna() removes every row with a missing value,
# which includes patients that have no row in the feature table.
merge_df = pd.merge(df_csPCa, df_features, on="patient_id", how="left")
merge_df = merge_df.dropna()
y = merge_df["case_csPCa"].values
merge_df = merge_df.drop(columns=["patient_id", "case_csPCa"])
# Rebuild the frame from raw values: keeps the column names, resets the index to 0..n-1.
X_raw = merge_df.values
df_X = pd.DataFrame(X_raw, columns=merge_df.columns)
X_train, X_test, y_train, y_test = train_test_split(df_X, y, test_size=0.3, random_state=42, stratify=y)

# The evaluation code receives the test split as pandas objects.
X_test = pd.DataFrame(X_test)
y_test = pd.Series(y_test)
bootstrap_results_model = bootstrap_metrics(X_train, X_test, y_train, y_test, n_bootstrap=1000, threshold=0.535)
# The positional labels fill the table columns Model, Embedding, Context, Feature Name
# and Feature Explanation, in that order (see summarize_results).
add_to_table("Radiomic features", "No", "No", "No", "No", bootstrap_results_model)

In [117]:
# Run BioBERT over the full 2x2x2 grid of text-input switches
# (guideline / feature name / feature context). The innermost flag varies
# fastest, which reproduces the order Yes-Yes-Yes ... No-No-No.
for use_guideline in ("Yes", "No"):
    for use_feature_name in ("Yes", "No"):
        for use_feature_context in ("Yes", "No"):
            process("BioBERT", "Yes", use_guideline, use_feature_name, use_feature_context)

Created directory: Data/embeddings/BioBERT_4
Embedding length distribution: combined_embedding
769    772
Name: count, dtype: int64
Embeddings saved to Data/embeddings/BioBERT_4/BioBERT_Yes_Yes_Yes.csv
Embedding length distribution: combined_embedding
769    772
Name: count, dtype: int64
Embeddings saved to Data/embeddings/BioBERT_4/BioBERT_Yes_Yes_No.csv
Embedding length distribution: combined_embedding
769    772
Name: count, dtype: int64
Embeddings saved to Data/embeddings/BioBERT_4/BioBERT_Yes_No_Yes.csv
Embedding length distribution: combined_embedding
769    772
Name: count, dtype: int64
Embeddings saved to Data/embeddings/BioBERT_4/BioBERT_Yes_No_No.csv
Embedding length distribution: combined_embedding
769    772
Name: count, dtype: int64
Embeddings saved to Data/embeddings/BioBERT_4/BioBERT_No_Yes_Yes.csv
Embedding length distribution: combined_embedding
769    772
Name: count, dtype: int64
Embeddings saved to Data/embeddings/BioBERT_4/BioBERT_No_Yes_No.csv
Embedding length dis

In [118]:
# Run BERT over the full 2x2x2 grid of text-input switches
# (guideline / feature name / feature context). The innermost flag varies
# fastest, which reproduces the order Yes-Yes-Yes ... No-No-No.
for use_guideline in ("Yes", "No"):
    for use_feature_name in ("Yes", "No"):
        for use_feature_context in ("Yes", "No"):
            process("BERT", "Yes", use_guideline, use_feature_name, use_feature_context)

Created directory: Data/embeddings/BERT_4
Embedding length distribution: combined_embedding
769    772
Name: count, dtype: int64
Embeddings saved to Data/embeddings/BERT_4/BERT_Yes_Yes_Yes.csv
Embedding length distribution: combined_embedding
769    772
Name: count, dtype: int64
Embeddings saved to Data/embeddings/BERT_4/BERT_Yes_Yes_No.csv
Embedding length distribution: combined_embedding
769    772
Name: count, dtype: int64
Embeddings saved to Data/embeddings/BERT_4/BERT_Yes_No_Yes.csv
Embedding length distribution: combined_embedding
769    772
Name: count, dtype: int64
Embeddings saved to Data/embeddings/BERT_4/BERT_Yes_No_No.csv
Embedding length distribution: combined_embedding
769    772
Name: count, dtype: int64
Embeddings saved to Data/embeddings/BERT_4/BERT_No_Yes_Yes.csv
Embedding length distribution: combined_embedding
769    772
Name: count, dtype: int64
Embeddings saved to Data/embeddings/BERT_4/BERT_No_Yes_No.csv
Embedding length distribution: combined_embedding
769    77

In [119]:
# Run ClinicalBERT over the full 2x2x2 grid of text-input switches
# (guideline / feature name / feature context). The innermost flag varies
# fastest, which reproduces the order Yes-Yes-Yes ... No-No-No.
for use_guideline in ("Yes", "No"):
    for use_feature_name in ("Yes", "No"):
        for use_feature_context in ("Yes", "No"):
            process("ClinicalBERT", "Yes", use_guideline, use_feature_name, use_feature_context)

Created directory: Data/embeddings/ClinicalBERT_4
Embedding length distribution: combined_embedding
769    772
Name: count, dtype: int64
Embeddings saved to Data/embeddings/ClinicalBERT_4/ClinicalBERT_Yes_Yes_Yes.csv
Embedding length distribution: combined_embedding
769    772
Name: count, dtype: int64
Embeddings saved to Data/embeddings/ClinicalBERT_4/ClinicalBERT_Yes_Yes_No.csv
Embedding length distribution: combined_embedding
769    772
Name: count, dtype: int64
Embeddings saved to Data/embeddings/ClinicalBERT_4/ClinicalBERT_Yes_No_Yes.csv
Embedding length distribution: combined_embedding
769    772
Name: count, dtype: int64
Embeddings saved to Data/embeddings/ClinicalBERT_4/ClinicalBERT_Yes_No_No.csv
Embedding length distribution: combined_embedding
769    772
Name: count, dtype: int64
Embeddings saved to Data/embeddings/ClinicalBERT_4/ClinicalBERT_No_Yes_Yes.csv
Embedding length distribution: combined_embedding
769    772
Name: count, dtype: int64
Embeddings saved to Data/embeddin

In [116]:
# Run MedBERT over the full 2x2x2 grid of text-input switches
# (guideline / feature name / feature context). The innermost flag varies
# fastest, which reproduces the order Yes-Yes-Yes ... No-No-No.
for use_guideline in ("Yes", "No"):
    for use_feature_name in ("Yes", "No"):
        for use_feature_context in ("Yes", "No"):
            process("MedBERT", "Yes", use_guideline, use_feature_name, use_feature_context)

Embedding length distribution: combined_embedding
769    772
Name: count, dtype: int64
Embeddings saved to Data/embeddings/MedBERT_4/MedBERT_No_Yes_Yes.csv
Embedding length distribution: combined_embedding
769    772
Name: count, dtype: int64
Embeddings saved to Data/embeddings/MedBERT_4/MedBERT_No_Yes_No.csv
Embedding length distribution: combined_embedding
769    772
Name: count, dtype: int64
Embeddings saved to Data/embeddings/MedBERT_4/MedBERT_No_No_Yes.csv
Embedding length distribution: combined_embedding
769    772
Name: count, dtype: int64
Embeddings saved to Data/embeddings/MedBERT_4/MedBERT_No_No_No.csv


In [36]:
# Single run with all three text switches off, i.e. the embedded text is the empty string.
# NOTE(review): the recorded output below names Data/embeddings/BioBERT_3 and an
# embedding length of 14611, while process() as defined in this notebook writes to
# {model}_4 and the other runs report length 769 — the output looks stale; re-run to confirm.
process("BioBERT", "Yes", "No", "No", "No")

Embedding length distribution: combined_embedding
14611    1324
Name: count, dtype: int64
Embeddings saved to Data/embeddings/BioBERT_3/BioBERT_No_No_No.csv


In [77]:
# BioBERT run on the MaxAbsScaler variant of the feature table: embed
# guideline + feature name + feature context per row, save per-patient embeddings,
# evaluate, and append the result to the table.
# NOTE(review): the input is read from the working directory, unlike the Data/... paths
# used elsewhere — confirm the file location.
df = pd.read_csv("guidelines_scaled_features_MaxAbsScaler.csv")
emb = df.apply(lambda x: get_embedding(x["guideline"] + " " + x["feature_name"] + " " + x["feature_context"], x["feature_value"]), axis=1)
save_embeddings(df, emb, "Data/embeddings/BioBERT_3/BioBERT_MaxAbs.csv")
X_train, X_test, y_train, y_test = ready_for_test(f"Data/embeddings/BioBERT_3/BioBERT_MaxAbs.csv")
results = bootstrap_metrics(X_train, X_test, y_train, y_test, threshold=0.535, n_bootstrap=1000)
# The variant tag "Yes_MaxAbs" ends up in the "Feature Explanation" column of the table.
add_to_table("BioBERT", "Yes", "Yes", "Yes", "Yes_MaxAbs", results)

Embedding length distribution: combined_embedding
14611    1324
Name: count, dtype: int64
Embeddings saved to Data/embeddings/BioBERT_3/BioBERT_MaxAbs.csv


In [78]:
# BioBERT run on the MinMaxScaler variant of the feature table: embed
# guideline + feature name + feature context per row, save per-patient embeddings,
# evaluate, and append the result to the table.
# NOTE(review): the input is read from the working directory, unlike the Data/... paths
# used elsewhere — confirm the file location.
df = pd.read_csv("guidelines_scaled_features_MinMaxScaler.csv")
emb = df.apply(lambda x: get_embedding(x["guideline"] + " " + x["feature_name"] + " " + x["feature_context"], x["feature_value"]), axis=1)
save_embeddings(df, emb, "Data/embeddings/BioBERT_3/BioBERT_MinMax.csv")
X_train, X_test, y_train, y_test = ready_for_test(f"Data/embeddings/BioBERT_3/BioBERT_MinMax.csv")
results = bootstrap_metrics(X_train, X_test, y_train, y_test, threshold=0.535, n_bootstrap=1000)
# The variant tag "Yes_MinMax" ends up in the "Feature Explanation" column of the table.
add_to_table("BioBERT", "Yes", "Yes", "Yes", "Yes_MinMax", results)

Embedding length distribution: combined_embedding
14611    1324
Name: count, dtype: int64
Embeddings saved to Data/embeddings/BioBERT_3/BioBERT_MinMax.csv


In [79]:
# BioBERT run on the RobustScaler variant of the feature table: embed
# guideline + feature name + feature context per row, save per-patient embeddings,
# evaluate, and append the result to the table.
# NOTE(review): the input is read from the working directory, unlike the Data/... paths
# used elsewhere — confirm the file location.
df = pd.read_csv("guidelines_scaled_features_RobustScaler.csv")
emb = df.apply(lambda x: get_embedding(x["guideline"] + " " + x["feature_name"] + " " + x["feature_context"], x["feature_value"]), axis=1)
save_embeddings(df, emb, "Data/embeddings/BioBERT_3/BioBERT_Robust.csv")
X_train, X_test, y_train, y_test = ready_for_test(f"Data/embeddings/BioBERT_3/BioBERT_Robust.csv")
results = bootstrap_metrics(X_train, X_test, y_train, y_test, threshold=0.535, n_bootstrap=1000)
# The variant tag "Yes_Robust" ends up in the "Feature Explanation" column of the table.
add_to_table("BioBERT", "Yes", "Yes", "Yes", "Yes_Robust", results)

Embedding length distribution: combined_embedding
14611    1324
Name: count, dtype: int64
Embeddings saved to Data/embeddings/BioBERT_3/BioBERT_Robust.csv


In [80]:
# BioBERT run on the StandardScaler variant of the feature table: embed
# guideline + feature name + feature context per row, save per-patient embeddings,
# evaluate, and append the result to the table.
# NOTE(review): the input is read from the working directory, unlike the Data/... paths
# used elsewhere — confirm the file location.
df = pd.read_csv("guidelines_scaled_features_StandardScaler.csv")
emb = df.apply(lambda x: get_embedding(x["guideline"] + " " + x["feature_name"] + " " + x["feature_context"], x["feature_value"]), axis=1)
save_embeddings(df, emb, "Data/embeddings/BioBERT_3/BioBERT_Standard.csv")
X_train, X_test, y_train, y_test = ready_for_test(f"Data/embeddings/BioBERT_3/BioBERT_Standard.csv")
results = bootstrap_metrics(X_train, X_test, y_train, y_test, threshold=0.535, n_bootstrap=1000)
# The variant tag "Yes_Standard" ends up in the "Feature Explanation" column of the table.
add_to_table("BioBERT", "Yes", "Yes", "Yes", "Yes_Standard", results)

Embedding length distribution: combined_embedding
14611    1324
Name: count, dtype: int64
Embeddings saved to Data/embeddings/BioBERT_3/BioBERT_Standard.csv


In [81]:
# BioBERT run on the "rounded" variant of the feature table: embed
# guideline + feature name + feature context per row, save per-patient embeddings,
# evaluate, and append the result to the table.
# NOTE(review): the input is read from the working directory, unlike the Data/... paths
# used elsewhere — confirm the file location.
df = pd.read_csv("guidelines_features_rounded.csv")
emb = df.apply(lambda x: get_embedding(x["guideline"] + " " + x["feature_name"] + " " + x["feature_context"], x["feature_value"]), axis=1)
save_embeddings(df, emb, "Data/embeddings/BioBERT_3/BioBERT_rounded.csv")
X_train, X_test, y_train, y_test = ready_for_test(f"Data/embeddings/BioBERT_3/BioBERT_rounded.csv")
results = bootstrap_metrics(X_train, X_test, y_train, y_test, threshold=0.535, n_bootstrap=1000)
# The variant tag "Yes_Rounded" ends up in the "Feature Explanation" column of the table.
add_to_table("BioBERT", "Yes", "Yes", "Yes", "Yes_Rounded", results)

Embedding length distribution: combined_embedding
14611    1324
Name: count, dtype: int64
Embeddings saved to Data/embeddings/BioBERT_3/BioBERT_rounded.csv


In [99]:
# Baseline run on the feature table read from removed_low_correlation_features_bin5.csv
# (no text embeddings), evaluated with bootstrap_metrics and appended to the results table.
df = pd.read_csv("Data/removed_low_correlation_features_bin5.csv")
df_marksheet = pd.read_csv("Data/Preprocessed_marksheet_all.csv")

# Remove clinical and metadata columns from the marksheet before joining.
# NOTE(review): this presumably leaves only patient_id and case_csPCa — confirm
# against the CSV header.
df_csPCa = df_marksheet.drop(columns=["study_id", "mri_date", "patient_age", "psa", "psad", "prostate_volume", "histopath_type", "lesion_GS", "lesion_ISUP", "case_ISUP", "center"])

# Left join on patient_id followed by dropna() removes every row with a missing value,
# which includes patients that have no row in the feature table.
merge_df = pd.merge(df_csPCa, df, on="patient_id", how="left")
merge_df = merge_df.dropna()
y = merge_df["case_csPCa"].values
merge_df = merge_df.drop(columns=["patient_id", "case_csPCa"])
# Rebuild the frame from raw values: keeps the column names, resets the index to 0..n-1.
X_raw = merge_df.values
df_X = pd.DataFrame(X_raw, columns=merge_df.columns)
X_train, X_test, y_train, y_test = train_test_split(df_X, y, test_size=0.3, random_state=42, stratify=y)

# The evaluation code receives the test split as pandas objects.
X_test = pd.DataFrame(X_test)
y_test = pd.Series(y_test)
bootstrap_results_model = bootstrap_metrics(X_train, X_test, y_train, y_test, n_bootstrap=1000, threshold=0.535)
# The positional labels fill the table columns Model, Embedding, Context, Feature Name
# and Feature Explanation, in that order (see summarize_results).
add_to_table("Radiomic features_Removedlowcorr", "No", "No", "No", "No", bootstrap_results_model)

In [102]:
# Baseline run on the MaxAbsScaler-scaled radiomic feature table (no text embeddings),
# evaluated with bootstrap_metrics and appended to the results table.
# NOTE(review): the input is read from the working directory, unlike the Data/... paths
# used elsewhere — confirm the file location.
df = pd.read_csv("scaled_radiomic_features_MaxAbsScaler.csv")
df_marksheet = pd.read_csv("Data/Preprocessed_marksheet_all.csv")

# Remove clinical and metadata columns from the marksheet before joining.
df_csPCa = df_marksheet.drop(columns=["study_id", "mri_date", "patient_age", "psa", "psad", "prostate_volume", "histopath_type", "lesion_GS", "lesion_ISUP", "case_ISUP", "center"])

# Left join on patient_id followed by dropna() removes every row with a missing value,
# which includes patients that have no row in the feature table.
merge_df = pd.merge(df_csPCa, df, on="patient_id", how="left")
merge_df = merge_df.dropna()
y = merge_df["case_csPCa"].values
merge_df = merge_df.drop(columns=["patient_id", "case_csPCa"])
# Rebuild the frame from raw values: keeps the column names, resets the index to 0..n-1.
X_raw = merge_df.values
df_X = pd.DataFrame(X_raw, columns=merge_df.columns)
X_train, X_test, y_train, y_test = train_test_split(df_X, y, test_size=0.3, random_state=42, stratify=y)

# The evaluation code receives the test split as pandas objects.
X_test = pd.DataFrame(X_test)
y_test = pd.Series(y_test)
bootstrap_results_model = bootstrap_metrics(X_train, X_test, y_train, y_test, n_bootstrap=1000, threshold=0.535)
# Label the row after the input actually used. The previous label
# "Radiomic features_Removedlowcorr" was copied from the low-correlation run and made
# the two result rows indistinguishable in the table.
add_to_table("Radiomic features_MaxAbs", "No", "No", "No", "No", bootstrap_results_model)