In [None]:
# General Packages
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
import multiprocessing

# Keep this above the `transformers` import below: presumably the Hugging Face cache
# location (HF_HOME) has to be in the environment before the library loads — TODO confirm.
# The id is stripped because a trailing newline in the file would otherwise end up in the path,
# and the `with` block closes the file handle once the id has been read.
with open('../tokens/HPC_ACCOUNT_ID.txt', 'r') as account_id_file:
    os.environ['HF_HOME'] = '/scratch/' + account_id_file.read().strip()
import torch
from transformers import BertModel, BertTokenizer

# Sklearn
from sklearn.metrics import root_mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

# Packages for results/plotting
import ipywidgets as widgets
from IPython.display import display
import matplotlib.pyplot as plt
from matplotlib import rc

# Start from matplotlib's built-in defaults, then switch every font to monospace
plt.rcParams.update(plt.rcParamsDefault)
plt.rcParams.update({"font.family": "monospace"})


In [None]:
DATASET = 'cmcqrd' # 'usmle', 'bio' or 'cmcqrd'
RESULTS_DATASET = '../data/' + DATASET + '/preprocessed/combined_results'
TARGET_LABEL_COL_NAME = 'Correct_Answer_Rate' # "Correct_Answer_Rate" or "Difficulty" (latter only for cmcqrd) or "Response_Time"(for usmle only)
REPETITIONS = 10 # Number of repetitions for each experiment, average will be taken
EMBEDDINGS_MODEL = 'bert-base-uncased' # 'bert-base-uncased' or 'emilyalsentzer/Bio_ClinicalBERT'
# Read the Hugging Face access token from disk. strip() removes the trailing newline that text
# files usually end with (it would otherwise become part of the token), and the `with` block
# closes the file handle.
with open('../tokens/HF_TOKEN.txt', 'r') as hf_token_file:
    HF_TOKEN = hf_token_file.read().strip()
# Number of cores to use for sklearn's n_jobs parameter, whenever possible
NUM_OF_CORES_TO_USE = multiprocessing.cpu_count()
print("Using ", NUM_OF_CORES_TO_USE, " cores.")

# Suffixes of the per-model feature columns (see retrieve_models_uncertainties_col_names)
MODEL_NAMES = ['phi3_5-chat', 'Llama3_2-3b-chat', 'Qwen2_5-3b-chat', 'Llama3_1-8b-chat', 'Qwen2_5-14b-chat', 'Qwen2_5-32b-chat', 'Yi-34b-chat', 'Qwen2_5-72b-chat', 'Llama3_1-70b-chat']

# Number of most important features to print for the random forest model
NUMBER_OF_IMPORTANT_FEATURES_TO_PRINT = 10

# Selects which result CSVs are loaded: the '_fp_' files when True, the regular ones otherwise
FULL_PRECISION_MODELS = False
# Text columns that get embedded with the BERT model
TEXT_COLUMNS_TO_EMBED = ['question_with_options']
# Column names of the per-model uncertainty features

def retrieve_models_uncertainties_col_names(metric_names):
    """Build the '<metric>_<model>' column names for every metric and every model.

    Args:
        metric_names: iterable of metric prefixes, e.g. ['first_token_probability'].

    Returns:
        A list of strings ordered metric by metric, and within each metric in the
        order of MODEL_NAMES.
    """
    return [
        f'{metric}_{model}'
        for metric in metric_names
        for model in MODEL_NAMES
    ]

## Load the data and predefined splits

In [None]:
# Some text columns are parsed as numbers by pandas, so they are cast back to strings
def convert_cols_to_str(df, cols_names):
    """Replace each listed column of `df` with its element-wise `str()` version.

    The frame is modified in place and also returned, so the call can be chained.

    Args:
        df: DataFrame holding the columns to convert.
        cols_names: iterable of column names to convert.

    Returns:
        The same DataFrame object that was passed in.
    """
    for column_name in cols_names:
        stringified = df[column_name].apply(str)
        df[column_name] = stringified
    return df


# Pick the file suffixes of the predefined splits: the '_fp_' files hold the results of the
# full-precision models, the plain ones are used otherwise.
if FULL_PRECISION_MODELS:
    train_suffix, test_suffix = '_fp_train_set.csv', '_fp_test_set.csv'
else:
    train_suffix, test_suffix = '_train_set.csv', '_test_set.csv'

# NOTE(review): only the test CSVs are read with index_col=0 — confirm that the train CSVs
# really carry no index column.
train = convert_cols_to_str(pd.read_csv(RESULTS_DATASET + train_suffix), TEXT_COLUMNS_TO_EMBED)
test = convert_cols_to_str(pd.read_csv(RESULTS_DATASET + test_suffix, index_col=0), TEXT_COLUMNS_TO_EMBED)

# The same splits with the additional linguistic feature columns
linguistic_dir = '../data/' + DATASET + '/with_ling_features/'
train_with_linguistic = pd.read_csv(linguistic_dir + 'train.csv')
test_with_linguistic = pd.read_csv(linguistic_dir + 'test.csv', index_col=0)

# Report how many rows each split has and preview the training data
print("Train size: ", len(train))
print("Test size: ", len(test))
train.head()

# Calculate the Text Embeddings for the text data

In [None]:
def get_bert_embeddings(text, model, tokenizer):
    """Return the first-token ([CLS]) embedding of `text` as a NumPy array.

    Args:
        text: input handed to `tokenizer` (padded, and truncated to at most 256 tokens).
        model: callable accepting the tokenizer output as keyword arguments and returning
            an object with a `last_hidden_state` tensor.
        tokenizer: callable producing a mapping of PyTorch tensors.

    Returns:
        A 2-D array holding, for each sequence, the last hidden state at position 0.
    """
    # Inputs go to the GPU whenever one is available, otherwise they stay on the CPU
    target_device = 'cuda' if torch.cuda.is_available() else 'cpu'
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=256)
    model_inputs = {}
    for name, tensor in encoded.items():
        model_inputs[name] = tensor.to(target_device)

    # Pure inference: no gradients are needed
    with torch.no_grad():
        hidden_states = model(**model_inputs).last_hidden_state

    # Position 0 of every sequence is the [CLS] token
    return hidden_states[:, 0, :].cpu().numpy()

def extract_combined_embeddings(df, text_columns):
    """Embed every listed text column row by row and join the results column-wise.

    Relies on the notebook-level `bert_model` and `tokenizer` objects.

    Args:
        df: DataFrame containing the text columns.
        text_columns: names of the columns to embed.

    Returns:
        A 2-D array with one row per row of `df`; the embeddings of the individual
        columns are placed side by side in the order of `text_columns`.
    """
    per_column_embeddings = [
        # One embedding per cell, stacked into a (rows, hidden) matrix for this column
        np.vstack([get_bert_embeddings(cell_text, bert_model, tokenizer) for cell_text in df[column_name]])
        for column_name in text_columns
    ]
    # Concatenate the per-column matrices along the feature axis
    return np.hstack(per_column_embeddings)

In [None]:
# Load the tokenizer and the BERT encoder used to embed the text columns
tokenizer = BertTokenizer.from_pretrained(EMBEDDINGS_MODEL, token=HF_TOKEN)
bert_model = (
    BertModel.from_pretrained(EMBEDDINGS_MODEL, token=HF_TOKEN)
    # Evaluation mode switches off training-only behaviour such as dropout; it does not freeze
    # the weights by itself (gradients are disabled separately in get_bert_embeddings)
    .eval()
    # Run on the GPU when one is available, otherwise on the CPU
    .to('cuda' if torch.cuda.is_available() else 'cpu')
)

In [None]:
# Embed the configured text columns once per split; every experiment below reuses these arrays
text_embeddings_train, text_embeddings_test = (
    extract_combined_embeddings(split, TEXT_COLUMNS_TO_EMBED) for split in (train, test)
)

# Experiments

In [None]:
# Registry of all experiment results, keyed by the experiment description
GLOBAL_ALL_RESULTS = dict()

def test_random_forest(features_train, target_train, features_test, target_test, description, features_label_list):
    """Train a Random Forest REPETITIONS times and summarise test RMSE and feature importances.

    A fresh RandomForestRegressor is fitted in every repetition; the RMSE on the test data and
    the feature importances are then averaged over all repetitions.

    Args:
        features_train: 2-D array of training features.
        target_train: training targets.
        features_test: 2-D array of test features.
        target_test: test targets.
        description: experiment name, shown as the label of the progress bar.
        features_label_list: one label per feature column, in column order.

    Returns:
        A tuple (rmse_results_summary, top_features_and_importances):
        - rmse_results_summary: dict with 'rmse' (mean), 'std_dev' (sample standard deviation)
          and 'std_error', each rounded to 4 decimals.
        - top_features_and_importances: list of (label, average importance) pairs for the
          NUMBER_OF_IMPORTANT_FEATURES_TO_PRINT most important features, most important first.
    """
    if len(features_label_list) != features_train.shape[1]:
        # Labels are looked up by column position, so a length mismatch means the reported
        # feature names do not belong to the columns the model was trained on.
        import warnings
        warnings.warn(
            f"{description}: got {len(features_label_list)} feature labels for "
            f"{features_train.shape[1]} feature columns; reported feature names may be wrong."
        )

    all_rmses_for_model = []
    # Running sum of the importances, one entry per feature column
    feature_importances_sum = np.zeros(features_train.shape[1])

    for _ in tqdm(range(REPETITIONS), desc=description):
        # A new, unfitted model per repetition so that no state carries over
        model = RandomForestRegressor(n_jobs=NUM_OF_CORES_TO_USE)
        model.fit(features_train, target_train)
        predictions = model.predict(features_test)
        # scikit-learn documents the argument order as (y_true, y_pred)
        all_rmses_for_model.append(root_mean_squared_error(target_test, predictions))
        feature_importances_sum += model.feature_importances_

    # Statistics over the repetitions
    average_rmse_for_model = float(np.mean(all_rmses_for_model))  # Mean RMSE
    std_dev_rmse = float(np.std(all_rmses_for_model, ddof=1))  # Sample standard deviation
    std_error_rmse = float(std_dev_rmse / np.sqrt(REPETITIONS))  # Standard error

    rmse_results_summary = {
        'rmse': round(average_rmse_for_model, 4),
        'std_dev': round(std_dev_rmse, 4),
        'std_error': round(std_error_rmse, 4)
    }

    # Average the importances, then take the indices of the largest ones in descending order
    avg_feature_importances = feature_importances_sum / REPETITIONS
    indices = np.argsort(avg_feature_importances)[-NUMBER_OF_IMPORTANT_FEATURES_TO_PRINT:][::-1]
    top_features_and_importances = [(features_label_list[i], float(round(avg_feature_importances[i], 4))) for i in indices]

    return rmse_results_summary, top_features_and_importances

## Only BERT Embeddings

In [None]:
# Baseline experiment: the text embeddings are the only predictors
description = "Only BERT embeddings"

X_train, X_test = text_embeddings_train, text_embeddings_test
y_train, y_test = train[TARGET_LABEL_COL_NAME].values, test[TARGET_LABEL_COL_NAME].values

# Standardise both splits with statistics fitted on the training split only
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# One label per embedding dimension
features_label_list = [f'EMB_{i}' for i in range(X_train.shape[1])]

GLOBAL_ALL_RESULTS[description] = test_random_forest(X_train_scaled, y_train, X_test_scaled, y_test, description, features_label_list)

## BERT Embeddings & 1st Token Probability

In [None]:
# Experiment: text embeddings plus every model's first-token probability
description = "BERT Embeddings and All Models' 1st Token Probabilities"

uncertainty_columns = retrieve_models_uncertainties_col_names(['first_token_probability'])
train_uncertainties = train[uncertainty_columns].values
test_uncertainties = test[uncertainty_columns].values

# Feature matrix layout: [embeddings | uncertainties]
X_train = np.hstack([text_embeddings_train, train_uncertainties])
X_test = np.hstack([text_embeddings_test, test_uncertainties])
y_train, y_test = train[TARGET_LABEL_COL_NAME].values, test[TARGET_LABEL_COL_NAME].values

# Standardise both splits with statistics fitted on the training split only
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Labels follow the column layout of the feature matrix
features_label_list = [f'EMB_{i}' for i in range(text_embeddings_train.shape[1])] + uncertainty_columns

GLOBAL_ALL_RESULTS[description] = test_random_forest(X_train_scaled, y_train, X_test_scaled, y_test, description, features_label_list)

## BERT Embeddings & Choice Order Probability

In [None]:
# Experiment: text embeddings plus every model's choice-order probability
description = "BERT Embeddings and All Models' Choice Order Probabilities"

uncertainty_columns = retrieve_models_uncertainties_col_names(['order_probability'])
train_uncertainties = train[uncertainty_columns].values
test_uncertainties = test[uncertainty_columns].values

# Feature matrix layout: [embeddings | uncertainties]
X_train = np.hstack([text_embeddings_train, train_uncertainties])
X_test = np.hstack([text_embeddings_test, test_uncertainties])
y_train, y_test = train[TARGET_LABEL_COL_NAME].values, test[TARGET_LABEL_COL_NAME].values

# Standardise both splits with statistics fitted on the training split only
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Labels follow the column layout of the feature matrix
features_label_list = [f'EMB_{i}' for i in range(text_embeddings_train.shape[1])] + uncertainty_columns

GLOBAL_ALL_RESULTS[description] = test_random_forest(X_train_scaled, y_train, X_test_scaled, y_test, description, features_label_list)

## BERT Embeddings, 1st Token Probability and Choice Order Probability

In [None]:
# Experiment: text embeddings plus both uncertainty metrics of every model
description = "BERT Embeddings and All Models' Uncertainties"

uncertainty_columns = retrieve_models_uncertainties_col_names(['first_token_probability', 'order_probability'])
train_uncertainties = train[uncertainty_columns].values
test_uncertainties = test[uncertainty_columns].values

# Feature matrix layout: [embeddings | uncertainties]
X_train = np.hstack([text_embeddings_train, train_uncertainties])
X_test = np.hstack([text_embeddings_test, test_uncertainties])
y_train, y_test = train[TARGET_LABEL_COL_NAME].values, test[TARGET_LABEL_COL_NAME].values

# Standardise both splits with statistics fitted on the training split only
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Labels follow the column layout of the feature matrix
features_label_list = [f'EMB_{i}' for i in range(text_embeddings_train.shape[1])] + uncertainty_columns

GLOBAL_ALL_RESULTS[description] = test_random_forest(X_train_scaled, y_train, X_test_scaled, y_test, description, features_label_list)

## BERT Embeddings, 1st Token Probability, Choice Order Probability and all-MiniLM-L6-v2 Choice Similarity

In [None]:
# Experiment: text embeddings, both uncertainty metrics and the choices similarity
description = "BERT Embeddings and All Models' Uncertainties and Choices Similarity"

uncertainty_columns = retrieve_models_uncertainties_col_names(['first_token_probability', 'order_probability'])
train_uncertainties = train[uncertainty_columns].values
test_uncertainties = test[uncertainty_columns].values

# Feature matrix layout: [embeddings | uncertainties | choices similarity]
# (the similarity column is reshaped to a single-column 2-D array so it can be stacked)
X_train = np.hstack([
    text_embeddings_train,
    train_uncertainties,
    train['choices_similarity'].values.reshape(-1, 1),
])
X_test = np.hstack([
    text_embeddings_test,
    test_uncertainties,
    test['choices_similarity'].values.reshape(-1, 1),
])
y_train, y_test = train[TARGET_LABEL_COL_NAME].values, test[TARGET_LABEL_COL_NAME].values

# Standardise both splits with statistics fitted on the training split only
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Labels follow the column layout of the feature matrix
features_label_list = (
    [f'EMB_{i}' for i in range(text_embeddings_train.shape[1])]
    + uncertainty_columns
    + ['Choices Similarity']
)

GLOBAL_ALL_RESULTS[description] = test_random_forest(X_train_scaled, y_train, X_test_scaled, y_test, description, features_label_list)


## Only Medical Choice Similarity

In [None]:
# Experiment: the clinical choices similarity is the only predictor
# (reshaped to a single-column 2-D array, as expected by the scaler and the model)
X_train = train['choices_similarity_clinical'].values.reshape(-1, 1)
X_test = test['choices_similarity_clinical'].values.reshape(-1, 1)

y_train = train[TARGET_LABEL_COL_NAME].values
y_test = test[TARGET_LABEL_COL_NAME].values

# Standardise both splits with statistics fitted on the training split only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# The feature matrix has exactly one column, so there is exactly one label. Listing the
# embedding and uncertainty labels here would report the similarity feature as 'EMB_0'.
features_label_list = ['Choices Similarity (Clinical)']

# The description is the key in GLOBAL_ALL_RESULTS and must be unique per experiment,
# otherwise a later experiment with the same description overwrites this result.
description = "Only (clinical) Choices Similarity"
GLOBAL_ALL_RESULTS[description] = test_random_forest(X_train_scaled, y_train, X_test_scaled, y_test, description, features_label_list)

## BERT Embeddings and All Models' Uncertainties and (clinical) Choices Similarity

In [None]:
# Experiment: text embeddings, both uncertainty metrics and the clinical choices similarity
description = "BERT Embeddings and All Models' Uncertainties and (clinical) Choices Similarity"

uncertainty_columns = retrieve_models_uncertainties_col_names(['first_token_probability', 'order_probability'])
train_uncertainties = train[uncertainty_columns].values
test_uncertainties = test[uncertainty_columns].values

# Feature matrix layout: [embeddings | uncertainties | clinical choices similarity]
# (the similarity column is reshaped to a single-column 2-D array so it can be stacked)
X_train = np.hstack([
    text_embeddings_train,
    train_uncertainties,
    train['choices_similarity_clinical'].values.reshape(-1, 1),
])
X_test = np.hstack([
    text_embeddings_test,
    test_uncertainties,
    test['choices_similarity_clinical'].values.reshape(-1, 1),
])
y_train, y_test = train[TARGET_LABEL_COL_NAME].values, test[TARGET_LABEL_COL_NAME].values

# Standardise both splits with statistics fitted on the training split only
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Labels follow the column layout of the feature matrix
features_label_list = (
    [f'EMB_{i}' for i in range(text_embeddings_train.shape[1])]
    + uncertainty_columns
    + ['Choices Similarity (Clinical)']
)

GLOBAL_ALL_RESULTS[description] = test_random_forest(X_train_scaled, y_train, X_test_scaled, y_test, description, features_label_list)

## BERT Embeddings, Both Uncertainties & Both Similarities

In [None]:
# Experiment: text embeddings, both uncertainty metrics and both choices similarities
description = "BERT Embeddings, Both Uncertainties & Both similarities"

uncertainty_columns = retrieve_models_uncertainties_col_names(['first_token_probability', 'order_probability'])
train_uncertainties = train[uncertainty_columns].values
test_uncertainties = test[uncertainty_columns].values

# Feature matrix layout: [embeddings | uncertainties | clinical similarity | similarity]
# (each similarity column is reshaped to a single-column 2-D array so it can be stacked)
X_train = np.hstack([
    text_embeddings_train,
    train_uncertainties,
    train['choices_similarity_clinical'].values.reshape(-1, 1),
    train['choices_similarity'].values.reshape(-1, 1),
])
X_test = np.hstack([
    text_embeddings_test,
    test_uncertainties,
    test['choices_similarity_clinical'].values.reshape(-1, 1),
    test['choices_similarity'].values.reshape(-1, 1),
])
y_train, y_test = train[TARGET_LABEL_COL_NAME].values, test[TARGET_LABEL_COL_NAME].values

# Standardise both splits with statistics fitted on the training split only
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Labels follow the column layout of the feature matrix
features_label_list = (
    [f'EMB_{i}' for i in range(text_embeddings_train.shape[1])]
    + uncertainty_columns
    + ['Choices Similarity Clinical', 'Choices Similarity']
)

GLOBAL_ALL_RESULTS[description] = test_random_forest(X_train_scaled, y_train, X_test_scaled, y_test, description, features_label_list)

## BERT Embeddings, Both Uncertainties, Both Similarities & Linguistic Features

In [None]:
# Experiment: text embeddings, both uncertainty metrics, linguistic features and both similarities
description = "BERT Embeddings, Both Uncertainties, both similarities & Linguistic Features"

uncertainty_columns = retrieve_models_uncertainties_col_names(['first_token_probability', 'order_probability'])
train_uncertainties = train[uncertainty_columns].values
test_uncertainties = test[uncertainty_columns].values

# Columns taken from the frames that carry the linguistic features
linguistic_features = [
    "Word_Count", "Word_Count_No_stop_words", "Avg_Word_Length",
    "Sentence_Count", "Avg_Sent_Length_in_Words",
    "Noun_Count", "Verb_Count", "Adjective_Count", "Adverb_Count",
    "Number_of_NPs", "Number_of_PPs", "Number_of_VPs",
    "Temporal_Connectives_Count", "Causal_Connectives_Count", "Exemplifying_Connectives_Count",
    "Additive_Connectives_Count", "Contrastive_Connectives_Count",
]

# Feature matrix layout: [embeddings | uncertainties | linguistic | clinical similarity | similarity]
# NOTE(review): rows are combined by position, which assumes train/test and their
# *_with_linguistic counterparts list the same items in the same order — confirm.
X_train = np.hstack([
    text_embeddings_train,
    train_uncertainties,
    train_with_linguistic[linguistic_features],
    train['choices_similarity_clinical'].values.reshape(-1, 1),
    train['choices_similarity'].values.reshape(-1, 1),
])
X_test = np.hstack([
    text_embeddings_test,
    test_uncertainties,
    test_with_linguistic[linguistic_features],
    test['choices_similarity_clinical'].values.reshape(-1, 1),
    test['choices_similarity'].values.reshape(-1, 1),
])
y_train, y_test = train[TARGET_LABEL_COL_NAME].values, test[TARGET_LABEL_COL_NAME].values

# Standardise both splits with statistics fitted on the training split only
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Labels follow the column layout of the feature matrix
features_label_list = (
    [f'EMB_{i}' for i in range(text_embeddings_train.shape[1])]
    + uncertainty_columns
    + linguistic_features
    + ['Choices Similarity Clinical', 'Choices Similarity']
)

GLOBAL_ALL_RESULTS[description] = test_random_forest(X_train_scaled, y_train, X_test_scaled, y_test, description, features_label_list)

# Results RMSE Overview and Top Features

In [None]:
# Now we filter results to only the ones we are interested in
checkboxes = [widgets.Checkbox(value=True, description=label, layout=widgets.Layout(
    width='1000px')) for label in GLOBAL_ALL_RESULTS]
output = widgets.VBox(children=checkboxes)
display(output)

In [None]:
# Experiments whose checkbox is still ticked in the widget cell above
selected_keys = [checkbox.description for checkbox in checkboxes if checkbox.value]

# Create the output folder up front; fig.savefig fails when the target directory is missing
plots_dir = f'plots/{DATASET}'
os.makedirs(plots_dir, exist_ok=True)

for key in selected_keys:
    rmse_summary, top_features = GLOBAL_ALL_RESULTS[key]
    print(key)
    print("RMSE: ", rmse_summary['rmse'], "±", rmse_summary['std_error'], " (STDEV: ", rmse_summary['std_dev'], ")")
    print("--------------------")
    ### Plotting ###
    plt.rcParams["font.family"] = "monospace"

    # Ascending order, because barh draws the first entry at the bottom: the most important
    # feature ends up at the top. sorted() works on a copy, so the stored results keep their order.
    features_by_importance = sorted(top_features, key=lambda x: x[1])
    feature_names = [name for name, _ in features_by_importance]
    importances = [importance for _, importance in features_by_importance]

    # Create the main plot
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.barh(feature_names, importances, color='#478058')

    # Set title and labels
    title_text = 'Feature Contributions to Model Performance\n' + '(' + key + ')' # title text
    fig.suptitle(title_text, ha='center', fontsize=20, y=1.01) # set title
    ax.set_xlabel('Feature Importance', fontsize=18) # x label

    # The feature names are written onto the plot instead of using y ticks: inside the bar
    # (white) when the bar is longer than half of the longest one, right after the bar (black)
    # otherwise. The x offsets are in data units and may need manual tuning.
    max_importance = max(importances)
    for i, (feature, importance) in enumerate(features_by_importance):
        if importance > max_importance/2:
            ax.text(0.01, i, feature, ha='left', va='center', fontsize=10, color='white')
        else:
            ax.text(0.0001+importance, i, feature, ha='left', va='center', fontsize=10, color='black') # might need to adjust the 0.0001

    ax.set_yticks([])
    # Save first, then show, then release the figure so that figures do not pile up in memory
    fig.savefig(f'{plots_dir}/{key}.png', dpi=200, bbox_inches='tight')
    plt.show()
    plt.close(fig)
