In this notebook, the goal is twofold:
1. Model the different covariates from the text only
2. Then model the outcome given the predicted covariates and compare this with a model built on the true covariates

An important consideration is that we want the split to be the same across all notebooks; we therefore save this information to stay consistent across all experiments.

In [None]:
import os
import numpy as np
import pandas as pd 

# Point the TRANSFORMERS_CACHE environment variable at a project directory.
# BE SURE TO NOT SAVE in your home otherwise will be blocked; adapt this path to your own machine.
os.environ['TRANSFORMERS_CACHE'] = '/projects/wangc/jeanselme/conda_env/'

In [None]:
# Load the merged outcomes table; the first CSV column becomes the index.
# Later cells read its `text`, `Hospital`, `grouping`, `type`, `gender`, `race` and
# `ajcc_pathologic_tumor_stage` columns.
outcomes  = pd.read_csv('data/TGCA_Merged.csv', index_col = 0)

### Extract target labels

In [None]:
# Covariates to recover from the text. `.copy()` makes this an independent table, so the
# column assignment below never writes to (or warns about) a slice of `outcomes`.
outcomes_to_predict = outcomes[['type', 'gender', 'race', 'ajcc_pathologic_tumor_stage']].copy()
# Cast the tumour stage to a categorical so that get_dummies one-hot encodes it
# (presumably it is stored as a number: the discretisation step casts it back to float).
outcomes_to_predict['ajcc_pathologic_tumor_stage'] = outcomes_to_predict['ajcc_pathologic_tumor_stage'].astype('category')
# dummy_na = True adds a `_nan` indicator column for every encoded variable.
outcomes_to_predict_dummy = pd.get_dummies(outcomes_to_predict, dummy_na = True).astype(int)

In [None]:
# Save the one-hot encoded targets to disk.
outcomes_to_predict_dummy.to_csv('data/binary_embedding.csv')

### Split the data 

We propose different evaluation procedures:
1. One hospital out evaluation: to evaluate how well the model generalises outside the cohort. To limit the number of splits, we compute this only for hospitals with more than 100 patients.
2. One cancer group out

In [None]:
# Turn the hospital and cancer-group labels into integer fold identifiers.
# 'Other' is mapped to NaN first; factorize codes missing values as -1, which is
# converted back to NaN so that those patients belong to no fold.
hospital_codes, _ = pd.factorize(outcomes.Hospital.replace({'Other': np.nan}))
grouping_codes, _ = pd.factorize(outcomes.grouping.replace({'Other': np.nan}))
split = pd.DataFrame({'Hospital': hospital_codes, 'Grouping': grouping_codes}, index = outcomes.index)
split = split.replace({-1: np.nan})

# Persist the fold assignment.
split.to_csv('results/split.csv')

Run one of the sections: Embedding, Fine-Tuning or Prompting.

-----------

# Embedding

Using embeddings extracted from an LLM, we try to predict the different covariates of interest.

### Open embedding

In [None]:
# Embedding to use for the notebook: selects which 'data/<embedding_type>_embedding.csv' file
# is loaded and names the file in which the discretised predictions are saved.
embedding_type = 'BERT' # BERT, clinicalBERT, gpt, gpt+framing

In [None]:
# Load the pre-computed embedding selected by `embedding_type`; the first CSV column becomes the index.
embedding = pd.read_csv(f'data/{embedding_type}_embedding.csv', index_col = 0)

In [None]:
# The embedding rows must describe the same patients, in the same order, as the outcomes table.
# Index.equals also handles indexes of different lengths, where an element-wise `==`
# would raise before the assertion message could be shown.
assert outcomes.index.equals(embedding.index), 'Misaligned index may create an issue - How is the embedding obtained?'

In [None]:
# Preview the first rows of the outcomes table.
outcomes.head()

In [None]:
# Preview the first rows of the embedding table.
embedding.head()

### Model the different covariates

We aim to predict each manually extracted covariate from the text.

Then we save these covariates for future predictions.

In [None]:
# For simplicity, we rely on a NN from sklearn for this task
from sklearn.neural_network import MLPClassifier

In [None]:
# Leave-one-group-out evaluation: for every split type (one column of `split`), each fold
# is held out in turn, a classifier is fitted on all remaining rows and its predicted
# probabilities are stored for the held-out rows.
# Rows without a fold (NaN) compare unequal to every fold value: they are part of every
# training set and never receive a prediction (their rows stay NaN).
predictions = {}
for split_type in split.columns:
    assignment = split[split_type]
    fold_predictions = pd.DataFrame().reindex_like(outcomes_to_predict_dummy)
    for fold in assignment.dropna().unique():
        test = assignment.values == fold
        train = assignment.values != fold

        # Empty `hidden_layer_sizes`: the network has no hidden layer.
        model = MLPClassifier(hidden_layer_sizes = [], random_state = 42,
                              learning_rate_init = 0.01, max_iter = 10,
                              early_stopping = True)
        model.fit(embedding[train].values, outcomes_to_predict_dummy[train].values)

        fold_predictions[test] = model.predict_proba(embedding[test].values)
    predictions[split_type] = fold_predictions

In [None]:
# Stack the per-split prediction tables into one DataFrame; the dictionary keys
# (split types) become the outer level of the row index.
predictions = pd.concat(predictions)

In [None]:
# Deliberate stop: halts execution here so that the following sections are not run by accident.
assert False, "Go to Section 'Discretise and save' to save this extraction."

--------

# Fine - Tuning

Be careful: this will overwrite the results of the previous Section (jump to the last Section to save and evaluate).
In this Section, we aim to fine-tune a neural network to extract the concepts of interest.

In [None]:
# Name passed to get_model to select the network to fine-tune; it also names the file
# in which the discretised predictions are saved.
embedding_type = 'BERT_finetune'

In [None]:
from transformers import Trainer, TrainingArguments
import torch

In [None]:
# Stop early when torch does not see a CUDA device.
assert torch.cuda.is_available(), 'Machine or configuration not using GPU'

In [None]:
def get_model(embedding):
    """Return the pre-trained tokenizer and classification model to fine-tune.

    Parameters
    ----------
    embedding : str
        Name of the model to load. Only 'BERT_finetune' is implemented.

    Returns
    -------
    tuple
        (tokenizer, model). The model is a multi-label sequence classifier with
        one output per column of `outcomes_to_predict_dummy`.

    Raises
    ------
    ValueError
        If `embedding` is not a supported name.
    """
    if embedding == 'BERT_finetune':
        # Imported lazily so that transformers is only required when this branch is used.
        from transformers import BertTokenizer, BertForSequenceClassification
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels = outcomes_to_predict_dummy.shape[1],
            output_attentions = False, output_hidden_states = False, problem_type="multi_label_classification")
        return tokenizer, model

    # Fail explicitly instead of hitting an UnboundLocalError on the return statement.
    raise ValueError("Unsupported embedding '{}': only 'BERT_finetune' is implemented.".format(embedding))

class Dataset(torch.utils.data.Dataset):
    """Torch dataset pairing tokenizer encodings with their label rows.

    `encodings` is a mapping from field name to a per-sample sequence (it is read
    through `.items()` and indexed by sample position); `labels` is any object
    exposing `.values`, of which one row is returned per sample.
    """

    def __init__(self, encodings, labels):
        self.labels = labels.values
        self.encodings = encodings

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the tensors of sample `idx`, with its labels cast to float under 'labels'."""
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx]).float()
        return sample

In [None]:
# Training configuration reused for every fold: Trainer outputs are written under 'results/'
# and each model is trained for 100 epochs.
training_args = TrainingArguments(output_dir = 'results/', num_train_epochs = 100)

In [None]:
# Leave-one-group-out evaluation of the fine-tuned model: for every split type, each fold
# is held out in turn, a fresh pre-trained model is fine-tuned on all remaining reports
# and its predictions are stored for the held-out reports.
# Rows without a fold (NaN) are part of every training set and are never predicted.
predictions = {}
for split_type in split.columns:
    predictions[split_type] = pd.DataFrame().reindex_like(outcomes_to_predict_dummy)
    for fold in split[split_type].dropna().unique():
        train = split[split_type].values != fold
        test = split[split_type].values == fold

        # Load model and encode data (a new model per fold, so no weights leak across folds)
        tokenizer, model = get_model(embedding_type)
        train_encoded = Dataset(tokenizer(outcomes[train].text.tolist(), truncation = True, padding = True), outcomes_to_predict_dummy[train])
        test_encoded = Dataset(tokenizer(outcomes[test].text.tolist(), truncation = True, padding = True), outcomes_to_predict_dummy[test])

        # Train model
        trainer = Trainer(model = model, args = training_args,
                          train_dataset = train_encoded)
        trainer.train()

        # Predict: the classifier outputs raw logits. As the problem is multi-label, a sigmoid
        # turns them into per-label probabilities, which is what the discretisation step
        # (threshold at 0.5) and the Embedding section work with.
        logits = trainer.predict(test_encoded).predictions
        predictions[split_type][test] = torch.sigmoid(torch.as_tensor(logits)).numpy()

In [None]:
# Stack the per-split prediction tables into one DataFrame; the dictionary keys
# (split types) become the outer level of the row index.
predictions = pd.concat(predictions)

In [None]:
# Deliberate stop: halts execution here so that the following sections are not run by accident.
assert False, "Go to Section 'Discretise and save' to save this extraction."

-----------

# Prompting

In [None]:
from transformers import pipeline

In [None]:
# Build a text-generation pipeline with the medalpaca-7b model and tokenizer.
# The weights take about 10 Gb and are expected to land in the author's cache folder
# (presumably the TRANSFORMERS_CACHE directory set at the top of the notebook - TODO confirm).
model = pipeline("text-generation", model="medalpaca/medalpaca-7b", tokenizer="medalpaca/medalpaca-7b")

In [None]:
# For each outcome column: (name used for it in the prompt, categories offered to the model).
# The cancer types are taken from the observed data; the other category lists are hard-coded.
# NOTE(review): the hard-coded lists may not cover every value present in the data - confirm.
outcomes_match = {
    "type": ("cancer_type", outcomes_to_predict["type"].dropna().unique()),
    "gender": ('sex', ['male', 'female']),
    "race": ('ethnicity', ['white', 'non-white']),
    "ajcc_pathologic_tumor_stage": ('Cancer stage', ['I', 'II', 'III'])
}

In [None]:
# Ask the language model for every outcome of every patient and store its textual answer.
# The table is created with object dtype because the answers are strings.
binarised_predictions = pd.DataFrame(index = outcomes_to_predict.index, columns = outcomes_to_predict.columns, dtype = object)
for outcome in outcomes_to_predict:
    name, values = outcomes_match[outcome]
    for patient in binarised_predictions.index:
        patient_text = outcomes.text[patient]
        prompt = "What is the {} for a patient with the following pathology report: '{}'. Respond with exactly one of the following categories: {}.".format(name, patient_text, values)
        # The pipeline returns a list with one dict per generated sequence; by default the
        # generated text starts with the prompt itself. Keep only the newly generated answer.
        # NOTE(review): the answer is free text and is not checked against `values`.
        generated = model(prompt, return_full_text = False)
        binarised_predictions.loc[patient, outcome] = generated[0]['generated_text'].strip()

In [None]:
# Save the answers produced by the prompting approach.
binarised_predictions.to_csv('data/alpaca_predicted_binary.csv')

In [None]:
# Deliberate stop: the cells below read `predictions`, which this section does not produce.
assert False, "You are done ! This method does not require binarisations."

-------

# Discretise and save

As the previous sections predict probabilities, we then discretise the outputs to match the concept of interest.

In [None]:
# Discretisation: one-hot encoded variables take the label of their highest-scoring column
# (argmax); the remaining, already binary, variables are thresholded at 0.5.
binarised_predictions = []
for column in outcomes_to_predict.columns:
    if column in ('ajcc_pathologic_tumor_stage', 'type'): # One-hot encoded variables (change for other dataset)
        # Select the dummy columns by exact prefix rather than by substring / regex match.
        prefix = column + '_'
        scores = predictions.loc[:, predictions.columns.str.startswith(prefix)]

        # Patients that were never part of a test fold have no score at all: keep them
        # missing instead of calling idxmax on an all-NaN row (deprecated in recent pandas).
        predicted = scores.notna().any(axis = 1).values
        pred_col = pd.Series(np.nan, index = predictions.index, dtype = object)
        if predicted.any():
            pred_col[predicted] = scores[predicted].idxmax(axis = 1).str[len(prefix):].values

        if column == 'ajcc_pathologic_tumor_stage': # Special float case
            pred_col = pred_col.astype(float)
    else: # All other binary variable
        # NOTE(review): a missing prediction (NaN) compares as False here.
        pred_col = predictions.loc[:, column] > 0.5

    binarised_predictions.append(pred_col.rename(column))
binarised_predictions = pd.concat(binarised_predictions, axis = 1)
binarised_predictions.head()

In [None]:
# Save the discretised predictions under a file name derived from `embedding_type`.
binarised_predictions.to_csv(f'data/{embedding_type}_predicted_binary.csv')

You have now saved the predicted binary outcome.

----------

### Measure performance of the extraction

This section allows you to check the performance for extracting the concepts of interest. It may be that the text does not contain the information or any indicator of the concept you aim to extract.

In [None]:
from sklearn.metrics import roc_auc_score

In [None]:
# ROC AUC of every predicted dimension, computed separately on each held-out fold of each
# split type. A cell is left empty when the fold contains a single class for that dimension.
performance = {}
for split_type in split.columns:
    folds = split[split_type].dropna().unique()
    scores = pd.DataFrame(index = folds, columns = predictions.columns)
    for fold in folds:
        # Patients held out in this fold, and their labels in the stacked predictions index
        test = split[split_type] == fold
        test_patients = test[test].index
        for dimension in predictions.columns:
            truth = outcomes_to_predict_dummy.loc[test, dimension]
            mean = truth.mean()
            if not (mean == 1 or mean == 0):
                # Both classes are present in the fold
                scores.loc[fold, dimension] = roc_auc_score(truth, predictions.loc[(split_type, test_patients), dimension])
    performance[split_type] = scores
performance = pd.concat(performance)

In [None]:
# Summary statistics of the per-fold scores for the hospital split.
performance.loc['Hospital'].astype(float).describe()

In [None]:
# Summary statistics of the per-fold scores for the cancer-group split.
performance.loc['Grouping'].astype(float).describe()