In this notebook, the goal is twofold:
1. Model the different covariates from the text only
2. Then model the outcome given the predicted covariates and compare this with a model built on the true covariates

An important consideration is that we want the split to be the same across all notebooks; we therefore save this information to stay consistent across all experiments.

In [None]:
import os
import re

import numpy as np
import pandas as pd
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

In [None]:
# Load the merged outcome table; the first CSV column becomes the row index.
# Later cells read its `t`, `e` and `text` columns.
outcomes = pd.read_csv(
    'data/TGCA_Merged.csv',
    index_col=0,
)

In [None]:
# Time horizons at which predictions are produced; the same list is passed to the
# DeepHit model as `splits` and used as the columns of the prediction tables.
# Values are in years: check that `outcomes.t` is expressed in the same unit.
# The list is kept short so results can be compared with the LLM prompting section.
predictions_horizons = [1, 3, 5] # years

### Split the data 

Reopen the split used for LLM1

In [None]:
# Reload the saved fold assignment so every experiment shares the same split.
# Each column is one split type; each value is the fold a row belongs to.
split = pd.read_csv(
    'results/split.csv',
    index_col=0,
)

Run one of the following sections: Extracted Concepts, Embedding, Fine-Tuning or Prompting.

------

# Extracted concepts 

From the previously extracted concepts, we aim to predict the survival outcome.
1. Reopen the previous predictions
2. Build a DeepHit model

### Open concept

In [None]:
# Concepts used for predictions.
# Together these two values select the input file data/<model_type>_<concept_type>.csv
# and name the output file results/<model_type>_<concept_type>_predictions.csv.
model_type= 'BERT' # one of: BERT, clinicalBERT
concept_type = 'embedding' # one of: embedding, predicted_binary

In [None]:
# 'predicted_binary' files are read with their first two columns as a two-level
# index; every other concept file uses only its first column as the index.
if 'predicted_binary' in concept_type:
    concepts = pd.read_csv('data/{}_{}.csv'.format(model_type, concept_type), index_col=[0, 1])
else:
    concepts = pd.read_csv('data/{}_{}.csv'.format(model_type, concept_type), index_col=[0])

In [None]:
# Sanity check: the concept rows must line up with the outcome rows, because the
# cross-validation loop selects rows of both tables with the same boolean masks.
if 'predicted_binary' in concept_type:
    # Two-level index (split type, row id): a direct comparison with the flat
    # outcomes index can never succeed, so each split-type slice is checked on its own.
    index_aligned = all(
        len(concepts.loc[split_name]) == len(outcomes.index)
        and (outcomes.index == concepts.loc[split_name].index).all()
        for split_name in concepts.index.unique(level=0)
    )
else:
    # The length test comes first so a size mismatch fails the assert with the
    # message below instead of raising inside the elementwise comparison.
    index_aligned = (
        len(concepts.index) == len(outcomes.index)
        and (outcomes.index == concepts.index).all()
    )
assert index_aligned, 'Misaligned index may create an issue - How is the embedding obtained?'

### DeepHit Cross-Validation

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Tabular features paired with survival labels.

    Args:
        data: pandas object holding one feature row per sample (read via `.values`).
        t: pandas object with the time of each sample (read via `.values`).
        e: pandas object with the event indicator of each sample (read via `.values`).

    Each item is a dict with:
        "x": tensor of the sample's features,
        "labels": [event, time] as float tensors (event first),
        "label_ids": single-element list holding the positional index.
    """

    def __init__(self, data, t, e):
        self.data = data.values
        # Labels are stored event-first, time-second.
        self.labels = (e.values, t.values)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        events, times = self.labels
        return {
            "x": torch.tensor(self.data[idx]),
            "labels": [
                torch.tensor(events[idx]).float(),
                torch.tensor(times[idx]).float(),
            ],
            "label_ids": [idx],
        }

In [None]:
from model.deephit import DeepHitTorch
from model.training import DeepHitTrainer
from transformers import TrainingArguments

In [None]:
# Trainer configuration for the DeepHit runs: outputs under results/, 100 epochs.
training_args = TrainingArguments(
    num_train_epochs=100,
    output_dir='results/',
)

In [None]:
# Cross-validated DeepHit on the concept features: for every split type and every
# fold, fit on the remaining rows and store the predictions of the held-out rows.
predictions = {}
# Same membership test as the one that picks the index columns when `concepts` is
# loaded, so the per-split slice is taken whenever the two-level index is present.
per_split_concepts = 'predicted_binary' in concept_type
for split_type in split.columns:
    predictions[split_type] = pd.DataFrame(index=outcomes.index, columns=predictions_horizons)
    data = concepts.loc[(split_type,)] if per_split_concepts else concepts
    fold_ids = split[split_type].values
    for fold in split[split_type].dropna().unique():
        # NOTE(review): rows whose fold is missing compare unequal to every fold, so
        # they are always part of `train` and never predicted - confirm this is intended.
        train = fold_ids != fold
        test = fold_ids == fold

        # Create Dataset objects (argument order is data, t, e)
        train_encoded = Dataset(data[train], outcomes.t[train], outcomes.e[train])
        test_encoded = Dataset(data[test], outcomes.t[test], outcomes.e[test])

        # Train a fresh model for this fold
        model = DeepHitTorch(inputdim=data.shape[1], layers=[50, 50, 50], splits=predictions_horizons).double()
        trainer = DeepHitTrainer(model, args=training_args, train_dataset=train_encoded)
        trainer.train()

        # Fill only the held-out rows of this split type's prediction table
        predictions[split_type][test] = trainer.predict(test_encoded)

In [None]:
# Stack the per-split prediction tables into one DataFrame.
# `predictions` is rebound here from a dict of DataFrames to a single DataFrame.
predictions = pd.concat(predictions)

In [None]:
# Persist the predictions under a name built from the model and concept type.
predictions.to_csv(
    'results/{}_{}_predictions.csv'.format(model_type, concept_type)
)

-------------

# Fine - Tuning

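Be careful: running this section overrides objects defined in the previous section (e.g. `Dataset`, `model_type`, `training_args` and `predictions`). Jump to the last cells of this section to save and evaluate.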

In [None]:
# Base model to fine-tune; the suffixed name is what `get_model` dispatches on
# and what the output file results/<model_type>_predictions.csv is named after.
model_type = 'BERT' # one of: BERT, clinicalBERT
model_type += '_finetune' # For naming convention

In [None]:
from transformers import TrainingArguments
from model.training import DeepHitTrainer
import torch

In [None]:
# Stop early when no CUDA device is visible: fine-tuning without a GPU would be very slow.
assert torch.cuda.is_available(), 'Machine or configuration not using GPU - This will be very slow.'

In [None]:
def get_model(embedding):
    """Load the tokenizer and sequence-classification model to fine-tune.

    The classification head has one output per entry of the module-level
    `predictions_horizons` and is configured as multi-label classification.

    Args:
        embedding: 'BERT_finetune' (bert-base-uncased) or
            'clinicalBERT_finetune' (medicalai/ClinicalBERT, DistilBERT classes).

    Returns:
        A `(tokenizer, model)` tuple loaded from the matching pretrained checkpoint.

    Raises:
        ValueError: if `embedding` is not one of the supported names.
    """
    # transformers is imported lazily, and only the classes the chosen branch needs.
    if embedding == 'BERT_finetune':
        from transformers import BertTokenizer, BertForSequenceClassification
        tokenizer_cls, model_cls = BertTokenizer, BertForSequenceClassification
        tokenizer_checkpoint, model_checkpoint = 'bert-base-uncased', "bert-base-uncased"
    elif embedding == 'clinicalBERT_finetune':
        from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
        tokenizer_cls, model_cls = DistilBertTokenizer, DistilBertForSequenceClassification
        tokenizer_checkpoint, model_checkpoint = 'medicalai/ClinicalBERT', "medicalai/ClinicalBERT"
    else:
        # Previously an unknown name fell through to `return` with unbound locals.
        raise ValueError(
            "Unknown embedding {!r}: expected 'BERT_finetune' or 'clinicalBERT_finetune'".format(embedding)
        )

    tokenizer = tokenizer_cls.from_pretrained(tokenizer_checkpoint)
    model = model_cls.from_pretrained(
        model_checkpoint,
        num_labels=len(predictions_horizons),
        output_attentions=False,
        output_hidden_states=False,
        problem_type="multi_label_classification",
    )
    return tokenizer, model

class Dataset(torch.utils.data.Dataset):
    """Tokenised texts paired with survival labels.

    Args:
        encodings: mapping from field name to a per-sample indexable sequence
            (the object returned by the tokenizer call).
        e: pandas object with the event indicator of each sample (read via `.values`).
        t: pandas object with the time of each sample (read via `.values`).

    Note the argument order: events come before times.

    Each item holds one tensor per encoding field plus "labels", the
    [event, time] pair as float tensors.
    """

    def __init__(self, encodings, e, t):
        self.encodings = encodings
        # Labels are stored event-first, time-second.
        self.labels = (e.values, t.values)

    def __len__(self):
        events = self.labels[0]
        return len(events)

    def __getitem__(self, idx):
        item = {}
        for key, val in self.encodings.items():
            item[key] = torch.tensor(val[idx])
        events, times = self.labels
        item['labels'] = [
            torch.tensor(events[idx]).float(),
            torch.tensor(times[idx]).float(),
        ]
        return item

In [None]:
# Trainer configuration for fine-tuning: outputs under results/, 10 epochs.
training_args = TrainingArguments(
    num_train_epochs=10,
    output_dir='results/',
)

In [None]:
# Cross-validated fine-tuning: for every split type and every fold, fine-tune a
# freshly loaded model on the remaining texts and predict the held-out texts.
predictions = {}
for split_type in split.columns:
    predictions[split_type] = pd.DataFrame(index=outcomes.index, columns=predictions_horizons)
    for fold in split[split_type].dropna().unique():
        train = split[split_type].values != fold
        test = split[split_type].values == fold

        # Load model and encode data (reloaded each fold so no weights carry over)
        tokenizer, model = get_model(model_type)
        model.splits = predictions_horizons
        # Dataset is declared as (encodings, e, t). Events and times are passed by
        # keyword because the earlier positional call handed them over swapped.
        train_encoded = Dataset(
            tokenizer(outcomes[train].text.tolist(), truncation=True, padding=True),
            e=outcomes.e[train],
            t=outcomes.t[train],
        )
        test_encoded = Dataset(
            tokenizer(outcomes[test].text.tolist(), truncation=True, padding=True),
            e=outcomes.e[test],
            t=outcomes.t[test],
        )

        # Train model
        trainer = DeepHitTrainer(model=model, args=training_args, train_dataset=train_encoded)
        trainer.train()

        # Predict: fill only the held-out rows of this split type's table
        predictions[split_type][test] = trainer.predict(test_encoded)

In [None]:
# Stack the per-split prediction tables into one DataFrame.
# `predictions` is rebound here from a dict of DataFrames to a single DataFrame.
predictions = pd.concat(predictions)

In [None]:
# Persist the fine-tuned model's predictions under a name built from model_type.
predictions.to_csv(
    'results/{}_predictions.csv'.format(model_type)
)

-----------

# Prompting

In [None]:
def get_pipeline(llm):
    """Build the Hugging Face text-generation pipeline for the requested LLM.

    Args:
        llm: 'llama' (NousResearch/Llama-2-7b-chat-hf, tokenizer and model loaded
            explicitly) or 'medalpaca' (medalpaca/medalpaca-7b, loaded by name with
            max_length=1000).

    Returns:
        The text-generation pipeline for the selected model.

    Raises:
        ValueError: if `llm` is not one of the supported names (previously the
            function returned None, which only failed later at call time).
    """
    if llm not in ('llama', 'medalpaca'):
        raise ValueError("Unknown llm {!r}: expected 'llama' or 'medalpaca'".format(llm))

    # Imported locally under an alias so this function keeps working even if the
    # module-level name `pipeline` gets rebound to a pipeline instance.
    from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline as build_pipeline

    if llm == 'llama':
        model_name = "NousResearch/Llama-2-7b-chat-hf"
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForCausalLM.from_pretrained(model_name)
        return build_pipeline("text-generation", model=model, tokenizer=tokenizer)

    model = "medalpaca/medalpaca-7b"
    return build_pipeline("text-generation", model=model, tokenizer=model, max_length=1000)

# Prompt an LLM for each patient's death probability at every horizon and store
# the answers in `predictions` (rows: patients, columns: horizons).
assert torch.cuda.is_available(), 'Machine or configuration not using GPU.'

llm = 'medalpaca'
# Kept under its own name so the `pipeline` factory imported from transformers is
# not overwritten, which would break a second run of this cell.
text_generator = get_pipeline(llm)

# Same horizons as the other sections, so results stay comparable.
predictions = pd.DataFrame(index=outcomes.index, columns=predictions_horizons)

for patient in predictions.index:
    patient_text = outcomes.text[patient]
    # The report context is identical for every horizon: build it once per patient.
    context = f"Pathology Report:\n{patient_text}"
    for horizon in predictions.columns:
        question = f"Based on the provided pathology report, what is the estimated probability (between 0 and 1) that the patient will die within the next {horizon} years? Please provide your answer as a single decimal number rounded to two decimal places, without any additional text or explanations."

        if llm == 'llama':
            prompt = f"""<human>: Context: {context}Question: {question}Respond with only a float between 0 and 1, without any additional text.<assistant>:"""
            inputs = text_generator.tokenizer(prompt, add_special_tokens=False, return_tensors=text_generator.framework)
            # NOTE(review): with do_sample=False the temperature is presumably unused - confirm.
            response = text_generator.model.generate(**inputs, max_length=1000, temperature=0.3, do_sample=False)[0]
            response_text = text_generator.tokenizer.decode(response, skip_special_tokens=True)
            # Keep only what follows the last assistant marker, then the first decimal number in it.
            response = response_text.split('<assistant>:')[-1].strip()
            match = re.search(r'\d+\.\d+', response)
            if match:
                response = match.group()
            else:
                # The raw answer text is stored unchanged when no decimal number is found.
                print("No numeric value found.")

        elif llm == 'medalpaca':
            # The text after the last 'Answer:' is stored as-is (no numeric parsing here).
            response = text_generator(f"Context: {context}\n\nQuestion: {question}\n\nAnswer:")[0]['generated_text'].split('Answer:')[-1].strip()

        predictions.loc[patient, horizon] = response

In [None]:
# Persist the prompted predictions under a name built from the LLM identifier.
predictions.to_csv(
    'results/{}_predictions.csv'.format(llm)
)

Rerun these files with all the models you would like to use before evaluating using the `LLM.Analysis.ipynb`