In [None]:
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import torch
import transformers
import shap
import pickle

# Load the fine-tuned sequence classifier and its tokenizer from the Hugging Face Hub.
# Use the first GPU when CUDA is available; otherwise fall back to CPU so the
# notebook still runs (slowly) on a machine without a GPU. A device index of -1
# is the pipeline's CPU setting.
pipeline_device = 0 if torch.cuda.is_available() else -1

tokenizer = transformers.AutoTokenizer.from_pretrained("NikolajMunch/test", use_fast=True)
model = transformers.AutoModelForSequenceClassification.from_pretrained("NikolajMunch/test")
if pipeline_device >= 0:
    model = model.cuda()

# Build a pipeline object to do predictions. return_all_scores=True makes the
# pipeline report a score for every label, not only the top one, which is what
# the SHAP explainer needs in order to attribute each output class.
pred = transformers.pipeline(
    "text-classification",
    model=model,
    device=pipeline_device,
    tokenizer=tokenizer,
    return_all_scores=True,
)

# Read the dataset and keep only the free-text column and its disease label.
df = pd.read_csv("copd_heart_failure_df_no2.csv")[['reason_clean', 'disease_label']]

# Reproducible 90/10 split: sample 90% of the rows for training (fixed seed),
# and use every row that was not sampled as the test set.
train_df = df.sample(frac=0.9, random_state=42)
test_df = df.drop(index=train_df.index)

# Give both splits the same generic column names used by the rest of the notebook.
train_df.columns = test_df.columns = ["text", "labels"]

# Text masker used to hide parts of the input while SHAP probes the model;
# collapse_mask_token=True is passed so that a run of masked tokens is
# replaced by a single mask token (per the SHAP masker documentation).
text_masker = shap.maskers.Text(collapse_mask_token=True)

# Explainer that attributes the pipeline's per-label scores to input tokens.
explainer = shap.Explainer(pred, masker=text_masker)

# Ensure enough words for token masking: keep only test texts with more than
# three whitespace-separated words. Non-string cells (e.g. NaN coming from an
# empty CSV field) are treated as having no words and are filtered out, instead
# of raising AttributeError when .split() is called on them.
has_enough_words = test_df['text'].apply(lambda x: isinstance(x, str) and len(x.split()) > 3)
test_df2 = test_df[has_enough_words]



In [None]:
# Compute SHAP values for every remaining test text (one explanation per row).
# This is the expensive step of the notebook.
texts_to_explain = test_df2['text']
shap_values = explainer(texts_to_explain)

In [None]:
# Persist the SHAP values for all pre-exam texts in the test set so the
# plots below can be regenerated without recomputing the explanations.
with open('shap_values.pkl', 'wb') as pickle_file:
    pickle.dump(shap_values, pickle_file)

In [None]:
# Most important words towards label 1 / hf.
# Average the LABEL_1 attributions over all explained texts (axis 0), then show
# the 20 largest bars, ordered from highest to lowest mean SHAP value.
label_1_mean_shap = shap_values[:, :, "LABEL_1"].mean(0)
shap.plots.bar(
    label_1_mean_shap,
    order=shap.Explanation.argsort.flip,
    max_display=20,
)

In [None]:
# Most important words towards label 0 / copd.
# Slice the LABEL_0 output (not LABEL_1) so this plot shows the attributions
# for the other class; values are averaged over all explained texts (axis 0)
# and ordered from highest to lowest mean SHAP value.
shap.plots.bar(shap_values[:,:,"LABEL_0"].mean(0), order=shap.Explanation.argsort.flip)

In [None]:
# Plot individual sentences; this is done for a few example sentences as shown
# in the paper, Figure 4.
# The explainer iterates over its input row by row, so the sentence must be
# wrapped in a list: a bare string would be walked character by character and
# each character explained as a separate sample.
shap_values_sent = explainer(["Insert example sentence here"])

# Render the token-level attributions for the example sentence.
shap.plots.text(shap_values_sent)