In [25]:
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from torch.utils.data import Dataset
from transformers import LEDForConditionalGeneration,PegasusXForConditionalGeneration
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, AutoTokenizer
import torch
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
from tqdm.notebook import tqdm

# Fixed seed constant. NOTE(review): not referenced in the cells visible here.
RANDOM_SEED = 42

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [26]:
# Checkpoint names this notebook knows about; `chosen` is an index into the list
# and `model_name` is the entry it selects.
models = [
    "longformer",
    "centrum",
    "primera",
    "pegasus",
]
chosen = 1  # 1 -> "centrum"
model_name = models[chosen]

In [27]:
import os
# CUDA_LAUNCH_BLOCKING=1 asks the CUDA runtime to launch kernels synchronously,
# so a device-side error is reported at the call that caused it.
os.environ.update({'CUDA_LAUNCH_BLOCKING': "1"})

In [28]:
class Test_Medical_Dataset(Dataset):
    """Evaluation dataset yielding one tokenized review (concatenated abstracts) per item.

    Besides ``input_ids`` and ``attention_mask``, each item carries a
    ``global_attention_mask`` that is 1 on the tokenizer's CLS token and on
    every document-separator token, and 0 everywhere else.

    Args:
        tokenizer: Callable tokenizer that also exposes ``cls_token_id``.
        test_data: DataFrame with ``Abstracts`` and ``ReviewID`` columns.
        test_label: DataFrame with a ``Target`` column whose rows are in the
            same order as ``test_data`` (rows are paired by position).
        docsep: Token id of the document separator.
        max_length: Length every sample is padded/truncated to (default 4096).
    """

    def __init__(self, tokenizer: "AutoTokenizer", test_data, test_label, docsep, max_length: int = 4096):
        self.data = test_data
        self.label = test_label
        self.tokenizer = tokenizer
        self.docsep_token_id = docsep
        self.max_length = max_length

    def __len__(self):
        """Number of reviews, i.e. rows of the input frame."""
        return self.data.shape[0]

    def __getitem__(self, id):
        """Return the encoded sample at row position ``id``.

        Returns:
            dict with 1-D tensors ``input_ids``, ``attention_mask`` and
            ``global_attention_mask``, plus the raw ``labels`` (target text),
            ``abstracts`` (input text) and ``review_id``.

        Raises:
            IndexError: if ``id`` is outside ``range(len(self))``.
        """
        # Positional lookup: does not depend on the frames carrying a 0..n-1
        # index, and running past the end raises IndexError (not KeyError).
        sentence = self.data['Abstracts'].iloc[id]
        target = self.label['Target'].iloc[id]
        review_id = self.data['ReviewID'].iloc[id]

        encoding = self.tokenizer(
            sentence,
            return_tensors='pt',
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
        )
        input_ids = encoding['input_ids']

        # Mark CLS and separator positions with vectorised comparisons instead
        # of testing every token in a Python loop; the result stays int64.
        global_attention_mask = torch.zeros_like(input_ids, dtype=torch.long)
        for token_id in (self.tokenizer.cls_token_id, self.docsep_token_id):
            if token_id is not None:
                global_attention_mask[input_ids == token_id] = 1

        return {
            'input_ids': input_ids.squeeze(0),  # drop the batch dimension of size 1
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': target,
            'global_attention_mask': global_attention_mask.squeeze(0),
            'abstracts': sentence,
            'review_id': review_id
        }
        
class Test_Medical_Dataset_2(Dataset):
    """Evaluation dataset yielding one tokenized review per item, without a global-attention mask.

    Args:
        tokenizer: Callable tokenizer.
        test_data: DataFrame with ``Abstracts`` and ``ReviewID`` columns.
        test_label: DataFrame with a ``Target`` column whose rows are in the
            same order as ``test_data`` (rows are paired by position).
        max_length: Length every sample is padded/truncated to (default 4096).
    """

    def __init__(self, tokenizer: "AutoTokenizer", test_data, test_label, max_length: int = 4096):
        self.data = test_data
        self.label = test_label
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of reviews, i.e. rows of the input frame."""
        return self.data.shape[0]

    def __getitem__(self, id):
        """Return the encoded sample at row position ``id``.

        Returns:
            dict with 1-D tensors ``input_ids`` and ``attention_mask``, plus
            the raw ``labels`` (target text), ``abstracts`` (input text) and
            ``review_id``.

        Raises:
            IndexError: if ``id`` is outside ``range(len(self))``.
        """
        # Positional lookup: does not depend on the frames carrying a 0..n-1
        # index, and running past the end raises IndexError (not KeyError).
        sentence = self.data['Abstracts'].iloc[id]
        target = self.label['Target'].iloc[id]
        review_id = self.data['ReviewID'].iloc[id]

        encoding = self.tokenizer(
            sentence,
            return_tensors='pt',
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(0),  # drop the batch dimension of size 1
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': target,
            'abstracts': sentence,
            'review_id': review_id
        }
        


In [29]:
def _load_cochrane_dev(format_study):
    """Load the Cochrane dev split with one row per review.

    Args:
        format_study: Callable mapping one input row (with ``Title`` and
            ``Abstract``) to the text that study contributes to its review.

    Returns:
        ``(inputs, targets)`` DataFrames, both sorted by ``ReviewID`` with a
        fresh 0..n-1 index. ``inputs`` has the columns ``ReviewID`` and
        ``Abstracts`` (the concatenated per-study texts of the review).

    Raises:
        ValueError: if the two files do not list the same reviews; the
            datasets pair inputs and targets by row position, so a mismatch
            would silently score summaries against the wrong reference.
    """
    studies = pd.read_csv("datasets/mslr_data/cochrane/dev-inputs.csv")
    # Assign the result back: `df[col].fillna(..., inplace=True)` is a chained
    # in-place call and does not update `df` under pandas Copy-on-Write.
    studies['Abstract'] = studies['Abstract'].fillna("")
    inputs = (
        # Select the columns the formatter reads so the grouping key is not
        # passed into `apply`.
        studies.groupby('ReviewID')[['Title', 'Abstract']]
        .apply(lambda group: "".join(format_study(row) for _, row in group.iterrows()))
        .reset_index(name="Abstracts")
        .sort_values(by='ReviewID')
        .reset_index(drop=True)
    )
    targets = (
        pd.read_csv("datasets/mslr_data/cochrane/dev-targets.csv")
        .sort_values(by='ReviewID')
        .reset_index(drop=True)
    )
    if inputs['ReviewID'].tolist() != targets['ReviewID'].tolist():
        raise ValueError("dev-inputs.csv and dev-targets.csv do not contain the same ReviewIDs")
    return inputs, targets


PATH = f'./model/{model_name}'
tokenizer = AutoTokenizer.from_pretrained(PATH)

if model_name != "pegasus":
    model = AutoModelForSeq2SeqLM.from_pretrained(PATH)
    DOC_SEP_ = "<doc-sep>"
    docsep_token_id = tokenizer.convert_tokens_to_ids(DOC_SEP_)
    # Title and abstract are each followed by the separator token.
    cochrane_dev_input, cochrane_dev_target = _load_cochrane_dev(
        lambda row: f"{row['Title']}{DOC_SEP_}{row['Abstract']}{DOC_SEP_}"
    )
    test_dataset = Test_Medical_Dataset(tokenizer, cochrane_dev_input, cochrane_dev_target, docsep_token_id)
else:
    model = PegasusXForConditionalGeneration.from_pretrained(PATH)
    DOC_SEP_ = '<SEP>'
    # Title and abstract are joined by a newline; the separator closes each study.
    cochrane_dev_input, cochrane_dev_target = _load_cochrane_dev(
        lambda row: f"{row['Title']}\n{row['Abstract']}{DOC_SEP_}"
    )
    # Test_Medical_Dataset needs a docsep id and raised TypeError when called
    # with three arguments; Test_Medical_Dataset_2 takes exactly these three.
    test_dataset = Test_Medical_Dataset_2(tokenizer, cochrane_dev_input, cochrane_dev_target)

# Assigning the result keeps the cell from echoing the model repr as its output.
model = model.to(device)

In [30]:
# Pull one sample and print its documents one by one. Split on DOC_SEP_ (the
# separator the inputs were built with) rather than a hard-coded "<doc-sep>",
# which is not the separator used when model_name is "pegasus".
k = test_dataset[13]
for sentence in k["abstracts"].split(DOC_SEP_):
    print(sentence+"\n")

A double-blind controlled trial of etretinate (Tigason) and ibuprofen in psoriatic arthritis.

Etretinate (Tigason) and ibuprofen have been compared in a double-blind controlled trial in psoriatic arthritis to see if we could confirm a specific action for this vitamin A derivative suggested from earlier uncontrolled studies. Eleven out of 20 patients completed 24 weeks of therapy with etretinate (up to 0.5 mg/kg/day) whereas only 1/20 patients completed 24 weeks of therapy with ibuprofen alone. Etretinate improved skin lesions, and this may have encouraged patients to persist with it. Improvement of statistical significance was seen for articular index in both groups. In addition significant improvement in ESR, haemoglobin, C-reactive protein, and histidine occurred in the etretinate group. The main side effects of etretinate (which may preclude its use at a higher dose in this condition) included cracked and dried lips and sore mouth.

Therapeutic value of colchicine in the treatment 

In [31]:
# Show the reference text stored under "labels" for the sample picked above.
reference_summary = k["labels"]
print("Groundtruth: " + reference_summary)

Groundtruth:Parenteral high dose methotrexate and sulfasalazine are the only two agents with well demonstrated published efficacy in psoriatic arthritis. The magnitude of the effect seen with azathioprine, etretinate, oral low dose methotrexate and perhaps colchicine suggests that they may be effective but that further multicentre clinical trials are required to establish their efficacy. Furthermore, the magnitude of the improvement observed in the placebo group strongly suggests that uncontrolled trials should not be used to guide management decisions in this condition.


In [32]:
# Tensors fed to generate(); unsqueeze(0) adds the batch dimension of size 1.
generation_inputs = {
    'input_ids': k['input_ids'].unsqueeze(0).to(device),
    'attention_mask': k['attention_mask'].unsqueeze(0).to(device),
}
# Every model except "pegasus" additionally receives the global attention mask.
if model_name != "pegasus":
    generation_inputs['global_attention_mask'] = k['global_attention_mask'].unsqueeze(0).to(device)

gen = model.generate(
    **generation_inputs,
    max_length=1024,
    num_beams=4,
    repetition_penalty=1.15,
    no_repeat_ngram_size=4,
)
generated_sentence = tokenizer.decode(gen[0], skip_special_tokens=True)
print("Generated result: " + generated_sentence)

Generated result:Sulphasalazine appears to be an effective treatment for psoriatic arthritis.
