In [3]:
import os

if os.path.basename(os.getcwd()) != 'HUST-NLP-Medical-MultiDocument-Summarization-':
    %cd ../../

e:\pyenv\GTCC\KPG-RL\HUST-NLP-Medical-MultiDocument-Summarization-


In [4]:
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from torch.utils.data import Dataset
from transformers import LEDForConditionalGeneration
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, AutoTokenizer
import torch
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
from tqdm.notebook import tqdm

In [5]:
RANDOM_SEED = 42
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [6]:
PATH = 'ratishsp/Centrum'
tokenizer = AutoTokenizer.from_pretrained(PATH)

In [5]:
model = AutoModelForSeq2SeqLM.from_pretrained(PATH)

In [7]:
DOC_SEP_ = "<doc-sep>"
docsep_token_id = tokenizer.convert_tokens_to_ids(DOC_SEP_)

In [16]:
import evaluate

rouge = evaluate.load('rouge')
bertscore = evaluate.load('bertscore')

def compute_metrics(pred):
    labels_ids = pred.label_ids
    pred_ids = pred.predictions
    pred_ids[pred_ids == -100] = tokenizer.pad_token_id
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    labels_ids[labels_ids == -100] = tokenizer.pad_token_id
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    rouge_output = rouge.compute(
        predictions=pred_str, references=label_str
    )
    
    bertscore_output = bertscore.compute(
        predictions=pred_str, references=label_str, lang='en', 
    )
    
    bertscore_output = {a:sum(x)/len(x) for a,x in bertscore_output.items() if a in ['precision', 'recall', 'f1']}
    
    final_output = {**rouge_output, **bertscore_output}


    return final_output

In [8]:
class Medical_Dataset(Dataset):
    def __init__(self,tokenizer:AutoTokenizer,train_data,train_label):
        self.data = train_data
        self.label = train_label
        self.tokenizer = tokenizer
        
    def __len__(self):
        return self.label.shape[0]
    
    def __getitem__(self,id):
        sentence = self.data.at[id,'Abstracts']
        target = self.label.at[id,'Target']
        encoding = self.tokenizer(sentence, return_tensors='pt', padding='max_length', truncation=True, max_length=4096)
        target_encoding = self.tokenizer(target, return_tensors='pt', padding='max_length', truncation=True, max_length=1024)
        global_attention_mask = [[1 if y in [tokenizer.cls_token_id, docsep_token_id] else 0 for y in x]
                                                 for x in encoding['input_ids']]
        return {
            'input_ids': encoding['input_ids'].squeeze(0), # Squeeze to remove the extra dimension
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': target_encoding['input_ids'].squeeze(0),
            'global_attention_mask': torch.tensor(global_attention_mask).squeeze(0),
        }

In [9]:
ms2_train_input = pd.read_csv("datasets/mslr_data/ms2/train-inputs.csv")
ms2_train_input["Abstract"].fillna("",inplace = True)
ms2_train_input = ms2_train_input.groupby('ReviewID').apply(lambda group: f"{DOC_SEP_}".join(group['Abstract'].tolist()) + f"{DOC_SEP_}").reset_index(name = "Abstracts")
ms2_train_target = pd.read_csv("datasets/mslr_data/ms2/train-targets.csv")
cochrane_train_input = pd.read_csv("datasets/mslr_data/cochrane/train-inputs.csv")
cochrane_train_input["Abstract"].fillna("",inplace = True)
cochrane_train_input = cochrane_train_input.groupby('ReviewID').apply(lambda group: f"{DOC_SEP_}".join(group['Abstract'].tolist()) + f"{DOC_SEP_}").reset_index(name = "Abstracts")
cochrane_train_target = pd.read_csv("datasets/mslr_data/cochrane/train-targets.csv")
k = pd.concat([ms2_train_input,cochrane_train_input])
k = k.iloc[0:2,:]
del ms2_train_input,cochrane_train_input
kk = pd.concat([ms2_train_target,cochrane_train_target])
kk = kk.iloc[0:2,:]
del ms2_train_target,cochrane_train_target
train_dataset = Medical_Dataset(tokenizer,k,kk)

ms2_dev_input = pd.read_csv("datasets/mslr_data/ms2/dev-inputs.csv")
ms2_dev_input["Abstract"].fillna("",inplace = True)
ms2_dev_input = ms2_dev_input.groupby('ReviewID').apply(lambda group: f"{DOC_SEP_}".join(group['Abstract'].tolist()) + f"{DOC_SEP_}").reset_index(name = "Abstracts")
ms2_dev_target = pd.read_csv("datasets/mslr_data/ms2/dev-targets.csv")
cochrane_dev_input = pd.read_csv("datasets/mslr_data/cochrane/dev-inputs.csv")
cochrane_dev_input["Abstract"].fillna("",inplace = True)
cochrane_dev_input = cochrane_dev_input.groupby('ReviewID').apply(lambda group: f"{DOC_SEP_}".join(group['Abstract'].tolist()) + f"{DOC_SEP_}").reset_index(name = "Abstracts")
cochrane_dev_target = pd.read_csv("datasets/mslr_data/cochrane/dev-targets.csv")
kkk = pd.concat([ms2_dev_input,cochrane_dev_input])
kkk = kkk.iloc[0:2,:]
del ms2_dev_input,cochrane_dev_input
kkkk = pd.concat([ms2_dev_target,cochrane_dev_target])
kkkk = kkkk.iloc[0:2,:]
del ms2_dev_target,cochrane_dev_target
test_dataset = Medical_Dataset(tokenizer,kkk,kkkk)

In [10]:
len(train_dataset), len(test_dataset)

(2, 2)

In [14]:
pred = train_dataset[0]['input_ids']
ref = train_dataset[0]['labels']

In [10]:
import wandb

In [None]:
learning_rate = 1e-5
epochs = 5

In [None]:
wandb.init()
#api_key = "b837839166bd4f97a07e90a26fa965ee17f8b64f"

In [13]:
training_args = Seq2SeqTrainingArguments(
    output_dir='./results',      
    num_train_epochs=epochs,            
    per_device_train_batch_size=1, 
    per_device_eval_batch_size=1,  
    warmup_steps=500,              
    weight_decay=0.01,               
    logging_dir='./logs',            
    logging_steps=10,
    save_steps=10,
    eval_steps=10,
    evaluation_strategy="steps",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    report_to = "wandb",
    predict_with_generate=True,
    learning_rate=learning_rate,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    
    )

  trainer = Seq2SeqTrainer(


In [14]:
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33msylvis[0m ([33msylvis-hanoi-university-of-science-and-technology[0m). Use [1m`wandb login --relogin`[0m to force relogin


0it [00:00, ?it/s]

{'train_runtime': 2.0619, 'train_samples_per_second': 0.0, 'train_steps_per_second': 0.0, 'train_loss': 0.0, 'epoch': 0}


TrainOutput(global_step=0, training_loss=0.0, metrics={'train_runtime': 2.0619, 'train_samples_per_second': 0.0, 'train_steps_per_second': 0.0, 'total_flos': 0, 'train_loss': 0.0, 'epoch': 0})

In [15]:
trainer.evaluate()

  0%|          | 0/2 [00:00<?, ?it/s]

{'eval_loss': 10.764432907104492,
 'eval_rouge1': 0.15179738562091505,
 'eval_rouge2': 0.02857142857142857,
 'eval_rougeL': 0.1400326797385621,
 'eval_rougeLsum': 0.15179738562091505,
 'eval_runtime': 1.8834,
 'eval_samples_per_second': 1.062,
 'eval_steps_per_second': 1.062,
 'epoch': 0}

In [16]:
class Test_Medical_Dataset(Dataset):
    def __init__(self,tokenizer:AutoTokenizer,test_data):
        self.data = test_data
        self.tokenizer = tokenizer
        
    def __len__(self):
        return self.data.shape[0]
    
    def __getitem__(self,id):
        sentence = self.data.at[id,'Abstracts']
        encoding = self.tokenizer(sentence, return_tensors='pt', padding='max_length', truncation=True, max_length=4096)
        global_attention_mask = [[1 if y in [tokenizer.cls_token_id, docsep_token_id] else 0 for y in x]
                                                 for x in encoding['input_ids']]
        return {
            'input_ids': encoding['input_ids'].squeeze(0), # Squeeze to remove the extra dimension
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'global_attention_mask': torch.tensor(global_attention_mask).squeeze(0),
        }

In [20]:
ms2_test_input = pd.read_csv("datasets/mslr_data/ms2/test-inputs.csv")
ms2_test_input['Abstract'].fillna("",inplace = True)
ms2_test_input = ms2_test_input.groupby('ReviewID').apply(lambda group: f"{DOC_SEP_}".join(group['Abstract'].tolist()) + f"{DOC_SEP_}").reset_index(name = "Abstracts")
test_dataset = Test_Medical_Dataset(tokenizer,ms2_test_input)
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=1, shuffle=False)

{'input_ids': tensor([    0,   250,   910,  ..., 15981,   479,     2]),
 'attention_mask': tensor([1, 1, 1,  ..., 1, 1, 1]),
 'global_attention_mask': tensor([1, 0, 0,  ..., 0, 0, 0])}

In [None]:
res = []

In [21]:
for k in test_dataloader:
    gen = model.generate(input_ids=k['input_ids'].unsqueeze(0).to(device),attention_mask=k['attention_mask'].unsqueeze(0).to(device),global_attention_mask=k['global_attention_mask'].unsqueeze(0).to(device),max_length=1024, num_beams=4, early_stopping=True)
    generated_sentence = tokenizer.decode(gen[0], skip_special_tokens=True)
    res.append(generated_sentence)

In [16]:
# trainer.save_model("./finetuned-electra")
# tokenizer.save_pretrained("./finetuned-electra")