In [1]:
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from transformers import LEDForConditionalGeneration
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, AutoTokenizer
import torch
from torch.nn import CrossEntropyLoss,DataParallel
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
from tqdm.notebook import tqdm
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments

In [2]:
RANDOM_SEED = 42
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [3]:
learning_rate = 1e-5
epochs = 10

In [4]:
def load():
    tokenizer_save_path = "allenai/led-base-16384"
    model_save_path = "allenai/led-base-16384"
    
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_save_path)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_save_path)
    return tokenizer,model

In [5]:
tokenizer,model= load()

tokenizer_config.json:   0%|          | 0.00/27.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.09k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/648M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

In [6]:
DOC_SEP_ = "<doc-sep>"
docsep_token_id = tokenizer.convert_tokens_to_ids(DOC_SEP_)

In [7]:
%pip install evaluate
%pip install rouge-score
%pip install bert_score

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3
Note: you may need to restart the kernel to use updated packages.
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=20737ceded97a647c8b3c5a51c5defda8d26ba1bcf487bf19d1e5d59cf70a570
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score


In [8]:
import evaluate

rouge = evaluate.load('rouge')
bertscore = evaluate.load('bertscore')

def compute_metrics(pred):
    labels_ids = pred.label_ids
    pred_ids = pred.predictions
    pred_ids[pred_ids == -100] = tokenizer.pad_token_id
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    labels_ids[labels_ids == -100] = tokenizer.pad_token_id
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    rouge_output = rouge.compute(
        predictions=pred_str, references=label_str
    )
    
    bertscore_output = bertscore.compute(
        predictions=pred_str, references=label_str, lang='en', 
    )
    
    bertscore_output = {a:sum(x)/len(x) for a,x in bertscore_output.items() if a in ['precision', 'recall', 'f1']}
    
    final_output = {**rouge_output, **bertscore_output}


    return final_output

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

In [9]:
class Medical_Dataset(Dataset):
    def __init__(self,tokenizer:AutoTokenizer,train_data,train_label):
        self.data = train_data
        self.label = train_label
        self.tokenizer = tokenizer
        
    def __len__(self):
        return self.label.shape[0]
    
    def __getitem__(self,id):
        sentence = self.data.at[id,'Abstracts']
        target = self.label.at[id,'Target']
        encoding = self.tokenizer(sentence, return_tensors='pt', truncation=True, max_length=4096)
        target_encoding = self.tokenizer(target, return_tensors='pt', truncation=True, max_length=1024)
        global_attention_mask = [[1 if y in [tokenizer.cls_token_id, docsep_token_id] else 0 for y in x]
                                                 for x in encoding['input_ids']]
        return {
            'input_ids': encoding['input_ids'].squeeze(0), # Squeeze to remove the extra dimension
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': target_encoding['input_ids'].squeeze(0),
            'global_attention_mask': torch.tensor(np.array(global_attention_mask)).squeeze(0),
        }

In [10]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)

In [11]:
cochrane_train_input = pd.read_csv("/kaggle/input/mslr2022/mslr_data/cochrane/train-inputs.csv")
cochrane_train_input["Abstract"].fillna("",inplace = True)
cochrane_train_input = cochrane_train_input.groupby('ReviewID').apply(lambda group:
    "".join([f"{row['Title']}{DOC_SEP_}{row['Abstract']}{DOC_SEP_}" for index, row in group.iterrows()])
).reset_index(name="Abstracts")
cochrane_train_label = pd.read_csv("/kaggle/input/mslr2022/mslr_data/cochrane/train-targets.csv")

cochrane_train_input.sort_values(by='ReviewID', inplace=True)
cochrane_train_input.reset_index(drop=True, inplace=True)

cochrane_train_label.drop_duplicates(subset=['ReviewID'], keep='first', inplace=True)
cochrane_train_label.sort_values(by='ReviewID', inplace=True)
cochrane_train_label.reset_index(drop=True, inplace=True)

# cochrane_train_input = cochrane_train_input.iloc[0:2,:]
# cochrane_train_label = cochrane_train_label.iloc[0:2,:]

train_dataset = Medical_Dataset(tokenizer,cochrane_train_input,cochrane_train_label)


cochrane_dev_input = pd.read_csv("/kaggle/input/mslr2022/mslr_data/cochrane/dev-inputs.csv")
cochrane_dev_input["Abstract"].fillna("",inplace = True)
cochrane_dev_input = cochrane_dev_input.groupby('ReviewID').apply(lambda group:
    "".join([f"{row['Title']}{DOC_SEP_}{row['Abstract']}{DOC_SEP_}" for index, row in group.iterrows()])
).reset_index(name="Abstracts")
cochrane_dev_label = pd.read_csv("/kaggle/input/mslr2022/mslr_data/cochrane/dev-targets.csv")

cochrane_dev_input.sort_values(by='ReviewID', inplace=True)
cochrane_dev_input.reset_index(drop=True, inplace=True)

cochrane_dev_label.drop_duplicates(subset=['ReviewID'], keep='first', inplace=True)
cochrane_dev_label.sort_values(by='ReviewID', inplace=True)
cochrane_dev_label.reset_index(drop=True, inplace=True)

# cochrane_dev_input = cochrane_dev_input.iloc[0:2,:]
# cochrane_dev_label = cochrane_dev_label.iloc[0:2,:]

valid_dataset = Medical_Dataset(tokenizer,cochrane_dev_input,cochrane_dev_label)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  cochrane_train_input["Abstract"].fillna("",inplace = True)
  cochrane_train_input = cochrane_train_input.groupby('ReviewID').apply(lambda group:
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  cochrane_dev_input["Abstract"].fillna("",inplace = True)
  cochrane_dev_input = cochran

In [12]:
training_args = Seq2SeqTrainingArguments(
    output_dir='./results',      
    num_train_epochs=epochs,            
    per_device_train_batch_size=1, 
    per_device_eval_batch_size=2,  
    warmup_steps=500,              
    weight_decay=0.01,               
    logging_dir='./logs',            
    logging_steps=1000,
    save_steps=1000,
    eval_steps=1000,
    evaluation_strategy="steps",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    predict_with_generate=True,
    learning_rate=learning_rate,
    report_to=[],
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    data_collator=data_collator
    )

  trainer = Seq2SeqTrainer(


In [13]:
trainer.train()

  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)
Input ids are automatically padded from 1134 to 2048 to be a multiple of `config.attention_window`: 1024


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Precision,Recall,F1
1000,2.9884,2.767788,0.194913,0.048368,0.150414,0.151579,0.888284,0.83967,0.863116
2000,2.7186,2.68429,0.194397,0.048051,0.149921,0.151134,0.885977,0.838755,0.861535
3000,2.5612,2.643972,0.185898,0.046407,0.142028,0.142901,0.884589,0.839638,0.861341
4000,2.4831,2.632432,0.184957,0.045812,0.141834,0.14312,0.882279,0.838402,0.859597
5000,2.3804,2.61231,0.193357,0.047986,0.148707,0.149529,0.888338,0.839547,0.863084
6000,2.3105,2.615707,0.197062,0.049129,0.150948,0.152199,0.890291,0.840809,0.864664
7000,2.2508,2.605395,0.195306,0.047542,0.14885,0.1499,0.88612,0.839175,0.861823
8000,2.1928,2.61867,0.191812,0.048008,0.147364,0.148298,0.887919,0.839918,0.863068
9000,2.1278,2.608866,0.181938,0.046835,0.141175,0.142304,0.88274,0.838742,0.859982
10000,2.0734,2.621601,0.195721,0.049419,0.150306,0.151601,0.887721,0.84007,0.86307


Input ids are automatically padded from 2638 to 3072 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 2638 to 3072 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 1523 to 2048 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 3794 to 4096 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 1966 to 2048 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 3963 to 4096 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 1533 to 2048 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 946 to 1024 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 1245 to 2048 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 1745 to 2048 to 

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Input ids are automatically padded from 1224 to 2048 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 2878 to 3072 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 2878 to 3072 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 3654 to 4096 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 3654 to 4096 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 991 to 1024 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 1114 to 2048 to be a multiple of `confi

TrainOutput(global_step=18760, training_loss=2.180251496077092, metrics={'train_runtime': 37090.9901, 'train_samples_per_second': 1.012, 'train_steps_per_second': 0.506, 'total_flos': 7.842311433274982e+16, 'train_loss': 2.180251496077092, 'epoch': 10.0})

In [14]:
trainer.save_model("my_final_centrum")