In [1]:
import numpy as np 
import pandas as pd
import os
import torch
from datasets import load_dataset
import pandas as pd
import matplotlib.pyplot as plt
from transformers import AutoTokenizer
from torch.utils.data import DataLoader, Dataset
from transformers import LEDConfig,LEDForConditionalGeneration
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
from nltk import sent_tokenize
from torch.nn import DataParallel
from sklearn.model_selection import train_test_split
import ast
import random

# Walk the Kaggle input mount and echo the full path of every file found there.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# Base directory that all dataset / tokenizer paths below are built from.
input_dir = "/kaggle/input/"
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/pretrain-ms2/pretrained_data.csv
/kaggle/input/tokenizer/primera-tokenizer/merges.txt
/kaggle/input/tokenizer/primera-tokenizer/tokenizer.json
/kaggle/input/tokenizer/primera-tokenizer/vocab.json
/kaggle/input/tokenizer/primera-tokenizer/tokenizer_config.json
/kaggle/input/tokenizer/primera-tokenizer/special_tokens_map.json
/kaggle/input/primera-ckp/checkpoint-3024/config.json
/kaggle/input/primera-ckp/checkpoint-3024/trainer_state.json
/kaggle/input/primera-ckp/checkpoint-3024/training_args.bin
/kaggle/input/primera-ckp/checkpoint-3024/scheduler.pt
/kaggle/input/primera-ckp/checkpoint-3024/model.safetensors
/kaggle/input/primera-ckp/checkpoint-3024/optimizer.pt
/kaggle/input/primera-ckp/checkpoint-3024/rng_state.pth
/kaggle/input/primera-ckp/checkpoint-3024/generation_config.json


In [None]:
# Report how many CUDA devices are visible and name each one.
gpu_count = torch.cuda.device_count()
print("Number of GPUs available:", gpu_count)
for gpu_idx in range(gpu_count):
    print(f"GPU {gpu_idx}: {torch.cuda.get_device_name(gpu_idx)}")

#### Load tokenizer

In [3]:
# Load the tokenizer files stored under the Kaggle input directory.
tokenizer = AutoTokenizer.from_pretrained(f"{input_dir}tokenizer/primera-tokenizer")

#### Load data

In [4]:
pretrain_df = pd.read_csv(f"{input_dir}pretrain-ms2/pretrained_data.csv")
# Both columns are read from the CSV as strings; parse them back into Python literals.
for literal_col in ("truncated_docs", "selected_sents"):
    pretrain_df[literal_col] = pretrain_df[literal_col].apply(ast.literal_eval)
pretrain_df.head()

Unnamed: 0,truncated_docs,selected_sents
0,[[Improved Cell Survival and Paracrine Capacit...,"[(4, 0), (3, 4), (4, 3), (3, 1), (3, 7), (4, 6..."
1,[[A comparison of continuous intravenous epopr...,"[(0, 2), (0, 5), (1, 0), (1, 6), (0, 8), (1, 3..."
2,[[Relationship of TIMI myocardial perfusion gr...,"[(0, 2), (0, 5), (2, 2), (1, 0), (1, 6), (2, 5..."
3,[[Effect of cessation interventions on hookah ...,"[(3, 4), (3, 1), (0, 2), (0, 5), (2, 2), (1, 0..."
4,[[The Arizona Sexual Experiences Scale: a vali...,"[(0, 2), (0, 5), (1, 0), (0, 8), (1, 3), (0, 1..."


#### Divide into train and eval datasets

In [5]:
# 80/20 shuffled split with a fixed seed; each part gets a fresh 0..n-1 index.
train_df, val_df = (
    part.reset_index(drop=True)
    for part in train_test_split(pretrain_df, test_size=0.2, shuffle=True, random_state=40)
)
print("Length of train df: ", len(train_df))
print("Length of val df: ", len(val_df))

Length of train df:  11347
Length of val df:  2837


#### Define dataset

In [6]:
# Tokenization budgets and masking ratio passed to the pre-training dataset.
MAX_INPUT_LENGTH = 4096   # max_length used when tokenizing the source text
MAX_OUTPUT_LENGTH = 512   # max_length used when tokenizing the target text
NON_MASK_RATIO = 0.5      # fraction of selected sentences left unmasked in the source
def get_src_tgt_and_mask(truncated_docs, selected_sents,tokenizer,max_len_input,max_len_output,non_mask_ratio):
    """
    Build one tokenized (source, target) pair for sentence-masking pre-training.

    Every sentence listed in ``selected_sents`` is appended to the target text.
    A random subset of those sentences (``non_mask_ratio`` of them, rounded
    down) is left untouched in the source; the remaining ones are replaced
    there by ``tokenizer.mask_token``. Documents are joined with ``<doc-sep>``.

    Args:
        truncated_docs (list of list of str): list of documents with sentences.
            The input is not modified; masking is applied to a copy.
        selected_sents (list of (doc_idx, sent_idx)): Indices of salient sentences.
        tokenizer: callable tokenizer exposing ``mask_token``, ``pad_token_id``
            and a ``vocab`` mapping that contains ``"<doc-sep>"``.
        max_len_input (int): ``max_length`` used to pad/truncate the source.
        max_len_output (int): ``max_length`` used to pad/truncate the target.
        non_mask_ratio (float): fraction of selected sentences kept unmasked.

    Returns:
        dict of ``torch.long`` tensors:
            input_ids: token ids of the (partially masked) source.
            attention_mask: attention mask returned by the tokenizer.
            global_attention_mask: 1 on the first token and on every
                ``<doc-sep>`` token, 0 elsewhere.
            labels: target token ids with padding positions set to -100.
    """
    # Normalise to tuples so the membership test below also works when the
    # pairs arrive as lists instead of tuples.
    selected = [tuple(pair) for pair in selected_sents]
    kept_visible = set(random.sample(selected, int(len(selected) * non_mask_ratio)))

    masked_docs = [list(doc) for doc in truncated_docs]
    tgt_sents = []
    for doc_idx, sent_idx in selected:
        tgt_sents.append(truncated_docs[doc_idx][sent_idx])
        if (doc_idx, sent_idx) not in kept_visible:
            masked_docs[doc_idx][sent_idx] = tokenizer.mask_token

    src_text = "<doc-sep>".join(" ".join(doc) for doc in masked_docs)
    src = tokenizer(src_text,max_length=max_len_input,padding="max_length",truncation=True)
    tgt = tokenizer(" ".join(tgt_sents),max_length=max_len_output,padding="max_length",truncation=True)

    input_ids = src.input_ids
    # input_ids is a plain Python list, so it must be compared element-wise:
    # `list == int` is just False and would only ever flag position 0.
    doc_sep_id = tokenizer.vocab["<doc-sep>"]
    global_attention_mask = [1 if token_id == doc_sep_id else 0 for token_id in input_ids]
    global_attention_mask[0] = 1

    # -100 marks padding positions in the labels.
    pad_id = tokenizer.pad_token_id
    labels = [token_id if token_id != pad_id else -100 for token_id in tgt.input_ids]
    return {
        "input_ids":torch.tensor(input_ids,dtype=torch.long),
        "attention_mask":torch.tensor(src.attention_mask,dtype=torch.long),
        "global_attention_mask":torch.tensor(global_attention_mask,dtype=torch.long),
        "labels":torch.tensor(labels,dtype=torch.long)
    }
class PretrainDataset(Dataset):
    """
    Torch dataset that turns DataFrame rows into masked pre-training examples.

    Each row must provide a ``truncated_docs`` and a ``selected_sents`` column;
    the row is converted on access by ``get_src_tgt_and_mask``, so the random
    choice of unmasked sentences is redrawn every time an item is fetched.

    Args:
        data (pandas.DataFrame): rows with ``truncated_docs`` / ``selected_sents``.
        tokenizer: tokenizer forwarded to ``get_src_tgt_and_mask``.
        max_input_len (int): maximum source length in tokens.
        max_output_len (int): maximum target length in tokens.
        non_mask_ratio (float): fraction of selected sentences kept unmasked.
    """
    def __init__(self,data,tokenizer,max_input_len=4096,max_output_len=512,non_mask_ratio=0.5):
        self.data=data
        self.max_input_len=max_input_len
        self.max_output_len=max_output_len
        self.tokenizer=tokenizer
        self.non_mask_ratio=non_mask_ratio
    def __len__(self):
        """Return the number of rows in the underlying DataFrame."""
        return len(self.data)
    def __getitem__(self,index):
        """Return the tensors for the row at *position* ``index``."""
        # Samplers hand out positions 0..len-1, so index positionally; a
        # label lookup would fail on a frame whose index was not reset.
        row=self.data.iloc[index]
        return get_src_tgt_and_mask(row["truncated_docs"],row["selected_sents"],self.tokenizer,
                                    self.max_input_len,self.max_output_len,self.non_mask_ratio)

In [7]:
# Wrap both splits with identical tokenization and masking settings.
train_dataset = PretrainDataset(
    train_df,
    tokenizer,
    max_input_len=MAX_INPUT_LENGTH,
    max_output_len=MAX_OUTPUT_LENGTH,
    non_mask_ratio=NON_MASK_RATIO,
)
val_dataset = PretrainDataset(
    val_df,
    tokenizer,
    max_input_len=MAX_INPUT_LENGTH,
    max_output_len=MAX_OUTPUT_LENGTH,
    non_mask_ratio=NON_MASK_RATIO,
)
print("Length of train dataset: ", len(train_dataset))
print("Length of val dataset: ", len(val_dataset) )

Length of train dataset:  11347
Length of val dataset:  2837


#### Load base model

In [None]:
# Checkpoint to start from (alternative: "allenai/led-base-16384").
BASE_MODEL_PATH = "allenai/PRIMERA"

config = LEDConfig.from_pretrained(BASE_MODEL_PATH)
model = LEDForConditionalGeneration.from_pretrained(BASE_MODEL_PATH, config=config)

# Make the embedding matrix match the tokenizer's vocabulary size.
model.resize_token_embeddings(len(tokenizer))
model.config.max_decoder_position_embeddings = 512
model.gradient_checkpointing_enable()

In [None]:
# Echo the model object so its repr is shown as the cell output.
model

In [9]:
batch_size=3
# device_count() is 0 on a CPU-only session; count that as one device so the
# division below cannot fail.
num_devices=max(1, torch.cuda.device_count())
# Floor division can reach 0 when there are more devices than samples per
# batch; the trainer needs at least one sample per device.
batch_size_per_device=max(1, batch_size//num_devices)
training_args = Seq2SeqTrainingArguments(
    predict_with_generate=True, 
    per_device_train_batch_size=batch_size_per_device,
    per_device_eval_batch_size=batch_size_per_device,
    output_dir="./pretrained-primera",
    logging_dir="./logs/pretrain/",
    save_strategy="steps",
    save_steps=50,
    logging_strategy="steps",
    logging_steps=20,
    eval_strategy="steps",
    eval_steps=250,
    # Keep only the most recent checkpoint on disk.
    save_total_limit=1,
    #load_best_model_at_end=True,
    #metric_for_best_model="eval_loss",
    #greater_is_better=False,
    gradient_accumulation_steps=5,
    num_train_epochs=12,
    max_grad_norm=1.0,
    learning_rate=5e-5,
    warmup_steps=5000,
    lr_scheduler_type="linear",
    report_to="tensorboard",
    run_name="pretraining_primera",
)
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

In [None]:
# Release cached, currently unused GPU memory held by PyTorch's allocator.
torch.cuda.empty_cache()

In [None]:
# Enable the TensorBoard notebook extension.
%load_ext tensorboard

# Root directory scanned for event files; the trainer's logging_dir
# ("./logs/pretrain/") sits beneath it. Change if needed.
log_dir = "./logs"

# Start TensorBoard inline, pointed at log_dir.
%tensorboard --logdir {log_dir}

In [None]:
# `resume_from_checkpoint=True` raises when output_dir holds no checkpoint yet,
# which breaks a fresh "Run All". Look the checkpoint up explicitly and fall
# back to training from the loaded weights when none exists.
from transformers.trainer_utils import get_last_checkpoint

output_dir = training_args.output_dir
last_checkpoint = get_last_checkpoint(output_dir) if os.path.isdir(output_dir) else None
trainer.train(resume_from_checkpoint=last_checkpoint)

In [11]:
!zip -r folder.zip ./pretrained-primera


zip error: Zip file structure invalid (folder.zip)


In [None]:
!pip install IPython

In [2]:
from IPython.display import FileLink

# Leaving the FileLink as the cell's final expression renders it as a clickable link.
download_link = FileLink(r'folder.zip')
download_link

In [None]:
import shutil

folder_path = "/kaggle/working/pretrained-primera/checkpoint-2000"  # Replace with your folder path
# Delete the directory tree without going through a shell; like `rm -rf`,
# a missing path is not treated as an error.
shutil.rmtree(folder_path, ignore_errors=True)