In [None]:
!pip install pandas openpyxl

In [None]:
!pip install transformers

In [None]:
import numpy as np
from transformers import AutoTokenizer

In [None]:
# Tokenizer belonging to the LED checkpoint that is fine-tuned further below.
led_checkpoint = "allenai/led-base-16384"
tokenizer = AutoTokenizer.from_pretrained(led_checkpoint)

In [None]:
# Token budgets used when tokenizing: source texts are padded/truncated to
# `max_input_length`, target texts to `max_output_length`.
max_input_length = 16 * 1024  # 16384 tokens
max_output_length = 750

# Batch size shared by the preprocessing `map` calls and the training arguments.
batch_size = 2

In [None]:
import pandas as pd
from datasets import Dataset

# Read the training and validation splits from their CSV files
train_excel_file = pd.read_csv("train_data_pov.csv")
val_excel_file = pd.read_csv("val_data_pov.csv")

# Wrap each pandas DataFrame in a `datasets.Dataset` (train first, then validation)
train_dataset, val_dataset = (
    Dataset.from_pandas(frame) for frame in (train_excel_file, val_excel_file)
)

# Report the number of rows in each split
print("Training dataset size:", len(train_dataset))
print("Validation dataset size:", len(val_dataset))

In [None]:
# Preview the first five rows of the raw training DataFrame
train_excel_file.head(n=5)

In [None]:
def process_data_to_model_inputs(batch):
    """Tokenize a batch of source/target texts into model input columns.

    Reads the ``"judgement"`` (source) and ``"prosecutor_pov"`` (target)
    columns of ``batch`` and adds the tokenized columns used for training.

    Args:
        batch: Mapping from column name to a list of values, one entry per
            example (the shape ``Dataset.map(..., batched=True)`` passes in).

    Returns:
        The same ``batch`` object, with these keys added:

        * ``input_ids`` / ``attention_mask``: the source texts, padded or
          truncated to ``max_input_length``.
        * ``global_attention_mask``: per example, 1 at position 0 and 0
          everywhere else.
        * ``labels``: the target token ids, padded or truncated to
          ``max_output_length``, as one numpy array per example with every
          padding id replaced by -100.
    """
    # No `return_tensors` here: the tokenizer's list output is stored directly,
    # so there is no tensor -> list round trip.
    inputs = tokenizer(
        batch["judgement"],
        padding="max_length",
        truncation=True,
        max_length=max_input_length,
    )
    outputs = tokenizer(
        batch["prosecutor_pov"],
        padding="max_length",
        truncation=True,
        max_length=max_output_length,
    )

    batch["input_ids"] = inputs.input_ids
    batch["attention_mask"] = inputs.attention_mask

    # Global attention on the first token of every example. Each row is built
    # as its own list, so the result does not depend on rows sharing one
    # underlying list object, and an empty batch yields an empty mask.
    batch["global_attention_mask"] = [
        ([1] + [0] * (len(ids) - 1)) if ids else []
        for ids in batch["input_ids"]
    ]

    # We have to make sure that the PAD token is ignored: padding ids in the
    # labels are replaced by -100.
    pad_id = tokenizer.pad_token_id
    batch["labels"] = [
        np.array([-100 if token == pad_id else token for token in labels])
        for labels in outputs.input_ids
    ]

    return batch

In [None]:
# Tokenize the training split in batches. All columns loaded from the CSV
# (including the raw "judgement" / "prosecutor_pov" text read by
# process_data_to_model_inputs) are dropped; the columns the function adds are
# kept. Using `column_names` avoids naming columns that may not exist.
train_dataset = train_dataset.map(
    process_data_to_model_inputs,
    batched=True,
    batch_size=batch_size,
    remove_columns=train_dataset.column_names,
)

In [None]:
# Tokenize the validation split in batches. All columns loaded from the CSV
# (including the raw "judgement" / "prosecutor_pov" text read by
# process_data_to_model_inputs) are dropped; the columns the function adds are
# kept. Using `column_names` avoids naming columns that may not exist.
val_dataset = val_dataset.map(
    process_data_to_model_inputs,
    batched=True,
    batch_size=batch_size,
    remove_columns=val_dataset.column_names,
)

In [None]:
# Return the four model-input columns of the training split as torch tensors
train_tensor_columns = ["input_ids", "attention_mask", "global_attention_mask", "labels"]
train_dataset.set_format(type="torch", columns=train_tensor_columns)

In [None]:
# Return the four model-input columns of the validation split as torch tensors
val_tensor_columns = ["input_ids", "attention_mask", "global_attention_mask", "labels"]
val_dataset.set_format(type="torch", columns=val_tensor_columns)

In [None]:
from transformers import AutoModelForSeq2SeqLM

In [None]:
# Load the pretrained LED seq2seq checkpoint with gradient checkpointing
# requested and `use_cache` turned off.
model = AutoModelForSeq2SeqLM.from_pretrained(
    "allenai/led-base-16384",
    use_cache=False,
    gradient_checkpointing=True,
)

In [None]:
# Generation settings, stored on the model's config object
led_config = model.config
led_config.num_beams = 2
led_config.max_length = max_output_length  # same cap as the tokenized targets
led_config.min_length = 300
led_config.length_penalty = 2.0
led_config.early_stopping = True
led_config.no_repeat_ngram_size = 4

In [None]:
from transformers import Seq2SeqTrainingArguments

In [None]:
# Training configuration: fp16 fine-tuning with step-based evaluation
training_args = Seq2SeqTrainingArguments(
    output_dir="folder_name",
    # --- optimisation ---
    num_train_epochs=3,
    warmup_steps=200,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=32,  # gradients accumulated over 32 batches
    fp16=True,
    # --- evaluation ---
    evaluation_strategy="steps",
    per_device_eval_batch_size=batch_size,
    predict_with_generate=True,
    # --- logging and checkpoints ---
    logging_steps=50,
    save_steps=150,
    save_total_limit=1,
)

In [None]:
from transformers import Seq2SeqTrainer

In [None]:
# Bundle the model, training arguments, tokenizer and both dataset splits
# into a single trainer object.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)

In [None]:
# Run fine-tuning with the trainer configured above.
trainer.train()

In [None]:
# Save the model to the 'name_your_model' directory
# (separate from the trainer's output_dir, "folder_name").
model.save_pretrained('name_your_model')

In [None]:
# Save the tokenizer into the same 'name_your_model' directory as the model.
tokenizer.save_pretrained('name_your_model')

In [None]:
# Recursively zip the name_your_model directory into name_your_model.zip.
!zip -r name_your_model.zip name_your_model