In [None]:
!pip install pandas openpyxl

[0m

In [None]:
!pip install transformers

[0m

In [None]:
import numpy as np
from transformers import AutoTokenizer

In [None]:
# Tokenizer that matches the "allenai/led-base-16384" checkpoint; it is used
# for both the source documents and the target summaries.
tokenizer = AutoTokenizer.from_pretrained(
    "allenai/led-base-16384",
)

Downloading tokenizer_config.json:   0%|          | 0.00/27.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/1.07k [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

In [None]:
# Number of examples per preprocessing batch and per device during
# training/evaluation.
batch_size = 2

# Token budget for the source documents (inputs are padded/truncated to this).
max_input_length = 16_384

# Token budget for the target summaries; also reused as the generation
# max_length on the model config.
max_output_length = 750

In [None]:
import pandas as pd
from datasets import Dataset

# Training split: read the CSV into a DataFrame, then wrap it as a
# `datasets.Dataset` so it can be mapped and formatted later.
train_excel_file = pd.read_csv('train_data.csv')
train_dataset = Dataset.from_pandas(train_excel_file)

# Validation split: same two steps.
val_excel_file = pd.read_csv('val_data.csv')
val_dataset = Dataset.from_pandas(val_excel_file)

In [None]:
def process_data_to_model_inputs(batch):
    """Tokenize a batch of judgements and summaries into model features.

    Reads the ``"Judgement"`` and ``"Perspective-based Summary"`` columns of
    ``batch`` and adds four new columns to it:

    * ``input_ids`` / ``attention_mask`` -- the judgements, padded/truncated
      to ``max_input_length`` tokens.
    * ``global_attention_mask`` -- one list per sample, ``1`` at position 0
      and ``0`` everywhere else.
    * ``labels`` -- the summaries, padded/truncated to ``max_output_length``
      tokens, with every padding token replaced by ``-100``.

    Args:
        batch: Mapping of column name to a list of values, as passed by
            ``Dataset.map(..., batched=True)``.

    Returns:
        The same ``batch`` object with the four columns above added.
    """
    # Without `return_tensors` the tokenizer already returns plain Python
    # lists, so there is no need to build large tensors only to call
    # `.tolist()` on them again.
    inputs = tokenizer(
        batch["Judgement"],
        padding="max_length",
        truncation=True,
        max_length=max_input_length,
    )
    outputs = tokenizer(
        batch["Perspective-based Summary"],
        padding="max_length",
        truncation=True,
        max_length=max_output_length,
    )

    input_ids = inputs.input_ids
    batch["input_ids"] = input_ids
    batch["attention_mask"] = inputs.attention_mask

    # Global attention on the first token of every sample. Each row is built
    # as its own list (no shared references), and an empty batch simply
    # yields an empty list instead of raising IndexError.
    batch["global_attention_mask"] = [
        [1] + [0] * (len(ids) - 1) if ids else [] for ids in input_ids
    ]

    # Replace PAD tokens with -100 in the labels so that they are ignored.
    pad_token_id = tokenizer.pad_token_id
    batch["labels"] = [
        [-100 if token == pad_token_id else token for token in labels]
        for labels in outputs.input_ids
    ]

    return batch

In [None]:
# Tokenize the training split in batches of `batch_size` rows and drop the
# two raw text columns once their features have been produced.
train_dataset = train_dataset.map(
    process_data_to_model_inputs,
    remove_columns=["Judgement", "Perspective-based Summary"],
    batch_size=batch_size,
    batched=True,
)

  0%|          | 0/1917 [00:00<?, ?ba/s]

In [None]:
# Tokenize the validation split in batches of `batch_size` rows and drop the
# two raw text columns once their features have been produced.
val_dataset = val_dataset.map(
    process_data_to_model_inputs,
    remove_columns=["Judgement", "Perspective-based Summary"],
    batch_size=batch_size,
    batched=True,
)

  0%|          | 0/50 [00:00<?, ?ba/s]

In [None]:
# A notebook cell only renders its last bare expression, so each summary is
# passed to display() explicitly; otherwise the training-split statistics
# would be computed and silently discarded.
display(train_excel_file.describe())
display(val_excel_file.describe())

Unnamed: 0,judgement,summary,prosecutor_pov,defense_pov
count,100,100,100,100
unique,100,100,100,100
top,"iminal Appeals No,%.\n79 and 89 of 1959.\nAppe...","\n ""prosecution"": ""From the perspective of th...",From the perspective of the prosecution attorn...,"From the defense attorney's perspective, this ..."
freq,1,1,1,1


In [None]:
# Expose only the four feature columns of the training split, in "torch"
# format.
train_dataset.set_format(
    columns=[
        "input_ids",
        "attention_mask",
        "global_attention_mask",
        "labels",
    ],
    type="torch",
)

In [None]:
# Expose only the four feature columns of the validation split, in "torch"
# format.
val_dataset.set_format(
    columns=[
        "input_ids",
        "attention_mask",
        "global_attention_mask",
        "labels",
    ],
    type="torch",
)

In [None]:
from transformers import AutoModelForSeq2SeqLM

In [None]:
# Load the pretrained "allenai/led-base-16384" checkpoint as a seq2seq model,
# with gradient checkpointing switched on and `use_cache` switched off.
model = AutoModelForSeq2SeqLM.from_pretrained(
    "allenai/led-base-16384",
    use_cache=False,
    gradient_checkpointing=True,
)

Downloading pytorch_model.bin:   0%|          | 0.00/618M [00:00<?, ?B/s]

In [None]:
# Generation hyperparameters, stored on the model config.

# Length bounds: the upper bound reuses the label length used in preprocessing.
model.config.min_length = 300
model.config.max_length = max_output_length

# Beam-search related settings.
model.config.num_beams = 2
model.config.early_stopping = True
model.config.length_penalty = 2.0

# Repetition control.
model.config.no_repeat_ngram_size = 4

In [None]:
from transformers import Seq2SeqTrainingArguments

In [None]:
# Training configuration. fp16=True requests half-precision training.
# With per-device batches of `batch_size` and 32 accumulation steps, each
# optimizer step covers batch_size * 32 examples per device.
training_args = Seq2SeqTrainingArguments(
    # where checkpoints are written, and how many are kept
    output_dir="Untitled Folder",
    save_steps=150,
    save_total_limit=1,
    # batch sizes and precision
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=32,
    fp16=True,
    # schedule
    num_train_epochs=3,
    warmup_steps=200,
    # logging and evaluation
    logging_steps=50,
    evaluation_strategy="steps",
    predict_with_generate=True,
)

In [None]:
from transformers import Seq2SeqTrainer

In [None]:
# Wire the model, tokenizer, training arguments and both dataset splits into
# a trainer. No `compute_metrics` callback is passed.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Using cuda_amp half precision backend


In [None]:
# Run training with the trainer built from `training_args`; the returned
# training summary is shown as the cell's output.
trainer.train()

The following columns in the training set don't have a corresponding argument in `LEDForConditionalGeneration.forward` and have been ignored: defense_pov, summary. If defense_pov, summary are not expected by `LEDForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 3833
  Num Epochs = 3
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 32
  Total optimization steps = 177
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Step,Training Loss,Validation Loss
50,2.4152,1.754704
100,1.711,1.548222
150,1.5499,1.464082


The following columns in the evaluation set don't have a corresponding argument in `LEDForConditionalGeneration.forward` and have been ignored: defense_pov, summary. If defense_pov, summary are not expected by `LEDForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 100
  Batch size = 2
wandb: ERROR Error while calling W&B API: context deadline exceeded (<Response [500]>)
wandb: ERROR Error while calling W&B API: context deadline exceeded (<Response [500]>)
wandb: ERROR Error while calling W&B API: context deadline exceeded (<Response [500]>)
wandb: ERROR Error while calling W&B API: context deadline exceeded (<Response [500]>)
The following columns in the evaluation set don't have a corresponding argument in `LEDForConditionalGeneration.forward` and have been ignored: defense_pov, summary. If defense_pov, summary are not expected by `LEDForConditionalGeneration.forward`,  you can safely ignore this message.
***** Runni

TrainOutput(global_step=177, training_loss=1.8248963652357544, metrics={'train_runtime': 14343.5337, 'train_samples_per_second': 0.802, 'train_steps_per_second': 0.012, 'total_flos': 1.2358297487985869e+17, 'train_loss': 1.8248963652357544, 'epoch': 2.98})

In [None]:
# Save the model into the 'legal-led-pro-3' directory.
model.save_pretrained('legal-led-pro-3')

Configuration saved in legal-led-pro-3/config.json
Model weights saved in legal-led-pro-3/pytorch_model.bin


In [None]:
# Save the tokenizer files into the same 'legal-led-pro-3' directory as the
# model.
tokenizer.save_pretrained('legal-led-pro-3')

tokenizer config file saved in legal-led-pro-3/tokenizer_config.json
Special tokens file saved in legal-led-pro-3/special_tokens_map.json


('legal-led-pro-3/tokenizer_config.json',
 'legal-led-pro-3/special_tokens_map.json',
 'legal-led-pro-3/vocab.json',
 'legal-led-pro-3/merges.txt',
 'legal-led-pro-3/added_tokens.json',
 'legal-led-pro-3/tokenizer.json')

In [None]:
# Recursively zip the 'legal-led-pro-3' directory into legal-led-pro-3.zip.
!zip -r legal-led-pro-3.zip legal-led-pro-3

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  adding: legal-led-pro-3/ (stored 0%)
  adding: legal-led-pro-3/pytorch_model.bin (deflated 9%)
  adding: legal-led-pro-3/vocab.json (deflated 59%)
  adding: legal-led-pro-3/tokenizer.json (deflated 72%)
  adding: legal-led-pro-3/config.json (deflated 60%)
  adding: legal-led-pro-3/tokenizer_config.json (deflated 74%)
  adding: legal-led-pro-3/special_tokens_map.json (deflated 85%)
  adding: legal-led-pro-3/merges.txt (deflated 53%)
