In [1]:
!pip install -U torch datasets transformers sentencepiece rouge_score wandb -qq
!pip install accelerate>=0.21.0 -U

[0m

In [2]:
import os
import zipfile
import numpy as np
import wandb
import pandas as pd
from datasets import load_dataset, load_metric, Dataset
from accelerate import Accelerator
from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

In [3]:
#unzip dataset

# Specify the path to the zipped dataset
zip_file_path = "/content/IN-Abs.zip"
extraction_path = "/content/"

# Open the zip file
with zipfile.ZipFile(zip_file_path, "r") as zip_ref:
    # Extract all files to the current directory
    zip_ref.extractall(extraction_path)

%ls -l

total 71104
-rw-r--r-- 1 root root     4701 Mar 31 14:02 '=0.21.0'
drwxr-xr-x 4 root root     4096 Mar 31 12:52  [0m[01;34mIN-Abs[0m/
-rw-r--r-- 1 root root 72782902 Mar 31 12:48  IN-Abs.zip
drwxr-xr-x 1 root root     4096 Mar 28 23:00  [01;34msample_data[0m/
drwxr-xr-x 4 root root     4096 Mar 31 13:23  [01;34mwandb[0m/


In [4]:
# Define the paths for train and test data
train_data_path = "/content/IN-Abs/train-data"
test_data_path = "/content/IN-Abs/test-data"

# Function to read the contents of a file
def read_file(file_path):
    with open(file_path, 'r') as file:
        content = file.read().strip()
    return content

# Load and explore the train dataset
train_data = []

for file_name in os.listdir(os.path.join(train_data_path, 'judgement')):
    judgement_path = os.path.join(train_data_path, 'judgement', file_name)
    summary_path = os.path.join(train_data_path, 'summary', file_name)

    judgement = read_file(judgement_path)
    summary = read_file(summary_path)

    train_data.append({
        'Judgement': judgement,
        'Summary': summary
    })

train_df = pd.DataFrame(train_data)

# Load and explore the test dataset
test_data = []

for file_name in os.listdir(os.path.join(test_data_path, 'judgement')):
    judgement_path = os.path.join(test_data_path, 'judgement', file_name)
    summary_path = os.path.join(test_data_path, 'summary', file_name)

    judgement = read_file(judgement_path)
    summary = read_file(summary_path)

    test_data.append({
        'Judgement': judgement,
        'Summary': summary
    })

test_df = pd.DataFrame(test_data)

# Read word and sentence count statistics for train and test
train_stats_path = os.path.join(train_data_path, 'stats-IN-train.txt')
test_stats_path = os.path.join(test_data_path, 'stats-IN-test.txt')

train_stats = read_file(train_stats_path)
test_stats = read_file(test_stats_path)

# Display dataset statistics
print(f"Number of training documents: {len(train_df)}")
print(f"Number of test documents: {len(test_df)}")
print(f"Example Judgement:\n{train_df['Judgement'][0][:500]}...\n")
print(f"Example Summary:\n{train_df['Summary'][0][:200]}...\n")

Number of training documents: 7030
Number of test documents: 100
Example Judgement:
(Civil) No. 1615 of 1986.
(Under Article 32 of the Constitution of India).
R.P. Gupta for the Petitioner.
C.V. Subba Rao for the Respondents.
1140 The Order of the Court was delivered by B.C. RAY, J.
The petitioner who was appointed as a constable in the Haryana Police Force on November 7, 1979 has challenged in this writ petition the order dated August 24, 1982 issued by the Commandant, 2nd Bn.
Haryana Armed Police, Madhuban on the ground that the impugned order of removal from service was in e...

Example Summary:
The petitioner, a Constable in the Haryana Police Serv ice, was removed from service during the period of proba tion.
However, he was not given any opportunity of hearing against the purported order o...



In [5]:
rouge = load_metric("rouge")

  rouge = load_metric("rouge")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [6]:
# wandb.login()
# wandb.init(project="led_legalease", entity="capstone")

In [7]:
tokenizer = AutoTokenizer.from_pretrained("allenai/led-base-16384")
led = AutoModelForSeq2SeqLM.from_pretrained("allenai/led-base-16384", gradient_checkpointing=True, use_cache=False)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [8]:
max_input_length = 7120
max_output_length = 600
batch_size = 2

In [9]:
def process_data_to_model_inputs(row):
    inputs = tokenizer(
        row["Judgement"],
        padding="max_length",
        truncation=True,
        max_length=max_input_length,
    )
    outputs = tokenizer(
        row["Summary"],
        padding="max_length",
        truncation=True,
        max_length=max_output_length,
    )

    row["input_ids"] = inputs["input_ids"]
    row["attention_mask"] = inputs["attention_mask"]

    row["global_attention_mask"] = [[1] * len(inputs["input_ids"][0])] * len(inputs["input_ids"])
    row["global_attention_mask"][0][0] = 1
    row["labels"] = outputs["input_ids"]
    row["labels"] = [
        [-100 if token == tokenizer.pad_token_id else token for token in labels]
        for labels in row["labels"]]

    return row

In [10]:
train_df.head()

Unnamed: 0,Judgement,Summary
0,(Civil) No. 1615 of 1986.\n(Under Article 32 o...,"The petitioner, a Constable in the Haryana Pol..."
1,iminal Appeal Nos. 211 & 212 of 1969 and Revie...,An air parcel declared by the consigner to con...
2,Appeal No. 383 of 1961.\nAppeal from the judgm...,"The respondent imported 2,000 drums of mineral..."
3,Appeals Nos.\n196 to 201 of 1953.\nAppeals fro...,The appellant as the proprietor of Nada un Jag...
4,Civil Appeal No. 874 of 1971 On appeal by Spec...,An order granting two stage carriage permits b...


For the sake of this notebook, we will reduce the training and validation data to a dummy dataset of sizes 1000 and 100 respectively. For a full training run, those lines should be commented out.

In [11]:
# Convert the DataFrames to datasets
train_dataset = Dataset.from_pandas(train_df.head(500))
test_dataset = Dataset.from_pandas(test_df.head(80))

# Apply the custom function to the datasets
train_dataset = train_dataset.map(
    process_data_to_model_inputs,
    batched=True,
    batch_size=batch_size,
    remove_columns=['Judgement', 'Summary'],
)

test_dataset = test_dataset.map(
    process_data_to_model_inputs,
    batched=True,
    batch_size=batch_size,
    remove_columns=['Judgement', 'Summary'],
)

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

In [12]:
train_dataset.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "global_attention_mask", "labels"],
)
test_dataset.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "global_attention_mask", "labels"],
)

In [13]:
led.config.num_beams = 2
led.config.max_length = 512
led.config.min_length = 100
led.config.length_penalty = 2.0
led.config.early_stopping = True
led.config.no_repeat_ngram_size = 3

In [14]:
def compute_metrics(pred):
    labels_ids = pred.label_ids
    pred_ids = pred.predictions

    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    labels_ids[labels_ids == -100] = tokenizer.pad_token_id
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    rouge_output = rouge.compute(
        predictions=pred_str, references=label_str, rouge_types=["rouge2"]
    )["rouge2"].mid

    return {
        "rouge2_precision": round(rouge_output.precision, 4),
        "rouge2_recall": round(rouge_output.recall, 4),
        "rouge2_fmeasure": round(rouge_output.fmeasure, 4),
    }

In [15]:
# Initialize the accelerator
accelerator = Accelerator()


training_args = Seq2SeqTrainingArguments(
    predict_with_generate=True,
    evaluation_strategy="steps",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    fp16=False,  # Disable FP16
    output_dir="./",
    logging_steps=5,
    eval_steps=10,
    save_steps=10,
    save_total_limit=2,
    gradient_accumulation_steps=4,
    num_train_epochs=1,
    report_to="wandb",
)


In [16]:
trainer = Seq2SeqTrainer(
            model=led,
            tokenizer=tokenizer,
            args=training_args,
            compute_metrics=compute_metrics,
            train_dataset=train_dataset,
            eval_dataset=test_dataset,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [None]:
# Train the model
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33msbbm-03[0m ([33mlily21[0m). Use [1m`wandb login --relogin`[0m to force relogin


Input ids are automatically padded from 7120 to 7168 to be a multiple of `config.attention_window`: 1024
