In [None]:
import torch
import tensorflow as tf
import subprocess

def activate_torch_gpu():
    if torch.cuda.is_available():
        device = torch.device("cuda")
        print(f"Using GPU: {torch.cuda.get_device_name(0)}")
    else:
        device = torch.device("cpu")
        print("GPU not available, using CPU.")
    return device

def activate_tf_gpu():
    gpus = tf.config.list_physical_devices('GPU')
    if gpus:
        try:
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, True)
            print("TensorFlow GPU activated.")
        except RuntimeError as e:
            print(e)
    else:
        print("No GPU found, using CPU.")

if __name__ == "__main__":
    print("Activating PyTorch GPU...")
    torch_device = activate_torch_gpu()
    print("\nActivating TensorFlow GPU...")
    activate_tf_gpu()


Activating PyTorch GPU...
GPU not available, using CPU.

Activating TensorFlow GPU...
No GPU found, using CPU.


In [1]:
pip install datasets transformers

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading x

In [14]:
from datasets import load_dataset

ds = load_dataset("Yashaswat/Indian-Legal-Text-ABS")

In [15]:
# Assuming the dataset is in the 'train' split
train_dataset = ds['train'].select(range(1600))

In [4]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

In [16]:
#tokenization

def preprocess_function(batch):
    source = batch['judgement']
    target = batch["summary"]
    source_ids = tokenizer(source, truncation=True, padding="max_length", max_length=128)
    target_ids = tokenizer(target, truncation=True, padding="max_length", max_length=128)

    # Replace pad token id with -100 for labels to ignore padding in loss computation
    labels = target_ids["input_ids"]
    labels = [[(label if label != tokenizer.pad_token_id else -100) for label in labels_example] for labels_example in labels]

    return {
        "input_ids": source_ids["input_ids"],
        "attention_mask": source_ids["attention_mask"],
        "labels": labels
    }

df_source = train_dataset.map(preprocess_function, batched=True)


Map:   0%|          | 0/1600 [00:00<?, ? examples/s]

In [17]:
from transformers import TrainingArguments, Trainer, TrainerCallback
import os
import glob
# Define a custom callback to delete the second last saved model
class DeleteSecondLastModelCallback(TrainerCallback):
    def on_save(self, args, state, control, **kwargs):
        # Get the list of all saved checkpoints
        checkpoints = sorted(glob.glob(f"{args.output_dir}/checkpoint-*"), key=os.path.getmtime)

        # If there are at least two checkpoints, delete the second last one
        if len(checkpoints) >= 2:
            second_last_checkpoint = checkpoints[-2]
            print(f"Deleting second last checkpoint: {second_last_checkpoint}")
            os.system(f"rm -rf {second_last_checkpoint}")


In [18]:
# Define training arguments
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="/content/sample_data/output",  # Replace with your output directory
    per_device_train_batch_size=8,
    num_train_epochs=75,  # Adjust number of epochs as needed
    remove_unused_columns=False,
    save_strategy="steps",  # Save model based on steps
    save_steps=600, # Save model at the end of each epoch
    save_total_limit=2,  # Keep only the last 2 saved models
)


In [19]:
df_source = df_source.train_test_split(test_size=0.2)  # Adjust test_size as needed

In [None]:
# Create Trainer object
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=df_source["train"],
    eval_dataset=df_source["test"],
    callbacks=[DeleteSecondLastModelCallback()]
)

trainer.train()

Step,Training Loss
500,1.1545
1000,0.4655
1500,0.2067
2000,0.1161
2500,0.0794
3000,0.059
3500,0.0461
4000,0.0365
4500,0.0294


Deleting second last checkpoint: /content/sample_data/output/checkpoint-800
Deleting second last checkpoint: /content/sample_data/output/checkpoint-600
Deleting second last checkpoint: /content/sample_data/output/checkpoint-1200
Deleting second last checkpoint: /content/sample_data/output/checkpoint-1800
Deleting second last checkpoint: /content/sample_data/output/checkpoint-2400
Deleting second last checkpoint: /content/sample_data/output/checkpoint-3000
Deleting second last checkpoint: /content/sample_data/output/checkpoint-3600


In [None]:
from google.colab import drive
drive.mount('/content/drive')

import os
os.chdir("/content/drive/MyDrive/IN-Abs/model")
 #Save the trained model
model.save_pretrained('./my_model')
tokenizer.save_pretrained('./my_model')

Mounted at /content/drive


('./my_model/tokenizer_config.json',
 './my_model/special_tokens_map.json',
 './my_model/vocab.json',
 './my_model/merges.txt',
 './my_model/added_tokens.json',
 './my_model/tokenizer.json')

In [None]:
 #Save the trained model
model.save_pretrained('./my_model')
tokenizer.save_pretrained('./my_model')

testing part

In [None]:
pip install transformers torch


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [None]:
from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer

# Define the path where your model is stored
model_path = "/content/drive/MyDrive/IN-Abs/model/my_model"  # Update this path

# Load the model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

# Create a summarization pipeline
# Specify the device to use (CPU in this case)
summarizer = pipeline("summarization", model=model, tokenizer=tokenizer, device=-1)  # Use -1 for CPU

# Input text
text = """The appellant, as the owner and landlord of the said house,

filed Eviction Suit No.25/2001 against the respondents-
tenant on the ground of default in payment of rent and

refusal to vacate; and for personal need of the suit
premises for establishing an ultrasound machine for his
two unemployed sons.
5. The suit after contest was decreed by the court of first
instance vide judgment and order dated 15.07.2006 on the
ground of bona fide need of the appellant-landlord holding
that the oral and documentary evidence proves the bona
fide need of the appellant-landlord to install the
ultrasound machine for his two unemployed sons. The
appellant-landlord had established his capability to
purchase such a machine and had proved his annual
income to be Rs.4,00,000/-. He had also proved that the
suit premises is the most appropriate place for the
installation of such machines as there is a medical clinic
and a pathology center adjacent to it. The suit was,
however, dismissed on the ground of default in payment of
rent.

3

6. The aforesaid judgment and order of eviction passed by the
court of first instance was reversed by the First Appellate
Court and the same was also affirmed by the High Court
in Second Appeal. Thus, aggrieved by the impugned
judgment and order of the High Court of Jharkhand at
Ranchi dated 18.08.2022 passed in Second Appeal
No.317/20061, the appellant-landlord herein has
preferred this appeal.
7. It may not be out of context to mention here that the
appellant-landlord had not assailed the dismissal of the
suit on the ground of default in payment of rent and as
such the decree to that effect passed by the court of first

instance has become final and conclusive. The appellant-
landlord is, thus, confining his case for the decree of

eviction only on the ground of bona fide need of
establishing an ultrasound machine for the benefit of his
two unemployed sons.
"""

# Generate summary
summary = summarizer(text, max_length=400, min_length=50, do_sample=False)

# Print summary result
print("Summary:")
print(summary[0]['summary_text'])


OSError: Incorrect path_or_model_id: '/content/drive/MyDrive/IN-Abs/model/my_model'. Please provide either the path to a local folder or the repo_id of a model on the Hub.